diff --git a/cachelib/allocator/nvmcache/NavyConfig.cpp b/cachelib/allocator/nvmcache/NavyConfig.cpp
index f78e43d52a..d8ead5ae3b 100644
--- a/cachelib/allocator/nvmcache/NavyConfig.cpp
+++ b/cachelib/allocator/nvmcache/NavyConfig.cpp
@@ -229,6 +229,52 @@ BigHashConfig& BigHashConfig::setSizePctAndMaxItemSize(
   return *this;
 }
 
+KangarooConfig& KangarooConfig::setSizePctAndMaxItemSize(
+    unsigned int sizePct, uint64_t smallItemMaxSize) {
+  if (sizePct > 100) {
+    throw std::invalid_argument(folly::sformat(
+        "to enable Kangaroo, Kangaroo size pct should be in the range of "
+        "[0, 100], but {} is set",
+        sizePct));
+  }
+  if (sizePct == 0) {
+    XLOG(INFO) << "Kangaroo is not configured";
+  }
+  sizePct_ = sizePct;
+  smallItemMaxSize_ = smallItemMaxSize;
+  return *this;
+}
+
+KangarooConfig& KangarooConfig::setLog(unsigned int sizePct,
+                                       uint64_t physicalPartitions,
+                                       uint64_t indexPerPhysicalPartitions,
+                                       uint32_t threshold) {
+  if (sizePct > 100) {
+    throw std::invalid_argument(folly::sformat(
+        "to enable KangarooLog, KangarooLog size pct should be in the range "
+        "of [0, 100], but {} is set",
+        sizePct));
+  }
+  if (indexPerPhysicalPartitions == 0) {
+    throw std::invalid_argument(folly::sformat(
+        "to enable KangarooLog, need >= 1 index partitions per physical "
+        "partition, but {} is set",
+        indexPerPhysicalPartitions));
+  }
+  if (physicalPartitions == 0) {
+    throw std::invalid_argument(folly::sformat(
+        "to enable KangarooLog, need >= 1 physical partitions, but {} is set",
+        physicalPartitions));
+  }
+  if (sizePct == 0) {
+    XLOG(INFO) << "KangarooLog is not configured";
+  }
+  logSizePct_ = sizePct;
+  physicalPartitions_ = physicalPartitions;
+  indexPerPhysicalPartitions_ = indexPerPhysicalPartitions;
+  threshold_ = threshold;
+  return *this;
+}
+
 void NavyConfig::setBigHash(unsigned int bigHashSizePct,
                             uint32_t bigHashBucketSize,
                             uint64_t bigHashBucketBfSize,
@@ -238,6 +284,22 @@ void NavyConfig::setBigHash(unsigned int bigHashSizePct,
       .setBucketSize(bigHashBucketSize)
       .setBucketBfSize(bigHashBucketBfSize);
 }
+
+void NavyConfig::setKangaroo(unsigned int kangarooSizePct,
+                             uint32_t kangarooBucketSize,
+                             uint64_t kangarooBucketBfSize,
+                             uint64_t kangarooSmallItemMaxSize,
+                             uint64_t kangarooLogSizePct,
+                             uint64_t kangarooLogThreshold,
+                             uint64_t kangarooLogPhysicalPartitions,
+                             uint32_t kangarooLogIndexPerPhysicalPartitions) {
+  kangarooConfig_
+      .setSizePctAndMaxItemSize(kangarooSizePct, kangarooSmallItemMaxSize)
+      .setBucketSize(kangarooBucketSize)
+      .setBucketBfSize(kangarooBucketBfSize)
+      .setLog(kangarooLogSizePct, kangarooLogPhysicalPartitions,
+              kangarooLogIndexPerPhysicalPartitions, kangarooLogThreshold);
+}
 // job scheduler settings
 void NavyConfig::setNavyReqOrderingShards(uint64_t navyReqOrderingShards) {
   if (navyReqOrderingShards == 0) {
diff --git a/cachelib/allocator/nvmcache/NavyConfig.h b/cachelib/allocator/nvmcache/NavyConfig.h
index b515b0bc64..b20feb6382 100644
--- a/cachelib/allocator/nvmcache/NavyConfig.h
+++ b/cachelib/allocator/nvmcache/NavyConfig.h
@@ -321,6 +321,90 @@ class BigHashConfig {
   uint64_t smallItemMaxSize_{};
 };
 
+/**
+ * KangarooConfig provides APIs for users to configure the Kangaroo engine,
+ * which is one part of NavyConfig.
+ *
+ * By this class, users can:
+ * - enable Kangaroo by setting sizePct > 0
+ * - set maximum item size
+ * - set bucket size
+ * - set bloom filter size (0 to disable bloom filter)
+ * - set log percent and number of partitions
+ * - get the values of all the above parameters
+ */
+class KangarooConfig {
+ public:
+  // Set Kangaroo device percentage and maximum item size (in bytes) to enable
+  // the Kangaroo engine. The default value of sizePct and smallItemMaxSize is
+  // 0, meaning Kangaroo is not enabled.
+  // @throw std::invalid_argument if sizePct is not in the range of [0, 100].
+  KangarooConfig& setSizePctAndMaxItemSize(unsigned int sizePct,
+                                           uint64_t smallItemMaxSize);
+
+  // Set the bucket size in bytes for the Kangaroo engine.
+  // Default value is 4096.
+  KangarooConfig& setBucketSize(uint32_t bucketSize) noexcept {
+    bucketSize_ = bucketSize;
+    return *this;
+  }
+
+  // Set the bloom filter size per bucket in bytes for the Kangaroo engine.
+  // 0 means the bloom filter will not be applied. Default value is 8.
+  KangarooConfig& setBucketBfSize(uint64_t bucketBfSize) noexcept {
+    bucketBfSize_ = bucketBfSize;
+    return *this;
+  }
+
+  // Set the KangarooLog size as a percentage of Kangaroo's space, the number
+  // of physical log partitions, the number of index partitions per physical
+  // partition, and the hit threshold for moving items from the log into sets.
+  // @throw std::invalid_argument if sizePct is not in the range of [0, 100]
+  //        or if either partition count is 0.
+  KangarooConfig& setLog(unsigned int sizePct,
+                         uint64_t physicalPartitions,
+                         uint64_t indexPerPhysicalPartitions,
+                         uint32_t threshold);
+
+  bool isBloomFilterEnabled() const { return bucketBfSize_ > 0; }
+
+  unsigned int getSizePct() const { return sizePct_; }
+
+  uint32_t getBucketSize() const { return bucketSize_; }
+
+  uint64_t getBucketBfSize() const { return bucketBfSize_; }
+
+  uint64_t getSmallItemMaxSize() const { return smallItemMaxSize_; }
+
+  unsigned int getLogSizePct() const { return logSizePct_; }
+
+  uint64_t getPhysicalPartitions() const { return physicalPartitions_; }
+
+  uint64_t getIndexPerPhysicalPartitions() const {
+    return indexPerPhysicalPartitions_;
+  }
+
+  uint32_t getLogThreshold() const { return threshold_; }
+
+ private:
+  // Percentage of the whole device that is given to the Kangaroo engine in
+  // Navy, e.g. 50.
+  unsigned int sizePct_{0};
+  // Navy Kangaroo engine's bucket size (must be a multiple of the minimum
+  // device io block size).
+  // This size determines how big each bucket is and what the physical
+  // write granularity onto the device is.
+  uint32_t bucketSize_{4096};
+  // The bloom filter size per bucket in bytes for the Navy Kangaroo engine
+  uint64_t bucketBfSize_{8};
+  // The maximum item size to put into the Navy Kangaroo engine.
+  uint64_t smallItemMaxSize_{};
+  // Percent of Kangaroo's space to dedicate to KangarooLog
+  unsigned int logSizePct_{0};
+  // Number of physical partitions of KangarooLog
+  uint64_t physicalPartitions_{1};
+  // Number of index partitions per physical partition of KangarooLog
+  uint64_t indexPerPhysicalPartitions_{1};
+  // Hit threshold for moving items from KangarooLog to sets
+  uint32_t threshold_{1};
+};
+
 /**
  * NavyConfig provides APIs for users to set up Navy related settings for
  * NvmCache.
@@ -341,6 +425,7 @@ class NavyConfig {
   bool usesSimpleFile() const noexcept { return !fileName_.empty(); }
   bool usesRaidFiles() const noexcept { return raidPaths_.size() > 0; }
   bool isBigHashEnabled() const { return bigHashConfig_.getSizePct() > 0; }
+  bool isKangarooEnabled() const { return kangarooConfig_.getSizePct() > 0; }
   std::map<std::string, std::string> serialize() const;
 
   // Getters:
@@ -371,15 +456,21 @@ class NavyConfig {
   // Returns the threshold of classifying an item as small item or large item
   // for Navy engine.
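+  //
+  // Illustrative only (a hedged sketch, not part of this patch): with the
+  // deprecated setter declared further below, enabling Kangaroo and reading
+  // the threshold might look like:
+  //
+  //   NavyConfig cfg;
+  //   cfg.setKangaroo(10,    /* kangarooSizePct */
+  //                   4096,  /* kangarooBucketSize */
+  //                   8,     /* kangarooBucketBfSize */
+  //                   2048,  /* kangarooSmallItemMaxSize */
+  //                   50,    /* kangarooLogSizePct */
+  //                   2,     /* kangarooLogThreshold */
+  //                   1, 1); /* physical / index-per-physical partitions */
+  //   cfg.getSmallItemThreshold(); // returns 2048 once Kangaroo is enabled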
   uint64_t getSmallItemThreshold() const {
-    if (!isBigHashEnabled()) {
+    if (isBigHashEnabled()) {
+      return bigHashConfig_.getSmallItemMaxSize();
+    } else if (isKangarooEnabled()) {
+      return kangarooConfig_.getSmallItemMaxSize();
+    } else {
       return 0;
     }
-    return bigHashConfig_.getSmallItemMaxSize();
   }
 
   // Return a const BigHashConfig to read values of its parameters.
   const BigHashConfig& bigHash() const { return bigHashConfig_; }
 
+  // Return a const KangarooConfig to read values of its parameters.
+  const KangarooConfig& kangaroo() const { return kangarooConfig_; }
+
   // Return a const BlockCacheConfig to read values of its parameters.
   const BlockCacheConfig& blockCache() const { return blockCacheConfig_; }
 
@@ -506,6 +597,21 @@ class NavyConfig {
                   uint64_t bigHashSmallItemMaxSize);
   // Return BigHashConfig for configuration.
   BigHashConfig& bigHash() noexcept { return bigHashConfig_; }
+
+  // ============ Kangaroo settings =============
+  // (Deprecated) Set the parameters for Kangaroo.
+  // @throw std::invalid_argument if kangarooSizePct is not in the range of
+  //        [0, 100].
+  void setKangaroo(unsigned int kangarooSizePct,
+                   uint32_t kangarooBucketSize,
+                   uint64_t kangarooBucketBfSize,
+                   uint64_t kangarooSmallItemMaxSize,
+                   uint64_t kangarooLogSizePct,
+                   uint64_t kangarooLogThreshold,
+                   uint64_t kangarooLogPhysicalPartitions,
+                   uint32_t kangarooLogIndexPerPhysicalPartitions);
+  // Return KangarooConfig for configuration.
+  KangarooConfig& kangaroo() noexcept { return kangarooConfig_; }
 
   // ============ Job scheduler settings =============
   void setReaderAndWriterThreads(unsigned int readerThreads,
@@ -556,6 +662,9 @@ class NavyConfig {
 
   // ============ BigHash settings =============
   BigHashConfig bigHashConfig_{};
+
+  // ============ Kangaroo settings =============
+  KangarooConfig kangarooConfig_{};
 
   // ============ Job scheduler settings =============
   // Number of asynchronous worker threads for read operations.
diff --git a/cachelib/allocator/nvmcache/NavySetup.cpp b/cachelib/allocator/nvmcache/NavySetup.cpp
index 4e1d12249a..8ec5c144f1 100644
--- a/cachelib/allocator/nvmcache/NavySetup.cpp
+++ b/cachelib/allocator/nvmcache/NavySetup.cpp
@@ -93,6 +93,66 @@ uint64_t setupBigHash(const navy::BigHashConfig& bigHashConfig,
   return bigHashCacheOffset;
 }
 
+uint64_t setupKangaroo(const navy::KangarooConfig& kangarooConfig,
+                       uint32_t ioAlignSize,
+                       uint64_t totalCacheSize,
+                       uint64_t metadataSize,
+                       cachelib::navy::CacheProto& proto) {
+  auto bucketSize = kangarooConfig.getBucketSize();
+  if (bucketSize != alignUp(bucketSize, ioAlignSize)) {
+    throw std::invalid_argument(
+        folly::sformat("Bucket size: {} is not aligned to ioAlignSize: {}",
+                       bucketSize, ioAlignSize));
+  }
+
+  // If enabled, Kangaroo storage starts after BlockCache's.
+  const auto sizeReservedForKangaroo =
+      totalCacheSize * kangarooConfig.getSizePct() / 100ul;
+
+  const uint64_t kangarooCacheOffset =
+      alignUp(totalCacheSize - sizeReservedForKangaroo, bucketSize);
+  const uint64_t kangarooCacheSize =
+      alignDown(totalCacheSize - kangarooCacheOffset, bucketSize);
+
+  auto kangaroo = cachelib::navy::createKangarooProto();
+  kangaroo->setLayout(kangarooCacheOffset, kangarooCacheSize, bucketSize);
+
+  // Bucket Bloom filter size, bytes
+  //
+  // Experiments showed that if we have 16 bytes for BF with 25 entries,
+  // then the optimal number of hash functions is 4 and the false positive
+  // rate is below 10%.
+  if (kangarooConfig.isBloomFilterEnabled()) {
+    // We set 4 hash functions unconditionally. This seems to be the best
+    // for our use case.
If BF size to bucket size ratio gets lower, try + // to reduce number of hashes. + constexpr uint32_t kNumHashes = 4; + const uint32_t bitsPerHash = + kangarooConfig.getBucketBfSize() * 8 / kNumHashes; + kangaroo->setBloomFilter(kNumHashes, bitsPerHash); + } + + if (kangarooConfig.getLogSizePct()) { + const uint64_t logSize = alignDown( + kangarooCacheSize * kangarooConfig.getLogSizePct() / 100ul, + bucketSize * 64); + const uint32_t threshold = kangarooConfig.getLogThreshold(); + const uint64_t indexPerPhysical = kangarooConfig.getIndexPerPhysicalPartitions(); + const uint64_t physical = kangarooConfig.getPhysicalPartitions(); + kangaroo->setLog(logSize, threshold, physical, indexPerPhysical); + } + + proto.setKangaroo(std::move(kangaroo), kangarooConfig.getSmallItemMaxSize()); + + if (kangarooCacheOffset <= metadataSize) { + throw std::invalid_argument("NVM cache size is not big enough!"); + } + XLOG(INFO) << "metadataSize: " << metadataSize + << " kangarooCacheOffset: " << kangarooCacheOffset + << " kangarooCacheSize: " << kangarooCacheSize; + return kangarooCacheOffset; +} + void setupBlockCache(const navy::BlockCacheConfig& blockCacheConfig, uint64_t blockCacheSize, uint32_t ioAlignSize, @@ -204,6 +264,16 @@ void setupCacheProtos(const navy::NavyConfig& config, XLOG(INFO) << "metadataSize: " << metadataSize << ". No bighash."; blockCacheSize = totalCacheSize - metadataSize; } + + // Set up Kangaroo if enabled + if (config.isKangarooEnabled()) { + auto kangarooCacheOffset = setupKangaroo(config.kangaroo(), ioAlignSize, + totalCacheSize, metadataSize, proto); + blockCacheSize = kangarooCacheOffset - metadataSize; + } else { + XLOG(INFO) << "metadataSize: " << metadataSize << ". No kangaroo."; + blockCacheSize = totalCacheSize - metadataSize; + } // Set up BlockCache if enabled if (blockCacheSize > 0) { diff --git a/cachelib/cachebench/cache/Cache-inl.h b/cachelib/cachebench/cache/Cache-inl.h index f27a4107f9..19d93e63b3 100644 --- a/cachelib/cachebench/cache/Cache-inl.h +++ b/cachelib/cachebench/cache/Cache-inl.h @@ -184,6 +184,17 @@ Cache::Cache(const CacheConfig& config, config_.navyBloomFilterPerBucketSize, config_.navySmallItemMaxSize); } + + if (config_.navyKangarooSizePct > 0) { + nvmConfig.navyConfig.setKangaroo(config_.navyKangarooSizePct, + config_.navyKangarooBucketSize, + config_.navyBloomFilterPerBucketSize, + config_.navySmallItemMaxSize, + config_.navyKangarooLogSizePct, + config_.navyKangarooLogThreshold, + config_.navyKangarooLogPhysicalPartitions, + config_.navyKangarooLogIndexPerPhysicalPartitions); + } nvmConfig.navyConfig.setMaxParcelMemoryMB(config_.navyParcelMemoryMB); @@ -524,6 +535,7 @@ Stats Cache::getStats() const { ret.numNvmItems = lookup("navy_bh_items") + lookup("navy_bc_items"); ret.numNvmBytesWritten = lookup("navy_device_bytes_written"); uint64_t now = fetchNandWrites(); + if (now > nandBytesBegin_) { ret.numNvmNandBytesWritten = now - nandBytesBegin_; } diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h index 457a47b94b..b443e7b81d 100644 --- a/cachelib/cachebench/cache/Cache.h +++ b/cachelib/cachebench/cache/Cache.h @@ -347,6 +347,7 @@ class Cache { // reading of the nand bytes written for the benchmark if enabled. 
   const uint64_t nandBytesBegin_{0};
 
+  // running count of bytes written during the benchmark
+  uint64_t writtenBytes_{0};
 
   // latency stats of cachelib APIs inside cachebench
   mutable util::PercentileStats cacheFindLatency_;
diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp
index 13ae93f257..a3cb2363c5 100644
--- a/cachelib/cachebench/util/CacheConfig.cpp
+++ b/cachelib/cachebench/util/CacheConfig.cpp
@@ -74,6 +74,7 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) {
   JSONSetVal(configJson, navyWriterThreads);
   JSONSetVal(configJson, navyCleanRegions);
   JSONSetVal(configJson, navyAdmissionWriteRateMB);
+  JSONSetVal(configJson, navyAdmissionProb);
   JSONSetVal(configJson, navyMaxConcurrentInserts);
   JSONSetVal(configJson, navyDataChecksum);
   JSONSetVal(configJson, navyNumInmemBuffers);
@@ -85,6 +86,13 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) {
   JSONSetVal(configJson, mlNvmAdmissionPolicy);
   JSONSetVal(configJson, mlNvmAdmissionPolicyLocation);
+
+  JSONSetVal(configJson, navyKangarooSizePct);
+  JSONSetVal(configJson, navyKangarooBucketSize);
+  JSONSetVal(configJson, navyKangarooLogSizePct);
+  JSONSetVal(configJson, navyKangarooLogThreshold);
+  JSONSetVal(configJson, navyKangarooLogPhysicalPartitions);
+  JSONSetVal(configJson, navyKangarooLogIndexPerPhysicalPartitions);
 
   JSONSetVal(configJson, useTraceTimeStamp);
   JSONSetVal(configJson, printNvmCounters);
@@ -95,7 +103,7 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) {
   // if you added new fields to the configuration, update the JSONSetVal
   // to make them available for the json configs and increment the size
   // below
-  checkCorrectSize();
+  checkCorrectSize();
 
   if (numPools != poolSizes.size()) {
     throw std::invalid_argument(folly::sformat(
diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h
index 7d3a524bd7..447f602a86 100644
--- a/cachelib/cachebench/util/CacheConfig.h
+++ b/cachelib/cachebench/util/CacheConfig.h
@@ -138,6 +138,26 @@ struct CacheConfig : public JSONConfig {
   // bucket size for BigHash. This controls the write amplification for small
   // objects in Navy. Every small object write performs a RMW for a bucket.
   uint64_t navyBigHashBucketSize = 4096;
+
+  // Determines how much of the device is given to the Kangaroo engine in Navy
+  uint64_t navyKangarooSizePct = 0;
+
+  // Kangaroo bucket size determines how big each bucket is and what the
+  // physical write granularity onto the device is.
+  uint64_t navyKangarooBucketSize = 4096;
+
+  // Determines how much of Kangaroo's space is given to the Kangaroo log
+  uint64_t navyKangarooLogSizePct = 0;
+
+  // Kangaroo hit threshold for objects to be moved from the log to a bucket
+  uint64_t navyKangarooLogThreshold = 2;
+
+  // Number of physical partitions for the log; makes indexing smaller and
+  // increases buffer space
+  uint64_t navyKangarooLogPhysicalPartitions = 1;
+
+  // Number of index partitions per physical partition; allows a smaller index
+  uint64_t navyKangarooLogIndexPerPhysicalPartitions = 1;
 
   // Big Hash bloom filter size in bytes per bucket above.
   uint64_t navyBloomFilterPerBucketSize = 8;
@@ -169,6 +189,8 @@ struct CacheConfig : public JSONConfig {
   // disabled when value is 0
   uint32_t navyAdmissionWriteRateMB{0};
 
+  // probability with which an item is admitted to Navy when set;
+  // disabled when value is 0
+  double navyAdmissionProb{0};
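+
+  // A hypothetical cachebench JSON fragment exercising the Kangaroo knobs
+  // above (values are illustrative, not recommended defaults):
+  //
+  //   "navyKangarooSizePct": 40,
+  //   "navyKangarooBucketSize": 4096,
+  //   "navyKangarooLogSizePct": 10,
+  //   "navyKangarooLogThreshold": 2,
+  //   "navyKangarooLogPhysicalPartitions": 1,
+  //   "navyKangarooLogIndexPerPhysicalPartitions": 4
+
   // maximum pending inserts before rejecting new inserts.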
   uint32_t navyMaxConcurrentInserts{1000000};
 
diff --git a/cachelib/navy/CMakeLists.txt b/cachelib/navy/CMakeLists.txt
index 30c69c6727..ea6a2c72b9 100644
--- a/cachelib/navy/CMakeLists.txt
+++ b/cachelib/navy/CMakeLists.txt
@@ -36,6 +36,16 @@ add_library (cachelib_navy
     common/Types.cpp
     driver/Driver.cpp
     Factory.cpp
+    kangaroo/ChainedLogIndex.cpp
+    kangaroo/Kangaroo.cpp
+    kangaroo/KangarooLog.cpp
+    kangaroo/KangarooLogSegment.cpp
+    kangaroo/KangarooBucketStorage.cpp
+    kangaroo/LogBucket.cpp
+    kangaroo/RripBucket.cpp
+    kangaroo/RripBitVector.cpp
+    kangaroo/RripBucketStorage.cpp
+    kangaroo/KangarooSizeDistribution.cpp
     scheduler/ThreadPoolJobScheduler.cpp
     scheduler/ThreadPoolJobQueue.cpp
     serialization/RecordIO.cpp
diff --git a/cachelib/navy/Factory.cpp b/cachelib/navy/Factory.cpp
index ee121c282c..a4651ab43d 100644
--- a/cachelib/navy/Factory.cpp
+++ b/cachelib/navy/Factory.cpp
@@ -28,6 +28,7 @@
 #include "cachelib/navy/block_cache/LruPolicy.h"
 #include "cachelib/navy/block_cache/PercentageReinsertionPolicy.h"
 #include "cachelib/navy/driver/Driver.h"
+#include "cachelib/navy/kangaroo/Kangaroo.h"
 #include "cachelib/navy/serialization/RecordIO.h"
 
 namespace facebook {
@@ -174,6 +175,67 @@ class BigHashProtoImpl final : public BigHashProto {
   uint32_t hashTableBitSize_{};
 };
 
+class KangarooProtoImpl final : public KangarooProto {
+ public:
+  KangarooProtoImpl() = default;
+  ~KangarooProtoImpl() override = default;
+
+  void setLayout(uint64_t baseOffset,
+                 uint64_t size,
+                 uint32_t bucketSize) override {
+    config_.cacheBaseOffset = baseOffset;
+    config_.totalSetSize = size;
+    config_.bucketSize = bucketSize;
+  }
+
+  void setBloomFilter(uint32_t numHashes, uint32_t hashTableBitSize) override {
+    // Want to make @setLayout and Bloom filter setup independent.
+    bloomFilterEnabled_ = true;
+    numHashes_ = numHashes;
+    hashTableBitSize_ = hashTableBitSize;
+  }
+
+  void setLog(uint64_t logSize,
+              uint32_t threshold,
+              uint32_t physicalPartitions,
+              uint32_t indexPartitionsPerPhysical) override {
+    config_.logConfig.logSize = logSize;
+    // The log is carved out of the tail end of Kangaroo's space.
+    config_.totalSetSize = config_.totalSetSize - logSize;
+    config_.logConfig.logBaseOffset =
+        config_.cacheBaseOffset + config_.totalSetSize;
+    config_.logConfig.threshold = threshold;
+    config_.logIndexPartitionsPerPhysical = indexPartitionsPerPhysical;
+    config_.logConfig.logPhysicalPartitions = physicalPartitions;
+    // Round down so index buckets divide evenly among index partitions.
+    config_.logConfig.numTotalIndexBuckets =
+        config_.numBuckets() -
+        config_.numBuckets() % (indexPartitionsPerPhysical * physicalPartitions);
+  }
+
+  void setDevice(Device* device) { config_.device = device; }
+
+  void setDestructorCb(DestructorCallback cb) {
+    config_.destructorCb = std::move(cb);
+  }
+
+  std::unique_ptr<Engine> create() && {
+    if (bloomFilterEnabled_) {
+      if (config_.bucketSize == 0) {
+        throw std::invalid_argument{"invalid bucket size"};
+      }
+      config_.bloomFilter = std::make_unique<BloomFilter>(
+          config_.numBuckets(), numHashes_, hashTableBitSize_);
+    }
+    config_.rripBitVector =
+        std::make_unique<RripBitVector>(config_.numBuckets());
+    return std::make_unique<Kangaroo>(std::move(config_));
+  }
+
+ private:
+  Kangaroo::Config config_;
+  bool bloomFilterEnabled_{false};
+  bool nruEnabled_{false};
+  uint32_t numHashes_{};
+  uint32_t hashTableBitSize_{};
+};
+
 class CacheProtoImpl final : public CacheProto {
  public:
   CacheProtoImpl() = default;
@@ -203,6 +265,12 @@ class CacheProtoImpl final : public CacheProto {
     config_.smallItemMaxSize = smallItemMaxSize;
   }
 
+  void setKangaroo(std::unique_ptr<KangarooProto> proto,
+                   uint32_t smallItemMaxSize) override {
+    kangarooProto_ = std::move(proto);
+    config_.smallItemMaxSize = smallItemMaxSize;
+  }
+
   void setDestructorCallback(DestructorCallback cb) override {
     destructorCb_ = std::move(cb);
   }
@@ -268,6 +336,15 @@ class CacheProtoImpl final : public CacheProto {
       config_.smallItemCache = std::move(*bhProto).create();
     }
   }
+
+  if (kangarooProto_) {
+    auto kangarooProto = dynamic_cast<KangarooProtoImpl*>(kangarooProto_.get());
+    if (kangarooProto != nullptr) {
+      kangarooProto->setDevice(config_.device.get());
+      kangarooProto->setDestructorCb(destructorCb_);
+      config_.smallItemCache = std::move(*kangarooProto).create();
+    }
+  }
 
   return std::make_unique<Driver>(std::move(config_));
 }
@@ -276,6 +353,7 @@ class CacheProtoImpl final : public CacheProto {
   DestructorCallback destructorCb_;
   std::unique_ptr<BlockCacheProto> blockCacheProto_;
   std::unique_ptr<BigHashProto> bigHashProto_;
+  std::unique_ptr<KangarooProto> kangarooProto_;
   Driver::Config config_;
 };
 // Open cache file @fileName and set its size to @size.
@@ -331,6 +409,10 @@ std::unique_ptr<BigHashProto> createBigHashProto() {
   return std::make_unique<BigHashProtoImpl>();
 }
 
+std::unique_ptr<KangarooProto> createKangarooProto() {
+  return std::make_unique<KangarooProtoImpl>();
+}
+
 std::unique_ptr<CacheProto> createCacheProto() {
   return std::make_unique<CacheProtoImpl>();
 }
diff --git a/cachelib/navy/Factory.h b/cachelib/navy/Factory.h
index 72dbfa3439..fc3111b54e 100644
--- a/cachelib/navy/Factory.h
+++ b/cachelib/navy/Factory.h
@@ -110,6 +110,33 @@ class BigHashProto {
                               uint32_t hashTableBitSize) = 0;
 };
 
+// Kangaroo engine proto. Kangaroo is used to cache small objects (under 2KB)
+// more efficiently than BigHash by itself.
+// User sets up this proto object and passes it to CacheProto::setKangaroo.
+class KangarooProto {
+ public:
+  virtual ~KangarooProto() = default;
+
+  // Set cache layout. Cache will start at @baseOffset and will be @size bytes
+  // on the device. Kangaroo divides its device space into a number of fixed
+  // size buckets, represented by @bucketSize. All IO happens on bucket-size
+  // granularity.
+  virtual void setLayout(uint64_t baseOffset,
+                         uint64_t size,
+                         uint32_t bucketSize) = 0;
+
+  // Enable Bloom filter with @numHashes hash functions, each mapped into a
+  // bit array of @hashTableBitSize bits.
+  virtual void setBloomFilter(uint32_t numHashes,
+                              uint32_t hashTableBitSize) = 0;
+
+  // Dedicate part of the cache space to a log.
+  virtual void setLog(uint64_t logSize,
+                      uint32_t threshold,
+                      uint32_t physicalPartitions,
+                      uint32_t indexPartitionsPerPhysical) = 0;
+};
+
 // Cache object prototype. Setup cache desired parameters and pass proto to
 // @createCache function.
 class CacheProto {
@@ -136,6 +163,10 @@ class CacheProto {
   virtual void setBigHash(std::unique_ptr<BigHashProto> proto,
                           uint32_t smallItemMaxSize) = 0;
 
+  // Set up the Kangaroo engine.
+  virtual void setKangaroo(std::unique_ptr<KangarooProto> proto,
+                           uint32_t smallItemMaxSize) = 0;
+
   // Set JobScheduler for async function calls.
   virtual void setJobScheduler(std::unique_ptr<JobScheduler> ex) = 0;
@@ -172,6 +203,9 @@ std::unique_ptr<BlockCacheProto> createBlockCacheProto();
 // Creates BigHash engine prototype.
 std::unique_ptr<BigHashProto> createBigHashProto();
 
+// Creates Kangaroo engine prototype.
+std::unique_ptr<KangarooProto> createKangarooProto();
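+
+// End-to-end wiring sketch (hypothetical driver code; sizes and values are
+// examples only, based on the declarations above):
+//
+//   auto proto = createCacheProto();
+//   auto kangaroo = createKangarooProto();
+//   kangaroo->setLayout(0 /* baseOffset */, 1024 * 4096 /* size */, 4096);
+//   kangaroo->setBloomFilter(4 /* hashes */, 16 /* bits per hash */);
+//   kangaroo->setLog(64 * 4096 /* logSize */, 2 /* threshold */, 1, 1);
+//   proto->setKangaroo(std::move(kangaroo), 2048 /* smallItemMaxSize */);
+
 // Creates Cache object prototype.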
 std::unique_ptr<CacheProto> createCacheProto();
diff --git a/cachelib/navy/kangaroo/ChainedLogIndex.cpp b/cachelib/navy/kangaroo/ChainedLogIndex.cpp
new file mode 100644
index 0000000000..aa06b68ec1
--- /dev/null
+++ b/cachelib/navy/kangaroo/ChainedLogIndex.cpp
@@ -0,0 +1,254 @@
+#include <mutex>
+#include <shared_mutex>
+
+#include <folly/lang/Bits.h>
+
+#include "cachelib/navy/kangaroo/ChainedLogIndex.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+
+void ChainedLogIndex::allocate() {
+  // Caller must hold allocationMutex_.
+  numAllocations_++;
+  allocations.resize(numAllocations_);
+  allocations[numAllocations_ - 1] = new ChainedLogIndexEntry[allocationSize_];
+}
+
+ChainedLogIndex::ChainedLogIndex(uint64_t numHashBuckets,
+                                 uint16_t allocationSize,
+                                 SetNumberCallback setNumberCb)
+    : numMutexes_{folly::nextPowTwo(numHashBuckets / 10 + 1)},
+      numHashBuckets_{numHashBuckets},
+      setNumberCb_{setNumberCb},
+      allocationSize_{allocationSize} {
+  mutexes_ = std::make_unique<folly::SharedMutex[]>(numMutexes_);
+  index_.resize(numHashBuckets_, -1);
+  {
+    std::unique_lock lock{allocationMutex_};
+    allocate();
+  }
+}
+
+ChainedLogIndex::~ChainedLogIndex() {
+  std::unique_lock lock{allocationMutex_};
+  for (uint64_t i = 0; i < numAllocations_; i++) {
+    delete[] allocations[i];
+  }
+}
+
+ChainedLogIndexEntry* ChainedLogIndex::findEntryNoLock(uint16_t offset) {
+  uint16_t arrayOffset = offset % allocationSize_;
+  uint16_t vectorOffset = offset / allocationSize_;
+  if (vectorOffset >= numAllocations_) {
+    return nullptr;
+  }
+  return &allocations[vectorOffset][arrayOffset];
+}
+
+ChainedLogIndexEntry* ChainedLogIndex::findEntry(uint16_t offset) {
+  std::shared_lock lock{allocationMutex_};
+  return findEntryNoLock(offset);
+}
+
+ChainedLogIndexEntry* ChainedLogIndex::allocateEntry(uint16_t& offset) {
+  std::unique_lock lock{allocationMutex_};
+  if (nextEmpty_ >= numAllocations_ * allocationSize_) {
+    allocate();
+  }
+  offset = nextEmpty_;
+  ChainedLogIndexEntry* entry = findEntryNoLock(offset);
+  if (nextEmpty_ == maxSlotUsed_) {
+    nextEmpty_++;
+    maxSlotUsed_++;
+  } else {
+    nextEmpty_ = entry->next_;
+  }
+  entry->next_ = -1;
+  return entry;
+}
+
+uint16_t ChainedLogIndex::releaseEntry(uint16_t offset) {
+  std::unique_lock lock{allocationMutex_};
+  ChainedLogIndexEntry* entry = findEntryNoLock(offset);
+  uint16_t ret = entry->next_;
+  entry->invalidate();
+  entry->next_ = nextEmpty_;
+  nextEmpty_ = offset;
+  return ret;
+}
+
+PartitionOffset ChainedLogIndex::lookup(HashedKey hk, bool hit, uint32_t* hits) {
+  const auto lib = getLogIndexBucket(hk);
+  uint32_t tag = createTag(hk);
+  {
+    std::shared_lock lock{getMutex(lib)};
+    ChainedLogIndexEntry* currentHead = findEntry(index_[lib.index()]);
+    while (currentHead) {
+      if (currentHead->isValid() && currentHead->tag() == tag) {
+        if (hit) {
+          currentHead->incrementHits();
+        }
+        if (hits != nullptr) {
+          *hits = currentHead->hits();
+        }
+        return currentHead->offset();
+      }
+      currentHead = findEntry(currentHead->next());
+    }
+  }
+  if (hits != nullptr) {
+    *hits = 0;
+  }
+  return PartitionOffset(0, false);
+}
+
+Status ChainedLogIndex::insert(HashedKey hk, PartitionOffset po, uint8_t hits) {
+  const auto lib = getLogIndexBucket(hk);
+  uint32_t tag = createTag(hk);
+  return insert(tag, lib, po, hits);
+}
+
+Status ChainedLogIndex::insert(uint32_t tag, KangarooBucketId bid,
+                               PartitionOffset po, uint8_t hits) {
+  const auto lib = getLogIndexBucketFromSetBucket(bid);
+  return insert(tag, lib, po, hits);
+}
+
+Status ChainedLogIndex::insert(uint32_t tag, LogIndexBucket lib,
+                               PartitionOffset po, uint8_t hits) {
+  {
+    std::unique_lock lock{getMutex(lib)};
+
+    uint16_t* oldNext = &index_[lib.index()];
+    ChainedLogIndexEntry* nextEntry = findEntry(index_[lib.index()]);
+    while (nextEntry) {
+      if (nextEntry->isValid() && nextEntry->tag() == tag) {
+        nextEntry->populateEntry(po, tag, hits);
+        return Status::Ok;
+      }
+      oldNext = &nextEntry->next_;
+      nextEntry = findEntry(*oldNext);
+    }
+
+    uint16_t entryOffset;
+    ChainedLogIndexEntry* newEntry = allocateEntry(entryOffset);
+    newEntry->populateEntry(po, tag, hits);
+    (*oldNext) = entryOffset;
+  }
+  return Status::Ok;
+}
+
+Status ChainedLogIndex::remove(HashedKey hk, PartitionOffset po) {
+  uint64_t tag = createTag(hk);
+  const auto lib = getLogIndexBucket(hk);
+  return remove(tag, lib, po);
+}
+
+Status ChainedLogIndex::remove(uint64_t tag, KangarooBucketId bid,
+                               PartitionOffset po) {
+  auto lib = getLogIndexBucketFromSetBucket(bid);
+  return remove(tag, lib, po);
+}
+
+Status ChainedLogIndex::remove(uint64_t tag, LogIndexBucket lib,
+                               PartitionOffset po) {
+  {
+    std::unique_lock lock{getMutex(lib)};
+    ChainedLogIndexEntry* nextEntry = findEntry(index_[lib.index()]);
+    uint16_t* oldNext = &index_[lib.index()];
+    while (nextEntry) {
+      if (nextEntry->isValid() && nextEntry->tag() == tag &&
+          nextEntry->offset() == po) {
+        *oldNext = releaseEntry(*oldNext);
+        return Status::Ok;
+      }
+      oldNext = &nextEntry->next_;
+      nextEntry = findEntry(nextEntry->next_);
+    }
+  }
+  return Status::NotFound;
+}
+
+// Counts the number of items in the log corresponding to the bucket
+uint64_t ChainedLogIndex::countBucket(HashedKey hk) {
+  const auto lib = getLogIndexBucket(hk);
+  uint64_t count = 0;
+  {
+    std::shared_lock lock{getMutex(lib)};
+    ChainedLogIndexEntry* nextEntry = findEntry(index_[lib.index()]);
+    while (nextEntry) {
+      if (nextEntry->isValid()) {
+        count++;
+      }
+      nextEntry = findEntry(nextEntry->next_);
+    }
+  }
+  return count;
+}
+
+// Get iterator for all items in the same bucket
+ChainedLogIndex::BucketIterator ChainedLogIndex::getHashBucketIterator(
+    HashedKey hk) {
+  const auto lib = getLogIndexBucket(hk);
+  auto idx = setNumberCb_(hk.keyHash());
+  {
+    std::shared_lock lock{getMutex(lib)};
+    auto currentHead = findEntry(index_[lib.index()]);
+    while (currentHead) {
+      if (currentHead->isValid()) {
+        return BucketIterator(idx, currentHead);
+      }
+      currentHead = findEntry(currentHead->next_);
+    }
+  }
+  return BucketIterator();
+}
+
+ChainedLogIndex::BucketIterator ChainedLogIndex::getNext(
+    ChainedLogIndex::BucketIterator bi) {
+  if (bi.done()) {
+    return bi;
+  }
+  auto lib = getLogIndexBucketFromSetBucket(bi.bucket_);
+  {
+    std::shared_lock lock{getMutex(lib)};
+    auto currentHead = findEntry(bi.nextEntry_);
+    while (currentHead) {
+      if (currentHead->isValid()) {
+        return BucketIterator(bi.bucket_, currentHead);
+      }
+      currentHead = findEntry(currentHead->next_);
+    }
+  }
+  return BucketIterator();
+}
+
+PartitionOffset ChainedLogIndex::find(KangarooBucketId bid, uint64_t tag) {
+  auto lib = getLogIndexBucketFromSetBucket(bid);
+  {
+    std::shared_lock lock{getMutex(lib)};
+    ChainedLogIndexEntry* nextEntry = findEntry(index_[lib.index()]);
+    while (nextEntry) {
+      if (nextEntry->isValid() && nextEntry->tag() == tag) {
+        return nextEntry->offset();
+      }
+      nextEntry = findEntry(nextEntry->next_);
+    }
+  }
+  return PartitionOffset(0, false);
+}
+
+ChainedLogIndex::LogIndexBucket ChainedLogIndex::getLogIndexBucket(
+    HashedKey hk) {
+  return getLogIndexBucketFromSetBucket(setNumberCb_(hk.keyHash()));
+}
+
+ChainedLogIndex::LogIndexBucket ChainedLogIndex::getLogIndexBucket(uint64_t key) {
+  return getLogIndexBucketFromSetBucket(setNumberCb_(key));
+}
+
+ChainedLogIndex::LogIndexBucket ChainedLogIndex::getLogIndexBucketFromSetBucket(
+    KangarooBucketId bid) {
+  return LogIndexBucket(bid.index() % numHashBuckets_);
+}
+
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/navy/kangaroo/ChainedLogIndex.h b/cachelib/navy/kangaroo/ChainedLogIndex.h
new file mode 100644
index 0000000000..1e3271c3e5
--- /dev/null
+++ b/cachelib/navy/kangaroo/ChainedLogIndex.h
@@ -0,0 +1,142 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include <folly/SharedMutex.h>
+
+#include "cachelib/navy/common/Hash.h"
+#include "cachelib/navy/common/Types.h"
+#include "cachelib/navy/kangaroo/ChainedLogIndexEntry.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+// ChainedLogIndex is a hash-based log index optimized to allow easy
+// threshold lookups.
+//
+// It is primarily a chained hash table that chains entries so that
+// items in the same on-flash Kangaroo bucket end up in the
+// same hash bucket in the index. This way we avoid scans to
+// see if items can end up in the same set.
+class ChainedLogIndex {
+ public:
+  // BucketIterator gives the hashed key for each valid
+  // element corresponding to a given kangaroo bucket.
+  // Read only.
+  class BucketIterator {
+   public:
+    BucketIterator() : end_{true} {}
+
+    bool done() const { return end_; }
+
+    uint32_t tag() const { return tag_; }
+
+    uint32_t hits() const { return hits_; }
+
+    PartitionOffset offset() const { return offset_; }
+
+   private:
+    friend ChainedLogIndex;
+
+    BucketIterator(KangarooBucketId id, ChainedLogIndexEntry* firstKey)
+        : bucket_{id},
+          tag_{firstKey->tag()},
+          hits_{firstKey->hits()},
+          offset_{firstKey->offset()},
+          nextEntry_{firstKey->next_} {}
+
+    KangarooBucketId bucket_{0};
+    uint32_t tag_;
+    uint32_t hits_;
+    PartitionOffset offset_{0, 0};
+    uint16_t nextEntry_;
+    bool end_{false};
+  };
+
+  explicit ChainedLogIndex(uint64_t numHashBuckets,
+                           uint16_t allocationSize,
+                           SetNumberCallback setNumberCb);
+
+  ~ChainedLogIndex();
+
+  ChainedLogIndex(const ChainedLogIndex&) = delete;
+  ChainedLogIndex& operator=(const ChainedLogIndex&) = delete;
+
+  // Look up a key in the index.
+  // If not found, the returned offset will not be valid.
+  PartitionOffset lookup(HashedKey hk, bool hit, uint32_t* hits);
+
+  // Inserts key into index.
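+  //
+  // Hypothetical usage sketch (values illustrative; setNumberCb maps a key
+  // hash to its set bucket):
+  //
+  //   ChainedLogIndex index{1024 /* hash buckets */, 256, setNumberCb};
+  //   index.insert(hk, PartitionOffset{7, true});
+  //   uint32_t hits = 0;
+  //   PartitionOffset po = index.lookup(hk, /* hit */ true, &hits);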
+  Status insert(HashedKey hk, PartitionOffset po, uint8_t hits = 0);
+  Status insert(uint32_t tag, KangarooBucketId bid, PartitionOffset po,
+                uint8_t hits);
+
+  // Removes the entry's valid bit if it's in the log
+  Status remove(HashedKey hk, PartitionOffset lpid);
+  Status remove(uint64_t tag, KangarooBucketId bid, PartitionOffset lpid);
+
+  // Does not count as a hit; used for log flush lookups
+  PartitionOffset find(KangarooBucketId bid, uint64_t tag);
+
+  // Counts the number of items in the log corresponding to the set
+  // bucket for the hashed key
+  uint64_t countBucket(HashedKey hk);
+
+  // Get iterator for all items in the same bucket
+  BucketIterator getHashBucketIterator(HashedKey hk);
+  BucketIterator getNext(BucketIterator bi);
+
+ private:
+  friend BucketIterator;
+
+  class LogIndexBucket {
+   public:
+    explicit LogIndexBucket(uint32_t idx) : idx_{idx} {}
+
+    bool operator==(const LogIndexBucket& rhs) const noexcept {
+      return idx_ == rhs.idx_;
+    }
+    bool operator!=(const LogIndexBucket& rhs) const noexcept {
+      return !(*this == rhs);
+    }
+
+    uint32_t index() const noexcept { return idx_; }
+
+   private:
+    uint32_t idx_;
+  };
+
+  Status remove(uint64_t tag, LogIndexBucket lib, PartitionOffset lpid);
+  Status insert(uint32_t tag, LogIndexBucket lib, PartitionOffset po,
+                uint8_t hits);
+
+  LogIndexBucket getLogIndexBucket(HashedKey hk);
+  LogIndexBucket getLogIndexBucket(uint64_t hk);
+  LogIndexBucket getLogIndexBucketFromSetBucket(KangarooBucketId bid);
+
+  // Locks based on log index hash bucket; concurrent reads, single modifier.
+  // numMutexes_ is rounded up to a power of two so the mask below stays in
+  // range.
+  folly::SharedMutex& getMutex(LogIndexBucket lib) const {
+    return mutexes_[lib.index() & (numMutexes_ - 1)];
+  }
+
+  const uint64_t numMutexes_{};
+  const uint64_t numHashBuckets_{};
+  const SetNumberCallback setNumberCb_{};
+  std::unique_ptr<folly::SharedMutex[]> mutexes_;
+  std::vector<uint16_t> index_;
+
+  folly::SharedMutex allocationMutex_;
+  const uint16_t allocationSize_{};
+  uint16_t maxSlotUsed_{0};
+  uint16_t nextEmpty_{0};
+  uint16_t numAllocations_{0};
+
+  void allocate();
+  ChainedLogIndexEntry* findEntry(uint16_t offset);
+  ChainedLogIndexEntry* findEntryNoLock(uint16_t offset);
+  ChainedLogIndexEntry* allocateEntry(uint16_t& offset);
+  uint16_t releaseEntry(uint16_t offset);
+
+  std::vector<ChainedLogIndexEntry*> allocations;
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/navy/kangaroo/ChainedLogIndexEntry.h b/cachelib/navy/kangaroo/ChainedLogIndexEntry.h
new file mode 100644
index 0000000000..4248ee07a7
--- /dev/null
+++ b/cachelib/navy/kangaroo/ChainedLogIndexEntry.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "cachelib/navy/common/Hash.h"
+#include "cachelib/navy/kangaroo/Types.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+
+class ChainedLogIndex;
+
+class __attribute__((__packed__)) ChainedLogIndexEntry {
+ public:
+  ChainedLogIndexEntry() : valid_{false} {}
+  ~ChainedLogIndexEntry() = default;
+
+  bool operator==(const ChainedLogIndexEntry& rhs) const noexcept {
+    return valid_ && rhs.valid_ && tag_ == rhs.tag_;
+  }
+  bool operator!=(const ChainedLogIndexEntry& rhs) const noexcept {
+    return !(*this == rhs);
+  }
+
+  void populateEntry(PartitionOffset po, uint32_t tag, uint8_t hits) {
+    flash_index_ = po.index();
+    tag_ = tag;
+    valid_ = 1;
+    hits_ = hits;
+  }
+
+  void incrementHits() {
+    // hits_ is a 3-bit field; saturate instead of wrapping.
+    if (hits_ < ((1 << 3) - 1)) {
+      hits_++;
+    }
+  }
+  uint32_t hits() { return hits_; }
+  uint32_t tag() { return tag_; }
+  void invalidate() { valid_ = 0; }
+  bool isValid() { return valid_; }
+  PartitionOffset offset() { return PartitionOffset(flash_index_,
valid_); } + uint16_t next() { return next_; } + + private: + friend ChainedLogIndex; + + uint32_t flash_index_ : 19; + uint32_t tag_ : 9; + uint32_t valid_ : 1; + uint32_t hits_ : 3; + uint16_t next_; +}; +} // namespace navy +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/navy/kangaroo/Kangaroo.cpp b/cachelib/navy/kangaroo/Kangaroo.cpp new file mode 100644 index 0000000000..fce3dc2d6d --- /dev/null +++ b/cachelib/navy/kangaroo/Kangaroo.cpp @@ -0,0 +1,597 @@ +#include +#include +#include + +#include + +#include "cachelib/navy/kangaroo/Kangaroo.h" +#include "cachelib/navy/kangaroo/KangarooLog.h" +#include "cachelib/navy/kangaroo/RripBucket.h" +#include "cachelib/navy/common/Utils.h" +#include "cachelib/navy/serialization/Serialization.h" + +namespace facebook { +namespace cachelib { +namespace navy { +namespace { +constexpr uint64_t kMinSizeDistribution = 64; +constexpr uint64_t kMinThresholdSizeDistribution = 8; +constexpr double kSizeDistributionGranularityFactor = 1.25; +} // namespace + +constexpr uint32_t Kangaroo::kFormatVersion; + +Kangaroo::Config& Kangaroo::Config::validate() { + if (totalSetSize < bucketSize) { + throw std::invalid_argument( + folly::sformat("cache size: {} cannot be smaller than bucket size: {}", + totalSetSize, + bucketSize)); + } + + if (!folly::isPowTwo(bucketSize)) { + throw std::invalid_argument( + folly::sformat("invalid bucket size: {}", bucketSize)); + } + + if (totalSetSize > uint64_t{bucketSize} << 32) { + throw std::invalid_argument(folly::sformat( + "Can't address big hash with 32 bits. Cache size: {}, bucket size: {}", + totalSetSize, + bucketSize)); + } + + if (cacheBaseOffset % bucketSize != 0 || totalSetSize % bucketSize != 0) { + throw std::invalid_argument(folly::sformat( + "cacheBaseOffset and totalSetSize need to be a multiple of bucketSize. 
" + "cacheBaseOffset: {}, totalSetSize:{}, bucketSize: {}.", + cacheBaseOffset, + totalSetSize, + bucketSize)); + } + + if (device == nullptr) { + throw std::invalid_argument("device cannot be null"); + } + + if (rripBitVector == nullptr) { + throw std::invalid_argument("need a RRIP bit vector"); + } + + if (bloomFilter && bloomFilter->numFilters() != numBuckets()) { + throw std::invalid_argument( + folly::sformat("bloom filter #filters mismatch #buckets: {} vs {}", + bloomFilter->numFilters(), + numBuckets())); + } + + if (logConfig.logSize > 0 && avgSmallObjectSize == 0) { + throw std::invalid_argument( + folly::sformat("Need an avgSmallObjectSize for the log")); + } + return *this; +} + +Kangaroo::Kangaroo(Config&& config) + : Kangaroo{std::move(config.validate()), ValidConfigTag{}} {} + +Kangaroo::Kangaroo(Config&& config, ValidConfigTag) + : destructorCb_{[this, cb = std::move(config.destructorCb)]( + BufferView key, + BufferView value, + DestructorEvent event) { + sizeDist_.removeSize(key.size() + value.size()); + if (cb) { + cb(key, value, event); + } + }}, + bucketSize_{config.bucketSize}, + cacheBaseOffset_{config.cacheBaseOffset}, + numBuckets_{config.numBuckets()}, + bloomFilter_{std::move(config.bloomFilter)}, + bitVector_{std::move(config.rripBitVector)}, + device_{*config.device}, + sizeDist_{kMinSizeDistribution, bucketSize_, + kSizeDistributionGranularityFactor}, + thresholdSizeDist_{10, bucketSize_, 10}, + thresholdNumDist_{1, 25, 1} { + XLOGF(INFO, + "Kangaroo created: buckets: {}, bucket size: {}, base offset: {}", + numBuckets_, + bucketSize_, + cacheBaseOffset_); + if (config.logConfig.logSize) { + SetNumberCallback cb = [&](uint64_t hk) {return getKangarooBucketIdFromHash(hk);}; + config.logConfig.setNumberCallback = cb; + config.logConfig.logIndexPartitions = config.logIndexPartitionsPerPhysical * config.logConfig.logPhysicalPartitions; + uint64_t bytesPerIndex = config.logConfig.logSize / config.logConfig.logIndexPartitions; + config.logConfig.device = config.device; + config.logConfig.setMultiInsertCallback = [&](std::vector>& ois, + ReadmitCallback readmit) { return insertMultipleObjectsToKangarooBucket(ois, readmit); }; + log_ = std::make_unique(std::move(config.logConfig)); + } + reset(); +} + +void Kangaroo::reset() { + XLOG(INFO, "Reset Kangaroo"); + generationTime_ = getSteadyClock(); + + if (bloomFilter_) { + bloomFilter_->reset(); + } + + itemCount_.set(0); + insertCount_.set(0); + succInsertCount_.set(0); + lookupCount_.set(0); + succLookupCount_.set(0); + removeCount_.set(0); + succRemoveCount_.set(0); + evictionCount_.set(0); + logicalWrittenCount_.set(0); + physicalWrittenCount_.set(0); + ioErrorCount_.set(0); + bfFalsePositiveCount_.set(0); + bfProbeCount_.set(0); + checksumErrorCount_.set(0); + sizeDist_.reset(); +} + +double Kangaroo::bfFalsePositivePct() const { + const auto probes = bfProbeCount_.get(); + if (bloomFilter_ && probes > 0) { + return 100.0 * bfFalsePositiveCount_.get() / probes; + } else { + return 0; + } +} + +void Kangaroo::insertMultipleObjectsToKangarooBucket(std::vector>& ois, + ReadmitCallback readmit) { + const auto bid = getKangarooBucketId(ois[0]->key); + insertCount_.inc(); + multiInsertCalls_.inc(); + + uint64_t insertCount = 0; + uint64_t evictCount = 0; + uint64_t removedCount = 0; + + uint64_t passedItemSize = 0; + uint64_t passedCount = 0; + + + { + std::unique_lock lock{getMutex(bid)}; + auto buffer = readBucket(bid); + if (buffer.isNull()) { + ioErrorCount_.inc(); + return; + } + + auto* bucket = 
reinterpret_cast(buffer.data()); + bucket->reorder([&](uint32_t keyIdx) {return bvGetHit(bid, keyIdx);}); + bitVector_->clear(bid.index()); + + for (auto& oi: ois) { + passedItemSize += oi->key.key().size() + oi->value.size(); + passedCount++; + + if (bucket->isSpace(oi->key, oi->value.view(), oi->hits)) { + removedCount += bucket->remove(oi->key, destructorCb_); + evictCount += bucket->insert(oi->key, oi->value.view(), oi->hits, destructorCb_); + sizeDist_.addSize(oi->key.key().size() + oi->value.size()); + insertCount++; + } else { + readmit(oi); + readmitInsertCount_.inc(); + } + } + + const auto res = writeBucket(bid, std::move(buffer)); + if (!res) { + if (bloomFilter_) { + bloomFilter_->clear(bid.index()); + } + ioErrorCount_.inc(); + return; + } + + if (bloomFilter_) { + bfRebuild(bid, bucket); + } + } + + thresholdSizeDist_.addSize(passedItemSize); + thresholdNumDist_.addSize(passedCount * 2); + + setInsertCount_.add(insertCount); + itemCount_.sub(evictCount + removedCount); + logItemCount_.sub(insertCount); + setItemCount_.sub(evictCount + removedCount); + setItemCount_.add(insertCount); + evictionCount_.add(evictCount); + succInsertCount_.add(insertCount); + + physicalWrittenCount_.add(bucketSize_); + return; +} + +uint64_t Kangaroo::getMaxItemSize() const { + // does not include per item overhead + return bucketSize_ - sizeof(RripBucket); +} + +void Kangaroo::getCounters(const CounterVisitor& visitor) const { + visitor("navy_bh_items", itemCount_.get()); + visitor("navy_bh_inserts", insertCount_.get()); + visitor("navy_bh_succ_inserts", succInsertCount_.get()); + visitor("navy_bh_lookups", lookupCount_.get()); + visitor("navy_bh_succ_lookups", succLookupCount_.get()); + visitor("navy_bh_removes", removeCount_.get()); + visitor("navy_bh_succ_removes", succRemoveCount_.get()); + visitor("navy_bh_evictions", evictionCount_.get()); + visitor("navy_bh_logical_written", logicalWrittenCount_.get()); + uint64_t logBytesWritten = (log_) ? 
log_->getBytesWritten() : 0; + visitor("navy_bh_physical_written", physicalWrittenCount_.get() + logBytesWritten); + visitor("navy_bh_io_errors", ioErrorCount_.get()); + visitor("navy_bh_bf_false_positive_pct", bfFalsePositivePct()); + visitor("navy_bh_checksum_errors", checksumErrorCount_.get()); + if (log_) { + visitor("navy_klog_false_positive_pct", log_->falsePositivePct()); + visitor("navy_klog_fragmentation_pct", log_->fragmentationPct()); + visitor("navy_klog_extra_reads_pct", log_->extraReadsPct()); + } + auto snapshot = sizeDist_.getSnapshot(); + for (auto& kv : snapshot) { + auto statName = folly::sformat("navy_bh_approx_bytes_in_size_{}", kv.first); + visitor(statName.c_str(), kv.second); + } +} + +void Kangaroo::persist(RecordWriter& rw) { + XLOG(INFO, "Starting kangaroo persist"); + serialization::BigHashPersistentData pd; + pd.version = kFormatVersion; + pd.generationTime = generationTime_.count(); + pd.itemCount = itemCount_.get(); + pd.bucketSize = bucketSize_; + pd.cacheBaseOffset = cacheBaseOffset_; + pd.numBuckets = numBuckets_; + *pd.sizeDist_ref() = sizeDist_.getSnapshot(); + serializeProto(pd, rw); + + if (bloomFilter_) { + bloomFilter_->persist(rw); + XLOG(INFO, "bloom filter persist done"); + } + + XLOG(INFO, "Finished kangaroo persist"); +} + +bool Kangaroo::recover(RecordReader& rr) { + XLOG(INFO, "Starting kangaroo recovery"); + try { + auto pd = deserializeProto(rr); + if (pd.version != kFormatVersion) { + throw std::logic_error{ + folly::sformat("invalid format version {}, expected {}", + pd.version, + kFormatVersion)}; + } + + auto configEquals = + static_cast(pd.bucketSize) == bucketSize_ && + static_cast(pd.cacheBaseOffset) == cacheBaseOffset_ && + static_cast(pd.numBuckets) == numBuckets_; + if (!configEquals) { + auto configStr = serializeToJson(pd); + XLOGF(ERR, "Recovery config: {}", configStr.c_str()); + throw std::logic_error{"config mismatch"}; + } + + generationTime_ = std::chrono::nanoseconds{pd.generationTime}; + itemCount_.set(pd.itemCount); + sizeDist_ = SizeDistribution{*pd.sizeDist_ref()}; + if (bloomFilter_) { + bloomFilter_->recover(rr); + XLOG(INFO, "Recovered bloom filter"); + } + } catch (const std::exception& e) { + XLOGF(ERR, "Exception: {}", e.what()); + XLOG(ERR, "Failed to recover kangaroo. 
Resetting cache."); + + reset(); + return false; + } + XLOG(INFO, "Finished kangaroo recovery"); + return true; +} + +Status Kangaroo::insert(HashedKey hk, + BufferView value) { + const auto bid = getKangarooBucketId(hk); + insertCount_.inc(); + + if (log_) { + Status ret = log_->insert(hk, value); + if (ret == Status::Ok) { + sizeDist_.addSize(hk.key().size() + value.size()); + succInsertCount_.inc(); + } + logicalWrittenCount_.add(hk.key().size() + value.size()); + logInsertCount_.inc(); + itemCount_.inc(); + logItemCount_.inc(); + return ret; + } + + + unsigned int removed{0}; + unsigned int evicted{0}; + bool space; + + { + std::unique_lock lock{getMutex(bid)}; + auto buffer = readBucket(bid); + if (buffer.isNull()) { + ioErrorCount_.inc(); + return Status::DeviceError; + } + + auto* bucket = reinterpret_cast(buffer.data()); + bucket->reorder([&](uint32_t keyIdx) {return bvGetHit(bid, keyIdx);}); + space = bucket->isSpace(hk, value, 0); + if (!space) { + // no need to rewrite bucket + removed = 0; + evicted = 1; + } else { + bitVector_->clear(bid.index()); + removed = bucket->remove(hk, destructorCb_); + evicted = bucket->insert(hk, value, 0, destructorCb_); + } + + if (space) { + const auto res = writeBucket(bid, std::move(Buffer(buffer.view(), bucketSize_))); + if (!res) { + if (bloomFilter_) { + bloomFilter_->clear(bid.index()); + } + ioErrorCount_.inc(); + return Status::DeviceError; + } + } + + if (space && bloomFilter_) { + if (removed + evicted == 0) { + // In case nothing was removed or evicted, we can just add + bloomFilter_->set(bid.index(), hk.keyHash()); + } else { + bfRebuild(bid, bucket); + } + } + } + + sizeDist_.addSize(hk.key().size() + value.size()); + itemCount_.add(1); + setItemCount_.inc(); + itemCount_.sub(evicted + removed); + setItemCount_.sub(evicted + removed); + evictionCount_.add(evicted); + logicalWrittenCount_.add(hk.key().size() + value.size()); + setInsertCount_.inc(); + if (space) { + // otherwise was not written + physicalWrittenCount_.add(bucketSize_); + } + succInsertCount_.inc(); + return Status::Ok; +} + +Status Kangaroo::lookup(HashedKey hk, Buffer& value) { + const auto bid = getKangarooBucketId(hk); + lookupCount_.inc(); + + // first check log if it exists + if (log_) { + Status ret = log_->lookup(hk, value); + if (ret == Status::Ok) { + succLookupCount_.inc(); + logHits_.inc(); + return ret; + } + } + + RripBucket* bucket{nullptr}; + Buffer buffer; + BufferView valueView; + // scope of the lock is only needed until we read and mutate state for the + // bucket. Once the bucket is read, the buffer is local and we can find + // without holding the lock. 
+ { + std::shared_lock lock{getMutex(bid)}; + + if (bfReject(bid, hk.keyHash())) { + return Status::NotFound; + } + + buffer = readBucket(bid); + if (buffer.isNull()) { + ioErrorCount_.inc(); + return Status::DeviceError; + } + + bucket = reinterpret_cast(buffer.data()); + + /* TODO: moving this inside lock could cause performance problem */ + valueView = bucket->find(hk, [&](uint32_t keyIdx) {bvSetHit(bid, keyIdx);}); + } + + if (valueView.isNull()) { + bfFalsePositiveCount_.inc(); + return Status::NotFound; + } + value = Buffer{valueView}; + succLookupCount_.inc(); + setHits_.inc(); + return Status::Ok; +} + +Status Kangaroo::remove(HashedKey hk) { + const auto bid = getKangarooBucketId(hk); + removeCount_.inc(); + + if (log_) { + Status ret = log_->remove(hk); + if (ret == Status::Ok) { + succRemoveCount_.inc(); + itemCount_.dec(); + logItemCount_.dec(); + return ret; + } + } + + { + std::unique_lock lock{getMutex(bid)}; + if (bfReject(bid, hk.keyHash())) { + return Status::NotFound; + } + + auto buffer = readBucket(bid); + if (buffer.isNull()) { + ioErrorCount_.inc(); + return Status::DeviceError; + } + + auto* bucket = reinterpret_cast(buffer.data()); + bucket->reorder([&](uint32_t keyIdx) {return bvGetHit(bid, keyIdx);}); + + if (!bucket->remove(hk, destructorCb_)) { + bfFalsePositiveCount_.inc(); + return Status::NotFound; + } + + const auto res = writeBucket(bid, std::move(buffer)); + if (!res) { + if (bloomFilter_) { + bloomFilter_->clear(bid.index()); + } + ioErrorCount_.inc(); + return Status::DeviceError; + } + + if (bloomFilter_) { + bfRebuild(bid, bucket); + } + bitVector_->clear(bid.index()); + } + + itemCount_.dec(); + setItemCount_.dec(); + + // We do not bump logicalWrittenCount_ because logically a + // remove operation does not write, but for Kangaroo, it does + // incur physical writes. + physicalWrittenCount_.add(bucketSize_); + succRemoveCount_.inc(); + return Status::Ok; +} + +bool Kangaroo::couldExist(HashedKey hk) { + const auto bid = getKangarooBucketId(hk); + bool canExist = false; + + if (log_) { + canExist = log_->couldExist(hk); + } + + if (!canExist) { + std::shared_lock lock{getMutex(bid)}; + canExist = !bfReject(bid, hk.keyHash()); + } + + // the caller is not likely to issue a subsequent lookup when we return + // false. hence tag this as a lookup. If we return the key can exist, the + // caller will perform a lookupAsync and will be counted within lookup api. 
+  if (!canExist) {
+    lookupCount_.inc();
+  }
+  return canExist;
+}
+
+bool Kangaroo::bfReject(KangarooBucketId bid, uint64_t keyHash) const {
+  if (bloomFilter_) {
+    bfProbeCount_.inc();
+    if (!bloomFilter_->couldExist(bid.index(), keyHash)) {
+      bfRejectCount_.inc();
+      return true;
+    }
+  }
+  return false;
+}
+
+bool Kangaroo::bvGetHit(KangarooBucketId bid, uint32_t keyIdx) const {
+  if (bitVector_) {
+    return bitVector_->get(bid.index(), keyIdx);
+  }
+  return false;
+}
+
+void Kangaroo::bvSetHit(KangarooBucketId bid, uint32_t keyIdx) const {
+  if (bitVector_) {
+    bitVector_->set(bid.index(), keyIdx);
+  }
+}
+
+void Kangaroo::bfRebuild(KangarooBucketId bid, const RripBucket* bucket) {
+  XDCHECK(bloomFilter_);
+  bloomFilter_->clear(bid.index());
+  auto itr = bucket->getFirst();
+  while (!itr.done()) {
+    bloomFilter_->set(bid.index(), itr.keyHash());
+    itr = bucket->getNext(itr);
+  }
+}
+
+void Kangaroo::flush() {
+  XLOG(INFO, "Flush Kangaroo");
+  device_.flush();
+}
+
+Buffer Kangaroo::readBucket(KangarooBucketId bid) {
+  auto buffer = device_.makeIOBuffer(bucketSize_);
+  XDCHECK(!buffer.isNull());
+
+  const bool res =
+      device_.read(getBucketOffset(bid), buffer.size(), buffer.data());
+  if (!res) {
+    return {};
+  }
+
+  auto* bucket = reinterpret_cast<RripBucket*>(buffer.data());
+
+  const auto checksumSuccess =
+      RripBucket::computeChecksum(buffer.view()) == bucket->getChecksum();
+  // We can only know for certain this is a valid checksum error if the bloom
+  // filter is already initialized. Otherwise, it could very well be because
+  // we're reading the bucket for the first time.
+  if (!checksumSuccess && bloomFilter_) {
+    checksumErrorCount_.inc();
+  }
+
+  if (!checksumSuccess || static_cast<uint64_t>(generationTime_.count()) !=
+                              bucket->generationTime()) {
+    RripBucket::initNew(buffer.mutableView(), generationTime_.count());
+  }
+  return buffer;
+}
+
+bool Kangaroo::writeBucket(KangarooBucketId bid, Buffer buffer) {
+  auto* bucket = reinterpret_cast<RripBucket*>(buffer.data());
+  bucket->setChecksum(RripBucket::computeChecksum(buffer.view()));
+  return device_.write(getBucketOffset(bid), std::move(buffer));
+}
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/navy/kangaroo/Kangaroo.h b/cachelib/navy/kangaroo/Kangaroo.h
new file mode 100644
index 0000000000..55aa29ee8f
--- /dev/null
+++ b/cachelib/navy/kangaroo/Kangaroo.h
@@ -0,0 +1,214 @@
+#pragma once
+
+#include <chrono>
+#include <memory>
+
+#include <folly/SharedMutex.h>
+
+#include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/BloomFilter.h"
+#include "cachelib/navy/common/Buffer.h"
+#include "cachelib/navy/common/Device.h"
+#include "cachelib/navy/common/Hash.h"
+#include "cachelib/navy/common/SizeDistribution.h"
+#include "cachelib/navy/common/Types.h"
+#include "cachelib/navy/engine/Engine.h"
+#include "cachelib/navy/kangaroo/LogBucket.h"
+#include "cachelib/navy/kangaroo/KangarooLog.h"
+#include "cachelib/navy/kangaroo/KangarooSizeDistribution.h"
+#include "cachelib/navy/kangaroo/RripBitVector.h"
+#include "cachelib/navy/kangaroo/RripBucket.h"
+#include "cachelib/navy/kangaroo/Types.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+// Kangaroo is a small-item flash-based cache engine. It divides the device
+// into a series of buckets. One can think of it as an on-device hash table.
+//
+// Each item is hashed to a bucket according to its key. There are no size
+// classes; each bucket consists of variable-sized items. When a bucket is
+// full, items are evicted according to the bucket's RRIP ordering. An
+// eviction callback is guaranteed to be invoked once per item. We currently
+// do not support removeCB. That is coming as part of Navy eventually.
+//
+// Each read and write via Kangaroo happens in `bucketSize` granularity. This
+// means you will read a full bucket even if your item is only 100 bytes.
+// It's also the same for writes. This makes Kangaroo inherently unsuitable
+// for large items that would also need large buckets (several KB and above).
+//
+// However, this design gives us the ability to forgo a full in-memory index
+// (only the small log portion keeps one) and instead look up items directly
+// from disk. In practice, this means Kangaroo is a flash engine optimized
+// for small items.
+class Kangaroo final : public Engine {
+ public:
+  struct Config {
+    uint32_t bucketSize{4 * 1024};
+
+    // The range of device that Kangaroo will access is guaranteed to be
+    // within [baseOffset, baseOffset + cacheSize)
+    uint64_t cacheBaseOffset{};
+    uint64_t totalSetSize{};
+    Device* device{nullptr};
+
+    DestructorCallback destructorCb;
+
+    // Optional bloom filter to reduce IO
+    std::unique_ptr<BloomFilter> bloomFilter;
+
+    std::unique_ptr<RripBitVector> rripBitVector;
+
+    // Better to underestimate; used for pre-allocating the log index.
+    // Only needed when the log is enabled.
+    uint32_t avgSmallObjectSize{100};
+    uint32_t logIndexPartitionsPerPhysical{};
+
+    uint64_t numBuckets() const { return totalSetSize / bucketSize; }
+
+    KangarooLog::Config logConfig;
+
+    Config& validate();
+  };
+
+  // Throws std::invalid_argument on bad config
+  explicit Kangaroo(Config&& config);
+
+  ~Kangaroo() override = default;
+
+  Kangaroo(const Kangaroo&) = delete;
+  Kangaroo& operator=(const Kangaroo&) = delete;
+
+  // Check if the key could exist in Kangaroo. This can be used as a pre-check
+  // to optimize cache lookups to avoid calling lookups in an async IO
+  // environment.
+  //
+  // @param hk   key to be checked
+  //
+  // @return false if the key definitely does not exist and true if it could.
+  bool couldExist(HashedKey hk) override;
+
+  // Look up a key in Kangaroo. On success, it will return Status::Ok and
+  // populate "value" with the value found. User should pass in a null
+  // Buffer as "value" as any existing storage will be freed. If not found,
+  // it will return Status::NotFound. And of course, on error, it returns
+  // DeviceError.
+  Status lookup(HashedKey hk, Buffer& value) override;
+
+  // Inserts key and value into Kangaroo. This will replace an existing
+  // key if found. If it fails to write, it will return DeviceError.
+  Status insert(HashedKey hk, BufferView value) override;
+
+  // Removes an entry from Kangaroo if found. Ok on success, NotFound on miss,
+  // and DeviceError on error.
+  Status remove(HashedKey hk) override;
+
+  void flush() override;
+
+  void reset() override;
+
+  void persist(RecordWriter& rw) override;
+  bool recover(RecordReader& rr) override;
+
+  void getCounters(const CounterVisitor& visitor) const override;
+
+  // Return the maximum allowed item size
+  uint64_t getMaxItemSize() const override;
+
+  uint64_t bfRejectCount() const { return bfRejectCount_.get(); }
+
+ private:
+  struct ValidConfigTag {};
+  Kangaroo(Config&& config, ValidConfigTag);
+
+  Buffer readBucket(KangarooBucketId bid);
+  bool writeBucket(KangarooBucketId bid, Buffer buffer);
+
+  // The corresponding r/w bucket lock must be held during the entire
+  // duration of the read and write operations.
+  // For example, during a write, if the write lock is dropped after a bucket
+  // is read from the device, the user must re-acquire the write lock and
+  // re-read the bucket from the device to ensure they have the newest
+  // content. Otherwise, one thread could overwrite another's writes.
+  //
+  // In short, just hold the lock during the entire operation!
+  folly::SharedMutex& getMutex(KangarooBucketId bid) const {
+    return mutex_[bid.index() & (kNumMutexes - 1)];
+  }
+
+  KangarooBucketId getKangarooBucketId(HashedKey hk) const {
+    return KangarooBucketId{static_cast<uint32_t>(hk.keyHash() % numBuckets_)};
+  }
+
+  KangarooBucketId getKangarooBucketIdFromHash(uint64_t hash) const {
+    return KangarooBucketId{static_cast<uint32_t>(hash % numBuckets_)};
+  }
+
+  uint64_t getBucketOffset(KangarooBucketId bid) const {
+    return cacheBaseOffset_ + bucketSize_ * bid.index();
+  }
+
+  double bfFalsePositivePct() const;
+  void bfRebuild(KangarooBucketId bid, const RripBucket* bucket);
+  bool bfReject(KangarooBucketId bid, uint64_t keyHash) const;
+
+  bool bvGetHit(KangarooBucketId bid, uint32_t keyIdx) const;
+  void bvSetHit(KangarooBucketId bid, uint32_t keyIdx) const;
+
+  void insertMultipleObjectsToKangarooBucket(
+      std::vector<std::unique_ptr<ObjectInfo>>& ois,
+      ReadmitCallback readmit);
+
+  // Use the birthday paradox to estimate the number of mutexes given the
+  // number of parallel queries and the desired probability of lock collision.
+  static constexpr size_t kNumMutexes = 16 * 1024;
+
+  // Serialization format version. Never 0. Versions < 10 reserved for testing.
+  static constexpr uint32_t kFormatVersion = 10;
+
+  // Open addressing index overhead
+  static constexpr double LogIndexOverhead = 2;
+
+  const DestructorCallback destructorCb_{};
+  const uint64_t bucketSize_{};
+  const uint64_t cacheBaseOffset_{};
+  const uint64_t numBuckets_{};
+  std::unique_ptr<BloomFilter> bloomFilter_;
+  std::unique_ptr<RripBitVector> bitVector_;
+  std::unique_ptr<KangarooLog> log_{nullptr};
+  std::chrono::nanoseconds generationTime_{};
+  Device& device_;
+  std::unique_ptr<folly::SharedMutex[]> mutex_{
+      new folly::SharedMutex[kNumMutexes]};
+  mutable AtomicCounter itemCount_;
+  mutable AtomicCounter logItemCount_;
+  mutable AtomicCounter setItemCount_;
+  mutable AtomicCounter insertCount_;
+  mutable AtomicCounter logInsertCount_;
+  mutable AtomicCounter setInsertCount_;
+  mutable AtomicCounter readmitInsertCount_;
+  mutable AtomicCounter succInsertCount_;
+  mutable AtomicCounter lookupCount_;
+  mutable AtomicCounter succLookupCount_;
+  mutable AtomicCounter setHits_;
+  mutable AtomicCounter logHits_;
+  mutable AtomicCounter removeCount_;
+  mutable AtomicCounter succRemoveCount_;
+  mutable AtomicCounter evictionCount_;
+  mutable AtomicCounter logicalWrittenCount_;
+  mutable AtomicCounter physicalWrittenCount_;
+  mutable AtomicCounter ioErrorCount_;
+  mutable AtomicCounter bfFalsePositiveCount_;
+  mutable AtomicCounter bfProbeCount_;
+  mutable AtomicCounter bfRejectCount_;
+  mutable AtomicCounter checksumErrorCount_;
+  mutable AtomicCounter thresholdNotHit_;
+  mutable AtomicCounter multiInsertCalls_;
+  mutable SizeDistribution sizeDist_;
+  mutable KangarooSizeDistribution thresholdSizeDist_;
+  mutable KangarooSizeDistribution thresholdNumDist_;
+
+  static_assert((kNumMutexes & (kNumMutexes - 1)) == 0,
+                "number of mutexes must be power of two");
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
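[Editor's aside — illustrative sketch, not part of the patch.] getMutex() above is classic lock striping: the bucket index is masked into a fixed, power-of-two pool of mutexes, so unrelated buckets rarely contend while memory stays bounded. The same pattern with std::shared_mutex standing in for folly::SharedMutex:

#include <array>
#include <cstddef>
#include <shared_mutex>

// Must be a power of two so that "& (kNumStripes - 1)" is a cheap
// replacement for "% kNumStripes".
constexpr std::size_t kNumStripes = 16 * 1024;
static_assert((kNumStripes & (kNumStripes - 1)) == 0, "power of two");

std::array<std::shared_mutex, kNumStripes>& stripes() {
  static std::array<std::shared_mutex, kNumStripes> mu;
  return mu;
}

// Many buckets share one stripe; collisions only cost extra contention,
// never correctness, because a stripe always covers its whole bucket.
std::shared_mutex& mutexFor(std::size_t bucketIndex) {
  return stripes()[bucketIndex & (kNumStripes - 1)];
}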
diff --git a/cachelib/navy/kangaroo/KangarooBucketStorage.cpp b/cachelib/navy/kangaroo/KangarooBucketStorage.cpp
new file mode 100644
index 0000000000..729fa2f52d
--- /dev/null
+++ b/cachelib/navy/kangaroo/KangarooBucketStorage.cpp
@@ -0,0 +1,100 @@
+#include "cachelib/navy/kangaroo/KangarooBucketStorage.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+static_assert(sizeof(KangarooBucketStorage) == 12,
+              "KangarooBucketStorage overhead. Changing this may require "
+              "changing the sizes used in unit tests as well");
+
+const uint32_t KangarooBucketStorage::kAllocationOverhead =
+    sizeof(KangarooBucketStorage::Slot);
+
+// This is very simple as it only tries to allocate starting from the
+// tail of the storage. Returns a null view() if we don't have any more space.
+KangarooBucketStorage::Allocation KangarooBucketStorage::allocate(uint32_t size) {
+  if (!canAllocate(size)) {
+    return {};
+  }
+
+  auto* slot = new (data_ + endOffset_) Slot(size);
+  endOffset_ += slotSize(size);
+  numAllocations_++;
+  return {MutableBufferView{slot->size, slot->data}, numAllocations_ - 1};
+}
+
+void KangarooBucketStorage::remove(Allocation alloc) {
+  // Remove triggers a compaction.
+  //
+  //                         tail
+  //  |--------|REMOVED|-----|~~~~|
+  //
+  // after compaction
+  //                  tail
+  //  |---------------|~~~~~~~~~~~|
+  if (alloc.done()) {
+    return;
+  }
+
+  const uint32_t removedSize = slotSize(alloc.view().size());
+  uint8_t* removed = alloc.view().data() - kAllocationOverhead;
+  std::memmove(removed,
+               removed + removedSize,
+               (data_ + endOffset_) - removed - removedSize);
+  endOffset_ -= removedSize;
+  numAllocations_--;
+}
+
+void KangarooBucketStorage::removeUntil(Allocation alloc) {
+  // Remove everything until (and including) "alloc"
+  //
+  //                         tail
+  //  |----------------|-----|~~~~|
+  //  ^                ^
+  //  begin            offset
+  //  remove this whole range
+  //
+  //        tail
+  //  |-----|~~~~~~~~~~~~~~~~~~~~~|
+  if (alloc.done()) {
+    return;
+  }
+
+  uint32_t offset = alloc.view().data() + alloc.view().size() - data_;
+  if (offset > endOffset_) {
+    return;
+  }
+
+  std::memmove(data_, data_ + offset, endOffset_ - offset);
+  endOffset_ -= offset;
+  numAllocations_ -= alloc.position() + 1;
+}
+
+KangarooBucketStorage::Allocation KangarooBucketStorage::getFirst() const {
+  if (endOffset_ == 0) {
+    return {};
+  }
+  auto* slot = reinterpret_cast<Slot*>(data_);
+  return {MutableBufferView{slot->size, slot->data}, 0};
+}
+
+KangarooBucketStorage::Allocation KangarooBucketStorage::getNext(
+    KangarooBucketStorage::Allocation alloc) const {
+  if (alloc.done()) {
+    return {};
+  }
+
+  auto* next =
+      reinterpret_cast<Slot*>(alloc.view().data() + alloc.view().size());
+  if (reinterpret_cast<uint8_t*>(next) - data_ >= endOffset_) {
+    return {};
+  } else if (next->size + reinterpret_cast<uint8_t*>(next) - data_ >= endOffset_) {
+    return {};
+  } else if (next->size == 0) {
+    return {};
+  }
+
+  return {MutableBufferView{next->size, next->data}, alloc.position() + 1};
+}
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
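[Editor's aside — illustrative sketch, not part of the patch.] The compaction diagrams above are easier to follow with concrete offsets. Each allocation is a packed 2-byte Slot header (the uint16_t size field) plus its payload, laid out back to back from offset 0:

#include <cstdint>
#include <iostream>

constexpr uint32_t kAllocationOverhead = 2;  // packed uint16_t Slot header

uint32_t slotSize(uint32_t payload) { return kAllocationOverhead + payload; }

int main() {
  uint32_t endOffset = 0;  // tail of the FIFO region
  for (uint32_t payload : {10u, 20u, 30u}) {
    std::cout << "slot at offset " << endOffset << ", slot size "
              << slotSize(payload) << '\n';
    endOffset += slotSize(payload);
  }
  // remove(middle) memmoves the suffix down over the removed slot;
  // removeUntil(second) instead drops the whole 12 + 22 = 34-byte prefix.
  std::cout << "endOffset = " << endOffset << '\n';  // 12 + 22 + 32 = 66
  return 0;
}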
diff --git a/cachelib/navy/kangaroo/KangarooBucketStorage.h b/cachelib/navy/kangaroo/KangarooBucketStorage.h
new file mode 100644
index 0000000000..9d54018168
--- /dev/null
+++ b/cachelib/navy/kangaroo/KangarooBucketStorage.h
@@ -0,0 +1,92 @@
+#pragma once
+
+#include "cachelib/navy/common/Buffer.h"
+#include "cachelib/navy/common/CompilerUtils.h"
+#include "cachelib/navy/common/Types.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+// TODO: (beyondsora) T31519237 Change KangarooBucketStorage to allocate
+// backwards for better performance
+// This is a very simple FIFO allocator: once it is full, the only way to
+// free up more space is to remove entries at the front. It is used for
+// managing allocations inside a bucket.
+class FOLLY_PACK_ATTR KangarooBucketStorage {
+ public:
+  // This is an allocation that is returned to the user when they
+  // allocate from the KangarooBucketStorage. "view" is for reading
+  // and modifying data allocated from the storage. "position"
+  // indicates where it is in the storage and is used internally to
+  // iterate to the next allocation.
+  //
+  // The user should only hold a reference to one "allocation" at a time.
+  // Calling the remove or removeUntil API on an allocation will invalidate
+  // all other references to allocations.
+  class Allocation {
+   public:
+    Allocation() = default;
+
+    bool done() const { return view_.isNull(); }
+
+    MutableBufferView view() const { return view_; }
+
+    uint32_t position() const { return position_; }
+
+   private:
+    friend KangarooBucketStorage;
+
+    Allocation(MutableBufferView v, uint32_t p) : view_{v}, position_{p} {}
+
+    MutableBufferView view_{};
+    uint32_t position_{};
+  };
+
+  static uint32_t slotSize(uint32_t size) { return kAllocationOverhead + size; }
+
+  explicit KangarooBucketStorage(uint32_t capacity) : capacity_{capacity} {}
+
+  Allocation allocate(uint32_t size);
+
+  uint32_t capacity() const { return capacity_; }
+
+  uint32_t remainingCapacity() const { return capacity_ - endOffset_; }
+
+  uint32_t numAllocations() const { return numAllocations_; }
+
+  void clear() {
+    endOffset_ = 0;
+    numAllocations_ = 0;
+  }
+
+  void remove(Allocation alloc);
+
+  // Removes every allocation from the beginning, including this one.
+  void removeUntil(Allocation alloc);
+
+  Allocation getFirst() const;
+  Allocation getNext(Allocation alloc) const;
+
+ private:
+  // Slot represents a physical slot in the storage. The user does not use
+  // this directly but instead uses Allocation.
+  struct FOLLY_PACK_ATTR Slot {
+    uint16_t size{};
+    uint8_t data[];
+    explicit Slot(uint16_t s) : size{s} {}
+  };
+
+  bool canAllocate(uint32_t size) const {
+    return static_cast<uint64_t>(endOffset_) + slotSize(size) <= capacity_;
+  }
+
+  static const uint32_t kAllocationOverhead;
+
+  const uint32_t capacity_{};
+  uint32_t numAllocations_{};
+  uint32_t endOffset_{};
+  mutable uint8_t data_[];
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/navy/kangaroo/KangarooLog.cpp b/cachelib/navy/kangaroo/KangarooLog.cpp
new file mode 100644
index 0000000000..f16ed738cf
--- /dev/null
+++ b/cachelib/navy/kangaroo/KangarooLog.cpp
@@ -0,0 +1,651 @@
+#include <chrono>
+#include <mutex>
+#include <shared_mutex>
+
+#include <folly/Format.h>
+#include <folly/logging/xlog.h>
+
+#include "cachelib/navy/kangaroo/KangarooLog.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+
+Buffer KangarooLog::readLogPage(LogPageId lpid) {
+  auto buffer = device_.makeIOBuffer(pageSize_);
+  XDCHECK(!buffer.isNull());
+
+  const bool res =
+      device_.read(getLogPageOffset(lpid), buffer.size(), buffer.data());
+  if (!res) {
+    return {};
+  }
+  // TODO: checksumming & generations
+  return buffer;
+}
+
+Buffer KangarooLog::readLogSegment(LogSegmentId lsid) {
+  auto buffer = device_.makeIOBuffer(segmentSize_);
+  XDCHECK(!buffer.isNull());
+
+  const bool res =
+      device_.read(getLogSegmentOffset(lsid), buffer.size(), buffer.data());
+
+  if (!res) {
+    return {};
+  }
+  // TODO: checksumming & generations
+  return buffer;
+}
+
+bool KangarooLog::writeLogSegment(LogSegmentId lsid, Buffer buffer) {
+  // TODO: set checksums
+  logSegmentsWrittenCount_.inc();
+  return device_.write(getLogSegmentOffset(lsid), std::move(buffer));
+}
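[Editor's aside — illustrative sketch, not part of the patch.] writeLogSegment() addresses the device with a partition-major layout: each physical partition owns a contiguous region, and segments are contiguous within it (see getLogSegmentOffset in KangarooLog.h further below). A worked example with made-up sizes:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t logBaseOffset = 1ull << 30;           // log starts at 1 GiB
  const uint64_t segmentSize = 256 * 1024;             // 256 KiB segments
  const uint64_t physicalPartitionSize = 64ull << 20;  // 64 MiB per partition

  auto segmentOffset = [&](uint64_t index, uint64_t partition) {
    return logBaseOffset + segmentSize * index +
           physicalPartitionSize * partition;
  };

  // First segment of the first partition sits at the log base.
  assert(segmentOffset(0, 0) == logBaseOffset);
  // Second segment of partition 1: skip one whole partition, then one segment.
  assert(segmentOffset(1, 1) ==
         logBaseOffset + physicalPartitionSize + segmentSize);
  return 0;
}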
+bool KangarooLog::flushLogSegment(LogSegmentId lsid) {
+  LogSegmentId nextLsid = getNextLsid(lsid);
+  {
+    std::unique_lock segmentLock{getMutexFromSegment(lsid)};
+    while (nextLsid == nextLsidsToClean_[lsid.partition()] && !killThread_) {
+      writeSetsCv_.notify_all();
+      flushLogCv_.wait_for(segmentLock, std::chrono::seconds(30));
+    }
+    if (killThread_) {
+      return false;
+    }
+    std::unique_lock bufferLock{logSegmentMutexs_[lsid.partition()]};
+
+    // only flush if this segment is still the current one;
+    // otherwise another thread already flushed the log
+    if (currentLogSegments_[lsid.partition()]->getLogSegmentId() == lsid) {
+      writeLogSegment(
+          lsid,
+          Buffer(logSegmentBuffers_[lsid.partition()].view(), pageSize_));
+      currentLogSegments_[lsid.partition()]->clear(nextLsid);
+    }
+  }
+  return true;
+}
+
+LogSegmentId KangarooLog::getNextLsid(LogSegmentId lsid) {
+  return LogSegmentId((lsid.index() + 1) % (pagesPerPartition_ / pagesPerSegment_),
+                      lsid.partition());
+}
+
+KangarooLog::~KangarooLog() {
+  killThread_ = true;
+  for (uint64_t i = 0; i < numThreads_; i++) {
+    writeSetsCv_.notify_all();
+    flushLogCv_.notify_all();
+    logToSetsThreads_[i].join();
+  }
+  for (uint64_t i = 0; i < logIndexPartitions_; i++) {
+    delete index_[i];
+  }
+  delete[] index_;
+  for (uint64_t i = 0; i < logPhysicalPartitions_; i++) {
+    delete currentLogSegments_[i];
+  }
+}
+
+KangarooLog::KangarooLog(Config&& config)
+    : KangarooLog{std::move(config.validate()), ValidConfigTag{}} {}
+
+KangarooLog::Config& KangarooLog::Config::validate() {
+  if (logSize < readSize) {
+    throw std::invalid_argument(
+        folly::sformat("log size: {} cannot be smaller than read size: {}",
+                       logSize,
+                       readSize));
+  }
+
+  if (logSize < segmentSize) {
+    throw std::invalid_argument(
+        folly::sformat("log size: {} cannot be smaller than segment size: {}",
+                       logSize,
+                       segmentSize));
+  }
+
+  if (!folly::isPowTwo(readSize)) {
+    throw std::invalid_argument(
+        folly::sformat("invalid read size: {}", readSize));
+  }
+
+  if (logSize > uint64_t{readSize} << 32) {
+    throw std::invalid_argument(folly::sformat(
+        "Can't address kangaroo log with 32 bits. Log size: {}, read size: {}",
+        logSize,
+        readSize));
+  }
+
+  if (segmentSize % readSize != 0 || logSize % readSize != 0) {
+    throw std::invalid_argument(folly::sformat(
+        "logSize and segmentSize need to be a multiple of readSize. "
+        "segmentSize: {}, logSize: {}, readSize: {}.",
+        segmentSize,
+        logSize,
+        readSize));
+  }
+
+  if (logSize % segmentSize != 0) {
+    throw std::invalid_argument(folly::sformat(
+        "logSize must be a multiple of segmentSize. "
+        "logSize: {}, segmentSize: {}.",
+        logSize,
+        segmentSize));
+  }
" + "logSize:{}, segmentSize: {}.", + logSize, + segmentSize)); + } + + if (logPhysicalPartitions == 0) { + throw std::invalid_argument(folly::sformat( + "number physical partitions needs to be greater than 0" + )); + } + + if (logIndexPartitions % logPhysicalPartitions != 0) { + throw std::invalid_argument(folly::sformat( + "the number of index partitions must be a multiple of the physical partitions" + )); + } + + if (logSize / logPhysicalPartitions % readSize != 0) { + throw std::invalid_argument(folly::sformat( + "Phycial partition size must be a multiple of read size" + )); + } + + if (numTotalIndexBuckets % logIndexPartitions != 0) { + throw std::invalid_argument(folly::sformat( + "Index entries {} must be a multiple of index partitions {}", + numTotalIndexBuckets, logIndexPartitions + )); + } + + if (device == nullptr) { + throw std::invalid_argument("device cannot be null"); + } + + if (numTotalIndexBuckets == 0) { + throw std::invalid_argument("need to have a number of index buckets"); + } + + return *this; +} + +KangarooLog::KangarooLog(Config&& config, ValidConfigTag) + : pageSize_{config.readSize}, + segmentSize_{config.segmentSize}, + logBaseOffset_{config.logBaseOffset}, + logSize_{config.logSize}, + pagesPerSegment_{segmentSize_ / pageSize_}, + numSegments_{logSize_ / segmentSize_}, + device_{*config.device}, + logIndexPartitions_{config.logIndexPartitions}, + index_{new ChainedLogIndex*[logIndexPartitions_]}, + logPhysicalPartitions_{config.logPhysicalPartitions}, + physicalPartitionSize_{logSize_ / logPhysicalPartitions_}, + pagesPerPartition_{physicalPartitionSize_ / pageSize_}, + segmentsPerPartition_{pagesPerPartition_ / pagesPerSegment_}, + nextLsidsToClean_{std::make_unique(logPhysicalPartitions_)}, + logSegmentBuffers_{new Buffer[logPhysicalPartitions_]}, + numThreads_{config.mergeThreads}, + nextCleaningPartition_{std::make_unique(numThreads_)}, + currentLogSegments_{new KangarooLogSegment*[logPhysicalPartitions_]}, + setNumberCallback_{config.setNumberCallback}, + setMultiInsertCb_{config.setMultiInsertCallback}, + numIndexEntries_{config.numTotalIndexBuckets}, + threshold_{config.threshold} { + XLOGF(INFO, + "Kangaroo Log created: size: {}, read size: {}, segment size: {}, base offset: {}, pages per partition {}", + logSize_, + pageSize_, + segmentSize_, + logBaseOffset_, + pagesPerPartition_); + for (uint64_t i = 0; i < logIndexPartitions_; i++) { + index_[i] = new ChainedLogIndex(numIndexEntries_ / logIndexPartitions_, + config.sizeAllocations, setNumberCallback_); + } + logSegmentMutexs_ = std::make_unique(logPhysicalPartitions_); + reset(); + logToSetsThreads_.reserve(numThreads_); + for (uint64_t i = 0; i < numThreads_; i++) { + logToSetsThreads_.push_back(std::thread(&KangarooLog::cleanSegmentsLoop, this, i)); + } +} + +bool KangarooLog::shouldClean(uint64_t nextWriteLoc, uint64_t nextCleaningLoc) { + uint64_t freeSegments = 0; + if (nextCleaningLoc >= nextWriteLoc) { + freeSegments = nextCleaningLoc - nextWriteLoc; + } else { + freeSegments = nextCleaningLoc + (segmentsPerPartition_ - nextWriteLoc); + } + return freeSegments <= (segmentsPerPartition_ * cleaningThreshold_); +} + +bool KangarooLog::shouldWakeCompaction(uint64_t threadId) { + for (uint64_t i = 0; i < logPhysicalPartitions_; i++) { + uint64_t partition = (i + nextCleaningPartition_[threadId]) % logPhysicalPartitions_; + if (partition % numThreads_ == threadId) { + std::shared_lock lock{logSegmentMutexs_[partition]}; + LogSegmentId currentLsid = currentLogSegments_[partition]->getLogSegmentId(); 
+      // may want to stay ahead by more than 1 Lsid, but this should work for now
+      if (shouldClean(getNextLsid(currentLsid).index(), nextLsidsToClean_[partition].index())) {
+        nextCleaningPartition_[threadId] = partition;
+        return true;
+      }
+    }
+  }
+  // return true to kill the thread if the object is being destructed
+  return killThread_;
+}
+
+void KangarooLog::moveBucket(HashedKey hk, uint64_t count, LogSegmentId lsidToFlush) {
+  KangarooBucketId bid = setNumberCallback_(hk.keyHash());
+  uint64_t indexPartition = getIndexPartition(hk);
+  uint64_t physicalPartition = getPhysicalPartition(hk);
+  ChainedLogIndex::BucketIterator indexIt;
+
+  std::vector<std::unique_ptr<ObjectInfo>> objects;
+  objects.reserve(count);
+
+  /* allow reinsertion to index if not enough objects to move */
+  indexIt = index_[indexPartition]->getHashBucketIterator(hk);
+  while (!indexIt.done()) {
+    BufferView value;
+    HashedKey key = hk;
+    uint8_t hits;
+    LogPageId lpid;
+    uint32_t tag;
+
+    if (killThread_) {
+      return;
+    }
+
+    hits = indexIt.hits();
+    tag = indexIt.tag();
+    indexIt = index_[indexPartition]->getNext(indexIt);
+    lpid = getLogPageId(index_[indexPartition]->find(bid, tag), physicalPartition);
+    if (!lpid.isValid()) {
+      continue;
+    }
+
+    // Find the value; it could be in the in-memory buffer or on nvm
+    Buffer buffer;
+    Status status = lookupBufferedTag(tag, key, buffer, lpid);
+    if (status != Status::Ok) {
+      buffer = readLogPage(lpid);
+      if (buffer.isNull()) {
+        ioErrorCount_.inc();
+        continue;
+      }
+      LogBucket* page = reinterpret_cast<LogBucket*>(buffer.data());
+      flushPageReads_.inc();
+      value = page->findTag(tag, key);
+    } else {
+      value = buffer.view();
+    }
+
+    if (value.isNull()) {
+      index_[indexPartition]->remove(tag, bid, getPartitionOffset(lpid));
+      continue;
+    } else if (setNumberCallback_(key.keyHash()) != bid) {
+      flushFalsePageReads_.inc();
+      continue;
+    }
+    index_[indexPartition]->remove(tag, bid, getPartitionOffset(lpid));
+    moveBucketSuccessfulRets_.inc();
+    auto ptr = std::make_unique<ObjectInfo>(key, value, hits, lpid, tag);
+    objects.push_back(std::move(ptr));
+  }
+
+  ReadmitCallback readmitCb = [&](std::unique_ptr<ObjectInfo>& oi) {
+    /* reinsert items that we attempted to move back into the index,
+     * unless they live in the segment being flushed */
+    moveBucketSuccessfulRets_.inc();
+    if (getSegmentId(oi->lpid) != lsidToFlush) {
+      index_[indexPartition]->insert(oi->tag, bid, getPartitionOffset(oi->lpid), oi->hits);
+    } else if (oi->hits) {
+      readmit(oi->key, oi->value.view());
+    }
+  };
+
+  if (objects.size() < threshold_) {
+    thresholdNotHit_.inc();
+    for (auto& item : objects) {
+      readmitCb(item);
+    }
+  } else {
+    moveBucketCalls_.inc();
+    setMultiInsertCb_(objects, readmitCb);
+  }
+}
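[Editor's aside — illustrative sketch, not part of the patch.] The core policy in moveBucket() is the merge-vs-readmit decision: only rewrite a set bucket when enough log items map to it to amortize the read-modify-write; otherwise readmit (or drop) the items individually. The shape of that decision, with stand-in types:

#include <cstdint>
#include <functional>
#include <vector>

struct Item { /* key, value, hits, ... */ };

// 'threshold' trades write amplification for log churn: rewriting a 4 KiB
// set page for fewer than 'threshold' items costs more device writes per
// byte admitted than keeping the items in the log a little longer.
void flushToSet(std::vector<Item>& items,
                std::size_t threshold,
                const std::function<void(Item&)>& readmit,
                const std::function<void(std::vector<Item>&)>& mergeToSet) {
  if (items.size() < threshold) {
    for (auto& item : items) {
      readmit(item);  // keep in the log (or drop) instead of rewriting the set
    }
  } else {
    mergeToSet(items);  // one set read-modify-write amortized over all items
  }
}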
+void KangarooLog::cleanSegment(LogSegmentId lsid) {
+  {
+    std::shared_lock segmentLock{getMutexFromSegment(lsid)};
+    auto buf = readLogSegment(lsid);
+    if (buf.isNull()) {
+      ioErrorCount_.inc();
+      return;
+    }
+
+    auto log_seg = KangarooLogSegment(segmentSize_, pageSize_,
+        lsid, pagesPerPartition_, buf.mutableView(), false);
+    auto it = log_seg.getFirst();
+    while (!it.done()) {
+      uint64_t indexPartition = getIndexPartition(HashedKey(it.key()));
+      uint32_t hits = 0;
+      LogPageId lpid;
+      lpid = getLogPageId(index_[indexPartition]->lookup(it.key(), false, &hits),
+                          lsid.partition());
+      if (!lpid.isValid() || lsid != getSegmentId(lpid)) {
+        if (lpid.isValid()) {
+          indexSegmentMismatch_.inc();
+        }
+        it = log_seg.getNext(it);
+        notFoundInLogIndex_.inc();
+        continue;
+      }
+      foundInLogIndex_.inc();
+
+      // best-effort count (a remove could come in and decrease it,
+      // but that should be rare)
+      uint64_t count;
+      count = index_[indexPartition]->countBucket(it.key());
+      sumCountCounter_.add(count);
+      numCountCalls_.inc();
+      if (count < threshold_) {
+        thresholdNotHit_.inc();
+        // evict the key because its set doesn't meet the threshold
+        if (hits) {
+          readmit(it.key(), it.value());
+        } else {
+          index_[indexPartition]->remove(it.key(), getPartitionOffset(lpid));
+        }
+      } else {
+        moveBucket(it.key(), count, lsid);
+      }
+      it = log_seg.getNext(it);
+    }
+  }
+}
+
+void KangarooLog::cleanSegmentsLoop(uint64_t threadId) {
+  while (true) {
+    while (!shouldWakeCompaction(threadId)) {
+      flushLogCv_.notify_all();
+      std::unique_lock lock{writeSetsMutex_};
+      writeSetsCv_.wait_for(lock, std::chrono::seconds(10));
+    }
+    writeSetsCv_.notify_all();
+    if (killThread_) {
+      return;
+    }
+    cleanSegment(nextLsidsToClean_[nextCleaningPartition_[threadId]]);
+    nextLsidsToClean_[nextCleaningPartition_[threadId]] =
+        getNextLsid(nextLsidsToClean_[nextCleaningPartition_[threadId]]);
+    nextCleaningPartition_[threadId]++;
+    flushLogSegmentsCount_.inc();
+
+    // wake up any insert requests waiting for space
+    flushLogCv_.notify_all();
+  }
+}
+
+double KangarooLog::falsePositivePct() const {
+  return 100. * keyCollisionCount_.get() / (lookupCount_.get() + removeCount_.get());
+}
+
+double KangarooLog::extraReadsPct() const {
+  return 100. * flushFalsePageReads_.get() / flushPageReads_.get();
+}
+
+double KangarooLog::fragmentationPct() const {
+  auto found = foundInLogIndex_.get();
+  return 100. * found / (notFoundInLogIndex_.get() + found);
+}
+
+uint64_t KangarooLog::getBytesWritten() const {
+  return logSegmentsWrittenCount_.get() * segmentSize_;
+}
+
+Status KangarooLog::lookup(HashedKey hk, Buffer& value) {
+  lookupCount_.inc();
+  uint64_t indexPartition = getIndexPartition(hk);
+  uint64_t physicalPartition = getPhysicalPartition(hk);
+  LogPageId lpid = getLogPageId(index_[indexPartition]->lookup(hk, true, nullptr),
+                                physicalPartition);
+  if (!lpid.isValid()) {
+    return Status::NotFound;
+  }
+
+  Buffer buffer;
+  BufferView valueView;
+  LogBucket* page;
+  {
+    std::shared_lock lock{getMutexFromPage(lpid)};
+
+    // check if the page is buffered in memory and read it from there
+    Status ret = lookupBuffered(hk, value, lpid);
+    if (ret != Status::Retry) {
+      return ret;
+    }
+
+    buffer = readLogPage(lpid);
+    if (buffer.isNull()) {
+      ioErrorCount_.inc();
+      return Status::DeviceError;
+    }
+  }
+
+  page = reinterpret_cast<LogBucket*>(buffer.data());
+
+  valueView = page->find(hk);
+  if (valueView.isNull()) {
+    keyCollisionCount_.inc();
+    return Status::NotFound;
+  }
+
+  value = Buffer{valueView};
+  succLookupCount_.inc();
+  return Status::Ok;
+}
+
+bool KangarooLog::couldExist(HashedKey hk) {
+  uint64_t indexPartition = getIndexPartition(hk);
+  uint64_t physicalPartition = getPhysicalPartition(hk);
+  LogPageId lpid = getLogPageId(index_[indexPartition]->lookup(hk, true, nullptr),
+                                physicalPartition);
+  if (!lpid.isValid()) {
+    lookupCount_.inc();
+    return false;
+  }
+
+  return true;
+}
+
+Status KangarooLog::insert(HashedKey hk, BufferView value) {
+  LogPageId lpid;
+  LogSegmentId lsid;
+  uint64_t physicalPartition = getPhysicalPartition(hk);
+  {
+    // logSegment handles concurrent inserts;
+    // lock to prevent a write-out of the segment
+    std::shared_lock lock{logSegmentMutexs_[physicalPartition]};
+    lpid = currentLogSegments_[physicalPartition]->insert(hk, value);
+    if (!lpid.isValid()) {
+      // need to flush the segment using lsid
+      lsid = currentLogSegments_[physicalPartition]->getLogSegmentId();
+    }
+  }
+
+  if (lpid.isValid()) {
+    uint64_t indexPartition = getIndexPartition(hk);
+    auto ret = index_[indexPartition]->insert(hk, getPartitionOffset(lpid));
+    if (ret == Status::NotFound) {
+      replaceIndexInsert_.inc();
+      ret = Status::Ok;
+    }
+    insertCount_.inc();
+    if (ret == Status::Ok) {
+      succInsertCount_.inc();
+      bytesInserted_.add(hk.key().size() + value.size());
+    }
+    return ret;
+  }
+
+  if (flushLogSegment(lsid)) {
+    flushLogCv_.notify_all();
+    writeSetsCv_.notify_all();
+    return insert(hk, value);
+  } else {
+    flushLogCv_.notify_all();
+    writeSetsCv_.notify_all();
+    return Status::Rejected;
+  }
+}
+
+void KangarooLog::readmit(HashedKey hk, BufferView value) {
+  LogPageId lpid;
+  LogSegmentId lsid;
+  uint64_t physicalPartition = getPhysicalPartition(hk);
+  readmitRequests_.inc();
+
+  {
+    // logSegment handles concurrent inserts;
+    // lock to prevent a write-out of the segment
+    std::shared_lock lock{logSegmentMutexs_[physicalPartition]};
+    lpid = currentLogSegments_[physicalPartition]->insert(hk, value);
+    if (!lpid.isValid()) {
+      // no room in the segment, so we will not insert
+      readmitRequestsFailed_.inc();
+      return;
+    }
+  }
+
+  uint64_t indexPartition = getIndexPartition(hk);
+  index_[indexPartition]->insert(hk, getPartitionOffset(lpid));
+  readmitBytes_.add(hk.key().size() + value.size());
+}
+
+Status KangarooLog::remove(HashedKey hk) {
+  uint64_t indexPartition = getIndexPartition(hk);
+  uint64_t physicalPartition = getPhysicalPartition(hk);
+  removeCount_.inc();
+  LogPageId lpid;
+
+  lpid = getLogPageId(index_[indexPartition]->lookup(hk, false, nullptr),
+                      physicalPartition);
+  if (!lpid.isValid()) {
+    return Status::NotFound;
+  }
+
+  Buffer buffer;
+  BufferView valueView;
+  LogBucket* page;
+  {
+    std::shared_lock lock{getMutexFromPage(lpid)};
+
+    // check if the page is buffered in memory and read it from there
+    Status ret = lookupBuffered(hk, buffer, lpid);
+    if (ret == Status::Ok) {
+      Status status = index_[indexPartition]->remove(hk, getPartitionOffset(lpid));
+      if (status == Status::Ok) {
+        succRemoveCount_.inc();
+      }
+      return status;
+    } else if (ret == Status::Retry) {
+      buffer = readLogPage(lpid);
+      if (buffer.isNull()) {
+        ioErrorCount_.inc();
+        return Status::DeviceError;
+      }
+    } else {
+      return ret;
+    }
+  }
+
+  page = reinterpret_cast<LogBucket*>(buffer.data());
+
+  valueView = page->find(hk);
+  if (valueView.isNull()) {
+    keyCollisionCount_.inc();
+    return Status::NotFound;
+  }
+
+  Status status = index_[indexPartition]->remove(hk, getPartitionOffset(lpid));
+  if (status == Status::Ok) {
+    succRemoveCount_.inc();
+  }
+  return status;
+}
+
+void KangarooLog::flush() {
+  // TODO: should probably flush the buffered part of the log
+  return;
+}
+
+void KangarooLog::reset() {
+  itemCount_.set(0);
+  insertCount_.set(0);
+  succInsertCount_.set(0);
+  lookupCount_.set(0);
+  succLookupCount_.set(0);
+  removeCount_.set(0);
+  logicalWrittenCount_.set(0);
+  physicalWrittenCount_.set(0);
+  ioErrorCount_.set(0);
+  checksumErrorCount_.set(0);
+
+  for (uint64_t i = 0; i < logPhysicalPartitions_; i++) {
+    logSegmentBuffers_[i] = device_.makeIOBuffer(segmentSize_);
+    currentLogSegments_[i] = new KangarooLogSegment(
+        segmentSize_, pageSize_, LogSegmentId(0, i), pagesPerPartition_,
+        logSegmentBuffers_[i].mutableView(), true);
+    nextLsidsToClean_[i] = LogSegmentId(0, i);
+  }
+}
+
+Status KangarooLog::lookupBuffered(HashedKey hk,
+                                   Buffer& value, LogPageId lpid) {
+  uint64_t partition = getPhysicalPartition(hk);
+  BufferView view;
+  {
+    std::shared_lock lock{logSegmentMutexs_[partition]};
+    if (!isBuffered(lpid, partition)) {
+      return Status::Retry;
+    }
+    view = currentLogSegments_[partition]->find(hk, lpid);
+    if (view.isNull()) {
+      keyCollisionCount_.inc();
+      return Status::NotFound;
+    }
+    value = Buffer{view};
+  }
+  return Status::Ok;
+}
+
+Status KangarooLog::lookupBufferedTag(uint32_t tag, HashedKey& hk,
+                                      Buffer& value, LogPageId lpid) {
+  uint64_t partition = getPhysicalPartition(lpid);
+  BufferView view;
+  {
+    std::shared_lock lock{logSegmentMutexs_[partition]};
+    if (!isBuffered(lpid, partition)) {
+      return Status::Retry;
+    }
+    view = currentLogSegments_[partition]->findTag(tag, hk, lpid);
+    if (view.isNull()) {
+      keyCollisionCount_.inc();
+      return Status::NotFound;
+    }
+    value = Buffer{view};
+  }
+  return Status::Ok;
+}
+
+bool KangarooLog::isBuffered(LogPageId lpid, uint64_t partition) {
+  return getSegmentId(lpid) == currentLogSegments_[partition]->getLogSegmentId();
+}
+
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
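[Editor's aside — illustrative sketch, not part of the patch.] lookup() and remove() above use a two-level read: first try the open segment's in-memory buffer (the Status::Retry path), and only fall back to a device IO when the page has already been flushed. A stand-in version of the pattern, with stubbed helpers in place of the Navy ones:

#include <cstdint>
#include <optional>

struct Page { uint8_t bytes[4096]; };

// Stubs standing in for the open segment's buffer and for the device.
std::optional<Page> tryBuffered(uint32_t /*pageId*/) { return std::nullopt; }
Page readFromDevice(uint32_t /*pageId*/) { return Page{}; }

// Serve from the in-memory segment if the page is still buffered,
// else pay one device IO for the already-flushed page.
Page readPage(uint32_t pageId) {
  if (auto page = tryBuffered(pageId)) {
    return *page;
  }
  return readFromDevice(pageId);
}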
diff --git a/cachelib/navy/kangaroo/KangarooLog.h b/cachelib/navy/kangaroo/KangarooLog.h
new file mode 100644
index 0000000000..c7894db799
--- /dev/null
+++ b/cachelib/navy/kangaroo/KangarooLog.h
@@ -0,0 +1,239 @@
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+#include <folly/SharedMutex.h>
+
+#include "cachelib/common/AtomicCounter.h"
+#include "cachelib/navy/common/Buffer.h"
+#include "cachelib/navy/common/Device.h"
+#include "cachelib/navy/common/Types.h"
+#include "cachelib/navy/kangaroo/ChainedLogIndex.h"
+#include "cachelib/navy/kangaroo/KangarooLogSegment.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+class KangarooLog {
+ public:
+  struct Config {
+    uint32_t readSize{4 * 1024};
+    uint32_t segmentSize{256 * 1024};
+
+    // The range of the device that the log will access is guaranteed to be
+    // within [logBaseOffset, logBaseOffset + logSize)
+    uint64_t logBaseOffset{};
+    uint64_t logSize{0};
+    Device* device{nullptr};
+
+    // log partitioning
+    uint64_t logPhysicalPartitions{};
+
+    // for index
+    uint64_t logIndexPartitions{};
+    uint16_t sizeAllocations{1024};
+    uint64_t numTotalIndexBuckets{};
+    SetNumberCallback setNumberCallback{};
+
+    // for merging to sets
+    uint32_t threshold{};
+    SetMultiInsertCallback setMultiInsertCallback{};
+    uint64_t mergeThreads{32};
+
+    Config& validate();
+  };
+
+  // Throws std::invalid_argument on bad config
+  explicit KangarooLog(Config&& config);
+
+  ~KangarooLog();
+
+  KangarooLog(const KangarooLog&) = delete;
+  KangarooLog& operator=(const KangarooLog&) = delete;
+
+  bool couldExist(HashedKey hk);
+
+  // Look up a key in KangarooLog. On success, it will return Status::Ok and
+  // populate "value" with the value found. User should pass in a null
+  // Buffer as "value" as any existing storage will be freed. If not found,
+  // it will return Status::NotFound. And of course, on error, it returns
+  // DeviceError.
+  Status lookup(HashedKey hk, Buffer& value);
+
+  // Inserts key and value into KangarooLog. This will replace an existing
+  // key if found. If it fails to write, it will return DeviceError.
+  Status insert(HashedKey hk, BufferView value);
+
+  // Removes an entry from the log if found. Ok on success, NotFound on miss,
+  // and DeviceError on error.
+  Status remove(HashedKey hk);
+
+  void flush();
+
+  void reset();
+
+  double falsePositivePct() const;
+  double extraReadsPct() const;
+  double fragmentationPct() const;
+  uint64_t getBytesWritten() const;
+
+  // TODO: persist and recover not implemented
+
+ private:
+  struct ValidConfigTag {};
+  KangarooLog(Config&& config, ValidConfigTag);
+
+  Buffer readLogPage(LogPageId lpid);
+  Buffer readLogSegment(LogSegmentId lsid);
+  bool writeLogSegment(LogSegmentId lsid, Buffer buffer);
+  bool flushLogSegment(LogSegmentId lsid);
+
+  // does not acquire the logSegmentMutexs_ lock
+  bool isBuffered(LogPageId lpid, uint64_t physicalPartition);
+  Status lookupBuffered(HashedKey hk, Buffer& value, LogPageId lpid);
+  Status lookupBufferedTag(uint32_t tag, HashedKey& hk, Buffer& value, LogPageId lpid);
+
+  uint64_t getPhysicalPartition(LogPageId lpid) const {
+    return lpid.index() / pagesPerPartition_;
+  }
+
+  uint64_t getLogSegmentOffset(LogSegmentId lsid) const {
+    return logBaseOffset_ + segmentSize_ * lsid.index() +
+           physicalPartitionSize_ * lsid.partition();
+  }
+
+  uint64_t getPhysicalPartition(HashedKey hk) const {
+    return getIndexPartition(hk) % logPhysicalPartitions_;
+  }
+
+  uint64_t getIndexPartition(HashedKey hk) const {
+    return getLogIndexEntry(hk) % logIndexPartitions_;
+  }
+
+  uint64_t getLogIndexEntry(HashedKey hk) const {
+    return setNumberCallback_(hk.keyHash()).index() % numIndexEntries_;
+  }
+
+  uint64_t getLogPageOffset(LogPageId lpid) const {
+    return logBaseOffset_ + pageSize_ * lpid.index();
+  }
+
+  LogPageId getLogPageId(PartitionOffset po, uint32_t physicalPartition) {
+    return LogPageId(po.index() + physicalPartition * pagesPerPartition_, po.isValid());
+  }
+
+  PartitionOffset getPartitionOffset(LogPageId lpid) {
+    return PartitionOffset(lpid.index() % pagesPerPartition_, lpid.isValid());
+  }
+
+  LogSegmentId getSegmentId(LogPageId lpid) const {
+    uint32_t index = (lpid.index() % pagesPerPartition_) / pagesPerSegment_;
+    return LogSegmentId(index, getPhysicalPartition(lpid));
+  }
+
+  LogPageId getPageId(LogSegmentId lsid) const {
+    uint64_t i = lsid.partition() * pagesPerPartition_ + lsid.index() * pagesPerSegment_;
+    return LogPageId(i, true);
+  }
+
+  LogSegmentId getNextLsid(LogSegmentId lsid);
+
+  // locks based on partition number; concurrent reads, single modifier
+  folly::SharedMutex& getMutexFromSegment(LogSegmentId lsid) const {
+    return mutex_[(lsid.partition()) & (NumMutexes - 1)];
+  }
+  folly::SharedMutex& getMutexFromPage(LogPageId lpid) const {
+    return getMutexFromSegment(getSegmentId(lpid));
+  }
+
+  double cleaningThreshold_ = 0.1;
+  bool shouldClean(uint64_t nextWriteLoc, uint64_t nextCleaningLoc);
+  void cleanSegment(LogSegmentId lsid);
+  void cleanSegmentsLoop(uint64_t threadId);
+  bool shouldWakeCompaction(uint64_t threadId);
+  void moveBucket(HashedKey hk, uint64_t count, LogSegmentId lsidToFlush);
+  void readmit(HashedKey hk, BufferView value);
+
+  // Use the birthday paradox to estimate the number of mutexes given the
+  // number of parallel queries and the desired probability of lock collision.
+  static constexpr size_t NumMutexes = 16 * 1024;
+
+  // Serialization format version. Never 0. Versions < 10 reserved for testing.
+  static constexpr uint32_t kFormatVersion = 10;
+
+  const uint64_t pageSize_{};
+  const uint64_t segmentSize_{};
+  const uint64_t logBaseOffset_{};
+  const uint64_t logSize_{};
+  const uint64_t pagesPerSegment_{};
+  const uint64_t numSegments_{};
+
+  Device& device_;
+  std::unique_ptr<folly::SharedMutex[]> mutex_{
+      new folly::SharedMutex[NumMutexes]};
+  const uint64_t logIndexPartitions_{};
+  ChainedLogIndex** index_;
+  const SetNumberCallback setNumberCallback_{};
+
+  const uint64_t logPhysicalPartitions_{};
+  const uint64_t physicalPartitionSize_{};
+  const uint64_t pagesPerPartition_{};
+  const uint64_t numIndexEntries_{};
+  const uint64_t segmentsPerPartition_{};
+  KangarooLogSegment** currentLogSegments_;
+  /* prevent access to a log segment while it is being switched out
+   * to disk; one for each physical partition */
+  std::unique_ptr<folly::SharedMutex[]> logSegmentMutexs_;
+  Buffer* logSegmentBuffers_;
+
+  // background threads that read log segments and write them to sets
+  std::vector<std::thread> logToSetsThreads_;
+  uint64_t numThreads_;
+  std::mutex writeSetsMutex_;
+  std::condition_variable writeSetsCv_;
+  std::condition_variable_any flushLogCv_;
+  bool killThread_{false};
+  std::unique_ptr<LogSegmentId[]> nextLsidsToClean_;
+  SetMultiInsertCallback setMultiInsertCb_;
+  uint32_t threshold_{0};
+  std::unique_ptr<uint64_t[]> nextCleaningPartition_;
+
+  mutable AtomicCounter itemCount_;
+  mutable AtomicCounter insertCount_;
+  mutable AtomicCounter succInsertCount_;
+  mutable AtomicCounter lookupCount_;
+  mutable AtomicCounter succLookupCount_;
+  mutable AtomicCounter removeCount_;
+  mutable AtomicCounter succRemoveCount_;
+  mutable AtomicCounter evictionCount_;
+  mutable AtomicCounter keyCollisionCount_;
+  mutable AtomicCounter logicalWrittenCount_;
+  mutable AtomicCounter physicalWrittenCount_;
+  mutable AtomicCounter ioErrorCount_;
+  mutable AtomicCounter checksumErrorCount_;
+  mutable AtomicCounter flushPageReads_;
+  mutable AtomicCounter flushFalsePageReads_;
+  mutable AtomicCounter flushLogSegmentsCount_;
+  mutable AtomicCounter moveBucketCalls_;
+  mutable AtomicCounter notFoundInLogIndex_;
+  mutable AtomicCounter foundInLogIndex_;
+  mutable AtomicCounter indexSegmentMismatch_;
+  mutable AtomicCounter replaceIndexInsert_;
+  mutable AtomicCounter indexReplacementReinsertions_;
+  mutable AtomicCounter indexReinsertions_;
+  mutable AtomicCounter indexReinsertionFailed_;
+  mutable AtomicCounter moveBucketSuccessfulRets_;
+  mutable AtomicCounter thresholdNotHit_;
+  mutable AtomicCounter sumCountCounter_;
+  mutable AtomicCounter numCountCalls_;
+  mutable AtomicCounter readmitBytes_;
+  mutable AtomicCounter readmitRequests_;
+  mutable AtomicCounter readmitRequestsFailed_;
+  mutable AtomicCounter logSegmentsWrittenCount_;
+  mutable AtomicCounter bytesInserted_;
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
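[Editor's aside — illustrative sketch, not part of the patch.] The private helpers above convert between a global LogPageId and (partition, segment, partition-offset) with plain div/mod arithmetic: global page ids simply concatenate the per-partition page spaces. A worked example with illustrative constants:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t pagesPerPartition = 1024;
  const uint64_t pagesPerSegment = 64;

  const uint64_t partition = 2;
  const uint64_t partitionOffset = 130;  // page within its partition
  const uint64_t lpid = partition * pagesPerPartition + partitionOffset;

  assert(lpid / pagesPerPartition == partition);        // getPhysicalPartition
  assert(lpid % pagesPerPartition == partitionOffset);  // getPartitionOffset
  // Segment index within the partition (getSegmentId): 130 / 64 == 2.
  assert((lpid % pagesPerPartition) / pagesPerSegment == 2);
  return 0;
}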
diff --git a/cachelib/navy/kangaroo/KangarooLogSegment.cpp b/cachelib/navy/kangaroo/KangarooLogSegment.cpp
new file mode 100644
index 0000000000..4d75c0efdf
--- /dev/null
+++ b/cachelib/navy/kangaroo/KangarooLogSegment.cpp
@@ -0,0 +1,112 @@
+#include "cachelib/navy/kangaroo/KangarooBucketStorage.h"
+#include "cachelib/navy/kangaroo/KangarooLogSegment.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+
+KangarooLogSegment::KangarooLogSegment(uint64_t segmentSize,
+    uint64_t pageSize, LogSegmentId lsid, uint64_t pagesPerPartition,
+    MutableBufferView mutableView, bool newBucket)
+    : segmentSize_{segmentSize},
+      pageSize_{pageSize},
+      numBuckets_{segmentSize_ / pageSize_},
+      pagesPerPartition_{pagesPerPartition},
+      lsid_{lsid},
+      buckets_{new LogBucket*[numBuckets_]} {
+  // initialize all of the LogBuckets in place
+  for (uint64_t i = 0; i < numBuckets_; i++) {
+    // TODO: fix generation time
+    uint64_t offset = i * pageSize_;
+    auto view = MutableBufferView(pageSize_, mutableView.data() + offset);
+    if (newBucket) {
+      LogBucket::initNew(view, 0);
+    }
+    buckets_[i] = reinterpret_cast<LogBucket*>(mutableView.data() + offset);
+  }
+}
+
+BufferView KangarooLogSegment::find(HashedKey hk, LogPageId lpid) {
+  uint32_t offset = bucketOffset(lpid);
+  XDCHECK(offset < numBuckets_);
+  return buckets_[offset]->find(hk);
+}
+
+BufferView KangarooLogSegment::findTag(uint32_t tag, HashedKey& hk, LogPageId lpid) {
+  uint32_t offset = bucketOffset(lpid);
+  XDCHECK(offset < numBuckets_);
+  return buckets_[offset]->findTag(tag, hk);
+}
+
+LogPageId KangarooLogSegment::insert(HashedKey hk, BufferView value) {
+  KangarooBucketStorage::Allocation alloc;
+  uint32_t i = 0;
+  bool foundAlloc = false;
+  {
+    std::unique_lock lock{allocationMutex_};
+    // not necessarily the best online bin-packing heuristic;
+    // could potentially also do better sharding of which bucket
+    // to choose for performance reasons, depending on the bottleneck
+    for (; i < numBuckets_; i++) {
+      if (buckets_[i]->isSpace(hk, value)) {
+        alloc = buckets_[i]->allocate(hk, value);
+        foundAlloc = true;
+        break;
+      }
+    }
+  }
+  if (!foundAlloc) {
+    return LogPageId(0, false);
+  }
+  // space is already reserved, so no need to hold the mutex
+  buckets_[i]->insert(alloc, hk, value);
+  return getLogPageId(i);
+}
+
+uint32_t KangarooLogSegment::bucketOffset(LogPageId lpid) {
+  return lpid.index() % numBuckets_;
+}
+
+LogPageId KangarooLogSegment::getLogPageId(uint32_t bucketOffset) {
+  return LogPageId(lsid_.partition() * pagesPerPartition_ +
+      numBuckets_ * lsid_.index() + bucketOffset, true);
+}
+
+LogSegmentId KangarooLogSegment::getLogSegmentId() {
+  return lsid_;
+}
+
+void KangarooLogSegment::clear(LogSegmentId newLsid) {
+  lsid_ = newLsid;
+  for (uint64_t i = 0; i < numBuckets_; i++) {
+    buckets_[i]->clear();
+  }
+}
+
+KangarooLogSegment::Iterator KangarooLogSegment::getFirst() {
+  uint64_t bucketNum = 0;
+  auto itr = buckets_[bucketNum]->getFirst();
+  return Iterator(bucketNum, itr);
+}
+
+KangarooLogSegment::Iterator KangarooLogSegment::getNext(Iterator itr) {
+  if (itr.done()) {
+    return itr;
+  }
+  auto nextItr = buckets_[itr.bucketNum_]->getNext(itr.itr_);
+  if (nextItr.done() && itr.bucketNum_ >= numBuckets_ - 1) {
+    itr.done_ = true;
+    return itr;
+  } else if (nextItr.done()) {
+    itr.bucketNum_++;
+    itr.itr_ = buckets_[itr.bucketNum_]->getFirst();
+    return itr;
+  } else {
+    itr.itr_ = nextItr;
+    return itr;
+  }
+}
+
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/navy/kangaroo/KangarooLogSegment.h b/cachelib/navy/kangaroo/KangarooLogSegment.h
new file mode 100644
index 0000000000..da6dbe0916
--- /dev/null
+++ b/cachelib/navy/kangaroo/KangarooLogSegment.h
@@ -0,0 +1,82 @@
+#pragma once
+
+#include <cstdint>
+
+#include <folly/SharedMutex.h>
+
+#include "cachelib/navy/common/Buffer.h"
+#include "cachelib/navy/common/Device.h"
+#include "cachelib/navy/common/Types.h"
+#include "cachelib/navy/kangaroo/LogBucket.h"
+#include "cachelib/navy/kangaroo/LogIndex.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+class KangarooLogSegment {
+ public:
+  class Iterator {
+   public:
+    bool done() const { return done_; }
+
+    HashedKey key() const { return HashedKey(itr_.key()); }
+
+    BufferView value() const { return itr_.value(); }
+
+   private:
+    friend KangarooLogSegment;
+
+    explicit Iterator(uint64_t bucketNum, LogBucket::Iterator itr) :
+        bucketNum_{bucketNum}, itr_{itr} {
+      if (itr_.done()) {
+        done_ = true;
+      }
+    }
+
+    uint64_t bucketNum_;
+    LogBucket::Iterator itr_;
+    bool done_ = false;
+  };
+
+  explicit KangarooLogSegment(uint64_t segmentSize, uint64_t pageSize,
+      LogSegmentId lsid, uint64_t pagesPerPartition,
+      MutableBufferView mutableView, bool newBucket);
+
+  ~KangarooLogSegment() { delete[] buckets_; }
+
+  KangarooLogSegment(const KangarooLogSegment&) = delete;
+  KangarooLogSegment& operator=(const KangarooLogSegment&) = delete;
+
+  // Look up the value corresponding to a key.
+  // BufferView::isNull() == true if not found.
+  BufferView find(HashedKey hk, LogPageId lpid);
+  BufferView findTag(uint32_t tag, HashedKey& hk, LogPageId lpid);
+
+  // Insert into the segment. Returns an invalid page id if there is no room.
+  LogPageId insert(HashedKey hk, BufferView value);
+
+  LogSegmentId getLogSegmentId();
+
+  void clear(LogSegmentId newLsid);
+
+  Iterator getFirst();
+  Iterator getNext(Iterator itr);
+
+ private:
+  uint32_t bucketOffset(LogPageId lpid);
+  LogPageId getLogPageId(uint32_t bucketOffset);
+
+  // serializes allocation across the segment's LogBuckets
+  folly::SharedMutex allocationMutex_;
+
+  uint64_t segmentSize_;
+  uint64_t pageSize_;
+  uint64_t numBuckets_;
+  uint64_t pagesPerPartition_;
+  LogSegmentId lsid_;
+  // pointer to an array of pointers to LogBuckets
+  LogBucket** buckets_;
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
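[Editor's aside — illustrative sketch, not part of the patch.] The iterator contract above walks a segment bucket by bucket, and entries within each bucket, exactly like a nested loop; getFirst()/getNext() just linearize it. A stand-in version with std::vector in place of LogBucket:

#include <functional>
#include <vector>

using Bucket = std::vector<int>;

// Visit every entry in segment order: buckets in order, entries within a
// bucket in insertion order.
void forEachEntry(const std::vector<Bucket>& buckets,
                  const std::function<void(int)>& visit) {
  for (const Bucket& bucket : buckets) {
    for (int entry : bucket) {
      visit(entry);
    }
  }
}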
diff --git a/cachelib/navy/kangaroo/KangarooSizeDistribution.cpp b/cachelib/navy/kangaroo/KangarooSizeDistribution.cpp
new file mode 100644
index 0000000000..7795c9d6f3
--- /dev/null
+++ b/cachelib/navy/kangaroo/KangarooSizeDistribution.cpp
@@ -0,0 +1,89 @@
+#include "cachelib/navy/kangaroo/KangarooSizeDistribution.h"
+
+#include <algorithm>
+#include <folly/logging/xlog.h>
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+namespace {
+std::vector<uint64_t> generateSizes(uint64_t min, uint64_t max, uint64_t factor) {
+  XDCHECK_GE(factor, 1);
+  std::vector<uint64_t> sizes;
+  uint64_t current = min;
+  while (current < max) {
+    sizes.push_back(current);
+    current += factor;
+  }
+  sizes.push_back(max);
+  return sizes;
+}
+} // namespace
+
+KangarooSizeDistribution::KangarooSizeDistribution(uint64_t min, uint64_t max, uint64_t factor) {
+  maxValue_ = max;
+  auto sizes = generateSizes(min, max, factor);
+  for (auto size : sizes) {
+    dist_.emplace(size, AtomicCounter{});
+  }
+}
+
+KangarooSizeDistribution::KangarooSizeDistribution(std::map<int64_t, int64_t> snapshot) {
+  for (const auto& kv : snapshot) {
+    dist_.emplace(static_cast<uint64_t>(kv.first),
+                  AtomicCounter{static_cast<uint64_t>(kv.second)});
+  }
+}
+
+void KangarooSizeDistribution::addSize(uint64_t size) {
+  // TODO: It's possible the user warm-rolled the cache from a version without
+  // KangarooSizeDistribution support. We will remove this once we bring an
+  // ice-roll.
+  if (dist_.empty()) {
+    return;
+  }
+
+  if (size > maxValue_) {
+    XLOG(INFO, "overran max in kangaroo size distribution at ", size);
+    size = maxValue_;
+  }
+
+  auto res =
+      std::lower_bound(dist_.begin(), dist_.end(), size, [](auto itr1, auto s) {
+        return itr1.first < s;
+      });
+  XDCHECK_NE(res, dist_.end());
+  res->second.add(size);
+}
+
+void KangarooSizeDistribution::removeSize(uint64_t size) {
+  // TODO: It's possible the user warm-rolled the cache from a version without
+  // KangarooSizeDistribution support. We will remove this once we bring an
+  // ice-roll.
+  if (dist_.empty()) {
+    return;
+  }
+
+  auto res =
+      std::lower_bound(dist_.begin(), dist_.end(), size, [](auto itr1, auto s) {
+        return itr1.first < s;
+      });
+  XDCHECK_NE(res, dist_.end());
+  res->second.sub(size);
+}
+
+std::map<int64_t, int64_t> KangarooSizeDistribution::getSnapshot() const {
+  std::map<int64_t, int64_t> snapshot;
+  for (auto& kv : dist_) {
+    snapshot.emplace(static_cast<int64_t>(kv.first),
+                     static_cast<int64_t>(kv.second.get()));
+  }
+  return snapshot;
+}
+
+void KangarooSizeDistribution::reset() {
+  for (auto& d : dist_) {
+    d.second.set(0);
+  }
+}
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/navy/kangaroo/KangarooSizeDistribution.h b/cachelib/navy/kangaroo/KangarooSizeDistribution.h
new file mode 100644
index 0000000000..c1b06ecc4c
--- /dev/null
+++ b/cachelib/navy/kangaroo/KangarooSizeDistribution.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <utility>
+#include <vector>
+
+#include <folly/SharedMutex.h>
+
+#include "cachelib/common/AtomicCounter.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+class KangarooSizeDistribution {
+ public:
+  // Create a size distribution that spans [@min, @max] at a granularity of
+  // @factor (equal spacing between buckets).
+  KangarooSizeDistribution(uint64_t min, uint64_t max, uint64_t factor);
+
+  // Recover from a previously saved snapshot
+  explicit KangarooSizeDistribution(std::map<int64_t, int64_t> snapshot);
+
+  void addSize(uint64_t size);
+  void removeSize(uint64_t size);
+
+  // Return a {size -> number of items} mapping.
+  // Returns signed values so it's easy to use this with thrift structures.
+  std::map<int64_t, int64_t> getSnapshot() const;
+
+  void reset();
+
+ private:
+  std::map<uint64_t, AtomicCounter> dist_;
+  uint64_t maxValue_;
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
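[Editor's aside — illustrative sketch, not part of the patch.] KangarooSizeDistribution is an equal-width histogram: generateSizes() lays down bin upper bounds min, min+factor, ..., plus max, and addSize() clamps to max and accumulates into the first bin whose bound is >= the sample. A minimal stand-in using a plain std::map:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>

int main() {
  const uint64_t min = 0, max = 100, factor = 25;
  std::map<uint64_t, uint64_t> dist;  // bin upper bound -> accumulated bytes
  for (uint64_t b = min; b < max; b += factor) {
    dist[b] = 0;
  }
  dist[max] = 0;  // generateSizes() always appends max as the last bound

  for (uint64_t size : {10ull, 30ull, 70ull, 250ull}) {
    uint64_t s = std::min(size, max);  // addSize() clamps overruns to max
    dist.lower_bound(s)->second += s;  // first bound >= sample, as in addSize()
  }
  for (const auto& [bound, total] : dist) {
    std::cout << "<= " << bound << ": " << total << '\n';
  }
  return 0;
}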
diff --git a/cachelib/navy/kangaroo/LogBucket.cpp b/cachelib/navy/kangaroo/LogBucket.cpp
new file mode 100644
index 0000000000..9e8fd6b131
--- /dev/null
+++ b/cachelib/navy/kangaroo/LogBucket.cpp
@@ -0,0 +1,242 @@
+#include <new>
+#include <folly/logging/xlog.h>
+
+#include "cachelib/navy/kangaroo/LogBucket.h"
+#include "cachelib/navy/common/Hash.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+static_assert(sizeof(LogBucket) == 24,
+              "LogBucket overhead. If this changes, you may have to adjust "
+              "the sizes used in unit tests.");
+
+namespace {
+// This maps to exactly how an entry is stored in a bucket on device.
+class FOLLY_PACK_ATTR LogBucketEntry {
+ public:
+  static uint32_t computeSize(uint32_t keySize, uint32_t valueSize) {
+    return sizeof(LogBucketEntry) + keySize + valueSize;
+  }
+
+  static LogBucketEntry& create(MutableBufferView storage,
+                                HashedKey hk,
+                                BufferView value) {
+    new (storage.data()) LogBucketEntry{hk, value};
+    return reinterpret_cast<LogBucketEntry&>(*storage.data());
+  }
+
+  BufferView key() const { return {keySize_, data_}; }
+
+  uint64_t keyHash() const { return makeHK(key()).keyHash(); }
+
+  bool keyEqualsTo(HashedKey hk) const {
+    return hk == makeHK(key());
+  }
+
+  bool keyEqualsTo(uint64_t hash) const {
+    return hash == keyHash();
+  }
+
+  BufferView value() const { return {valueSize_, data_ + keySize_}; }
+
+ private:
+  LogBucketEntry(HashedKey hk, BufferView value)
+      : keySize_{static_cast<uint16_t>(hk.key().size())},
+        valueSize_{static_cast<uint16_t>(value.size())} {
+    static_assert(sizeof(LogBucketEntry) == 4, "LogBucketEntry overhead");
+    hk.key().copyTo(data_);
+    value.copyTo(data_ + keySize_);
+  }
+
+  const uint16_t keySize_{};
+  const uint16_t valueSize_{};
+  uint8_t data_[];
+};
+
+const LogBucketEntry* getIteratorEntry(KangarooBucketStorage::Allocation itr) {
+  return reinterpret_cast<const LogBucketEntry*>(itr.view().data());
+}
+} // namespace
+
+BufferView LogBucket::Iterator::key() const {
+  return getIteratorEntry(itr_)->key();
+}
+
+uint64_t LogBucket::Iterator::keyHash() const {
+  return getIteratorEntry(itr_)->keyHash();
+}
+
+BufferView LogBucket::Iterator::value() const {
+  return getIteratorEntry(itr_)->value();
+}
+
+bool LogBucket::Iterator::keyEqualsTo(HashedKey hk) const {
+  return getIteratorEntry(itr_)->keyEqualsTo(hk);
+}
+
+bool LogBucket::Iterator::keyEqualsTo(uint64_t keyHash) const {
+  return getIteratorEntry(itr_)->keyEqualsTo(keyHash);
+}
+
+uint32_t LogBucket::computeChecksum(BufferView view) {
+  constexpr auto kChecksumStart = sizeof(checksum_);
+  auto data = view.slice(kChecksumStart, view.size() - kChecksumStart);
+  return navy::checksum(data);
+}
+
+LogBucket& LogBucket::initNew(MutableBufferView view, uint64_t generationTime) {
+  return *new (view.data())
+      LogBucket(generationTime, view.size() - sizeof(LogBucket));
+}
+
+BufferView LogBucket::find(HashedKey hk) const {
+  auto itr = storage_.getFirst();
+  uint32_t keyIdx = 0;
+  while (!itr.done()) {
+    auto* entry = getIteratorEntry(itr);
+    if (entry->keyEqualsTo(hk)) {
+      return entry->value();
+    }
+    itr = storage_.getNext(itr);
+    keyIdx++;
+  }
+  return {};
+}
+
+BufferView LogBucket::findTag(uint32_t tag, HashedKey& hk) const {
+  auto itr = storage_.getFirst();
+  while (!itr.done()) {
+    auto* entry = getIteratorEntry(itr);
+    hk = makeHK(entry->key());
+    if (createTag(hk) == tag) {
+      return entry->value();
+    }
+    itr = storage_.getNext(itr);
+  }
+  return {};
+}
+
+uint32_t LogBucket::insert(HashedKey hk,
+                           BufferView value,
+                           const DestructorCallback& destructorCb) {
+  const auto size = LogBucketEntry::computeSize(hk.key().size(), value.size());
+  XDCHECK_LE(size, storage_.capacity());
+
+  const auto evictions = makeSpace(size, destructorCb);
+  auto alloc = storage_.allocate(size);
+  XDCHECK(!alloc.done());
+  LogBucketEntry::create(alloc.view(), hk, value);
+
+  return evictions;
+}
+
+void LogBucket::insert(KangarooBucketStorage::Allocation alloc,
+                       HashedKey hk,
+                       BufferView value) {
+  XDCHECK(!alloc.done());
+  LogBucketEntry::create(alloc.view(), hk, value);
+}
+
+KangarooBucketStorage::Allocation LogBucket::allocate(HashedKey hk,
+                                                      BufferView value) {
+  const auto size = LogBucketEntry::computeSize(hk.key().size(), value.size());
+  XDCHECK_LE(size, storage_.remainingCapacity());
+
+  auto alloc = storage_.allocate(size);
+  XDCHECK(!alloc.done());
+  return alloc;
+}
+
+void LogBucket::clear() {
+  storage_.clear();
+}
+
+bool LogBucket::isSpace(HashedKey hk, BufferView value) {
+  const auto size = LogBucketEntry::computeSize(hk.key().size(), value.size());
+  const auto requiredSize = KangarooBucketStorage::slotSize(size);
+  XDCHECK_LE(requiredSize, storage_.capacity());
+
+  auto curFreeSpace = storage_.remainingCapacity();
+  return (curFreeSpace >= requiredSize);
+}
+
+uint32_t LogBucket::makeSpace(uint32_t size,
+                              const DestructorCallback& destructorCb) {
+  const auto requiredSize = KangarooBucketStorage::slotSize(size);
+  XDCHECK_LE(requiredSize, storage_.capacity());
+
+  auto curFreeSpace = storage_.remainingCapacity();
+  if (curFreeSpace >= requiredSize) {
+    return 0;
+  }
+
+  uint32_t evictions = 0;
+  auto itr = storage_.getFirst();
+  while (true) {
+    evictions++;
+
+    if (destructorCb) {
+      auto* entry = getIteratorEntry(itr);
+      destructorCb(entry->key(), entry->value(), DestructorEvent::Recycled);
+    }
+
+    curFreeSpace += KangarooBucketStorage::slotSize(itr.view().size());
+    if (curFreeSpace >= requiredSize) {
+      storage_.removeUntil(itr);
+      break;
+    }
+    itr = storage_.getNext(itr);
+    XDCHECK(!itr.done());
+  }
+  return evictions;
+}
+
+uint32_t LogBucket::remove(HashedKey hk, const DestructorCallback& destructorCb) {
+  auto itr = storage_.getFirst();
+  while (!itr.done()) {
+    auto* entry = getIteratorEntry(itr);
+    if (entry->keyEqualsTo(hk)) {
+      if (destructorCb) {
+        destructorCb(entry->key(), entry->value(), DestructorEvent::Removed);
+      }
+      storage_.remove(itr);
+      return 1;
+    }
+    itr = storage_.getNext(itr);
+  }
+  return 0;
+}
+
+void LogBucket::reorder(BitVectorReadVisitor isHitCallback) {
+  uint32_t keyIdx = 0;
+  auto itr = storage_.getFirst();
+  while (!itr.done()) {
+    auto* entry = getIteratorEntry(itr);
+    bool hit = isHitCallback(keyIdx);
+    if (hit) {
+      auto key = Buffer(entry->key());
+      auto value = Buffer(entry->value());
+      HashedKey hk = HashedKey(key.view());
+      BufferView valueView = value.view();
+      storage_.remove(itr);
+      const auto size = LogBucketEntry::computeSize(hk.key().size(), valueView.size());
+      auto alloc = storage_.allocate(size);
+      LogBucketEntry::create(alloc.view(), hk, valueView);
+    }
+
+    keyIdx++;
+    itr = storage_.getNext(itr);
+  }
+}
+
+LogBucket::Iterator LogBucket::getFirst() const {
+  return Iterator{storage_.getFirst()};
+}
+
+LogBucket::Iterator LogBucket::getNext(Iterator itr) const {
+  return Iterator{storage_.getNext(itr.itr_)};
+}
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
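[Editor's aside — illustrative sketch, not part of the patch.] makeSpace() above is strict FIFO eviction: walk from the oldest slot, reclaim until the new entry fits, and report how many entries were evicted so the caller can fire destructor callbacks. The same loop over a stand-in deque:

#include <cstdint>
#include <deque>
#include <utility>

// A deque of (id, slotBytes) stands in for KangarooBucketStorage here.
uint32_t makeSpace(std::deque<std::pair<int, uint32_t>>& slots,
                   uint32_t freeBytes,
                   uint32_t requiredBytes) {
  uint32_t evictions = 0;
  while (freeBytes < requiredBytes && !slots.empty()) {
    freeBytes += slots.front().second;  // reclaim the oldest slot
    slots.pop_front();                  // like removeUntil(oldest..this one)
    ++evictions;
  }
  return evictions;
}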
diff --git a/cachelib/navy/kangaroo/LogBucket.h b/cachelib/navy/kangaroo/LogBucket.h
new file mode 100644
index 0000000000..4807c51dc1
--- /dev/null
+++ b/cachelib/navy/kangaroo/LogBucket.h
@@ -0,0 +1,119 @@
+#pragma once
+
+#include <cstdint>
+
+#include <folly/Portability.h>
+
+#include "cachelib/navy/kangaroo/KangarooBucketStorage.h"
+#include "cachelib/navy/kangaroo/Types.h"
+#include "cachelib/navy/common/Buffer.h"
+#include "cachelib/navy/common/Hash.h"
+#include "cachelib/navy/common/Types.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+
+// Kangaroo is a series of buckets where each item is hashed to one of the
+// buckets. A bucket is the fundamental unit of read and write onto the device.
+// On read, we read an entire bucket from device and then search for the key
+// we need. On write, we read the entire bucket first to insert the new entry
+// in memory and then write back to the device. Same for remove.
+//
+// To ensure the validity of a bucket, on reading, we first check if the
+// checksum is what we expect. If it's unexpected, we will reinitialize
+// the bucket, finish our operation, compute a new checksum, and finally
+// store the checksum in the bucket. The checksum protects us from any device
+// corruption. In addition, on first start-up, this is a convenient way
+// to let us know that a bucket has not been initialized.
+//
+// Each bucket has a generation timestamp associated with it. On reading, the
+// user must ensure the generation is what they expect. E.g. in Kangaroo, to
+// trigger an ice roll, we'll update the global generation and then on the next
+// startup, we'll lazily invalidate each bucket as we read it, as the
+// generation will be a mismatch.
+class FOLLY_PACK_ATTR LogBucket {
+ public:
+  // Iterator over the bucket's items.
+  class Iterator {
+   public:
+    bool done() const { return itr_.done(); }
+
+    BufferView key() const;
+    uint64_t keyHash() const;
+    BufferView value() const;
+
+    bool keyEqualsTo(HashedKey hk) const;
+    bool keyEqualsTo(uint64_t keyHash) const;
+
+   private:
+    friend LogBucket;
+
+    Iterator() = default;
+    explicit Iterator(KangarooBucketStorage::Allocation itr) : itr_{itr} {}
+
+    KangarooBucketStorage::Allocation itr_;
+  };
+
+  // The user passes in a view that contains the memory that is a LogBucket
+  static uint32_t computeChecksum(BufferView view);
+
+  // Initialize a brand new LogBucket given a piece of memory in the case
+  // that the existing bucket is invalid (i.e. a checksum or generation
+  // mismatch). Refer to the comments at the top on what we use checksum
+  // and generation time for.
+  static LogBucket& initNew(MutableBufferView view, uint64_t generationTime);
+
+  uint32_t getChecksum() const { return checksum_; }
+
+  void setChecksum(uint32_t checksum) { checksum_ = checksum; }
+
+  uint64_t generationTime() const { return generationTime_; }
+
+  uint32_t size() const { return storage_.numAllocations(); }
+
+  // Look up the value corresponding to a key.
+  // BufferView::isNull() == true if not found.
+  BufferView find(HashedKey hk) const;
+
+  BufferView findTag(uint32_t tag, HashedKey& hk) const;
+
+  // Note: this does *not* replace an existing key! User must make sure to
+  // remove an existing key before calling insert.
+  //
+  // Insert into the bucket. Triggers eviction and invokes @destructorCb if
+  // there is not enough space. Returns the number of entries evicted.
+  uint32_t insert(HashedKey hk,
+                  BufferView value,
+                  const DestructorCallback& destructorCb);
+
+  // Remove an entry corresponding to the key. If found, invoke @destructorCb
+  // before returning true. Return the number of entries removed.
+  uint32_t remove(HashedKey hk, const DestructorCallback& destructorCb);
+
+  // Reorders entries in the bucket based on NRU bit vector callback results
+  void reorder(BitVectorReadVisitor isHitCallback);
+
+  // Needed for log buckets; allocate does not remove objects
+  bool isSpace(HashedKey hk, BufferView value);
+  KangarooBucketStorage::Allocation allocate(HashedKey hk, BufferView value);
+  void insert(KangarooBucketStorage::Allocation alloc, HashedKey hk, BufferView value);
+  void clear();
+
+  Iterator getFirst() const;
+  Iterator getNext(Iterator itr) const;
+
+ private:
+  LogBucket(uint64_t generationTime, uint32_t capacity)
+      : generationTime_{generationTime}, storage_{capacity} {}
+
+  // Reserve enough space for @size by evicting. Return the number of evictions.
+
+  // Look up the value corresponding to a key.
+  // BufferView::isNull() == true if not found.
+  BufferView find(HashedKey hk) const;
+
+  BufferView findTag(uint32_t tag, HashedKey& hk) const;
+
+  // Note: this does *not* replace an existing key! User must make sure to
+  // remove an existing key before calling insert.
+  //
+  // Insert into the bucket. Trigger eviction and invoke @destructorCb if
+  // not enough space. Return number of entries evicted.
+  uint32_t insert(HashedKey hk,
+                  BufferView value,
+                  const DestructorCallback& destructorCb);
+
+  // Remove an entry corresponding to the key. If found, invoke @destructorCb
+  // before removing. Return number of entries removed.
+  uint32_t remove(HashedKey hk, const DestructorCallback& destructorCb);
+
+  // Reorders entries in bucket based on NRU bit vector callback results
+  void reorder(BitVectorReadVisitor isHitCallback);
+
+  // Needed for log buckets: allocate does not evict or remove objects, so
+  // callers must check isSpace() first.
+  bool isSpace(HashedKey hk, BufferView value);
+  KangarooBucketStorage::Allocation allocate(HashedKey hk, BufferView value);
+  void insert(KangarooBucketStorage::Allocation alloc, HashedKey hk, BufferView value);
+  void clear();
+
+  Iterator getFirst() const;
+  Iterator getNext(Iterator itr) const;
+
+ private:
+  LogBucket(uint64_t generationTime, uint32_t capacity)
+      : generationTime_{generationTime}, storage_{capacity} {}
+
+  // Reserve enough space for @size by evicting. Return number of evictions.
+  uint32_t makeSpace(uint32_t size, const DestructorCallback& destructorCb);
+
+  uint32_t checksum_{};
+  uint64_t generationTime_{};
+  KangarooBucketStorage storage_;
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
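
Per the checksum/generation comment at the top of this header, a reader is expected to validate both fields before trusting a bucket read from the device. A sketch of that check (editor's illustration; `buf` and `expectedGeneration` are assumed names):

    auto& bucket = *reinterpret_cast<LogBucket*>(buf.mutableView().data());
    if (bucket.getChecksum() != LogBucket::computeChecksum(buf.view()) ||
        bucket.generationTime() != expectedGeneration) {
      // corrupt or stale: reinitialize and treat the bucket as empty
      LogBucket::initNew(buf.mutableView(), expectedGeneration);
    }
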
diff --git a/cachelib/navy/kangaroo/LogIndex.cpp b/cachelib/navy/kangaroo/LogIndex.cpp
new file mode 100644
index 0000000000..6def034cac
--- /dev/null
+++ b/cachelib/navy/kangaroo/LogIndex.cpp
@@ -0,0 +1,172 @@
+#include <cstdint>
+#include <memory>
+
+#include "cachelib/navy/kangaroo/LogIndex.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+
+LogIndex::LogIndex(uint64_t numSlots, SetNumberCallback setNumberCb)
+    : setNumberCb_{setNumberCb},
+      numSlots_{numSlots} {
+  index_ = new LogIndexEntry[numSlots];
+  for (uint64_t i = 0; i < numSlots_; i++) {
+    index_[i].hits_ = 0;
+    index_[i].valid_ = 0;
+  }
+}
+
+LogIndex::~LogIndex() {
+  // index_ was allocated with new[], so it must be released with delete[].
+  delete[] index_;
+}
+
+LogPageId LogIndex::lookup(HashedKey hk, bool hit) {
+  const uint32_t offset = getLogIndexOffset(hk);
+  uint32_t increment = 0;
+  uint32_t tag = createTag(hk);
+  LogIndexEntry* currentHead = &index_[(offset + increment) % numSlots_];
+  while (currentHead->continueIteration() && increment < numSlots_) {
+    if (currentHead->isValid() &&
+        currentHead->tag() == tag) {
+      if (hit) {
+        currentHead->incrementHits();
+      }
+      return currentHead->page();
+    }
+    increment++;
+    currentHead = &index_[(offset + increment) % numSlots_];
+  }
+  return LogPageId(0, false);
+}
+
+Status LogIndex::insert(HashedKey hk, LogPageId lpid, uint8_t hits) {
+  const auto offset = getLogIndexOffset(hk);
+  uint32_t increment = 0;
+  uint32_t tag = createTag(hk);
+  LogIndexEntry* entry = &index_[(offset + increment) % numSlots_];
+  while (increment < numSlots_) {
+    // Either reuse an invalid slot or overwrite the existing mapping for
+    // this tag; NotFound signals the caller that a mapping was replaced.
+    if (entry->tag() == tag || !entry->isValid()) {
+      Status ret;
+      if (entry->isValid()) {
+        ret = Status::NotFound;
+      } else {
+        ret = Status::Ok;
+      }
+      entry->tag_ = tag;
+      entry->flash_index_ = lpid.index();
+      entry->valid_ = 1;
+      entry->hits_ = hits;
+      return ret;
+    }
+    increment++;
+    entry = &index_[(offset + increment) % numSlots_];
+  }
+  return Status::Rejected;
+}
+
+Status LogIndex::remove(HashedKey hk, LogPageId lpid) {
+  const auto offset = getLogIndexOffset(hk);
+  uint32_t increment = 0;
+  uint32_t tag = createTag(hk);
+  LogIndexEntry* entry = &index_[(offset + increment) % numSlots_];
+  while (increment < numSlots_) {
+    if (entry->isValid() && entry->tag() == tag && lpid == entry->page()) {
+      // invalidate() leaves a tombstone so probe chains stay intact; only
+      // clear() the slot if the next entry ends the chain anyway.
+      entry->invalidate();
+      if (!index_[(offset + increment + 1) % numSlots_].continueIteration()) {
+        entry->clear();
+      }
+      return Status::Ok;
+    }
+    increment++;
+    entry = &index_[(offset + increment) % numSlots_];
+  }
+  return Status::NotFound;
+}
+
+// Counts number of items in log corresponding to bucket
+uint64_t LogIndex::countBucket(HashedKey hk) {
+  const auto offset = getLogIndexOffset(hk);
+  uint32_t increment = 0;
+  LogIndexEntry* entry = &index_[(offset + increment) % numSlots_];
+  uint64_t count = 0;
+  while (increment < numSlots_) {
+    if (!entry->continueIteration()) {
+      break;
+    } else if (entry->isValid()) {
+      count++;
+    }
+    increment++;
+    entry = &index_[(offset + increment) % numSlots_];
+  }
+  return count;
+}
+
+// Get iterator for all items in the same bucket
+LogIndex::BucketIterator LogIndex::getHashBucketIterator(HashedKey hk) {
+  const auto offset = getLogIndexOffset(hk);
+  uint32_t increment = 0;
+  LogIndexEntry* entry = &index_[(offset + increment) % numSlots_];
+  auto idx = setNumberCb_(hk.keyHash());
+  while (increment < numSlots_) {
+    if (!entry->continueIteration()) {
+      break;
+    } else if (entry->isValid()) {
+      return BucketIterator(idx, entry, increment);
+    }
+    increment++;
+    entry = &index_[(offset + increment) % numSlots_];
+  }
+  return BucketIterator();
+}
+
+LogIndex::BucketIterator LogIndex::getNext(LogIndex::BucketIterator bi) {
+  auto offset = getLogIndexOffsetFromSetBucket(bi.bucket_);
+  uint32_t increment = bi.increment_ + 1;
+  LogIndexEntry* entry = &index_[(offset + increment) % numSlots_];
+  while (increment < numSlots_) {
+    if (!entry->continueIteration()) {
+      break;
+    } else if (entry->isValid()) {
+      return BucketIterator(bi.bucket_, entry, increment);
+    }
+    increment++;
+    entry = &index_[(offset + increment) % numSlots_];
+  }
+  return BucketIterator();
+}
+
+LogPageId LogIndex::findAndRemove(KangarooBucketId bid, uint32_t tag) {
+  auto offset = getLogIndexOffsetFromSetBucket(bid);
+  uint32_t increment = 0;
+  LogIndexEntry* entry = &index_[(offset + increment) % numSlots_];
+  while (increment < numSlots_) {
+    if (entry->isValid() && entry->tag() == tag) {
+      LogPageId lpid = entry->page();
+      entry->invalidate();
+      if (!index_[(offset + increment + 1) % numSlots_].continueIteration()) {
+        entry->clear();
+      }
+      return lpid;
+    }
+    increment++;
+    entry = &index_[(offset + increment) % numSlots_];
+  }
+  return LogPageId(0, false);
+}
+
+uint32_t LogIndex::getLogIndexOffset(HashedKey hk) {
+  return getLogIndexOffsetFromSetBucket(setNumberCb_(hk.keyHash()));
+}
+
+uint32_t LogIndex::getLogIndexOffset(uint64_t key) {
+  return getLogIndexOffsetFromSetBucket(setNumberCb_(key));
+}
+
+uint32_t LogIndex::getLogIndexOffsetFromSetBucket(KangarooBucketId bid) {
+  return bid.index() % numSlots_;
+}
+
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
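
The index is a linear-probing table keyed by the set bucket, so all entries for one set bucket sit in one contiguous probe chain, and remove() leaves a tombstone (valid_ = 0, hits_ = 1) so later probes keep walking past the freed slot. A short usage trace (editor's sketch; `makeHK` stands in for whatever builds a HashedKey, and the modulo set mapping is illustrative only):

    LogIndex index(1024, [](uint64_t keyHash) {
      return KangarooBucketId(static_cast<uint32_t>(keyHash % 64));
    });
    HashedKey hk = makeHK("apple");
    index.insert(hk, LogPageId(42, true));            // first free slot in the chain
    LogPageId lpid = index.lookup(hk, /*hit=*/true);  // same chain, bumps the hit count
    // lpid.isValid() == true, lpid.index() == 42
    index.remove(hk, lpid);                           // tombstones the slot
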
diff --git a/cachelib/navy/kangaroo/LogIndex.h b/cachelib/navy/kangaroo/LogIndex.h
new file mode 100644
index 0000000000..7601177d5a
--- /dev/null
+++ b/cachelib/navy/kangaroo/LogIndex.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include <cstdint>
+
+#include <functional>
+
+#include "cachelib/navy/common/Hash.h"
+#include "cachelib/navy/common/Types.h"
+#include "cachelib/navy/kangaroo/Types.h"
+#include "cachelib/navy/kangaroo/LogIndexEntry.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+// LogIndex is an open-addressing, hash-based log index optimized for
+// easy threshold lookups. It uses linear probing.
+//
+// Requires the user to handle synchronization.
+class LogIndex {
+ public:
+  // BucketIterator gives the hashed key for each valid entry
+  // corresponding to a given kangaroo bucket.
+  // Read only.
+  class BucketIterator {
+   public:
+    BucketIterator() : bucket_{0}, current_entry_{nullptr}, increment_{0}, end_{true} {}
+
+    bool done() const { return end_; }
+
+    uint32_t hits() const { return current_entry_->hits(); }
+
+    LogPageId page() const { return current_entry_->page(); }
+
+    uint32_t tag() const { return current_entry_->tag(); }
+
+   private:
+    friend LogIndex;
+
+    BucketIterator(KangarooBucketId id, LogIndexEntry* firstKey, uint32_t increment)
+        : bucket_{id}, current_entry_{firstKey}, increment_{increment} {}
+
+    KangarooBucketId bucket_;
+    LogIndexEntry* current_entry_;
+    uint32_t increment_;
+    bool end_{false};
+  };
+
+  explicit LogIndex(uint64_t numSlots, SetNumberCallback setNumberCb);
+
+  ~LogIndex();
+
+  LogIndex(const LogIndex&) = delete;
+  LogIndex& operator=(const LogIndex&) = delete;
+
+  // Look up a key in the index.
+  // If not found, the returned LogPageId will not be valid.
+  LogPageId lookup(HashedKey hk, bool hit);
+
+  // Inserts a key into the index. Will reject the request
+  // if the index has no room.
+  Status insert(HashedKey hk, LogPageId lpid, uint8_t hits = 0);
+
+  // Clears the entry's valid bit if the key is in the log
+  Status remove(HashedKey hk, LogPageId lpid);
+
+  LogPageId findAndRemove(KangarooBucketId bid, uint32_t tag);
+
+  // Counts the number of items in the log corresponding to the set
+  // bucket for the hashed key
+  uint64_t countBucket(HashedKey hk);
+
+  // Get an iterator for all items in the same bucket
+  BucketIterator getHashBucketIterator(HashedKey hk);
+  BucketIterator getNext(BucketIterator bi);
+
+ private:
+  friend BucketIterator;
+
+  uint32_t getLogIndexOffset(HashedKey hk);
+  uint32_t getLogIndexOffset(uint64_t hk);
+  uint32_t getLogIndexOffsetFromSetBucket(KangarooBucketId bid);
+
+  const SetNumberCallback setNumberCb_{};
+  uint64_t numSlots_;
+  LogIndexEntry* index_;
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/navy/kangaroo/LogIndexEntry.h b/cachelib/navy/kangaroo/LogIndexEntry.h
new file mode 100644
index 0000000000..d9a864f6e9
--- /dev/null
+++ b/cachelib/navy/kangaroo/LogIndexEntry.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "cachelib/navy/common/Hash.h"
+#include "cachelib/navy/common/Types.h"
+#include "cachelib/navy/kangaroo/Types.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+
+class LogIndex;
+
+class LogIndexEntry {
+ public:
+  explicit LogIndexEntry(HashedKey hk, LogPageId lpid)
+      : tag_{createTag(hk)}, flash_index_{lpid.index()}, valid_{1}, hits_{0} {}
+  LogIndexEntry() : valid_{false} {}
+  ~LogIndexEntry() = default;
+
+  bool operator==(const LogIndexEntry& rhs) const noexcept {
+    return valid_ && rhs.valid_ && tag_ == rhs.tag_;
+  }
+  bool operator!=(const LogIndexEntry& rhs) const noexcept {
+    return !(*this == rhs);
+  }
+
+  void incrementHits() { if (hits_ < ((1 << 3) - 1)) { hits_++; } }
+  uint32_t hits() const { return hits_; }
+  uint32_t tag() const { return tag_; }
+  // Keep hits_ non-zero so the slot acts as a tombstone: probe chains walk
+  // past it (see continueIteration) until a neighboring slot clears it.
+  void invalidate() { valid_ = 0; hits_ = 1; }
+  void clear() { hits_ = 0; valid_ = 0; }
+  bool isValid() const { return valid_; }
+  bool continueIteration() const { return isValid() || hits_; }
+  LogPageId page() const { return LogPageId(flash_index_, valid_); }
+
+ private:
+  friend LogIndex;
+
+  uint32_t tag_ : 9;
+  uint32_t flash_index_ : 19;
+  uint32_t valid_ : 1;
+  uint32_t hits_ : 3;
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
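
The four bit-fields are sized to share a single 32-bit word (9 tag + 19 flash index + 1 valid + 3 hits = 32), which is what keeps the in-memory index at four bytes per slot. A compile-time guard would make that assumption explicit (editor's suggestion, not in the patch):

    static_assert(sizeof(LogIndexEntry) == 4,
                  "tag/flash_index/valid/hits must pack into one uint32_t");
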
diff --git a/cachelib/navy/kangaroo/RripBitVector.cpp b/cachelib/navy/kangaroo/RripBitVector.cpp
new file mode 100644
index 0000000000..29720cf5fc
--- /dev/null
+++ b/cachelib/navy/kangaroo/RripBitVector.cpp
@@ -0,0 +1,47 @@
+#include <folly/logging/xlog.h>
+#include <memory>
+
+#include "cachelib/navy/common/Utils.h"
+#include "cachelib/navy/kangaroo/RripBitVector.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+
+namespace {
+// Masks must be 32 bits wide: each vector is a uint32_t and keyIdx can be
+// anywhere in [0, 32).
+uint32_t bitMask(uint32_t bitIdx) { return 1u << bitIdx; }
+
+void bitSet(uint32_t& bits, uint32_t bitIdx) { bits |= bitMask(bitIdx); }
+bool bitGet(uint32_t& bits, uint32_t bitIdx) { return bits & bitMask(bitIdx); }
+} // namespace
+
+RripBitVector::RripBitVector(uint32_t numVectors)
+    : numVectors_{numVectors},
+      bits_{std::make_unique<uint32_t[]>(numVectors_)} {
+  // Don't have to worry about @bits_ memory:
+  // make_unique value-initializes the array to zero.
+}
+
+void RripBitVector::set(uint32_t bucketIdx, uint32_t keyIdx) {
+  XDCHECK_LT(bucketIdx, numVectors_);
+  bitSet(bits_[bucketIdx], keyIdx);
+}
+
+bool RripBitVector::get(uint32_t bucketIdx, uint32_t keyIdx) {
+  XDCHECK_LT(bucketIdx, numVectors_);
+  if (keyIdx >= vectorSize_ * 8) {
+    return false;
+  }
+  return bitGet(bits_[bucketIdx], keyIdx);
+}
+
+void RripBitVector::clear(uint32_t bucketIdx) {
+  XDCHECK_LT(bucketIdx, numVectors_);
+  bits_[bucketIdx] = 0u;
+}
+
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/navy/kangaroo/RripBitVector.h b/cachelib/navy/kangaroo/RripBitVector.h
new file mode 100644
index 0000000000..28a99a2c7a
--- /dev/null
+++ b/cachelib/navy/kangaroo/RripBitVector.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+// Kangaroo uses bit vectors to keep track of whether objects in a bucket
+// have been hit. They are by default emptied every time the bucket is
+// rewritten (ie on inserts or removes).
+//
+// Thread safe if the user guards operations to a bucket.
+class RripBitVector {
+ public:
+  // Creates @numVectors bit vectors, each of which tracks 32 bits
+  // (ie up to 32 objects per bucket).
+  explicit RripBitVector(uint32_t numVectors);
+
+  // Not copyable
+  RripBitVector(const RripBitVector&) = delete;
+  RripBitVector& operator=(const RripBitVector&) = delete;
+  RripBitVector(RripBitVector&&) = default;
+  RripBitVector& operator=(RripBitVector&&) = default;
+
+  // clear all bit vectors
+  void reset();
+
+  // For all operations below:
+  // @bucketIdx   Index of bit vector to make op on
+  // @keyIdx      Index of key within bucket
+  //
+  // Doesn't check bounds, like vector. Only asserts.
+  void set(uint32_t bucketIdx, uint32_t keyIdx);
+  bool get(uint32_t bucketIdx, uint32_t keyIdx);
+
+  // Zeroes the bit vector for one bucket
+  void clear(uint32_t bucketIdx);
+
+  uint32_t numVectors() const { return numVectors_; }
+
+  size_t getByteSize() const { return numVectors_ * vectorSize_; }
+
+ private:
+  const uint32_t numVectors_{};
+  const uint32_t vectorSize_ = 4;
+  std::unique_ptr<uint32_t[]> bits_;
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
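
Each vector is one uint32_t, so a bucket can track hits for up to 32 objects; out-of-range key indices simply read back as misses (see get() above). Intended use, assuming one vector per on-device bucket:

    RripBitVector bv{/*numVectors=*/1024};
    bv.set(7, 3);              // object at keyIdx 3 in bucket 7 was hit
    bool hit = bv.get(7, 3);   // true
    bool oob = bv.get(7, 40);  // false: keyIdx 40 is outside the 32 tracked bits
    bv.clear(7);               // bucket 7 was rewritten; forget its hit bits
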
diff --git a/cachelib/navy/kangaroo/RripBucket.cpp b/cachelib/navy/kangaroo/RripBucket.cpp
new file mode 100644
index 0000000000..eb170181b9
--- /dev/null
+++ b/cachelib/navy/kangaroo/RripBucket.cpp
@@ -0,0 +1,259 @@
+#include <folly/logging/xlog.h>
+#include <new>
+
+#include "cachelib/navy/kangaroo/RripBucket.h"
+#include "cachelib/navy/common/Hash.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+static_assert(sizeof(RripBucket) == 24,
+              "RripBucket overhead. If this changes, you may have to adjust the "
+              "sizes used in unit tests.");
+
+namespace {
+// This maps to exactly how an entry is stored in a bucket on device.
+class FOLLY_PACK_ATTR RripBucketEntry {
+ public:
+  static uint32_t computeSize(uint32_t keySize, uint32_t valueSize) {
+    return sizeof(RripBucketEntry) + keySize + valueSize;
+  }
+
+  static RripBucketEntry& create(MutableBufferView storage,
+                                 HashedKey hk,
+                                 BufferView value) {
+    new (storage.data()) RripBucketEntry{hk, value};
+    return reinterpret_cast<RripBucketEntry&>(*storage.data());
+  }
+
+  BufferView key() const { return {keySize_, data_}; }
+
+  bool keyEqualsTo(HashedKey hk) const {
+    return hk == HashedKey::precomputed(key(), keyHash_);
+  }
+
+  bool keyEqualsTo(uint64_t keyHash) const {
+    return keyHash == keyHash_;
+  }
+
+  uint64_t keyHash() const { return keyHash_; }
+
+  BufferView value() const { return {valueSize_, data_ + keySize_}; }
+
+ private:
+  RripBucketEntry(HashedKey hk, BufferView value)
+      : keySize_{static_cast<uint16_t>(hk.key().size())},
+        valueSize_{static_cast<uint16_t>(value.size())},
+        keyHash_{hk.keyHash()} {
+    static_assert(sizeof(RripBucketEntry) == 12, "RripBucketEntry overhead");
+    hk.key().copyTo(data_);
+    value.copyTo(data_ + keySize_);
+  }
+
+  const uint16_t keySize_{};
+  const uint16_t valueSize_{};
+  const uint64_t keyHash_{};
+  uint8_t data_[];
+};
+
+const RripBucketEntry* getIteratorEntry(RripBucketStorage::Allocation itr) {
+  return reinterpret_cast<const RripBucketEntry*>(itr.view().data());
+}
+} // namespace
+
+BufferView RripBucket::Iterator::key() const {
+  return getIteratorEntry(itr_)->key();
+}
+
+uint64_t RripBucket::Iterator::keyHash() const {
+  return getIteratorEntry(itr_)->keyHash();
+}
+
+BufferView RripBucket::Iterator::value() const {
+  return getIteratorEntry(itr_)->value();
+}
+
+bool RripBucket::Iterator::keyEqualsTo(HashedKey hk) const {
+  return getIteratorEntry(itr_)->keyEqualsTo(hk);
+}
+
+bool RripBucket::Iterator::keyEqualsTo(uint64_t keyHash) const {
+  return getIteratorEntry(itr_)->keyEqualsTo(keyHash);
+}
+
+uint32_t RripBucket::computeChecksum(BufferView view) {
+  constexpr auto kChecksumStart = sizeof(checksum_);
+  auto data = view.slice(kChecksumStart, view.size() - kChecksumStart);
+  return navy::checksum(data);
+}
+
+RripBucket& RripBucket::initNew(MutableBufferView view, uint64_t generationTime) {
+  return *new (view.data())
+      RripBucket(generationTime, view.size() - sizeof(RripBucket));
+}
+
+BufferView RripBucket::find(HashedKey hk, BitVectorUpdateVisitor addHitCallback) const {
+  auto itr = storage_.getFirst();
+  uint32_t keyIdx = 0;
+  while (!itr.done()) {
+    auto* entry = getIteratorEntry(itr);
+    if (entry->keyEqualsTo(hk)) {
+      if (addHitCallback) {
+        addHitCallback(keyIdx);
+      }
+      return entry->value();
+    }
+    itr = storage_.getNext(itr);
+    keyIdx++;
+  }
+  return {};
+}
+
+/* Unused sketch of a lookup by key hash only; kept for reference.
+Status RripBucket::findKey(uint64_t keyHash, HashedKey& hk) const {
+  auto itr = storage_.getFirst();
+  while (!itr.done()) {
+    auto* entry = getIteratorEntry(itr);
+    if (entry->keyEqualsTo(keyHash)) {
+      hk = HashedKey(entry->key());
+      return Status::Ok;
+    }
+    itr = storage_.getNext(itr);
+  }
+  return Status::NotFound;
+}*/
+
+uint32_t RripBucket::insert(HashedKey hk,
+                            BufferView value,
+                            uint8_t hits,
+                            const DestructorCallback& destructorCb) {
+  const auto size = RripBucketEntry::computeSize(hk.key().size(), value.size());
+  XDCHECK_LE(size, storage_.capacity());
+
+  const auto evictions = makeSpace(size, destructorCb);
+  uint8_t rrip = getRripValue(hits);
+  auto alloc = storage_.allocate(size, rrip);
+  XDCHECK(!alloc.done());
+  RripBucketEntry::create(alloc.view(), hk, value);
+
+  return evictions;
+}
+
+bool RripBucket::isSpace(HashedKey hk, BufferView value, uint8_t hits) {
+  const auto size = RripBucketEntry::computeSize(hk.key().size(), value.size());
+  const auto requiredSize = RripBucketStorage::slotSize(size);
+  XDCHECK_LE(requiredSize, storage_.capacity());
+
+  auto curFreeSpace = storage_.remainingCapacity();
+  uint8_t rrip = getRripValue(hits);
+
+  // Walk from the front (the coldest, highest-rrip entries): the insert
+  // only fits if enough space can be reclaimed without evicting any entry
+  // warmer (lower rrip) than the candidate itself.
+  auto itr = storage_.getFirst();
+  while (curFreeSpace < requiredSize) {
+    if (itr.done()) {
+      return false;
+    } else if (itr.rrip() < rrip) {
+      return false;
+    }
+    curFreeSpace += RripBucketStorage::slotSize(itr.view().size());
+    itr = storage_.getNext(itr);
+  }
+  return (curFreeSpace >= requiredSize);
+}
+
+uint32_t RripBucket::makeSpace(uint32_t size,
+                               const DestructorCallback& destructorCb) {
+  const auto requiredSize = RripBucketStorage::slotSize(size);
+  XDCHECK_LE(requiredSize, storage_.capacity());
+
+  auto curFreeSpace = storage_.remainingCapacity();
+  if (curFreeSpace >= requiredSize) {
+    return 0;
+  }
+
+  uint32_t evictions = 0;
+  auto itr = storage_.getFirst();
+  while (true) {
+    evictions++;
+
+    if (destructorCb) {
+      auto* entry = getIteratorEntry(itr);
+      destructorCb(entry->key(), entry->value(), DestructorEvent::Recycled);
+    }
+
+    curFreeSpace += RripBucketStorage::slotSize(itr.view().size());
+    if (curFreeSpace >= requiredSize) {
+      storage_.removeUntil(itr);
+      break;
+    }
+    itr = storage_.getNext(itr);
+    XDCHECK(!itr.done());
+  }
+  return evictions;
+}
+
+uint32_t RripBucket::remove(HashedKey hk, const DestructorCallback& destructorCb) {
+  auto itr = storage_.getFirst();
+  while (!itr.done()) {
+    auto* entry = getIteratorEntry(itr);
+    if (entry->keyEqualsTo(hk)) {
+      if (destructorCb) {
+        destructorCb(entry->key(), entry->value(), DestructorEvent::Removed);
+      }
+      storage_.remove(itr);
+      return 1;
+    }
+    itr = storage_.getNext(itr);
+  }
+  return 0;
+}
+
+void RripBucket::reorder(BitVectorReadVisitor isHitCallback) {
+  uint32_t keyIdx = 0;
+  Buffer firstMovedKey;
+  bool movedKey = false;
+  int8_t increment = -1;
+
+  auto itr = storage_.getFirst();
+  while (!itr.done()) {
+    auto* entry = getIteratorEntry(itr);
+    bool hit = isHitCallback(keyIdx);
+    if (hit) {
+      auto key = Buffer(entry->key());
+      // Remember the first re-inserted key: once iteration reaches it again
+      // at the tail, every entry has been visited.
+      if (!movedKey) {
+        movedKey = true;
+        firstMovedKey = key.copy();
+      } else if (firstMovedKey.view() == key.view()) {
+        break;
+      }
+
+      auto value = Buffer(entry->value());
+      HashedKey hk = HashedKey(key.view());
+      BufferView valueView = value.view();
+      itr = storage_.remove(itr);
+      const auto size = RripBucketEntry::computeSize(hk.key().size(), valueView.size());
+      // promotion rrip so a hit promotes to 0
+      auto alloc = storage_.allocate(size, 0);
+      RripBucketEntry::create(alloc.view(), hk, valueView);
+    } else {
+      // Age every non-hit entry by the same amount: the gap between the
+      // highest rrip in the bucket (the first non-hit entry, since entries
+      // are ordered by descending rrip) and the 3-bit maximum of 7.
+      if (increment == -1) {
+        increment = (int8_t) getIncrement(itr.rrip());
+      }
+      storage_.incrementRrip(itr, increment);
+      itr = storage_.getNext(itr);
+    }
+
+    keyIdx++;
+  }
+}
+
+RripBucket::Iterator RripBucket::getFirst() const {
+  return Iterator{storage_.getFirst()};
+}
+
+RripBucket::Iterator RripBucket::getNext(Iterator itr) const {
+  return Iterator{storage_.getNext(itr.itr_)};
+}
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
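
reorder() is where the RRIP policy meets the bit vector: hit entries are re-inserted at rrip 0 (most protected) and all other entries are aged by a single increment. Hypothetical glue code for a bucket rewrite (editor's sketch; `buf`, `bucketIdx`, and `bitVector` are assumed names):

    auto& bucket = *reinterpret_cast<RripBucket*>(buf.mutableView().data());
    bucket.reorder([&](uint32_t keyIdx) { return bitVector.get(bucketIdx, keyIdx); });
    bucket.setChecksum(RripBucket::computeChecksum(buf.view()));
    bitVector.clear(bucketIdx);  // hit bits are only meaningful until the rewrite
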
diff --git a/cachelib/navy/kangaroo/RripBucket.h b/cachelib/navy/kangaroo/RripBucket.h
new file mode 100644
index 0000000000..1206e6ea89
--- /dev/null
+++ b/cachelib/navy/kangaroo/RripBucket.h
@@ -0,0 +1,133 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <folly/Portability.h>
+
+#include "cachelib/navy/kangaroo/RripBucketStorage.h"
+#include "cachelib/navy/kangaroo/Types.h"
+#include "cachelib/navy/common/Buffer.h"
+#include "cachelib/navy/common/Hash.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+
+// A bucket is the fundamental unit of read and write onto the device.
+// On read, we read an entire bucket from device and then search for the key
+// we need. On write, we read the entire bucket first to insert the new entry
+// in memory and then write back to the device. Same for remove.
+//
+// To ensure the validity of a bucket, on reading, we first check if the
+// checksum is what we expect. If it's unexpected, we will reinitialize
+// the bucket, finish our operation, compute a new checksum, and finally
+// store the checksum in the bucket. Checksum protects us from any device
+// corruption. In addition, on first start-up, this is a convenient way
+// to let us know a bucket had not been initialized.
+//
+// Each bucket has a generation timestamp associated with it. On reading, the
+// user must ensure the generation is what they expect. E.g. in BigHash, to
+// trigger an ice roll, we'll update the global generation and then on next
+// startup, we'll lazily invalidate each bucket as we read it as the
+// generation will be a mismatch.
+class FOLLY_PACK_ATTR RripBucket {
+ public:
+  // Iterator to bucket's items.
+  class Iterator {
+   public:
+    bool done() const { return itr_.done(); }
+
+    BufferView key() const;
+    uint64_t keyHash() const;
+    BufferView value() const;
+
+    bool keyEqualsTo(HashedKey hk) const;
+    bool keyEqualsTo(uint64_t keyHash) const;
+
+   private:
+    friend RripBucket;
+
+    Iterator() = default;
+    explicit Iterator(RripBucketStorage::Allocation itr) : itr_{itr} {}
+
+    RripBucketStorage::Allocation itr_;
+  };
+
+  // User will pass in a view that contains the memory that is a RripBucket
+  static uint32_t computeChecksum(BufferView view);
+
+  // Initialize a brand new RripBucket given a piece of memory in the case
+  // that the existing bucket is invalid (i.e. checksum or generation
+  // mismatch). Refer to comments at the top on what we use checksum
+  // and generation time for.
+  static RripBucket& initNew(MutableBufferView view, uint64_t generationTime);
+
+  uint32_t getChecksum() const { return checksum_; }
+
+  void setChecksum(uint32_t checksum) { checksum_ = checksum; }
+
+  uint64_t generationTime() const { return generationTime_; }
+
+  uint32_t size() const { return storage_.numAllocations(); }
+
+  // Look up the value corresponding to a key.
+  // BufferView::isNull() == true if not found.
+  BufferView find(HashedKey hk, BitVectorUpdateVisitor setHitCallback) const;
+
+  //HashedKey findKey(uint64_t key_hash) const;
+
+  // Note: this does *not* replace an existing key! User must make sure to
+  // remove an existing key before calling insert.
+  //
+  // Insert into the bucket. Trigger eviction and invoke @destructorCb if
+  // not enough space. Return number of entries evicted.
+  uint32_t insert(HashedKey hk,
+                  BufferView value,
+                  uint8_t hits,
+                  const DestructorCallback& destructorCb);
+
+  // Remove an entry corresponding to the key. If found, invoke @destructorCb
+  // before removing. Return number of entries removed.
+  uint32_t remove(HashedKey hk, const DestructorCallback& destructorCb);
+
+  // Reorders entries in bucket based on RRIP
+  void reorder(BitVectorReadVisitor isHitCallback);
+
+  Iterator getFirst() const;
+  Iterator getNext(Iterator itr) const;
+
+  // Checks if there is space for an object given its hit priority: returns
+  // false if making room would require evicting an entry with a lower
+  // (warmer) rrip value than the object would be inserted with.
+  // Use 0 hits if there is no log.
+  bool isSpace(HashedKey hk, BufferView value, uint8_t hits);
+
+ private:
+  RripBucket(uint64_t generationTime, uint32_t capacity)
+      : generationTime_{generationTime}, storage_{capacity} {}
+
+  // Reserve enough space for @size by evicting. Return number of evictions.
+  uint32_t makeSpace(uint32_t size, const DestructorCallback& destructorCb);
+
+  uint8_t getRripValue(uint8_t hits) const {
+    uint8_t start = (1 << 3) - 2;
+    if (hits > start) {
+      return 0;
+    }
+    return start - hits;
+  }
+
+  uint8_t getIncrement(uint8_t highestRrip) const {
+    // (1 << 3) - 1 == 7 is the 3-bit RRIP maximum; without the parentheses,
+    // "1 << 3 - 1" parses as 1 << 2 == 4 because - binds tighter than <<.
+    uint8_t highestValue = (1 << 3) - 1;
+    if (highestRrip > highestValue) {
+      return 0;
+    }
+    return highestValue - highestRrip;
+  }
+
+  uint32_t checksum_{};
+  uint64_t generationTime_{};
+  RripBucketStorage storage_;
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
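
A worked example of the two helpers above, restated standalone so the mapping is easy to check (editor's illustration only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    uint8_t rripOnInsert(uint8_t hits) {          // mirrors getRripValue()
      const uint8_t start = (1 << 3) - 2;         // 6
      return hits > start ? 0 : start - hits;     // more hits => smaller rrip
    }
    uint8_t agingIncrement(uint8_t highestRrip) { // mirrors getIncrement()
      const uint8_t max = (1 << 3) - 1;           // 7
      return highestRrip > max ? 0 : max - highestRrip;
    }
    int main() {
      assert(rripOnInsert(0) == 6);   // cold insert lands near eviction
      assert(rripOnInsert(6) == 0);   // well-hit insert is fully protected
      assert(agingIncrement(4) == 3); // aging pushes the oldest entries to 7
    }
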
diff --git a/cachelib/navy/kangaroo/RripBucketStorage.cpp b/cachelib/navy/kangaroo/RripBucketStorage.cpp
new file mode 100644
index 0000000000..111d22a07c
--- /dev/null
+++ b/cachelib/navy/kangaroo/RripBucketStorage.cpp
@@ -0,0 +1,138 @@
+#include <cstring>
+#include <new>
+
+#include <folly/logging/xlog.h>
+
+#include "cachelib/navy/kangaroo/RripBucketStorage.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+static_assert(sizeof(RripBucketStorage) == 12,
+              "RripBucketStorage overhead. Changing this may require changing "
+              "the sizes used in unit tests as well");
+
+const uint32_t RripBucketStorage::kAllocationOverhead = sizeof(RripBucketStorage::Slot);
+
+// Allocates in rrip order rather than simply at the tail: slots are kept
+// sorted from highest to lowest rrip value and the new slot is inserted at
+// the beginning of its rrip class. Returns a null view() if we don't have
+// any more space.
+RripBucketStorage::Allocation RripBucketStorage::allocate(uint32_t size, uint8_t rrip) {
+  //                          tail
+  //  |-6--|3|--0--|~~~~~~~~~~~~~|
+  //
+  //  after allocating an object with rrip value 3:
+  //                              tail
+  //  |-6--|3|NEW|--0--|~~~~~~~~~|
+  if (!canAllocate(size)) {
+    return {};
+  }
+
+  auto itr = getFirst();
+  uint32_t position = 0;
+  while (!itr.done() && itr.rrip() >= rrip) {
+    itr = getNext(itr);
+    position++;
+  }
+
+  uint32_t totalNewSize = slotSize(size);
+  uint8_t* start = data_ + endOffset_;
+  if (!itr.done()) {
+    start = itr.view().data() - kAllocationOverhead;
+  }
+  std::memmove(start + totalNewSize,
+               start,
+               (data_ + endOffset_) - start);
+
+  auto* slot = new (start) Slot(size, rrip);
+  endOffset_ += totalNewSize;
+  numAllocations_++;
+  return {MutableBufferView{slot->size, slot->data},
+          position, (uint8_t) slot->rrip};
+}
+
+RripBucketStorage::Allocation RripBucketStorage::remove(Allocation alloc) {
+  // Remove triggers a compaction.
+  //
+  //                         tail
+  //  |--------|REMOVED|-----|~~~~|
+  //
+  // after compaction
+  //                 tail
+  //  |---------------|~~~~~~~~~~~|
+  if (alloc.done()) {
+    return alloc;
+  }
+
+  const uint32_t removedSize = slotSize(alloc.view().size());
+  uint8_t* removed = alloc.view().data() - kAllocationOverhead;
+  uint32_t position = alloc.position();
+  std::memmove(removed,
+               removed + removedSize,
+               (data_ + endOffset_) - removed - removedSize);
+  endOffset_ -= removedSize;
+  numAllocations_--;
+
+  // Return the allocation that now occupies the removed slot's position,
+  // or a null allocation if the removed slot was the last one.
+  auto* current = reinterpret_cast<Slot*>(removed);
+  if (reinterpret_cast<uint8_t*>(current) - data_ >= endOffset_) {
+    return {};
+  }
+  return {MutableBufferView{current->size, current->data}, position,
+          (uint8_t) current->rrip};
+}
+
+void RripBucketStorage::removeUntil(Allocation alloc) {
+  // Remove everything until (and including) "alloc"
+  //
+  //                         tail
+  //  |----------------|-----|~~~~|
+  //  ^                ^
+  //  begin            offset
+  //  remove this whole range
+  //
+  //                tail
+  //  |-----|~~~~~~~~~~~~~~~~~~~~~|
+  if (alloc.done()) {
+    return;
+  }
+
+  uint32_t offset = alloc.view().data() + alloc.view().size() - data_;
+  if (offset > endOffset_) {
+    return;
+  }
+
+  std::memmove(data_, data_ + offset, endOffset_ - offset);
+  endOffset_ -= offset;
+  numAllocations_ -= alloc.position() + 1;
+}
+
+RripBucketStorage::Allocation RripBucketStorage::getFirst() const {
+  if (endOffset_ == 0) {
+    return {};
+  }
+  auto* slot = reinterpret_cast<Slot*>(data_);
+  return {MutableBufferView{slot->size, slot->data}, 0, (uint8_t) slot->rrip};
+}
+
+RripBucketStorage::Allocation RripBucketStorage::getNext(
+    RripBucketStorage::Allocation alloc) const {
+  if (alloc.done()) {
+    return {};
+  }
+
+  auto* next =
+      reinterpret_cast<Slot*>(alloc.view().data() + alloc.view().size());
+  if (reinterpret_cast<uint8_t*>(next) - data_ >= endOffset_) {
+    return {};
+  }
+  return {MutableBufferView{next->size, next->data}, alloc.position() + 1,
+          (uint8_t) next->rrip};
+}
+
+void RripBucketStorage::incrementRrip(Allocation alloc, int8_t increment) {
+  uint8_t* current_slot = alloc.view().data() - kAllocationOverhead;
+  auto* slot = reinterpret_cast<Slot*>(current_slot);
+  XDCHECK(increment + slot->rrip <= 7);
+  slot->rrip += increment;
+}
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
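
allocate() keeps slots sorted by descending rrip, shifting the tail with memmove to open a gap. A short trace over an empty storage (editor's sketch; placement new is required because data_ is a flexible array member, so the struct cannot be stack-constructed on its own):

    Buffer buf{sizeof(RripBucketStorage) + 64};
    auto* s = new (buf.mutableView().data()) RripBucketStorage(64);
    auto a = s->allocate(10, 6);  // layout: |6|
    auto b = s->allocate(10, 0);  // layout: |6|0|   (0 < 6, appended at the tail)
    auto c = s->allocate(10, 3);  // layout: |6|3|0| (inserted in front of the 0)
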
diff --git a/cachelib/navy/kangaroo/RripBucketStorage.h b/cachelib/navy/kangaroo/RripBucketStorage.h
new file mode 100644
index 0000000000..e020058ba8
--- /dev/null
+++ b/cachelib/navy/kangaroo/RripBucketStorage.h
@@ -0,0 +1,97 @@
+#pragma once
+
+#include "cachelib/navy/common/Buffer.h"
+#include "cachelib/navy/common/CompilerUtils.h"
+#include "cachelib/navy/common/Types.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+// This is a simple allocator that keeps slots sorted by rrip value; once
+// full, the only way to free up more space is by removing entries (typically
+// from the front, where the coldest entries live). It is used for managing
+// allocations inside a bucket.
+class FOLLY_PACK_ATTR RripBucketStorage {
+ public:
+  // This is an allocation that is returned to the user when they
+  // allocate from the RripBucketStorage. "view" is for reading
+  // and modifying data allocated from the storage. "position"
+  // indicates where it is in the storage and is used internally to
+  // iterate to the next allocation.
+  //
+  // User should only have a reference to one "allocation" at a time.
+  // Calling the remove, allocate, or removeUntil APIs will invalidate
+  // all other outstanding Allocation references.
+  class Allocation {
+   public:
+    Allocation() = default;
+
+    bool done() const { return view_.isNull(); }
+
+    MutableBufferView view() const { return view_; }
+
+    uint32_t position() const { return position_; }
+
+    uint8_t rrip() const { return rrip_; }
+
+   private:
+    friend RripBucketStorage;
+
+    Allocation(MutableBufferView v, uint32_t p, uint8_t rrip)
+        : view_{v}, position_{p}, rrip_{rrip} {}
+
+    MutableBufferView view_{};
+    uint32_t position_{};
+    uint8_t rrip_{};
+  };
+
+  static uint32_t slotSize(uint32_t size) { return kAllocationOverhead + size; }
+
+  explicit RripBucketStorage(uint32_t capacity) : capacity_{capacity} {}
+
+  // Allocate a slot with the given rrip value; slots are kept ordered from
+  // highest to lowest rrip (see RripBucketStorage.cpp).
+  Allocation allocate(uint32_t size, uint8_t rrip);
+
+  uint32_t capacity() const { return capacity_; }
+
+  uint32_t remainingCapacity() const { return capacity_ - endOffset_; }
+
+  uint32_t numAllocations() const { return numAllocations_; }
+
+  void incrementRrip(Allocation alloc, int8_t increment);
+
+  void clear() {
+    endOffset_ = 0;
+    numAllocations_ = 0;
+  }
+
+  Allocation remove(Allocation alloc);
+
+  // Removes every single allocation from the beginning, including this one.
+  void removeUntil(Allocation alloc);
+
+  Allocation getFirst() const;
+  Allocation getNext(Allocation alloc) const;
+
+ private:
+  // Slot represents a physical slot in the storage. User does not use
+  // this directly but instead uses Allocation.
+  struct FOLLY_PACK_ATTR Slot {
+    uint16_t rrip : 3;
+    uint16_t size : 13;
+    uint8_t data[];
+    explicit Slot(uint16_t s, uint8_t rrip) : rrip{rrip}, size{s} {}
+  };
+
+  bool canAllocate(uint32_t size) const {
+    return static_cast<uint64_t>(endOffset_) + slotSize(size) <= capacity_;
+  }
+
+  static const uint32_t kAllocationOverhead;
+
+  const uint32_t capacity_{};
+  uint32_t numAllocations_{};
+  uint32_t endOffset_{};
+  mutable uint8_t data_[];
+};
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
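
Overhead arithmetic: kAllocationOverhead is sizeof(Slot), and since the 3-bit rrip and 13-bit size fields share one uint16_t while the flexible array member data[] adds nothing, every allocation costs its payload plus two bytes, and the 13-bit size field caps a single entry at 8191 bytes:

    // slotSize(100) == 102: 100 payload bytes + 2 bytes of Slot header
    static_assert((1u << 13) - 1 == 8191, "13-bit size field caps one slot's payload");
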
diff --git a/cachelib/navy/kangaroo/Types.h b/cachelib/navy/kangaroo/Types.h
new file mode 100644
index 0000000000..092e30d9ff
--- /dev/null
+++ b/cachelib/navy/kangaroo/Types.h
@@ -0,0 +1,125 @@
+#pragma once
+
+#include <cstdint>
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "cachelib/navy/common/Buffer.h"
+#include "cachelib/navy/common/Hash.h"
+
+namespace facebook {
+namespace cachelib {
+namespace navy {
+
+using BitVectorUpdateVisitor = std::function<void(uint32_t keyIdx)>;
+using BitVectorReadVisitor = std::function<bool(uint32_t keyIdx)>;
+
+// Kangaroo Log Structures
+class LogSegmentId {
+ public:
+  LogSegmentId(uint32_t idx, uint32_t partition) : idx_{idx}, partition_{partition} {}
+  LogSegmentId() {}
+  LogSegmentId(const LogSegmentId& rhs) : idx_{rhs.idx_}, partition_{rhs.partition_} {}
+
+  bool operator==(const LogSegmentId& rhs) const noexcept {
+    return idx_ == rhs.idx_ && partition_ == rhs.partition_;
+  }
+  bool operator!=(const LogSegmentId& rhs) const noexcept {
+    return !(*this == rhs);
+  }
+
+  uint32_t index() const noexcept { return idx_; }
+  uint32_t partition() const noexcept { return partition_; }
+
+ private:
+  uint32_t idx_;
+  uint32_t partition_;
+  uint32_t physical_segment_;
+};
+
+class LogPageId {
+ public:
+  explicit LogPageId(uint32_t idx, bool valid) : idx_{idx}, valid_{valid} {}
+  LogPageId() : idx_{0}, valid_{false} {}
+
+  bool operator==(const LogPageId& rhs) const noexcept {
+    if (!valid_ && !rhs.valid_) {
+      return true;
+    }
+    return valid_ == rhs.valid_ && idx_ == rhs.idx_;
+  }
+  bool operator!=(const LogPageId& rhs) const noexcept {
+    return !(*this == rhs);
+  }
+
+  uint32_t index() const noexcept { return idx_; }
+  bool isValid() const noexcept { return valid_; }
+
+ private:
+  uint32_t idx_;
+  bool valid_;
+};
+
+class PartitionOffset {
+ public:
+  explicit PartitionOffset(uint32_t idx, bool valid) : idx_{idx}, valid_{valid} {}
+  PartitionOffset() : idx_{0}, valid_{false} {}
+
+  bool operator==(const PartitionOffset& rhs) const noexcept {
+    if (!valid_ && !rhs.valid_) {
+      return true;
+    }
+    return valid_ == rhs.valid_ && idx_ == rhs.idx_;
+  }
+  bool operator!=(const PartitionOffset& rhs) const noexcept {
+    return !(*this == rhs);
+  }
+
+  uint32_t index() const noexcept { return idx_; }
+  bool isValid() const noexcept { return valid_; }
+
+ private:
+  uint32_t idx_;
+  bool valid_;
+};
+
+class KangarooBucketId {
+ public:
+  explicit KangarooBucketId(uint32_t idx) : idx_{idx} {}
+
+  bool operator==(const KangarooBucketId& rhs) const noexcept {
+    return idx_ == rhs.idx_;
+  }
+  bool operator!=(const KangarooBucketId& rhs) const noexcept {
+    return !(*this == rhs);
+  }
+
+  uint32_t index() const noexcept { return idx_; }
+
+ private:
+  uint32_t idx_;
+};
+
+using SetNumberCallback = std::function<KangarooBucketId(uint64_t)>;
+
+struct ObjectInfo {
+  Buffer keyValue;
+  HashedKey key;
+  Buffer value;
+  uint8_t hits;
+  LogPageId lpid;
+  uint32_t tag;
+
+  ObjectInfo(HashedKey k, BufferView v, uint8_t h, LogPageId l, uint32_t t)
+      : keyValue{k.key()},
+        key{HashedKey::precomputed(keyValue.view(), k.keyHash())},
+        value{Buffer(v)}, hits{h}, lpid{l}, tag{t} {}
+};
+using ReadmitCallback = std::function<void(std::unique_ptr<ObjectInfo>&)>;
+using SetMultiInsertCallback =
+    std::function<void(std::vector<std::unique_ptr<ObjectInfo>>&, ReadmitCallback)>;
+
+static const uint32_t maxTagValue = 1 << 9;
+static const int tagSeed = 23;
+static uint32_t createTag(HashedKey hk) {
+  return hashBuffer(hk.key(), tagSeed) % maxTagValue;
+}
+
+} // namespace navy
+} // namespace cachelib
+} // namespace facebook
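
Tags are 9-bit hashes of the key, so distinct keys can collide on a tag and the caller must still verify the full key after a tag match (which is why LogBucket::findTag hands back the HashedKey). A sketch of building the set-number callback these types expect (the modulo mapping is an editor's assumption, not necessarily Kangaroo's actual scheme):

    const uint64_t kNumSetBuckets = 1 << 20;  // hypothetical bucket count
    SetNumberCallback setNumberCb = [](uint64_t keyHash) {
      return KangarooBucketId(static_cast<uint32_t>(keyHash % kNumSetBuckets));
    };
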