From 4c9ffc80ea9a947250e3496d4feed0a7a758843a Mon Sep 17 00:00:00 2001 From: Joseph Birkner Date: Thu, 24 Oct 2024 18:43:21 +0200 Subject: [PATCH] Fix bottleneck in string pool lookup performance by getting rid of lower-case copies. --- include/simfil/model/bitsery-traits.h | 1 - include/simfil/model/string-pool.h | 30 ++++- src/model/string-pool.cpp | 175 ++++++++++++++++---------- 3 files changed, 132 insertions(+), 74 deletions(-) diff --git a/include/simfil/model/bitsery-traits.h b/include/simfil/model/bitsery-traits.h index e5e8752..a2318c4 100644 --- a/include/simfil/model/bitsery-traits.h +++ b/include/simfil/model/bitsery-traits.h @@ -20,7 +20,6 @@ struct ContainerTraits> : public StdContainer, true, true> { }; - } template diff --git a/include/simfil/model/string-pool.h b/include/simfil/model/string-pool.h index d650d3d..c8f42a8 100644 --- a/include/simfil/model/string-pool.h +++ b/include/simfil/model/string-pool.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace simfil { @@ -17,6 +18,21 @@ namespace simfil using StringId = uint16_t; static_assert(std::is_unsigned_v, "StringId must be unsigned!"); +namespace detail +{ +// Custom hash function for case-insensitive string hashing. +struct CaseInsensitiveHash +{ + size_t operator()(const std::string_view& str) const; +}; + +// Custom equality comparison for case-insensitive string comparison. +struct CaseInsensitiveEqual +{ + bool operator()(const std::string_view& lhs, const std::string_view& rhs) const; +}; +} // namespace detail + /** * Fast and efficient case-insensitive string interner, * used to store object keys. @@ -70,17 +86,23 @@ struct StringPool /// Serialization - write to stream, starting from a specific /// id offset if necessary (for partial serialisation). - virtual void write(std::ostream& outputStream, StringId offset = {}) const; // NOLINT + virtual void write(std::ostream& outputStream, StringId offset = {}) const; // NOLINT virtual void read(std::istream& inputStream); private: mutable std::shared_mutex stringStoreMutex_; - std::unordered_map idForString_; - std::unordered_map stringForId_; + std::unordered_map< + std::string_view, + StringId, + detail::CaseInsensitiveHash, + detail::CaseInsensitiveEqual> + idForString_; + std::unordered_map stringForId_; + std::deque storedStrings_; StringId nextId_ = FirstDynamicId; std::atomic_int64_t byteSize_{0}; std::atomic_int64_t cacheHits_{0}; std::atomic_int64_t cacheMisses_{0}; }; -} +} // namespace simfil diff --git a/src/model/string-pool.cpp b/src/model/string-pool.cpp index 8ec1932..0bca63d 100644 --- a/src/model/string-pool.cpp +++ b/src/model/string-pool.cpp @@ -1,16 +1,45 @@ #include "simfil/model/string-pool.h" #include "simfil/exception-handler.h" -#include -#include #include +#include #include #include +#include #include -#include #include +#include #include +namespace bitsery +{ +namespace traits +{ + +template +struct ContainerTraits> + : public StdContainer, true, true> +{ +}; + +template +struct TextTraits> +{ + using TValue = typename ContainerTraits< + std::basic_string_view>::TValue; + // string is automatically null-terminated + static constexpr bool addNUL = false; + + // is is not 100% accurate, but for performance reasons assume that string + // stores text, not binary data + static size_t length(const std::basic_string_view& str) + { + return str.size(); + } +}; +} +} + namespace simfil { @@ -22,81 +51,72 @@ StringPool::StringPool() addStaticKey(OverlayIndex, "$idx"); } -StringPool::StringPool(const StringPool& other) : - idForString_(other.idForString_), - stringForId_(other.stringForId_), - nextId_(other.nextId_), - byteSize_(other.byteSize_.load()), - cacheHits_(other.cacheHits_.load()), - cacheMisses_(other.cacheMisses_.load()) +StringPool::StringPool(const StringPool& other) + : idForString_(other.idForString_), + stringForId_(other.stringForId_), + nextId_(other.nextId_), + byteSize_(other.byteSize_.load()), + cacheHits_(other.cacheHits_.load()), + cacheMisses_(other.cacheMisses_.load()) { } StringId StringPool::emplace(std::string_view const& str) { - /// Unfortunately, we have to create a copy of the string here - /// on the heap for lower-casing. - /// Also we must use std::string as lookup type until C++ 20 is used: - /// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0919r2.html - auto lowerCaseStr = std::string(str); - std::transform( - lowerCaseStr.begin(), - lowerCaseStr.end(), - lowerCaseStr.begin(), - [](auto ch) { return std::tolower(ch, std::locale{}); }); - { - std::shared_lock stringStoreReadAccess_(stringStoreMutex_); - auto it = idForString_.find(lowerCaseStr); + std::shared_lock lock(stringStoreMutex_); + auto it = idForString_.find(str); if (it != idForString_.end()) { ++cacheHits_; return it->second; } } { - std::unique_lock stringStoreWriteAccess_(stringStoreMutex_); - auto [it, insertionTookPlace] = idForString_.try_emplace(lowerCaseStr, nextId_); - if (insertionTookPlace) { - (void)stringForId_.try_emplace(nextId_, str); - byteSize_ += static_cast(str.size()); - ++cacheMisses_; - ++nextId_; - if (nextId_ < it->second) { - raise("StringPool id overflow!"); - } + std::unique_lock lock(stringStoreMutex_); + // Double-check in case another thread added the string. + auto it = idForString_.find(str); + if (it != idForString_.end()) { + ++cacheHits_; + return it->second; } - return it->second; + + // Store the string to maintain ownership. + auto& storedString = storedStrings_.emplace_back(str); + StringId id = nextId_++; + if (nextId_ < id) { + raise("StringPool id overflow!"); + } + idForString_.emplace(storedString, id); + stringForId_.emplace(id, storedString); + byteSize_ += static_cast(storedString.size()); + ++cacheMisses_; + + return id; } } StringId StringPool::get(std::string_view const& str) { - auto lowerCaseStr = std::string(str); - std::transform( - lowerCaseStr.begin(), - lowerCaseStr.end(), - lowerCaseStr.begin(), - [](auto ch) { return std::tolower(ch, std::locale{}); }); - std::shared_lock stringStoreReadAccess_(stringStoreMutex_); - auto it = idForString_.find(lowerCaseStr); - if (it != idForString_.end()) + auto it = idForString_.find(str); + if (it != idForString_.end()) { + ++cacheHits_; return it->second; - + } return StringPool::Empty; } std::optional StringPool::resolve(const StringId& id) const { std::shared_lock stringStoreReadAccess_(stringStoreMutex_); - const auto it = stringForId_.find(id); + auto it = stringForId_.find(id); if (it != stringForId_.end()) return it->second; - return std::nullopt; } -StringId StringPool::highest() const { +StringId StringPool::highest() const +{ return nextId_ - 1; } @@ -121,21 +141,17 @@ size_t StringPool::misses() const return cacheMisses_; } -void StringPool::addStaticKey(StringId k, std::string const& v) { - auto lowerCaseStr = v; - std::transform( - lowerCaseStr.begin(), - lowerCaseStr.end(), - lowerCaseStr.begin(), - [](auto ch) { return std::tolower(ch, std::locale{}); }); - - idForString_[lowerCaseStr] = k; - stringForId_[k] = v; +void StringPool::addStaticKey(StringId id, const std::string& value) +{ + std::unique_lock lock(stringStoreMutex_); + auto& storedString = storedStrings_.emplace_back(value); + idForString_.emplace(storedString, id); + stringForId_.emplace(id, storedString); } -void StringPool::write(std::ostream& outputStream, const StringId offset) const // NOLINT +void StringPool::write(std::ostream& outputStream, const StringId offset) const // NOLINT { - std::shared_lock stringStoreReadAccess_(stringStoreMutex_); + std::shared_lock stringStoreReadAccess(stringStoreMutex_); bitsery::Serializer s(outputStream); // Calculate how many strings will be sent @@ -169,24 +185,17 @@ void StringPool::read(std::istream& inputStream) s.value2b(rcvStringCount); // Read strings - for (auto i = 0; i < rcvStringCount; ++i) - { + for (auto i = 0; i < rcvStringCount; ++i) { // Read string key StringId stringId{}; s.value2b(stringId); // Don't support strings longer than 64kB. - std::string stringValue; + auto& stringValue = storedStrings_.emplace_back(); s.text1b(stringValue, std::numeric_limits::max()); - auto lowerCaseStringValue = std::string(stringValue); // Insert string into the pool - std::transform( - lowerCaseStringValue.begin(), - lowerCaseStringValue.end(), - lowerCaseStringValue.begin(), - [](auto ch) { return std::tolower(ch, std::locale{}); }); - auto [it, insertionTookPlace] = idForString_.try_emplace(lowerCaseStringValue, stringId); + auto [it, insertionTookPlace] = idForString_.try_emplace(stringValue, stringId); if (insertionTookPlace) { stringForId_.try_emplace(stringId, stringValue); byteSize_ += static_cast(stringValue.size()); @@ -201,4 +210,32 @@ void StringPool::read(std::istream& inputStream) } } +size_t detail::CaseInsensitiveHash::operator()(const std::string_view& str) const +{ + size_t hash = 14695981039346656037ull; // FNV offset basis for 64-bit size_t. + for (unsigned char c : str) { + // ASCII lowercase conversion. + if (c >= 'A' && c <= 'Z') { + constexpr auto lowercaseOffset = 'a' - 'A'; + c += lowercaseOffset; + } + hash ^= c; + hash *= 1099511628211ull; // FNV prime for 64-bit size_t. + } + return hash; +} + +bool detail::CaseInsensitiveEqual::operator()( + const std::string_view& lhs, + const std::string_view& rhs) const +{ + std::locale locale; + return std::equal( + lhs.begin(), + lhs.end(), + rhs.begin(), + rhs.end(), + [&locale](unsigned char l, unsigned char r) + { return std::tolower(l, locale) == std::tolower(r, locale); }); } +} // namespace simfil