From a5cb7b6d2de30c20730f0d2f1e605a65d5841e53 Mon Sep 17 00:00:00 2001 From: Joseph Birkner Date: Thu, 24 Oct 2024 18:43:21 +0200 Subject: [PATCH 1/5] Fix bottleneck in string pool lookup performance by getting rid of lower-case copies. --- include/simfil/model/bitsery-traits.h | 1 - include/simfil/model/string-pool.h | 30 ++++- src/model/string-pool.cpp | 173 ++++++++++++++++---------- 3 files changed, 130 insertions(+), 74 deletions(-) diff --git a/include/simfil/model/bitsery-traits.h b/include/simfil/model/bitsery-traits.h index e5e8752..a2318c4 100644 --- a/include/simfil/model/bitsery-traits.h +++ b/include/simfil/model/bitsery-traits.h @@ -20,7 +20,6 @@ struct ContainerTraits> : public StdContainer, true, true> { }; - } template diff --git a/include/simfil/model/string-pool.h b/include/simfil/model/string-pool.h index d650d3d..c8f42a8 100644 --- a/include/simfil/model/string-pool.h +++ b/include/simfil/model/string-pool.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace simfil { @@ -17,6 +18,21 @@ namespace simfil using StringId = uint16_t; static_assert(std::is_unsigned_v, "StringId must be unsigned!"); +namespace detail +{ +// Custom hash function for case-insensitive string hashing. +struct CaseInsensitiveHash +{ + size_t operator()(const std::string_view& str) const; +}; + +// Custom equality comparison for case-insensitive string comparison. +struct CaseInsensitiveEqual +{ + bool operator()(const std::string_view& lhs, const std::string_view& rhs) const; +}; +} // namespace detail + /** * Fast and efficient case-insensitive string interner, * used to store object keys. @@ -70,17 +86,23 @@ struct StringPool /// Serialization - write to stream, starting from a specific /// id offset if necessary (for partial serialisation). - virtual void write(std::ostream& outputStream, StringId offset = {}) const; // NOLINT + virtual void write(std::ostream& outputStream, StringId offset = {}) const; // NOLINT virtual void read(std::istream& inputStream); private: mutable std::shared_mutex stringStoreMutex_; - std::unordered_map idForString_; - std::unordered_map stringForId_; + std::unordered_map< + std::string_view, + StringId, + detail::CaseInsensitiveHash, + detail::CaseInsensitiveEqual> + idForString_; + std::unordered_map stringForId_; + std::deque storedStrings_; StringId nextId_ = FirstDynamicId; std::atomic_int64_t byteSize_{0}; std::atomic_int64_t cacheHits_{0}; std::atomic_int64_t cacheMisses_{0}; }; -} +} // namespace simfil diff --git a/src/model/string-pool.cpp b/src/model/string-pool.cpp index 8ec1932..7d341f1 100644 --- a/src/model/string-pool.cpp +++ b/src/model/string-pool.cpp @@ -1,16 +1,44 @@ #include "simfil/model/string-pool.h" #include "simfil/exception-handler.h" -#include -#include #include +#include #include #include +#include #include #include -#include #include +namespace bitsery +{ +namespace traits +{ + +template +struct ContainerTraits> + : public StdContainer, true, true> +{ +}; + +template +struct TextTraits> +{ + using TValue = typename ContainerTraits< + std::basic_string_view>::TValue; + // string is automatically null-terminated + static constexpr bool addNUL = false; + + // is is not 100% accurate, but for performance reasons assume that string + // stores text, not binary data + static size_t length(const std::basic_string_view& str) + { + return str.size(); + } +}; +} +} + namespace simfil { @@ -22,81 +50,72 @@ StringPool::StringPool() addStaticKey(OverlayIndex, "$idx"); } -StringPool::StringPool(const StringPool& other) : - idForString_(other.idForString_), - stringForId_(other.stringForId_), - nextId_(other.nextId_), - byteSize_(other.byteSize_.load()), - cacheHits_(other.cacheHits_.load()), - cacheMisses_(other.cacheMisses_.load()) +StringPool::StringPool(const StringPool& other) + : idForString_(other.idForString_), + stringForId_(other.stringForId_), + nextId_(other.nextId_), + byteSize_(other.byteSize_.load()), + cacheHits_(other.cacheHits_.load()), + cacheMisses_(other.cacheMisses_.load()) { } StringId StringPool::emplace(std::string_view const& str) { - /// Unfortunately, we have to create a copy of the string here - /// on the heap for lower-casing. - /// Also we must use std::string as lookup type until C++ 20 is used: - /// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0919r2.html - auto lowerCaseStr = std::string(str); - std::transform( - lowerCaseStr.begin(), - lowerCaseStr.end(), - lowerCaseStr.begin(), - [](auto ch) { return std::tolower(ch, std::locale{}); }); - { - std::shared_lock stringStoreReadAccess_(stringStoreMutex_); - auto it = idForString_.find(lowerCaseStr); + std::shared_lock lock(stringStoreMutex_); + auto it = idForString_.find(str); if (it != idForString_.end()) { ++cacheHits_; return it->second; } } { - std::unique_lock stringStoreWriteAccess_(stringStoreMutex_); - auto [it, insertionTookPlace] = idForString_.try_emplace(lowerCaseStr, nextId_); - if (insertionTookPlace) { - (void)stringForId_.try_emplace(nextId_, str); - byteSize_ += static_cast(str.size()); - ++cacheMisses_; - ++nextId_; - if (nextId_ < it->second) { - raise("StringPool id overflow!"); - } + std::unique_lock lock(stringStoreMutex_); + // Double-check in case another thread added the string. + auto it = idForString_.find(str); + if (it != idForString_.end()) { + ++cacheHits_; + return it->second; } - return it->second; + + // Store the string to maintain ownership. + auto& storedString = storedStrings_.emplace_back(str); + StringId id = nextId_++; + if (nextId_ < id) { + raise("StringPool id overflow!"); + } + idForString_.emplace(storedString, id); + stringForId_.emplace(id, storedString); + byteSize_ += static_cast(storedString.size()); + ++cacheMisses_; + + return id; } } StringId StringPool::get(std::string_view const& str) { - auto lowerCaseStr = std::string(str); - std::transform( - lowerCaseStr.begin(), - lowerCaseStr.end(), - lowerCaseStr.begin(), - [](auto ch) { return std::tolower(ch, std::locale{}); }); - std::shared_lock stringStoreReadAccess_(stringStoreMutex_); - auto it = idForString_.find(lowerCaseStr); - if (it != idForString_.end()) + auto it = idForString_.find(str); + if (it != idForString_.end()) { + ++cacheHits_; return it->second; - + } return StringPool::Empty; } std::optional StringPool::resolve(const StringId& id) const { std::shared_lock stringStoreReadAccess_(stringStoreMutex_); - const auto it = stringForId_.find(id); + auto it = stringForId_.find(id); if (it != stringForId_.end()) return it->second; - return std::nullopt; } -StringId StringPool::highest() const { +StringId StringPool::highest() const +{ return nextId_ - 1; } @@ -121,21 +140,17 @@ size_t StringPool::misses() const return cacheMisses_; } -void StringPool::addStaticKey(StringId k, std::string const& v) { - auto lowerCaseStr = v; - std::transform( - lowerCaseStr.begin(), - lowerCaseStr.end(), - lowerCaseStr.begin(), - [](auto ch) { return std::tolower(ch, std::locale{}); }); - - idForString_[lowerCaseStr] = k; - stringForId_[k] = v; +void StringPool::addStaticKey(StringId id, const std::string& value) +{ + std::unique_lock lock(stringStoreMutex_); + auto& storedString = storedStrings_.emplace_back(value); + idForString_.emplace(storedString, id); + stringForId_.emplace(id, storedString); } -void StringPool::write(std::ostream& outputStream, const StringId offset) const // NOLINT +void StringPool::write(std::ostream& outputStream, const StringId offset) const // NOLINT { - std::shared_lock stringStoreReadAccess_(stringStoreMutex_); + std::shared_lock stringStoreReadAccess(stringStoreMutex_); bitsery::Serializer s(outputStream); // Calculate how many strings will be sent @@ -169,24 +184,17 @@ void StringPool::read(std::istream& inputStream) s.value2b(rcvStringCount); // Read strings - for (auto i = 0; i < rcvStringCount; ++i) - { + for (auto i = 0; i < rcvStringCount; ++i) { // Read string key StringId stringId{}; s.value2b(stringId); // Don't support strings longer than 64kB. - std::string stringValue; + auto& stringValue = storedStrings_.emplace_back(); s.text1b(stringValue, std::numeric_limits::max()); - auto lowerCaseStringValue = std::string(stringValue); // Insert string into the pool - std::transform( - lowerCaseStringValue.begin(), - lowerCaseStringValue.end(), - lowerCaseStringValue.begin(), - [](auto ch) { return std::tolower(ch, std::locale{}); }); - auto [it, insertionTookPlace] = idForString_.try_emplace(lowerCaseStringValue, stringId); + auto [it, insertionTookPlace] = idForString_.try_emplace(stringValue, stringId); if (insertionTookPlace) { stringForId_.try_emplace(stringId, stringValue); byteSize_ += static_cast(stringValue.size()); @@ -201,4 +209,31 @@ void StringPool::read(std::istream& inputStream) } } +size_t detail::CaseInsensitiveHash::operator()(const std::string_view& str) const +{ + size_t hash = 14695981039346656037ull; // FNV offset basis for 64-bit size_t. + for (unsigned char c : str) { + // ASCII lowercase conversion. + if (c >= 'A' && c <= 'Z') { + constexpr auto lowercaseOffset = 'a' - 'A'; + c += lowercaseOffset; + } + hash ^= c; + hash *= 1099511628211ull; // FNV prime for 64-bit size_t. + } + return hash; +} + +bool detail::CaseInsensitiveEqual::operator()( + const std::string_view& lhs, + const std::string_view& rhs) const +{ + return std::equal( + lhs.begin(), + lhs.end(), + rhs.begin(), + rhs.end(), + [](unsigned char l, unsigned char r) + { return std::tolower(l) == std::tolower(r); }); } +} // namespace simfil From f5a7174928fca6960f9967203645a9f9e35a48e9 Mon Sep 17 00:00:00 2001 From: Joseph Birkner Date: Thu, 24 Oct 2024 20:36:05 +0200 Subject: [PATCH 2/5] Optimize ModelPool::setStrings --- include/simfil/model/string-pool.h | 3 +++ src/model/model.cpp | 11 +++++++++-- src/model/string-pool.cpp | 4 ++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/simfil/model/string-pool.h b/include/simfil/model/string-pool.h index c8f42a8..131c733 100644 --- a/include/simfil/model/string-pool.h +++ b/include/simfil/model/string-pool.h @@ -89,6 +89,9 @@ struct StringPool virtual void write(std::ostream& outputStream, StringId offset = {}) const; // NOLINT virtual void read(std::istream& inputStream); + /// Check if the content of the string pools is logically identical. + bool operator== (StringPool const& other) const; + private: mutable std::shared_mutex stringStoreMutex_; std::unordered_map< diff --git a/src/model/model.cpp b/src/model/model.cpp index 367ab62..4a451fd 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -329,14 +329,21 @@ std::shared_ptr ModelPool::strings() const void ModelPool::setStrings(std::shared_ptr const& strings) { + if (!strings) + raise("Attempt to call ModelPool::setStrings(nullptr)!"); + + auto oldStrings = impl_->strings_; + impl_->strings_ = strings; + if (!oldStrings || *strings == *oldStrings) + return; + // Translate object field IDs to the new dictionary. for (auto memberArray : impl_->columns_.objectMemberArrays_) { for (auto& member : memberArray) { - if (auto resolvedName = impl_->strings_->resolve(member.name_)) + if (auto resolvedName = oldStrings->resolve(member.name_)) member.name_ = strings->emplace(*resolvedName); } } - impl_->strings_ = strings; } std::optional ModelPool::lookupStringId(const StringId id) const diff --git a/src/model/string-pool.cpp b/src/model/string-pool.cpp index 7d341f1..22ec151 100644 --- a/src/model/string-pool.cpp +++ b/src/model/string-pool.cpp @@ -209,6 +209,10 @@ void StringPool::read(std::istream& inputStream) } } +bool StringPool::operator==(const StringPool &other) const { + return idForString_ == other.idForString_; +} + size_t detail::CaseInsensitiveHash::operator()(const std::string_view& str) const { size_t hash = 14695981039346656037ull; // FNV offset basis for 64-bit size_t. From f7a6c690b8f3edc128ad58ca1814012729f4b7b4 Mon Sep 17 00:00:00 2001 From: Joseph Birkner Date: Thu, 24 Oct 2024 21:10:22 +0200 Subject: [PATCH 3/5] Ensure that the StringPool copy constructor does not create dangling string_views into the copied pool. --- src/model/string-pool.cpp | 45 +++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/src/model/string-pool.cpp b/src/model/string-pool.cpp index 22ec151..cacf5e4 100644 --- a/src/model/string-pool.cpp +++ b/src/model/string-pool.cpp @@ -51,13 +51,46 @@ StringPool::StringPool() } StringPool::StringPool(const StringPool& other) - : idForString_(other.idForString_), - stringForId_(other.stringForId_), - nextId_(other.nextId_), - byteSize_(other.byteSize_.load()), - cacheHits_(other.cacheHits_.load()), - cacheMisses_(other.cacheMisses_.load()) { + std::unique_lock lockThis(stringStoreMutex_, std::defer_lock); + std::shared_lock lockOther(other.stringStoreMutex_, std::defer_lock); + std::lock(lockThis, lockOther); + + // Copy storedStrings_. + storedStrings_ = other.storedStrings_; + + // Map from old string data pointer to new string_view. + std::unordered_map strDataToNewStrView; + + // Build the mapping from old string data pointers to new string_views. + for (size_t i = 0; i < other.storedStrings_.size(); ++i) { + const std::string& oldStr = other.storedStrings_[i]; + const std::string& newStr = storedStrings_[i]; + strDataToNewStrView[oldStr.data()] = std::string_view(newStr); + } + + // Rebuild idForString_ with new string_views pointing into this->storedStrings_. + idForString_.clear(); + for (const auto& [oldStrView, id] : other.idForString_) { + // Get the new string_view corresponding to the old string data pointer. + auto it = strDataToNewStrView.find(oldStrView.data()); + if (it != strDataToNewStrView.end()) { + const std::string_view& newStrView = it->second; + idForString_.emplace(newStrView, id); + } else { + // This should not happen if everything is consistent. + raise("Failed to rebuild idForString_ in StringPool copy constructor"); + } + } + + // Copy stringForId_. + stringForId_ = other.stringForId_; + + // Copy other member variables. + nextId_ = other.nextId_; + byteSize_ = other.byteSize_.load(); + cacheHits_ = other.cacheHits_.load(); + cacheMisses_ = other.cacheMisses_.load(); } StringId StringPool::emplace(std::string_view const& str) From 9db2c6e4de66b30a15df4fcd7977d281070636bb Mon Sep 17 00:00:00 2001 From: Joseph Birkner Date: Fri, 25 Oct 2024 11:21:13 +0200 Subject: [PATCH 4/5] Fix sonar complaints. --- src/model/string-pool.cpp | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/model/string-pool.cpp b/src/model/string-pool.cpp index cacf5e4..b64af94 100644 --- a/src/model/string-pool.cpp +++ b/src/model/string-pool.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace bitsery { @@ -64,9 +65,7 @@ StringPool::StringPool(const StringPool& other) // Build the mapping from old string data pointers to new string_views. for (size_t i = 0; i < other.storedStrings_.size(); ++i) { - const std::string& oldStr = other.storedStrings_[i]; - const std::string& newStr = storedStrings_[i]; - strDataToNewStrView[oldStr.data()] = std::string_view(newStr); + strDataToNewStrView[other.storedStrings_[i].data()] = storedStrings_[i]; } // Rebuild idForString_ with new string_views pointing into this->storedStrings_. @@ -75,9 +74,9 @@ StringPool::StringPool(const StringPool& other) // Get the new string_view corresponding to the old string data pointer. auto it = strDataToNewStrView.find(oldStrView.data()); if (it != strDataToNewStrView.end()) { - const std::string_view& newStrView = it->second; - idForString_.emplace(newStrView, id); - } else { + idForString_.emplace(it->second, id); + } + else { // This should not happen if everything is consistent. raise("Failed to rebuild idForString_ in StringPool copy constructor"); } @@ -248,15 +247,12 @@ bool StringPool::operator==(const StringPool &other) const { size_t detail::CaseInsensitiveHash::operator()(const std::string_view& str) const { - size_t hash = 14695981039346656037ull; // FNV offset basis for 64-bit size_t. - for (unsigned char c : str) { - // ASCII lowercase conversion. - if (c >= 'A' && c <= 'Z') { - constexpr auto lowercaseOffset = 'a' - 'A'; - c += lowercaseOffset; - } + std::locale locale{}; + auto hash = static_cast(14695981039346656037ULL); // FNV offset basis for 64-bit size_t. + for (auto c : str) { + c = std::tolower(c, locale); hash ^= c; - hash *= 1099511628211ull; // FNV prime for 64-bit size_t. + hash *= static_cast(1099511628211ULL); // FNV prime for 64-bit size_t. } return hash; } @@ -265,12 +261,13 @@ bool detail::CaseInsensitiveEqual::operator()( const std::string_view& lhs, const std::string_view& rhs) const { + std::locale locale{}; return std::equal( lhs.begin(), lhs.end(), rhs.begin(), rhs.end(), - [](unsigned char l, unsigned char r) - { return std::tolower(l) == std::tolower(r); }); + [&locale](auto l, auto r) + { return std::tolower(l, locale) == std::tolower(r, locale); }); } } // namespace simfil From 9363b221bd40f3120a04429de8bf46018f0f5bce Mon Sep 17 00:00:00 2001 From: Joseph Birkner Date: Tue, 29 Oct 2024 19:24:57 +0100 Subject: [PATCH 5/5] Cleanup and FNV hash parameter selection based on sizeof(size_t) --- src/model/string-pool.cpp | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/model/string-pool.cpp b/src/model/string-pool.cpp index b64af94..dcc7bfc 100644 --- a/src/model/string-pool.cpp +++ b/src/model/string-pool.cpp @@ -11,11 +11,14 @@ #include #include +/** + * Note: This code is taken from bitsery traits/string.h and adopted + * to handle (de-)serialization of a string view. + */ namespace bitsery { namespace traits { - template struct ContainerTraits> : public StdContainer, true, true> @@ -27,11 +30,7 @@ struct TextTraits> { using TValue = typename ContainerTraits< std::basic_string_view>::TValue; - // string is automatically null-terminated static constexpr bool addNUL = false; - - // is is not 100% accurate, but for performance reasons assume that string - // stores text, not binary data static size_t length(const std::basic_string_view& str) { return str.size(); @@ -247,13 +246,22 @@ bool StringPool::operator==(const StringPool &other) const { size_t detail::CaseInsensitiveHash::operator()(const std::string_view& str) const { + // FNV-1a Hash (Fowler–Noll–Vo) for case-insensitive hashing. + // Reference: http://www.isthe.com/chongo/tech/comp/fnv/#FNV-reference-source + // Selects 64-bit FNV-1a offset basis and prime if size_t is 8 bytes, + // and 32-bit FNV values if size_t is 4 bytes. + constexpr size_t offsetBasis = sizeof(size_t) == 4 ? 2166136261U : 14695981039346656037ULL; + constexpr size_t prime = sizeof(size_t) == 4 ? 16777619U : 1099511628211ULL; + + size_t hash = offsetBasis; std::locale locale{}; - auto hash = static_cast(14695981039346656037ULL); // FNV offset basis for 64-bit size_t. + for (auto c : str) { c = std::tolower(c, locale); hash ^= c; - hash *= static_cast(1099511628211ULL); // FNV prime for 64-bit size_t. + hash *= prime; } + return hash; }