Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix string pool lookup performance #97

Merged
merged 5 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion include/simfil/model/bitsery-traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ struct ContainerTraits<sfl::segmented_vector<T, N, Allocator>>
: public StdContainer<sfl::segmented_vector<T, N, Allocator>, true, true>
{
};

}

template <typename S>
Expand Down
33 changes: 29 additions & 4 deletions include/simfil/model/string-pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,29 @@
#include <string>
#include <istream>
#include <ostream>
#include <deque>

namespace simfil
{

/// Numeric identifier type for pooled strings; an alias for uint16_t.
using StringId = uint16_t;
// Compile-time guard: the build fails if StringId is ever changed to a signed type.
static_assert(std::is_unsigned_v<StringId>, "StringId must be unsigned!");

namespace detail
{
// Hash functor for case-insensitive string hashing.
// Intended to be paired with CaseInsensitiveEqual as the hasher of an
// unordered container keyed by std::string_view.
struct CaseInsensitiveHash
{
/// @param str  String to hash.
/// @return Hash value of str; the declaration alone does not fix the algorithm.
size_t operator()(const std::string_view& str) const;
};

// Equality functor for case-insensitive string comparison.
// Intended to be paired with CaseInsensitiveHash as the key-equality
// predicate of an unordered container keyed by std::string_view.
struct CaseInsensitiveEqual
{
/// @param lhs  First string.
/// @param rhs  Second string.
/// @return Whether lhs and rhs are considered equal by this predicate.
bool operator()(const std::string_view& lhs, const std::string_view& rhs) const;
};
} // namespace detail

/**
* Fast and efficient case-insensitive string interner,
* used to store object keys.
Expand Down Expand Up @@ -70,17 +86,26 @@ struct StringPool

/// Serialization - write to stream, starting from a specific
/// id offset if necessary (for partial serialisation).
virtual void write(std::ostream& outputStream, StringId offset = {}) const; // NOLINT
virtual void write(std::ostream& outputStream, StringId offset = {}) const; // NOLINT
virtual void read(std::istream& inputStream);

/// Check if the content of the string pools is logically identical.
bool operator== (StringPool const& other) const;

private:
mutable std::shared_mutex stringStoreMutex_;
std::unordered_map<std::string, StringId> idForString_;
std::unordered_map<StringId, std::string> stringForId_;
std::unordered_map<
std::string_view,
StringId,
detail::CaseInsensitiveHash,
detail::CaseInsensitiveEqual>
idForString_;
std::unordered_map<StringId, std::string_view> stringForId_;
std::deque<std::string> storedStrings_;
StringId nextId_ = FirstDynamicId;
std::atomic_int64_t byteSize_{0};
std::atomic_int64_t cacheHits_{0};
std::atomic_int64_t cacheMisses_{0};
};

}
} // namespace simfil
11 changes: 9 additions & 2 deletions src/model/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,14 +329,21 @@ std::shared_ptr<StringPool> ModelPool::strings() const

/**
 * Replace the string pool used by this model pool and re-map all object
 * member names from the previous pool to the new one.
 *
 * @param strings  New string pool; must not be null.
 *
 * Raises std::runtime_error (via raise<>) if `strings` is null.
 * If there was no previous pool, or the previous pool compares equal to
 * the new one, no translation is performed.
 */
void ModelPool::setStrings(std::shared_ptr<StringPool> const& strings)
{
    if (!strings)
        raise<std::runtime_error>("Attempt to call ModelPool::setStrings(nullptr)!");

    // Keep the previous pool alive: the old IDs stored in the members can
    // only be resolved against the pool that issued them.
    auto oldStrings = impl_->strings_;
    impl_->strings_ = strings;
    if (!oldStrings || *strings == *oldStrings)
        return;

    // Translate object field IDs to the new dictionary.
    // NOTE(review): `memberArray` is taken by value, which is only correct if
    // iterating objectMemberArrays_ yields lightweight range/proxy objects
    // that refer to the stored members - confirm against the arena type.
    for (auto memberArray : impl_->columns_.objectMemberArrays_) {
        for (auto& member : memberArray) {
            // Resolve through the OLD pool; IDs unknown to it are left untouched.
            if (auto resolvedName = oldStrings->resolve(member.name_))
                member.name_ = strings->emplace(*resolvedName);
        }
    }
}

std::optional<std::string_view> ModelPool::lookupStringId(const StringId id) const
Expand Down
215 changes: 146 additions & 69 deletions src/model/string-pool.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,43 @@
#include "simfil/model/string-pool.h"
#include "simfil/exception-handler.h"

#include <algorithm>
#include <bitsery/bitsery.h>
#include <bitsery/adapter/stream.h>
#include <bitsery/bitsery.h>
#include <bitsery/traits/string.h>
#include <fmt/core.h>
#include <algorithm>
#include <cmath>
#include <mutex>
#include <locale>
#include <stdexcept>
#include <locale>

/**
 * Note: This code is derived from bitsery's traits/string.h and adapted
 * to handle (de-)serialization of a string view.
 */
namespace bitsery::traits
{

/// Container traits for std::basic_string_view, forwarding to StdContainer
/// with both boolean template flags set to true.
template <typename Char, typename CharTraits>
struct ContainerTraits<std::basic_string_view<Char, CharTraits>>
    : public StdContainer<std::basic_string_view<Char, CharTraits>, true, true>
{
};

/// Text traits for std::basic_string_view: no NUL terminator is appended and
/// the text length is the view's size.
template <typename Char, typename CharTraits>
struct TextTraits<std::basic_string_view<Char, CharTraits>>
{
    using TValue =
        typename ContainerTraits<std::basic_string_view<Char, CharTraits>>::TValue;

    static constexpr bool addNUL = false;

    static size_t length(const std::basic_string_view<Char, CharTraits>& view)
    {
        return view.size();
    }
};

} // namespace bitsery::traits

namespace simfil
{
Expand All @@ -22,81 +50,103 @@ StringPool::StringPool()
addStaticKey(OverlayIndex, "$idx");
}

/**
 * Copy constructor.
 *
 * Both lookup tables hold std::string_view values that point into the
 * owning storedStrings_ container. A member-wise copy of the tables would
 * therefore leave this pool with views into `other`'s storage, which dangle
 * as soon as `other` is destroyed or modified. Instead, the owned strings
 * are copied first and every view is re-bound to the corresponding string
 * in this pool's own storage.
 *
 * @param other  Pool to copy from; it is read under a shared lock.
 *
 * Raises std::runtime_error (via raise<>) if a view in `other` does not
 * refer to one of its stored strings.
 */
StringPool::StringPool(const StringPool& other)
{
    // Acquire both locks together to avoid lock-order problems.
    std::unique_lock lockThis(stringStoreMutex_, std::defer_lock);
    std::shared_lock lockOther(other.stringStoreMutex_, std::defer_lock);
    std::lock(lockThis, lockOther);

    // Copy the owned string storage.
    storedStrings_ = other.storedStrings_;

    // Map each old string's data pointer to a view of the copied string.
    std::unordered_map<const char*, std::string_view> strDataToNewStrView;
    strDataToNewStrView.reserve(storedStrings_.size());
    for (size_t i = 0; i < other.storedStrings_.size(); ++i) {
        strDataToNewStrView[other.storedStrings_[i].data()] = storedStrings_[i];
    }

    // Rebuild idForString_ with views pointing into this->storedStrings_.
    idForString_.clear();
    for (const auto& [oldStrView, id] : other.idForString_) {
        const auto it = strDataToNewStrView.find(oldStrView.data());
        if (it != strDataToNewStrView.end()) {
            idForString_.emplace(it->second, id);
        }
        else {
            // This should not happen if everything is consistent.
            raise<std::runtime_error>("Failed to rebuild idForString_ in StringPool copy constructor");
        }
    }

    // Rebuild stringForId_ the same way; copying it verbatim would keep
    // views into other.storedStrings_.
    stringForId_.clear();
    for (const auto& [id, oldStrView] : other.stringForId_) {
        const auto it = strDataToNewStrView.find(oldStrView.data());
        if (it != strDataToNewStrView.end()) {
            stringForId_.emplace(id, it->second);
        }
        else {
            // This should not happen if everything is consistent.
            raise<std::runtime_error>("Failed to rebuild stringForId_ in StringPool copy constructor");
        }
    }

    // Copy the remaining scalar state.
    nextId_ = other.nextId_;
    byteSize_ = other.byteSize_.load();
    cacheHits_ = other.cacheHits_.load();
    cacheMisses_ = other.cacheMisses_.load();
}

/**
 * Return the ID of `str`, adding it to the pool if it is not yet present.
 * Lookup uses the map's case-insensitive hash/equality, so no lower-cased
 * temporary copy of the string is needed.
 *
 * @param str  String to look up or insert.
 * @return The existing ID if an equal key is already pooled, otherwise the
 *         newly assigned ID.
 *
 * Raises std::overflow_error (via raise<>) when the ID space is exhausted.
 */
StringId StringPool::emplace(std::string_view const& str)
{
    {
        // Fast path: look the string up under a shared (read) lock.
        std::shared_lock lock(stringStoreMutex_);
        auto it = idForString_.find(str);
        if (it != idForString_.end()) {
            ++cacheHits_;
            return it->second;
        }
    }
    {
        std::unique_lock lock(stringStoreMutex_);
        // Double-check in case another thread added the string
        // between releasing the read lock and taking the write lock.
        auto it = idForString_.find(str);
        if (it != idForString_.end()) {
            ++cacheHits_;
            return it->second;
        }

        // Check for ID exhaustion before touching any state, so that a
        // failed call neither leaves an orphaned entry in storedStrings_
        // nor wraps nextId_ around.
        if (static_cast<StringId>(nextId_ + 1) < nextId_) {
            raise<std::overflow_error>("StringPool id overflow!");
        }

        // Store the string to maintain ownership; both maps only hold views.
        auto& storedString = storedStrings_.emplace_back(str);
        const StringId id = nextId_++;
        idForString_.emplace(storedString, id);
        stringForId_.emplace(id, storedString);
        byteSize_ += static_cast<int64_t>(storedString.size());
        ++cacheMisses_;

        return id;
    }
}

/**
 * Look up the ID of `str` without inserting it.
 * Lookup uses the map's case-insensitive hash/equality, so no lower-cased
 * temporary copy of the string is needed.
 *
 * @param str  String to look up.
 * @return The ID of the pooled string, or StringPool::Empty if it is not
 *         present. A successful lookup increments the hit counter.
 */
StringId StringPool::get(std::string_view const& str)
{
    std::shared_lock stringStoreReadAccess_(stringStoreMutex_);
    if (const auto it = idForString_.find(str); it != idForString_.end()) {
        ++cacheHits_;
        return it->second;
    }
    return StringPool::Empty;
}

/**
 * Look up the string stored for an ID.
 *
 * @param id  ID to resolve.
 * @return A view of the pooled string, or std::nullopt if the ID is unknown.
 */
std::optional<std::string_view> StringPool::resolve(const StringId& id) const
{
    std::shared_lock stringStoreReadAccess_(stringStoreMutex_);
    if (const auto it = stringForId_.find(id); it != stringForId_.end())
        return it->second;

    return std::nullopt;
}

/// Returns the ID immediately preceding the next ID to be assigned
/// (nextId_ - 1).
StringId StringPool::highest() const
{
    return nextId_ - 1;
}

Expand All @@ -121,21 +171,17 @@ size_t StringPool::misses() const
return cacheMisses_;
}

/**
 * Register a string under a caller-chosen ID.
 *
 * @param id     ID to associate with the string.
 * @param value  String to register; a copy is stored in the pool.
 *
 * The copy is appended to storedStrings_ and both lookup tables receive
 * views of that copy. Entries whose key already exists are left unchanged,
 * since emplace() does not overwrite.
 */
void StringPool::addStaticKey(StringId id, const std::string& value)
{
    std::unique_lock lock(stringStoreMutex_);
    auto& storedString = storedStrings_.emplace_back(value);
    idForString_.emplace(storedString, id);
    stringForId_.emplace(id, storedString);
}

void StringPool::write(std::ostream& outputStream, const StringId offset) const // NOLINT
void StringPool::write(std::ostream& outputStream, const StringId offset) const // NOLINT
{
std::shared_lock stringStoreReadAccess_(stringStoreMutex_);
std::shared_lock stringStoreReadAccess(stringStoreMutex_);
bitsery::Serializer<bitsery::OutputStreamAdapter> s(outputStream);

// Calculate how many strings will be sent
Expand Down Expand Up @@ -169,24 +215,17 @@ void StringPool::read(std::istream& inputStream)
s.value2b(rcvStringCount);

// Read strings
for (auto i = 0; i < rcvStringCount; ++i)
{
for (auto i = 0; i < rcvStringCount; ++i) {
// Read string key
StringId stringId{};
s.value2b(stringId);

// Don't support strings longer than 64kB.
std::string stringValue;
auto& stringValue = storedStrings_.emplace_back();
s.text1b(stringValue, std::numeric_limits<uint16_t>::max());
auto lowerCaseStringValue = std::string(stringValue);

// Insert string into the pool
std::transform(
lowerCaseStringValue.begin(),
lowerCaseStringValue.end(),
lowerCaseStringValue.begin(),
[](auto ch) { return std::tolower(ch, std::locale{}); });
auto [it, insertionTookPlace] = idForString_.try_emplace(lowerCaseStringValue, stringId);
auto [it, insertionTookPlace] = idForString_.try_emplace(stringValue, stringId);
if (insertionTookPlace) {
stringForId_.try_emplace(stringId, stringValue);
byteSize_ += static_cast<int64_t>(stringValue.size());
Expand All @@ -201,4 +240,42 @@ void StringPool::read(std::istream& inputStream)
}
}

/// Two pools compare equal when their string-to-ID tables compare equal.
bool StringPool::operator==(const StringPool& other) const
{
    const auto& ownTable = idForString_;
    const auto& otherTable = other.idForString_;
    return ownTable == otherTable;
}

size_t detail::CaseInsensitiveHash::operator()(const std::string_view& str) const
josephbirkner marked this conversation as resolved.
Show resolved Hide resolved
{
// FNV-1a Hash (Fowler–Noll–Vo) for case-insensitive hashing.
// Reference: http://www.isthe.com/chongo/tech/comp/fnv/#FNV-reference-source
// Selects 64-bit FNV-1a offset basis and prime if size_t is 8 bytes,
// and 32-bit FNV values if size_t is 4 bytes.
constexpr size_t offsetBasis = sizeof(size_t) == 4 ? 2166136261U : 14695981039346656037ULL;
constexpr size_t prime = sizeof(size_t) == 4 ? 16777619U : 1099511628211ULL;

size_t hash = offsetBasis;
std::locale locale{};

for (auto c : str) {
c = std::tolower(c, locale);
hash ^= c;
hash *= prime;
}

return hash;
}

/**
 * Case-insensitive equality of two strings.
 *
 * @param lhs  First string.
 * @param rhs  Second string.
 * @return true if both strings have the same length and every pair of
 *         corresponding characters is equal after lower-casing with a
 *         default-constructed std::locale.
 */
bool detail::CaseInsensitiveEqual::operator()(
    const std::string_view& lhs,
    const std::string_view& rhs) const
{
    const std::locale loc{};

    // Strings of different length can never match.
    if (lhs.size() != rhs.size())
        return false;

    for (size_t pos = 0; pos < lhs.size(); ++pos) {
        if (std::tolower(lhs[pos], loc) != std::tolower(rhs[pos], loc))
            return false;
    }
    return true;
}
} // namespace simfil
Loading