Skip to content

Commit

Permalink
Fix bottleneck in string pool lookup performance by getting rid of lo…
Browse files Browse the repository at this point in the history
…wer-case copies.
  • Loading branch information
josephbirkner committed Oct 24, 2024
1 parent 35ecb61 commit 4c9ffc8
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 74 deletions.
1 change: 0 additions & 1 deletion include/simfil/model/bitsery-traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ struct ContainerTraits<sfl::segmented_vector<T, N, Allocator>>
: public StdContainer<sfl::segmented_vector<T, N, Allocator>, true, true>
{
};

}

template <typename S>
Expand Down
30 changes: 26 additions & 4 deletions include/simfil/model/string-pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,29 @@
#include <string>
#include <istream>
#include <ostream>
#include <deque>

namespace simfil
{

using StringId = uint16_t;
static_assert(std::is_unsigned_v<StringId>, "StringId must be unsigned!");

namespace detail
{
// Custom hash function for case-insensitive string hashing.
struct CaseInsensitiveHash
{
size_t operator()(const std::string_view& str) const;
};

// Custom equality comparison for case-insensitive string comparison.
struct CaseInsensitiveEqual
{
bool operator()(const std::string_view& lhs, const std::string_view& rhs) const;
};
} // namespace detail

/**
* Fast and efficient case-insensitive string interner,
* used to store object keys.
Expand Down Expand Up @@ -70,17 +86,23 @@ struct StringPool

/// Serialization - write to stream, starting from a specific
/// id offset if necessary (for partial serialisation).
virtual void write(std::ostream& outputStream, StringId offset = {}) const; // NOLINT
virtual void write(std::ostream& outputStream, StringId offset = {}) const; // NOLINT
virtual void read(std::istream& inputStream);

private:
mutable std::shared_mutex stringStoreMutex_;
std::unordered_map<std::string, StringId> idForString_;
std::unordered_map<StringId, std::string> stringForId_;
std::unordered_map<
std::string_view,
StringId,
detail::CaseInsensitiveHash,
detail::CaseInsensitiveEqual>
idForString_;
std::unordered_map<StringId, std::string_view> stringForId_;
std::deque<std::string> storedStrings_;
StringId nextId_ = FirstDynamicId;
std::atomic_int64_t byteSize_{0};
std::atomic_int64_t cacheHits_{0};
std::atomic_int64_t cacheMisses_{0};
};

}
} // namespace simfil
175 changes: 106 additions & 69 deletions src/model/string-pool.cpp
Original file line number Diff line number Diff line change
@@ -1,16 +1,45 @@
#include "simfil/model/string-pool.h"
#include "simfil/exception-handler.h"

#include <algorithm>
#include <bitsery/bitsery.h>
#include <bitsery/adapter/stream.h>
#include <bitsery/bitsery.h>
#include <bitsery/traits/string.h>
#include <fmt/core.h>
#include <algorithm>
#include <cmath>
#include <mutex>
#include <locale>
#include <mutex>
#include <stdexcept>

namespace bitsery
{
namespace traits
{

template<typename CharT, typename Traits>
struct ContainerTraits<std::basic_string_view<CharT, Traits>>
: public StdContainer<std::basic_string_view<CharT, Traits>, true, true>
{
};

template<typename CharT, typename Traits>
struct TextTraits<std::basic_string_view<CharT, Traits>>
{
using TValue = typename ContainerTraits<
std::basic_string_view<CharT, Traits>>::TValue;
// string is automatically null-terminated
static constexpr bool addNUL = false;

// is is not 100% accurate, but for performance reasons assume that string
// stores text, not binary data
static size_t length(const std::basic_string_view<CharT, Traits>& str)
{
return str.size();
}
};
}
}

namespace simfil
{

Expand All @@ -22,81 +51,72 @@ StringPool::StringPool()
addStaticKey(OverlayIndex, "$idx");
}

StringPool::StringPool(const StringPool& other) :
idForString_(other.idForString_),
stringForId_(other.stringForId_),
nextId_(other.nextId_),
byteSize_(other.byteSize_.load()),
cacheHits_(other.cacheHits_.load()),
cacheMisses_(other.cacheMisses_.load())
StringPool::StringPool(const StringPool& other)
: idForString_(other.idForString_),
stringForId_(other.stringForId_),
nextId_(other.nextId_),
byteSize_(other.byteSize_.load()),
cacheHits_(other.cacheHits_.load()),
cacheMisses_(other.cacheMisses_.load())
{
}

StringId StringPool::emplace(std::string_view const& str)
{
/// Unfortunately, we have to create a copy of the string here
/// on the heap for lower-casing.
/// Also we must use std::string as lookup type until C++ 20 is used:
/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0919r2.html
auto lowerCaseStr = std::string(str);
std::transform(
lowerCaseStr.begin(),
lowerCaseStr.end(),
lowerCaseStr.begin(),
[](auto ch) { return std::tolower(ch, std::locale{}); });

{
std::shared_lock stringStoreReadAccess_(stringStoreMutex_);
auto it = idForString_.find(lowerCaseStr);
std::shared_lock lock(stringStoreMutex_);
auto it = idForString_.find(str);
if (it != idForString_.end()) {
++cacheHits_;
return it->second;
}
}
{
std::unique_lock stringStoreWriteAccess_(stringStoreMutex_);
auto [it, insertionTookPlace] = idForString_.try_emplace(lowerCaseStr, nextId_);
if (insertionTookPlace) {
(void)stringForId_.try_emplace(nextId_, str);
byteSize_ += static_cast<int64_t>(str.size());
++cacheMisses_;
++nextId_;
if (nextId_ < it->second) {
raise<std::overflow_error>("StringPool id overflow!");
}
std::unique_lock lock(stringStoreMutex_);
// Double-check in case another thread added the string.
auto it = idForString_.find(str);
if (it != idForString_.end()) {
++cacheHits_;
return it->second;
}
return it->second;

// Store the string to maintain ownership.
auto& storedString = storedStrings_.emplace_back(str);
StringId id = nextId_++;
if (nextId_ < id) {
raise<std::overflow_error>("StringPool id overflow!");
}
idForString_.emplace(storedString, id);
stringForId_.emplace(id, storedString);
byteSize_ += static_cast<int64_t>(storedString.size());
++cacheMisses_;

return id;
}
}

StringId StringPool::get(std::string_view const& str)
{
auto lowerCaseStr = std::string(str);
std::transform(
lowerCaseStr.begin(),
lowerCaseStr.end(),
lowerCaseStr.begin(),
[](auto ch) { return std::tolower(ch, std::locale{}); });

std::shared_lock stringStoreReadAccess_(stringStoreMutex_);
auto it = idForString_.find(lowerCaseStr);
if (it != idForString_.end())
auto it = idForString_.find(str);
if (it != idForString_.end()) {
++cacheHits_;
return it->second;

}
return StringPool::Empty;
}

std::optional<std::string_view> StringPool::resolve(const StringId& id) const
{
std::shared_lock stringStoreReadAccess_(stringStoreMutex_);
const auto it = stringForId_.find(id);
auto it = stringForId_.find(id);
if (it != stringForId_.end())
return it->second;

return std::nullopt;
}

StringId StringPool::highest() const {
StringId StringPool::highest() const
{
return nextId_ - 1;
}

Expand All @@ -121,21 +141,17 @@ size_t StringPool::misses() const
return cacheMisses_;
}

void StringPool::addStaticKey(StringId k, std::string const& v) {
auto lowerCaseStr = v;
std::transform(
lowerCaseStr.begin(),
lowerCaseStr.end(),
lowerCaseStr.begin(),
[](auto ch) { return std::tolower(ch, std::locale{}); });

idForString_[lowerCaseStr] = k;
stringForId_[k] = v;
void StringPool::addStaticKey(StringId id, const std::string& value)
{
std::unique_lock lock(stringStoreMutex_);
auto& storedString = storedStrings_.emplace_back(value);
idForString_.emplace(storedString, id);
stringForId_.emplace(id, storedString);
}

void StringPool::write(std::ostream& outputStream, const StringId offset) const // NOLINT
void StringPool::write(std::ostream& outputStream, const StringId offset) const // NOLINT
{
std::shared_lock stringStoreReadAccess_(stringStoreMutex_);
std::shared_lock stringStoreReadAccess(stringStoreMutex_);
bitsery::Serializer<bitsery::OutputStreamAdapter> s(outputStream);

// Calculate how many strings will be sent
Expand Down Expand Up @@ -169,24 +185,17 @@ void StringPool::read(std::istream& inputStream)
s.value2b(rcvStringCount);

// Read strings
for (auto i = 0; i < rcvStringCount; ++i)
{
for (auto i = 0; i < rcvStringCount; ++i) {
// Read string key
StringId stringId{};
s.value2b(stringId);

// Don't support strings longer than 64kB.
std::string stringValue;
auto& stringValue = storedStrings_.emplace_back();
s.text1b(stringValue, std::numeric_limits<uint16_t>::max());
auto lowerCaseStringValue = std::string(stringValue);

// Insert string into the pool
std::transform(
lowerCaseStringValue.begin(),
lowerCaseStringValue.end(),
lowerCaseStringValue.begin(),
[](auto ch) { return std::tolower(ch, std::locale{}); });
auto [it, insertionTookPlace] = idForString_.try_emplace(lowerCaseStringValue, stringId);
auto [it, insertionTookPlace] = idForString_.try_emplace(stringValue, stringId);
if (insertionTookPlace) {
stringForId_.try_emplace(stringId, stringValue);
byteSize_ += static_cast<int64_t>(stringValue.size());
Expand All @@ -201,4 +210,32 @@ void StringPool::read(std::istream& inputStream)
}
}

size_t detail::CaseInsensitiveHash::operator()(const std::string_view& str) const
{
size_t hash = 14695981039346656037ull; // FNV offset basis for 64-bit size_t.
for (unsigned char c : str) {
// ASCII lowercase conversion.
if (c >= 'A' && c <= 'Z') {
constexpr auto lowercaseOffset = 'a' - 'A';
c += lowercaseOffset;
}
hash ^= c;
hash *= 1099511628211ull; // FNV prime for 64-bit size_t.
}
return hash;
}

bool detail::CaseInsensitiveEqual::operator()(
const std::string_view& lhs,
const std::string_view& rhs) const
{
std::locale locale;
return std::equal(
lhs.begin(),
lhs.end(),
rhs.begin(),
rhs.end(),
[&locale](unsigned char l, unsigned char r)
{ return std::tolower(l, locale) == std::tolower(r, locale); });
}
} // namespace simfil

0 comments on commit 4c9ffc8

Please sign in to comment.