Skip to content

Commit

Permalink
Better estimation for ColumnLowCardinality::Reserve and ColumnString:…
Browse files Browse the repository at this point in the history
…:Reserve

ColumnLowCardinality assumes that not all items are unique,
  hence dictionary column can be reserved for smaller capacity;
ColumnString now allows to set average value size estimation
  in constructor or on existing instance.
  If estimation is close to real average value size, then memory pre-allocations are close to optimum.
Enmk committed Nov 16, 2023
1 parent 0f8b396 commit 90414cc
Showing 32 changed files with 442 additions and 33 deletions.
4 changes: 4 additions & 0 deletions clickhouse/columns/array.cpp
Original file line number Diff line number Diff line change
@@ -110,6 +110,10 @@ size_t ColumnArray::Size() const {
return offsets_->Size();
}

/// Estimated RAM usage in bytes: the offsets column plus the nested data column.
size_t ColumnArray::MemoryUsage() const {
    const size_t offsets_bytes = offsets_->MemoryUsage();
    const size_t data_bytes = data_->MemoryUsage();
    return offsets_bytes + data_bytes;
}

void ColumnArray::Swap(Column& other) {
auto & col = dynamic_cast<ColumnArray &>(other);
data_.swap(col.data_);
2 changes: 2 additions & 0 deletions clickhouse/columns/array.h
Original file line number Diff line number Diff line change
@@ -71,6 +71,8 @@ class ColumnArray : public Column {
/// Returns count of rows in the column.
size_t Size() const override;

size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t, size_t) const override;
ColumnRef CloneEmpty() const override;
3 changes: 3 additions & 0 deletions clickhouse/columns/column.h
Original file line number Diff line number Diff line change
@@ -90,6 +90,9 @@ class Column : public std::enable_shared_from_this<Column> {

virtual void Swap(Column&) = 0;

/// Estimated RAM usage by the column in bytes.
virtual size_t MemoryUsage() const = 0;

/// Get a view on raw item data if it is supported by column, will throw an exception if index is out of range.
/// Please note that view is invalidated once column items are added or deleted, column is loaded from stream or destroyed.
virtual ItemView GetItem(size_t) const {
16 changes: 16 additions & 0 deletions clickhouse/columns/date.cpp
Original file line number Diff line number Diff line change
@@ -67,6 +67,10 @@ size_t ColumnDate::Size() const {
return data_->Size();
}

/// Estimated RAM usage in bytes, as reported by the underlying data_ column.
size_t ColumnDate::MemoryUsage() const {
    const size_t nested_bytes = data_->MemoryUsage();
    return nested_bytes;
}

ColumnRef ColumnDate::Slice(size_t begin, size_t len) const {
auto col = data_->Slice(begin, len)->As<ColumnUInt16>();
auto result = std::make_shared<ColumnDate>();
@@ -154,6 +158,10 @@ size_t ColumnDate32::Size() const {
return data_->Size();
}

/// Estimated RAM usage in bytes, as reported by the underlying data_ column.
size_t ColumnDate32::MemoryUsage() const {
    const size_t nested_bytes = data_->MemoryUsage();
    return nested_bytes;
}

ColumnRef ColumnDate32::Slice(size_t begin, size_t len) const {
auto col = data_->Slice(begin, len)->As<ColumnInt32>();
auto result = std::make_shared<ColumnDate32>();
@@ -244,6 +252,10 @@ size_t ColumnDateTime::Size() const {
return data_->Size();
}

/// Estimated RAM usage in bytes, as reported by the underlying data_ column.
size_t ColumnDateTime::MemoryUsage() const {
    const size_t nested_bytes = data_->MemoryUsage();
    return nested_bytes;
}

/// Clears the column by clearing the underlying data_ column.
void ColumnDateTime::Clear() {
data_->Clear();
}
@@ -330,6 +342,10 @@ size_t ColumnDateTime64::Size() const {
return data_->Size();
}

/// Estimated RAM usage in bytes, as reported by the underlying data_ column.
size_t ColumnDateTime64::MemoryUsage() const {
    const size_t nested_bytes = data_->MemoryUsage();
    return nested_bytes;
}

/// Returns a view of the item at `index`: the item view obtained from the
/// underlying data_ column, re-tagged with Type::DateTime64.
ItemView ColumnDateTime64::GetItem(size_t index) const {
return ItemView(Type::DateTime64, data_->GetItem(index));
}
4 changes: 4 additions & 0 deletions clickhouse/columns/date.h
Original file line number Diff line number Diff line change
@@ -51,6 +51,7 @@ class ColumnDate : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
@@ -109,6 +110,7 @@ class ColumnDate32 : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
@@ -170,6 +172,7 @@ class ColumnDateTime : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
@@ -223,6 +226,7 @@ class ColumnDateTime64 : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
4 changes: 4 additions & 0 deletions clickhouse/columns/decimal.cpp
Original file line number Diff line number Diff line change
@@ -217,6 +217,10 @@ size_t ColumnDecimal::Size() const {
return data_->Size();
}

/// Estimated RAM usage in bytes, as reported by the underlying data_ column.
size_t ColumnDecimal::MemoryUsage() const {
    const size_t nested_bytes = data_->MemoryUsage();
    return nested_bytes;
}

ColumnRef ColumnDecimal::Slice(size_t begin, size_t len) const {
// couldn't use std::make_shared since this c-tor is private
return ColumnRef{new ColumnDecimal(type_, data_->Slice(begin, len))};
1 change: 1 addition & 0 deletions clickhouse/columns/decimal.h
Original file line number Diff line number Diff line change
@@ -28,6 +28,7 @@ class ColumnDecimal : public Column {
void SaveBody(OutputStream* output) override;
void Clear() override;
size_t Size() const override;
size_t MemoryUsage() const override;
ColumnRef Slice(size_t begin, size_t len) const override;
ColumnRef CloneEmpty() const override;
void Swap(Column& other) override;
5 changes: 5 additions & 0 deletions clickhouse/columns/enum.cpp
Original file line number Diff line number Diff line change
@@ -96,6 +96,11 @@ size_t ColumnEnum<T>::Size() const {
return data_.size();
}

/// Estimated RAM usage in bytes: allocated capacity of data_ (not just its
/// current size) multiplied by the size of one stored element.
template <typename T>
size_t ColumnEnum<T>::MemoryUsage() const {
    const size_t element_size = sizeof(*data_.begin());
    const size_t allocated_elements = data_.capacity();
    return allocated_elements * element_size;
}

template <typename T>
ColumnRef ColumnEnum<T>::Slice(size_t begin, size_t len) const {
return std::make_shared<ColumnEnum<T>>(type_, SliceVector(data_, begin, len));
1 change: 1 addition & 0 deletions clickhouse/columns/enum.h
Original file line number Diff line number Diff line change
@@ -47,6 +47,7 @@ class ColumnEnum : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
6 changes: 6 additions & 0 deletions clickhouse/columns/geo.cpp
Original file line number Diff line number Diff line change
@@ -76,11 +76,17 @@ void ColumnGeo<NestedColumnType, type_code>::SaveBody(OutputStream* output) {
data_->SaveBody(output);
}


/// Number of rows, as reported by the underlying data_ column.
template <typename NestedColumnType, Type::Code type_code>
size_t ColumnGeo<NestedColumnType, type_code>::Size() const {
    const size_t row_count = data_->Size();
    return row_count;
}

/// Estimated RAM usage in bytes, as reported by the underlying data_ column.
template <typename NestedColumnType, Type::Code type_code>
size_t ColumnGeo<NestedColumnType, type_code>::MemoryUsage() const {
    const size_t nested_bytes = data_->MemoryUsage();
    return nested_bytes;
}

template <typename NestedColumnType, Type::Code type_code>
ColumnRef ColumnGeo<NestedColumnType, type_code>::Slice(size_t begin, size_t len) const {
return std::make_shared<ColumnGeo>(data_->Slice(begin, len));
1 change: 1 addition & 0 deletions clickhouse/columns/geo.h
Original file line number Diff line number Diff line change
@@ -46,6 +46,7 @@ class ColumnGeo : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
4 changes: 4 additions & 0 deletions clickhouse/columns/ip4.cpp
Original file line number Diff line number Diff line change
@@ -96,6 +96,10 @@ size_t ColumnIPv4::Size() const {
return data_->Size();
}

/// Estimated RAM usage in bytes, as reported by the underlying data_ column.
size_t ColumnIPv4::MemoryUsage() const {
    const size_t nested_bytes = data_->MemoryUsage();
    return nested_bytes;
}

/// Makes a slice of the current column: slices the underlying data_ column
/// with the same `begin` / `len` and wraps the result in a new ColumnIPv4.
ColumnRef ColumnIPv4::Slice(size_t begin, size_t len) const {
return std::make_shared<ColumnIPv4>(data_->Slice(begin, len));
}
1 change: 1 addition & 0 deletions clickhouse/columns/ip4.h
Original file line number Diff line number Diff line change
@@ -56,6 +56,7 @@ class ColumnIPv4 : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
4 changes: 4 additions & 0 deletions clickhouse/columns/ip6.cpp
Original file line number Diff line number Diff line change
@@ -87,6 +87,10 @@ size_t ColumnIPv6::Size() const {
return data_->Size();
}

/// Estimated RAM usage in bytes, as reported by the underlying data_ column.
size_t ColumnIPv6::MemoryUsage() const {
    const size_t nested_bytes = data_->MemoryUsage();
    return nested_bytes;
}

/// Makes a slice of the current column: slices the underlying data_ column
/// with the same `begin` / `len` and wraps the result in a new ColumnIPv6.
ColumnRef ColumnIPv6::Slice(size_t begin, size_t len) const {
return std::make_shared<ColumnIPv6>(data_->Slice(begin, len));
}
1 change: 1 addition & 0 deletions clickhouse/columns/ip6.h
Original file line number Diff line number Diff line change
@@ -52,6 +52,7 @@ class ColumnIPv6 : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
45 changes: 40 additions & 5 deletions clickhouse/columns/lowcardinality.cpp
Original file line number Diff line number Diff line change
@@ -174,9 +174,39 @@ ColumnLowCardinality::ColumnLowCardinality(std::shared_ptr<ColumnNullable> dicti
/// Destructor with an empty body: it does no work of its own beyond the
/// implicit destruction of the members.
ColumnLowCardinality::~ColumnLowCardinality()
{}

namespace
{
size_t EstimateDictionaryCapacity(size_t new_cap)
{
    // Guess how many dictionary slots a LowCardinality column needs
    // when its index column is reserved for `new_cap` rows.
    //  - below 128 rows: every row may be unique, so the dictionary gets `new_cap` slots;
    //  - 512 rows and more: at least 80% duplicates are assumed, so the dictionary gets 0.20 * new_cap slots;
    //  - in between: the dictionary-to-index ratio falls linearly from 1.0 down to 0.20.
    // The result is truncated towards zero when converted back to size_t.

    const float ratio_for_small = 1.0;
    const float ratio_for_large = 0.20;
    const float large_threshold = 512;
    const float small_threshold = 128;

    const bool is_large = new_cap >= large_threshold;
    if (is_large)
        return new_cap * ratio_for_large;

    const bool is_small = new_cap < small_threshold;
    if (is_small)
        return new_cap;

    // Linear interpolation between (small_threshold, ratio_for_small) and (large_threshold, ratio_for_large).
    const float ratio = ratio_for_small + (small_threshold - static_cast<int>(new_cap)) * (ratio_for_small - ratio_for_large) / (large_threshold - small_threshold);
    return new_cap * ratio;
}
}

void ColumnLowCardinality::Reserve(size_t new_cap) {
dictionary_column_->Reserve(new_cap);
index_column_->Reserve(new_cap);

dictionary_column_->Reserve(EstimateDictionaryCapacity(new_cap));
}

void ColumnLowCardinality::Setup(ColumnRef dictionary_column) {
@@ -379,6 +409,13 @@ size_t ColumnLowCardinality::Size() const {
return index_column_->Size();
}

/// Estimated RAM usage in bytes: an estimate for the unique-items hash map
/// plus the memory reported by the index and the dictionary columns.
size_t ColumnLowCardinality::MemoryUsage() const {
    // Size of one (key, value) entry of the unique-items map.
    const size_t map_entry_size = sizeof(unique_items_map_.begin()->first) + sizeof(unique_items_map_.begin()->second);

    // max_load_factor() is a floating point value; keep it confined to this one
    // product, so that the byte counts of the nested columns below are summed in
    // exact size_t arithmetic instead of being rounded to float precision.
    const size_t estimated_map_entries = static_cast<size_t>(
            static_cast<double>(unique_items_map_.bucket_count()) * unique_items_map_.max_load_factor());

    return estimated_map_entries * map_entry_size
            + index_column_->MemoryUsage()
            + dictionary_column_->MemoryUsage();
}

ColumnRef ColumnLowCardinality::Slice(size_t begin, size_t len) const {
begin = std::min(begin, Size());
len = std::min(len, Size() - begin);
@@ -451,15 +488,13 @@ void ColumnLowCardinality::AppendUnsafe(const ItemView & value) {
}
}

void ColumnLowCardinality::AppendNullItem()
{
void ColumnLowCardinality::AppendNullItem() {
const auto null_item = GetNullItemForDictionary(dictionary_column_);
AppendToDictionary(*dictionary_column_, null_item);
unique_items_map_.emplace(computeHashKey(null_item), 0);
}

void ColumnLowCardinality::AppendDefaultItem()
{
void ColumnLowCardinality::AppendDefaultItem() {
const auto defaultItem = GetDefaultItemForDictionary(dictionary_column_);
unique_items_map_.emplace(computeHashKey(defaultItem), dictionary_column_->Size());
AppendToDictionary(*dictionary_column_, defaultItem);
1 change: 1 addition & 0 deletions clickhouse/columns/lowcardinality.h
Original file line number Diff line number Diff line change
@@ -87,6 +87,7 @@ class ColumnLowCardinality : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of current column, with compacted dictionary
ColumnRef Slice(size_t begin, size_t len) const override;
4 changes: 4 additions & 0 deletions clickhouse/columns/map.cpp
Original file line number Diff line number Diff line change
@@ -67,6 +67,10 @@ size_t ColumnMap::Size() const {
return data_->Size();
}

/// Estimated RAM usage in bytes, as reported by the underlying data_ column.
size_t ColumnMap::MemoryUsage() const {
    const size_t nested_bytes = data_->MemoryUsage();
    return nested_bytes;
}

/// Makes a slice of the current column: slices the underlying data_ column
/// with the same `begin` / `len` and wraps the result in a new ColumnMap.
ColumnRef ColumnMap::Slice(size_t begin, size_t len) const {
return std::make_shared<ColumnMap>(data_->Slice(begin, len));
}
1 change: 1 addition & 0 deletions clickhouse/columns/map.h
Original file line number Diff line number Diff line change
@@ -48,6 +48,7 @@ class ColumnMap : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t, size_t) const override;
2 changes: 2 additions & 0 deletions clickhouse/columns/nothing.h
Original file line number Diff line number Diff line change
@@ -75,6 +75,8 @@ class ColumnNothing : public Column {
/// Returns count of rows in the column.
size_t Size() const override { return size_; }

size_t MemoryUsage() const override { return 0; }

void Swap(Column& other) override {
auto & col = dynamic_cast<ColumnNothing &>(other);
std::swap(size_, col.size_);
4 changes: 4 additions & 0 deletions clickhouse/columns/nullable.cpp
Original file line number Diff line number Diff line change
@@ -82,6 +82,10 @@ size_t ColumnNullable::Size() const {
return nulls_->Size();
}

/// Estimated RAM usage in bytes: the nested column plus the nulls column.
size_t ColumnNullable::MemoryUsage() const {
    const size_t nested_bytes = nested_->MemoryUsage();
    const size_t nulls_bytes = nulls_->MemoryUsage();
    return nested_bytes + nulls_bytes;
}

/// Makes a slice of the current column: both the nested column and the nulls
/// column are sliced with the same `begin` / `len`, and the two slices are
/// wrapped in a new ColumnNullable.
ColumnRef ColumnNullable::Slice(size_t begin, size_t len) const {
return std::make_shared<ColumnNullable>(nested_->Slice(begin, len), nulls_->Slice(begin, len));
}
1 change: 1 addition & 0 deletions clickhouse/columns/nullable.h
Original file line number Diff line number Diff line change
@@ -50,6 +50,7 @@ class ColumnNullable : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
5 changes: 5 additions & 0 deletions clickhouse/columns/numeric.cpp
Original file line number Diff line number Diff line change
@@ -87,6 +87,11 @@ size_t ColumnVector<T>::Size() const {
return data_.size();
}

/// Estimated RAM usage in bytes: allocated capacity of data_ (not just its
/// current size) multiplied by the size of one stored element.
template <typename T>
size_t ColumnVector<T>::MemoryUsage() const {
    const size_t element_size = sizeof(data_[0]);
    const size_t allocated_elements = data_.capacity();
    return allocated_elements * element_size;
}

template <typename T>
ColumnRef ColumnVector<T>::Slice(size_t begin, size_t len) const {
return std::make_shared<ColumnVector<T>>(SliceVector(data_, begin, len));
1 change: 1 addition & 0 deletions clickhouse/columns/numeric.h
Original file line number Diff line number Diff line change
@@ -57,6 +57,7 @@ class ColumnVector : public Column {

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
size_t MemoryUsage() const override;
ColumnRef CloneEmpty() const override;
void Swap(Column& other) override;

112 changes: 91 additions & 21 deletions clickhouse/columns/string.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "string.h"
#include <memory>
#include "clickhouse/exceptions.h"
#include "utils.h"

#include "../base/wire_format.h"
@@ -20,6 +22,13 @@ size_t ComputeTotalSize(const Container & strings, size_t begin = 0, size_t len
return result;
}

// based on https://stackoverflow.com/a/9194117
/// Rounds `numToRound` up to the nearest multiple of `multiple`.
/// Values that already are a multiple (including 0) are returned unchanged.
/// A `multiple` of 0 makes no sense for rounding, so the input is returned
/// as-is instead of dividing by zero.
size_t RoundUp(size_t numToRound, size_t multiple)
{
    if (multiple == 0)
        return numToRound;

    // Work with the remainder rather than `(n + multiple - 1) / multiple * multiple`,
    // so the intermediate sum can not wrap around for inputs close to the size_t maximum.
    const size_t remainder = numToRound % multiple;
    return remainder == 0 ? numToRound : numToRound + (multiple - remainder);
}

}

namespace clickhouse {
@@ -105,6 +114,10 @@ ColumnRef ColumnFixedString::Slice(size_t begin, size_t len) const {
return result;
}

/// Estimated RAM usage in bytes: the allocated capacity of data_.
size_t ColumnFixedString::MemoryUsage() const {
    const size_t allocated_bytes = data_.capacity();
    return allocated_bytes;
}

/// Creates a new ColumnFixedString constructed with the same string_size_ as this one.
ColumnRef ColumnFixedString::CloneEmpty() const {
return std::make_shared<ColumnFixedString>(string_size_);
}
@@ -119,6 +132,22 @@ ItemView ColumnFixedString::GetItem(size_t index) const {
return ItemView{Type::FixedString, this->At(index)};
}

namespace {

/// Computes the average value size (rounded up) for `number_of_items` values
/// that occupy `total_size` bytes in total.
/// Returns ColumnString::DEFAULT_ESTIMATION when the computed average is 0.
size_t ComputeValueSizeEstimation(size_t total_size, size_t number_of_items) {
    // Treat "no items" as a single item, just to avoid dividing by zero.
    const size_t divisor = number_of_items ? number_of_items : 1;

    // Ceiling division done in integers: exact for every size_t value
    // (no round trip through double) and free of intermediate overflow.
    const size_t estimation = total_size / divisor + (total_size % divisor != 0 ? 1 : 0);

    return estimation == 0 ? ColumnString::DEFAULT_ESTIMATION : estimation;
}

/// Size of the next block to allocate: room for an (arbitrarily chosen) 32
/// values of the estimated size, but never less than DEFAULT_BLOCK_SIZE.
size_t EstimateNextBlockSize(size_t value_size_estimation) {
    const size_t items_per_block_guess = 32; // just arbitrary value
    const size_t wanted_size = value_size_estimation * items_per_block_guess;
    return std::max<size_t>(DEFAULT_BLOCK_SIZE, wanted_size);
}

}

struct ColumnString::Block
{
using CharT = typename std::string::value_type;
@@ -157,28 +186,34 @@ struct ColumnString::Block
std::unique_ptr<CharT[]> data_;
};

ColumnString::ColumnString()
ColumnString::ColumnString(EstimatedValueSize value_size_estimation)
: Column(Type::CreateString())
, value_size_estimation_(value_size_estimation)
, next_block_size_(DEFAULT_BLOCK_SIZE)
{
if (value_size_estimation < 0)
throw ValidationError("ColumnString received negative number as value size estimation");
}

ColumnString::ColumnString(size_t element_count)
: Column(Type::CreateString())
ColumnString::ColumnString(size_t element_count, EstimatedValueSize value_size_estimation)
: ColumnString(value_size_estimation)
{
items_.reserve(element_count);
// 16 is arbitrary number, assumption that string values are about ~256 bytes long.
blocks_.reserve(std::max<size_t>(1, element_count / 16));
blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, RoundUp(element_count * value_size_estimation_, DEFAULT_BLOCK_SIZE)));
}

ColumnString::ColumnString(const std::vector<std::string>& data)
: ColumnString()
{
const auto total_size = ComputeTotalSize(data);
items_.reserve(data.size());
blocks_.emplace_back(ComputeTotalSize(data));
blocks_.emplace_back(total_size);

for (const auto & s : data) {
AppendUnsafe(s);
}

value_size_estimation_ = ComputeValueSizeEstimation(total_size, data.size());
}

ColumnString::ColumnString(std::vector<std::string>&& data)
@@ -191,20 +226,33 @@ ColumnString::ColumnString(std::vector<std::string>&& data)
auto& last_data = append_data_.back();
items_.emplace_back(std::string_view{ last_data.data(),last_data.length() });
}

value_size_estimation_ = ComputeValueSizeEstimation(ComputeTotalSize(items_), items_.size());
}

/// Destructor with an empty body: it does no work of its own beyond the
/// implicit destruction of the members.
// NOTE(review): presumably defined here in the .cpp because ColumnString::Block
// is only defined in this file - confirm before moving it into the header.
ColumnString::~ColumnString()
{}

void ColumnString::Reserve(size_t new_cap) {
items_.reserve(new_cap);
// 16 is arbitrary number, assumption that string values are about ~256 bytes long.
blocks_.reserve(std::max<size_t>(1, new_cap / 16));

if (blocks_.empty() || blocks_.back().GetAvailable() < value_size_estimation_) {
blocks_.emplace_back(new_cap * value_size_estimation_);
} else {
// make sure that next block will have enough space for all remaining items.
const size_t estimated_items_in_next_block = value_size_estimation_ ? new_cap - blocks_.back().GetAvailable() / value_size_estimation_ : new_cap;
next_block_size_ = std::max(DEFAULT_BLOCK_SIZE, estimated_items_in_next_block * value_size_estimation_);
}
}

void ColumnString::SetEstimatedValueSize(EstimatedValueSize value_size_estimation) {
value_size_estimation_ = value_size_estimation;
}

void ColumnString::Append(std::string_view str) {
if (blocks_.size() == 0 || blocks_.back().GetAvailable() < str.length()) {
blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, str.size()));
if (blocks_.empty() || blocks_.back().GetAvailable() < str.length()) {
blocks_.emplace_back(std::max(next_block_size_, str.size()));
next_block_size_ = EstimateNextBlockSize(value_size_estimation_);
}

items_.emplace_back(blocks_.back().AppendUnsafe(str));
@@ -244,8 +292,10 @@ void ColumnString::Append(ColumnRef column) {
const auto total_size = ComputeTotalSize(col->items_);

// TODO: fill up existing block with some items and then add a new one for the rest of items
if (blocks_.size() == 0 || blocks_.back().GetAvailable() < total_size)
blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, total_size));
if (blocks_.size() == 0 || blocks_.back().GetAvailable() < total_size) {
blocks_.emplace_back(std::max(next_block_size_, total_size));
next_block_size_ = EstimateNextBlockSize(value_size_estimation_);
}

// Intentionally not doing items_.reserve() since that cripples performance.
for (size_t i = 0; i < column->Size(); ++i) {
@@ -268,6 +318,8 @@ bool ColumnString::LoadBody(InputStream* input, size_t rows) {
new_items.reserve(rows);

// Suboptimal if the first row string is >DEFAULT_BLOCK_SIZE, but that must be a very rare case.
// Not using next_block_size_ here since it is set in Reserve(), which doesn't know
// about number of items and estimated item size in InputStream.
Block * block = &new_blocks.emplace_back(DEFAULT_BLOCK_SIZE);

for (size_t i = 0; i < rows; ++i) {
@@ -300,24 +352,42 @@ size_t ColumnString::Size() const {
return items_.size();
}

size_t ColumnString::MemoryUsage() const {
auto vector_used_bytes = [](const auto & v) {
return sizeof(v[0]) * v.capacity();
};

size_t result = ComputeTotalSize(append_data_) + sizeof(append_data_[0]) * append_data_.size();
result += vector_used_bytes(items_);
result += vector_used_bytes(blocks_);

for (const auto & b : blocks_)
result += b.capacity;

return result;
}

ColumnRef ColumnString::Slice(size_t begin, size_t len) const {
auto result = std::make_shared<ColumnString>();
if (begin >= items_.size()) {
return this->CloneEmpty();
}

if (begin < items_.size()) {
len = std::min(len, items_.size() - begin);
result->items_.reserve(len);
len = std::min(len, items_.size() - begin);

result->blocks_.emplace_back(ComputeTotalSize(items_, begin, len));
for (size_t i = begin; i < begin + len; ++i) {
result->Append(items_[i]);
}
auto result = std::make_shared<ColumnString>(EstimatedValueSize(value_size_estimation_));

result->items_.reserve(len);
result->blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, ComputeTotalSize(items_, begin, len)));

for (size_t i = begin; i < begin + len; ++i) {
result->Append(items_[i]);
}

return result;
}

ColumnRef ColumnString::CloneEmpty() const {
return std::make_shared<ColumnString>();
return std::make_shared<ColumnString>(EstimatedValueSize(value_size_estimation_));
}

void ColumnString::Swap(Column& other) {
39 changes: 33 additions & 6 deletions clickhouse/columns/string.h
Original file line number Diff line number Diff line change
@@ -60,6 +60,8 @@ class ColumnFixedString : public Column {

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
size_t MemoryUsage() const override;

ColumnRef CloneEmpty() const override;
void Swap(Column& other) override;

@@ -78,17 +80,35 @@ class ColumnString : public Column {
// Type this column takes as argument of Append and returns with At() and operator[]
using ValueType = std::string_view;

ColumnString();
~ColumnString();

explicit ColumnString(size_t element_count);
// Estimation of the average size of a value in the column,
// helps to reduce used memory and the number of re-allocations.
// Choosing a bad estimation wouldn't crash the program,
// but may cause more frequent smaller memory allocations,
// reducing overall performance.
// int32_t to be able to validate against (unintentional) negative values in ColumnString c-tor.
// Otherwise those just silently underflow unsigned type,
// resulting in attempt to allocate enormous amount of memory at run time.
enum EstimatedValueSize : int32_t {
TINY = 8,
SMALL = 32,
MEDIUM = 128,
LARGE = 512,
HUGE = 2048,
};
static constexpr auto DEFAULT_ESTIMATION = EstimatedValueSize::MEDIUM;

explicit ColumnString(EstimatedValueSize value_size_estimation = DEFAULT_ESTIMATION);
explicit ColumnString(size_t element_count, EstimatedValueSize value_size_estimation = DEFAULT_ESTIMATION);
explicit ColumnString(const std::vector<std::string> & data);
explicit ColumnString(std::vector<std::string>&& data);

~ColumnString();

ColumnString& operator=(const ColumnString&) = delete;
ColumnString(const ColumnString&) = delete;

/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;
/// Change how memory is allocated for future Reserve() or Append() calls. Doesn't affect items that are already added to the column.
void SetEstimatedValueSize(EstimatedValueSize value_size_estimation);

/// Appends one element to the column.
void Append(std::string_view str);
@@ -113,6 +133,9 @@ class ColumnString : public Column {
/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Loads column data from input stream.
bool LoadBody(InputStream* input, size_t rows) override;

@@ -125,6 +148,8 @@ class ColumnString : public Column {
/// Returns count of rows in the column.
size_t Size() const override;

size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
ColumnRef CloneEmpty() const override;
@@ -140,6 +165,8 @@ class ColumnString : public Column {
std::vector<std::string_view> items_;
std::vector<Block> blocks_;
std::deque<std::string> append_data_;
uint32_t value_size_estimation_ = DEFAULT_ESTIMATION;
size_t next_block_size_;
};

}
8 changes: 8 additions & 0 deletions clickhouse/columns/tuple.cpp
Original file line number Diff line number Diff line change
@@ -41,6 +41,14 @@ size_t ColumnTuple::Size() const {
return columns_.empty() ? 0 : columns_[0]->Size();
}

/// Estimated RAM usage in bytes: the allocated capacity of the columns_ vector
/// itself plus the memory reported by every column stored in it.
size_t ColumnTuple::MemoryUsage() const {
    const size_t vector_bytes = sizeof(columns_[0]) * columns_.capacity();

    size_t nested_bytes = 0;
    for (size_t i = 0; i < columns_.size(); ++i) {
        nested_bytes += columns_[i]->MemoryUsage();
    }

    return vector_bytes + nested_bytes;
}

ColumnRef ColumnTuple::Slice(size_t begin, size_t len) const {
std::vector<ColumnRef> sliced_columns;
sliced_columns.reserve(columns_.size());
2 changes: 2 additions & 0 deletions clickhouse/columns/tuple.h
Original file line number Diff line number Diff line change
@@ -50,6 +50,8 @@ class ColumnTuple : public Column {
/// Returns count of rows in the column.
size_t Size() const override;

size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t, size_t) const override;
ColumnRef CloneEmpty() const override;
4 changes: 4 additions & 0 deletions clickhouse/columns/uuid.cpp
Original file line number Diff line number Diff line change
@@ -56,6 +56,10 @@ size_t ColumnUUID::Size() const {
return data_->Size() / 2;
}

/// Estimated RAM usage in bytes, as reported by the underlying data_ column.
size_t ColumnUUID::MemoryUsage() const {
    const size_t nested_bytes = data_->MemoryUsage();
    return nested_bytes;
}

/// Makes a slice of the current column. Both `begin` and `len` are doubled
/// before slicing the underlying data_ column, and the result is wrapped in a
/// new ColumnUUID.
ColumnRef ColumnUUID::Slice(size_t begin, size_t len) const {
    const size_t first_entry = begin * 2;
    const size_t entry_count = len * 2;
    return std::make_shared<ColumnUUID>(data_->Slice(first_entry, entry_count));
}
1 change: 1 addition & 0 deletions clickhouse/columns/uuid.h
Original file line number Diff line number Diff line change
@@ -43,6 +43,7 @@ class ColumnUUID : public Column {

/// Returns count of rows in the column.
size_t Size() const override;
size_t MemoryUsage() const override;

/// Makes slice of the current column.
ColumnRef Slice(size_t begin, size_t len) const override;
48 changes: 48 additions & 0 deletions ut/Column_ut.cpp
Original file line number Diff line number Diff line change
@@ -17,10 +17,13 @@

#include <gtest/gtest.h>
#include <algorithm>
#include <cstdint>
#include <initializer_list>
#include <memory>
#include <numeric>
#include <type_traits>

#include "clickhouse/columns/column.h"
#include "gtest/internal/gtest-internal.h"
#include "ut/utils_comparison.h"
#include "ut/utils_meta.h"
@@ -74,6 +77,7 @@ struct GenericColumnTestCase
template <typename T>
class GenericColumnTest : public testing::Test {
public:
using TestCase = T;
using ColumnType = typename T::ColumnType;

static auto MakeColumn()
@@ -358,6 +362,50 @@ TYPED_TEST(GenericColumnTest, Clear) {
EXPECT_EQ(0u, column->Size());
}

// Checks that MemoryUsage() stays within an estimated upper bound
// for an empty column, after Reserve() and after appending 10'000 values.
TYPED_TEST(GenericColumnTest, MemoryUsage) {
    auto column = this->MakeColumn();
    const auto values = this->GenerateValues(10'000);

    auto max_memory_usage = sizeof(values.front()) * values.size();
    if (column->GetType().GetCode() == Type::Code::LowCardinality) {
        // Low cardinality has a different memory usage profile:
        // only unique values take space in the dictionary,
        // rest are just indices to said dictionary.

        const auto unique_values = TestFixture::TestCase::generateValues();
        // NOTE(review): sizeof(unique_values.begin()) is the size of an iterator, not of an element;
        // `sizeof(*unique_values.begin())` may have been intended - confirm before changing,
        // since that would move the upper bound checked below.
        max_memory_usage = sizeof(unique_values.begin()) * unique_values.size()
                + sizeof(int32_t) * values.size() // indices
                + sizeof(uint64_t) * values.size() * 2; // hashes for uniques checks
    }

    if constexpr (std::is_same_v<std::string, std::decay_t<decltype(values[0])>>) {
        const auto unique_values = TestFixture::TestCase::generateValues();
        // Accumulate in size_t (the initial value fixes the accumulator type) and take
        // items by reference: no narrowing to int on every step, no per-item string copy.
        const size_t total_size = std::accumulate(unique_values.begin(), unique_values.end(), size_t{0}, [](size_t accumulator, const auto & i) {
            return accumulator + i.size();
        });
        max_memory_usage = total_size / unique_values.size() * values.size();

        if constexpr (std::is_same_v<typename TypeParam::ColumnType, ColumnString>) {
            column->SetEstimatedValueSize(ColumnString::EstimatedValueSize(total_size / unique_values.size()));
            // There is some over-allocation for ColumnString
            max_memory_usage *= 1.2;
        }
    }

    // Empty column should have low memory usage from the start,
    // from 0 bytes to 2% of max estimated memory usage due to some pre-reservations
    // (EXPECT_NEAR around 1% with a tolerance of 1%).
    EXPECT_NEAR(max_memory_usage * 0.01, column->MemoryUsage(), max_memory_usage * 0.01)
            << "On empty column";

    column->Reserve(values.size());
    EXPECT_GE(max_memory_usage, column->MemoryUsage())
            << "After reserve";

    TestFixture::AppendValues(column, values);
    EXPECT_GE(max_memory_usage, column->MemoryUsage())
            << " After appending " << values.size() << " items";
}

TYPED_TEST(GenericColumnTest, Swap) {
auto [column_A, values] = this->MakeColumnWithValues(10'000);
auto column_B = this->MakeColumn();
140 changes: 139 additions & 1 deletion ut/columns_ut.cpp
Original file line number Diff line number Diff line change
@@ -16,9 +16,12 @@
#include <clickhouse/base/socket.h> // for ipv4-ipv6 platform-specific stuff

#include <gtest/gtest.h>
#include "clickhouse/exceptions.h"
#include "gtest/gtest-message.h"
#include "utils.h"
#include "value_generators.h"

#include <limits>
#include <string_view>
#include <sstream>
#include <vector>
@@ -709,6 +712,142 @@ TEST(ColumnsCase, ColumnDecimal128_from_string_overflow) {
#endif
}

//TEST(ColumnString, DefaultSizeEstimation) {
// auto values = MakeStrings();

// const ColumnString::EstimatedValueSize value_size_estimations[] = {
// ColumnString::EstimatedValueSize::TINY,
// ColumnString::EstimatedValueSize::SMALL,
// ColumnString::EstimatedValueSize::MEDIUM,
// ColumnString::EstimatedValueSize::LARGE,
// ColumnString::EstimatedValueSize::HUGE,
// };

// for (auto estimation : value_size_estimations) {
// SCOPED_TRACE(::testing::Message("with estimation: ") << estimation);

// auto col = std::make_shared<ColumnString>(estimation);

// col->Reserve(values.size());

// size_t i = 0;
// for (const auto & v : values) {
// col->Append(v);

// EXPECT_EQ(i + 1, col->Size());
// EXPECT_EQ(v, col->At(i));

// ++i;
// }
// }
//}

namespace
{

// Prints "<prefix> <column type name> : <memory usage> bytes" to std::cerr,
// ends the line (flushing the stream) and returns the stream,
// so that callers can keep appending to it.
std::ostream & dumpMemoryUsage(const char * prefix, const ColumnRef col) {
    std::ostream & out = std::cerr;
    out << prefix << " " << col->GetType().GetName() << " : " << col->MemoryUsage() << " bytes";
    return out << std::endl;
}

}

// For a range of value size estimations (named presets as well as arbitrary numbers, including 0):
// construct a ColumnString, reserve room for all test strings, append them one by one and
// verify size and content after each append. Memory usage is dumped to stderr along the way.
TEST(ColumnString, WithSizeEstimation) {
    using Estimation = ColumnString::EstimatedValueSize;

    const Estimation value_size_estimations[] = {
        Estimation::TINY,
        Estimation::SMALL,
        Estimation::MEDIUM,
        Estimation::LARGE,
        Estimation::HUGE,

        Estimation(0),
        Estimation(1),
        Estimation(300),
        Estimation(10'000),
    };

    auto values = MakeStrings();
    std::cerr << "Number of values: " << values.size() << std::endl;

    for (Estimation estimation : value_size_estimations) {
        SCOPED_TRACE(::testing::Message("with estimation: ") << estimation);
        std::cerr << "\nEstimation " << estimation << std::endl;

        auto col = std::make_shared<ColumnString>(estimation);
        dumpMemoryUsage("After constructing with estimation", col);

        col->Reserve(values.size());
        dumpMemoryUsage("After Reserve()", col);

        // Number of values appended so far; the last appended one lives at index `appended - 1`.
        size_t appended = 0;
        for (const auto & v : values) {
            col->Append(v);
            ++appended;

            EXPECT_EQ(appended, col->Size());
            EXPECT_EQ(v, col->At(appended - 1));
        }

        dumpMemoryUsage("After appending all values", col);
    }
}

// Constructing a ColumnString with an estimation built from -1, from (int max + 1)
// or from the size_t maximum is expected to throw ValidationError.
TEST(ColumnString, InvalidSizeEstimation) {
    using Estimation = ColumnString::EstimatedValueSize;

    const auto just_above_int_max = static_cast<size_t>(std::numeric_limits<int>::max()) + 1;
    const auto size_t_max = std::numeric_limits<size_t>::max();

    EXPECT_THROW(std::make_shared<ColumnString>(Estimation(-1)), ValidationError);
    EXPECT_THROW(std::make_shared<ColumnString>(Estimation(just_above_int_max)), ValidationError);
    EXPECT_THROW(std::make_shared<ColumnString>(Estimation(size_t_max)), ValidationError);
}

// Same scenario as for plain ColumnString, but for LowCardinality(String):
// the set of test strings is appended `count` times (512, then 1024), so the column holds many duplicates.
// Size and content are verified after each append; memory usage is dumped to stderr along the way.
TEST(ColumnLowCardinalityString, WithSizeEstimation) {
    using Estimation = ColumnString::EstimatedValueSize;

    const Estimation value_size_estimations[] = {
        Estimation::TINY,
        Estimation::SMALL,
        Estimation::MEDIUM,
        Estimation::LARGE,
        Estimation::HUGE,

        Estimation(0),
        Estimation(1),
        Estimation(300),
        Estimation(10'000),
    };

    auto values = MakeStrings();

    // How many times to append items from values to column.
    for (size_t count = 512; count <= 1024; count *= 2)
    {
        const auto total_values = values.size() * count;
        std::cerr << "\nNumber of values: " << total_values << std::endl;

        for (Estimation estimation : value_size_estimations) {
            SCOPED_TRACE(::testing::Message("with estimation: ") << estimation);
            std::cerr << "Estimation " << estimation << std::endl;

            auto col = std::make_shared<ColumnLowCardinalityT<ColumnString>>(estimation);
            dumpMemoryUsage("After constructing with estimation", col);

            col->Reserve(total_values);
            dumpMemoryUsage("After Reserve()", col);

            // Number of values appended so far; the last appended one lives at index `appended - 1`.
            size_t appended = 0;
            for (size_t round = 0; round < count; ++round)
            {
                for (const auto & v : values) {
                    col->Append(v);
                    ++appended;

                    EXPECT_EQ(appended, col->Size());
                    EXPECT_EQ(v, col->At(appended - 1));
                }
            }

            dumpMemoryUsage("After appending all values", col) << std::endl;
        }
    }
}

TEST(ColumnsCase, ColumnLowCardinalityString_Append_and_Read) {
const size_t items_count = 11;
ColumnLowCardinalityT<ColumnString> col;
@@ -866,7 +1005,6 @@ TEST(ColumnsCase, ColumnLowCardinalityString_WithEmptyString_3) {
}
}


TEST(ColumnsCase, ColumnTupleT) {
using TestTuple = ColumnTupleT<ColumnUInt64, ColumnString, ColumnFixedString>;

0 comments on commit 90414cc

Please sign in to comment.