-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-38432: [C++][Parquet] Try to fix performance regression in the DictByteArrayDecoderImpl #38784
Changes from all commits
a7633d0
f74b4c1
7686572
5f12876
51e6fb7
f3bba12
0333b41
4f26c98
e759a90
60cdb80
3dc536c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1196,24 +1196,40 @@ struct ArrowBinaryHelper<ByteArrayType> { | |
chunk_space_remaining_(::arrow::kBinaryMemoryLimit - | ||
acc_->builder->value_data_length()) {} | ||
|
||
// Prepare will reserve the number of entries remaining in the current chunk. | ||
// If estimated_data_length is provided, it will also reserve the estimated data length, | ||
// and the caller should better call `UnsafeAppend` instead of `Append` to avoid | ||
// double-checking the data length. | ||
Status Prepare(std::optional<int64_t> estimated_data_length = {}) { | ||
RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); | ||
if (estimated_data_length.has_value()) { | ||
RETURN_NOT_OK(acc_->builder->ReserveData( | ||
std::min<int64_t>(*estimated_data_length, ::arrow::kBinaryMemoryLimit))); | ||
std::min<int64_t>(*estimated_data_length, this->chunk_space_remaining_))); | ||
} | ||
return Status::OK(); | ||
} | ||
|
||
Status PrepareNextInput(int64_t next_value_length) { | ||
if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { | ||
// This element would exceed the capacity of a chunk | ||
RETURN_NOT_OK(PushChunk()); | ||
RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); | ||
} | ||
return Status::OK(); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What difference does it make to have a separate method? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. https://godbolt.org/z/5bf1K55a5 Found it might not optimized with hot path. |
||
|
||
// If estimated_remaining_data_length is provided, it will also reserve the estimated | ||
// data length, and the caller should better call `UnsafeAppend` instead of | ||
// `Append` to avoid double-checking the data length. | ||
Status PrepareNextInput(int64_t next_value_length, | ||
std::optional<int64_t> estimated_remaining_data_length = {}) { | ||
int64_t estimated_remaining_data_length) { | ||
if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { | ||
// This element would exceed the capacity of a chunk | ||
RETURN_NOT_OK(PushChunk()); | ||
RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); | ||
if (estimated_remaining_data_length.has_value()) { | ||
if (estimated_remaining_data_length) { | ||
RETURN_NOT_OK(acc_->builder->ReserveData( | ||
std::min<int64_t>(*estimated_remaining_data_length, chunk_space_remaining_))); | ||
std::min<int64_t>(estimated_remaining_data_length, chunk_space_remaining_))); | ||
} | ||
} | ||
return Status::OK(); | ||
|
@@ -1271,8 +1287,10 @@ struct ArrowBinaryHelper<FLBAType> { | |
return acc_->Reserve(entries_remaining_); | ||
} | ||
|
||
Status PrepareNextInput(int64_t next_value_length) { return Status::OK(); } | ||
|
||
Status PrepareNextInput(int64_t next_value_length, | ||
std::optional<int64_t> estimated_remaining_data_length = {}) { | ||
int64_t estimated_remaining_data_length) { | ||
return Status::OK(); | ||
} | ||
|
||
|
@@ -1915,6 +1933,9 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>, | |
int32_t indices[kBufferSize]; | ||
|
||
ArrowBinaryHelper<ByteArrayType> helper(out, num_values); | ||
// The `len_` in the ByteArrayDictDecoder is the total length of the | ||
// RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve | ||
// space for binary data. | ||
RETURN_NOT_OK(helper.Prepare()); | ||
|
||
auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data()); | ||
|
@@ -1983,7 +2004,10 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>, | |
int values_decoded = 0; | ||
|
||
ArrowBinaryHelper<ByteArrayType> helper(out, num_values); | ||
RETURN_NOT_OK(helper.Prepare(len_)); | ||
// The `len_` in the ByteArrayDictDecoder is the total length of the | ||
// RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve | ||
// space for binary data. | ||
RETURN_NOT_OK(helper.Prepare()); | ||
mapleFU marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data()); | ||
|
||
mapleFU marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using
::arrow::kBinaryMemoryLimit
is too large here.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also I think this might fixes #38577
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this make a practical difference? We're taking
std::min
here...There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it have.
I think this might be pratical difference in corner case,
estimated_data_length
means remaining data in current page, andthis->chunk_space_remaining_
means the remain data in current chunk. They're different.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
( However this might never triggered, it's a bit hard to construct a boundary case like this. I just think
this->chunk_space_remaining_
should be more safetyThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can confirm that this change actually fixes the regression (#38577 (comment)).
When the decoder/builder is reused (eg reading a second row group or batch of a row group),
chunk_space_remaining_
is smaller thankBinaryMemoryLimit
. So ifestimated_data_length
is larger thanchunk_space_remaining_
, we are trying to reserve too many values.