Skip to content

Commit

Permalink
[Python] Append struct after appending children
Browse files Browse the repository at this point in the history
During conversion from Python to Arrow, when a struct's child
hits a capacity error and chunking is triggered, this can leave
the Finish'd chunk in an invalid state since the struct's length
does not match the length of its children.

This change simply tries to Append the children first, and only
if successful will Append the struct. This is safe because the
order of Append'ing between the struct and its child is not
specified. It is only specified that they must be consistent
with each other.
  • Loading branch information
Mike Lui committed Aug 25, 2023
1 parent 86b7a84 commit 40ed108
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 6 deletions.
12 changes: 6 additions & 6 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -926,14 +926,14 @@ class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait>
}
switch (input_kind_) {
case InputKind::DICT:
RETURN_NOT_OK(this->struct_builder_->Append());
return AppendDict(value);
RETURN_NOT_OK(AppendDict(value));
return this->struct_builder_->Append();
case InputKind::TUPLE:
RETURN_NOT_OK(this->struct_builder_->Append());
return AppendTuple(value);
RETURN_NOT_OK(AppendTuple(value));
return this->struct_builder_->Append();
case InputKind::ITEMS:
RETURN_NOT_OK(this->struct_builder_->Append());
return AppendItems(value);
RETURN_NOT_OK(AppendItems(value));
return this->struct_builder_->Append();
default:
RETURN_NOT_OK(InferInputKind(value));
return Append(value);
Expand Down
59 changes: 59 additions & 0 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4936,3 +4936,62 @@ def test_array_conversion_for_datetime():

result = arr.to_pandas()
tm.assert_series_equal(result, series)


def test_nested_chunking_valid():
# GH-32439
# https://github.com/apache/arrow/issues/32439
# Chunking can cause arrays to be in invalid state
# when nested types are involved.
# Here we simply ensure we validate correctly.

x = "0" * 1000000000
df = pd.DataFrame({"strings": [x, x, x]})
tab = pa.Table.from_pandas(df)
# we expect to trigger chunking internally
# an assertion failure here may just mean this threshold has changed
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)

struct = {"struct_field": x}
df = pd.DataFrame({"structs": [struct, struct, struct]})
tab = pa.Table.from_pandas(df)
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)

lists = [x]
del x
df = pd.DataFrame({"lists": [lists, lists, lists]})
tab = pa.Table.from_pandas(df)
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)

los = [struct]
df = pd.DataFrame({"los": [los, los, los]})
tab = pa.Table.from_pandas(df)
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)

sol = {"struct_field": lists}
df = pd.DataFrame({"sol": [sol, sol, sol]})
tab = pa.Table.from_pandas(df)
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)

map_of_los = {"a": los}
df = pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]})
tab = pa.Table.from_pandas(
df,
schema=pa.schema(
[(
"maps",
pa.map_(
pa.string(),
pa.list_(
pa.struct([pa.field("struct_field", pa.string())])
)
)
)]))
assert tab.column(0).num_chunks > 1
tm.assert_frame_equal(tab.to_pandas(
self_destruct=True, maps_as_pydicts="strict"), df)

0 comments on commit 40ed108

Please sign in to comment.