From 7f4663ea05b1a72c5e49fcfca0baf35d0271b4b5 Mon Sep 17 00:00:00 2001
From: Mike Lui
Date: Tue, 22 Aug 2023 17:23:27 -0400
Subject: [PATCH 1/2] [Python] Append struct after appending children

During conversion from Python to Arrow, when a struct's child hits a
capacity error and chunking is triggered, this can leave the Finish'd
chunk in an invalid state since the struct's length does not match the
length of its children.

This change simply tries to Append the children first, and only if
successful will Append the struct. This is safe because the order of
Append'ing between the struct and its child is not specified. It is
only specified that they must be consistent with each other.
---
 .../src/arrow/python/python_to_arrow.cc | 16 +++-
 python/pyarrow/tests/test_pandas.py     | 59 +++++++++++++++++++
 2 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 486bd840775b6..2143129356fd5 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -926,14 +926,14 @@ class PyStructConverter : public StructConverter
     }
     switch (input_kind_) {
       case InputKind::DICT:
-        RETURN_NOT_OK(this->struct_builder_->Append());
-        return AppendDict(value);
+        RETURN_NOT_OK(AppendDict(value));
+        return this->struct_builder_->Append();
       case InputKind::TUPLE:
-        RETURN_NOT_OK(this->struct_builder_->Append());
-        return AppendTuple(value);
+        RETURN_NOT_OK(AppendTuple(value));
+        return this->struct_builder_->Append();
       case InputKind::ITEMS:
-        RETURN_NOT_OK(this->struct_builder_->Append());
-        return AppendItems(value);
+        RETURN_NOT_OK(AppendItems(value));
+        return this->struct_builder_->Append();
       default:
         RETURN_NOT_OK(InferInputKind(value));
         return Append(value);
@@ -944,6 +944,10 @@ class PyStructConverter : public StructConverter
   Status Init(MemoryPool* pool) override {
     RETURN_NOT_OK((StructConverter::Init(pool)));
 
+    // This implementation will check the child values before appending itself,
+    // so no rewind is necessary
+    this->rewind_on_overflow_ = false;
+
     // Store the field names as a PyObjects for dict matching
     num_fields_ = this->struct_type_->num_fields();
     bytes_field_names_.reset(PyList_New(num_fields_));
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 0d01928f44734..508a0fbae39ac 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -4944,3 +4944,62 @@ def test_array_conversion_for_datetime():
 
     result = arr.to_pandas()
     tm.assert_series_equal(result, series)
+
+
+@pytest.mark.large_memory
+def test_nested_chunking_valid():
+    # GH-32439
+    # https://github.com/apache/arrow/issues/32439
+    # Chunking can cause arrays to be in invalid state
+    # when nested types are involved.
+    # Here we simply ensure we validate correctly.
+
+    x = "0" * 720000000
+    df = pd.DataFrame({"strings": [x, x, x]})
+    tab = pa.Table.from_pandas(df)
+    # we expect to trigger chunking internally
+    # an assertion failure here may just mean this threshold has changed
+    assert tab.column(0).num_chunks > 1
+    tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)
+
+    struct = {"struct_field": x}
+    df = pd.DataFrame({"structs": [struct, struct, struct]})
+    tab = pa.Table.from_pandas(df)
+    assert tab.column(0).num_chunks > 1
+    tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)
+
+    lists = [x]
+    df = pd.DataFrame({"lists": [lists, lists, lists]})
+    tab = pa.Table.from_pandas(df)
+    assert tab.column(0).num_chunks > 1
+    tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)
+
+    los = [struct]
+    df = pd.DataFrame({"los": [los, los, los]})
+    tab = pa.Table.from_pandas(df)
+    assert tab.column(0).num_chunks > 1
+    tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)
+
+    sol = {"struct_field": lists}
+    df = pd.DataFrame({"sol": [sol, sol, sol]})
+    tab = pa.Table.from_pandas(df)
+    assert tab.column(0).num_chunks > 1
+    tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)
+
+    map_of_los = {"a": los}
+    df = pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]})
+    tab = pa.Table.from_pandas(
+        df,
+        schema=pa.schema(
+            [(
+                "maps",
+                pa.map_(
+                    pa.string(),
+                    pa.list_(
+                        pa.struct([pa.field("struct_field", pa.string())])
+                    )
+                )
+            )]))
+    assert tab.column(0).num_chunks > 1
+    tm.assert_frame_equal(tab.to_pandas(
+        self_destruct=True, maps_as_pydicts="strict"), df)

From e934388b0fe0b9a8e2ae4bacf2b9a21213304983 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Tue, 10 Oct 2023 18:22:58 +0200
Subject: [PATCH 2/2] Improve tests

---
 python/pyarrow/tests/test_pandas.py | 69 +++++++++++------------
 1 file changed, 27 insertions(+), 42 deletions(-)

diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 508a0fbae39ac..62a9443953a3d 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -1691,6 +1691,7 @@ def test_auto_chunking_pandas_series_of_strings(self, char):
             'strings': [[v1]] * 20 + [[v2]] + [[b'x']]
         })
         arr = pa.array(df['strings'], from_pandas=True)
+        arr.validate(full=True)
         assert isinstance(arr, pa.ChunkedArray)
         assert arr.num_chunks == 2
         assert len(arr.chunk(0)) == 21
@@ -2381,6 +2382,7 @@ def test_auto_chunking_on_list_overflow(self):
             "b": range(n)
         })
         table = pa.Table.from_pandas(df)
+        table.validate(full=True)
 
         column_a = table[0]
         assert column_a.num_chunks == 2
@@ -2623,6 +2625,7 @@ def test_from_numpy_large(self):
         ty = pa.struct([pa.field('x', pa.float64()),
                         pa.field('y', pa.binary())])
         arr = pa.array(data, type=ty, from_pandas=True)
+        arr.validate(full=True)
         assert arr.num_chunks == 2
 
         def iter_chunked_array(arr):
@@ -2655,6 +2658,7 @@ def check(arr, data, mask=None):
 
         # Now with explicit mask
         mask = np.random.random_sample(n) < 0.2
         arr = pa.array(data, type=ty, mask=mask, from_pandas=True)
+        arr.validate(full=True)
         assert arr.num_chunks == 2
         check(arr, data, mask)
@@ -4826,6 +4830,7 @@ def test_roundtrip_nested_map_array_with_pydicts_sliced():
 
     def assert_roundtrip(series: pd.Series, data) -> None:
         array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series, type=ty))
+        array_roundtrip.validate(full=True)
         assert data.equals(array_roundtrip)
 
     assert_roundtrip(series_default, chunked_array)
@@ -4948,58 +4953,38 @@ def test_array_conversion_for_datetime():
 
 @pytest.mark.large_memory
 def test_nested_chunking_valid():
-    # GH-32439
-    # https://github.com/apache/arrow/issues/32439
-    # Chunking can cause arrays to be in invalid state
+    # GH-32439: Chunking can cause arrays to be in invalid state
     # when nested types are involved.
     # Here we simply ensure we validate correctly.
 
-    x = "0" * 720000000
-    df = pd.DataFrame({"strings": [x, x, x]})
-    tab = pa.Table.from_pandas(df)
-    # we expect to trigger chunking internally
-    # an assertion failure here may just mean this threshold has changed
-    assert tab.column(0).num_chunks > 1
-    tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)
+    def roundtrip(df, schema=None):
+        tab = pa.Table.from_pandas(df, schema=schema)
+        tab.validate(full=True)
+        # we expect to trigger chunking internally
+        # an assertion failure here may just mean this threshold has changed
+        num_chunks = tab.column(0).num_chunks
+        assert num_chunks > 1
+        tm.assert_frame_equal(tab.to_pandas(self_destruct=True,
+                                            maps_as_pydicts="strict"), df)
+
+    x = b"0" * 720000000
+    roundtrip(pd.DataFrame({"strings": [x, x, x]}))
 
     struct = {"struct_field": x}
-    df = pd.DataFrame({"structs": [struct, struct, struct]})
-    tab = pa.Table.from_pandas(df)
-    assert tab.column(0).num_chunks > 1
-    tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)
+    roundtrip(pd.DataFrame({"structs": [struct, struct, struct]}))
 
     lists = [x]
-    df = pd.DataFrame({"lists": [lists, lists, lists]})
-    tab = pa.Table.from_pandas(df)
-    assert tab.column(0).num_chunks > 1
-    tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)
+    roundtrip(pd.DataFrame({"lists": [lists, lists, lists]}))
 
     los = [struct]
-    df = pd.DataFrame({"los": [los, los, los]})
-    tab = pa.Table.from_pandas(df)
-    assert tab.column(0).num_chunks > 1
-    tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)
+    roundtrip(pd.DataFrame({"los": [los, los, los]}))
 
     sol = {"struct_field": lists}
-    df = pd.DataFrame({"sol": [sol, sol, sol]})
-    tab = pa.Table.from_pandas(df)
-    assert tab.column(0).num_chunks > 1
-    tm.assert_frame_equal(tab.to_pandas(self_destruct=True), df)
+    roundtrip(pd.DataFrame({"sol": [sol, sol, sol]}))
 
     map_of_los = {"a": los}
-    df = pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]})
-    tab = pa.Table.from_pandas(
-        df,
-        schema=pa.schema(
-            [(
-                "maps",
-                pa.map_(
-                    pa.string(),
-                    pa.list_(
-                        pa.struct([pa.field("struct_field", pa.string())])
-                    )
-                )
-            )]))
-    assert tab.column(0).num_chunks > 1
-    tm.assert_frame_equal(tab.to_pandas(
-        self_destruct=True, maps_as_pydicts="strict"), df)
+    map_type = pa.map_(pa.string(),
+                       pa.list_(pa.struct([("struct_field", pa.binary())])))
+    schema = pa.schema([("maps", map_type)])
+    roundtrip(pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]}),
+              schema=schema)
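
Note (not part of either patch): the ordering adopted in PATCH 1/2 can be illustrated with a small
standalone sketch against the public Arrow C++ builder API. The helper name AppendStructRow and the
assumption that field 0 of the struct is a string column are illustrative only; the actual fix is the
change to PyStructConverter::Append shown in the diff above.

#include <arrow/api.h>

#include <string>

// Hypothetical helper mirroring the append ordering from PATCH 1/2: append
// the child value first, and only if that succeeds record the parent struct
// slot. If the child Append fails (e.g. a capacity error that triggers
// chunking), the struct builder's length still matches its children's lengths.
arrow::Status AppendStructRow(arrow::StructBuilder* struct_builder,
                              const std::string& child_value) {
  // This sketch assumes field 0 of the struct is a string field.
  auto* child_builder =
      static_cast<arrow::StringBuilder*>(struct_builder->field_builder(0));
  ARROW_RETURN_NOT_OK(child_builder->Append(child_value));
  // The child succeeded, so the parent slot can now be appended safely.
  return struct_builder->Append();
}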