Skip to content

Commit

Permalink
apacheGH-37470: [Python][Parquet] Add missing arguments to `ParquetFileWriteOptions` (apache#37469)
Browse files Browse the repository at this point in the history

### Rationale for this change

I think this may have been missed when this feature was added.

### What changes are included in this PR?

### Are these changes tested?

### Are there any user-facing changes?

* Closes: apache#37470

Authored-by: Judah Rand <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
  • Loading branch information
judahrand authored and dgreiss committed Feb 17, 2024
1 parent d449943 commit 44bf2fe
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 0 deletions.
8 changes: 8 additions & 0 deletions python/pyarrow/_dataset_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,10 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
),
column_encoding=self._properties["column_encoding"],
data_page_version=self._properties["data_page_version"],
encryption_properties=self._properties["encryption_properties"],
write_batch_size=self._properties["write_batch_size"],
dictionary_pagesize_limit=self._properties["dictionary_pagesize_limit"],
write_page_index=self._properties["write_page_index"],
)

def _set_arrow_properties(self):
Expand Down Expand Up @@ -631,6 +635,10 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
coerce_timestamps=None,
allow_truncated_timestamps=False,
use_compliant_nested_type=True,
encryption_properties=None,
write_batch_size=None,
dictionary_pagesize_limit=None,
write_page_index=False,
)
self._set_properties()
self._set_arrow_properties()
Expand Down
32 changes: 32 additions & 0 deletions python/pyarrow/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5291,6 +5291,38 @@ def test_write_dataset_preserve_field_metadata(tempdir):
assert dataset.to_table().schema.equals(schema_metadata, check_metadata=True)


def test_write_dataset_write_page_index(tempdir):
    """Check that ``write_page_index`` passed through ParquetFileFormat's
    write options controls the Parquet offset/column index of files
    written via ``ds.write_dataset``.

    Covers all four combinations of ``write_statistics`` and
    ``write_page_index``: the offset index should be present exactly when
    the page index was requested, and the column index additionally
    requires statistics to have been written.
    """
    # Fixture data is loop-invariant; build it once.
    schema = pa.schema([
        pa.field("x", pa.int64()),
        pa.field("y", pa.int64())])
    arrays = [[1, 2, 3], [None, 5, None]]
    table = pa.Table.from_arrays(arrays, schema=schema)
    file_format = ds.ParquetFileFormat()

    for write_statistics in [True, False]:
        for write_page_index in [True, False]:
            # Use a directory unique to the full option combination so
            # files written in a previous iteration can never leak into
            # this iteration's assertions.
            base_dir = (
                tempdir /
                f"write_page_index_{write_statistics}_{write_page_index}"
            )
            ds.write_dataset(
                table,
                base_dir,
                format="parquet",
                file_options=file_format.make_write_options(
                    write_statistics=write_statistics,
                    write_page_index=write_page_index,
                ),
                existing_data_behavior='overwrite_or_ignore',
            )
            ds1 = ds.dataset(base_dir, format="parquet")

            for file in ds1.files:
                # Verify the page-index flags recorded in the file's
                # column-chunk metadata.
                metadata = pq.read_metadata(file)
                cc = metadata.row_group(0).column(0)
                assert cc.has_offset_index is write_page_index
                # The column index is only written when statistics are
                # also enabled; logical `and` (not bitwise `&`) states
                # the intent clearly.
                assert cc.has_column_index is (
                    write_page_index and write_statistics)


@pytest.mark.parametrize('dstype', [
"fs", "mem"
])
Expand Down

0 comments on commit 44bf2fe

Please sign in to comment.