Skip to content

Commit

Permalink
GH-35331: [Python] Expose Parquet sorting metadata (#37665)
Browse files Browse the repository at this point in the history
### Rationale for this change

Picking up where #35453 left off.

Closes #35331

This PR builds on top of #37469 

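### What changes are included in this PR?

Exposes Parquet sorting metadata in the Python bindings:

* `_parquet.pxd` declares the C++ `parquet::SortingColumn` struct as `CSortingColumn` (fields `column_idx`, `descending`, `nulls_first`), adds `sorting_columns()` to `CRowGroupMetaData`, and adds `set_sorting_columns()` to the writer properties `Builder`.
* `_create_writer_properties` gains a `sorting_columns` argument.
* `ParquetFileWriteOptions` in `_dataset_parquet.pyx` gains a `sorting_columns` property (default `None`) and forwards it to `_create_writer_properties`.
* `SortingColumn` is added to the "Parquet Metadata" section of the Python API docs (`docs/source/python/api/formats.rst`).
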
### Are these changes tested?

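### Are there any user-facing changes?

Yes. `SortingColumn` is now listed in the Parquet metadata API reference, and `ParquetFileWriteOptions` accepts a new `sorting_columns` option (default `None`).
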
* Closes: #35331

Lead-authored-by: Judah Rand <[email protected]>
Co-authored-by: Will Jones <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
  • Loading branch information
judahrand and wjones127 authored Dec 20, 2023
1 parent 1c48d69 commit cc9e649
Show file tree
Hide file tree
Showing 6 changed files with 394 additions and 13 deletions.
1 change: 1 addition & 0 deletions docs/source/python/api/formats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ Parquet Metadata

FileMetaData
RowGroupMetaData
SortingColumn
ColumnChunkMetaData
Statistics
ParquetSchema
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/_dataset_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
dictionary_pagesize_limit=self._properties["dictionary_pagesize_limit"],
write_page_index=self._properties["write_page_index"],
write_page_checksum=self._properties["write_page_checksum"],
sorting_columns=self._properties["sorting_columns"],
)

def _set_arrow_properties(self):
Expand Down Expand Up @@ -659,6 +660,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
write_page_index=False,
encryption_config=None,
write_page_checksum=False,
sorting_columns=None,
)

self._set_properties()
Expand Down
24 changes: 17 additions & 7 deletions python/pyarrow/_parquet.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -328,11 +328,17 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
optional[ParquetIndexLocation] GetColumnIndexLocation() const
optional[ParquetIndexLocation] GetOffsetIndexLocation() const

struct CSortingColumn" parquet::SortingColumn":
int column_idx
c_bool descending
c_bool nulls_first

cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData":
c_bool Equals(const CRowGroupMetaData&) const
int num_columns()
int64_t num_rows()
int64_t total_byte_size()
int num_columns() const
int64_t num_rows() const
int64_t total_byte_size() const
vector[CSortingColumn] sorting_columns() const
unique_ptr[CColumnChunkMetaData] ColumnChunk(int i) const

cdef cppclass CFileMetaData" parquet::FileMetaData":
Expand Down Expand Up @@ -421,6 +427,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
Builder* disable_dictionary()
Builder* enable_dictionary()
Builder* enable_dictionary(const c_string& path)
Builder* set_sorting_columns(vector[CSortingColumn] sorting_columns)
Builder* disable_statistics()
Builder* enable_statistics()
Builder* enable_statistics(const c_string& path)
Expand Down Expand Up @@ -517,8 +524,8 @@ cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil:

CStatus ToParquetSchema(
const CSchema* arrow_schema,
const ArrowReaderProperties& properties,
const shared_ptr[const CKeyValueMetadata]& key_value_metadata,
const WriterProperties& properties,
const ArrowWriterProperties& arrow_properties,
shared_ptr[SchemaDescriptor]* out)


Expand Down Expand Up @@ -584,7 +591,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
write_batch_size=*,
dictionary_pagesize_limit=*,
write_page_index=*,
write_page_checksum=*) except *
write_page_checksum=*,
sorting_columns=*,
) except *


cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
Expand All @@ -593,7 +602,8 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
allow_truncated_timestamps=*,
writer_engine_version=*,
use_compliant_nested_type=*,
store_schema=*) except *
store_schema=*,
) except *

cdef class ParquetSchema(_Weakrefable):
cdef:
Expand Down
Loading

0 comments on commit cc9e649

Please sign in to comment.