From 03c3f8ed41573b611c4c6dfaf1ec166a9b78decb Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 10 Nov 2024 01:10:42 +0800 Subject: [PATCH] GH-43598: [C++][Parquet] Parquet Metadata Printer supports print sort-columns (#43599) ### Rationale for this change The Parquet spec, Python ( https://github.com/apache/arrow/pull/37665/files ), and C++ now all support "sort-columns", so the metadata printer should be able to print them as well. ### What changes are included in this PR? Add "SortingColumns" support to the Parquet metadata printer, in both the debug and JSON output. ### Are these changes tested? * [x] TODO after https://github.com/apache/parquet-testing/pull/56 is merged ### Are there any user-facing changes? No * GitHub Issue: #43598 Authored-by: mwish Signed-off-by: mwish --- cpp/src/parquet/printer.cc | 24 ++++++++++++++++++++++++ cpp/src/parquet/reader_test.cc | 17 +++++++++++++++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 3ce3e1da4bb09..730e1e17ab23d 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -142,6 +142,15 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n"; stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size() << " ---\n"; + auto sorting_columns = group_metadata->sorting_columns(); + if (!sorting_columns.empty()) { + stream << "--- Sort Columns:\n"; + for (auto column : sorting_columns) { + stream << "column_idx: " << column.column_idx + << ", descending: " << column.descending + << ", nulls_first: " << column.nulls_first << "\n"; + } + } stream << "--- Rows: " << group_metadata->num_rows() << " ---\n"; // Print column metadata @@ -285,6 +294,21 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", "; stream << " \"TotalCompressedBytes\": \"" << 
group_metadata->total_compressed_size() << "\", "; + auto row_group_sorting_columns = group_metadata->sorting_columns(); + if (!row_group_sorting_columns.empty()) { + stream << " \"SortColumns\": [\n"; + for (size_t i = 0; i < row_group_sorting_columns.size(); i++) { + stream << " {\"column_idx\": " << row_group_sorting_columns[i].column_idx + << ", \"descending\": " << row_group_sorting_columns[i].descending + << ", \"nulls_first\": " << row_group_sorting_columns[i].nulls_first + << "}"; + if (i + 1 != row_group_sorting_columns.size()) { + stream << ","; + } + stream << '\n'; + } + stream << " ], "; + } stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n"; // Print column metadata diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 688c875b9ec0f..62a971799c2db 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -1180,6 +1180,16 @@ TEST_F(TestJSONWithLocalFile, JSONOutputFLBA) { EXPECT_THAT(json_content, testing::HasSubstr(json_contains)); } +TEST_F(TestJSONWithLocalFile, JSONOutputSortColumns) { + std::string json_content = ReadFromLocalFile("sort_columns.parquet"); + + std::string json_contains = R"###("SortColumns": [ + {"column_idx": 0, "descending": 1, "nulls_first": 1}, + {"column_idx": 1, "descending": 0, "nulls_first": 0} + ])###"; + EXPECT_THAT(json_content, testing::HasSubstr(json_contains)); +} + // GH-44101: Test that JSON output is valid JSON TEST_F(TestJSONWithLocalFile, ValidJsonOutput) { auto check_json_valid = [](std::string_view json_string) -> ::arrow::Status { @@ -1195,8 +1205,11 @@ TEST_F(TestJSONWithLocalFile, ValidJsonOutput) { }; std::vector check_file_lists = { "data_index_bloom_encoding_with_length.parquet", - "data_index_bloom_encoding_stats.parquet", "alltypes_tiny_pages_plain.parquet", - "concatenated_gzip_members.parquet", "nulls.snappy.parquet"}; + "data_index_bloom_encoding_stats.parquet", + "alltypes_tiny_pages_plain.parquet", + 
"concatenated_gzip_members.parquet", + "nulls.snappy.parquet", + "sort_columns.parquet"}; for (const auto& file : check_file_lists) { std::string json_content = ReadFromLocalFile(file); ASSERT_OK(check_json_valid(json_content))