You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Describe the bug, including details regarding any error messages, version, and platform.
When attempting to write a dataset that includes an empty RecordBatch, an assertion failure occurs in `arrow::internal::Gather`. The error message indicates that the assertion `src && idx && out` failed.
I have no idea what the correct approach to resolve this problem should be. Maybe the reader should not read empty RecordBatches?
I created a test reproducing this behavior.
// Reproduction for the reported failure: an IPC file holding one populated and
// one empty RecordBatch is discovered as a Hive-partitioned dataset and then
// re-written to a second directory through a "scan" -> "write" plan.
TEST(TestDatasetEmptyRecordBatch, EmptyRecordBatch) {
  const auto dataset_schema = ::arrow::schema({
      field("a", int32()),
      field("b", boolean()),
      field("c", int32()),
  });
  // The file itself only carries "a" and "b"; "c" is encoded in the directory
  // name ("c=100") created below.
  const auto physical_schema = SchemaFromColumnNames(dataset_schema, {"a", "b"});

  // Create a mock filesystem and populate it with data, including both
  // non-empty and empty record batches.
  auto mock_fs = std::make_shared<fs::internal::MockFileSystem>(fs::kNoTime);
  {  // recreate pre PR-39995 dataset https://github.com/apache/arrow/pull/39995
    ASSERT_OK(mock_fs->CreateDir("/my_dataset"));
    ASSERT_OK(mock_fs->CreateDir("/my_dataset/c=100"));
    ASSERT_OK_AND_ASSIGN(auto out_stream,
                         mock_fs->OpenOutputStream("/my_dataset/c=100/0.arrow"));
    ASSERT_OK_AND_ASSIGN(
        auto batch_writer,
        arrow::ipc::MakeFileWriter(out_stream.get(), physical_schema));

    // First batch: two rows. The continuation line of the raw string literal is
    // deliberately left unindented so the literal's contents are unchanged.
    auto populated_batch = RecordBatchFromJSON(physical_schema, R"([{"a": 1, "b": null},
{"a": 2, "b": true}])");
    ASSERT_OK(batch_writer->WriteRecordBatch(*populated_batch));

    // Second batch: zero rows, same schema.
    ASSERT_OK_AND_ASSIGN(auto empty_batch,
                         arrow::RecordBatch::MakeEmpty(physical_schema));
    ASSERT_OK(batch_writer->WriteRecordBatch(*empty_batch));

    ASSERT_OK(batch_writer->Close());
    ASSERT_OK(out_stream->Close());
  }

  // Objects shared by the discovery step and the write step.
  auto ipc_format = std::make_shared<dataset::IpcFileFormat>();
  auto partitioning = std::make_shared<HivePartitioning>(
      schema({field("c", int32(), /*nullable=*/false)}));

  // Discover the dataset that was just written.
  arrow::fs::FileSelector selector;
  selector.recursive = true;
  selector.base_dir = "/my_dataset";

  FileSystemFactoryOptions factory_options;
  factory_options.partitioning = partitioning;

  ASSERT_OK_AND_ASSIGN(
      auto factory,
      FileSystemDatasetFactory::Make(mock_fs, selector, ipc_format, factory_options));
  ASSERT_OK_AND_ASSIGN(auto dataset, factory->Finish());

  // Configure the write into a second directory on the same filesystem.
  dataset::FileSystemDatasetWriteOptions write_options;
  write_options.base_dir = "/my_dataset2";
  write_options.basename_template = "{i}.arrow";
  write_options.filesystem = mock_fs;
  write_options.partitioning = partitioning;
  write_options.file_write_options = ipc_format->DefaultWriteOptions();

  auto scan_options = std::make_shared<ScanOptions>();

  // Scan the source dataset and feed every batch to the writer.
  auto write_plan = acero::Declaration::Sequence({
      {"scan", ScanNodeOptions{dataset, scan_options}},
      {"write", dataset::WriteNodeOptions{write_options}},
  });
  ASSERT_OK(DeclarationToStatus(write_plan));
}
Component(s)
C++
The text was updated successfully, but these errors were encountered:
Describe the bug, including details regarding any error messages, version, and platform.
When attempting to write a dataset that includes an empty RecordBatch, an assertion failure occurs in `arrow::internal::Gather`. The error message indicates that the assertion `src && idx && out` failed.
I have no idea what the correct approach to resolve this problem should be. Maybe the reader should not read empty RecordBatches?
I created a test reproducing this behavior.
Component(s)
C++
The text was updated successfully, but these errors were encountered: