Skip to content

Commit

Permalink
Support Beam in Croissant preparation.
Browse files (browse the repository at this point in the history)
PiperOrigin-RevId: 679042687
  • Loading branch information
fineguy authored and The TensorFlow Datasets Authors committed Sep 26, 2024
1 parent 3ab829c commit 9b128bf
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 15 deletions.
17 changes: 2 additions & 15 deletions tensorflow_datasets/scripts/cli/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ def _download_and_prepare(
publish_dir=args.publish_dir,
skip_if_published=args.skip_if_published,
overwrite=args.overwrite,
beam_pipeline_options=args.beam_pipeline_options,
)


Expand Down Expand Up @@ -384,7 +385,7 @@ def _make_download_config(
if args.update_metadata_only:
kwargs['download_mode'] = tfds.download.GenerateMode.UPDATE_DATASET_INFO

dl_config = tfds.download.DownloadConfig(
return tfds.download.DownloadConfig(
extract_dir=args.extract_dir,
manual_dir=manual_dir,
max_examples_per_split=args.max_examples_per_split,
Expand All @@ -393,20 +394,6 @@ def _make_download_config(
**kwargs,
)

# Add Apache Beam options to download config
try:
import apache_beam as beam # pylint: disable=g-import-not-at-top
except ImportError:
beam = None

if beam is not None:
if args.beam_pipeline_options:
dl_config.beam_options = beam.options.pipeline_options.PipelineOptions(
flags=[f'--{opt}' for opt in args.beam_pipeline_options.split(',')]
)

return dl_config


def _get_config_name(
builder_cls: Type[tfds.core.DatasetBuilder],
Expand Down
17 changes: 17 additions & 0 deletions tensorflow_datasets/scripts/cli/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ def download_and_prepare(
publish_dir: epath.Path | None,
skip_if_published: bool,
overwrite: bool,
beam_pipeline_options: str | None,
) -> None:
"""Generate a single builder."""
dataset = builder.info.full_name
Expand All @@ -312,6 +313,22 @@ def download_and_prepare(
)
return

if not download_config:
download_config = download.DownloadConfig()

# Add Apache Beam options to download config
try:
import apache_beam as beam # pylint: disable=g-import-not-at-top

if beam_pipeline_options:
download_config.beam_options = (
beam.options.pipeline_options.PipelineOptions(
flags=[f'--{opt}' for opt in beam_pipeline_options.split(',')]
)
)
except ImportError:
pass

builder.download_and_prepare(
download_dir=download_dir,
download_config=download_config,
Expand Down
1 change: 1 addition & 0 deletions tensorflow_datasets/scripts/cli/croissant.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def prepare_croissant_builder(
publish_dir=args.publish_dir,
skip_if_published=args.skip_if_published,
overwrite=args.overwrite,
beam_pipeline_options=None,
)
return builder

Expand Down

0 comments on commit 9b128bf

Please sign in to comment.