From ba814dcf3d794584f0e4e6d01700a350200f24b5 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 17 Jun 2024 18:20:17 -0600 Subject: [PATCH 1/6] Update nightly build script to quietly publish public Parquet outputs. --- docker/gcp_pudl_etl.sh | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index 6e8e6bc223..d9e6024244 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -208,9 +208,20 @@ function merge_tag_into_branch() { function clean_up_outputs_for_distribution() { # Compress the SQLite DBs for easier distribution - gzip --verbose "$PUDL_OUTPUT"/*.sqlite && \ - # Grab hourly tables which are only written to Parquet for distribution - cp "$PUDL_OUTPUT"/parquet/*__hourly_*.parquet "$PUDL_OUTPUT" && \ + cd "$PUDL_OUTPUT" && \ + for file in *.sqlite; do + echo "Compressing $file" && \ + zip "$file.zip" "$file" && \ + rm "$file" + done + cd "$HOME" && \ + # Copy all parquet outputs to the top level output directory + cp "$PUDL_OUTPUT"/parquet/*.parquet "$PUDL_OUTPUT" && \ + # Create a zip file of all the parquet outputs for distribution on Kaggle + # Don't try to compress the already compressed Parquet files with Zip. + cd "$PUDL_OUTPUT" && \ + zip -0 "$PUDL_OUTPUT/pudl_parquet.zip" ./*.parquet && \ + cd "$HOME" && \ # Remove all other parquet output, which we are not yet distributing. rm -rf "$PUDL_OUTPUT/parquet" && \ rm -f "$PUDL_OUTPUT/metadata.yml" From 1cf2909fffad62690bfa5740c77099330719f224 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 17 Jun 2024 18:32:59 -0600 Subject: [PATCH 2/6] Remove unnecessary path element. --- docker/gcp_pudl_etl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index d9e6024244..e9fc331872 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -220,7 +220,7 @@ function clean_up_outputs_for_distribution() { # Create a zip file of all the parquet outputs for distribution on Kaggle # Don't try to compress the already compressed Parquet files with Zip. cd "$PUDL_OUTPUT" && \ - zip -0 "$PUDL_OUTPUT/pudl_parquet.zip" ./*.parquet && \ + zip -0 pudl_parquet.zip ./*.parquet && \ cd "$HOME" && \ # Remove all other parquet output, which we are not yet distributing. rm -rf "$PUDL_OUTPUT/parquet" && \ From 953f7d7cef0d9ba9af580d584d6493b0739293c2 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 17 Jun 2024 18:34:38 -0600 Subject: [PATCH 3/6] Clarify comments --- docker/gcp_pudl_etl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index e9fc331872..d378d9b30e 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -222,7 +222,7 @@ function clean_up_outputs_for_distribution() { cd "$PUDL_OUTPUT" && \ zip -0 pudl_parquet.zip ./*.parquet && \ cd "$HOME" && \ - # Remove all other parquet output, which we are not yet distributing. + # Remove any remaiining files and directories we don't want to distribute rm -rf "$PUDL_OUTPUT/parquet" && \ rm -f "$PUDL_OUTPUT/metadata.yml" } From c0a1dbd2f7aa7b54ce37241f7c60d8c5db2ba56b Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 18 Jun 2024 10:50:57 -0600 Subject: [PATCH 4/6] Remove ~200 parquet files before Zenodo data release --- docker/gcp_pudl_etl.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index d378d9b30e..c500c132a7 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -288,8 +288,9 @@ if [[ $ETL_SUCCESS == 0 ]]; then # Copy cleaned up outputs to the S3 and GCS distribution buckets copy_outputs_to_distribution_bucket | tee -a "$LOGFILE" DISTRIBUTION_BUCKET_SUCCESS=${PIPESTATUS[0]} - # TODO: this currently just makes a sandbox release, for testing. Should be - # switched to production and only run on push of a version tag eventually. + # Remove individual parquet outputs and distribute just the zipped parquet + # archives on Zenodo, due to their number of files limit + rm -f "$PUDL_OUTPUT"/*.parquet && \ # Push a data release to Zenodo for long term accessiblity zenodo_data_release "$ZENODO_TARGET_ENV" 2>&1 | tee -a "$LOGFILE" ZENODO_SUCCESS=${PIPESTATUS[0]} From 0903a159aea2c230fbd3da9f6fc33c99a0bbe35e Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 18 Jun 2024 18:20:46 -0600 Subject: [PATCH 5/6] Use pushd/popd instead of cd in build script. --- docker/gcp_pudl_etl.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index c500c132a7..bb5d2aad86 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -208,20 +208,20 @@ function merge_tag_into_branch() { function clean_up_outputs_for_distribution() { # Compress the SQLite DBs for easier distribution - cd "$PUDL_OUTPUT" && \ + pushd "$PUDL_OUTPUT" && \ for file in *.sqlite; do echo "Compressing $file" && \ zip "$file.zip" "$file" && \ rm "$file" done - cd "$HOME" && \ + popd && \ # Copy all parquet outputs to the top level output directory cp "$PUDL_OUTPUT"/parquet/*.parquet "$PUDL_OUTPUT" && \ # Create a zip file of all the parquet outputs for distribution on Kaggle # Don't try to compress the already compressed Parquet files with Zip. - cd "$PUDL_OUTPUT" && \ + pushd "$PUDL_OUTPUT" && \ zip -0 pudl_parquet.zip ./*.parquet && \ - cd "$HOME" && \ + popd && \ # Remove any remaiining files and directories we don't want to distribute rm -rf "$PUDL_OUTPUT/parquet" && \ rm -f "$PUDL_OUTPUT/metadata.yml" From c8383544281f790a7515975596374668aeac6cf8 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 18 Jun 2024 21:44:57 -0600 Subject: [PATCH 6/6] Simplify parquet file deployment a little --- docker/gcp_pudl_etl.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index bb5d2aad86..d972caa024 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -215,12 +215,12 @@ function clean_up_outputs_for_distribution() { rm "$file" done popd && \ - # Copy all parquet outputs to the top level output directory - cp "$PUDL_OUTPUT"/parquet/*.parquet "$PUDL_OUTPUT" && \ # Create a zip file of all the parquet outputs for distribution on Kaggle # Don't try to compress the already compressed Parquet files with Zip. - pushd "$PUDL_OUTPUT" && \ - zip -0 pudl_parquet.zip ./*.parquet && \ + pushd "$PUDL_OUTPUT/parquet" && \ + zip -0 "$PUDL_OUTPUT/pudl_parquet.zip" ./*.parquet && \ + # Move the individual parquet outputs to the output directory for direct access + mv ./*.parquet "$PUDL_OUTPUT" && \ popd && \ # Remove any remaiining files and directories we don't want to distribute rm -rf "$PUDL_OUTPUT/parquet" && \