From ba814dcf3d794584f0e4e6d01700a350200f24b5 Mon Sep 17 00:00:00 2001
From: Zane Selvans <zane.selvans@catalyst.coop>
Date: Mon, 17 Jun 2024 18:20:17 -0600
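Subject: [PATCH 1/6] Update nightly build script to quietly publish public
 Parquet outputs.

Zip each SQLite DB individually instead of gzipping them all in place.
Copy every Parquet output (not only the hourly tables) into the top
level of $PUDL_OUTPUT, and bundle them into pudl_parquet.zip for
distribution on Kaggle. The archive is stored uncompressed (zip -0)
because the Parquet files are already compressed.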
---
 docker/gcp_pudl_etl.sh | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
index 6e8e6bc223..d9e6024244 100644
--- a/docker/gcp_pudl_etl.sh
+++ b/docker/gcp_pudl_etl.sh
@@ -208,9 +208,20 @@ function merge_tag_into_branch() {
 
 function clean_up_outputs_for_distribution() {
     # Compress the SQLite DBs for easier distribution
-    gzip --verbose "$PUDL_OUTPUT"/*.sqlite && \
-    # Grab hourly tables which are only written to Parquet for distribution
-    cp "$PUDL_OUTPUT"/parquet/*__hourly_*.parquet "$PUDL_OUTPUT" && \
+    cd "$PUDL_OUTPUT" && \
+    for file in *.sqlite; do
+        echo "Compressing $file" && \
+        zip "$file.zip" "$file" && \
+        rm "$file"
+    done
+    cd "$HOME" && \
+    # Copy all parquet outputs to the top level output directory
+    cp "$PUDL_OUTPUT"/parquet/*.parquet "$PUDL_OUTPUT" && \
+    # Create a zip file of all the parquet outputs for distribution on Kaggle
+    # Don't try to compress the already compressed Parquet files with Zip.
+    cd "$PUDL_OUTPUT" && \
+    zip -0 "$PUDL_OUTPUT/pudl_parquet.zip" ./*.parquet && \
+    cd "$HOME" && \
     # Remove all other parquet output, which we are not yet distributing.
     rm -rf "$PUDL_OUTPUT/parquet" && \
     rm -f "$PUDL_OUTPUT/metadata.yml"

From 1cf2909fffad62690bfa5740c77099330719f224 Mon Sep 17 00:00:00 2001
From: Zane Selvans <zane.selvans@catalyst.coop>
Date: Mon, 17 Jun 2024 18:32:59 -0600
Subject: [PATCH 2/6] Remove unnecessary path element.

---
 docker/gcp_pudl_etl.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
index d9e6024244..e9fc331872 100644
--- a/docker/gcp_pudl_etl.sh
+++ b/docker/gcp_pudl_etl.sh
@@ -220,7 +220,7 @@ function clean_up_outputs_for_distribution() {
     # Create a zip file of all the parquet outputs for distribution on Kaggle
     # Don't try to compress the already compressed Parquet files with Zip.
     cd "$PUDL_OUTPUT" && \
-    zip -0 "$PUDL_OUTPUT/pudl_parquet.zip" ./*.parquet && \
+    zip -0 pudl_parquet.zip ./*.parquet && \
     cd "$HOME" && \
     # Remove all other parquet output, which we are not yet distributing.
     rm -rf "$PUDL_OUTPUT/parquet" && \

From 953f7d7cef0d9ba9af580d584d6493b0739293c2 Mon Sep 17 00:00:00 2001
From: Zane Selvans <zane.selvans@catalyst.coop>
Date: Mon, 17 Jun 2024 18:34:38 -0600
Subject: [PATCH 3/6] Clarify comments

---
 docker/gcp_pudl_etl.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
index e9fc331872..d378d9b30e 100644
--- a/docker/gcp_pudl_etl.sh
+++ b/docker/gcp_pudl_etl.sh
@@ -222,7 +222,7 @@ function clean_up_outputs_for_distribution() {
     cd "$PUDL_OUTPUT" && \
     zip -0 pudl_parquet.zip ./*.parquet && \
     cd "$HOME" && \
-    # Remove all other parquet output, which we are not yet distributing.
+    # Remove any remaiining files and directories we don't want to distribute
     rm -rf "$PUDL_OUTPUT/parquet" && \
     rm -f "$PUDL_OUTPUT/metadata.yml"
 }

From c0a1dbd2f7aa7b54ce37241f7c60d8c5db2ba56b Mon Sep 17 00:00:00 2001
From: Zane Selvans <zane.selvans@catalyst.coop>
Date: Tue, 18 Jun 2024 10:50:57 -0600
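Subject: [PATCH 4/6] Remove ~200 parquet files before Zenodo data release

Zenodo limits the number of files in a release, so delete the
individual top-level Parquet files after the outputs have been copied
to the distribution buckets and before zenodo_data_release runs. The
Zenodo release then carries only the zipped Parquet archive.

Also removes the TODO about Zenodo releases being sandbox-only.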
---
 docker/gcp_pudl_etl.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
index d378d9b30e..c500c132a7 100644
--- a/docker/gcp_pudl_etl.sh
+++ b/docker/gcp_pudl_etl.sh
@@ -288,8 +288,9 @@ if [[ $ETL_SUCCESS == 0 ]]; then
         # Copy cleaned up outputs to the S3 and GCS distribution buckets
         copy_outputs_to_distribution_bucket | tee -a "$LOGFILE"
         DISTRIBUTION_BUCKET_SUCCESS=${PIPESTATUS[0]}
-        # TODO: this currently just makes a sandbox release, for testing. Should be
-        # switched to production and only run on push of a version tag eventually.
+        # Remove individual parquet outputs and distribute just the zipped parquet
+        # archives on Zenodo, due to their number of files limit
+        rm -f "$PUDL_OUTPUT"/*.parquet && \
         # Push a data release to Zenodo for long term accessiblity
         zenodo_data_release "$ZENODO_TARGET_ENV" 2>&1 | tee -a "$LOGFILE"
         ZENODO_SUCCESS=${PIPESTATUS[0]}

From 0903a159aea2c230fbd3da9f6fc33c99a0bbe35e Mon Sep 17 00:00:00 2001
From: Zane Selvans <zane.selvans@catalyst.coop>
Date: Tue, 18 Jun 2024 18:20:46 -0600
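Subject: [PATCH 5/6] Use pushd/popd instead of cd in build script.

popd returns to whichever directory was current before the matching
pushd, so clean_up_outputs_for_distribution no longer assumes it was
called from $HOME.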
---
 docker/gcp_pudl_etl.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
index c500c132a7..bb5d2aad86 100644
--- a/docker/gcp_pudl_etl.sh
+++ b/docker/gcp_pudl_etl.sh
@@ -208,20 +208,20 @@ function merge_tag_into_branch() {
 
 function clean_up_outputs_for_distribution() {
     # Compress the SQLite DBs for easier distribution
-    cd "$PUDL_OUTPUT" && \
+    pushd "$PUDL_OUTPUT" && \
     for file in *.sqlite; do
         echo "Compressing $file" && \
         zip "$file.zip" "$file" && \
         rm "$file"
     done
-    cd "$HOME" && \
+    popd && \
     # Copy all parquet outputs to the top level output directory
     cp "$PUDL_OUTPUT"/parquet/*.parquet "$PUDL_OUTPUT" && \
     # Create a zip file of all the parquet outputs for distribution on Kaggle
     # Don't try to compress the already compressed Parquet files with Zip.
-    cd "$PUDL_OUTPUT" && \
+    pushd "$PUDL_OUTPUT" && \
     zip -0 pudl_parquet.zip ./*.parquet && \
-    cd "$HOME" && \
+    popd && \
     # Remove any remaiining files and directories we don't want to distribute
     rm -rf "$PUDL_OUTPUT/parquet" && \
     rm -f "$PUDL_OUTPUT/metadata.yml"

From c8383544281f790a7515975596374668aeac6cf8 Mon Sep 17 00:00:00 2001
From: Zane Selvans <zane.selvans@catalyst.coop>
Date: Tue, 18 Jun 2024 21:44:57 -0600
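Subject: [PATCH 6/6] Simplify parquet file deployment a little

Build pudl_parquet.zip from inside $PUDL_OUTPUT/parquet, then move
(rather than copy) the individual Parquet files up to $PUDL_OUTPUT.
This avoids a second copy of every Parquet file before the parquet/
directory is removed.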
---
 docker/gcp_pudl_etl.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
index bb5d2aad86..d972caa024 100644
--- a/docker/gcp_pudl_etl.sh
+++ b/docker/gcp_pudl_etl.sh
@@ -215,12 +215,12 @@ function clean_up_outputs_for_distribution() {
         rm "$file"
     done
     popd && \
-    # Copy all parquet outputs to the top level output directory
-    cp "$PUDL_OUTPUT"/parquet/*.parquet "$PUDL_OUTPUT" && \
     # Create a zip file of all the parquet outputs for distribution on Kaggle
     # Don't try to compress the already compressed Parquet files with Zip.
-    pushd "$PUDL_OUTPUT" && \
-    zip -0 pudl_parquet.zip ./*.parquet && \
+    pushd "$PUDL_OUTPUT/parquet" && \
+    zip -0 "$PUDL_OUTPUT/pudl_parquet.zip" ./*.parquet && \
+    # Move the individual parquet outputs to the output directory for direct access
+    mv ./*.parquet "$PUDL_OUTPUT" && \
     popd && \
     # Remove any remaiining files and directories we don't want to distribute
     rm -rf "$PUDL_OUTPUT/parquet" && \