From 5c8f2a99bedc633cc6f1f594ec581ffec484fee1 Mon Sep 17 00:00:00 2001
From: Dazhong Xia
Date: Tue, 7 Nov 2023 10:57:12 -0500
Subject: [PATCH] Update docs + tweak gcp_pudl_etl.sh

---
 .gitignore                    |  2 ++
 devtools/datasette/publish.py |  8 +++++---
 docker/gcp_pudl_etl.sh        | 13 ++++++++-----
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index ca5a73bbde..997dd77884 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,6 +38,8 @@ notebooks/*.tgz
 terraform/.terraform/*
 .env
 .hypothesis/
+
+# generated by datasette/publish.py fresh for every deploy - we shouldn't track changes.
 devtools/datasette/fly/Dockerfile
 devtools/datasette/fly/inspect-data.json
 devtools/datasette/fly/metadata.yml
diff --git a/devtools/datasette/publish.py b/devtools/datasette/publish.py
index 8d2e8bc599..a5b3b3123f 100644
--- a/devtools/datasette/publish.py
+++ b/devtools/datasette/publish.py
@@ -1,9 +1,11 @@
 """Publish the datasette to fly.io.
 
-We use custom logic here because the datasette-publish-fly plugin bakes the uncompressed databases into the image, which makes the image too large.
+We use custom logic here because the datasette-publish-fly plugin bakes the
+uncompressed databases into the image, which makes the image too large.
 
 We compress the databases before baking them into the image. Then we decompress
-them at runtime to a Fly volume mounted at /data. This avoids a long download at startup, and allows us stay within the 8GB image size limit.
+them at runtime to a Fly volume mounted at /data. This avoids a long download
+at startup, and allows us to stay within the Fly.io 8GB image size limit.
 
 The volume handling is done manually outside of this publish.py script - it
 should be terraformed at some point.
@@ -105,7 +107,7 @@ def main():
     with docker_path.open("w") as f:
         f.write(make_dockerfile())
 
-    logging.info(f"Compressing databases at {datasets}...")
+    logging.info(f"Compressing {datasets} and putting into docker context...")
     check_call(
         ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets,  # noqa: S603
         cwd=pudl_out,
diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
index 5e0929b338..0fe5c26fe1 100644
--- a/docker/gcp_pudl_etl.sh
+++ b/docker/gcp_pudl_etl.sh
@@ -85,10 +85,8 @@ function notify_slack() {
 # 2>&1 redirects stderr to stdout.
 run_pudl_etl 2>&1 | tee $LOGFILE
 
-# Notify slack if the etl succeeded.
+# if pipeline is successful, distribute + publish datasette
 if [[ ${PIPESTATUS[0]} == 0 ]]; then
-    notify_slack "success"
-
     # Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
     if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
         copy_outputs_to_distribution_bucket
@@ -99,8 +97,13 @@ if [[ ${PIPESTATUS[0]} == 0 ]]; then
         gcloud config set run/region us-central1
         python ~/devtools/datasette/publish.py
     fi
-else
-    notify_slack "failure"
 fi
 
+# Notify slack about entire pipeline's success or failure;
+# PIPESTATUS[0] either refers to the failed ETL run or the last distribution
+# task that was run above
+if [[ ${PIPESTATUS[0]} == 0 ]]; then notify_slack "success";
+else notify_slack "failure"; fi
+
+
 shutdown_vm
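
Note on the final PIPESTATUS check: bash rewrites PIPESTATUS after every
command, so by the time the second if runs, PIPESTATUS[0] reflects whichever
command executed last (the ETL pipeline when it failed, or the last
distribution task when it succeeded), as the comment in the patch points out.
A minimal sketch of an alternative that avoids relying on that behavior --
assuming the same run_pudl_etl, notify_slack, and $LOGFILE from the script,
with ETL_STATUS introduced purely for illustration -- would capture the exit
status once, immediately after the pipeline:

    run_pudl_etl 2>&1 | tee $LOGFILE
    ETL_STATUS=${PIPESTATUS[0]}  # hypothetical variable: save the ETL exit code before any other command overwrites PIPESTATUS

    if [[ $ETL_STATUS == 0 ]]; then
        notify_slack "success"
    else
        notify_slack "failure"
    fi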