From 5c8f2a99bedc633cc6f1f594ec581ffec484fee1 Mon Sep 17 00:00:00 2001
From: Dazhong Xia
Date: Tue, 7 Nov 2023 10:57:12 -0500
Subject: [PATCH] Update docs + tweak gcp_pudl_etl.sh

---
 .gitignore                    |  2 ++
 devtools/datasette/publish.py |  8 +++++---
 docker/gcp_pudl_etl.sh        | 13 ++++++++-----
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index ca5a73bbde..997dd77884 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,6 +38,8 @@ notebooks/*.tgz
 terraform/.terraform/*
 .env
 .hypothesis/
+
+# generated by datasette/publish.py fresh for every deploy - we shouldn't track changes.
 devtools/datasette/fly/Dockerfile
 devtools/datasette/fly/inspect-data.json
 devtools/datasette/fly/metadata.yml
diff --git a/devtools/datasette/publish.py b/devtools/datasette/publish.py
index 8d2e8bc599..a5b3b3123f 100644
--- a/devtools/datasette/publish.py
+++ b/devtools/datasette/publish.py
@@ -1,9 +1,11 @@
 """Publish the datasette to fly.io.
 
-We use custom logic here because the datasette-publish-fly plugin bakes the uncompressed databases into the image, which makes the image too large.
+We use custom logic here because the datasette-publish-fly plugin bakes the
+uncompressed databases into the image, which makes the image too large.
 
 We compress the databases before baking them into the image. Then we decompress
-them at runtime to a Fly volume mounted at /data. This avoids a long download at startup, and allows us stay within the 8GB image size limit.
+them at runtime to a Fly volume mounted at /data. This avoids a long download
+at startup, and allows us to stay within the Fly.io 8GB image size limit.
 
 The volume handling is done manually outside of this publish.py script - it
 should be terraformed at some point.
@@ -105,7 +107,7 @@ def main():
     with docker_path.open("w") as f:
         f.write(make_dockerfile())
 
-    logging.info(f"Compressing databases at {datasets}...")
+    logging.info(f"Compressing {datasets} and putting into docker context...")
     check_call(
         ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets,  # noqa: S603
         cwd=pudl_out,
diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
index 5e0929b338..0fe5c26fe1 100644
--- a/docker/gcp_pudl_etl.sh
+++ b/docker/gcp_pudl_etl.sh
@@ -85,10 +85,8 @@ function notify_slack() {
 # 2>&1 redirects stderr to stdout.
 run_pudl_etl 2>&1 | tee $LOGFILE
 
-# Notify slack if the etl succeeded.
+# if pipeline is successful, distribute + publish datasette
 if [[ ${PIPESTATUS[0]} == 0 ]]; then
-    notify_slack "success"
-
     # Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
     if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
         copy_outputs_to_distribution_bucket
@@ -99,8 +97,13 @@ if [[ ${PIPESTATUS[0]} == 0 ]]; then
         gcloud config set run/region us-central1
         python ~/devtools/datasette/publish.py
     fi
-else
-    notify_slack "failure"
 fi
 
+# Notify slack about entire pipeline's success or failure;
+# PIPESTATUS[0] either refers to the failed ETL run or the last distribution
+# task that was run above
+if [[ ${PIPESTATUS[0]} == 0 ]]; then notify_slack "success";
+else notify_slack "failure"; fi
+
+
 shutdown_vm
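
Note on the final PIPESTATUS check: bash rewrites PIPESTATUS after every
command, so by the time the second if runs, PIPESTATUS[0] reflects whichever
command executed last (the ETL pipeline when it failed, or the last
distribution task when it succeeded), as the comment in the patch points out.
A minimal sketch of an alternative that avoids relying on that behavior --
assuming the same run_pudl_etl, notify_slack, and $LOGFILE from the script,
with ETL_STATUS introduced purely for illustration -- would capture the exit
status once, immediately after the pipeline:

    run_pudl_etl 2>&1 | tee $LOGFILE
    ETL_STATUS=${PIPESTATUS[0]}  # hypothetical variable: save the ETL exit code before any other command overwrites PIPESTATUS

    if [[ $ETL_STATUS == 0 ]]; then
        notify_slack "success"
    else
        notify_slack "failure"
    fi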