diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 3e3626e4fb..6933ccdfd3 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -3,6 +3,6 @@ version: 2 updates: - package-ecosystem: github-actions directory: / - target-branch: dev + target-branch: main schedule: interval: weekly diff --git a/.github/workflows/build-deploy-pudl.yml b/.github/workflows/build-deploy-pudl.yml index a753523571..9ba06c22ed 100644 --- a/.github/workflows/build-deploy-pudl.yml +++ b/.github/workflows/build-deploy-pudl.yml @@ -5,14 +5,15 @@ on: tags: - "v20*" schedule: - - cron: "0 6 * * 1-5" # Weekdays at midnight on MST + # 6am UTC daily (11pm PDT, 2am EDT) + # But only if there are changes since the last nightly build. + - cron: "0 6 * * *" env: GCP_BILLING_PROJECT: ${{ secrets.GCP_BILLING_PROJECT }} - BUILD_REF: ${{ github.ref_name }} # This is changed to dev if running on a schedule GCE_INSTANCE: pudl-deployment-tag # This is changed to pudl-deployment-dev if running on a schedule GCE_INSTANCE_ZONE: ${{ secrets.GCE_INSTANCE_ZONE }} - GCS_OUTPUT_BUCKET: gs://nightly-build-outputs.catalyst.coop + GCS_OUTPUT_BUCKET: gs://builds.catalyst.coop jobs: build_and_deploy_pudl: @@ -22,17 +23,15 @@ jobs: contents: write id-token: write steps: - - name: Use pudl-deployment-dev vm and dev branch if running on a schedule + - name: Use pudl-deployment-dev vm if running on a schedule if: ${{ (github.event_name == 'schedule') }} run: | echo "This action was triggered by a schedule." echo "GCE_INSTANCE=pudl-deployment-dev" >> $GITHUB_ENV - echo "BUILD_REF=dev" >> $GITHUB_ENV - name: Checkout Repository uses: actions/checkout@v4 with: - ref: ${{ env.BUILD_REF }} fetch-depth: 0 - name: Skip the build if no changes since the last successful nightly build. @@ -50,13 +49,12 @@ jobs: if: ${{ env.SKIP_BUILD != 'true' }} run: | echo "NIGHTLY_TAG=nightly-$(date +%Y-%m-%d)" >> $GITHUB_ENV - echo "BUILD_ID=$(date +%Y-%m-%d-%H%M)-$(git rev-parse --short HEAD)-${BUILD_REF}" >> $GITHUB_ENV + echo "BUILD_ID=$(date +%Y-%m-%d-%H%M)-$(git rev-parse --short HEAD)-${{ github.ref_name }}" >> $GITHUB_ENV - name: Show freshly set envvars if: ${{ env.SKIP_BUILD != 'true' }} run: | echo "GCE_INSTANCE: $GCE_INSTANCE" - echo "BUILD_REF: $BUILD_REF" echo "NIGHTLY_TAG: $NIGHTLY_TAG" echo "BUILD_ID: $BUILD_ID" @@ -65,7 +63,7 @@ jobs: run: | git config user.email "pudl@catalyst.coop" git config user.name "pudlbot" - git tag -a -m "$NIGHTLY_TAG" $NIGHTLY_TAG $BUILD_REF + git tag -a -m "$NIGHTLY_TAG" $NIGHTLY_TAG ${{ github.ref_name }} git push origin $NIGHTLY_TAG - name: Docker Metadata @@ -77,7 +75,7 @@ jobs: flavor: | latest=auto tags: | - type=raw,value=${{ env.BUILD_REF}} + type=raw,value=${{ github.ref_name }} type=ref,event=tag - name: Set up Docker Buildx @@ -121,7 +119,7 @@ jobs: if: ${{ env.SKIP_BUILD != 'true' }} env: DAGSTER_PG_PASSWORD: ${{ secrets.DAGSTER_PG_PASSWORD }} - PUDL_OUTPUT_PATH: ${{ env.GCS_OUTPUT_BUCKET }}/${{ env.BUILD_ID }} + PUDL_GCS_OUTPUT: ${{ env.GCS_OUTPUT_BUCKET }}/${{ env.BUILD_ID }} run: |- gcloud compute instances add-metadata "$GCE_INSTANCE" \ --zone "$GCE_INSTANCE_ZONE" \ @@ -137,8 +135,7 @@ jobs: --container-arg='' \ --container-arg="bash" \ --container-arg="./docker/gcp_pudl_etl.sh" \ - --container-env-file="./docker/.env" \ - --container-env BUILD_REF=${{ env.BUILD_REF }} \ + --container-env BUILD_REF=${{ github.ref_name }} \ --container-env BUILD_ID=${{ env.BUILD_ID }} \ --container-env NIGHTLY_TAG=${{ env.NIGHTLY_TAG }} \ --container-env GITHUB_ACTION_TRIGGER=${{ github.event_name }} \ @@ 
-157,7 +154,7 @@ jobs: --container-env PUDL_BOT_PAT=${{ secrets.PUDL_BOT_PAT }} \ --container-env ZENODO_SANDBOX_TOKEN_PUBLISH=${{ secrets.ZENODO_SANDBOX_TOKEN_PUBLISH }} \ --container-env PUDL_SETTINGS_YML="/home/mambauser/pudl/src/pudl/package_data/settings/etl_full.yml" \ - --container-env PUDL_GCS_OUTPUT=${{ env.PUDL_OUTPUT_PATH }} + --container-env PUDL_GCS_OUTPUT=${{ env.PUDL_GCS_OUTPUT }} # Start the VM - name: Start the deploy-pudl-vm diff --git a/.github/workflows/run-etl.yml b/.github/workflows/run-etl.yml deleted file mode 100644 index 014c3ccb6c..0000000000 --- a/.github/workflows/run-etl.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: run-etl -on: - workflow_dispatch: -env: - GCP_BILLING_PROJECT: ${{ secrets.GCP_BILLING_PROJECT }} - -jobs: - build_docker: - name: Builds docker image and push to Docker Hub - runs-on: ubuntu-latest - permissions: - contents: read - id-token: write - steps: - - name: Checkout Repository - uses: actions/checkout@v4 - - name: Docker Metadata - id: docker_metadata - uses: docker/metadata-action@v5.3.0 - # TODO(rousik): we could consider YYYY-MM-DD-HHMM-branch-sha - with: - images: catalystcoop/pudl-etl-ci - flavor: | - latest=auto - tags: type=sha - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3.0.0 - - name: Login to DockerHub - if: github.event_name != 'pull_request' - uses: docker/login-action@v3.0.0 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build image and push to Docker Hub - uses: docker/build-push-action@v5.1.0 - with: - context: . - file: docker/Dockerfile - push: true - tags: ${{ steps.docker_metadata.outputs.tags }} - labels: ${{ steps.docker_metadata.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - run_etl_on_batch: - name: Runs the ETL on Google Batch service - runs-on: ubuntu-latest - permissions: - contents: read - steps: - - id: gcloud-auth - uses: google-github-actions/auth@v2 - with: - workload_identity_provider: "projects/345950277072/locations/global/workloadIdentityPools/gh-actions-pool/providers/gh-actions-provider" - - service_account: "deploy-pudl-github-action@catalyst-cooperative-pudl.iam.gserviceaccount.com" - - name: Set up Cloud SDK - uses: google-github-actions/setup-gcloud@v1 - - name: Deploy on Batch - run: echo Not implemented yet :-( diff --git a/.github/workflows/update-conda-lockfile.yml b/.github/workflows/update-conda-lockfile.yml index 27d12ebe59..c0a3a7cdd1 100644 --- a/.github/workflows/update-conda-lockfile.yml +++ b/.github/workflows/update-conda-lockfile.yml @@ -8,7 +8,7 @@ on: # What branch does this action run on? # - workflow_dispatch: Whatever branch it was run against.
-# - schedule: Always the same branch (will be dev or main) +# - schedule: Always runs on main jobs: update-conda-lockfile: @@ -21,21 +21,9 @@ jobs: - name: Get today's date run: | echo "TODAY=$(date +%Y-%m-%d)" >> $GITHUB_ENV - - name: Set GITHUB_REF for use with workflow_dispatch - if: ${{ (github.event_name == 'workflow_dispatch') }} - run: | - echo "GITHUB_REF="${{ github.ref_name }} >> $GITHUB_ENV - - name: Set GITHUB_REF for use with schedule - if: ${{ (github.event_name == 'schedule') }} - run: | - echo "GITHUB_REF=dev" >> $GITHUB_ENV - - name: Log final value of GITHUB_REF - run: | - echo "Final GITHUB_REF:" ${{ env.GITHUB_REF }} - uses: actions/checkout@v4 with: token: ${{ secrets.PUDL_BOT_PAT }} - ref: ${{ env.GITHUB_REF }} - name: Install Micromamba uses: mamba-org/setup-micromamba@v1 with: @@ -64,5 +52,5 @@ jobs: labels: dependencies, conda-lock reviewers: zaneselvans branch: update-conda-lockfile - base: ${{ env.GITHUB_REF }} + base: ${{ github.ref_name }} delete-branch: true diff --git a/.github/workflows/zenodo-cache-sync.yml b/.github/workflows/zenodo-cache-sync.yml index d4d68c115d..bc5ab69d30 100644 --- a/.github/workflows/zenodo-cache-sync.yml +++ b/.github/workflows/zenodo-cache-sync.yml @@ -12,7 +12,6 @@ on: env: INTERNAL_ZENODO_CACHE_BUCKET: gs://internal-zenodo-cache.catalyst.coop PUBLIC_ZENODO_CACHE_BUCKET: gs://zenodo-cache.catalyst.coop - GITHUB_REF: ${{ github.ref_name }} # This is changed to dev if running on a schedule PUDL_OUTPUT: ~/pudl-work/output PUDL_INPUT: ~/pudl-work/input/ @@ -29,16 +28,6 @@ jobs: shell: bash -l {0} steps: - - name: Use dev branch if running on a schedule - if: ${{ (github.event_name == 'schedule') }} - run: | - echo "This action was triggered by a schedule." && echo "GITHUB_REF=dev" >> $GITHUB_ENV - - - name: Log value of github ref - if: ${{ (github.event_name == 'pull_request') }} - run: | - echo "This action was triggered by a pull request." && echo "GITHUB_REF="${{ github.head_ref }} >> $GITHUB_ENV - - name: Checkout desired branch uses: actions/checkout@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b47f86aac..5e7f4b0344 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -98,7 +98,7 @@ ci: For more information, see https://pre-commit.ci autofix_prs: true - autoupdate_branch: dev + autoupdate_branch: main autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate" autoupdate_schedule: weekly skip: [unit-tests, nb-output-clear, conda-lock] diff --git a/README.rst b/README.rst index 3b469375db..2947f4d13c 100644 --- a/README.rst +++ b/README.rst @@ -68,7 +68,7 @@ PUDL is comprised of three core components: to the original inputs. Each of the data inputs may have several different versions archived, and all are assigned a unique DOI and made available through the REST API. You can read more about the Raw Data Archives in the - `docs `__. + `docs `__. - **ETL Pipeline** - The ETL pipeline (this repo) ingests the raw archives, cleans them, @@ -77,13 +77,13 @@ PUDL is comprised of three core components: Python package is embedded with a set of DOIs to indicate which version of the raw inputs it is meant to process. This process helps ensure that the ETL and its outputs are replicable. You can read more about the ETL in the - `docs `__. + `docs `__. - **Data Warehouse** - The outputs from the ETL, sometimes called "PUDL outputs", are stored in a data warehouse as a collection of SQLite and Parquet files so that users can access the data without having to run any code.
Learn more about how to - access the data `here `__. + access the data `here `__. What data is available? ----------------------- @@ -92,24 +92,24 @@ PUDL currently integrates data from: * **EIA Form 860**: 2001-2022 - `Source Docs `__ - - `PUDL Docs `__ + - `PUDL Docs `__ * **EIA Form 860m**: 2023-06 - `Source Docs `__ * **EIA Form 861**: 2001-2022 - `Source Docs `__ - - `PUDL Docs `__ + - `PUDL Docs `__ * **EIA Form 923**: 2001-2022 - `Source Docs `__ - - `PUDL Docs `__ + - `PUDL Docs `__ * **EPA Continuous Emissions Monitoring System (CEMS)**: 1995-2022 - `Source Docs `__ - - `PUDL Docs `__ + - `PUDL Docs `__ * **FERC Form 1**: 1994-2021 - `Source Docs `__ - - `PUDL Docs `__ + - `PUDL Docs `__ * **FERC Form 714**: 2006-2020 - `Source Docs `__ - - `PUDL Docs `__ + - `PUDL Docs `__ * **FERC Form 2**: 2021 (raw only) - `Source Docs `__ * **FERC Form 6**: 2021 (raw only) @@ -135,24 +135,24 @@ How do I access the data? ------------------------- For details on how to access PUDL data, see the `data access documentation -`__. A quick +`__. A quick summary: -* `Datasette `__ +* `Datasette `__ provides browsable and queryable data from our nightly builds on the web: https://data.catalyst.coop -* `Kaggle `__ +* `Kaggle `__ provides easy Jupyter notebook access to the PUDL data, updated weekly: https://www.kaggle.com/datasets/catalystcooperative/pudl-project -* `Zenodo `__ +* `Zenodo `__ provides stable long-term access to our versioned data releases with a citeable DOI: https://doi.org/10.5281/zenodo.3653158 -* `Nightly Data Builds `__ +* `Nightly Data Builds `__ push their outputs to the AWS Open Data Registry: https://registry.opendata.aws/catalyst-cooperative-pudl/ - See `the nightly build docs `__ + See `the nightly build docs `__ for direct download links. -* `The PUDL Development Environment `__ +* `The PUDL Development Environment `__ lets you run the PUDL data processing pipeline locally. Contributing to PUDL @@ -160,8 +160,8 @@ Contributing to PUDL Find PUDL useful? Want to help make it better? There are lots of ways to help! -* Check out our `contribution guide `__ - including our `Code of Conduct `__. +* Check out our `contribution guide `__ + including our `Code of Conduct `__. * You can file a bug report, make a feature request, or ask questions in the `Github issue tracker `__. 
* Feel free to fork the project and make a pull request with new code, better diff --git a/docker/.env b/docker/.env deleted file mode 100644 index 0cb4d577a6..0000000000 --- a/docker/.env +++ /dev/null @@ -1,11 +0,0 @@ -HOST_PUDL_IN=./pudl_in -HOST_PUDL_OUT=./pudl_out -CONTAINER_HOME=/home/mambauser -PUDL_INPUT=/home/mambauser/pudl_work/input -PUDL_OUTPUT=/home/mambauser/pudl_work/output -DAGSTER_HOME=/home/mambauser/pudl_work/dagster_home -CONDA_PREFIX=/home/mambauser/env -PUDL_SETTINGS_YML=/home/mambauser/src/pudl/package_data/settings/etl_full.yml -LOGFILE=/home/mambauser/pudl_work/output/pudl-etl.log -CONDA_RUN="micromamba run --prefix /home/mambauser/env --attach ''" -GCS_CACHE=gs://zenodo-cache.catalyst.coop diff --git a/docker/Dockerfile b/docker/Dockerfile index 56de80bce8..90dcbaf45e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM mambaorg/micromamba:1.5.5 +FROM mambaorg/micromamba:1.5.6 USER root @@ -29,6 +29,7 @@ ENV CONTAINER_PUDL_WORKSPACE=${CONTAINER_HOME}/pudl_work ENV PUDL_INPUT=${CONTAINER_PUDL_WORKSPACE}/input ENV PUDL_OUTPUT=${CONTAINER_PUDL_WORKSPACE}/output ENV DAGSTER_HOME=${CONTAINER_PUDL_WORKSPACE}/dagster_home +ENV LOGFILE=${PUDL_OUTPUT}/pudl-etl.log RUN mkdir -p ${PUDL_INPUT} ${PUDL_OUTPUT} ${DAGSTER_HOME} ${PUDL_REPO} diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh index f366d1bfd7..06661cb4d4 100644 --- a/docker/gcp_pudl_etl.sh +++ b/docker/gcp_pudl_etl.sh @@ -62,14 +62,34 @@ function copy_outputs_to_distribution_bucket() { # Only attempt to update outputs if we have a real value of BUILD_REF # This avoids accidentally blowing away the whole bucket if it's not set. if [[ -n "$BUILD_REF" ]]; then - echo "Removing old $BUILD_REF outputs from GCP distributon bucket." && \ - gsutil -m -u "$GCP_BILLING_PROJECT" rm -r "gs://pudl.catalyst.coop/$BUILD_REF" && \ - echo "Copying outputs to GCP distribution bucket" && \ - gsutil -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/*" "gs://pudl.catalyst.coop/$BUILD_REF" && \ - echo "Removing old $BUILD_REF outputs from AWS distributon bucket." && \ - aws s3 rm "s3://pudl.catalyst.coop/$BUILD_REF" --recursive && \ + if [[ "$GITHUB_ACTION_TRIGGER" == "schedule" ]]; then + # If running nightly builds, copy outputs to the "nightly" bucket path + DIST_PATH="nightly" + else + # Otherwise we want to copy them to a directory named after the tag/ref + DIST_PATH="$BUILD_REF" + fi + echo "Removing old $DIST_PATH outputs from GCS distribution bucket." && \ + gsutil -m -u "$GCP_BILLING_PROJECT" rm -r "gs://pudl.catalyst.coop/$DIST_PATH" && \ + echo "Copying outputs to GCS distribution bucket" && \ + gsutil -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/*" "gs://pudl.catalyst.coop/$DIST_PATH" && \ + echo "Removing old $DIST_PATH outputs from AWS distribution bucket." && \ + aws s3 rm "s3://pudl.catalyst.coop/$DIST_PATH" --recursive && \ echo "Copying outputs to AWS distribution bucket" && \ - aws s3 cp "$PUDL_OUTPUT/" "s3://pudl.catalyst.coop/$BUILD_REF" --recursive + aws s3 cp "$PUDL_OUTPUT/" "s3://pudl.catalyst.coop/$DIST_PATH" --recursive + + # If running a tagged release, ALSO update the stable distribution bucket path: + if [[ "$GITHUB_ACTION_TRIGGER" == "push" && "$BUILD_REF" == v20* ]]; then + echo "Removing old stable outputs from GCS distribution bucket." && \
+ gsutil -m -u "$GCP_BILLING_PROJECT" rm -r "gs://pudl.catalyst.coop/stable" && \ + echo "Copying tagged version outputs to stable GCS distribution bucket" && \ + gsutil -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/*" "gs://pudl.catalyst.coop/stable" && \ + echo "Removing old stable outputs from AWS S3 distribution bucket." && \ + aws s3 rm "s3://pudl.catalyst.coop/stable" --recursive && \ + echo "Copying tagged version outputs to stable AWS S3 distribution bucket" && \ + aws s3 cp "$PUDL_OUTPUT/" "s3://pudl.catalyst.coop/stable" --recursive + + fi fi } @@ -89,7 +109,7 @@ function notify_slack() { echo "Invalid deployment status" exit 1 fi - message+="See https://console.cloud.google.com/storage/browser/nightly-build-outputs.catalyst.coop/$BUILD_ID for logs and outputs." + message+="See https://console.cloud.google.com/storage/browser/builds.catalyst.coop/$BUILD_ID for logs and outputs." send_slack_msg "$message" } @@ -130,8 +150,9 @@ CLEAN_UP_OUTPUTS_SUCCESS=0 DISTRIBUTION_BUCKET_SUCCESS=0 ZENODO_SUCCESS=0 -# Set PUDL_GCS_OUTPUT *only* if it is currently unset -: "${PUDL_GCS_OUTPUT:=gs://nightly-build-outputs.catalyst.coop/$BUILD_ID}" +# Set these variables *only* if they are not already set by the container or workflow: +: "${PUDL_GCS_OUTPUT:=gs://builds.catalyst.coop/$BUILD_ID}" +: "${PUDL_SETTINGS_YML:=/home/mambauser/pudl/src/pudl/package_data/settings/etl_full.yml}" # Run ETL. Copy outputs to GCS and shutdown VM if ETL succeeds or fails # 2>&1 redirects stderr to stdout. @@ -148,16 +169,16 @@ if [[ $ETL_SUCCESS == 0 ]]; then UPDATE_NIGHTLY_SUCCESS=${PIPESTATUS[0]} fi - # Deploy the updated data to datasette if we're on dev - if [[ "$BUILD_REF" == "dev" ]]; then + # Deploy the updated data to datasette if we're on main + if [[ "$BUILD_REF" == "main" ]]; then python ~/pudl/devtools/datasette/publish.py 2>&1 | tee -a "$LOGFILE" DATASETTE_SUCCESS=${PIPESTATUS[0]} fi # TODO: this behavior should be controlled by on/off switch here and this logic # should be moved to the triggering github action. Having it here feels fragmented. - # Distribute outputs if branch is dev or the build was triggered by tag push - if [[ "$GITHUB_ACTION_TRIGGER" == "push" || "$BUILD_REF" == "dev" ]]; then + # Distribute outputs if branch is main or the build was triggered by tag push + if [[ "$GITHUB_ACTION_TRIGGER" == "push" || "$BUILD_REF" == "main" ]]; then # Remove some cruft from the builds that we don't want to distribute clean_up_outputs_for_distribution 2>&1 | tee -a "$LOGFILE" CLEAN_UP_OUTPUTS_SUCCESS=${PIPESTATUS[0]} diff --git a/docker/vm_startup_script.sh b/docker/vm_startup_script.sh index 4df77fac58..c9ac7a2d98 100644 --- a/docker/vm_startup_script.sh +++ b/docker/vm_startup_script.sh @@ -1,3 +1,5 @@ #! /bin/bash +# This script is called by the Dockerfile in this directory, and is used to clean up +# old Docker images on the VM, which otherwise accumulate and take up disk space. # Delete old catalystcoop/pudl-etl images -docker rmi -f $(docker images -q catalystcoop/pudl-etl) +docker rmi -f "$(docker images -q catalystcoop/pudl-etl)" diff --git a/docs/data_access.rst b/docs/data_access.rst index c291678e42..9c2e799e08 100644 --- a/docs/data_access.rst +++ b/docs/data_access.rst @@ -97,8 +97,8 @@ Nightly Builds --------------------------------------------------------------------------------------- Every night we attempt to process all of the data that's part of PUDL using the most -recent version of the `dev branch -`__. If the ETL succeeds and the +recent version of the `main branch +`__.
If the ETL succeeds and the resulting outputs pass all of the data validation tests we've defined, the outputs are automatically uploaded to the `AWS Open Data Registry `__, and used to deploy a new @@ -106,54 +106,57 @@ version of Datasette (see above). These nightly build outputs can be accessed us AWS CLI, or programmatically via the S3 API. They can also be downloaded directly over HTTPS using the following links: -* `PUDL SQLite DB `__ -* `EPA CEMS Hourly Emissions Parquet (1995Q1-2023Q3) `__ -* `Census DP1 SQLite DB (2010) `__ +* `PUDL SQLite DB `__ +* `EPA CEMS Hourly Emissions Parquet (1995Q1-2023Q3) `__ +* `Census DP1 SQLite DB (2010) `__ * Raw FERC Form 1: - * `FERC-1 SQLite derived from DBF (1994-2020) `__ - * `FERC-1 SQLite derived from XBRL (2021-2022) `__ - * `FERC-1 Datapackage (JSON) describing SQLite derived from XBRL `__ - * `FERC-1 XBRL Taxonomy Metadata as JSON (2021-2022) `__ + * `FERC-1 SQLite derived from DBF (1994-2020) `__ + * `FERC-1 SQLite derived from XBRL (2021-2022) `__ + * `FERC-1 Datapackage (JSON) describing SQLite derived from XBRL `__ + * `FERC-1 XBRL Taxonomy Metadata as JSON (2021-2022) `__ * Raw FERC Form 2: - * `FERC-2 SQLite derived from DBF (1996-2020) `__ - * `FERC-2 SQLite derived from XBRL (2021-2022) `__ - * `FERC-2 Datapackage (JSON) describing SQLite derived from XBRL `__ - * `FERC-2 XBRL Taxonomy Metadata as JSON (2021-2022) `__ + * `FERC-2 SQLite derived from DBF (1996-2020) `__ + * `FERC-2 SQLite derived from XBRL (2021-2022) `__ + * `FERC-2 Datapackage (JSON) describing SQLite derived from XBRL `__ + * `FERC-2 XBRL Taxonomy Metadata as JSON (2021-2022) `__ * Raw FERC Form 6: - * `FERC-6 SQLite derived from DBF (2000-2020) `__ - * `FERC-6 SQLite derived from XBRL (2021-2022) `__ - * `FERC-6 Datapackage (JSON) describing SQLite derived from XBRL `__ - * `FERC-6 XBRL Taxonomy Metadata as JSON (2021-2022) `__ + * `FERC-6 SQLite derived from DBF (2000-2020) `__ + * `FERC-6 SQLite derived from XBRL (2021-2022) `__ + * `FERC-6 Datapackage (JSON) describing SQLite derived from XBRL `__ + * `FERC-6 XBRL Taxonomy Metadata as JSON (2021-2022) `__ * Raw FERC Form 60: - * `FERC-60 SQLite derived from DBF (2006-2020) `__ - * `FERC-60 SQLite derived from XBRL (2021-2022) `__ - * `FERC-60 Datapackage (JSON) describing SQLite derived from XBRL `__ - * `FERC-60 XBRL Taxonomy Metadata as JSON (2021) `__ + * `FERC-60 SQLite derived from DBF (2006-2020) `__ + * `FERC-60 SQLite derived from XBRL (2021-2022) `__ + * `FERC-60 Datapackage (JSON) describing SQLite derived from XBRL `__ + * `FERC-60 XBRL Taxonomy Metadata as JSON (2021) `__ * Raw FERC Form 714: - * `FERC-714 SQLite derived from XBRL (2021-2022) `__ - * `FERC-714 Datapackage (JSON) describing SQLite derived from XBRL `__ - * `FERC-714 XBRL Taxonomy Metadata as JSON (2021-2022) `__ + * `FERC-714 SQLite derived from XBRL (2021-2022) `__ + * `FERC-714 Datapackage (JSON) describing SQLite derived from XBRL `__ + * `FERC-714 XBRL Taxonomy Metadata as JSON (2021-2022) `__ .. note:: To reduce network transfer times, we ``gzip`` the SQLite database files, which can - be quite large when uncompressed. To decompress them locally, you can use the - ``gunzip`` command. + be quite large when uncompressed. To decompress them locally, at the command line + on Linux, MacOS, or Windows you can use the ``gunzip`` command. .. code-block:: console $ gunzip *.sqlite.gz + On Windows you can also use a 3rd party tool like + `7zip `__. + .. 
_access-zenodo: --------------------------------------------------------------------------------------- diff --git a/docs/dev/nightly_data_builds.rst b/docs/dev/nightly_data_builds.rst index 4c3c85c155..aac531a0ca 100644 --- a/docs/dev/nightly_data_builds.rst +++ b/docs/dev/nightly_data_builds.rst @@ -4,47 +4,48 @@ Nightly Data Builds =============================================================================== -The complete ETL and tests run every night on a Google Compute Engine (GCE) -instance so new code merged into ``dev`` can be fully tested. These complete builds -also enable continuous deployment of PUDL's data outputs. +The complete ETL and tests are run each night on a Google Compute Engine (GCE) instance +to ensure that any new changes merged into ``main`` are fully tested. These complete +builds also enable continuous deployment of PUDL's data outputs. If no changes have been +merged into ``main`` since the last time the builds ran, the builds are skipped. The builds are kicked off by the ``build-deploy-pudl`` GitHub Action, which builds and pushes a Docker image with PUDL installed to `Docker Hub `__ and deploys the image as a container to a GCE instance. The container runs the ETL and -tests, then copies the outputs to a public AWS s3 bucket for distribution. +tests, then copies the outputs to a public AWS S3 bucket for distribution. Breaking the Builds ------------------- -The nightly data builds based on the ``dev`` branch are our comprehensive integration +The nightly data builds based on the ``main`` branch are our comprehensive integration tests. When they pass, we consider the results fit for public consumption. The builds are expected to pass. If they don't then someone needs to take responsibility for getting them working again with some urgency. -Because of how long the full build & tests take, we don’t typically run them -individually before merging every PR into ``dev``. However, running ``make nuke`` -(the equivalent of the full builds) is recommended when you've added a new year of data -or made other changes that would be expected to break the data validations, so that the -appropriate changes can be made prior to those changes hitting ``dev`` and the nightly -builds. +Because of how long the full build & tests take, we don't typically run them +individually before merging every PR into ``main``. However, running ``make nuke`` +(roughly equivalent to the full builds) is recommended when you've added a new year of +data or made other changes that would be expected to break the data validations, so that +the appropriate changes can be made prior to those changes hitting ``main`` and the +nightly builds. If your PR causes the build to fail, you are probably the best person to fix the problem, since you already have context on all of the changes that went into it. -Having multiple PRs merged into ``dev`` simultaneously when the builds are breaking +Having multiple PRs merged into ``main`` simultaneously when the builds are breaking makes it ambiguous where the problem is coming from, makes debugging harder, and diffuses responsibility for the breakage across several people, so it's important to fix -the breakage quickly. In some cases we may delay merging additional PRs into ``dev`` +the breakage quickly. In some cases we may delay merging additional PRs into ``main`` if the builds are failing to avoid ambiguity and facilitate debugging. 
Therefore, we've adopted the following etiquette regarding build breakage: On the -morning after you merge a PR into ``dev``, you should check whether the nightly builds +morning after you merge a PR into ``main``, you should check whether the nightly builds succeeded by looking in the ``pudl-deployments`` Slack channel (which all team members should be subscribed to). If the builds failed, look at the logging output (which is included as an attachment to the notification) and figure out what kind of failure occurred: * If the failure is due to your changes, then you are responsible for fixing the - problem and making a new PR to ``dev`` that resolves it, and it should be a high + problem and making a new PR to ``main`` that resolves it, and it should be a high priority. If you're stumped, ask for help! * If the failure is due to an infrastructural issue like the build server running out of memory and the build process getting killed, then you need to notify the member @@ -121,11 +122,6 @@ pushed to the PUDL repository. This way, new data outputs are automatically updated on code releases, and PUDL's code and data are tested every night. The action is modeled after an `example from the setup-gcloud GitHub action repository `__. -Unfortunately, scheduled actions only run on the default branch. To run scheduled -builds on the ``dev`` branch, the `actions/checkout `__ -step checks out the ``dev`` branch if a schedule triggers the action and the ``main`` -branch if a tag triggers the action. - The ``gcloud`` command in ``build-deploy-pudl`` requires certain Google Cloud Platform (GCP) permissions to start and update the GCE instance. The ``gcloud`` command authenticates using a service account key for the @@ -140,18 +136,18 @@ The PUDL image is deployed on a `Container Optimized GCE `__ instance, a type of virtual machine (VM) built to run containers. The ``pudl-deployment-dev`` and ``pudl-deployment-tag`` instances in the -``catalyst-cooperative-pudl`` GCP project handle deployments from the ``dev`` branch and -tags, respectively. There are two VMs so a scheduled and a tag build can run -at the same time. +``catalyst-cooperative-pudl`` GCP project handle deployments from the ``main`` branch +and from tags or manually initiated ``workflow_dispatch`` runs, respectively. There are two +VMs so a scheduled and a tag build can run at the same time. .. note:: If a tag build starts before the previous tag build has finished, the previous build will be interrupted. -PUDL's VMs use the e2-highmem-8 machine type (64 GB of RAM and 8 CPUs) to accommodate +The build VMs use the e2-highmem-8 machine type (64 GB of RAM and 8 CPUs) to accommodate the PUDL ETL's memory-intensive steps. Currently, these VMs do not have swap space -enabled. +enabled, so if they run out of memory, the build will immediately terminate. Each GCE VM has a service account that gives the VM permissions to GCP resources. The two PUDL deployment VMs share the ``deploy-pudl-vm-service-account``. This @@ -163,7 +159,7 @@ service account has permissions to: 3. Bill the ``catalyst-cooperative-pudl`` project for egress fees from accessing the ``zenodo-cache.catalyst.coop`` bucket. Note: The ``catalyst-cooperative-pudl`` project won't be charged anything because the data stays within Google's network. -4. Write logs and outputs to the ``gs://nightly-build-outputs.catalyst.coop``, +4. Write logs and outputs to the ``gs://builds.catalyst.coop``, ``gs://pudl.catalyst.coop``, and ``s3://pudl.catalyst.coop`` buckets.
The egress and storage fees of the s3 bucket are covered by `Amazon Web Services's Open Data Sponsorship Program @@ -184,13 +180,7 @@ are configured to run the ``docker/gcp_pudl_etl.sh`` script. This script: 5. Notifies the ``pudl-deployments`` Slack channel with the final build status. The ``gcp_pudl_etl.sh`` script is only intended to run on a GCE VM with adequate -permissions. The full ETL and tests can be run locally by running these commands -from the ``pudl`` directory: - -.. code-block:: - - docker compose -f docker/docker-compose.yml build - docker compose -f docker/docker-compose.yml up +permissions. How to access the nightly build outputs from AWS ------------------------------------------------ @@ -210,8 +200,10 @@ You should see a list of directories with version names: .. code-block:: - PRE dev/ + PRE nightly/ + PRE stable/ PRE v2022.11.30/ + PRE v2023.12.01/ ... The ``--no-sign-request`` flag allows you to make requests to the @@ -225,19 +217,19 @@ bucket, ``aws`` will give you an authentication error. updating them, making sure you have the right version, putting them in the right place on your computer, etc. -To copy these files directly to your computer you can use -the ``aws s3 cp`` command, which behaves very much like the Unix ``cp`` command: +To copy these files directly to your computer you can use the ``aws s3 cp`` command, +which behaves very much like the Unix ``cp`` command: .. code:: - aws s3 cp s3://pudl.catalyst.coop/dev/pudl.sqlite ./ --no-sign-request + aws s3 cp s3://pudl.catalyst.coop/nightly/pudl.sqlite ./ --no-sign-request If you wanted to download all of the build outputs (more than 10GB!) you could use the ``--recursive`` flag on the whole directory: .. code:: - aws s3 cp --recursive s3://pudl.catalyst.coop/dev/ ./ --no-sign-request + aws s3 cp --recursive s3://pudl.catalyst.coop/nightly/ ./ --no-sign-request For more details on how to use ``aws`` in general see the `online documentation `__ or run: @@ -292,37 +284,58 @@ that are available: .. code:: - gsutil ls gs://nightly-build-outputs.catalyst.coop + gsutil ls -lh gs://builds.catalyst.coop -You should see a list of directories with the naming convention -``-``. +You should see a list of directories with build IDs that have a naming convention: +``<datetime>-<commit hash>-<git ref>``. -To see what the outputs are for a given nightly build, you can use ``gsutil`` -like this: +To see what the outputs are for a given nightly build, you can use ``gsutil`` like this: ..
code:: - gsutil ls -l gs://nightly-build-outputs.catalyst.coop/ - - 1152800 2022-11-22T12:51:02Z gs://nightly-build-outputs.catalyst.coop//pudl-etl.log - gs://nightly-build-outputs.catalyst.coop//parquet/ - gs://nightly-build-outputs.catalyst.coop//pudl_out/ - gs://nightly-build-outputs.catalyst.coop//sqlite/ - TOTAL: 1 objects, 1152800 bytes (1.1 MiB) + gsutil ls -lh gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ + + 804.57 MiB 2024-01-03T11:19:15Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/censusdp1tract.sqlite + 5.01 GiB 2024-01-03T11:20:02Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/core_epacems__hourly_emissions.parquet + 759.32 MiB 2024-01-03T11:19:17Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc1_dbf.sqlite + 813.52 MiB 2024-01-03T11:19:18Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc1_xbrl.sqlite + 1.65 MiB 2024-01-03T11:18:18Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc1_xbrl_datapackage.json + 6.94 MiB 2024-01-03T11:18:19Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc1_xbrl_taxonomy_metadata.json + 282.71 MiB 2024-01-03T11:19:02Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc2_dbf.sqlite + 89.55 MiB 2024-01-03T11:18:40Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc2_xbrl.sqlite + 1.88 MiB 2024-01-03T11:18:18Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc2_xbrl_datapackage.json + 6.78 MiB 2024-01-03T11:18:18Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc2_xbrl_taxonomy_metadata.json + 8.25 MiB 2024-01-03T11:18:20Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc60_dbf.sqlite + 20.02 MiB 2024-01-03T11:18:22Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc60_xbrl.sqlite + 731.31 KiB 2024-01-03T11:18:18Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc60_xbrl_datapackage.json + 1.77 MiB 2024-01-03T11:18:19Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc60_xbrl_taxonomy_metadata.json + 153.72 MiB 2024-01-03T11:18:54Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc6_dbf.sqlite + 62.01 MiB 2024-01-03T11:18:28Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc6_xbrl.sqlite + 1.02 MiB 2024-01-03T11:18:18Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc6_xbrl_datapackage.json + 2.74 MiB 2024-01-03T11:18:18Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc6_xbrl_taxonomy_metadata.json + 905.31 MiB 2024-01-03T11:19:17Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc714_xbrl.sqlite + 58.41 KiB 2024-01-03T11:18:18Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc714_xbrl_datapackage.json + 187.86 KiB 2024-01-03T11:18:18Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/ferc714_xbrl_taxonomy_metadata.json + 4.05 MiB 2024-01-03T11:18:19Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/metadata.yml + 4 MiB 2024-01-03T12:09:34Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/pudl-etl.log + 13.1 GiB 2024-01-03T11:21:41Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/pudl.sqlite + 0 B 2024-01-03T11:18:18Z gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/success + gs://builds.catalyst.coop/2024-01-03-0605-e9a91be-dev/core_epacems__hourly_emissions/ + TOTAL: 25 objects, 23557650395 bytes (21.94 GiB) If you want to copy these files down directly to your computer, you can use the ``gsutil cp`` command, which behaves very much like the Unix ``cp`` command: .. 
code:: - gsutil cp gs://nightly-build-outputs.catalyst.coop//pudl.sqlite ./ + gsutil cp gs://builds.catalyst.coop/<build ID>/pudl.sqlite ./ If you wanted to download all of the build outputs (more than 10GB!) you could use ``cp -r`` on the whole directory: .. code:: - gsutil cp -r gs://nightly-build-outputs.catalyst.coop// ./ + gsutil cp -r gs://builds.catalyst.coop/<build ID>/ ./ For more details on how to use ``gsutil`` in general see the `online documentation `__ or run: diff --git a/docs/dev/project_management.rst b/docs/dev/project_management.rst index 7041fad8dd..219b2b5074 100644 --- a/docs/dev/project_management.rst +++ b/docs/dev/project_management.rst @@ -2,7 +2,7 @@ Project Management =============================================================================== -The people working on PUDL are distributed all over North America. Collaboration takes +The people working on PUDL are distributed all over North America. Collaboration takes place online. We make extensive use of GitHub's built-in project management tools and we work in public. You can follow our progress in our `GitHub Projects `__ @@ -19,11 +19,17 @@ them easily. Our GitHub Workflow ------------------------------------------------------------------------------- -* We have 2 persistent branches: **main** and **dev**. -* We create temporary feature branches off of **dev** and make pull requests to - **dev** throughout our 2 week long sprints. -* At the end of each sprint, or any time a particularly significant feature has - been merged in and the nightly builds are passing, **dev** is merged into **main**. +* We have 3 persistent branches: ``main`` (the default branch), ``nightly``, and + ``stable``. +* We create temporary feature branches off of ``main`` and make pull requests into + ``main`` throughout our 2-week-long sprints. All code that's merged into ``main`` + should have passed our CI tests and been reviewed by at least one other person. +* Every night the ``main`` branch is used to run the :ref:`nightly-data-builds`. If the + builds are successful, then the ``nightly`` branch is automatically updated to point + to the latest commit on ``main``. If the builds fail, then the ``nightly`` branch is + left unchanged. +* Every time we do a versioned data release, the ``stable`` branch is updated to point + to the commit associated with the most recent release. ------------------------------------------------------------------------------- Pull Requests @@ -31,11 +37,11 @@ Pull Requests * Before making a PR, make sure the tests run and pass locally, including the code linters and pre-commit hooks. See :ref:`linting` for details. -* Don't forget to merge any new commits to the **dev** branch into your feature +* Don't forget to merge any new commits to the ``main`` branch into your feature branch before making a PR. * If for some reason the continuous integration tests fail for your PR, try and figure out why and fix it, or ask for help. If the tests fail, we don't want - to merge it into **dev**. You can see the status of the CI builds in the + to merge it into ``main``. You can see the status of the CI builds in the `GitHub Actions for the PUDL repo `__. * Please don't decrease the overall test coverage -- if you introduce new code, @@ -55,8 +61,8 @@ Releases Python packages depend on. Rather, it's an end-use application that produces data which other applications and analyses can consume. Because of this, we no longer release installable packages on PyPI or ``conda-forge``.
-* Periodically, we tag a versioned release on **main** using the date, like - ``v2023.07.15``. This triggers a snapshot of the repository being +* Periodically, we tag a versioned release on ``main`` using a calendar-based version, + like ``v2023.07.15``. This triggers a snapshot of the repository being `archived on Zenodo `__. * The nightly build outputs associated with any tagged release will also get archived `on Zenodo here `__ diff --git a/docs/index.rst b/docs/index.rst index fc2ab28f78..354aca7008 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -34,15 +34,15 @@ We also publish SQLite databases containing relatively pristine versions of our difficult to parse inputs, especially the old Visual FoxPro (DBF, pre-2021) and new XBRL data (2021+) published by FERC: -* `FERC Form 1 (DBF) `__ -* `FERC Form 1 (XBRL) `__ -* `FERC Form 2 (DBF) `__ -* `FERC Form 2 (XBRL) `__ -* `FERC Form 6 (DBF) `__ -* `FERC Form 6 (XBRL) `__ -* `FERC Form 60 (DBF) `__ -* `FERC Form 60 (XBRL) `__ -* `FERC Form 714 (XBRL) `__ +* `FERC Form 1 (DBF) `__ +* `FERC Form 1 (XBRL) `__ +* `FERC Form 2 (DBF) `__ +* `FERC Form 2 (XBRL) `__ +* `FERC Form 6 (DBF) `__ +* `FERC Form 6 (XBRL) `__ +* `FERC Form 60 (DBF) `__ +* `FERC Form 60 (XBRL) `__ +* `FERC Form 714 (XBRL) `__ .. _raw-data-archive: diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 0162000897..8936ff2761 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -3,7 +3,7 @@ PUDL Release Notes ======================================================================================= --------------------------------------------------------------------------------------- -v2023.12.XX +v2024.01.XX --------------------------------------------------------------------------------------- * The ``epacems_to_parquet`` and ``state_demand`` scripts have been retired in favor of using the Dagster UI. See :issue:`3107` and :pr:`3086`. Visualizations of hourly @@ -17,6 +17,13 @@ v2023.12.XX fixed, and can be used to generate `GeoParquet `__ outputs describing historical utility and balancing authority service territories. See :issue:`1174` and :pr:`3086`. +* Deprecated the ``dev`` branch and updated our nightly builds and GitHub workflow to + use three persistent branches: ``main`` for bleeding-edge changes, ``nightly`` for the + most recent commit to have a successful nightly build output, and ``stable`` for the + most recently released version of PUDL. The ``nightly`` and ``stable`` branches are + protected and automatically updated. Build outputs are now written to + ``gs://builds.catalyst.coop`` and retained for 30 days. See issues :issue:`3140,3179` + and PRs :pr:`3195,3206,3212`. Data Coverage ^^^^^^^^^^^^^
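With the three-branch workflow described in ``docs/dev/project_management.rst`` above, checking out the code that corresponds to a given data state is a single command. A minimal sketch using plain ``git``, assuming a local clone with ``origin`` pointing at the PUDL repository:

.. code::

    git fetch origin
    git switch main      # bleeding-edge development, the default branch
    git switch nightly   # most recent commit with a successful nightly build
    git switch stable    # most recently released version of PUDL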
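The relocated distribution paths in ``s3://pudl.catalyst.coop`` can be reached anonymously with the same ``aws`` pattern shown in ``docs/data_access.rst``. A sketch, assuming the ``stable/`` path mirrors the same set of outputs as ``nightly/``:

.. code::

    # List the top-level distribution paths (nightly/, stable/, versioned releases):
    aws s3 ls --no-sign-request s3://pudl.catalyst.coop/
    # Copy down the most recent nightly PUDL database:
    aws s3 cp --no-sign-request s3://pudl.catalyst.coop/nightly/pudl.sqlite ./
    # Or the most recently released one:
    aws s3 cp --no-sign-request s3://pudl.catalyst.coop/stable/pudl.sqlite ./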
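One detail of the defaults added to ``docker/gcp_pudl_etl.sh``: the ``: "${VAR:=default}"`` idiom assigns the default only when the variable is unset or empty, and the ``:`` builtin discards the expanded value, so values injected via ``--container-env`` always take precedence. A self-contained illustration, using a hypothetical variable name:

.. code::

    #!/usr/bin/env bash
    unset FOO
    : "${FOO:=fallback}"   # FOO was unset, so it is assigned "fallback"
    echo "$FOO"            # prints: fallback

    FOO="from-workflow"
    : "${FOO:=fallback}"   # FOO is already set and non-empty; default ignored
    echo "$FOO"            # prints: from-workflow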