diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml
index f74c35862..1a72bd8d2 100644
--- a/.github/workflows/cicd.yml
+++ b/.github/workflows/cicd.yml
@@ -29,7 +29,7 @@ jobs:
       VPN_CA_CERTIFICATE: ${{ secrets.VPN_CA_CERTIFICATE }}
       VPN_GITHUB_CERTIFICATE: ${{ secrets.VPN_GITHUB_CERTIFICATE}}
       VPN_GITHUB_SECRET_KEY: ${{ secrets.VPN_GITHUB_SECRET_KEY }}
-      TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}"
+      TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.NON_PROD_ALERT_SLACK_EMAIL }}"

   staging-deploy:
     name: Staging Application Deploy
diff --git a/.github/workflows/dev-deploy.yml b/.github/workflows/dev-deploy.yml
index ccf9f1516..0569f78e1 100644
--- a/.github/workflows/dev-deploy.yml
+++ b/.github/workflows/dev-deploy.yml
@@ -22,7 +22,7 @@ jobs:
       VPN_CA_CERTIFICATE: ${{ secrets.VPN_CA_CERTIFICATE }}
       VPN_GITHUB_CERTIFICATE: ${{ secrets.VPN_GITHUB_CERTIFICATE}}
       VPN_GITHUB_SECRET_KEY: ${{ secrets.VPN_GITHUB_SECRET_KEY }}
-      TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}"
+      TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.NON_PROD_ALERT_SLACK_EMAIL }}"

   dev-deploy:
     name: Dev Application Deploy
diff --git a/.github/workflows/internal-deploy.yml b/.github/workflows/internal-deploy.yml
index 2597e3bd9..d4d0aaaf6 100644
--- a/.github/workflows/internal-deploy.yml
+++ b/.github/workflows/internal-deploy.yml
@@ -18,7 +18,7 @@ jobs:
       AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
       AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
       AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
-      TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}"
+      TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.NON_PROD_ALERT_SLACK_EMAIL }}"

   internal-deploy:
     name: Internal Application Deploy
diff --git a/.github/workflows/prod-deploy.yml b/.github/workflows/prod-deploy.yml
index 5f6d5d754..19e9401a2 100644
--- a/.github/workflows/prod-deploy.yml
+++ b/.github/workflows/prod-deploy.yml
@@ -26,7 +26,7 @@ jobs:
       VPN_CA_CERTIFICATE: ${{ secrets.VPN_CA_CERTIFICATE }}
       VPN_GITHUB_CERTIFICATE: ${{ secrets.VPN_GITHUB_CERTIFICATE}}
       VPN_GITHUB_SECRET_KEY: ${{ secrets.VPN_GITHUB_SECRET_KEY }}
-      TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}"
+      TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.PROD_ALERT_SLACK_EMAIL }}"

   prod-deploy:
     name: Prod Application Deploy
diff --git a/.github/workflows/terraform-ci-deploy.yml b/.github/workflows/terraform-ci-deploy.yml
index e61d6ec21..ab4fe56c4 100644
--- a/.github/workflows/terraform-ci-deploy.yml
+++ b/.github/workflows/terraform-ci-deploy.yml
@@ -40,7 +40,7 @@ jobs:
       AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
       AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
       AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
-      TERRAFORM_APPLY_PARAMETERS: -var="pr_number=${{ github.event.number }}" -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}"
+      TERRAFORM_APPLY_PARAMETERS: -var="pr_number=${{ github.event.number }}" -var="alert_slack_email=${{ secrets.NON_PROD_ALERT_SLACK_EMAIL }}"

   terraform-deploy-skip:
     # runs when the PR doesn't have any changes that require the PR deploy; this ensures we get the appropriate required PR checks
diff --git a/.github/workflows/terraform-ci-destroy.yml b/.github/workflows/terraform-ci-destroy.yml
index 58ebcb4e6..d280226ea 100644
--- a/.github/workflows/terraform-ci-destroy.yml
+++ b/.github/workflows/terraform-ci-destroy.yml
@@ -53,4 +53,4 @@ jobs:
         run: terraform init -backend-config="key=pr_${{ github.event.number }}.tfstate"
       - name: Terraform Destroy
-        run: terraform destroy -auto-approve -input=false -var="pr_number=${{ github.event.number }}" -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}"
+        run: terraform destroy -auto-approve -input=false -var="pr_number=${{ github.event.number }}" -var="alert_slack_email=${{ secrets.NON_PROD_ALERT_SLACK_EMAIL }}"
diff --git a/adr/020-azure-alerts.md b/adr/020-azure-alerts.md
index a6f37a4dc..ab7c56782 100644
--- a/adr/020-azure-alerts.md
+++ b/adr/020-azure-alerts.md
@@ -13,19 +13,19 @@ Accepted.

 ## Context

 As part of our CI/CD infrastructure, we need notifications when failures occur.
+We chose Azure for alerting because it's built into the infrastructure we're already using,
+which gives us easy access to metrics. We're not currently using an external
+log aggregation system, so Azure alerts were a much lower lift to implement than
+any of the other potential options.

-To ensure rapid response to application failures within our CI/CD infrastructure, we require real-time notifications for critical issues. The current alert setup focuses on:
+Alerts are configured in [alert.tf](../operations/template/alert.tf). To reduce
+unhelpful notifications, we have alerts turned off in the PR environments, so alert
+changes must be tested in `internal` or `dev`, or developers may temporarily turn
+alerts back on in their branch's PR environment.

-- **Type:** [Azure Log Search Alerts](https://learn.microsoft.com/en-us/azure/azure-monitor/alerts/alerts-types#log-alerts) for HikariCP connection failures.
-
-
-- **Trigger:** Any logged failures with database connections.
-
-
-- **Configuration:** Alerts are stateful (auto-mitigation); set to `fired` status to reduce noise from frequent or duplicate alerts.
-
-
-- **Notification:** Alerts sent to a Slack channel via email until PagerDuty is operational.
+Alerts are sent to email addresses that forward to Slack channels. As of October 2024,
+production alerts go to `#production-alerts-cdc-trusted-intermediary` and non-prod alerts
+go to `#non-prod-alerts-cdc-trusted-intermediary`.

 ## Impact
@@ -35,11 +35,16 @@ To ensure rapid response to application failures within our CI/CD infrastructure

 ### Negative

-- Possible alert fatigue if not fine-tuned
+- Azure's built-in alert options are less robust than those of some other services; for
+instance, they don't have an option for p50/90/99 latency alerts. This means we're more
+limited in what kinds of alerts we can have.
+- Navigating from the Azure Slack alerts to the actual logs where issues are occurring
+is unintuitive and requires multiple clicks. Even once you find the right logs,
+Azure logs lack syntax highlighting and can be hard to read.

 ### Risks

-- None
+- Possible alert fatigue if not fine-tuned

 ## Related Issues
diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 13588dca7..3a185460a 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -5,7 +5,9 @@ resource "azurerm_monitor_action_group" "notify_slack_email" {
   short_name = "cdcti-alerts"

   email_receiver {
-    name          = "cdcti-flexion-slack-email-receiver"
+    name = "cdcti-flexion-slack-email-receiver"
+    // This variable is set in the `env-deploy.yml` GH action for each environment
+    // We use a different email address/Slack channel for prod and non-prod alerts
     email_address = var.alert_slack_email
   }
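
For readers outside the repo: every workflow above injects the address through `-var="alert_slack_email=..."`, which implies a matching Terraform variable declaration somewhere in `operations/template`. That declaration is not part of this diff; a minimal sketch of what it presumably looks like, assuming standard Terraform conventions:

```hcl
# Sketch only -- the real declaration lives in operations/template and is not shown
# in this diff. No default is given because the value always arrives from the
# GitHub Actions workflows via -var="alert_slack_email=...".
variable "alert_slack_email" {
  type        = string
  description = "Email address that forwards alerts to the environment's Slack channel"
}
```

Each environment then supplies its own address, e.g. `terraform apply -var="alert_slack_email=alerts@example.com"` (address hypothetical), which is how prod and non-prod alerts land in separate Slack channels.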