From c26295d68a1010b22627a472e24bc3dd39e8d0de Mon Sep 17 00:00:00 2001 From: Sylvie Date: Fri, 25 Oct 2024 11:56:22 -0500 Subject: [PATCH 1/7] Update ADR to have more info on Azure Alerts decision and to include both slack channels Co-Authored-By: Bella L. Quintero <96704946+pluckyswan@users.noreply.github.com> Co-Authored-By: Samuel Aquino --- adr/020-azure-alerts.md | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/adr/020-azure-alerts.md b/adr/020-azure-alerts.md index a6f37a4dc..ab7c56782 100644 --- a/adr/020-azure-alerts.md +++ b/adr/020-azure-alerts.md @@ -13,19 +13,19 @@ Accepted. ## Context As part of our CI/CD infrastructure, we need notifications when failures occur. +We chose Azure for alerting because it's built into the infrastructure we're already using, +which gives us easy access to metrics. We're not currently using an external +log aggregation system, so Azure alerts were a much lower lift to implement than +any of the other potential options. -To ensure rapid response to application failures within our CI/CD infrastructure, we require real-time notifications for critical issues. The current alert setup focuses on: +Alerts are configured in [alert.tf](../operations/template/alert.tf). To reduce +unhelpful notifications, we have alerts turned off in the PR environments, so they must +either be tested in `internal` or `dev`, or developers may temporarily turn alerts back on in +their branch's PR environment. -- **Type:** [Azure Log Search Alerts](https://learn.microsoft.com/en-us/azure/azure-monitor/alerts/alerts-types#log-alerts) for HikariCP connection failures. - - -- **Trigger:** Any logged failures with database connections. - - -- **Configuration:** Alerts are stateful (auto-mitigation); set to `fired` status to reduce noise from frequent or duplicate alerts. - - -- **Notification:** Alerts sent to a Slack channel via email until PagerDuty is operational. 
+Alerts are sent to email addresses that forward to Slack channels. As of October 2024, +production alerts go to `#production-alerts-cdc-trusted-intermediary` and non-prod alerts +go to `#non-prod-alerts-cdc-trusted-intermediary`. ## Impact @@ -35,11 +35,16 @@ To ensure rapid response to application failures within our CI/CD infrastructure ### Negative -- Possible alert fatigue if not fine-tuned +- Azure's built-in alert options are less robust than some other services - for instance, +they don't have an option for p50/p90/p99 latency alerts. This means we're more limited in +what kinds of alerts we can have. +- Navigating from the Azure Slack alerts to the actual logs where issues are occurring +is unintuitive and requires multiple clicks. Even once you find the right logs, +Azure logs lack syntax highlighting and can be hard to read. ### Risks -- None +- Possible alert fatigue if not fine-tuned ## Related Issues From 03aa48662db74ddbabfcd4730ea9ae5fd9a67444 Mon Sep 17 00:00:00 2001 From: Sylvie Date: Fri, 25 Oct 2024 12:01:17 -0500 Subject: [PATCH 2/7] Update email secret to use for prod deploy; add clarifying comment Co-Authored-By: Bella L. 
Quintero <96704946+pluckyswan@users.noreply.github.com> --- .github/workflows/prod-deploy.yml | 2 +- operations/template/alert.tf | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/prod-deploy.yml b/.github/workflows/prod-deploy.yml index 5f6d5d754..19e9401a2 100644 --- a/.github/workflows/prod-deploy.yml +++ b/.github/workflows/prod-deploy.yml @@ -26,7 +26,7 @@ jobs: VPN_CA_CERTIFICATE: ${{ secrets.VPN_CA_CERTIFICATE }} VPN_GITHUB_CERTIFICATE: ${{ secrets.VPN_GITHUB_CERTIFICATE}} VPN_GITHUB_SECRET_KEY: ${{ secrets.VPN_GITHUB_SECRET_KEY }} - TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}" + TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.PROD_ALERT_SLACK_EMAIL }}" prod-deploy: name: Prod Application Deploy diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 13588dca7..3a185460a 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -5,7 +5,9 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { short_name = "cdcti-alerts" email_receiver { - name = "cdcti-flexion-slack-email-receiver" + name = "cdcti-flexion-slack-email-receiver" + // This variable is passed in via `TERRAFORM_APPLY_PARAMETERS` by each environment's deploy GH workflow (e.g. `prod-deploy.yml`, `dev-deploy.yml`) + // We use a different email address/Slack channel for prod and non-prod alerts email_address = var.alert_slack_email } From 20ca39878c8e7c6be5d0c88ca098725a2e0d8d0b Mon Sep 17 00:00:00 2001 From: Sylvie Date: Fri, 25 Oct 2024 12:11:58 -0500 Subject: [PATCH 3/7] Reference new non-prod alerts email secret Co-Authored-By: Bella L. 
Quintero <96704946+pluckyswan@users.noreply.github.com> --- .github/workflows/cicd.yml | 2 +- .github/workflows/dev-deploy.yml | 2 +- .github/workflows/internal-deploy.yml | 2 +- .github/workflows/terraform-ci-deploy.yml | 2 +- .github/workflows/terraform-ci-destroy.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index f74c35862..1a72bd8d2 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -29,7 +29,7 @@ jobs: VPN_CA_CERTIFICATE: ${{ secrets.VPN_CA_CERTIFICATE }} VPN_GITHUB_CERTIFICATE: ${{ secrets.VPN_GITHUB_CERTIFICATE}} VPN_GITHUB_SECRET_KEY: ${{ secrets.VPN_GITHUB_SECRET_KEY }} - TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}" + TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.NON_PROD_ALERT_SLACK_EMAIL }}" staging-deploy: name: Staging Application Deploy diff --git a/.github/workflows/dev-deploy.yml b/.github/workflows/dev-deploy.yml index ccf9f1516..0569f78e1 100644 --- a/.github/workflows/dev-deploy.yml +++ b/.github/workflows/dev-deploy.yml @@ -22,7 +22,7 @@ jobs: VPN_CA_CERTIFICATE: ${{ secrets.VPN_CA_CERTIFICATE }} VPN_GITHUB_CERTIFICATE: ${{ secrets.VPN_GITHUB_CERTIFICATE}} VPN_GITHUB_SECRET_KEY: ${{ secrets.VPN_GITHUB_SECRET_KEY }} - TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}" + TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.NON_PROD_ALERT_SLACK_EMAIL }}" dev-deploy: name: Dev Application Deploy diff --git a/.github/workflows/internal-deploy.yml b/.github/workflows/internal-deploy.yml index 2597e3bd9..d4d0aaaf6 100644 --- a/.github/workflows/internal-deploy.yml +++ b/.github/workflows/internal-deploy.yml @@ -18,7 +18,7 @@ jobs: AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ 
secrets.ALERT_SLACK_EMAIL }}" + TERRAFORM_APPLY_PARAMETERS: -var="alert_slack_email=${{ secrets.NON_PROD_ALERT_SLACK_EMAIL }}" internal-deploy: name: Internal Application Deploy diff --git a/.github/workflows/terraform-ci-deploy.yml b/.github/workflows/terraform-ci-deploy.yml index e61d6ec21..ab4fe56c4 100644 --- a/.github/workflows/terraform-ci-deploy.yml +++ b/.github/workflows/terraform-ci-deploy.yml @@ -40,7 +40,7 @@ jobs: AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - TERRAFORM_APPLY_PARAMETERS: -var="pr_number=${{ github.event.number }}" -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}" + TERRAFORM_APPLY_PARAMETERS: -var="pr_number=${{ github.event.number }}" -var="alert_slack_email=${{ secrets.NON_PROD_ALERT_SLACK_EMAIL }}" terraform-deploy-skip: # runs when the PR doesn't have any changes that require the PR deploy; this ensures we get the appropriate required PR checks diff --git a/.github/workflows/terraform-ci-destroy.yml b/.github/workflows/terraform-ci-destroy.yml index 58ebcb4e6..d280226ea 100644 --- a/.github/workflows/terraform-ci-destroy.yml +++ b/.github/workflows/terraform-ci-destroy.yml @@ -53,4 +53,4 @@ jobs: run: terraform init -backend-config="key=pr_${{ github.event.number }}.tfstate" - name: Terraform Destroy - run: terraform destroy -auto-approve -input=false -var="pr_number=${{ github.event.number }}" -var="alert_slack_email=${{ secrets.ALERT_SLACK_EMAIL }}" + run: terraform destroy -auto-approve -input=false -var="pr_number=${{ github.event.number }}" -var="alert_slack_email=${{ secrets.NON_PROD_ALERT_SLACK_EMAIL }}" From 518ac8aeca783455d636f0d0a7ecd0ab0d15910a Mon Sep 17 00:00:00 2001 From: Sylvie Date: Fri, 25 Oct 2024 12:15:30 -0500 Subject: [PATCH 4/7] temporarily turn this on for PR envs Co-Authored-By: Bella L. 
Quintero <96704946+pluckyswan@users.noreply.github.com> --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 3a185460a..3315a3b01 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -1,5 +1,5 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { - count = local.non_pr_environment ? 1 : 0 + count = 1 //local.non_pr_environment ? 1 : 0 name = "cdcti${var.environment}-actiongroup" resource_group_name = data.azurerm_resource_group.group.name short_name = "cdcti-alerts" From adce5875997a879a1e8b97abd2f010e29c57b21d Mon Sep 17 00:00:00 2001 From: Sylvie Date: Fri, 25 Oct 2024 12:31:27 -0500 Subject: [PATCH 5/7] reset slack email count to normal condition Co-Authored-By: Bella L. Quintero <96704946+pluckyswan@users.noreply.github.com> --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 3315a3b01..3a185460a 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -1,5 +1,5 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { - count = 1 //local.non_pr_environment ? 1 : 0 + count = local.non_pr_environment ? 
1 : 0 name = "cdcti${var.environment}-actiongroup" resource_group_name = data.azurerm_resource_group.group.name short_name = "cdcti-alerts" From 052636d4da79b7ef99f9c9d0ba994fb56dbca2ea Mon Sep 17 00:00:00 2001 From: Bella Luz Quintero Date: Fri, 25 Oct 2024 15:24:03 -0600 Subject: [PATCH 6/7] switched sku from basic to standard --- operations/template/vpn.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/vpn.tf b/operations/template/vpn.tf index 4c2c60b2b..66612f4ce 100644 --- a/operations/template/vpn.tf +++ b/operations/template/vpn.tf @@ -4,7 +4,7 @@ resource "azurerm_public_ip" "vpn" { resource_group_name = data.azurerm_resource_group.group.name allocation_method = "Dynamic" - sku = "Basic" + sku = "Standard" # below tags are managed by CDC lifecycle { ignore_changes = [ From ab1bd57ea64b11cfbb34c8a85ac7a781c88a444a Mon Sep 17 00:00:00 2001 From: Bella Luz Quintero Date: Fri, 25 Oct 2024 15:26:52 -0600 Subject: [PATCH 7/7] revert previous commit --- operations/template/vpn.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/vpn.tf b/operations/template/vpn.tf index 66612f4ce..4c2c60b2b 100644 --- a/operations/template/vpn.tf +++ b/operations/template/vpn.tf @@ -4,7 +4,7 @@ resource "azurerm_public_ip" "vpn" { resource_group_name = data.azurerm_resource_group.group.name allocation_method = "Dynamic" - sku = "Standard" + sku = "Basic" # below tags are managed by CDC lifecycle { ignore_changes = [