From a629617dfc67d828a718c5549b6eea394c9a7da1 Mon Sep 17 00:00:00 2001 From: James Herr Date: Fri, 18 Oct 2024 09:33:17 -0500 Subject: [PATCH 1/5] WIP Azure Outage Alert Co-Authored-By: Samuel Aquino --- operations/template/alert.tf | 43 ++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 642b75089..be90032d3 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -28,6 +28,49 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { } } +resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { + count = local.non_pr_environment ? 1 : 0 + name = "cdcti-${var.environment}-azure-status-alert" + location = data.azurerm_resource_group.group.location + resource_group_name = data.azurerm_resource_group.group.name + scopes = [azurerm_container_registry.registry.id] + + criteria { + category = "ServiceHealth" + levels = ["Error"] + service_health { + locations = ["East US", "Global"] + events = ["Incident"] + services = ["*"] + } + } + + action { + action_group_id = [azurerm_monitor_action_group.notify_slack_email[count.index].id] + email_subject = "FATAL: Azure Outage Alert!" + } + + description = "Alert service(s) appear to be down" + enabled = true + + lifecycle { + ignore_changes = [ + tags["business_steward"], + tags["center"], + tags["environment"], + tags["escid"], + tags["funding_source"], + tags["pii_data"], + tags["security_compliance"], + tags["security_steward"], + tags["support_group"], + tags["system"], + tags["technical_steward"], + tags["zone"] + ] + } +} + resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_alert" { count = local.non_pr_environment ? 1 : 0 name = "cdcti-${var.environment}-api-log-token-alert" From 07fbaf3dee15f577942d6de7eac4f05edb5e4059 Mon Sep 17 00:00:00 2001 From: James Herr Date: Fri, 18 Oct 2024 11:19:37 -0500 Subject: [PATCH 2/5] Attempt action_group_id fix Co-Authored-By: Samuel Aquino --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index be90032d3..cdae8fd21 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -46,7 +46,7 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { } action { - action_group_id = [azurerm_monitor_action_group.notify_slack_email[count.index].id] + action_group_id = azurerm_monitor_action_group.notify_slack_email[count.index].id email_subject = "FATAL: Azure Outage Alert!" } From e4ec0a45db18bf0fc6d59bd2e447852afd4a72b2 Mon Sep 17 00:00:00 2001 From: James Herr Date: Fri, 18 Oct 2024 11:22:27 -0500 Subject: [PATCH 3/5] Removed unnecessary email_subject --- operations/template/alert.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index cdae8fd21..4a4bdffde 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -47,7 +47,6 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { action { action_group_id = azurerm_monitor_action_group.notify_slack_email[count.index].id - email_subject = "FATAL: Azure Outage Alert!" } description = "Alert service(s) appear to be down" From 71c105662161953d1a6755a4c01520f9de23830c Mon Sep 17 00:00:00 2001 From: James Herr Date: Fri, 18 Oct 2024 11:41:02 -0500 Subject: [PATCH 4/5] Refactoring location --- operations/template/alert.tf | 2 +- operations/template/main.tf | 2 +- operations/template/variables.tf | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 4a4bdffde..59e98ded6 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -39,7 +39,7 @@ resource "azurerm_monitor_activity_log_alert" "azure_service_health_alert" { category = "ServiceHealth" levels = ["Error"] service_health { - locations = ["East US", "Global"] + locations = var.service_health_locations events = ["Incident"] services = ["*"] } diff --git a/operations/template/main.tf b/operations/template/main.tf index ff42970ce..579a23c60 100644 --- a/operations/template/main.tf +++ b/operations/template/main.tf @@ -8,7 +8,7 @@ locals { rs_domain_prefix = "${local.selected_rs_environment_prefix}${length(local.selected_rs_environment_prefix) == 0 ? "" : "."}" higher_environment_level = var.environment == "stg" || var.environment == "prd" cdc_domain_environment = var.environment == "dev" || var.environment == "stg" || var.environment == "prd" - non_pr_environment = !strcontains(var.environment, "pr") + non_pr_environment = !strcontains(var.environment, "pr", "dev") # dev is temp while testing } data "azurerm_resource_group" "group" { diff --git a/operations/template/variables.tf b/operations/template/variables.tf index 0007ad678..bd74082b2 100644 --- a/operations/template/variables.tf +++ b/operations/template/variables.tf @@ -19,3 +19,8 @@ variable "alert_slack_email" { nullable = false sensitive = true } + +variable "service_health_locations" { + type = list(string) + default = ["East US"] +} From 1741c974e6ce30e8bd3598e6ce92b6540182430d Mon Sep 17 00:00:00 2001 From: James Herr Date: Fri, 18 Oct 2024 11:51:54 -0500 Subject: [PATCH 5/5] Remove temp change --- operations/template/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/main.tf b/operations/template/main.tf index 579a23c60..ff42970ce 100644 --- a/operations/template/main.tf +++ b/operations/template/main.tf @@ -8,7 +8,7 @@ locals { rs_domain_prefix = "${local.selected_rs_environment_prefix}${length(local.selected_rs_environment_prefix) == 0 ? "" : "."}" higher_environment_level = var.environment == "stg" || var.environment == "prd" cdc_domain_environment = var.environment == "dev" || var.environment == "stg" || var.environment == "prd" - non_pr_environment = !strcontains(var.environment, "pr", "dev") # dev is temp while testing + non_pr_environment = !strcontains(var.environment, "pr") } data "azurerm_resource_group" "group" {