From 9a04e54aaacaf9e68bbb99fc2648fdb3a9988785 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Tue, 22 Oct 2024 14:46:26 -0500 Subject: [PATCH 01/15] alert when instance count is low Co-Authored-By: Samuel Aquino <saquino@flexion.us> Co-Authored-By: halprin <halprin@users.noreply.github.com> Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> --- operations/template/alert.tf | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 109e27d02..06f715357 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -120,3 +120,45 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a ] } } +resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { + count = 1 //local.non_pr_environment ? 1 : 0 + name = "cdcti-${var.environment}-azure-low-instance-count-alert" + resource_group_name = data.azurerm_resource_group.group.name + scopes = [azurerm_linux_web_app.api.id] + description = "Action will be triggered when the instance count is too low" + frequency = "PT1M" // Checks every 1 minute + window_size = "PT15M" // Every Check, looks back 15 minutes in history + //TBD: How frequent do we want this alert and how far do we want it to look back. + + criteria { + metric_namespace = "Microsoft.Web/sites" + metric_name = "InstanceCount" + aggregation = "Average" + operator = "LessThanOrEqual" + // This threshold is based on the autoscale settings in app.tf + // How should we tune these numbers if we've scaled up higher than the initial count of 3/1? + threshold = local.higher_environment_level ? 2.5 : 0.5 + } + + action { + action_group_id = azurerm_monitor_action_group.notify_slack_email[count.index].id + } + + lifecycle { + # Ignore changes to tags because the CDC sets these automagically + ignore_changes = [ + tags["business_steward"], + tags["center"], + tags["environment"], + tags["escid"], + tags["funding_source"], + tags["pii_data"], + tags["security_compliance"], + tags["security_steward"], + tags["support_group"], + tags["system"], + tags["technical_steward"], + tags["zone"] + ] + } +} From 595ab5d9fd57a9b81eadefb672055ef3b20b75f6 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Tue, 22 Oct 2024 14:53:58 -0500 Subject: [PATCH 02/15] make action group exist in pr env for now Co-Authored-By: Samuel Aquino <saquino@flexion.us> Co-Authored-By: halprin <halprin@users.noreply.github.com> Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 06f715357..57cff47ce 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -1,5 +1,5 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { - count = local.non_pr_environment ? 1 : 0 + count = 1 //local.non_pr_environment ? 1 : 0 name = "cdcti${var.environment}-actiongroup" resource_group_name = data.azurerm_resource_group.group.name short_name = "cdcti-alerts" From 3d2067dbb1aeedd538ebd06f0a6fbed46e103cc5 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Tue, 22 Oct 2024 15:15:42 -0500 Subject: [PATCH 03/15] try the other metric namespace Co-Authored-By: Samuel Aquino <saquino@flexion.us> Co-Authored-By: halprin <halprin@users.noreply.github.com> Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> --- operations/template/alert.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 57cff47ce..8e4a4953e 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -124,15 +124,15 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { count = 1 //local.non_pr_environment ? 1 : 0 name = "cdcti-${var.environment}-azure-low-instance-count-alert" resource_group_name = data.azurerm_resource_group.group.name - scopes = [azurerm_linux_web_app.api.id] + scopes = [azurerm_monitor_autoscale_setting.api_autoscale.id] description = "Action will be triggered when the instance count is too low" frequency = "PT1M" // Checks every 1 minute window_size = "PT15M" // Every Check, looks back 15 minutes in history //TBD: How frequent do we want this alert and how far do we want it to look back. criteria { - metric_namespace = "Microsoft.Web/sites" - metric_name = "InstanceCount" + metric_namespace = "Microsoft.Insights/autoscalesettings" + metric_name = "ObservedCapacity" aggregation = "Average" operator = "LessThanOrEqual" // This threshold is based on the autoscale settings in app.tf From 7b5026eafbe145ce22eccc93ad9e44785781b87e Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Tue, 22 Oct 2024 15:34:55 -0500 Subject: [PATCH 04/15] this should really fail Co-Authored-By: Samuel Aquino <saquino@flexion.us> Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 8e4a4953e..6240232ba 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -137,7 +137,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { operator = "LessThanOrEqual" // This threshold is based on the autoscale settings in app.tf // How should we tune these numbers if we've scaled up higher than the initial count of 3/1? - threshold = local.higher_environment_level ? 2.5 : 0.5 + threshold = local.higher_environment_level ? 2.5 : 1.5 } action { From df60b605bb4eb89f467a229c28abbd81f7d2fea8 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Tue, 22 Oct 2024 15:41:33 -0500 Subject: [PATCH 05/15] set back to a civilized number Co-Authored-By: Samuel Aquino <saquino@flexion.us> Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 6240232ba..8e4a4953e 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -137,7 +137,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { operator = "LessThanOrEqual" // This threshold is based on the autoscale settings in app.tf // How should we tune these numbers if we've scaled up higher than the initial count of 3/1? - threshold = local.higher_environment_level ? 2.5 : 1.5 + threshold = local.higher_environment_level ? 2.5 : 0.5 } action { From 013ee4998569155df4a1ebacc8c88d02799077d2 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Tue, 22 Oct 2024 15:57:21 -0500 Subject: [PATCH 06/15] let's try another severity Co-Authored-By: Samuel Aquino <saquino@flexion.us> Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> --- operations/template/alert.tf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 8e4a4953e..fc201a0af 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -126,6 +126,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { resource_group_name = data.azurerm_resource_group.group.name scopes = [azurerm_monitor_autoscale_setting.api_autoscale.id] description = "Action will be triggered when the instance count is too low" + severity = 2 // warning frequency = "PT1M" // Checks every 1 minute window_size = "PT15M" // Every Check, looks back 15 minutes in history //TBD: How frequent do we want this alert and how far do we want it to look back. @@ -137,7 +138,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { operator = "LessThanOrEqual" // This threshold is based on the autoscale settings in app.tf // How should we tune these numbers if we've scaled up higher than the initial count of 3/1? - threshold = local.higher_environment_level ? 2.5 : 0.5 + threshold = local.higher_environment_level ? 2.5 : 1.5 } action { From f60548947e986057b7361ebebc831833bc00b039 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Tue, 22 Oct 2024 16:00:25 -0500 Subject: [PATCH 07/15] more description Co-Authored-By: Samuel Aquino <saquino@flexion.us> Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> Co-Authored-By: jherrflexion <118225331+jherrflexion@users.noreply.github.com> --- operations/template/alert.tf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index fc201a0af..553c470e6 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -125,11 +125,10 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { name = "cdcti-${var.environment}-azure-low-instance-count-alert" resource_group_name = data.azurerm_resource_group.group.name scopes = [azurerm_monitor_autoscale_setting.api_autoscale.id] - description = "Action will be triggered when the instance count is too low" + description = "The instance count in ${var.environment} is too low" severity = 2 // warning frequency = "PT1M" // Checks every 1 minute window_size = "PT15M" // Every Check, looks back 15 minutes in history - //TBD: How frequent do we want this alert and how far do we want it to look back. criteria { metric_namespace = "Microsoft.Insights/autoscalesettings" From 00ad2a41198c59afc76c2437b8f9f9277361648a Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Tue, 22 Oct 2024 16:05:15 -0500 Subject: [PATCH 08/15] Set to appropriate envs and numbers Co-Authored-By: Samuel Aquino <saquino@flexion.us> Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> Co-Authored-By: jherrflexion <118225331+jherrflexion@users.noreply.github.com> Co-Authored-By: halprin <halprin@users.noreply.github.com> --- operations/template/alert.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 553c470e6..cf479481e 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -1,5 +1,5 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { - count = 1 //local.non_pr_environment ? 1 : 0 + count = local.non_pr_environment ? 1 : 0 name = "cdcti${var.environment}-actiongroup" resource_group_name = data.azurerm_resource_group.group.name short_name = "cdcti-alerts" @@ -121,7 +121,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a } } resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { - count = 1 //local.non_pr_environment ? 1 : 0 + count = local.non_pr_environment ? 1 : 0 name = "cdcti-${var.environment}-azure-low-instance-count-alert" resource_group_name = data.azurerm_resource_group.group.name scopes = [azurerm_monitor_autoscale_setting.api_autoscale.id] @@ -137,7 +137,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { operator = "LessThanOrEqual" // This threshold is based on the autoscale settings in app.tf // How should we tune these numbers if we've scaled up higher than the initial count of 3/1? - threshold = local.higher_environment_level ? 2.5 : 1.5 + threshold = local.higher_environment_level ? 2.5 : 0.5 } action { From 7a13e2ac8088e768a8393c1ca66179f6378924a5 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Tue, 22 Oct 2024 17:09:41 -0500 Subject: [PATCH 09/15] Fix non-pr env logic Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> Co-Authored-By: halprin <halprin@users.noreply.github.com> --- operations/template/main.tf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/operations/template/main.tf b/operations/template/main.tf index ff42970ce..639f3bd35 100644 --- a/operations/template/main.tf +++ b/operations/template/main.tf @@ -8,7 +8,8 @@ locals { rs_domain_prefix = "${local.selected_rs_environment_prefix}${length(local.selected_rs_environment_prefix) == 0 ? "" : "."}" higher_environment_level = var.environment == "stg" || var.environment == "prd" cdc_domain_environment = var.environment == "dev" || var.environment == "stg" || var.environment == "prd" - non_pr_environment = !strcontains(var.environment, "pr") + // If the environment looks like pr123, regex will return matches. If there are no matches, it's a non-pr env + non_pr_environment = regex("pr\\d+", var.environment) != [] } data "azurerm_resource_group" "group" { From 5e4ff035d978168a53b10457b690960f2252c84b Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Tue, 22 Oct 2024 17:16:51 -0500 Subject: [PATCH 10/15] let's regex and math right Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com> --- operations/template/main.tf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/operations/template/main.tf b/operations/template/main.tf index 639f3bd35..9c771f9ef 100644 --- a/operations/template/main.tf +++ b/operations/template/main.tf @@ -8,8 +8,9 @@ locals { rs_domain_prefix = "${local.selected_rs_environment_prefix}${length(local.selected_rs_environment_prefix) == 0 ? "" : "."}" higher_environment_level = var.environment == "stg" || var.environment == "prd" cdc_domain_environment = var.environment == "dev" || var.environment == "stg" || var.environment == "prd" - // If the environment looks like pr123, regex will return matches. If there are no matches, it's a non-pr env - non_pr_environment = regex("pr\\d+", var.environment) != [] + + // If the environment looks like pr123, regexall will contain matches. If there are no matches, it's a non-pr env + non_pr_environment = length(regexall("pr\\d+", var.environment)) == 0 } data "azurerm_resource_group" "group" { From 9e7b89ff8a99ae65684e76eedc23638ec5e3cae8 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Wed, 23 Oct 2024 10:20:43 -0500 Subject: [PATCH 11/15] different format for threshold math; temporarily turn on alerts in PR env Co-Authored-By: halprin <halprin@users.noreply.github.com> --- operations/template/alert.tf | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index f26b24878..e5ae643f7 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -1,5 +1,5 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { - count = local.non_pr_environment ? 1 : 0 + count = 1 //local.non_pr_environment ? 1 : 0 name = "cdcti${var.environment}-actiongroup" resource_group_name = data.azurerm_resource_group.group.name short_name = "cdcti-alerts" @@ -80,7 +80,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a } } resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { - count = local.non_pr_environment ? 1 : 0 + count = 1 //local.non_pr_environment ? 1 : 0 name = "cdcti-${var.environment}-azure-low-instance-count-alert" resource_group_name = data.azurerm_resource_group.group.name scopes = [azurerm_monitor_autoscale_setting.api_autoscale.id] @@ -94,9 +94,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { metric_name = "ObservedCapacity" aggregation = "Average" operator = "LessThanOrEqual" - // This threshold is based on the autoscale settings in app.tf - // How should we tune these numbers if we've scaled up higher than the initial count of 3/1? - threshold = local.higher_environment_level ? 2.5 : 0.5 + threshold = azurerm_monitor_autoscale_setting.api_autoscale.profile.capacity.default - 0.5 } action { From 2506327702050b94cd7e0b04beb83bc5269e4d1e Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Wed, 23 Oct 2024 11:04:07 -0500 Subject: [PATCH 12/15] index the profile --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 6da1d1402..9e800434c 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -135,7 +135,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { metric_name = "ObservedCapacity" aggregation = "Average" operator = "LessThanOrEqual" - threshold = azurerm_monitor_autoscale_setting.api_autoscale.profile.capacity.default - 0.5 + threshold = azurerm_monitor_autoscale_setting.api_autoscale.profile[0].capacity.default - 0.5 } action { From 0caf2cf96a507a1c345ace4478166b8674b03e63 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Wed, 23 Oct 2024 11:05:21 -0500 Subject: [PATCH 13/15] another index --- operations/template/alert.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 9e800434c..7a572a1a7 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -135,7 +135,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { metric_name = "ObservedCapacity" aggregation = "Average" operator = "LessThanOrEqual" - threshold = azurerm_monitor_autoscale_setting.api_autoscale.profile[0].capacity.default - 0.5 + threshold = azurerm_monitor_autoscale_setting.api_autoscale.profile[0].capacity[0].default - 0.5 } action { From 76523858544080fa51fbdee10d38861ce2f84e43 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Wed, 23 Oct 2024 11:14:07 -0500 Subject: [PATCH 14/15] turn environment filter back on --- operations/template/alert.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/template/alert.tf b/operations/template/alert.tf index 7a572a1a7..ffedb85c0 100644 --- a/operations/template/alert.tf +++ b/operations/template/alert.tf @@ -1,5 +1,5 @@ resource "azurerm_monitor_action_group" "notify_slack_email" { - count = 1 //local.non_pr_environment ? 1 : 0 + count = local.non_pr_environment ? 1 : 0 name = "cdcti${var.environment}-actiongroup" resource_group_name = data.azurerm_resource_group.group.name short_name = "cdcti-alerts" @@ -121,7 +121,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a } } resource "azurerm_monitor_metric_alert" "low_instance_count_alert" { - count = 1 //local.non_pr_environment ? 1 : 0 + count = local.non_pr_environment ? 1 : 0 name = "cdcti-${var.environment}-azure-low-instance-count-alert" resource_group_name = data.azurerm_resource_group.group.name scopes = [azurerm_monitor_autoscale_setting.api_autoscale.id] From f463565f13a1307221e8ecf3e7260f6df1d2a465 Mon Sep 17 00:00:00 2001 From: Sylvie <sschuresko@flexion.us> Date: Wed, 23 Oct 2024 12:11:40 -0500 Subject: [PATCH 15/15] let's be a little more precise with this regex Co-Authored-By: Samuel Aquino <saquino@flexion.us> --- operations/template/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operations/template/main.tf b/operations/template/main.tf index 9c771f9ef..bc083cff3 100644 --- a/operations/template/main.tf +++ b/operations/template/main.tf @@ -10,7 +10,7 @@ locals { cdc_domain_environment = var.environment == "dev" || var.environment == "stg" || var.environment == "prd" // If the environment looks like pr123, regexall will contain matches. If there are no matches, it's a non-pr env - non_pr_environment = length(regexall("pr\\d+", var.environment)) == 0 + non_pr_environment = length(regexall("^pr\\d+", var.environment)) == 0 } data "azurerm_resource_group" "group" {