From 9a04e54aaacaf9e68bbb99fc2648fdb3a9988785 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Tue, 22 Oct 2024 14:46:26 -0500
Subject: [PATCH 01/15] alert when instance count is low

Co-Authored-By: Samuel Aquino <saquino@flexion.us>
Co-Authored-By: halprin <halprin@users.noreply.github.com>
Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com>
---
 operations/template/alert.tf | 42 ++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 109e27d02..06f715357 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -120,3 +120,45 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a
     ]
   }
 }
+resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
+  count               = 1 //local.non_pr_environment ? 1 : 0
+  name                = "cdcti-${var.environment}-azure-low-instance-count-alert"
+  resource_group_name = data.azurerm_resource_group.group.name
+  scopes              = [azurerm_linux_web_app.api.id]
+  description         = "Action will be triggered when the instance count is too low"
+  frequency           = "PT1M"  // Checks every 1 minute
+  window_size         = "PT15M" // Every Check, looks back 15 minutes in history
+  //TBD: How frequent do we want this alert and how far do we want it to look back.
+
+  criteria {
+    metric_namespace = "Microsoft.Web/sites"
+    metric_name      = "InstanceCount"
+    aggregation      = "Average"
+    operator         = "LessThanOrEqual"
+    // This threshold is based on the autoscale settings in app.tf
+    // How should we tune these numbers if we've scaled up higher than the initial count of 3/1?
+    threshold = local.higher_environment_level ? 2.5 : 0.5
+  }
+
+  action {
+    action_group_id = azurerm_monitor_action_group.notify_slack_email[count.index].id
+  }
+
+  lifecycle {
+    # Ignore changes to tags because the CDC sets these automagically
+    ignore_changes = [
+      tags["business_steward"],
+      tags["center"],
+      tags["environment"],
+      tags["escid"],
+      tags["funding_source"],
+      tags["pii_data"],
+      tags["security_compliance"],
+      tags["security_steward"],
+      tags["support_group"],
+      tags["system"],
+      tags["technical_steward"],
+      tags["zone"]
+    ]
+  }
+}

From 595ab5d9fd57a9b81eadefb672055ef3b20b75f6 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Tue, 22 Oct 2024 14:53:58 -0500
Subject: [PATCH 02/15] make action group exist in pr env for now

Co-Authored-By: Samuel Aquino <saquino@flexion.us>
Co-Authored-By: halprin <halprin@users.noreply.github.com>
Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com>
---
 operations/template/alert.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 06f715357..57cff47ce 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -1,5 +1,5 @@
 resource "azurerm_monitor_action_group" "notify_slack_email" {
-  count               = local.non_pr_environment ? 1 : 0
+  count               = 1 //local.non_pr_environment ? 1 : 0
   name                = "cdcti${var.environment}-actiongroup"
   resource_group_name = data.azurerm_resource_group.group.name
   short_name          = "cdcti-alerts"

From 3d2067dbb1aeedd538ebd06f0a6fbed46e103cc5 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Tue, 22 Oct 2024 15:15:42 -0500
Subject: [PATCH 03/15] try the other metric namespace

Co-Authored-By: Samuel Aquino <saquino@flexion.us>
Co-Authored-By: halprin <halprin@users.noreply.github.com>
Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com>
---
 operations/template/alert.tf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 57cff47ce..8e4a4953e 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -124,15 +124,15 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
   count               = 1 //local.non_pr_environment ? 1 : 0
   name                = "cdcti-${var.environment}-azure-low-instance-count-alert"
   resource_group_name = data.azurerm_resource_group.group.name
-  scopes              = [azurerm_linux_web_app.api.id]
+  scopes              = [azurerm_monitor_autoscale_setting.api_autoscale.id]
   description         = "Action will be triggered when the instance count is too low"
   frequency           = "PT1M"  // Checks every 1 minute
   window_size         = "PT15M" // Every Check, looks back 15 minutes in history
   //TBD: How frequent do we want this alert and how far do we want it to look back.
 
   criteria {
-    metric_namespace = "Microsoft.Web/sites"
-    metric_name      = "InstanceCount"
+    metric_namespace = "Microsoft.Insights/autoscalesettings"
+    metric_name      = "ObservedCapacity"
     aggregation      = "Average"
     operator         = "LessThanOrEqual"
     // This threshold is based on the autoscale settings in app.tf

From 7b5026eafbe145ce22eccc93ad9e44785781b87e Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Tue, 22 Oct 2024 15:34:55 -0500
Subject: [PATCH 04/15] raise lower-environment threshold to 1.5; this should really fail

Co-Authored-By: Samuel Aquino <saquino@flexion.us>
Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com>
---
 operations/template/alert.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 8e4a4953e..6240232ba 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -137,7 +137,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
     operator         = "LessThanOrEqual"
     // This threshold is based on the autoscale settings in app.tf
     // How should we tune these numbers if we've scaled up higher than the initial count of 3/1?
-    threshold = local.higher_environment_level ? 2.5 : 0.5
+    threshold = local.higher_environment_level ? 2.5 : 1.5
   }
 
   action {

From df60b605bb4eb89f467a229c28abbd81f7d2fea8 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Tue, 22 Oct 2024 15:41:33 -0500
Subject: [PATCH 05/15] set lower-environment threshold back to a civilized number (0.5)

Co-Authored-By: Samuel Aquino <saquino@flexion.us>
Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com>
---
 operations/template/alert.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 6240232ba..8e4a4953e 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -137,7 +137,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
     operator         = "LessThanOrEqual"
     // This threshold is based on the autoscale settings in app.tf
     // How should we tune these numbers if we've scaled up higher than the initial count of 3/1?
-    threshold = local.higher_environment_level ? 2.5 : 1.5
+    threshold = local.higher_environment_level ? 2.5 : 0.5
   }
 
   action {

From 013ee4998569155df4a1ebacc8c88d02799077d2 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Tue, 22 Oct 2024 15:57:21 -0500
Subject: [PATCH 06/15] try severity 2 (warning) and a lower-environment threshold of 1.5

Co-Authored-By: Samuel Aquino <saquino@flexion.us>
Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com>
---
 operations/template/alert.tf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 8e4a4953e..fc201a0af 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -126,6 +126,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
   resource_group_name = data.azurerm_resource_group.group.name
   scopes              = [azurerm_monitor_autoscale_setting.api_autoscale.id]
   description         = "Action will be triggered when the instance count is too low"
+  severity            = 2       // warning
   frequency           = "PT1M"  // Checks every 1 minute
   window_size         = "PT15M" // Every Check, looks back 15 minutes in history
   //TBD: How frequent do we want this alert and how far do we want it to look back.
@@ -137,7 +138,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
     operator         = "LessThanOrEqual"
     // This threshold is based on the autoscale settings in app.tf
     // How should we tune these numbers if we've scaled up higher than the initial count of 3/1?
-    threshold = local.higher_environment_level ? 2.5 : 0.5
+    threshold = local.higher_environment_level ? 2.5 : 1.5
   }
 
   action {

From f60548947e986057b7361ebebc831833bc00b039 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Tue, 22 Oct 2024 16:00:25 -0500
Subject: [PATCH 07/15] more description

Co-Authored-By: Samuel Aquino <saquino@flexion.us>
Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com>
Co-Authored-By: jherrflexion <118225331+jherrflexion@users.noreply.github.com>
---
 operations/template/alert.tf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index fc201a0af..553c470e6 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -125,11 +125,10 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
   name                = "cdcti-${var.environment}-azure-low-instance-count-alert"
   resource_group_name = data.azurerm_resource_group.group.name
   scopes              = [azurerm_monitor_autoscale_setting.api_autoscale.id]
-  description         = "Action will be triggered when the instance count is too low"
+  description         = "The instance count in ${var.environment} is too low"
   severity            = 2       // warning
   frequency           = "PT1M"  // Checks every 1 minute
   window_size         = "PT15M" // Every Check, looks back 15 minutes in history
-  //TBD: How frequent do we want this alert and how far do we want it to look back.
 
   criteria {
     metric_namespace = "Microsoft.Insights/autoscalesettings"

From 00ad2a41198c59afc76c2437b8f9f9277361648a Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Tue, 22 Oct 2024 16:05:15 -0500
Subject: [PATCH 08/15] Set to appropriate envs and numbers

Co-Authored-By: Samuel Aquino <saquino@flexion.us>
Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com>
Co-Authored-By: jherrflexion <118225331+jherrflexion@users.noreply.github.com>
Co-Authored-By: halprin <halprin@users.noreply.github.com>
---
 operations/template/alert.tf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 553c470e6..cf479481e 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -1,5 +1,5 @@
 resource "azurerm_monitor_action_group" "notify_slack_email" {
-  count               = 1 //local.non_pr_environment ? 1 : 0
+  count               = local.non_pr_environment ? 1 : 0
   name                = "cdcti${var.environment}-actiongroup"
   resource_group_name = data.azurerm_resource_group.group.name
   short_name          = "cdcti-alerts"
@@ -121,7 +121,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a
   }
 }
 resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
-  count               = 1 //local.non_pr_environment ? 1 : 0
+  count               = local.non_pr_environment ? 1 : 0
   name                = "cdcti-${var.environment}-azure-low-instance-count-alert"
   resource_group_name = data.azurerm_resource_group.group.name
   scopes              = [azurerm_monitor_autoscale_setting.api_autoscale.id]
@@ -137,7 +137,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
     operator         = "LessThanOrEqual"
     // This threshold is based on the autoscale settings in app.tf
     // How should we tune these numbers if we've scaled up higher than the initial count of 3/1?
-    threshold = local.higher_environment_level ? 2.5 : 1.5
+    threshold = local.higher_environment_level ? 2.5 : 0.5
   }
 
   action {

From 7a13e2ac8088e768a8393c1ca66179f6378924a5 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Tue, 22 Oct 2024 17:09:41 -0500
Subject: [PATCH 09/15] Fix non-pr env logic

Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com>
Co-Authored-By: halprin <halprin@users.noreply.github.com>
---
 operations/template/main.tf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/operations/template/main.tf b/operations/template/main.tf
index ff42970ce..639f3bd35 100644
--- a/operations/template/main.tf
+++ b/operations/template/main.tf
@@ -8,7 +8,8 @@ locals {
   rs_domain_prefix               = "${local.selected_rs_environment_prefix}${length(local.selected_rs_environment_prefix) == 0 ? "" : "."}"
   higher_environment_level       = var.environment == "stg" || var.environment == "prd"
   cdc_domain_environment         = var.environment == "dev" || var.environment == "stg" || var.environment == "prd"
-  non_pr_environment             = !strcontains(var.environment, "pr")
+  // If the environment looks like pr123, regex will return matches. If there are no matches, it's a non-pr env
+  non_pr_environment = regex("pr\\d+", var.environment) != []
 }
 
 data "azurerm_resource_group" "group" {

From 5e4ff035d978168a53b10457b690960f2252c84b Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Tue, 22 Oct 2024 17:16:51 -0500
Subject: [PATCH 10/15] use regexall and length() to get the non-pr env check right

Co-Authored-By: James Gilmore <109554461+GilmoreA6@users.noreply.github.com>
---
 operations/template/main.tf | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/operations/template/main.tf b/operations/template/main.tf
index 639f3bd35..9c771f9ef 100644
--- a/operations/template/main.tf
+++ b/operations/template/main.tf
@@ -8,8 +8,9 @@ locals {
   rs_domain_prefix               = "${local.selected_rs_environment_prefix}${length(local.selected_rs_environment_prefix) == 0 ? "" : "."}"
   higher_environment_level       = var.environment == "stg" || var.environment == "prd"
   cdc_domain_environment         = var.environment == "dev" || var.environment == "stg" || var.environment == "prd"
-  // If the environment looks like pr123, regex will return matches. If there are no matches, it's a non-pr env
-  non_pr_environment = regex("pr\\d+", var.environment) != []
+
+  // If the environment looks like pr123, regexall will contain matches. If there are no matches, it's a non-pr env
+  non_pr_environment = length(regexall("pr\\d+", var.environment)) == 0
 }
 
 data "azurerm_resource_group" "group" {

From 9e7b89ff8a99ae65684e76eedc23638ec5e3cae8 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Wed, 23 Oct 2024 10:20:43 -0500
Subject: [PATCH 11/15] different format for threshold math; temporarily turn
 on alerts in PR env

Co-Authored-By: halprin <halprin@users.noreply.github.com>
---
 operations/template/alert.tf | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index f26b24878..e5ae643f7 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -1,5 +1,5 @@
 resource "azurerm_monitor_action_group" "notify_slack_email" {
-  count               = local.non_pr_environment ? 1 : 0
+  count               = 1 //local.non_pr_environment ? 1 : 0
   name                = "cdcti${var.environment}-actiongroup"
   resource_group_name = data.azurerm_resource_group.group.name
   short_name          = "cdcti-alerts"
@@ -80,7 +80,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a
   }
 }
 resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
-  count               = local.non_pr_environment ? 1 : 0
+  count               = 1 //local.non_pr_environment ? 1 : 0
   name                = "cdcti-${var.environment}-azure-low-instance-count-alert"
   resource_group_name = data.azurerm_resource_group.group.name
   scopes              = [azurerm_monitor_autoscale_setting.api_autoscale.id]
@@ -94,9 +94,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
     metric_name      = "ObservedCapacity"
     aggregation      = "Average"
     operator         = "LessThanOrEqual"
-    // This threshold is based on the autoscale settings in app.tf
-    // How should we tune these numbers if we've scaled up higher than the initial count of 3/1?
-    threshold = local.higher_environment_level ? 2.5 : 0.5
+    threshold        = azurerm_monitor_autoscale_setting.api_autoscale.profile.capacity.default - 0.5
   }
 
   action {

From 2506327702050b94cd7e0b04beb83bc5269e4d1e Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Wed, 23 Oct 2024 11:04:07 -0500
Subject: [PATCH 12/15] index the profile

---
 operations/template/alert.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 6da1d1402..9e800434c 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -135,7 +135,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
     metric_name      = "ObservedCapacity"
     aggregation      = "Average"
     operator         = "LessThanOrEqual"
-    threshold        = azurerm_monitor_autoscale_setting.api_autoscale.profile.capacity.default - 0.5
+    threshold        = azurerm_monitor_autoscale_setting.api_autoscale.profile[0].capacity.default - 0.5
   }
 
   action {

From 0caf2cf96a507a1c345ace4478166b8674b03e63 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Wed, 23 Oct 2024 11:05:21 -0500
Subject: [PATCH 13/15] index the capacity block too

---
 operations/template/alert.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 9e800434c..7a572a1a7 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -135,7 +135,7 @@ resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
     metric_name      = "ObservedCapacity"
     aggregation      = "Average"
     operator         = "LessThanOrEqual"
-    threshold        = azurerm_monitor_autoscale_setting.api_autoscale.profile[0].capacity.default - 0.5
+    threshold        = azurerm_monitor_autoscale_setting.api_autoscale.profile[0].capacity[0].default - 0.5
   }
 
   action {

From 76523858544080fa51fbdee10d38861ce2f84e43 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Wed, 23 Oct 2024 11:14:07 -0500
Subject: [PATCH 14/15] turn environment filter back on

---
 operations/template/alert.tf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 7a572a1a7..ffedb85c0 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -1,5 +1,5 @@
 resource "azurerm_monitor_action_group" "notify_slack_email" {
-  count               = 1 //local.non_pr_environment ? 1 : 0
+  count               = local.non_pr_environment ? 1 : 0
   name                = "cdcti${var.environment}-actiongroup"
   resource_group_name = data.azurerm_resource_group.group.name
   short_name          = "cdcti-alerts"
@@ -121,7 +121,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a
   }
 }
 resource "azurerm_monitor_metric_alert" "low_instance_count_alert" {
-  count               = 1 //local.non_pr_environment ? 1 : 0
+  count               = local.non_pr_environment ? 1 : 0
   name                = "cdcti-${var.environment}-azure-low-instance-count-alert"
   resource_group_name = data.azurerm_resource_group.group.name
   scopes              = [azurerm_monitor_autoscale_setting.api_autoscale.id]

From f463565f13a1307221e8ecf3e7260f6df1d2a465 Mon Sep 17 00:00:00 2001
From: Sylvie <sschuresko@flexion.us>
Date: Wed, 23 Oct 2024 12:11:40 -0500
Subject: [PATCH 15/15] let's be a little more precise with this regex

Co-Authored-By: Samuel Aquino <saquino@flexion.us>
---
 operations/template/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operations/template/main.tf b/operations/template/main.tf
index 9c771f9ef..bc083cff3 100644
--- a/operations/template/main.tf
+++ b/operations/template/main.tf
@@ -10,7 +10,7 @@ locals {
   cdc_domain_environment         = var.environment == "dev" || var.environment == "stg" || var.environment == "prd"
 
   // If the environment looks like pr123, regexall will contain matches. If there are no matches, it's a non-pr env
-  non_pr_environment = length(regexall("pr\\d+", var.environment)) == 0
+  non_pr_environment = length(regexall("^pr\\d+", var.environment)) == 0
 }
 
 data "azurerm_resource_group" "group" {