From 3be0a567e05dc02124f916a0d43a471d6d73dbce Mon Sep 17 00:00:00 2001
From: James Herr
Date: Tue, 22 Oct 2024 16:13:14 -0500
Subject: [PATCH 1/6] Initial setup for Azure Log Errors Alert

---
Reviewer note (applies to the whole series): the line breaks and indentation
below were reconstructed from a copy whose whitespace had been collapsed.
Hunk line counts were re-checked against every @@ header, but leading
whitespace is a best guess; verify it against the tree, or apply with
`git am --ignore-whitespace`.

 operations/template/alert.tf | 52 ++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 109e27d02..52498ef04 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -120,3 +120,55 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a
     ]
   }
 }
+resource "azurerm_monitor_scheduled_query_rules_alert" "ti-log-errors-alert" {
+  count               = local.non_pr_environment ? 1 : 0
+  name                = "cdcti-${var.environment}-log-errors-alert"
+  location            = data.azurerm_resource_group.group.location
+  resource_group_name = data.azurerm_resource_group.group.name
+
+  action {
+    action_group  = [azurerm_monitor_action_group.notify_slack_email[count.index].id]
+    email_subject = "${var.environment}: TI log errors detected!"
+  }
+
+  data_source_id = azurerm_linux_web_app.api.id
+  description    = "Alert when total errors cross threshold"
+  enabled        = true
+
+  query = <<-QUERY
+      AppServiceConsoleLogs
+      | project JsonResult = parse_json(ResultDescription) | evaluate bag_unpack(JsonResult)
+      | where level == 'ERROR'
+        and @timestamp >= ago(30m)
+        and @timestamp <= now()
+      | summarize count()
+  QUERY
+
+  severity                = 3
+  frequency               = 10
+  time_window             = 30
+  auto_mitigation_enabled = true
+
+  trigger {
+    operator  = "GreaterThan"
+    threshold = 1
+  }
+
+  # below tags are managed by CDC
+  lifecycle {
+    ignore_changes = [
+      tags["business_steward"],
+      tags["center"],
+      tags["environment"],
+      tags["escid"],
+      tags["funding_source"],
+      tags["pii_data"],
+      tags["security_compliance"],
+      tags["security_steward"],
+      tags["support_group"],
+      tags["system"],
+      tags["technical_steward"],
+      tags["zone"]
+    ]
+  }
+}

From bb5eb321168a067a1b680081a691fda0a088865d Mon Sep 17 00:00:00 2001
From: James Herr
Date: Tue, 22 Oct 2024 16:32:44 -0500
Subject: [PATCH 2/6] Changed logic to match new changes

---
 operations/template/alert.tf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 52498ef04..c9a3796c2 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -139,8 +139,8 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ti-log-errors-alert" {
       AppServiceConsoleLogs
       | project JsonResult = parse_json(ResultDescription) | evaluate bag_unpack(JsonResult)
       | where level == 'ERROR'
-        and @timestamp >= ago(30m)
-        and @timestamp <= now()
+        and TimeGenerated >= ago(30m)
+        and TimeGenerated <= now()
       | summarize count()
   QUERY
 

From bb04baa3a2ce152e00c843e71192a0ab7c8a989d Mon Sep 17 00:00:00 2001
From: James Herr
Date: Wed, 23 Oct 2024 09:54:48 -0500
Subject: [PATCH 3/6] Fixed query

---
 operations/template/alert.tf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index c9a3796c2..a6c99b505 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -86,9 +86,9 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a
 
   query = <<-QUERY
       AppServiceConsoleLogs
-      | where ResultDescription has "FATAL: The access token has expired."
-        and TimeGenerated >= ago(30m)
+      | where TimeGenerated >= ago(30m)
         and TimeGenerated <= now()
+      | where ResultDescription has "FATAL: The access token has expired."
       | summarize count()
   QUERY
 

From d4c7553a01ab699d44d1dd95f87b671a3d5eec1c Mon Sep 17 00:00:00 2001
From: James Herr
Date: Wed, 23 Oct 2024 10:04:54 -0500
Subject: [PATCH 4/6] Moved TimeGenerated

---
 operations/template/alert.tf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index a6c99b505..3d3316c47 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -137,10 +137,10 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ti-log-errors-alert" {
 
   query = <<-QUERY
       AppServiceConsoleLogs
+      | where TimeGenerated >= ago(30m)
+        and TimeGenerated <= now()
       | project JsonResult = parse_json(ResultDescription) | evaluate bag_unpack(JsonResult)
       | where level == 'ERROR'
-        and TimeGenerated >= ago(30m)
-        and TimeGenerated <= now()
       | summarize count()
   QUERY
 

From 65369cb86f92e411194f4bdd0c845517c4a74e74 Mon Sep 17 00:00:00 2001
From: James Herr
Date: Wed, 23 Oct 2024 11:11:46 -0500
Subject: [PATCH 5/6] Added null check to query

---
 operations/template/alert.tf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index 3d3316c47..c028c0306 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -86,9 +86,9 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "database_token_expired_a
 
   query = <<-QUERY
       AppServiceConsoleLogs
-      | where TimeGenerated >= ago(30m)
-        and TimeGenerated <= now()
       | where ResultDescription has "FATAL: The access token has expired."
+        and TimeGenerated >= ago(30m)
+        and TimeGenerated <= now()
       | summarize count()
   QUERY
 
@@ -140,7 +140,8 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ti-log-errors-alert" {
       | where TimeGenerated >= ago(30m)
         and TimeGenerated <= now()
       | project JsonResult = parse_json(ResultDescription) | evaluate bag_unpack(JsonResult)
-      | where isnotnull(level)
-        and level in ( 'ERROR' )
+      | where isnotnull(level)
+        and level in ( 'ERROR' )
       | summarize count()
   QUERY
 

From eff3a149ed6651b4d2d9ada04b6dd07b5bf54729 Mon Sep 17 00:00:00 2001
From: James Herr
Date: Wed, 23 Oct 2024 11:39:12 -0500
Subject: [PATCH 6/6] Added column exists check

---
Reviewer changes relative to the patch as submitted:

 * The query keeps `ago(30m)`. The submitted hunk replaced it with
   `ago(00.001m)` (about 0.06 s), which looks like a debugging leftover and
   would leave the rule with essentially no rows to evaluate even though
   `time_window` is still 30 minutes.
 * `columnifexists(...)` is spelled `column_ifexists(...)` (the former is the
   deprecated alias) and its result is projected under the explicit name
   `ResultDescription`, so the following `parse_json(ResultDescription)`
   still resolves when the fallback value is used.
 * The hunk header and diffstat were recomputed for the above. The post-image
   blob id on the `index` line predates these changes and is informational
   only.

 operations/template/alert.tf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/operations/template/alert.tf b/operations/template/alert.tf
index c028c0306..aa99bd167 100644
--- a/operations/template/alert.tf
+++ b/operations/template/alert.tf
@@ -139,9 +139,10 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ti-log-errors-alert" {
       AppServiceConsoleLogs
       | where TimeGenerated >= ago(30m)
         and TimeGenerated <= now()
-      | project JsonResult = parse_json(ResultDescription) | evaluate bag_unpack(JsonResult)
-      | where isnotnull(level)
-        and level in ( 'ERROR' )
+      | project ResultDescription = column_ifexists("ResultDescription", 'default_value')
+      | project JsonResult = parse_json(ResultDescription)
+      | evaluate bag_unpack(JsonResult) : (level:string)
+      | where level in ( 'ERROR' )
       | summarize count()
   QUERY
 