Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

5xx error alerts #1466

Merged
merged 14 commits into from
Oct 23, 2024
95 changes: 95 additions & 0 deletions operations/template/alert.tf
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,98 @@ resource "azurerm_monitor_metric_alert" "azure_4XX_alert" {
]
}
}

# Metric alert on HTTP 5XX responses from the API web app.
# Created only when local.non_pr_environment is true (count is 1, otherwise 0).
resource "azurerm_monitor_metric_alert" "azure_5XX_alert" {
  count               = local.non_pr_environment ? 1 : 0
  name                = "cdcti-${var.environment}-azure-http-5XX-alert"
  resource_group_name = data.azurerm_resource_group.group.name
  scopes              = [azurerm_linux_web_app.api.id]
  description         = "Action will be triggered when Http Status Code 5XX is greater than or equal to 1"
  frequency           = "PT1M" # Checks every 1 min
  window_size         = "PT5M" # Every check looks back 5 min for 5xx errors

  # Fire as soon as the aggregated Http5xx value reaches the threshold of 1.
  # NOTE(review): Azure's metric reference lists "Total" as the aggregation for
  # Http5xx; confirm that "Count" yields the intended number of 5xx responses
  # rather than the number of metric samples in the window.
  criteria {
    metric_namespace = "Microsoft.Web/sites"
    metric_name      = "Http5xx"
    aggregation      = "Count"
    operator         = "GreaterThanOrEqual"
    threshold        = 1
  }

  # Indexed with count.index, so the notify_slack_email action group is
  # presumably declared with the same count expression - confirm.
  action {
    action_group_id = azurerm_monitor_action_group.notify_slack_email[count.index].id
  }

  lifecycle {
    # Ignore changes to tags because the CDC sets these automagically
    ignore_changes = [
      tags["business_steward"],
      tags["center"],
      tags["environment"],
      tags["escid"],
      tags["funding_source"],
      tags["pii_data"],
      tags["security_compliance"],
      tags["security_steward"],
      tags["support_group"],
      tags["system"],
      tags["technical_steward"],
      tags["zone"]
    ]
  }
}

# Log-search alert over the API web app's console logs: counts entries whose
# unpacked JSON "level" field is ERROR and notifies the Slack/email action group.
# Created only when local.non_pr_environment is true (count is 1, otherwise 0).
resource "azurerm_monitor_scheduled_query_rules_alert" "ti-log-errors-alert" {
  count = local.non_pr_environment ? 1 : 0

  # Identity and placement
  name                = "cdcti-${var.environment}-log-errors-alert"
  resource_group_name = data.azurerm_resource_group.group.name
  location            = data.azurerm_resource_group.group.location

  # What is being watched
  data_source_id = azurerm_linux_web_app.api.id
  description    = "Alert when total errors cross threshold"
  enabled        = true
  severity       = 3

  # Evaluation schedule
  frequency               = 10
  time_window             = 30
  auto_mitigation_enabled = true

  # Parses ResultDescription as JSON, keeps rows with level ERROR, and
  # collapses them into a single count row.
  query = <<-QUERY
    AppServiceConsoleLogs
    | where TimeGenerated >= ago(00.001m)
    and TimeGenerated <= now()
    | project columnifexists("ResultDescription", 'default_value')
    | project JsonResult = parse_json(ResultDescription)
    | evaluate bag_unpack(JsonResult) : (level:string)
    | where level in ( 'ERROR' )
    | summarize count()
    QUERY

  # NOTE(review): with no metric_trigger this threshold is presumably compared
  # against the number of rows returned; the query ends in `summarize count()`,
  # which returns one row - confirm the alert can actually exceed 1.
  trigger {
    operator  = "GreaterThan"
    threshold = 1
  }

  action {
    action_group  = [azurerm_monitor_action_group.notify_slack_email[count.index].id]
    email_subject = "${var.environment}: TI log errors detected!"
  }

  lifecycle {
    # The tags below are managed by CDC, so drift on them is ignored.
    ignore_changes = [
      tags["business_steward"],
      tags["center"],
      tags["environment"],
      tags["escid"],
      tags["funding_source"],
      tags["pii_data"],
      tags["security_compliance"],
      tags["security_steward"],
      tags["support_group"],
      tags["system"],
      tags["technical_steward"],
      tags["zone"]
    ]
  }
}