From 58f58e52d0ad229137103af976e0e98e02f54e69 Mon Sep 17 00:00:00 2001 From: ciuffagianluca <113357981+ciuffagianluca@users.noreply.github.com> Date: Fri, 17 Jan 2025 09:48:02 +0100 Subject: [PATCH] fix: CHK-3593 fix ecommerce/wallet queue alert query (#2646) * fix query * fix storage query * write new query alert for expiration queue * set different query for expiration queue and other kind of queue * fix properties name * fix query * fix expiration query * fix expiration alert query for wallet * fix expiration alert query for wallet * restore outcome waiting values group * add action group to expiration queue * fix action group * restore docs comment * restore module name for queue * Update src/domains/pay-wallet-common/03_storage.tf Co-authored-by: Pietro Tota <115724836+pietro-tota@users.noreply.github.com> * add time window to query * Update src/domains/ecommerce-common/03_storage.tf Co-authored-by: Pietro Tota <115724836+pietro-tota@users.noreply.github.com> * set all queues that consider visibility timeout greater than 0 to the alert group with different timestamp for write and delete * add comma * fix query params for ecommerce * add email subject to ecommerce queue enqueue rate alert * fix email subject * Update src/domains/pay-wallet-common/03_storage.tf remove start of day --------- Co-authored-by: Gianluca Ciuffa Co-authored-by: Pietro Tota <115724836+pietro-tota@users.noreply.github.com> Co-authored-by: Gianluca Ciuffa Co-authored-by: Simone infante <52280205+infantesimone@users.noreply.github.com> Co-authored-by: Gianluca Ciuffa --- src/domains/ecommerce-common/03_storage.tf | 134 +++++++++++++++----- src/domains/pay-wallet-common/03_storage.tf | 117 ++++++++++++++--- 2 files changed, 202 insertions(+), 49 deletions(-) diff --git a/src/domains/ecommerce-common/03_storage.tf b/src/domains/ecommerce-common/03_storage.tf index dfe6f137e..cdceecb08 100644 --- a/src/domains/ecommerce-common/03_storage.tf +++ b/src/domains/ecommerce-common/03_storage.tf @@ 
-302,70 +302,128 @@ resource "azurerm_monitor_diagnostic_setting" "ecommerce_transient_queue_diagnos locals { queue_transient_alert_props = var.env_short == "p" ? [ { - "queue_key" = "transactions-expiration-queue" + "queue_key" = "transaction-notifications-queue" "severity" = 1 "time_window" = 30 "frequency" = 15 "threshold" = 10 }, { - "queue_key" = "transaction-notifications-queue" + "queue_key" = "transactions-close-payment-queue" "severity" = 1 "time_window" = 30 "frequency" = 15 "threshold" = 10 }, { - "queue_key" = "notifications-service-retry-queue" + "queue_key" = "transactions-refund-queue" "severity" = 1 "time_window" = 30 "frequency" = 15 "threshold" = 10 - }, + } + ] : [] +} + +# Queue size: Ecommerce - ecommerce queues enqueues rate alert +resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_transient_enqueue_rate_alert" { + for_each = { for q in local.queue_transient_alert_props : q.queue_key => q } + name = "${local.project}-${each.value.queue_key}-rate-alert" + resource_group_name = azurerm_resource_group.storage_ecommerce_rg.name + location = var.location + + action { + action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.ecommerce_opsgenie[0].id] + email_subject = "[eCommerce] Enqueue rate for transient queue too high (instant processing)" + custom_webhook_payload = "{}" + } + data_source_id = module.ecommerce_storage_transient.id + description = format("Enqueuing rate for queue %s > ${each.value.threshold} during last ${each.value.time_window} minutes", replace("${each.value.queue_key}", "-", " ")) + enabled = true + query = format(<<-QUERY + let OpCountForQueue = (operation: string, queueKey: string) { + StorageQueueLogs + | where OperationName == operation and ObjectKey startswith queueKey + | summarize count() + | project count_ + | extend dummy=1 + }; + let PutMessages = (queueName: string) { + OpCountForQueue("PutMessage", queueName) + | project 
PutCount = count_ + | extend dummy = 1 + }; + let DeletedMessages = (queueName: string) { + OpCountForQueue("DeleteMessage", queueName) + | project DeleteCount = count_ + | extend dummy = 1 + }; + let MessageRateForQueue = (queueKey: string) { + PutMessages(queueKey) + | join kind=inner (DeletedMessages(queueKey)) on dummy + | extend Diff = PutCount - DeleteCount + | project PutCount, DeleteCount, Diff + }; + MessageRateForQueue("%s") + | where Diff > ${each.value.threshold} + QUERY + , "/${module.ecommerce_storage_transient.name}/${local.project}-${each.value.queue_key}" + ) + severity = each.value.severity + frequency = each.value.frequency + time_window = each.value.time_window + trigger { + operator = "GreaterThan" + threshold = 0 + } +} + +locals { + queue_expiration_alert_props = var.env_short == "p" ? [ { - "queue_key" = "transaction-notifications-retry-queue" + "queue_key" = "transactions-expiration-queue" "severity" = 1 - "time_window" = 30 + "time_window" = 15 "frequency" = 15 - "threshold" = 10 + "threshold" = 40 }, { - "queue_key" = "transactions-close-payment-queue" + "queue_key" = "notifications-service-retry-queue" "severity" = 1 "time_window" = 30 "frequency" = 15 "threshold" = 10 }, { - "queue_key" = "transactions-close-payment-retry-queue" + "queue_key" = "transaction-notifications-retry-queue" "severity" = 1 "time_window" = 30 "frequency" = 15 - "threshold" = 10 + "threshold" = 20 }, { - "queue_key" = "transactions-refund-queue" + "queue_key" = "transactions-refund-retry-queue" "severity" = 1 "time_window" = 30 "frequency" = 15 "threshold" = 10 }, { - "queue_key" = "transactions-refund-retry-queue" + "queue_key" = "transaction-auth-outcome-waiting-queue" "severity" = 1 "time_window" = 30 "frequency" = 15 - "threshold" = 10 + "threshold" = 40 }, { "queue_key" = "transaction-auth-requested-queue" "severity" = 1 "time_window" = 30 "frequency" = 15 - "threshold" = 10 + "threshold" = 400 }, { - "queue_key" = "transaction-auth-outcome-waiting-queue" + 
"queue_key" = "transactions-close-payment-retry-queue" "severity" = 1 "time_window" = 30 "frequency" = 15 @@ -375,33 +433,51 @@ locals { } # Queue size: Ecommerce - ecommerce queues enqueues rate alert -resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_transient_enqueue_rate_alert" { - for_each = { for q in local.queue_transient_alert_props : q.queue_key => q } +resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_enqueue_rate_alert_visibility_timeout_diff" { + for_each = { for q in local.queue_expiration_alert_props : q.queue_key => q } name = "${local.project}-${each.value.queue_key}-rate-alert" resource_group_name = azurerm_resource_group.storage_ecommerce_rg.name location = var.location action { action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.ecommerce_opsgenie[0].id] - email_subject = "Email Header" + email_subject = "[eCommerce] Enqueue rate for transient queue too high (delayed processing)" custom_webhook_payload = "{}" } data_source_id = module.ecommerce_storage_transient.id description = format("Enqueuing rate for queue %s > ${each.value.threshold} during last ${each.value.time_window} minutes", replace("${each.value.queue_key}", "-", " ")) enabled = true query = format(<<-QUERY - let OpCountForQueue = (operation: string, queueKey: string) { - StorageQueueLogs - | where OperationName == operation and ObjectKey startswith queueKey - | summarize count() + let endDelete = datetime_local_to_utc(now(), 'Europe/Rome'); + let startDelete = endDelete - ${each.value.time_window}m; + let endPut = startDelete; + let startPut = endPut - ${each.value.time_window}m; + let OpCountForQueue = (operation: string, queueKey: string, timestart: datetime, timeend: datetime) { + StorageQueueLogs + | where OperationName == operation and ObjectKey startswith queueKey + | where TimeGenerated between(timestart .. 
timeend) + | summarize count() + | project count_ + | extend dummy=1 }; - let MessageRateForQueue = (queueKey: string) { - OpCountForQueue("PutMessage", queueKey) - | join kind=fullouter OpCountForQueue("DeleteMessage", queueKey) on count_ - | project name = queueKey, Count = count_ - count_1 + let PutMessages = (queueName: string, timestart: datetime, timeend: datetime) { + OpCountForQueue("PutMessage", queueName, timestart, timeend) + | project PutCount = count_ + | extend dummy = 1 }; - MessageRateForQueue("%s") - | where Count > ${each.value.threshold} + let DeletedMessages = (queueName: string, timestart: datetime, timeend: datetime) { + OpCountForQueue("DeleteMessage", queueName, timestart, timeend) + | project DeleteCount = count_ + | extend dummy = 1 + }; + let MessageRateForQueue = (queueKey: string, timestartPut: datetime, timeendPut: datetime, timestartDelete: datetime, timeendDelete: datetime) { + PutMessages(queueKey, timestartPut, timeendPut) + | join kind=inner (DeletedMessages(queueKey, timestartDelete, timeendDelete)) on dummy + | extend Diff = PutCount - DeleteCount + | project PutCount, DeleteCount, Diff + }; + MessageRateForQueue("%s", startPut, endPut, startDelete, endDelete) + | where Diff > ${each.value.threshold} QUERY , "/${module.ecommerce_storage_transient.name}/${local.project}-${each.value.queue_key}" ) @@ -480,7 +556,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_deadletter_fil action { action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.ecommerce_opsgenie[0].id] - email_subject = "Email Header" + email_subject = "[eCommerce] Writes for dead letter queue too high" custom_webhook_payload = "{}" } data_source_id = module.ecommerce_storage_deadletter.id diff --git a/src/domains/pay-wallet-common/03_storage.tf b/src/domains/pay-wallet-common/03_storage.tf index c3b1e52f4..aeb49b70f 100644 --- 
a/src/domains/pay-wallet-common/03_storage.tf +++ b/src/domains/pay-wallet-common/03_storage.tf @@ -147,19 +147,11 @@ locals { action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id] }, { - queue_key = "expiration-queue" - severity = 1 - time_window = 30 - frequency = 15 - threshold = 10 - action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.payment_wallet_opsgenie[0].id] - }, - { - queue_key = "logged-action-dead-letter-queue" - severity = 1 - time_window = 30 - frequency = 15 - threshold = 10 + queue_key = "logged-action-dead-letter-queue" + severity = 1 + time_window = 30 + frequency = 15 + threshold = 10 action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.payment_wallet_opsgenie[0].id] }, ] : [] @@ -182,17 +174,102 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "pay_wallet_enqueue_rate_ enabled = true query = format(<<-QUERY let OpCountForQueue = (operation: string, queueKey: string) { - StorageQueueLogs - | where OperationName == operation and ObjectKey startswith queueKey - | summarize count() + StorageQueueLogs + | where OperationName == operation and ObjectKey startswith queueKey + | summarize count() + | project count_ + | extend dummy=1 + }; + let PutMessages = (queueName: string) { + OpCountForQueue("PutMessage", queueName) + | project PutCount = count_ + | extend dummy = 1 + }; + let DeletedMessages = (queueName: string) { + OpCountForQueue("DeleteMessage", queueName) + | project DeleteCount = count_ + | extend dummy = 1 }; let MessageRateForQueue = (queueKey: string) { - OpCountForQueue("PutMessage", queueKey) - | join kind=fullouter OpCountForQueue("DeleteMessage", queueKey) on count_ - | project name = queueKey, Count = count_ - count_1 + PutMessages(queueKey) + | join kind=inner (DeletedMessages(queueKey)) on dummy + | extend Diff = 
PutCount - DeleteCount + | project PutCount, DeleteCount, Diff }; MessageRateForQueue("%s") - | where Count > ${each.value.threshold} + | where Diff > ${each.value.threshold} + QUERY + , "/${module.pay_wallet_storage[0].name}/${local.project}-${each.value.queue_key}" + ) + severity = each.value.severity + frequency = each.value.frequency + time_window = each.value.time_window + trigger { + operator = "GreaterThan" + threshold = 0 + } +} + + +locals { + queue_expiration_alert_props = var.env_short == "p" ? [ + { + queue_key = "expiration-queue" + severity = 1 + time_window = 30 + frequency = 15 + threshold = 10 + action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.payment_wallet_opsgenie[0].id] + }, + ] : [] +} + +# Queue size: wallet - wallet queues enqueues rate alert +resource "azurerm_monitor_scheduled_query_rules_alert" "pay_wallet_enqueue_rate_alert_visibility_timeout_diff" { + for_each = var.is_feature_enabled.storage ? 
{ for q in local.queue_expiration_alert_props : q.queue_key => q } : {} + name = "${local.project}-${each.value.queue_key}-rate-alert" + resource_group_name = azurerm_resource_group.storage_pay_wallet_rg.name + location = var.location + + action { + action_group = each.value.action_group + email_subject = "[pay-wallet] Enqueue rate for wallet queue too high" + custom_webhook_payload = "{}" + } + data_source_id = module.pay_wallet_storage[0].id + description = format("Enqueuing rate for queue %s > ${each.value.threshold} during last ${each.value.time_window} minutes", replace("${each.value.queue_key}", "-", " ")) + enabled = true + query = format(<<-QUERY + let endDelete = datetime_local_to_utc(now(), 'Europe/Rome'); + let startDelete = endDelete - ${each.value.time_window}m; + let endPut = startDelete; + let startPut = endPut - ${each.value.time_window}m; + let OpCountForQueue = (operation: string, queueKey: string, timestart: datetime, timeend: datetime) { + StorageQueueLogs + | where OperationName == operation and ObjectKey startswith queueKey + | where TimeGenerated between(timestart .. 
timeend) + | summarize count() + | project count_ + | extend dummy=1 + }; + let PutMessages = (queueName: string, timestart: datetime, timeend: datetime) { + OpCountForQueue("PutMessage", queueName, timestart, timeend) + | project PutCount = count_ + | extend dummy = 1 + }; + let DeletedMessages = (queueName: string, timestart: datetime, timeend: datetime) { + OpCountForQueue("DeleteMessage", queueName, timestart, timeend) + | project DeleteCount = count_ + | extend dummy = 1 + }; + let MessageRateForQueue = (queueKey: string, timestartPut: datetime, timeendPut: datetime, timestartDelete: datetime, timeendDelete: datetime) { + PutMessages(queueKey, timestartPut, timeendPut) + | join kind=inner (DeletedMessages(queueKey, timestartDelete, timeendDelete)) on dummy + | extend Diff = PutCount - DeleteCount + | project PutCount, DeleteCount, Diff + }; + MessageRateForQueue("%s", startPut, endPut, startDelete, endDelete) + | where Diff > ${each.value.threshold} QUERY , "/${module.pay_wallet_storage[0].name}/${local.project}-${each.value.queue_key}" )