Skip to content

Commit

Permalink
fix: CHK-3593 fix ecommerce/wallet queue alert query (#2646)
Browse files Browse the repository at this point in the history
* fix query

* fix storage query

* write new query alert for expiration queue

* set different queries for the expiration queue and the other kinds of queues

* fix properties name

* fix query

* fix expiration query

* fix expiration alert query for wallet

* fix expiration alert query for wallet

* restore outcome waiting values group

* add action group to expiration queue

* fix action group

* restore docs comment

* restore module name for queue

* Update src/domains/pay-wallet-common/03_storage.tf

Co-authored-by: Pietro Tota <[email protected]>

* add time window to query

* Update src/domains/ecommerce-common/03_storage.tf

Co-authored-by: Pietro Tota <[email protected]>

* move all queues that use a visibility timeout greater than 0 to the alert group that uses different time windows for writes and deletes

* add comma

* fix query params for ecommerce

* add email subject to ecommerce queue enqueue rate alert

* fix email subject

* Update src/domains/pay-wallet-common/03_storage.tf

remove start of day

---------

Co-authored-by: Gianluca Ciuffa <[email protected]>
Co-authored-by: Pietro Tota <[email protected]>
Co-authored-by: Gianluca Ciuffa <[email protected]>
Co-authored-by: Simone infante <[email protected]>
Co-authored-by: Gianluca Ciuffa <[email protected]>
  • Loading branch information
6 people authored Jan 17, 2025
1 parent 6024532 commit 58f58e5
Show file tree
Hide file tree
Showing 2 changed files with 202 additions and 49 deletions.
134 changes: 105 additions & 29 deletions src/domains/ecommerce-common/03_storage.tf
Original file line number Diff line number Diff line change
Expand Up @@ -302,70 +302,128 @@ resource "azurerm_monitor_diagnostic_setting" "ecommerce_transient_queue_diagnos
locals {
queue_transient_alert_props = var.env_short == "p" ? [
{
"queue_key" = "transactions-expiration-queue"
"queue_key" = "transaction-notifications-queue"
"severity" = 1
"time_window" = 30
"frequency" = 15
"threshold" = 10
},
{
"queue_key" = "transaction-notifications-queue"
"queue_key" = "transactions-close-payment-queue"
"severity" = 1
"time_window" = 30
"frequency" = 15
"threshold" = 10
},
{
"queue_key" = "notifications-service-retry-queue"
"queue_key" = "transactions-refund-queue"
"severity" = 1
"time_window" = 30
"frequency" = 15
"threshold" = 10
},
}
] : []
}

# eCommerce transient queues - enqueue rate alert.
# One scheduled-query rule is created per entry of local.queue_transient_alert_props
# (keyed by queue_key). The KQL query counts PutMessage and DeleteMessage operations
# logged for the queue and returns a row only when (puts - deletes) exceeds the
# configured threshold; the rule fires as soon as the query returns at least one row.
resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_transient_enqueue_rate_alert" {
  for_each = { for props in local.queue_transient_alert_props : props.queue_key => props }

  name                = "${local.project}-${each.key}-rate-alert"
  location            = var.location
  resource_group_name = azurerm_resource_group.storage_ecommerce_rg.name
  data_source_id      = module.ecommerce_storage_transient.id
  enabled             = true

  # Human-readable summary; dashes in the queue key are shown as spaces.
  description = "Enqueuing rate for queue ${replace(each.key, "-", " ")} > ${each.value.threshold} during last ${each.value.time_window} minutes"

  # Per-queue tuning taken from the alert properties entry.
  severity    = each.value.severity
  frequency   = each.value.frequency
  time_window = each.value.time_window

  action {
    email_subject          = "[eCommerce] Enqueue rate for transient queue too high (instant processing)"
    custom_webhook_payload = "{}"
    action_group = [
      data.azurerm_monitor_action_group.email.id,
      data.azurerm_monitor_action_group.slack.id,
      azurerm_monitor_action_group.ecommerce_opsgenie[0].id,
    ]
  }

  # ObjectKey is matched by prefix: "/<storage account name>/<project>-<queue key>".
  query = <<-QUERY
  let OpCountForQueue = (operation: string, queueKey: string) {
  StorageQueueLogs
  | where OperationName == operation and ObjectKey startswith queueKey
  | summarize count()
  | project count_
  | extend dummy=1
  };
  let PutMessages = (queueName: string) {
  OpCountForQueue("PutMessage", queueName)
  | project PutCount = count_
  | extend dummy = 1
  };
  let DeletedMessages = (queueName: string) {
  OpCountForQueue("DeleteMessage", queueName)
  | project DeleteCount = count_
  | extend dummy = 1
  };
  let MessageRateForQueue = (queueKey: string) {
  PutMessages(queueKey)
  | join kind=inner (DeletedMessages(queueKey)) on dummy
  | extend Diff = PutCount - DeleteCount
  | project PutCount, DeleteCount, Diff
  };
  MessageRateForQueue("/${module.ecommerce_storage_transient.name}/${local.project}-${each.key}")
  | where Diff > ${each.value.threshold}
  QUERY

  # Fire when the query yields any row, i.e. when Diff exceeded the threshold.
  trigger {
    operator  = "GreaterThan"
    threshold = 0
  }
}

locals {
queue_expiration_alert_props = var.env_short == "p" ? [
{
"queue_key" = "transaction-notifications-retry-queue"
"queue_key" = "transactions-expiration-queue"
"severity" = 1
"time_window" = 30
"time_window" = 15
"frequency" = 15
"threshold" = 10
"threshold" = 40
},
{
"queue_key" = "transactions-close-payment-queue"
"queue_key" = "notifications-service-retry-queue"
"severity" = 1
"time_window" = 30
"frequency" = 15
"threshold" = 10
},
{
"queue_key" = "transactions-close-payment-retry-queue"
"queue_key" = "transaction-notifications-retry-queue"
"severity" = 1
"time_window" = 30
"frequency" = 15
"threshold" = 10
"threshold" = 20
},
{
"queue_key" = "transactions-refund-queue"
"queue_key" = "transactions-refund-retry-queue"
"severity" = 1
"time_window" = 30
"frequency" = 15
"threshold" = 10
},
{
"queue_key" = "transactions-refund-retry-queue"
"queue_key" = "transaction-auth-outcome-waiting-queue"
"severity" = 1
"time_window" = 30
"frequency" = 15
"threshold" = 10
"threshold" = 40
},
{
"queue_key" = "transaction-auth-requested-queue"
"severity" = 1
"time_window" = 30
"frequency" = 15
"threshold" = 10
"threshold" = 400
},
{
"queue_key" = "transaction-auth-outcome-waiting-queue"
"queue_key" = "transactions-close-payment-retry-queue"
"severity" = 1
"time_window" = 30
"frequency" = 15
Expand All @@ -375,33 +433,51 @@ locals {
}

# Queue size: Ecommerce - ecommerce queues enqueues rate alert
resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_transient_enqueue_rate_alert" {
for_each = { for q in local.queue_transient_alert_props : q.queue_key => q }
resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_enqueue_rate_alert_visibility_timeout_diff" {
for_each = { for q in local.queue_expiration_alert_props : q.queue_key => q }
name = "${local.project}-${each.value.queue_key}-rate-alert"
resource_group_name = azurerm_resource_group.storage_ecommerce_rg.name
location = var.location

action {
action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.ecommerce_opsgenie[0].id]
email_subject = "Email Header"
email_subject = "[eCommerce] Enqueue rate for transient queue too high (delayed processing)"
custom_webhook_payload = "{}"
}
data_source_id = module.ecommerce_storage_transient.id
description = format("Enqueuing rate for queue %s > ${each.value.threshold} during last ${each.value.time_window} minutes", replace("${each.value.queue_key}", "-", " "))
enabled = true
query = format(<<-QUERY
let OpCountForQueue = (operation: string, queueKey: string) {
StorageQueueLogs
| where OperationName == operation and ObjectKey startswith queueKey
| summarize count()
let endDelete = datetime_local_to_utc(now(), 'Europe/Rome');
let startDelete = endDelete - ${each.value.time_window}m;
let endPut = startDelete;
let startPut = endPut - ${each.value.time_window}m;
let OpCountForQueue = (operation: string, queueKey: string, timestart: datetime, timeend: datetime) {
StorageQueueLogs
| where OperationName == operation and ObjectKey startswith queueKey
| where TimeGenerated between(timestart .. timeend)
| summarize count()
| project count_
| extend dummy=1
};
let MessageRateForQueue = (queueKey: string) {
OpCountForQueue("PutMessage", queueKey)
| join kind=fullouter OpCountForQueue("DeleteMessage", queueKey) on count_
| project name = queueKey, Count = count_ - count_1
let PutMessages = (queueName: string, timestart: datetime, timeend: datetime) {
OpCountForQueue("PutMessage", queueName, timestart, timeend)
| project PutCount = count_
| extend dummy = 1
};
MessageRateForQueue("%s")
| where Count > ${each.value.threshold}
let DeletedMessages = (queueName: string, timestart: datetime, timeend: datetime) {
OpCountForQueue("DeleteMessage", queueName, timestart, timeend)
| project DeleteCount = count_
| extend dummy = 1
};
let MessageRateForQueue = (queueKey: string, timestartPut: datetime, timeendPut: datetime, timestartDelete: datetime, timeendDelete: datetime) {
PutMessages(queueKey, timestartPut, timeendPut)
| join kind=inner (DeletedMessages(queueKey, timestartDelete, timeendDelete)) on dummy
| extend Diff = PutCount - DeleteCount
| project PutCount, DeleteCount, Diff
};
MessageRateForQueue("%s", startPut, endPut, startDelete, endDelete)
| where Diff > ${each.value.threshold}
QUERY
, "/${module.ecommerce_storage_transient.name}/${local.project}-${each.value.queue_key}"
)
Expand Down Expand Up @@ -480,7 +556,7 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "ecommerce_deadletter_fil

action {
action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.ecommerce_opsgenie[0].id]
email_subject = "Email Header"
email_subject = "[eCommerce] Writes for dead letter queue too high"
custom_webhook_payload = "{}"
}
data_source_id = module.ecommerce_storage_deadletter.id
Expand Down
117 changes: 97 additions & 20 deletions src/domains/pay-wallet-common/03_storage.tf
Original file line number Diff line number Diff line change
Expand Up @@ -147,19 +147,11 @@ locals {
action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id]
},
{
queue_key = "expiration-queue"
severity = 1
time_window = 30
frequency = 15
threshold = 10
action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.payment_wallet_opsgenie[0].id]
},
{
queue_key = "logged-action-dead-letter-queue"
severity = 1
time_window = 30
frequency = 15
threshold = 10
queue_key = "logged-action-dead-letter-queue"
severity = 1
time_window = 30
frequency = 15
threshold = 10
action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.payment_wallet_opsgenie[0].id]
},
] : []
Expand All @@ -182,17 +174,102 @@ resource "azurerm_monitor_scheduled_query_rules_alert" "pay_wallet_enqueue_rate_
enabled = true
query = format(<<-QUERY
let OpCountForQueue = (operation: string, queueKey: string) {
StorageQueueLogs
| where OperationName == operation and ObjectKey startswith queueKey
| summarize count()
StorageQueueLogs
| where OperationName == operation and ObjectKey startswith queueKey
| summarize count()
| project count_
| extend dummy=1
};
let PutMessages = (queueName: string) {
OpCountForQueue("PutMessage", queueName)
| project PutCount = count_
| extend dummy = 1
};
let DeletedMessages = (queueName: string) {
OpCountForQueue("DeleteMessage", queueName)
| project DeleteCount = count_
| extend dummy = 1
};
let MessageRateForQueue = (queueKey: string) {
OpCountForQueue("PutMessage", queueKey)
| join kind=fullouter OpCountForQueue("DeleteMessage", queueKey) on count_
| project name = queueKey, Count = count_ - count_1
PutMessages(queueKey)
| join kind=inner (DeletedMessages(queueKey)) on dummy
| extend Diff = PutCount - DeleteCount
| project PutCount, DeleteCount, Diff
};
MessageRateForQueue("%s")
| where Count > ${each.value.threshold}
| where Diff > ${each.value.threshold}
QUERY
, "/${module.pay_wallet_storage[0].name}/${local.project}-${each.value.queue_key}"
)
severity = each.value.severity
frequency = each.value.frequency
time_window = each.value.time_window
trigger {
operator = "GreaterThan"
threshold = 0
}
}


# Alert settings for the wallet queues monitored by the
# pay_wallet_enqueue_rate_alert_visibility_timeout_diff rule, which iterates over
# this list (one alert per queue_key).
# The list is populated only when var.env_short == "p"; in every other
# environment it is empty, so no alert instance is created.
locals {
queue_expiration_alert_props = var.env_short == "p" ? [
{
# Queue name suffix; the alert name and the ObjectKey prefix in the query are built from it.
queue_key = "expiration-queue"
# Severity assigned to the alert rule.
severity = 1
# Minutes; used as the rule's time window and as the width of the Put/Delete windows in the query.
time_window = 30
# Evaluation frequency of the rule.
frequency = 15
# The query returns a row only when (PutCount - DeleteCount) is greater than this value.
threshold = 10
# Action groups notified when the alert fires.
action_group = [data.azurerm_monitor_action_group.email.id, data.azurerm_monitor_action_group.slack.id, azurerm_monitor_action_group.payment_wallet_opsgenie[0].id]
},
] : []
}

# Queue size: wallet - wallet queues enqueues rate alert
resource "azurerm_monitor_scheduled_query_rules_alert" "pay_wallet_enqueue_rate_alert_visibility_timeout_diff" {
for_each = var.is_feature_enabled.storage ? { for q in local.queue_expiration_alert_props : q.queue_key => q } : {}
name = "${local.project}-${each.value.queue_key}-rate-alert"
resource_group_name = azurerm_resource_group.storage_pay_wallet_rg.name
location = var.location

action {
action_group = each.value.action_group
email_subject = "[pay-wallet] Enqueue rate for wallet queue too high"
custom_webhook_payload = "{}"
}
data_source_id = module.pay_wallet_storage[0].id
description = format("Enqueuing rate for queue %s > ${each.value.threshold} during last ${each.value.time_window} minutes", replace("${each.value.queue_key}", "-", " "))
enabled = true
query = format(<<-QUERY
let endDelete = datetime_local_to_utc(now(), 'Europe/Rome');
let startDelete = endDelete - ${each.value.time_window}m;
let endPut = startDelete;
let startPut = endPut - ${each.value.time_window}m;
let OpCountForQueue = (operation: string, queueKey: string, timestart: datetime, timeend: datetime) {
StorageQueueLogs
| where OperationName == operation and ObjectKey startswith queueKey
| where TimeGenerated between(timestart .. timeend)
| summarize count()
| project count_
| extend dummy=1
};
let PutMessages = (queueName: string, timestart: datetime, timeend: datetime) {
OpCountForQueue("PutMessage", queueName, timestart, timeend)
| project PutCount = count_
| extend dummy = 1
};
let DeletedMessages = (queueName: string, timestart: datetime, timeend: datetime) {
OpCountForQueue("DeleteMessage", queueName, timestart, timeend)
| project DeleteCount = count_
| extend dummy = 1
};
let MessageRateForQueue = (queueKey: string, timestartPut: datetime, timeendPut: datetime, timestartDelete: datetime, timeendDelete: datetime) {
PutMessages(queueKey, timestartPut, timeendPut)
| join kind=inner (DeletedMessages(queueKey, timestartDelete, timeendDelete)) on dummy
| extend Diff = PutCount - DeleteCount
| project PutCount, DeleteCount, Diff
};
MessageRateForQueue("%s", startPut, endPut, startDelete, endDelete)
| where Diff > ${each.value.threshold}
QUERY
, "/${module.pay_wallet_storage[0].name}/${local.project}-${each.value.queue_key}"
)
Expand Down

0 comments on commit 58f58e5

Please sign in to comment.