diff --git a/main.tf b/main.tf
index 1dfaa15..d787c57 100644
--- a/main.tf
+++ b/main.tf
@@ -99,6 +99,7 @@ resource "aws_appautoscaling_policy" "scale_up" {
     }
   }
 }
+
 resource "aws_appautoscaling_policy" "scale_big_up" {
   count = "${
     var.high_big_threshold > 0
@@ -125,6 +126,32 @@ resource "aws_appautoscaling_policy" "scale_big_up" {
   }
 }
 
+resource "aws_appautoscaling_policy" "scale_queuetime_up" {
+  count = "${
+    var.queue_time_threshold > 0
+    ? 1 : 0}"
+
+  depends_on         = ["aws_appautoscaling_target.target"]
+  name               = "${module.label.id}-queue_time-up"
+  policy_type        = "StepScaling"
+  resource_id        = "service/${var.cluster_name}/${var.service_name}"
+  scalable_dimension = "ecs:service:DesiredCount"
+  service_namespace  = "ecs"
+
+  step_scaling_policy_configuration = {
+    cooldown                 = "${var.scale_up_cooldown}"
+    adjustment_type          = "${var.adjustment_type_up}"
+    metric_aggregation_type  = "Average"
+    min_adjustment_magnitude = "${var.scale_up_min_adjustment_magnitude}"
+
+    step_adjustment {
+      metric_interval_lower_bound = "${var.scale_up_lower_bound}"
+      metric_interval_upper_bound = "${var.scale_up_upper_bound}"
+      scaling_adjustment          = "${var.scale_up_count}"
+    }
+  }
+}
+
 resource "aws_appautoscaling_policy" "scale_down" {
   depends_on = ["aws_appautoscaling_target.target"]
   name       = "${module.label.id}-sqs-down"
@@ -165,9 +192,10 @@ resource "aws_cloudwatch_metric_alarm" "service_max_stuck" {
   ok_actions                = ["${var.sns_stuck_alarm_arn}"]
   insufficient_data_actions = []
   treat_missing_data        = "ignore"
-  dimensions = {
-    ClusterName = "${var.cluster_name}"
-    ServiceName = "${var.service_name}"
+
+  dimensions = {
+    ClusterName = "${var.cluster_name}"
+    ServiceName = "${var.service_name}"
   }
 }
 
@@ -296,6 +324,7 @@ resource "aws_cloudwatch_metric_alarm" "service_queue_low" {
     label       = "Sum_Visible+NonVisible"
     return_data = "true"
   }
+
   metric_query {
     id = "visible"
 
@@ -312,6 +341,7 @@ resource "aws_cloudwatch_metric_alarm" "service_queue_low" {
       }
     }
   }
+
   metric_query {
     id = "notvisible"
 
@@ -329,3 +359,68 @@ resource "aws_cloudwatch_metric_alarm" "service_queue_low" {
       }
     }
   }
+
+resource "aws_cloudwatch_metric_alarm" "queue_time" {
+  count = "${
+    var.queue_time_threshold > 0
+    ? 1 : 0}"
+
+  # Requires ECS Container Insights on the cluster: aws ecs update-cluster-settings --cluster <cluster> --settings name=containerInsights,value=enabled
+  # RunningTaskCount is published by Container Insights (ECS/ContainerInsights) per ClusterName/ServiceName.
+
+  alarm_name          = "${module.label.id}-queue-time-up"
+  alarm_description   = "Alarm monitors ${var.queue_name} QueueTime = ((Queue Size * Worker Timing) / (number of current tasks * Number Of workers per task))"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = "1"
+  threshold           = "${var.queue_time_threshold}"
+  alarm_actions       = ["${aws_appautoscaling_policy.scale_queuetime_up.0.arn}"]
+  metric_query {
+    id          = "queuetime"
+    expression  = "((visible+notvisible) * ${var.queue_worker_timing}) / (taskcount * ${var.queue_task_worker_count})"
+    label       = "WaitTime"
+    return_data = "true"
+  }
+  metric_query {
+    id = "visible"
+
+    metric {
+      metric_name = "ApproximateNumberOfMessagesVisible"
+      namespace   = "AWS/SQS"
+      period      = "60"
+      stat        = "Maximum"
+
+      dimensions {
+        QueueName = "${var.queue_name}"
+      }
+    }
+  }
+  metric_query {
+    id = "notvisible"
+
+    metric {
+      metric_name = "ApproximateNumberOfMessagesNotVisible"
+      namespace   = "AWS/SQS"
+      period      = "60"
+      stat        = "Maximum"
+
+      dimensions {
+        QueueName = "${var.queue_name}"
+      }
+    }
+  }
+  metric_query {
+    id = "taskcount"
+
+    metric {
+      metric_name = "RunningTaskCount"
+      namespace   = "ECS/ContainerInsights"
+      period      = "60"
+      stat        = "Maximum"
+
+      dimensions {
+        ClusterName = "${var.cluster_name}"
+        ServiceName = "${var.service_name}"
+      }
+    }
+  }
+}
diff --git a/variables.tf b/variables.tf
index 2fcbfc9..e0cb173 100644
--- a/variables.tf
+++ b/variables.tf
@@ -12,10 +12,6 @@ variable "cluster_name" {
   description = "Name of ECS cluster that service is in"
 }
 
-variable "queue_name" {
-  description = "Name of SQS queue to monitor"
-}
-
 variable "service_name" {
   description = "Name of ECS service to autoscale"
 }
@@ -80,6 +76,25 @@ variable "min_capacity" {
   default = "0"
 }
 
+variable "queue_name" {
+  description = "Name of SQS queue to monitor"
+}
+
+variable "queue_time_threshold" {
+  description = "Queue wait-time threshold (seconds) that triggers scale-up, computed as ((Queue Size * Worker Timing) / (current tasks * Workers per Task)). Set to 0 to disable the queue-time alarm and policy."
+  default     = "0"
+}
+
+variable "queue_worker_timing" {
+  description = "Average time (seconds) a single worker takes to process one queue job; multiplier in the queue-time calculation"
+  default     = "1"
+}
+
+variable "queue_task_worker_count" {
+  description = "Number of queue workers running inside each ECS task; divisor in the queue-time calculation"
+  default     = "1"
+}
+
 variable "scale_down_cooldown" {
   description = "The amount of time, in seconds, after a scaling down completes and before the next scaling activity can start"
   default     = "60"