From 8e5356422a05fdd67621b7f78886cd624df3b9ff Mon Sep 17 00:00:00 2001 From: David Nguyen Date: Wed, 13 Nov 2024 16:33:26 -0800 Subject: [PATCH] fix!: disable autoscaling for indexwork service Some jobs are being terminated inflight during scale-in for this service. Disabling autoscaling for this service is the safest route forward until this is resolved. BREAKING CHANGE: The following variable has been removed - indexwork_autoscaling_max_count --- modules/bigeye/main.tf | 94 +------------------------------------ modules/bigeye/variables.tf | 12 ++--- 2 files changed, 7 insertions(+), 99 deletions(-) diff --git a/modules/bigeye/main.tf b/modules/bigeye/main.tf index 8519d6f..6e29bc2 100644 --- a/modules/bigeye/main.tf +++ b/modules/bigeye/main.tf @@ -2582,7 +2582,7 @@ module "indexwork" { # Task settings control_desired_count = false - desired_count = 0 + desired_count = var.indexwork_desired_count cpu = var.indexwork_cpu memory = var.indexwork_memory execution_role_arn = local.ecs_role_arn @@ -2630,98 +2630,6 @@ module "indexwork" { secret_arns = local.datawatch_secret_arns } -resource "aws_appautoscaling_target" "indexwork" { - depends_on = [module.indexwork] - min_capacity = 0 - max_capacity = var.indexwork_autoscaling_max_count - resource_id = format("service/%s/%s-indexwork", local.name, local.name) - scalable_dimension = "ecs:service:DesiredCount" - service_namespace = "ecs" -} - -resource "aws_appautoscaling_policy" "indexwork" { - depends_on = [aws_appautoscaling_target.indexwork] - name = format("%s-indexwork-catalog-autoscaling", local.name) - policy_type = "StepScaling" - resource_id = aws_appautoscaling_target.indexwork.resource_id - scalable_dimension = aws_appautoscaling_target.indexwork.scalable_dimension - service_namespace = aws_appautoscaling_target.indexwork.service_namespace - step_scaling_policy_configuration { - adjustment_type = "ExactCapacity" - cooldown = 300 - metric_aggregation_type = "Minimum" - - # Scale to 0 when there is no work on the queue - step_adjustment { - scaling_adjustment = 0 - metric_interval_upper_bound = 1 - } - - # Scale up when there is at least 1 job in the queue. More fine grained scaling steps is not - # practical for MQ based services as we will loose in-flight jobs during scale-in since our MQ - # workers do not respect sigterm. - step_adjustment { - scaling_adjustment = var.indexwork_autoscaling_max_count - metric_interval_lower_bound = 1 - } - } -} - -resource "aws_cloudwatch_metric_alarm" "indexwork" { - alarm_name = "${local.name}-indexwork autoscaling" - actions_enabled = true - alarm_actions = [aws_appautoscaling_policy.indexwork.arn] - evaluation_periods = 1 - datapoints_to_alarm = 1 - threshold = 0 - comparison_operator = "GreaterThanOrEqualToThreshold" - treat_missing_data = "missing" - tags = {} - # (12 unchanged attributes hidden) - - metric_query { - id = "m1" - period = 0 - return_data = false - - metric { - dimensions = { - "Broker" = local.name - "Queue" = "dataset_index_op_v2" - "VirtualHost" = "/" - } - metric_name = "MessageCount" - namespace = "AWS/AmazonMQ" - period = 300 - stat = "Minimum" - } - } - metric_query { - id = "m2" - period = 0 - return_data = false - - metric { - dimensions = { - "Broker" = local.name - "Queue" = "catalog_index_v2" - "VirtualHost" = "/" - } - metric_name = "MessageCount" - namespace = "AWS/AmazonMQ" - period = 300 - stat = "Minimum" - } - } - metric_query { - expression = "SUM(METRICS())" - id = "e1" - label = "sum queued messages across queues" - period = 0 - return_data = true - } -} - module "lineagework" { depends_on = [aws_secretsmanager_secret_version.robot_password, aws_secretsmanager_secret_version.robot_agent_api_key] source = "../simpleservice" diff --git a/modules/bigeye/variables.tf b/modules/bigeye/variables.tf index abf8998..9e36445 100644 --- a/modules/bigeye/variables.tf +++ b/modules/bigeye/variables.tf @@ -2055,6 +2055,12 @@ variable "indexwork_image_tag" { default = "" } +variable "indexwork_desired_count" { + description = "The desired number of replicas" + type = number + default = 2 +} + variable "indexwork_cpu" { description = "Amount of CPU to allocate" type = number @@ -2103,12 +2109,6 @@ variable "indexwork_enable_ecs_exec" { default = false } -variable "indexwork_autoscaling_max_count" { - description = "When there is work in the queue, the indexwork will scale up to this number of instances." - type = number - default = 2 -} - #====================================================== # Application Variables - Lineagework #======================================================