[Bug]: null_resource fails after Beanstalk ASG replaced - "Provider produced inconsistent final plan" #302

zg-genel · 2024-01-19T17:58:55Z

Terraform CLI and Provider Versions

Terraform: 1.6.2
null provider: 3.2.1
aws provider: 5.31.0

Terraform Configuration

#provider.tf
# Placeholder provider settings: replace both values with ones valid for your AWS account.
provider "aws" {
  region  = "REPLACE_ME"
  profile = "REPLACE_ME"
}

# vars.tf
variable "solution_stack_name" {
  description = "Elastic Beanstalk solution stack name passed to the environment."
  type        = string
}

variable "tier" {
  description = "Elastic Beanstalk environment tier passed to the environment."
  type        = string
}

variable "vpc_id" {
  # NOTE(review): not referenced by the configuration shown in this repro.
  description = "VPC ID for the environment."
  type        = string
}

variable "subnets" {
  description = "Value for the aws:ec2:vpc Subnets setting (comma-separated subnet IDs in terraform.tfvars)."
  type        = string
}

variable "elb_subnets" {
  description = "Value for the aws:ec2:vpc ELBSubnets setting (comma-separated subnet IDs in terraform.tfvars)."
  type        = string
}

variable "instance_min_size" {
  description = "Value for the aws:autoscaling:asg MinSize setting."
  type        = number
}

variable "instance_max_size" {
  description = "Value for the aws:autoscaling:asg MaxSize setting."
  type        = number
}

variable "instance_type" {
  description = "Value for the aws:autoscaling:launchconfiguration InstanceType setting."
  type        = string
}

# main.tf
# Identity of the calling account. NOTE(review): not referenced by the configuration shown in this repro.
data "aws_caller_identity" "current" {}

# Trust policy document allowing the listed AWS service principals to call sts:AssumeRole.
data "aws_iam_policy_document" "test_app_assume_role_policy" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]

    principals {
      type = "Service"
      identifiers = [
        "ec2.amazonaws.com",
        "firehose.amazonaws.com",
        "lambda.amazonaws.com",
        "s3.amazonaws.com",
        "states.amazonaws.com",
      ]
    }
  }
}

## IAM
# Role for the test app; its trust policy is rendered from
# data.aws_iam_policy_document.test_app_assume_role_policy.
resource "aws_iam_role" "test_app_service_role" {
  name                 = "service-test-app-role"
  permissions_boundary = ""
  assume_role_policy   = data.aws_iam_policy_document.test_app_assume_role_policy.json
}

# Instance profile wrapping the test app role; the Beanstalk environment's
# IamInstanceProfile setting refers to it by name.
resource "aws_iam_instance_profile" "test_app_profile" {
  role = aws_iam_role.test_app_service_role.name
  name = "service-test-app-instance-profile"
}

## EB
# Beanstalk application that owns the test environment used by this repro.
resource "aws_elastic_beanstalk_application" "test_app" {
  name        = "test-app"
  description = "Test app to repro inconsistent final plan for null_resource ASG metrics"
}

# Beanstalk environment for the repro. Its autoscaling_groups attribute feeds the
# null_resource that toggles ASG metrics collection.
resource "aws_elastic_beanstalk_environment" "test_app_env" {
  application  = aws_elastic_beanstalk_application.test_app.name
  description  = "Test env to repro inconsistent final plan for null_resource ASG metrics"
  name         = "test-app-env"
  cname_prefix = "test-app-env"

  # --- Auto Scaling group size ---
  setting {
    name      = "MinSize"
    namespace = "aws:autoscaling:asg"
    value     = var.instance_min_size
  }
  setting {
    name      = "MaxSize"
    namespace = "aws:autoscaling:asg"
    value     = var.instance_max_size
  }

  # --- Launch configuration ---
  # IamInstanceProfile is declared once; the original config repeated this identical setting twice.
  setting {
    name      = "IamInstanceProfile"
    namespace = "aws:autoscaling:launchconfiguration"
    value     = aws_iam_instance_profile.test_app_profile.name
  }
  setting {
    name      = "InstanceType"
    namespace = "aws:autoscaling:launchconfiguration"
    value     = var.instance_type
  }
  setting {
    name      = "MonitoringInterval"
    namespace = "aws:autoscaling:launchconfiguration"
    value     = "1 minute"
  }
  setting {
    name      = "CloudWatchMetrics"
    namespace = "aws:elasticbeanstalk:customoption"
    value     = "--mem-util --mem-used --mem-avail --disk-space-util --disk-space-used --disk-space-avail --disk-path=/ --auto-scaling"
  }

  # --- Scaling trigger ---
  setting {
    name      = "LowerBreachScaleIncrement"
    namespace = "aws:autoscaling:trigger"
    value     = "-1"
  }
  setting {
    name      = "LowerThreshold"
    namespace = "aws:autoscaling:trigger"
    value     = "10"
  }
  setting {
    name      = "MeasureName"
    namespace = "aws:autoscaling:trigger"
    value     = "CPUUtilization"
  }
  setting {
    name      = "Unit"
    namespace = "aws:autoscaling:trigger"
    value     = "Percent"
  }
  setting {
    name      = "UpperBreachScaleIncrement"
    namespace = "aws:autoscaling:trigger"
    value     = "6"
  }
  setting {
    name      = "Period"
    namespace = "aws:autoscaling:trigger"
    value     = "2"
  }
  setting {
    name      = "UpperThreshold"
    namespace = "aws:autoscaling:trigger"
    value     = "70"
  }

  # --- Rolling update policy ---
  setting {
    name      = "MaxBatchSize"
    namespace = "aws:autoscaling:updatepolicy:rollingupdate"
    value     = 10
  }
  setting {
    name      = "RollingUpdateEnabled"
    namespace = "aws:autoscaling:updatepolicy:rollingupdate"
    value     = "false"
  }
  setting {
    name      = "Timeout"
    namespace = "aws:autoscaling:updatepolicy:rollingupdate"
    value     = "PT10M"
  }

  # --- VPC / load balancer placement ---
  setting {
    name      = "AssociatePublicIpAddress"
    namespace = "aws:ec2:vpc"
    value     = "false"
  }
  setting {
    name      = "ELBScheme"
    namespace = "aws:ec2:vpc"
    value     = "internal"
  }
  setting {
    name      = "ELBSubnets"
    namespace = "aws:ec2:vpc"
    value     = var.elb_subnets
  }
  setting {
    name      = "Subnets"
    namespace = "aws:ec2:vpc"
    value     = var.subnets
  }

  # --- Application health check and deployment ---
  setting {
    name      = "Application Healthcheck URL"
    namespace = "aws:elasticbeanstalk:application"
    value     = "/monitor/ping?zillowRocks=true"
  }
  setting {
    name      = "IgnoreHealthCheck"
    namespace = "aws:elasticbeanstalk:command"
    value     = "true"
  }
  setting {
    name      = "BatchSize"
    namespace = "aws:elasticbeanstalk:command"
    value     = "25"
  }
  setting {
    name      = "BatchSizeType"
    namespace = "aws:elasticbeanstalk:command"
    value     = "Percentage"
  }
  setting {
    name      = "DeploymentPolicy"
    namespace = "aws:elasticbeanstalk:command"
    value     = "AllAtOnce"
  }

  # --- Environment type, service role and monitoring ---
  setting {
    name      = "EnvironmentType"
    namespace = "aws:elasticbeanstalk:environment"
    value     = "LoadBalanced"
  }
  setting {
    name      = "ServiceRole"
    namespace = "aws:elasticbeanstalk:environment"
    value     = "aws-elasticbeanstalk-service-role"
  }
  setting {
    name      = "Automatically Terminate Unhealthy Instances"
    namespace = "aws:elasticbeanstalk:monitoring"
    value     = "true"
  }
  setting {
    name      = "SystemType"
    namespace = "aws:elasticbeanstalk:healthreporting:system"
    value     = "enhanced"
  }
  # Turn on custom CloudWatch metric to monitor filesystem usage
  setting {
    name      = "ConfigDocument"
    namespace = "aws:elasticbeanstalk:healthreporting:system"
    value     = <<EOF
{
  "CloudWatchMetrics": {
    "Environment": {
      "ApplicationLatencyP99.9": 60,
      "InstancesSevere": 60,
      "ApplicationLatencyP90": 60,
      "ApplicationLatencyP99": 60,
      "ApplicationLatencyP95": 60,
      "InstancesUnknown": 60,
      "ApplicationLatencyP85": 60,
      "InstancesInfo": 60,
      "ApplicationRequests2xx": 60,
      "InstancesDegraded": 60,
      "InstancesWarning": 60,
      "ApplicationLatencyP50": 60,
      "ApplicationRequestsTotal": 60,
      "InstancesNoData": 60,
      "InstancesPending": 60,
      "ApplicationLatencyP10": 60,
      "ApplicationRequests5xx": 60,
      "ApplicationLatencyP75": 60,
      "InstancesOk": 60,
      "ApplicationRequests3xx": 60,
      "ApplicationRequests4xx": 60
    },
    "Instance": {
      "ApplicationLatencyP99.9": 60,
      "ApplicationLatencyP90": 60,
      "ApplicationLatencyP99": 60,
      "ApplicationLatencyP95": 60,
      "ApplicationLatencyP85": 60,
      "CPUUser": 60,
      "ApplicationRequests2xx": 60,
      "CPUIdle": 60,
      "ApplicationLatencyP50": 60,
      "ApplicationRequestsTotal": 60,
      "RootFilesystemUtil": 60,
      "LoadAverage1min": 60,
      "CPUIrq": 60,
      "CPUNice": 60,
      "CPUIowait": 60,
      "ApplicationLatencyP10": 60,
      "LoadAverage5min": 60,
      "ApplicationRequests5xx": 60,
      "ApplicationLatencyP75": 60,
      "CPUSystem": 60,
      "ApplicationRequests3xx": 60,
      "ApplicationRequests4xx": 60,
      "InstanceHealth": 60,
      "CPUSoftirq": 60
    }
  },
  "Version": 1
}
EOF
  }

  tier                = var.tier
  solution_stack_name = var.solution_stack_name
}


locals {
  # Names of the Auto Scaling groups reported by the Beanstalk environment.
  autoscaling_groups = aws_elastic_beanstalk_environment.test_app_env.autoscaling_groups

  # Collect metrics only when the instance count can actually vary. When min and max are equal
  # (as in most dev/test environments, where both are 1) auto-scaling does nothing, so there is
  # no reason to pay for metrics collection.
  enabled = var.instance_min_size != var.instance_max_size

  # aws-cli autoscaling subcommand to run, chosen by whether metrics should be on or off.
  autoscaling_aws_command = local.enabled ? "enable-metrics-collection" : "disable-metrics-collection"

  # Only the "useful" metrics are collected. Min and Max capacity metrics also exist, but those
  # values are set by this configuration, so they are not useful here.
  autoscaling_aws_arguments = local.enabled ? "--metrics GroupDesiredCapacity GroupInServiceInstances GroupPendingInstances GroupStandbyInstances GroupTerminatingInstances GroupTotalInstances --granularity 1Minute" : ""
}

# Runs the aws-cli to enable or disable metrics collection on the environment's ASG.
# Keyed on the ASG name and the min/max sizes through the triggers map.
resource "null_resource" "test_app_asg_enable_metrics" {
  triggers = {
    version                = "1" # Update this to apply new code even if the other triggers don't change
    instance_max_size      = var.instance_max_size
    instance_min_size      = var.instance_min_size
    autoscaling_group_name = local.autoscaling_groups[0]
  }

  provisioner "local-exec" {
    # Only the first entry of autoscaling_groups is targeted: a Beanstalk environment normally
    # has exactly one Auto Scaling group, so the list is expected to have a single element.
    # NOTE(review): the region and profile are hard-coded in this command rather than taken
    # from the provider block - adjust them for your account when reproducing.
    command = "aws autoscaling ${local.autoscaling_aws_command} --auto-scaling-group-name ${local.autoscaling_groups[0]} ${local.autoscaling_aws_arguments} --region us-west-2 --profile zillow-dcp-dev"
  }
}


# terraform.tfvars
# Stack versions used by the repro (only one may be active at a time):
# solution_stack_name = "64bit Amazon Linux 2 v4.3.15 running Tomcat 8.5 Corretto 8"
# solution_stack_name = "64bit Amazon Linux 2 v4.2.7 running Tomcat 8.5 Corretto 8"
solution_stack_name = "64bit Amazon Linux 2 v4.1.3 running Tomcat 8.5 Corretto 8"
# Downgrading the minor version (4.x) triggers ASG replacement and the inconsistent plan failure.
# Upgrading the minor version triggers ASG replacement but not the inconsistent plan failure
# (although a second apply is needed to update the null_resource for the new ASG).

# These values don't affect EB ASG replacement.
tier              = "WebServer"
instance_type     = "c6i.large"
instance_min_size = 1
instance_max_size = 8

# Anything VPC/ELB/subnet related needs to be replaced with the values for your env.
vpc_id      = "vpc-*"
elb_subnets = "subnet-*,subnet-*"
subnets     = "subnet-*,subnet-*"

Expected Behavior

We have an Elastic Beanstalk app where we'd like to always enable ASG metrics when the app's underlying ASG changes. Currently we use null_resource with local-exec to run the aws-cli enabling ASG metrics triggered by a change in the ASG and/or the ASG min/max capacity.

Expected behavior is that the null_resource detects an ASG replacement, and runs the aws-cli command on the new ASG for the Elastic Beanstalk app.

Actual Behavior

Terraform plan executes successfully, but the planned null_resource still shows the original ASG.
During terraform apply, one of two outcomes happens after the ASG is replaced:

- the apply fails ("Provider produced inconsistent final plan") because the plan expected the original ASG but sees the new ASG
- the apply succeeds, but Terraform needs to be rerun to apply the null_resource to the new ASG

To resolve either case, terraform plan is rerun so that null_resource picks up the new ASG, and then terraform apply succeeds.

Plan outputs:

+ resource "null_resource" "autoscaling_enable_metrics" {
      + id       = (known after apply)
      + triggers = {
          + "autoscaling_group_name" = "awseb-e-***-stack-AWSEBAutoScalingGroup-***AVS"
          + "aws_profile"            = "***"
          + "aws_region"             = "us-west-2"
          + "instance_max_size"      = "75"
          + "instance_min_size"      = "15"
          + "version"                = "1"
        }
    }

Apply outputs:

│ Error: Provider produced inconsistent final plan
│ 
│ When expanding the plan for
│ module.bl-prod-env.module.bookshelf_listings_autoscaling_metrics.null_resource.autoscaling_enable_metrics["awseb-e-***-stack-AWSEBAutoScalingGroup-***AVS"]
│ to include new values learned so far during apply, provider
│ "registry.terraform.io/hashicorp/null" produced an invalid new value for
│ .triggers["autoscaling_group_name"]: was
│ cty.StringVal("awseb-e-***-stack-AWSEBAutoScalingGroup-***AVS"),
│ but now
│ cty.StringVal("awseb-e-***-stack-AWSEBAutoScalingGroup-***ANZ").
│ 
│ This is a bug in the provider, which should be reported in the provider's
│ own issue tracker.

Steps to Reproduce

1. Replace the values in provider.tf and terraform.tfvars with appropriate values for your AWS account.
2. In terraform.tfvars, set solution_stack_name = "64bit Amazon Linux 2 v4.3.15 running Tomcat 8.5 Corretto 8".
3. Run terraform apply to create an EB app and its ASG.
4. In the Elastic Beanstalk AWS console, select test-app-env, go to Configuration > Instance traffic and scaling, click the Edit button, and check the IMDSv1 box to deactivate IMDSv1.
5. In terraform.tfvars, set solution_stack_name = "64bit Amazon Linux 2 v4.1.3 running Tomcat 8.5 Corretto 8".
6. Run terraform plan and terraform apply; this should produce an inconsistent plan failure. If the apply succeeds instead, a subsequent plan should show the null_resource still needing to run against the new ASG. In either case, the resolution is to rerun Terraform so that it refreshes its state and picks up the new ASG.

How much impact is this issue causing?

Medium

Logs

No response

Additional Information

I originally reported the bug here in aws provider before being pointed here to null provider: hashicorp/terraform-provider-aws#35375

Code of Conduct

I agree to follow this project's Code of Conduct

The text was updated successfully, but these errors were encountered:

zg-genel added the bug label Jan 19, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Bug]: null_resource fails after Beanstalk ASG replaced - "Provider produced inconsistent final plan" #302

[Bug]: null_resource fails after Beanstalk ASG replaced - "Provider produced inconsistent final plan" #302

zg-genel commented Jan 19, 2024

[Bug]: null_resource fails after Beanstalk ASG replaced - "Provider produced inconsistent final plan" #302

[Bug]: null_resource fails after Beanstalk ASG replaced - "Provider produced inconsistent final plan" #302

Comments

zg-genel commented Jan 19, 2024

Terraform CLI and Provider Versions

Terraform Configuration

Expected Behavior

Actual Behavior

Steps to Reproduce

How much impact is this issue causing?

Logs

Additional Information

Code of Conduct