From 0e28bdc103f85938730de649f65d129f00e16d02 Mon Sep 17 00:00:00 2001
From: Dimitar Dimitrov
Date: Tue, 28 Jan 2025 09:02:27 -0800
Subject: [PATCH] PRODENG-2850 Reworked drain node logic and upgrade the aws-simple tf chart (#547)

* PRODENG-2850 Removed caching and increased timeout for the smoke test

Signed-off-by: Dimitar
---
 Makefile                                      |  4 +-
 .../terraform/aws-simple/.terraform.lock.hcl  |  3 -
 examples/terraform/aws-simple/iam.tf          | 23 ++++++
 examples/terraform/aws-simple/provision.tf    |  2 +
 .../aws-simple/terraform.tfvars.template      | 71 +++++++++----------
 pkg/product/mke/api/cluster_spec.go           |  6 +-
 pkg/product/mke/phase/uninstall_mcr.go        | 32 +++++++--
 7 files changed, 93 insertions(+), 48 deletions(-)
 create mode 100644 examples/terraform/aws-simple/iam.tf

diff --git a/Makefile b/Makefile
index 58b15f94..f1b91705 100644
--- a/Makefile
+++ b/Makefile
@@ -85,11 +85,11 @@ integration-test:
 
 .PHONY: smoke-small
 smoke-small:
-	go test -v ./test/smoke/... -run TestSmallCluster -timeout 20m
+	go test -count=1 -v ./test/smoke/... -run TestSmallCluster -timeout 20m
 
 .PHONY: smoke-full
 smoke-full:
-	go test -v ./test/smoke/... -run TestSupportedMatrixCluster -timeout 40m
+	go test -count=1 -v ./test/smoke/... -run TestSupportedMatrixCluster -timeout 50m
 
 .PHONY: clean-launchpad-chart
 clean-launchpad-chart:
diff --git a/examples/terraform/aws-simple/.terraform.lock.hcl b/examples/terraform/aws-simple/.terraform.lock.hcl
index ff49597a..ac3bb23b 100644
--- a/examples/terraform/aws-simple/.terraform.lock.hcl
+++ b/examples/terraform/aws-simple/.terraform.lock.hcl
@@ -28,7 +28,6 @@ provider "registry.terraform.io/hashicorp/local" {
   version = "2.5.2"
   hashes = [
     "h1:IyFbOIO6mhikFNL/2h1iZJ6kyN3U00jgkpCLUCThAfE=",
-    "h1:JlMZD6nYqJ8sSrFfEAH0Vk/SL8WLZRmFaMUF9PJK5wM=",
     "zh:136299545178ce281c56f36965bf91c35407c11897f7082b3b983d86cb79b511",
     "zh:3b4486858aa9cb8163378722b642c57c529b6c64bfbfc9461d940a84cd66ebea",
     "zh:4855ee628ead847741aa4f4fc9bed50cfdbf197f2912775dd9fe7bc43fa077c0",
@@ -47,7 +46,6 @@ provider "registry.terraform.io/hashicorp/time" {
   version = "0.12.1"
   hashes = [
-    "h1:6BhxSYBJdBBKyuqatOGkuPKVenfx6UmLdiI13Pb3his=",
     "h1:JzYsPugN8Fb7C4NlfLoFu7BBPuRVT2/fCOdCaxshveI=",
     "zh:090023137df8effe8804e81c65f636dadf8f9d35b79c3afff282d39367ba44b2",
     "zh:26f1e458358ba55f6558613f1427dcfa6ae2be5119b722d0b3adb27cd001efea",
@@ -67,7 +65,6 @@ provider "registry.terraform.io/hashicorp/tls" {
   version = "4.0.6"
   hashes = [
-    "h1:dYSb3V94K5dDMtrBRLPzBpkMTPn+3cXZ/kIJdtFL+2M=",
     "h1:n3M50qfWfRSpQV9Pwcvuse03pEizqrmYEryxKky4so4=",
     "zh:10de0d8af02f2e578101688fd334da3849f56ea91b0d9bd5b1f7a243417fdda8",
     "zh:37fc01f8b2bc9d5b055dc3e78bfd1beb7c42cfb776a4c81106e19c8911366297",
diff --git a/examples/terraform/aws-simple/iam.tf b/examples/terraform/aws-simple/iam.tf
new file mode 100644
index 00000000..f771126d
--- /dev/null
+++ b/examples/terraform/aws-simple/iam.tf
@@ -0,0 +1,23 @@
+resource "aws_iam_role" "common_role" {
+  name               = "common-iam-role-${var.name}"
+  path               = "/"
+  assume_role_policy = data.aws_iam_policy_document.assume_role.json
+}
+
+data "aws_iam_policy_document" "assume_role" {
+  statement {
+    effect = "Allow"
+
+    principals {
+      type        = "Service"
+      identifiers = ["ec2.amazonaws.com"]
+    }
+
+    actions = ["sts:AssumeRole"]
+  }
+}
+
+resource "aws_iam_instance_profile" "common_profile" {
+  name = "common-instance-profile-${var.name}"
+  role = aws_iam_role.common_role.name
+}
diff --git a/examples/terraform/aws-simple/provision.tf b/examples/terraform/aws-simple/provision.tf
index 6f408719..bf65d24e 100644
--- a/examples/terraform/aws-simple/provision.tf
+++ b/examples/terraform/aws-simple/provision.tf
@@ -26,6 +26,8 @@ module "provision" {
       role : ngd.role
      public : ngd.public
       user_data : ngd.user_data
+      instance_profile_name : aws_iam_instance_profile.common_profile.name
+      tags : local.tags
     }
   }
   // ingress/lb (should likely merge with an input to allow more flexibility
diff --git a/examples/terraform/aws-simple/terraform.tfvars.template b/examples/terraform/aws-simple/terraform.tfvars.template
index cec68f93..971b3408 100644
--- a/examples/terraform/aws-simple/terraform.tfvars.template
+++ b/examples/terraform/aws-simple/terraform.tfvars.template
@@ -1,8 +1,8 @@
 // used to name infrastructure (CHANGE THIS)
-name = "mcc-smoke-test"
+//name = "test"
 
 aws = {
-  region = "us-east-1"
+  region = "us-east-2"
 }
 
 launchpad = {
@@ -14,55 +14,54 @@ launchpad = {
   mke_connect = {
     username = "admin"
-    password = "" // an MKE passwords must be provided
+    password = "" // an MKE passwords must be provided
     insecure = false
   }
+
+  skip_create = false
+  skip_destroy = true // don't bother running launchpad reset
 }
 
 // configure the network stack
 network = {
-  cidr = "172.31.0.0/16"
+  "cidr": "172.31.0.0/16",
+  "enable_nat_gateway": false,
+  "enable_vpn_gateway": false,
+  "tags": {}
 }
+
+// configure the subnets in the vpc
 subnets = {
-  "Main" = {
-    cidr = "172.31.0.0/17"
-    nodegroups = ["ACon", "AWrk_Ubu22", "AWrk_Roc9", "AWrk_Win2022"]
-    private = false
+  "main" = {
+    "cidr" = "172.31.0.0/17",
+    "private" = false,
+    "nodegroups" = ["MngrUbuntu22", "WrkUbuntu22"]
   }
 }
-
-// machine node groups by role & platform
+// one definition for each group of machines to include in the stack
 nodegroups = {
-  "ACon" = { // managers for A group
-    role = "manager"
-    platform = "ubuntu_22.04"
-    count = 1
-    type = "m6a.2xlarge"
+  "MngrUbuntu22" = {
+    "platform" = "ubuntu_22.04",
+    "count" = 1,
+    "type" = "m6a.2xlarge",
+    "volume_size" = "100",
+    "public" = true,
+    "role" = "manager",
+    "user_data" = "sudo ufw allow 7946/tcp ; sudo ufw allow 10250/tcp "
   },
-  "AWrk_Ubu22" = { // workers for A group
-    role = "worker"
-    platform = "ubuntu_22.04"
-    count = 1
-    type = "c6a.xlarge"
-    volume_size = 100
-  },
-  "AWrk_Roc9" = { // workers for A group
-    role = "worker"
-    platform = "rocky_9"
-    count = 1
-    type = "c6a.xlarge"
-    volume_size = 100
-  },
-  // "AWrk_Win2022" = {
-  //   role = "worker"
-  //   platform = "windows_core_2022"
-  //   count = 1
-  //   type = "c6a.xlarge"
-  // },
+  "WrkUbuntu22" = {
+    "platform" = "ubuntu_22.04",
+    "count" = 1,
+    "type" = "c6a.xlarge",
+    "volume_size" = "100",
+    "public" = true,
+    "role" = "worker",
+    "user_data" = "sudo ufw allow 7946/tcp ; sudo ufw allow 10250/tcp "
+  }
 }
 
 // set a windows password, if you have windows nodes
 // windows passwords must match a pattern, or connections will fail.
 // use something like: `testp@ss!`
-# windows_password = ""
+// windows_password = ""
diff --git a/pkg/product/mke/api/cluster_spec.go b/pkg/product/mke/api/cluster_spec.go
index 68708387..8bd51059 100644
--- a/pkg/product/mke/api/cluster_spec.go
+++ b/pkg/product/mke/api/cluster_spec.go
@@ -8,6 +8,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/Mirantis/mcc/pkg/constant"
 	common "github.com/Mirantis/mcc/pkg/product/common/api"
@@ -267,7 +268,10 @@ func pingHost(h *Host, address string, waitgroup *sync.WaitGroup, errCh chan<- e
 			}
 			return nil
 		},
-		retry.Attempts(12), // last attempt should wait ~7min
+		retry.MaxJitter(time.Second*3),
+		retry.Delay(time.Second*30),
+		retry.DelayType(retry.FixedDelay),
+		retry.Attempts(10), // should try for ~5min
 	)
 	if err != nil {
 		errCh <- fmt.Errorf("MKE health check failed: %w", err)
diff --git a/pkg/product/mke/phase/uninstall_mcr.go b/pkg/product/mke/phase/uninstall_mcr.go
index 5117076a..b5f755e2 100644
--- a/pkg/product/mke/phase/uninstall_mcr.go
+++ b/pkg/product/mke/phase/uninstall_mcr.go
@@ -22,6 +22,32 @@ func (p *UninstallMCR) Title() string {
 
 // Run installs the engine on each host.
 func (p *UninstallMCR) Run() error {
+	workers := p.Config.Spec.Workers()
+	managers := p.Config.Spec.Managers()
+	swarmLeader := p.Config.Spec.SwarmLeader()
+
+	// Drain all workers
+	for _, h := range workers {
+		if err := mcr.DrainNode(swarmLeader, h); err != nil {
+			return fmt.Errorf("%s: drain worker node: %w", h, err)
+		}
+	}
+
+	// Drain all managers
+	for _, h := range managers {
+		if swarmLeader.Address() == h.Address() {
+			continue
+		}
+		if err := mcr.DrainNode(swarmLeader, h); err != nil {
+			return fmt.Errorf("%s: draining manager node: %w", h, err)
+		}
+	}
+
+	// Drain the leader
+	if err := mcr.DrainNode(swarmLeader, swarmLeader); err != nil {
+		return fmt.Errorf("%s: drain leader node: %w", swarmLeader, err)
+	}
+
 	if err := phase.RunParallelOnHosts(p.Config.Spec.Hosts, p.Config, p.uninstallMCR); err != nil {
 		return fmt.Errorf("uninstall container runtime: %w", err)
 	}
@@ -31,12 +57,6 @@ func (p *UninstallMCR) Run() error {
 func (p *UninstallMCR) uninstallMCR(h *api.Host, config *api.ClusterConfig) error {
 	log.Infof("%s: uninstalling container runtime", h)
 
-	leader := config.Spec.SwarmLeader()
-
-	if err := mcr.DrainNode(leader, h); err != nil {
-		return fmt.Errorf("%s: drain node: %w", h, err)
-	}
-
 	uVolumeCmd := h.Configurer.DockerCommandf("volume prune -f")
 
 	log.Infof("%s: unmounted dangling volumes", h)
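
Two notes on the Go-side changes above. The Makefile now passes -count=1 to go test, which disables Go's test result caching so the smoke suites always re-run, matching the "Removed caching" note in the commit message. The pingHost change replaces the previous retry setup (12 attempts, "~7min" per the old comment) with a fixed 30-second delay and 10 attempts, so the health check polls for roughly 4.5-5 minutes before giving up. Below is a minimal standalone sketch of that cadence, assuming the avast/retry-go package; the exact module path and version launchpad uses are not shown in this diff, and the failing probe here is only a placeholder.

package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/avast/retry-go"
)

func main() {
	start := time.Now()

	err := retry.Do(
		func() error {
			// Placeholder probe; the real code calls the MKE health endpoint.
			return errors.New("not healthy yet")
		},
		// Same options as the patched pingHost: fixed 30s between attempts,
		// 10 attempts total, so roughly 4.5-5 minutes before giving up.
		retry.MaxJitter(time.Second*3),
		retry.Delay(time.Second*30),
		retry.DelayType(retry.FixedDelay),
		retry.Attempts(10),
		retry.OnRetry(func(n uint, err error) {
			fmt.Printf("attempt %d failed after %s: %v\n", n+1, time.Since(start).Round(time.Second), err)
		}),
	)
	if err != nil {
		fmt.Printf("gave up after %s: %v\n", time.Since(start).Round(time.Second), err)
	}
}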
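
The uninstall phase also changes where draining happens: instead of each host draining itself inside the parallel uninstallMCR step, Run now drains sequentially through the swarm leader, workers first, then the other managers, and finally the leader itself, before the parallel uninstall starts. The following is a small self-contained sketch of that ordering only; host and drainNode are hypothetical stand-ins for launchpad's api.Host and mcr.DrainNode, which are not reproduced here.

package main

import "fmt"

// host is a hypothetical stand-in for launchpad's api.Host.
type host struct {
	addr string
	role string // "manager" or "worker"
}

func (h host) Address() string { return h.addr }

// drainNode is a hypothetical stand-in for mcr.DrainNode(leader, h):
// the leader is asked to drain workloads off the target host.
func drainNode(leader, target host) error {
	fmt.Printf("draining %s (%s) via leader %s\n", target.addr, target.role, leader.addr)
	return nil
}

// drainAll mirrors the ordering used by the reworked UninstallMCR.Run:
// workers first, then non-leader managers, then the leader itself.
func drainAll(leader host, hosts []host) error {
	for _, h := range hosts {
		if h.role == "worker" {
			if err := drainNode(leader, h); err != nil {
				return fmt.Errorf("%s: drain worker node: %w", h.addr, err)
			}
		}
	}
	for _, h := range hosts {
		if h.role == "manager" && h.Address() != leader.Address() {
			if err := drainNode(leader, h); err != nil {
				return fmt.Errorf("%s: drain manager node: %w", h.addr, err)
			}
		}
	}
	return drainNode(leader, leader)
}

func main() {
	leader := host{addr: "10.0.0.1", role: "manager"}
	hosts := []host{
		leader,
		{addr: "10.0.0.2", role: "manager"},
		{addr: "10.0.0.3", role: "worker"},
	}
	if err := drainAll(leader, hosts); err != nil {
		fmt.Println("drain failed:", err)
	}
}

Draining through a single leader serializes the drain operations and keeps the leader as the last node to give up workloads before MCR is removed everywhere, which appears to be the intent of the rework.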