Skip to content

Commit

Permalink
PRODENG-2850 Reworked drain node logic and upgrade the aws-simple tf …
Browse files Browse the repository at this point in the history
…chart (#547)

* PRODENG-2850 Removed caching and increased timeout for the smoke test

Signed-off-by: Dimitar <[email protected]>
  • Loading branch information
cranzy authored Jan 28, 2025
1 parent 043c2ee commit 0e28bdc
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 48 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,11 @@ integration-test:

.PHONY: smoke-small
smoke-small:
go test -v ./test/smoke/... -run TestSmallCluster -timeout 20m
go test -count=1 -v ./test/smoke/... -run TestSmallCluster -timeout 20m

.PHONY: smoke-full
smoke-full:
go test -v ./test/smoke/... -run TestSupportedMatrixCluster -timeout 40m
go test -count=1 -v ./test/smoke/... -run TestSupportedMatrixCluster -timeout 50m

.PHONY: clean-launchpad-chart
clean-launchpad-chart:
Expand Down
3 changes: 0 additions & 3 deletions examples/terraform/aws-simple/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions examples/terraform/aws-simple/iam.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// IAM role created for this stack; the name is suffixed with var.name.
// The trust (assume-role) policy is rendered from the
// aws_iam_policy_document.assume_role data source.
// NOTE(review): no permission policies are attached to this role in the
// visible configuration — confirm whether any are expected elsewhere.
resource "aws_iam_role" "common_role" {
name = "common-iam-role-${var.name}"
path = "/"
assume_role_policy = data.aws_iam_policy_document.assume_role.json
}

// Trust policy document consumed by aws_iam_role.common_role: a single
// Allow statement letting the EC2 service principal call sts:AssumeRole.
data "aws_iam_policy_document" "assume_role" {
statement {
effect = "Allow"

// Who may assume the role: the EC2 service.
principals {
type = "Service"
identifiers = ["ec2.amazonaws.com"]
}

actions = ["sts:AssumeRole"]
}
}

// Instance profile that carries aws_iam_role.common_role; the name is
// suffixed with var.name. provision.tf passes this profile's name to each
// node group as instance_profile_name.
resource "aws_iam_instance_profile" "common_profile" {
name = "common-instance-profile-${var.name}"
role = aws_iam_role.common_role.name
}
2 changes: 2 additions & 0 deletions examples/terraform/aws-simple/provision.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ module "provision" {
role : ngd.role
public : ngd.public
user_data : ngd.user_data
instance_profile_name : aws_iam_instance_profile.common_profile.name
tags : local.tags
} }

// ingress/lb (should likely merge with an input to allow more flexibility
Expand Down
71 changes: 35 additions & 36 deletions examples/terraform/aws-simple/terraform.tfvars.template
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// used to name infrastructure (CHANGE THIS)
name = "mcc-smoke-test"
//name = "test"

aws = {
region = "us-east-1"
region = "us-east-2"
}

launchpad = {
Expand All @@ -14,55 +14,54 @@ launchpad = {

mke_connect = {
username = "admin"
password = "" // an MKE password must be provided
password = "" // an MKE password must be provided
insecure = false
}

skip_create = false
skip_destroy = true // don't bother running launchpad reset
}

// configure the network stack
network = {
cidr = "172.31.0.0/16"
"cidr": "172.31.0.0/16",
"enable_nat_gateway": false,
"enable_vpn_gateway": false,
"tags": {}
}

// configure the subnets in the vpc
subnets = {
"Main" = {
cidr = "172.31.0.0/17"
nodegroups = ["ACon", "AWrk_Ubu22", "AWrk_Roc9", "AWrk_Win2022"]
private = false
"main" = {
"cidr" = "172.31.0.0/17",
"private" = false,
"nodegroups" = ["MngrUbuntu22", "WrkUbuntu22"]
}
}


// machine node groups by role & platform
// one definition for each group of machines to include in the stack
nodegroups = {
"ACon" = { // managers for A group
role = "manager"
platform = "ubuntu_22.04"
count = 1
type = "m6a.2xlarge"
"MngrUbuntu22" = {
"platform" = "ubuntu_22.04",
"count" = 1,
"type" = "m6a.2xlarge",
"volume_size" = "100",
"public" = true,
"role" = "manager",
"user_data" = "sudo ufw allow 7946/tcp ; sudo ufw allow 10250/tcp "
},
"AWrk_Ubu22" = { // workers for A group
role = "worker"
platform = "ubuntu_22.04"
count = 1
type = "c6a.xlarge"
volume_size = 100
},
"AWrk_Roc9" = { // workers for A group
role = "worker"
platform = "rocky_9"
count = 1
type = "c6a.xlarge"
volume_size = 100
},
// "AWrk_Win2022" = {
// role = "worker"
// platform = "windows_core_2022"
// count = 1
// type = "c6a.xlarge"
// },
"WrkUbuntu22" = {
"platform" = "ubuntu_22.04",
"count" = 1,
"type" = "c6a.xlarge",
"volume_size" = "100",
"public" = true,
"role" = "worker",
"user_data" = "sudo ufw allow 7946/tcp ; sudo ufw allow 10250/tcp "
}
}

// set a windows password, if you have windows nodes
// windows passwords must match a pattern, or connections will fail.
// use something like: `testp@ss!`
# windows_password = ""
// windows_password = ""
6 changes: 5 additions & 1 deletion pkg/product/mke/api/cluster_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"strconv"
"strings"
"sync"
"time"

"github.com/Mirantis/mcc/pkg/constant"
common "github.com/Mirantis/mcc/pkg/product/common/api"
Expand Down Expand Up @@ -267,7 +268,10 @@ func pingHost(h *Host, address string, waitgroup *sync.WaitGroup, errCh chan<- e
}
return nil
},
retry.Attempts(12), // last attempt should wait ~7min
retry.MaxJitter(time.Second*3),
retry.Delay(time.Second*30),
retry.DelayType(retry.FixedDelay),
retry.Attempts(10), // should try for ~5min
)
if err != nil {
errCh <- fmt.Errorf("MKE health check failed: %w", err)
Expand Down
32 changes: 26 additions & 6 deletions pkg/product/mke/phase/uninstall_mcr.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,32 @@ func (p *UninstallMCR) Title() string {

// Run drains every swarm node (workers, then non-leader managers, then the
// leader) and uninstalls the container runtime on each host.
func (p *UninstallMCR) Run() error {
workers := p.Config.Spec.Workers()
managers := p.Config.Spec.Managers()
swarmLeader := p.Config.Spec.SwarmLeader()

// Drain all workers
for _, h := range workers {
if err := mcr.DrainNode(swarmLeader, h); err != nil {
return fmt.Errorf("%s: drain worker node: %w", h, err)
}
}

// Drain all managers
for _, h := range managers {
if swarmLeader.Address() == h.Address() {
continue
}
if err := mcr.DrainNode(swarmLeader, h); err != nil {
return fmt.Errorf("%s: draining manager node: %w", h, err)
}
}

// Drain the leader
if err := mcr.DrainNode(swarmLeader, swarmLeader); err != nil {
return fmt.Errorf("%s: drain leader node: %w", swarmLeader, err)
}

if err := phase.RunParallelOnHosts(p.Config.Spec.Hosts, p.Config, p.uninstallMCR); err != nil {
return fmt.Errorf("uninstall container runtime: %w", err)
}
Expand All @@ -31,12 +57,6 @@ func (p *UninstallMCR) Run() error {
func (p *UninstallMCR) uninstallMCR(h *api.Host, config *api.ClusterConfig) error {
log.Infof("%s: uninstalling container runtime", h)

leader := config.Spec.SwarmLeader()

if err := mcr.DrainNode(leader, h); err != nil {
return fmt.Errorf("%s: drain node: %w", h, err)
}

uVolumeCmd := h.Configurer.DockerCommandf("volume prune -f")
log.Infof("%s: unmounted dangling volumes", h)

Expand Down

0 comments on commit 0e28bdc

Please sign in to comment.