diff --git a/cmd/kops/rollingupdatecluster.go b/cmd/kops/rollingupdatecluster.go
index d3ef21fba0081..f25e8f9a8d6de 100644
--- a/cmd/kops/rollingupdatecluster.go
+++ b/cmd/kops/rollingupdatecluster.go
@@ -45,32 +45,50 @@ var (
 	rollingupdate_long = pretty.LongDesc(i18n.T(`
 	This command updates a kubernetes cluster to match the cloud, and kops specifications.

-	To perform rolling update, you need to update the cloud resources first with "kops update cluster"
-
-	Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
-	prior to running "kops rolling-update cluster"
-
-	Use ` + pretty.Bash("export KOPS_FEATURE_FLAGS=\"+DrainAndValidateRollingUpdate\"") + ` to use beta code that drains the nodes
-	and validates the cluster. New flags for Drain and Validation operations will be shown when
-	the environment variable is set.`))
+	To perform a rolling update, you need to update the cloud resources first with the command
+	` + pretty.Bash("kops update cluster") + `.
+
+	If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
+	rolled with the --force flag. Rolling update drains and validates the cluster by default. A cluster is
+	deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
+	When a node is deleted, rolling-update sleeps the interval for the node type, and then tries for the same period
+	of time for the cluster to be validated. For instance, setting --master-interval=3m causes rolling-update
+	to wait for 3m after a master is rolled, and another 3m for the cluster to stabilize and pass
+	validation.
+
+	Note: terraform users will need to run the following commands all from the same directory:
+	` + pretty.Bash("kops update cluster --target=terraform") + ` then
+	` + pretty.Bash("terraform plan") + ` then ` + pretty.Bash("terraform apply") +
+	` prior to running ` + pretty.Bash("kops rolling-update cluster") + `.`))

 	rollingupdate_example = templates.Examples(i18n.T(`
-		# Roll the currently selected kops cluster
+		# Preview a rolling-update
+		kops rolling-update cluster
+
+		# Roll the currently selected kops cluster with defaults.
+		# Nodes will be drained and the cluster will be validated between node replacements.
 		kops rolling-update cluster --yes

 		# Roll the k8s-cluster.example.com kops cluster
-		# use the new drain an validate functionality
-		export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
+		# do not fail if the cluster does not validate
+		# wait 8 min to create a new node, and at least 8 min
+		# to validate the cluster.
 		kops rolling-update cluster k8s-cluster.example.com --yes \
 		  --fail-on-validate-error="false" \
 		  --master-interval=8m \
 		  --node-interval=8m
+
+		# Roll the k8s-cluster.example.com kops cluster
+		# do not validate the cluster because of the cloudonly flag.
+		# Force the entire cluster to roll, even if rolling update
+		# reports that the cluster does not need to be rolled.
+		kops rolling-update cluster k8s-cluster.example.com --yes \
+		  --cloudonly \
+		  --force

 		# Roll the k8s-cluster.example.com kops cluster
 		# only roll the node instancegroup
-		# use the new drain an validate functionality
-		export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
 		kops rolling-update cluster k8s-cluster.example.com --yes \
 		  --fail-on-validate-error="false" \
 		  --node-interval 8m \
@@ -98,8 +116,6 @@ type RollingUpdateOptions struct {
 	DrainInterval time.Duration

-	ValidateRetries int
-
 	MasterInterval  time.Duration
 	NodeInterval    time.Duration
 	BastionInterval time.Duration
@@ -119,11 +135,9 @@ func (o *RollingUpdateOptions) InitDefaults() {
 	o.FailOnValidate = true

 	o.MasterInterval = 5 * time.Minute
-	o.NodeInterval = 2 * time.Minute
+	o.NodeInterval = 4 * time.Minute
 	o.BastionInterval = 5 * time.Minute

-	o.ValidateRetries = 8
-
 	o.DrainInterval = 90 * time.Second
 }

@@ -152,8 +166,6 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
 	if featureflag.DrainAndValidateRollingUpdate.Enabled() {
 		cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "The rolling-update will fail if draining a node fails.")
 		cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "The rolling-update will fail if the cluster fails to validate.")
-		cmd.Flags().IntVar(&options.ValidateRetries, "validate-retries", options.ValidateRetries, "The number of times that a node will be validated. Between validation kops sleeps the master-interval/2 or node-interval/2 duration.")
-		cmd.Flags().DurationVar(&options.DrainInterval, "drain-interval", options.DrainInterval, "The duration that a rolling-update will wait after the node is drained.")
 	}

 	cmd.Run = func(cmd *cobra.Command, args []string) {
@@ -202,10 +214,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 		return fmt.Errorf("cannot load kubecfg settings for %q: %v", contextName, err)
 	}

-	if options.ValidateRetries <= 0 {
-		return fmt.Errorf("validate-retries flag cannot be 0 or smaller")
-	}
-
 	var nodes []v1.Node
 	var k8sClient kubernetes.Interface
 	if !options.CloudOnly {
@@ -339,7 +347,7 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 	}

 	if featureflag.DrainAndValidateRollingUpdate.Enabled() {
-		glog.V(2).Infof("New rolling update with drain and validate enabled.")
+		glog.V(2).Infof("Rolling update with drain and validate enabled.")
 	}
 	d := &instancegroups.RollingUpdateCluster{
 		MasterInterval: options.MasterInterval,
@@ -352,7 +360,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
 		FailOnValidate:  options.FailOnValidate,
 		CloudOnly:       options.CloudOnly,
 		ClusterName:     options.ClusterName,
-		ValidateRetries: options.ValidateRetries,
 		DrainInterval:   options.DrainInterval,
 	}
 	return d.RollingUpdate(groups, list)
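With --validate-retries gone, rollout pacing is controlled entirely by the interval flags. As a rough illustration of the timing the new long description implies, here is a hypothetical helper (not kops code) that assumes the worst case where each rolled instance sleeps its interval and then consumes the full interval again for validation:

```go
package main

import (
	"fmt"
	"time"
)

// estimateRollDuration is a hypothetical helper, not part of kops: it
// computes the worst-case wall-clock time for a roll if every instance
// sleeps its interval and then takes the entire interval again to validate.
func estimateRollDuration(masters, nodes int, masterInterval, nodeInterval time.Duration) time.Duration {
	perMaster := 2 * masterInterval // post-roll sleep + validation window
	perNode := 2 * nodeInterval
	return time.Duration(masters)*perMaster + time.Duration(nodes)*perNode
}

func main() {
	// 3 masters at --master-interval=5m, 10 nodes at the new --node-interval default of 4m
	fmt.Println(estimateRollDuration(3, 10, 5*time.Minute, 4*time.Minute)) // 1h50m0s
}
```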
diff --git a/docs/cli/kops_rolling-update.md b/docs/cli/kops_rolling-update.md
index 1f00dc735dd6f..e5aa6f6ade50a 100644
--- a/docs/cli/kops_rolling-update.md
+++ b/docs/cli/kops_rolling-update.md
@@ -10,34 +10,51 @@ Rolling update a cluster.
 This command updates a kubernetes cluster to match the cloud, and kops specifications.

-To perform rolling update, you need to update the cloud resources first with "kops update cluster"
+To perform a rolling update, you need to update the cloud resources first with the command
+`kops update cluster`.

-Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
-prior to running "kops rolling-update cluster"
+If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
+rolled with the --force flag. Rolling update drains and validates the cluster by default. A cluster is
+deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
+When a node is deleted, rolling-update sleeps the interval for the node type, and then tries for the same period
+of time for the cluster to be validated. For instance, setting --master-interval=3m causes rolling-update
+to wait for 3m after a master is rolled, and another 3m for the cluster to stabilize and pass
+validation.

-Use `export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"` to use beta code that drains the nodes
-and validates the cluster. New flags for Drain and Validation operations will be shown when
-the environment variable is set.
+Note: terraform users will need to run the following commands all from the same directory:
+`kops update cluster --target=terraform` then
+`terraform plan` then `terraform apply` prior to running `kops rolling-update cluster`.

 ### Examples

 ```
-  # Roll the currently selected kops cluster
+  # Preview a rolling-update
+  kops rolling-update cluster
+
+  # Roll the currently selected kops cluster with defaults.
+  # Nodes will be drained and the cluster will be validated between node replacements.
   kops rolling-update cluster --yes

   # Roll the k8s-cluster.example.com kops cluster
-  # use the new drain an validate functionality
-  export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
+  # do not fail if the cluster does not validate
+  # wait 8 min to create a new node, and at least 8 min
+  # to validate the cluster.
   kops rolling-update cluster k8s-cluster.example.com --yes \
     --fail-on-validate-error="false" \
     --master-interval=8m \
     --node-interval=8m
+
+  # Roll the k8s-cluster.example.com kops cluster
+  # do not validate the cluster because of the cloudonly flag.
+  # Force the entire cluster to roll, even if rolling update
+  # reports that the cluster does not need to be rolled.
+  kops rolling-update cluster k8s-cluster.example.com --yes \
+    --cloudonly \
+    --force

   # Roll the k8s-cluster.example.com kops cluster
   # only roll the node instancegroup
-  # use the new drain an validate functionality
-  export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
   kops rolling-update cluster k8s-cluster.example.com --yes \
     --fail-on-validate-error="false" \
     --node-interval 8m \
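The docs above describe validation as two checks: every required node is running, and every pod in kube-system is operational. A minimal client-go sketch of that idea follows; it is an illustration only, not kops's actual validation.ValidateCluster implementation, it is written against a recent client-go (the kops code in this PR predates the context-taking List API), and the "kubeconfig" path is a placeholder.

```go
package main

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// clusterLooksValid approximates the two checks the docs describe:
// every node reports Ready, and every kube-system pod has reached a healthy phase.
func clusterLooksValid(ctx context.Context, cs kubernetes.Interface) (bool, error) {
	nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return false, err
	}
	for _, node := range nodes.Items {
		ready := false
		for _, cond := range node.Status.Conditions {
			if cond.Type == v1.NodeReady && cond.Status == v1.ConditionTrue {
				ready = true
			}
		}
		if !ready {
			return false, nil
		}
	}
	pods, err := cs.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{})
	if err != nil {
		return false, err
	}
	for _, pod := range pods.Items {
		if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodSucceeded {
			return false, nil
		}
	}
	return true, nil
}

func main() {
	// "kubeconfig" is a placeholder path for wherever your config lives.
	config, err := clientcmd.BuildConfigFromFlags("", "kubeconfig")
	if err != nil {
		panic(err)
	}
	cs, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}
	ok, err := clusterLooksValid(context.Background(), cs)
	fmt.Println(ok, err)
}
```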
diff --git a/docs/cli/kops_rolling-update_cluster.md b/docs/cli/kops_rolling-update_cluster.md
index 2cec8a76092c8..784e9a26a8d95 100644
--- a/docs/cli/kops_rolling-update_cluster.md
+++ b/docs/cli/kops_rolling-update_cluster.md
@@ -10,14 +10,20 @@ Rolling update a cluster.
 This command updates a kubernetes cluster to match the cloud, and kops specifications.

-To perform rolling update, you need to update the cloud resources first with "kops update cluster"
+To perform a rolling update, you need to update the cloud resources first with the command
+`kops update cluster`.

-Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
-prior to running "kops rolling-update cluster"
+If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
+rolled with the --force flag. Rolling update drains and validates the cluster by default. A cluster is
+deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
+When a node is deleted, rolling-update sleeps the interval for the node type, and then tries for the same period
+of time for the cluster to be validated. For instance, setting --master-interval=3m causes rolling-update
+to wait for 3m after a master is rolled, and another 3m for the cluster to stabilize and pass
+validation.

-Use `export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"` to use beta code that drains the nodes
-and validates the cluster. New flags for Drain and Validation operations will be shown when
-the environment variable is set.
+Note: terraform users will need to run the following commands all from the same directory:
+`kops update cluster --target=terraform` then
+`terraform plan` then `terraform apply` prior to running `kops rolling-update cluster`.

 ```
 kops rolling-update cluster
@@ -26,22 +32,33 @@ kops rolling-update cluster
 ```

 ### Examples

 ```
-  # Roll the currently selected kops cluster
+  # Preview a rolling-update
+  kops rolling-update cluster
+
+  # Roll the currently selected kops cluster with defaults.
+  # Nodes will be drained and the cluster will be validated between node replacements.
   kops rolling-update cluster --yes

   # Roll the k8s-cluster.example.com kops cluster
-  # use the new drain an validate functionality
-  export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
+  # do not fail if the cluster does not validate
+  # wait 8 min to create a new node, and at least 8 min
+  # to validate the cluster.
   kops rolling-update cluster k8s-cluster.example.com --yes \
     --fail-on-validate-error="false" \
     --master-interval=8m \
     --node-interval=8m
+
+  # Roll the k8s-cluster.example.com kops cluster
+  # do not validate the cluster because of the cloudonly flag.
+  # Force the entire cluster to roll, even if rolling update
+  # reports that the cluster does not need to be rolled.
+  kops rolling-update cluster k8s-cluster.example.com --yes \
+    --cloudonly \
+    --force

   # Roll the k8s-cluster.example.com kops cluster
   # only roll the node instancegroup
-  # use the new drain an validate functionality
-  export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
   kops rolling-update cluster k8s-cluster.example.com --yes \
     --fail-on-validate-error="false" \
     --node-interval 8m \
@@ -53,10 +70,12 @@ kops rolling-update cluster
 ```
      --bastion-interval duration    Time to wait between restarting bastions (default 5m0s)
      --cloudonly                    Perform rolling update without confirming progress with k8s
+     --fail-on-drain-error          The rolling-update will fail if draining a node fails. (default true)
+     --fail-on-validate-error       The rolling-update will fail if the cluster fails to validate. (default true)
      --force                        Force rolling update, even if no changes
      --instance-group stringSlice   List of instance groups to update (defaults to all if not specified)
      --master-interval duration     Time to wait between restarting masters (default 5m0s)
-     --node-interval duration       Time to wait between restarting nodes (default 2m0s)
+     --node-interval duration       Time to wait between restarting nodes (default 4m0s)
      --yes                          perform rolling update without confirmation
 ```
diff --git a/pkg/featureflag/featureflag.go b/pkg/featureflag/featureflag.go
index 03babde292d73..4c2d78b1ba9e8 100644
--- a/pkg/featureflag/featureflag.go
+++ b/pkg/featureflag/featureflag.go
@@ -40,7 +40,7 @@ func Bool(b bool) *bool {
 var DNSPreCreate = New("DNSPreCreate", Bool(true))

 // DrainAndValidateRollingUpdate if set will use new rolling update code that will drain and validate.
-var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(false))
+var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(true))

 // VPCSkipEnableDNSSupport if set will make that a VPC does not need DNSSupport enabled.
 var VPCSkipEnableDNSSupport = New("VPCSkipEnableDNSSupport", Bool(false))
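With the default flipped to true, users who want the old behavior need a way to opt out. Assuming the usual KOPS_FEATURE_FLAGS syntax, where +Name enables and -Name disables a flag, a toy lookup illustrates the idea; this is a sketch, not the actual pkg/featureflag parser:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// flagEnabled is a toy re-implementation (assumed to mirror kops's
// +Name/-Name convention in KOPS_FEATURE_FLAGS) showing how a
// default-true flag like DrainAndValidateRollingUpdate could still be
// switched off by a user.
func flagEnabled(name string, defaultValue bool) bool {
	for _, f := range strings.Split(os.Getenv("KOPS_FEATURE_FLAGS"), ",") {
		switch strings.TrimSpace(f) {
		case "+" + name:
			return true
		case "-" + name:
			return false
		}
	}
	return defaultValue
}

func main() {
	// e.g. KOPS_FEATURE_FLAGS="-DrainAndValidateRollingUpdate" would opt out of the new default
	fmt.Println(flagEnabled("DrainAndValidateRollingUpdate", true))
}
```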
diff --git a/pkg/instancegroups/instancegroups.go b/pkg/instancegroups/instancegroups.go
index 803be40137cca..31871f4d09105 100644
--- a/pkg/instancegroups/instancegroups.go
+++ b/pkg/instancegroups/instancegroups.go
@@ -297,9 +297,10 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
 		glog.Infof("Validating the cluster.")

-		if err = n.ValidateClusterWithRetries(rollingUpdateData, instanceGroupList, t); err != nil {
+		if err = n.ValidateClusterWithDuration(rollingUpdateData, instanceGroupList, t); err != nil {

 			if rollingUpdateData.FailOnValidate {
+				glog.Errorf("Cluster did not validate within the set duration of %q; you can retry, and possibly extend the duration", t)
 				return fmt.Errorf("error validating cluster after removing a node: %v", err)
 			}

@@ -311,25 +312,43 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust
 	return nil
 }

-// ValidateClusterWithRetries runs our validation methods on the K8s Cluster x times and then fails.
-func (n *CloudInstanceGroup) ValidateClusterWithRetries(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, t time.Duration) (err error) {
-
-	// TODO - We are going to need to improve Validate to allow for more than one node, not master
-	// TODO - going down at a time.
-	for i := 0; i <= rollingUpdateData.ValidateRetries; i++ {
+// ValidateClusterWithDuration runs validation.ValidateCluster until either we get a positive result or the timeout expires.
+func (n *CloudInstanceGroup) ValidateClusterWithDuration(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration) error {
+	// TODO should we expose this to the UI?
+	tickDuration := 30 * time.Second
+	// Try to validate the cluster at least once; this handles durations that are
+	// shorter than our tick time
+	if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
+		return nil
+	}

-		if _, err = validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
-			glog.Infof("Cluster did not validate, and waiting longer: %v.", err)
-			time.Sleep(t / 2)
-		} else {
-			glog.Infof("Cluster validated.")
-			return nil
+	timeout := time.After(duration)
+	tick := time.Tick(tickDuration)
+	// Keep trying until we time out or get a positive result
+	for {
+		select {
+		case <-timeout:
+			// Got a timeout; fail with a timeout error
+			return fmt.Errorf("cluster did not validate within a duration of %q", duration)
+		case <-tick:
+			// Got a tick; validate the cluster
+			if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
+				return nil
+			}
+			// The cluster did not validate yet; the select waits for the next tick or the timeout
 		}
 	}
+}

-	// for loop is done, and did not end when the cluster validated
-	return fmt.Errorf("cluster validation failed: %v", err)
+// tryValidateCluster runs validation.ValidateCluster once and reports whether the cluster validated.
+func (n *CloudInstanceGroup) tryValidateCluster(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration, tickDuration time.Duration) bool {
+	if _, err := validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
+		glog.Infof("Cluster did not validate, will try again in %q until duration %q expires: %v.", tickDuration, duration, err)
+		return false
+	} else {
+		glog.Infof("Cluster validated.")
+		return true
+	}
 }

 // ValidateCluster runs our validation methods on the K8s Cluster.
diff --git a/pkg/instancegroups/rollingupdate.go b/pkg/instancegroups/rollingupdate.go
index d5bb456d8de16..170940e0b931a 100644
--- a/pkg/instancegroups/rollingupdate.go
+++ b/pkg/instancegroups/rollingupdate.go
@@ -44,7 +44,6 @@ type RollingUpdateCluster struct {
 	FailOnValidate bool
 	CloudOnly      bool
 	ClusterName    string
-	ValidateRetries int
 	DrainInterval  time.Duration
 }

@@ -171,6 +170,6 @@ func (c *RollingUpdateCluster) RollingUpdate(groups map[string]*CloudInstanceGro
 		}
 	}

-	glog.Infof("Rolling update completed!")
+	glog.Infof("Rolling update completed for cluster %q!", c.ClusterName)
 	return nil
 }
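The new ValidateClusterWithDuration boils down to a poll-until-deadline loop. Here is a self-contained sketch of the same pattern, with a placeholder check standing in for validation.ValidateCluster:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// pollUntil retries check every tick until it succeeds or duration elapses.
// The placeholder check stands in for a real cluster-validation call.
func pollUntil(duration, tick time.Duration, check func() bool) error {
	// Try once up front so durations shorter than a tick still get a chance
	if check() {
		return nil
	}
	timeout := time.After(duration)
	ticker := time.NewTicker(tick)
	defer ticker.Stop() // release the ticker when we return
	for {
		select {
		case <-timeout:
			return errors.New("condition not met within duration")
		case <-ticker.C:
			if check() {
				return nil
			}
			// not yet; the select waits for the next tick or the timeout
		}
	}
}

func main() {
	start := time.Now()
	err := pollUntil(2*time.Second, 500*time.Millisecond, func() bool {
		return time.Since(start) > time.Second // placeholder for a real cluster check
	})
	fmt.Println(err) // <nil>
}
```

One design note: the sketch uses time.NewTicker with a deferred Stop rather than the time.Tick used in the patch, because time.Tick never releases its underlying ticker. That leak is harmless in a long-lived rolling update, but NewTicker is the safer general-purpose choice.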