Merge pull request #3329 from chrislovecnm/promote-drain-validate
Automatic merge from submit-queue.

promoting drain and validate by setting feature flag to true

I am unable to recreate #2407, and frankly, it may be an edge case.  We could warn a user if their wait times are low, but that would be another PR.

This PR moves the Drain and Validate functionality for rolling-updates into the default user experience by setting the feature flag to true.

Per feedback, I am using the node and master interval times for the validation.
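For readers skimming the change, the timing model per replaced node is roughly: sleep the configured interval, then spend up to that same interval waiting for validation. A minimal Go sketch of that semantics follows, with a hypothetical `validate` stub standing in for the real cluster check — an illustration of the behavior described above, not the kops code itself:

```go
package main

import (
	"fmt"
	"time"
)

// validate is a hypothetical stand-in for kops cluster validation, which
// checks that all required nodes are registered and that kube-system pods
// are operational.
func validate() error { return nil }

// rollNode illustrates the timing model: after a node is replaced, sleep
// the configured interval, then spend up to that same interval waiting for
// the cluster to validate, polling periodically.
func rollNode(interval time.Duration) error {
	time.Sleep(interval) // e.g. --node-interval=4m

	deadline := time.Now().Add(interval)
	for time.Now().Before(deadline) {
		if err := validate(); err == nil {
			return nil
		}
		time.Sleep(30 * time.Second) // kops polls on a ~30s tick
	}
	return fmt.Errorf("cluster did not validate within %s", interval)
}

func main() {
	fmt.Println(rollNode(10 * time.Millisecond))
}
```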
Kubernetes Submit Queue authored Sep 24, 2017
2 parents dcd0406 + 8dabeec commit ba42020
Showing 6 changed files with 126 additions and 65 deletions.
57 changes: 32 additions & 25 deletions cmd/kops/rollingupdatecluster.go
@@ -45,32 +45,50 @@ var (
rollingupdate_long = pretty.LongDesc(i18n.T(`
This command updates a kubernetes cluster to match the cloud and kops specifications.
To perform rolling update, you need to update the cloud resources first with "kops update cluster"
Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
prior to running "kops rolling-update cluster"
Use ` + pretty.Bash("export KOPS_FEATURE_FLAGS=\"+DrainAndValidateRollingUpdate\"") + ` to use beta code that drains the nodes
and validates the cluster. New flags for Drain and Validation operations will be shown when
the environment variable is set.`))
To perform a rolling update, you need to update the cloud resources first with the command
` + pretty.Bash("kops update cluster") + `.
If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
rolled with the --force flag. Rolling update drains and validates the cluster by default. A cluster is
deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
When a node is deleted, rolling-update sleeps for the interval configured for that node type, and then tries
for the same period of time for the cluster to validate. For instance, setting --master-interval=3m causes
rolling-update to wait 3m after a master is rolled, and then up to another 3m for the cluster to stabilize
and pass validation.
Note: terraform users will need to run the following commands all from the same directory:
` + pretty.Bash("kops update cluster --target=terraform") + ` then
` + pretty.Bash("terraform plan") + ` then ` + pretty.Bash("terraform apply") + `
prior to running ` + pretty.Bash("kops rolling-update cluster") + `.`))

rollingupdate_example = templates.Examples(i18n.T(`
# Roll the currently selected kops cluster
# Preview a rolling-update
kops rolling-update cluster
# Roll the currently selected kops cluster with defaults.
# Nodes will be drained and the cluster will be validated between node replacements
kops rolling-update cluster --yes
# Roll the k8s-cluster.example.com kops cluster
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
# do not fail if the cluster does not validate
# wait 8 min to create a new node, and at least 8 min
# to validate the cluster.
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--master-interval=8m \
--node-interval=8m
# Roll the k8s-cluster.example.com kops cluster
# do not validate the cluster because of the --cloudonly flag.
# Force the entire cluster to roll, even if rolling update
# reports that the cluster does not need to be rolled.
kops rolling-update cluster k8s-cluster.example.com --yes \
--cloudonly \
--force
# Roll the k8s-cluster.example.com kops cluster
# only roll the node instancegroup
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--node-interval 8m \
@@ -98,8 +116,6 @@ type RollingUpdateOptions struct {

DrainInterval time.Duration

ValidateRetries int

MasterInterval time.Duration
NodeInterval time.Duration
BastionInterval time.Duration
@@ -119,11 +135,9 @@ func (o *RollingUpdateOptions) InitDefaults() {
o.FailOnValidate = true

o.MasterInterval = 5 * time.Minute
o.NodeInterval = 2 * time.Minute
o.NodeInterval = 4 * time.Minute
o.BastionInterval = 5 * time.Minute

o.ValidateRetries = 8

o.DrainInterval = 90 * time.Second

}
@@ -152,8 +166,6 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
if featureflag.DrainAndValidateRollingUpdate.Enabled() {
cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "The rolling-update will fail if draining a node fails.")
cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "The rolling-update will fail if the cluster fails to validate.")
cmd.Flags().IntVar(&options.ValidateRetries, "validate-retries", options.ValidateRetries, "The number of times that a node will be validated. Between validation kops sleeps the master-interval/2 or node-interval/2 duration.")
cmd.Flags().DurationVar(&options.DrainInterval, "drain-interval", options.DrainInterval, "The duration that a rolling-update will wait after the node is drained.")
}

cmd.Run = func(cmd *cobra.Command, args []string) {
@@ -202,10 +214,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
return fmt.Errorf("cannot load kubecfg settings for %q: %v", contextName, err)
}

if options.ValidateRetries <= 0 {
return fmt.Errorf("validate-retries flag cannot be 0 or smaller")
}

var nodes []v1.Node
var k8sClient kubernetes.Interface
if !options.CloudOnly {
@@ -339,7 +347,7 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
}

if featureflag.DrainAndValidateRollingUpdate.Enabled() {
glog.V(2).Infof("New rolling update with drain and validate enabled.")
glog.V(2).Infof("Rolling update with drain and validate enabled.")
}
d := &instancegroups.RollingUpdateCluster{
MasterInterval: options.MasterInterval,
Expand All @@ -352,7 +360,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
FailOnValidate: options.FailOnValidate,
CloudOnly: options.CloudOnly,
ClusterName: options.ClusterName,
ValidateRetries: options.ValidateRetries,
DrainInterval: options.DrainInterval,
}
return d.RollingUpdate(groups, list)
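The flag registration above relies on cobra duration flags. A hedged, self-contained sketch of the same wiring with the new defaults — flag names and help strings mirror the diff, but the surrounding kops command setup is omitted and this is not the actual source:

```go
package main

import (
	"fmt"
	"os"
	"time"

	"github.com/spf13/cobra"
)

func main() {
	var masterInterval, nodeInterval, drainInterval time.Duration

	cmd := &cobra.Command{
		Use:   "rolling-update",
		Short: "Sketch of duration-based rolling-update flags",
		Run: func(cmd *cobra.Command, args []string) {
			fmt.Printf("master-interval=%s node-interval=%s drain-interval=%s\n",
				masterInterval, nodeInterval, drainInterval)
		},
	}

	// Defaults mirror InitDefaults in the diff above (NodeInterval is now 4m).
	cmd.Flags().DurationVar(&masterInterval, "master-interval", 5*time.Minute,
		"Time to wait between restarting masters")
	cmd.Flags().DurationVar(&nodeInterval, "node-interval", 4*time.Minute,
		"Time to wait between restarting nodes")
	cmd.Flags().DurationVar(&drainInterval, "drain-interval", 90*time.Second,
		"Time to wait after draining a node")

	if err := cmd.Execute(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```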
37 changes: 27 additions & 10 deletions docs/cli/kops_rolling-update.md
@@ -10,34 +10,51 @@ Rolling update a cluster.

This command updates a kubernetes cluster to match the cloud and kops specifications.

To perform rolling update, you need to update the cloud resources first with "kops update cluster"
To perform a rolling update, you need to update the cloud resources first with the command
`kops update cluster`.

Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
prior to running "kops rolling-update cluster"
If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
rolled with the --force flag. Rolling update drains and validates the cluster by default. A cluster is
deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
When a node is deleted, rolling-update sleeps for the interval configured for that node type, and then tries
for the same period of time for the cluster to validate. For instance, setting --master-interval=3m causes
rolling-update to wait 3m after a master is rolled, and then up to another 3m for the cluster to stabilize
and pass validation.

Use `export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"` to use beta code that drains the nodes
and validates the cluster. New flags for Drain and Validation operations will be shown when
the environment variable is set.
Note: terraform users will need to run the following commands all from the same directory:
`kops update cluster --target=terraform` then
`terraform plan` then `terraform apply` prior to running `kops rolling-update cluster`.

### Examples

```
# Roll the currently selected kops cluster
# Preview a rolling-update
kops rolling-update cluster
# Roll the currently selected kops cluster with defaults.
# Nodes will be drained and the cluster will be validated between node replacements
kops rolling-update cluster --yes
# Roll the k8s-cluster.example.com kops cluster
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
# do not fail if the cluster does not validate
# wait 8 min to create a new node, and at least 8 min
# to validate the cluster.
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--master-interval=8m \
--node-interval=8m
# Roll the k8s-cluster.example.com kops cluster
# do not validate the cluster because of the --cloudonly flag.
# Force the entire cluster to roll, even if rolling update
# reports that the cluster does not need to be rolled.
kops rolling-update cluster k8s-cluster.example.com --yes \
--cloudonly \
--force
# Roll the k8s-cluster.example.com kops cluster
# only roll the node instancegroup
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--node-interval 8m \
41 changes: 30 additions & 11 deletions docs/cli/kops_rolling-update_cluster.md
@@ -10,14 +10,20 @@ Rolling update a cluster.

This command updates a kubernetes cluster to match the cloud and kops specifications.

To perform rolling update, you need to update the cloud resources first with "kops update cluster"
To perform a rolling update, you need to update the cloud resources first with the command
`kops update cluster`.

Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
prior to running "kops rolling-update cluster"
If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
rolled with the --force flag. Rolling update drains and validates the cluster by default. A cluster is
deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
When a node is deleted, rolling-update sleeps for the interval configured for that node type, and then tries
for the same period of time for the cluster to validate. For instance, setting --master-interval=3m causes
rolling-update to wait 3m after a master is rolled, and then up to another 3m for the cluster to stabilize
and pass validation.

Use `export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"` to use beta code that drains the nodes
and validates the cluster. New flags for Drain and Validation operations will be shown when
the environment variable is set.
Note: terraform users will need to run the following commands all from the same directory:
`kops update cluster --target=terraform` then
`terraform plan` then `terraform apply` prior to running `kops rolling-update cluster`.

```
kops rolling-update cluster
@@ -26,22 +32,33 @@ kops rolling-update cluster
### Examples

```
# Roll the currently selected kops cluster
# Preview a rolling-update
kops rolling-update cluster
# Roll the currently selected kops cluster with defaults.
# Nodes will be drained and the cluster will be validated between node replacements
kops rolling-update cluster --yes
# Roll the k8s-cluster.example.com kops cluster
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
# do not fail if the cluster does not validate
# wait 8 min to create a new node, and at least 8 min
# to validate the cluster.
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--master-interval=8m \
--node-interval=8m
# Roll the k8s-cluster.example.com kops cluster
# do not validate the cluster because of the --cloudonly flag.
# Force the entire cluster to roll, even if rolling update
# reports that the cluster does not need to be rolled.
kops rolling-update cluster k8s-cluster.example.com --yes \
--cloudonly \
--force
# Roll the k8s-cluster.example.com kops cluster
# only roll the node instancegroup
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--node-interval 8m \
@@ -53,10 +70,12 @@ kops rolling-update cluster
```
--bastion-interval duration Time to wait between restarting bastions (default 5m0s)
--cloudonly Perform rolling update without confirming progress with k8s
--fail-on-drain-error The rolling-update will fail if draining a node fails. (default true)
--fail-on-validate-error The rolling-update will fail if the cluster fails to validate. (default true)
--force Force rolling update, even if no changes
--instance-group stringSlice List of instance groups to update (defaults to all if not specified)
--master-interval duration Time to wait between restarting masters (default 5m0s)
--node-interval duration Time to wait between restarting nodes (default 2m0s)
--node-interval duration Time to wait between restarting nodes (default 4m0s)
--yes perform rolling update without confirmation
```

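Taking the defaults in the flags table above, a back-of-the-envelope worst case for a full roll can be estimated by charging each instance its drain pause plus twice its interval (the post-roll wait, then a full validation budget). A sketch under those assumptions — the cluster shape and the per-instance accounting are illustrative, not taken from the kops source:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Defaults from the flags table above.
	masterInterval := 5 * time.Minute // wait after each master, plus validation budget
	nodeInterval := 4 * time.Minute   // wait after each node, plus validation budget
	drainInterval := 90 * time.Second // pause after draining each instance

	// Hypothetical cluster shape: 3 masters, 2 nodes.
	masters, nodes := 3, 2

	// Worst case per instance: drain pause + interval wait + full validation budget.
	perMaster := drainInterval + 2*masterInterval
	perNode := drainInterval + 2*nodeInterval

	total := time.Duration(masters)*perMaster + time.Duration(nodes)*perNode
	fmt.Println(total) // 3*11m30s + 2*9m30s = 53m30s
}
```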
2 changes: 1 addition & 1 deletion pkg/featureflag/featureflag.go
@@ -40,7 +40,7 @@ func Bool(b bool) *bool {
var DNSPreCreate = New("DNSPreCreate", Bool(true))

// DrainAndValidateRollingUpdate if set will use new rolling update code that will drain and validate.
var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(false))
var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(true))

// VPCSkipEnableDNSSupport if set will make that a VPC does not need DNSSupport enabled.
var VPCSkipEnableDNSSupport = New("VPCSkipEnableDNSSupport", Bool(false))
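For context, KOPS_FEATURE_FLAGS accepts `+Name` and `-Name` toggles, so with the default flipped to true, users should still be able to opt out with `export KOPS_FEATURE_FLAGS="-DrainAndValidateRollingUpdate"`. A simplified sketch of that parsing style — illustrative only; the real kops registry differs:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// parse applies "+Name"/"-Name" toggles from a comma-separated spec on top
// of the registered defaults. Hypothetical helper, not the kops implementation.
func parse(spec string, defaults map[string]bool) map[string]bool {
	flags := make(map[string]bool, len(defaults))
	for name, value := range defaults {
		flags[name] = value
	}
	for _, tok := range strings.Split(spec, ",") {
		tok = strings.TrimSpace(tok)
		if len(tok) < 2 {
			continue
		}
		switch tok[0] {
		case '+':
			flags[tok[1:]] = true
		case '-':
			flags[tok[1:]] = false
		}
	}
	return flags
}

func main() {
	defaults := map[string]bool{"DrainAndValidateRollingUpdate": true} // new default
	flags := parse(os.Getenv("KOPS_FEATURE_FLAGS"), defaults)
	fmt.Println(flags["DrainAndValidateRollingUpdate"])
}
```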
51 changes: 35 additions & 16 deletions pkg/instancegroups/instancegroups.go
@@ -297,9 +297,10 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust

glog.Infof("Validating the cluster.")

if err = n.ValidateClusterWithRetries(rollingUpdateData, instanceGroupList, t); err != nil {
if err = n.ValidateClusterWithDuration(rollingUpdateData, instanceGroupList, t); err != nil {

if rollingUpdateData.FailOnValidate {
glog.Errorf("Cluster did not validate within the set duration of %q, you can retry, and maybe extend the duration", t)
return fmt.Errorf("error validating cluster after removing a node: %v", err)
}

@@ -311,25 +312,43 @@
return nil
}

// ValidateClusterWithRetries runs our validation methods on the K8s Cluster x times and then fails.
func (n *CloudInstanceGroup) ValidateClusterWithRetries(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, t time.Duration) (err error) {

// TODO - We are going to need to improve Validate to allow for more than one node, not master
// TODO - going down at a time.
for i := 0; i <= rollingUpdateData.ValidateRetries; i++ {
// ValidateClusterWithDuration runs validation.ValidateCluster until either we get a positive result or the timeout expires
func (n *CloudInstanceGroup) ValidateClusterWithDuration(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration) error {
// TODO should we expose this to the UI?
tickDuration := 30 * time.Second
// Try to validate the cluster at least once; this handles durations that are
// lower than our tick time
if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
return nil
}

if _, err = validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
glog.Infof("Cluster did not validate, and waiting longer: %v.", err)
time.Sleep(t / 2)
} else {
glog.Infof("Cluster validated.")
return nil
timeout := time.After(duration)
tick := time.Tick(tickDuration)
// Keep trying until we time out, get a result, or get an error
for {
select {
case <-timeout:
// Got a timeout; fail with a timeout error
return fmt.Errorf("cluster did not validate within a duration of %q", duration)
case <-tick:
// Got a tick, validate cluster
if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
return nil
}
// ValidateCluster didn't succeed yet, so try again
// on the next tick of the loop
}

}
}

// for loop is done, and did not end when the cluster validated
return fmt.Errorf("cluster validation failed: %v", err)
func (n *CloudInstanceGroup) tryValidateCluster(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration, tickDuration time.Duration) bool {
if _, err := validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
glog.Infof("Cluster did not validate, will try again in %q util duration %q expires: %v.", tickDuration, duration, err)
return false
} else {
glog.Infof("Cluster validated.")
return true
}
}

// ValidateCluster runs our validation methods on the K8s Cluster.
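The new ValidateClusterWithDuration is the classic timeout-plus-tick select loop. A self-contained sketch of the same pattern with a hypothetical `check` function standing in for validation.ValidateCluster; note it uses time.NewTimer/time.NewTicker with deferred Stop, whereas the committed code's time.Tick never releases its ticker (harmless here, since a rolling-update process is short-lived):

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// waitUntilValid retries check on a fixed tick until it succeeds or the
// overall duration expires. check is a hypothetical stand-in for
// validation.ValidateCluster.
func waitUntilValid(check func() error, duration, tick time.Duration) error {
	// Try once up front so durations shorter than the tick still get a chance.
	if check() == nil {
		return nil
	}

	timer := time.NewTimer(duration)
	defer timer.Stop()
	ticker := time.NewTicker(tick)
	defer ticker.Stop()

	for {
		select {
		case <-timer.C:
			return fmt.Errorf("cluster did not validate within %s", duration)
		case <-ticker.C:
			if check() == nil {
				return nil
			}
			// Not valid yet; loop back and wait for the next tick or the timeout.
		}
	}
}

func main() {
	err := waitUntilValid(
		func() error { return errors.New("kube-system pod not ready") },
		2*time.Second, 500*time.Millisecond)
	fmt.Println(err) // times out after ~2s in this demo
}
```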
3 changes: 1 addition & 2 deletions pkg/instancegroups/rollingupdate.go
@@ -44,7 +44,6 @@ type RollingUpdateCluster struct {
FailOnValidate bool
CloudOnly bool
ClusterName string
ValidateRetries int
DrainInterval time.Duration
}

@@ -171,6 +170,6 @@ func (c *RollingUpdateCluster) RollingUpdate(groups map[string]*CloudInstanceGro
}
}

glog.Infof("Rolling update completed!")
glog.Infof("Rolling update completed for cluster %q!", c.ClusterName)
return nil
}
