Merge pull request #3329 from chrislovecnm/promote-drain-validate
Automatic merge from submit-queue.

promoting drain and validate by setting feature flag to true

I am unable to recreate #2407, and frankly, it may be an edge case.  We could warn a user if their wait times are low, but that would be another PR.

This PR moves the Drain and Validate functionality for rolling-updates into the default user experience by setting the feature flag to true.

Per feedback, I am using the node and master interval times for the validation.
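For readers skimming the change, the timing model per replaced node is roughly: sleep the configured interval, then spend up to that same interval waiting for validation. A minimal Go sketch of that semantics follows, with a hypothetical `validate` stub standing in for the real cluster check — an illustration of the behavior described above, not the kops code itself:

```go
package main

import (
	"fmt"
	"time"
)

// validate is a hypothetical stand-in for kops cluster validation, which
// checks that all required nodes are registered and that kube-system pods
// are operational.
func validate() error { return nil }

// rollNode illustrates the timing model: after a node is replaced, sleep
// the configured interval, then spend up to that same interval waiting for
// the cluster to validate, polling periodically.
func rollNode(interval time.Duration) error {
	time.Sleep(interval) // e.g. --node-interval=4m

	deadline := time.Now().Add(interval)
	for time.Now().Before(deadline) {
		if err := validate(); err == nil {
			return nil
		}
		time.Sleep(30 * time.Second) // kops polls on a ~30s tick
	}
	return fmt.Errorf("cluster did not validate within %s", interval)
}

func main() {
	fmt.Println(rollNode(10 * time.Millisecond))
}
```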
Kubernetes Submit Queue authored Sep 24, 2017
2 parents dcd0406 + 8dabeec commit ba42020
Showing 6 changed files with 126 additions and 65 deletions.
57 changes: 32 additions & 25 deletions cmd/kops/rollingupdatecluster.go
@@ -45,32 +45,50 @@ var (
rollingupdate_long = pretty.LongDesc(i18n.T(`
This command updates a kubernetes cluster to match the cloud and kops specifications.
To perform rolling update, you need to update the cloud resources first with "kops update cluster"
Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
prior to running "kops rolling-update cluster"
Use ` + pretty.Bash("export KOPS_FEATURE_FLAGS=\"+DrainAndValidateRollingUpdate\"") + ` to use beta code that drains the nodes
and validates the cluster. New flags for Drain and Validation operations will be shown when
the environment variable is set.`))
To perform a rolling update, you need to update the cloud resources first with the command
` + pretty.Bash("kops update cluster") + `.
If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
rolled with the --force flag. Rolling update drains and validates the cluster by default. A cluster is
deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
When a node is deleted, rolling-update sleeps for the interval configured for that node type, and then tries
for the same period of time for the cluster to validate. For instance, setting --master-interval=3m causes
rolling-update to wait 3m after a master is rolled, and then up to another 3m for the cluster to stabilize
and pass validation.
Note: terraform users will need to run the following commands all from the same directory:
` + pretty.Bash("kops update cluster --target=terraform") + ` then
` + pretty.Bash("terraform plan") + ` then ` + pretty.Bash("terraform apply") + `
prior to running ` + pretty.Bash("kops rolling-update cluster") + `.`))

rollingupdate_example = templates.Examples(i18n.T(`
# Roll the currently selected kops cluster
# Preview a rolling-update
kops rolling-update cluster
# Roll the currently selected kops cluster with defaults.
# Nodes will be drained and the cluster will be validated between node replacements
kops rolling-update cluster --yes
# Roll the k8s-cluster.example.com kops cluster
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
# do not fail if the cluster does not validate
# wait 8 min to create a new node, and at least 8 min
# to validate the cluster.
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--master-interval=8m \
--node-interval=8m
# Roll the k8s-cluster.example.com kops cluster
# do not validate the cluster because of the --cloudonly flag.
# Force the entire cluster to roll, even if rolling update
# reports that the cluster does not need to be rolled.
kops rolling-update cluster k8s-cluster.example.com --yes \
--cloudonly \
--force
# Roll the k8s-cluster.example.com kops cluster
# only roll the node instancegroup
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--node-interval 8m \
@@ -98,8 +116,6 @@ type RollingUpdateOptions struct {

DrainInterval time.Duration

ValidateRetries int

MasterInterval time.Duration
NodeInterval time.Duration
BastionInterval time.Duration
@@ -119,11 +135,9 @@ func (o *RollingUpdateOptions) InitDefaults() {
o.FailOnValidate = true

o.MasterInterval = 5 * time.Minute
o.NodeInterval = 2 * time.Minute
o.NodeInterval = 4 * time.Minute
o.BastionInterval = 5 * time.Minute

o.ValidateRetries = 8

o.DrainInterval = 90 * time.Second

}
@@ -152,8 +166,6 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
if featureflag.DrainAndValidateRollingUpdate.Enabled() {
cmd.Flags().BoolVar(&options.FailOnDrainError, "fail-on-drain-error", true, "The rolling-update will fail if draining a node fails.")
cmd.Flags().BoolVar(&options.FailOnValidate, "fail-on-validate-error", true, "The rolling-update will fail if the cluster fails to validate.")
cmd.Flags().IntVar(&options.ValidateRetries, "validate-retries", options.ValidateRetries, "The number of times that a node will be validated. Between validation kops sleeps the master-interval/2 or node-interval/2 duration.")
cmd.Flags().DurationVar(&options.DrainInterval, "drain-interval", options.DrainInterval, "The duration that a rolling-update will wait after the node is drained.")
}

cmd.Run = func(cmd *cobra.Command, args []string) {
@@ -202,10 +214,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
return fmt.Errorf("cannot load kubecfg settings for %q: %v", contextName, err)
}

if options.ValidateRetries <= 0 {
return fmt.Errorf("validate-retries flag cannot be 0 or smaller")
}

var nodes []v1.Node
var k8sClient kubernetes.Interface
if !options.CloudOnly {
@@ -339,7 +347,7 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
}

if featureflag.DrainAndValidateRollingUpdate.Enabled() {
glog.V(2).Infof("New rolling update with drain and validate enabled.")
glog.V(2).Infof("Rolling update with drain and validate enabled.")
}
d := &instancegroups.RollingUpdateCluster{
MasterInterval: options.MasterInterval,
Expand All @@ -352,7 +360,6 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
FailOnValidate: options.FailOnValidate,
CloudOnly: options.CloudOnly,
ClusterName: options.ClusterName,
ValidateRetries: options.ValidateRetries,
DrainInterval: options.DrainInterval,
}
return d.RollingUpdate(groups, list)
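The flag registration above relies on cobra duration flags. A hedged, self-contained sketch of the same wiring with the new defaults — flag names and help strings mirror the diff, but the surrounding kops command setup is omitted and this is not the actual source:

```go
package main

import (
	"fmt"
	"os"
	"time"

	"github.com/spf13/cobra"
)

func main() {
	var masterInterval, nodeInterval, drainInterval time.Duration

	cmd := &cobra.Command{
		Use:   "rolling-update",
		Short: "Sketch of duration-based rolling-update flags",
		Run: func(cmd *cobra.Command, args []string) {
			fmt.Printf("master-interval=%s node-interval=%s drain-interval=%s\n",
				masterInterval, nodeInterval, drainInterval)
		},
	}

	// Defaults mirror InitDefaults in the diff above (NodeInterval is now 4m).
	cmd.Flags().DurationVar(&masterInterval, "master-interval", 5*time.Minute,
		"Time to wait between restarting masters")
	cmd.Flags().DurationVar(&nodeInterval, "node-interval", 4*time.Minute,
		"Time to wait between restarting nodes")
	cmd.Flags().DurationVar(&drainInterval, "drain-interval", 90*time.Second,
		"Time to wait after draining a node")

	if err := cmd.Execute(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```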
37 changes: 27 additions & 10 deletions docs/cli/kops_rolling-update.md
@@ -10,34 +10,51 @@ Rolling update a cluster.

This command updates a kubernetes cluster to match the cloud and kops specifications.

To perform rolling update, you need to update the cloud resources first with "kops update cluster"
To perform a rolling update, you need to update the cloud resources first with the command
`kops update cluster`.

Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
prior to running "kops rolling-update cluster"
If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
rolled with the --force flag. Rolling update drains and validates the cluster by default. A cluster is
deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
When a node is deleted, rolling-update sleeps for the interval configured for that node type, and then tries
for the same period of time for the cluster to validate. For instance, setting --master-interval=3m causes
rolling-update to wait 3m after a master is rolled, and then up to another 3m for the cluster to stabilize
and pass validation.

Use `export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"` to use beta code that drains the nodes
and validates the cluster. New flags for Drain and Validation operations will be shown when
the environment variable is set.
Note: terraform users will need to run the following commands all from the same directory:
`kops update cluster --target=terraform` then
`terraform plan` then `terraform apply` prior to running `kops rolling-update cluster`.

### Examples

```
# Roll the currently selected kops cluster
# Preview a rolling-update
kops rolling-update cluster
# Roll the currently selected kops cluster with defaults.
# Nodes will be drained and the cluster will be validated between node replacements
kops rolling-update cluster --yes
# Roll the k8s-cluster.example.com kops cluster
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
# do not fail if the cluster does not validate
# wait 8 min to create a new node, and at least 8 min
# to validate the cluster.
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--master-interval=8m \
--node-interval=8m
# Roll the k8s-cluster.example.com kops cluster
# do not validate the cluster because of the --cloudonly flag.
# Force the entire cluster to roll, even if rolling update
# reports that the cluster does not need to be rolled.
kops rolling-update cluster k8s-cluster.example.com --yes \
--cloudonly \
--force
# Roll the k8s-cluster.example.com kops cluster
# only roll the node instancegroup
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--node-interval 8m \
41 changes: 30 additions & 11 deletions docs/cli/kops_rolling-update_cluster.md
@@ -10,14 +10,20 @@ Rolling update a cluster.

This command updates a kubernetes cluster to match the cloud and kops specifications.

To perform rolling update, you need to update the cloud resources first with "kops update cluster"
To perform a rolling update, you need to update the cloud resources first with the command
`kops update cluster`.

Note: terraform users will need run the following commands all from the same directory "kops update cluster --target=terraform" then "terraform plan" then "terraform apply"
prior to running "kops rolling-update cluster"
If rolling-update does not report that the cluster needs to be rolled, you can force the cluster to be
rolled with the --force flag. Rolling update drains and validates the cluster by default. A cluster is
deemed validated when all required nodes are running and all pods in the kube-system namespace are operational.
When a node is deleted, rolling-update sleeps for the interval configured for that node type, and then tries
for the same period of time for the cluster to validate. For instance, setting --master-interval=3m causes
rolling-update to wait 3m after a master is rolled, and then up to another 3m for the cluster to stabilize
and pass validation.

Use `export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"` to use beta code that drains the nodes
and validates the cluster. New flags for Drain and Validation operations will be shown when
the environment variable is set.
Note: terraform users will need to run the following commands all from the same directory:
`kops update cluster --target=terraform` then
`terraform plan` then `terraform apply` prior to running `kops rolling-update cluster`.

```
kops rolling-update cluster
@@ -26,22 +32,33 @@ kops rolling-update cluster
### Examples

```
# Roll the currently selected kops cluster
# Preview a rolling-update
kops rolling-update cluster
# Roll the currently selected kops cluster with defaults.
# Nodes will be drained and the cluster will be validated between node replacements
kops rolling-update cluster --yes
# Roll the k8s-cluster.example.com kops cluster
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
# do not fail if the cluster does not validate
# wait 8 min to create a new node, and at least 8 min
# to validate the cluster.
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--master-interval=8m \
--node-interval=8m
# Roll the k8s-cluster.example.com kops cluster
# do not validate the cluster because of the --cloudonly flag.
# Force the entire cluster to roll, even if rolling update
# reports that the cluster does not need to be rolled.
kops rolling-update cluster k8s-cluster.example.com --yes \
--cloudonly \
--force
# Roll the k8s-cluster.example.com kops cluster
# only roll the node instancegroup
# use the new drain and validate functionality
export KOPS_FEATURE_FLAGS="+DrainAndValidateRollingUpdate"
kops rolling-update cluster k8s-cluster.example.com --yes \
--fail-on-validate-error="false" \
--node-interval 8m \
@@ -53,10 +70,12 @@ kops rolling-update cluster
```
--bastion-interval duration Time to wait between restarting bastions (default 5m0s)
--cloudonly Perform rolling update without confirming progress with k8s
--fail-on-drain-error The rolling-update will fail if draining a node fails. (default true)
--fail-on-validate-error The rolling-update will fail if the cluster fails to validate. (default true)
--force Force rolling update, even if no changes
--instance-group stringSlice List of instance groups to update (defaults to all if not specified)
--master-interval duration Time to wait between restarting masters (default 5m0s)
--node-interval duration Time to wait between restarting nodes (default 2m0s)
--node-interval duration Time to wait between restarting nodes (default 4m0s)
--yes perform rolling update without confirmation
```

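Taking the defaults in the flags table above, a back-of-the-envelope worst case for a full roll can be estimated by charging each instance its drain pause plus twice its interval (the post-roll wait, then a full validation budget). A sketch under those assumptions — the cluster shape and the per-instance accounting are illustrative, not taken from the kops source:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Defaults from the flags table above.
	masterInterval := 5 * time.Minute // wait after each master, plus validation budget
	nodeInterval := 4 * time.Minute   // wait after each node, plus validation budget
	drainInterval := 90 * time.Second // pause after draining each instance

	// Hypothetical cluster shape: 3 masters, 2 nodes.
	masters, nodes := 3, 2

	// Worst case per instance: drain pause + interval wait + full validation budget.
	perMaster := drainInterval + 2*masterInterval
	perNode := drainInterval + 2*nodeInterval

	total := time.Duration(masters)*perMaster + time.Duration(nodes)*perNode
	fmt.Println(total) // 3*11m30s + 2*9m30s = 53m30s
}
```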
2 changes: 1 addition & 1 deletion pkg/featureflag/featureflag.go
@@ -40,7 +40,7 @@ func Bool(b bool) *bool {
var DNSPreCreate = New("DNSPreCreate", Bool(true))

// DrainAndValidateRollingUpdate if set will use new rolling update code that will drain and validate.
var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(false))
var DrainAndValidateRollingUpdate = New("DrainAndValidateRollingUpdate", Bool(true))

// VPCSkipEnableDNSSupport if set will make that a VPC does not need DNSSupport enabled.
var VPCSkipEnableDNSSupport = New("VPCSkipEnableDNSSupport", Bool(false))
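For context, KOPS_FEATURE_FLAGS accepts `+Name` and `-Name` toggles, so with the default flipped to true, users should still be able to opt out with `export KOPS_FEATURE_FLAGS="-DrainAndValidateRollingUpdate"`. A simplified sketch of that parsing style — illustrative only; the real kops registry differs:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// parse applies "+Name"/"-Name" toggles from a comma-separated spec on top
// of the registered defaults. Hypothetical helper, not the kops implementation.
func parse(spec string, defaults map[string]bool) map[string]bool {
	flags := make(map[string]bool, len(defaults))
	for name, value := range defaults {
		flags[name] = value
	}
	for _, tok := range strings.Split(spec, ",") {
		tok = strings.TrimSpace(tok)
		if len(tok) < 2 {
			continue
		}
		switch tok[0] {
		case '+':
			flags[tok[1:]] = true
		case '-':
			flags[tok[1:]] = false
		}
	}
	return flags
}

func main() {
	defaults := map[string]bool{"DrainAndValidateRollingUpdate": true} // new default
	flags := parse(os.Getenv("KOPS_FEATURE_FLAGS"), defaults)
	fmt.Println(flags["DrainAndValidateRollingUpdate"])
}
```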
51 changes: 35 additions & 16 deletions pkg/instancegroups/instancegroups.go
@@ -297,9 +297,10 @@ func (n *CloudInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateClust

glog.Infof("Validating the cluster.")

if err = n.ValidateClusterWithRetries(rollingUpdateData, instanceGroupList, t); err != nil {
if err = n.ValidateClusterWithDuration(rollingUpdateData, instanceGroupList, t); err != nil {

if rollingUpdateData.FailOnValidate {
glog.Errorf("Cluster did not validate within the set duration of %q, you can retry, and maybe extend the duration", t)
return fmt.Errorf("error validating cluster after removing a node: %v", err)
}

@@ -311,25 +312,43 @@
return nil
}

// ValidateClusterWithRetries runs our validation methods on the K8s Cluster x times and then fails.
func (n *CloudInstanceGroup) ValidateClusterWithRetries(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, t time.Duration) (err error) {

// TODO - We are going to need to improve Validate to allow for more than one node, not master
// TODO - going down at a time.
for i := 0; i <= rollingUpdateData.ValidateRetries; i++ {
// ValidateClusterWithDuration runs validation.ValidateCluster until either we get a positive result or the timeout expires
func (n *CloudInstanceGroup) ValidateClusterWithDuration(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration) error {
// TODO should we expose this to the UI?
tickDuration := 30 * time.Second
// Try to validate the cluster at least once; this handles durations that are
// lower than our tick time
if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
return nil
}

if _, err = validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
glog.Infof("Cluster did not validate, and waiting longer: %v.", err)
time.Sleep(t / 2)
} else {
glog.Infof("Cluster validated.")
return nil
timeout := time.After(duration)
tick := time.Tick(tickDuration)
// Keep trying until we time out, get a result, or get an error
for {
select {
case <-timeout:
// Got a timeout; fail with a timeout error
return fmt.Errorf("cluster did not validate within a duration of %q", duration)
case <-tick:
// Got a tick, validate cluster
if n.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
return nil
}
// ValidateCluster didn't succeed yet, so try again
// on the next tick of the loop
}

}
}

// for loop is done, and did not end when the cluster validated
return fmt.Errorf("cluster validation failed: %v", err)
func (n *CloudInstanceGroup) tryValidateCluster(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration, tickDuration time.Duration) bool {
if _, err := validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
glog.Infof("Cluster did not validate, will try again in %q util duration %q expires: %v.", tickDuration, duration, err)
return false
} else {
glog.Infof("Cluster validated.")
return true
}
}

// ValidateCluster runs our validation methods on the K8s Cluster.
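The new ValidateClusterWithDuration is the classic timeout-plus-tick select loop. A self-contained sketch of the same pattern with a hypothetical `check` function standing in for validation.ValidateCluster; note it uses time.NewTimer/time.NewTicker with deferred Stop, whereas the committed code's time.Tick never releases its ticker (harmless here, since a rolling-update process is short-lived):

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// waitUntilValid retries check on a fixed tick until it succeeds or the
// overall duration expires. check is a hypothetical stand-in for
// validation.ValidateCluster.
func waitUntilValid(check func() error, duration, tick time.Duration) error {
	// Try once up front so durations shorter than the tick still get a chance.
	if check() == nil {
		return nil
	}

	timer := time.NewTimer(duration)
	defer timer.Stop()
	ticker := time.NewTicker(tick)
	defer ticker.Stop()

	for {
		select {
		case <-timer.C:
			return fmt.Errorf("cluster did not validate within %s", duration)
		case <-ticker.C:
			if check() == nil {
				return nil
			}
			// Not valid yet; loop back and wait for the next tick or the timeout.
		}
	}
}

func main() {
	err := waitUntilValid(
		func() error { return errors.New("kube-system pod not ready") },
		2*time.Second, 500*time.Millisecond)
	fmt.Println(err) // times out after ~2s in this demo
}
```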
3 changes: 1 addition & 2 deletions pkg/instancegroups/rollingupdate.go
@@ -44,7 +44,6 @@ type RollingUpdateCluster struct {
FailOnValidate bool
CloudOnly bool
ClusterName string
ValidateRetries int
DrainInterval time.Duration
}

@@ -171,6 +170,6 @@ func (c *RollingUpdateCluster) RollingUpdate(groups map[string]*CloudInstanceGro
}
}

glog.Infof("Rolling update completed!")
glog.Infof("Rolling update completed for cluster %q!", c.ClusterName)
return nil
}
