diff --git a/cluster/cluster.go b/cluster/cluster.go
index bb7d1333f..e55ee9664 100644
--- a/cluster/cluster.go
+++ b/cluster/cluster.go
@@ -166,6 +166,7 @@ func (c *Cluster) UpgradeControlPlane(ctx context.Context, kubeClient *kubernete
 	inactiveHosts := make(map[string]bool)
 	var controlPlaneHosts, notReadyHosts []*hosts.Host
 	var notReadyHostNames []string
+	var err error
 
 	for _, host := range c.InactiveHosts {
 		// include only hosts with controlplane role
@@ -173,6 +174,10 @@ func (c *Cluster) UpgradeControlPlane(ctx context.Context, kubeClient *kubernete
 			inactiveHosts[host.HostnameOverride] = true
 		}
 	}
+	c.MaxUnavailableForControlNodes, err = services.ResetMaxUnavailable(c.MaxUnavailableForControlNodes, len(inactiveHosts), services.ControlRole)
+	if err != nil {
+		return "", err
+	}
 	for _, host := range c.ControlPlaneHosts {
 		if !c.HostsLabeledToIgnoreUpgrade[host.Address] {
 			controlPlaneHosts = append(controlPlaneHosts, host)
@@ -265,6 +270,7 @@ func (c *Cluster) UpgradeWorkerPlane(ctx context.Context, kubeClient *kubernetes
 	inactiveHosts := make(map[string]bool)
 	var notReadyHosts []*hosts.Host
 	var notReadyHostNames []string
+	var err error
 
 	for _, host := range c.InactiveHosts {
 		// if host has controlplane role, it already has worker components upgraded
@@ -272,6 +278,10 @@ func (c *Cluster) UpgradeWorkerPlane(ctx context.Context, kubeClient *kubernetes
 			inactiveHosts[host.HostnameOverride] = true
 		}
 	}
+	c.MaxUnavailableForWorkerNodes, err = services.ResetMaxUnavailable(c.MaxUnavailableForWorkerNodes, len(inactiveHosts), services.WorkerRole)
+	if err != nil {
+		return "", err
+	}
 	for _, host := range append(etcdAndWorkerHosts, workerOnlyHosts...) {
 		if c.NewHosts[host.HostnameOverride] {
 			continue
diff --git a/cluster/defaults.go b/cluster/defaults.go
index 30e96df03..a673f8907 100644
--- a/cluster/defaults.go
+++ b/cluster/defaults.go
@@ -603,7 +603,7 @@ func GetExternalFlags(local, updateOnly, disablePortCheck bool, configDir, clust
 func (c *Cluster) setAddonsDefaults() {
 	c.Ingress.UpdateStrategy = setDaemonsetAddonDefaults(c.Ingress.UpdateStrategy)
 	c.Network.UpdateStrategy = setDaemonsetAddonDefaults(c.Network.UpdateStrategy)
-	c.DNS.UpdateStrategy = setDeploymentAddonDefaults(c.DNS.UpdateStrategy)
+	c.DNS.UpdateStrategy = setDNSDeploymentAddonDefaults(c.DNS.UpdateStrategy, c.DNS.Provider)
 	if c.DNS.LinearAutoscalerParams == nil {
 		c.DNS.LinearAutoscalerParams = &DefaultClusterProportionalAutoscalerLinearParams
 	}
@@ -638,3 +638,43 @@ func setDeploymentAddonDefaults(updateStrategy *appsv1.DeploymentStrategy) *apps
 	}
 	return updateStrategy
 }
+
+func setDNSDeploymentAddonDefaults(updateStrategy *appsv1.DeploymentStrategy, dnsProvider string) *appsv1.DeploymentStrategy {
+	var (
+		coreDNSMaxUnavailable, coreDNSMaxSurge = intstr.FromInt(1), intstr.FromInt(0)
+		kubeDNSMaxSurge, kubeDNSMaxUnavailable = intstr.FromString("10%"), intstr.FromInt(0)
+	)
+	if updateStrategy != nil && updateStrategy.Type != appsv1.RollingUpdateDeploymentStrategyType {
+		return updateStrategy
+	}
+	switch dnsProvider {
+	case CoreDNSProvider:
+		if updateStrategy == nil || updateStrategy.RollingUpdate == nil {
+			return &appsv1.DeploymentStrategy{
+				Type: appsv1.RollingUpdateDeploymentStrategyType,
+				RollingUpdate: &appsv1.RollingUpdateDeployment{
+					MaxUnavailable: &coreDNSMaxUnavailable,
+					MaxSurge:       &coreDNSMaxSurge,
+				},
+			}
+		}
+		if updateStrategy.RollingUpdate.MaxUnavailable == nil {
+			updateStrategy.RollingUpdate.MaxUnavailable = &coreDNSMaxUnavailable
+		}
+	case KubeDNSProvider:
+		if updateStrategy == nil || updateStrategy.RollingUpdate == nil {
+			return &appsv1.DeploymentStrategy{
+				Type: appsv1.RollingUpdateDeploymentStrategyType,
+				RollingUpdate: &appsv1.RollingUpdateDeployment{
+					MaxUnavailable: &kubeDNSMaxUnavailable,
+					MaxSurge:       &kubeDNSMaxSurge,
+				},
+			}
+		}
+		if updateStrategy.RollingUpdate.MaxSurge == nil {
+			updateStrategy.RollingUpdate.MaxSurge = &kubeDNSMaxSurge
+		}
+	}
+
+	return updateStrategy
+}
diff --git a/services/controlplane.go b/services/controlplane.go
index a8b791de0..473af4006 100644
--- a/services/controlplane.go
+++ b/services/controlplane.go
@@ -72,16 +72,22 @@ func UpgradeControlPlaneNodes(ctx context.Context, kubeClient *kubernetes.Client
 		drainHelper = getDrainHelper(kubeClient, *upgradeStrategy)
 		log.Infof(ctx, "[%s] Parameters provided to drain command: %#v", ControlRole, fmt.Sprintf("Force: %v, IgnoreAllDaemonSets: %v, DeleteLocalData: %v, Timeout: %v, GracePeriodSeconds: %v", drainHelper.Force, drainHelper.IgnoreAllDaemonSets, drainHelper.DeleteLocalData, drainHelper.Timeout, drainHelper.GracePeriodSeconds))
 	}
-	maxUnavailable, err := resetMaxUnavailable(maxUnavailable, len(inactiveHosts), ControlRole)
-	if err != nil {
-		return errMsgMaxUnavailableNotFailed, err
+	var inactiveHostErr error
+	if len(inactiveHosts) > 0 {
+		var inactiveHostNames []string
+		for hostName := range inactiveHosts {
+			inactiveHostNames = append(inactiveHostNames, hostName)
+		}
+		inactiveHostErr = fmt.Errorf("provisioning incomplete, host(s) [%s] skipped because they could not be contacted", strings.Join(inactiveHostNames, ","))
 	}
 	hostsFailedToUpgrade, err := processControlPlaneForUpgrade(ctx, kubeClient, controlHosts, localConnDialerFactory, prsMap, cpNodePlanMap, updateWorkersOnly, alpineImage, certMap, upgradeStrategy, newHosts, inactiveHosts, maxUnavailable, drainHelper)
-	if err != nil {
-		logrus.Errorf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
-		errMsgMaxUnavailableNotFailed = fmt.Sprintf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
-		return errMsgMaxUnavailableNotFailed, err
+	if err != nil || inactiveHostErr != nil {
+		if len(hostsFailedToUpgrade) > 0 {
+			logrus.Errorf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
+			errMsgMaxUnavailableNotFailed = fmt.Sprintf("Failed to upgrade hosts: %v with error %v", strings.Join(hostsFailedToUpgrade, ","), err)
+		}
+		return errMsgMaxUnavailableNotFailed, util.ErrList([]error{err, inactiveHostErr})
 	}
 	log.Infof(ctx, "[%s] Successfully upgraded Controller Plane..", ControlRole)
 	return errMsgMaxUnavailableNotFailed, nil
diff --git a/services/node_util.go b/services/node_util.go
index 6a373050c..15b7ff85d 100644
--- a/services/node_util.go
+++ b/services/node_util.go
@@ -115,7 +115,7 @@ func CalculateMaxUnavailable(maxUnavailableVal string, numHosts int, role string
 	return maxUnavailable, nil
 }
 
-func resetMaxUnavailable(maxUnavailable, lenInactiveHosts int, component string) (int, error) {
+func ResetMaxUnavailable(maxUnavailable, lenInactiveHosts int, component string) (int, error) {
 	if maxUnavailable > WorkerThreads {
 		/* upgrading a large number of nodes in parallel leads to a large number of goroutines, which has led to errors regarding too many open sockets
 		Because of this RKE switched to using workerpools. 50 workerthreads has been sufficient to optimize rke up, upgrading at most 50 nodes in parallel.
diff --git a/services/workerplane.go b/services/workerplane.go
index a9039b3ff..2c8282345 100644
--- a/services/workerplane.go
+++ b/services/workerplane.go
@@ -55,10 +55,6 @@ func RunWorkerPlane(ctx context.Context, allHosts []*hosts.Host, localConnDialer
 func UpgradeWorkerPlaneForWorkerAndEtcdNodes(ctx context.Context, kubeClient *kubernetes.Clientset, mixedRolesHosts []*hosts.Host, workerOnlyHosts []*hosts.Host, inactiveHosts map[string]bool, localConnDialerFactory hosts.DialerFactory, prsMap map[string]v3.PrivateRegistry, workerNodePlanMap map[string]v3.RKEConfigNodePlan, certMap map[string]pki.CertificatePKI, updateWorkersOnly bool, alpineImage string, upgradeStrategy *v3.NodeUpgradeStrategy, newHosts map[string]bool, maxUnavailable int) (string, error) {
 	log.Infof(ctx, "[%s] Upgrading Worker Plane..", WorkerRole)
 	var errMsgMaxUnavailableNotFailed string
-	maxUnavailable, err := resetMaxUnavailable(maxUnavailable, len(inactiveHosts), WorkerRole)
-	if err != nil {
-		return errMsgMaxUnavailableNotFailed, err
-	}
 	updateNewHostsList(kubeClient, append(mixedRolesHosts, workerOnlyHosts...), newHosts)
 	if len(mixedRolesHosts) > 0 {
 		log.Infof(ctx, "First checking and processing worker components for upgrades on nodes with etcd role one at a time")
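The defaults.go hunk is where the provider-aware behavior lands: with the new setDNSDeploymentAddonDefaults, a CoreDNS deployment defaults to maxUnavailable=1 / maxSurge=0, a kube-dns deployment defaults to maxUnavailable=0 / maxSurge=10%, and a user-supplied RollingUpdate only has its missing field filled in. The standalone sketch below just illustrates those resulting strategies; it assumes the usual k8s.io/api and k8s.io/apimachinery modules, and the provider strings are local stand-ins for rke's CoreDNSProvider and KubeDNSProvider constants, not the real cluster package.

```go
// Illustration only: a trimmed-down stand-in for setDNSDeploymentAddonDefaults
// showing the per-provider defaults introduced by this change.
package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

func dnsDefaults(provider string) *appsv1.DeploymentStrategy {
	coreDNSMaxUnavailable, coreDNSMaxSurge := intstr.FromInt(1), intstr.FromInt(0)
	kubeDNSMaxSurge, kubeDNSMaxUnavailable := intstr.FromString("10%"), intstr.FromInt(0)

	switch provider {
	case "coredns": // stand-in for CoreDNSProvider
		return &appsv1.DeploymentStrategy{
			Type: appsv1.RollingUpdateDeploymentStrategyType,
			RollingUpdate: &appsv1.RollingUpdateDeployment{
				MaxUnavailable: &coreDNSMaxUnavailable, // roll one pod at a time, no surge
				MaxSurge:       &coreDNSMaxSurge,
			},
		}
	case "kube-dns": // stand-in for KubeDNSProvider
		return &appsv1.DeploymentStrategy{
			Type: appsv1.RollingUpdateDeploymentStrategyType,
			RollingUpdate: &appsv1.RollingUpdateDeployment{
				MaxUnavailable: &kubeDNSMaxUnavailable, // keep every replica serving, surge by 10%
				MaxSurge:       &kubeDNSMaxSurge,
			},
		}
	}
	return nil
}

func main() {
	for _, p := range []string{"coredns", "kube-dns"} {
		s := dnsDefaults(p)
		fmt.Printf("%s: maxUnavailable=%s maxSurge=%s\n",
			p, s.RollingUpdate.MaxUnavailable.String(), s.RollingUpdate.MaxSurge.String())
	}
}
```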
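The other thread of the diff moves the max-unavailable adjustment out of the services layer and into cluster.UpgradeControlPlane / UpgradeWorkerPlane, which call the now-exported services.ResetMaxUnavailable, while the controlplane path surfaces skipped inactive hosts as an error combined via util.ErrList. The hunk only shows the head of ResetMaxUnavailable, so the sketch below is an assumption about its behavior (cap at the 50-thread worker pool, then subtract inactive hosts while keeping at least one upgrade slot), not a copy of the real implementation in services/node_util.go.

```go
// Hypothetical sketch of what services.ResetMaxUnavailable is assumed to do;
// only its signature and the WorkerThreads cap are visible in this diff.
package main

import "fmt"

const workerThreads = 50 // stand-in for services.WorkerThreads

func resetMaxUnavailableSketch(maxUnavailable, lenInactiveHosts int, component string) (int, error) {
	if maxUnavailable > workerThreads {
		// RKE runs upgrades through a worker pool; more than 50 nodes in
		// parallel led to "too many open sockets" failures, so cap it here.
		maxUnavailable = workerThreads
	}
	if lenInactiveHosts > 0 {
		// Assumed: inactive hosts already count against the unavailability
		// budget, so fewer nodes may be upgraded in parallel.
		maxUnavailable -= lenInactiveHosts
		if maxUnavailable < 1 {
			return 0, fmt.Errorf("not enough %s nodes available for upgrade: %d host(s) cannot be contacted", component, lenInactiveHosts)
		}
	}
	return maxUnavailable, nil
}

func main() {
	got, err := resetMaxUnavailableSketch(3, 1, "worker")
	fmt.Println(got, err) // 2 <nil>
}
```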