-
Notifications
You must be signed in to change notification settings - Fork 163
/
Copy pathobserve_termination_duration.go
132 lines (115 loc) · 6.27 KB
/
observe_termination_duration.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
package apiserver
import (
"fmt"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/cluster-kube-apiserver-operator/pkg/operator/configobservation"
"github.com/openshift/library-go/pkg/operator/configobserver"
"github.com/openshift/library-go/pkg/operator/events"
)
// Locations, inside the observed config, of the fields maintained by the observers in this file.
var (
	// shutdownDelayDurationPath points at the shutdown-delay-duration entry under apiServerArguments.
	shutdownDelayDurationPath = []string{"apiServerArguments", "shutdown-delay-duration"}

	// gracefulTerminationDurationPath points at the top-level gracefulTerminationDuration field.
	gracefulTerminationDurationPath = []string{"gracefulTerminationDuration"}
)
// ObserveShutdownDelayDuration allows for overwriting shutdown-delay-duration value.
// It exists because the time needed for an LB to notice and remove unhealthy instances might vary by platform.
//
// The value is derived from the "cluster" Infrastructure object:
//   - single-replica control plane topology: "0s"
//   - AWS platform: "129s"
//   - any other platform, or no Infrastructure object at all: nothing is observed and an
//     empty config is returned so that the default value is not overridden
//
// genericListers must be a configobservation.Listers; the type assertion panics otherwise.
// If the Infrastructure object cannot be read for a reason other than "not found", the
// existing config is returned together with the error. Whatever is returned is pruned so
// that it only contains the shutdown-delay-duration field.
func ObserveShutdownDelayDuration(genericListers configobserver.Listers, _ events.Recorder, existingConfig map[string]interface{}) (ret map[string]interface{}, errs []error) {
	defer func() {
		// Prune the observed config so that it only contains shutdown-delay-duration field.
		ret = configobserver.Pruned(ret, shutdownDelayDurationPath)
	}()

	// read the observed value
	var observedShutdownDelayDuration string
	listers := genericListers.(configobservation.Listers)
	infra, err := listers.InfrastructureLister().Get("cluster")
	if err != nil && !apierrors.IsNotFound(err) {
		// we got an error so without the infrastructure object we are not able to determine the type of platform we are running on
		return existingConfig, append(errs, err)
	}
	if err != nil || infra == nil {
		// The infrastructure object does not exist, so there is nothing to inspect.
		// Dereferencing it below would panic; treat it like an unknown platform instead
		// and don't override the default value.
		return map[string]interface{}{}, errs
	}

	switch {
	case infra.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode:
		// reduce the shutdown delay to 0 to stay within the maximum downtime for SNO
		observedShutdownDelayDuration = "0s"
	case infra.Spec.PlatformSpec.Type == configv1.AWSPlatformType:
		// AWS has a known issue: https://bugzilla.redhat.com/show_bug.cgi?id=1943804
		// We need to extend the shutdown-delay-duration so that an NLB has a chance to notice and remove unhealthy instance.
		// Once the mentioned issue is resolved this code must be removed and default values applied
		//
		// Note this is the official number we got from AWS
		observedShutdownDelayDuration = "129s"
	default:
		// don't override default value
		return map[string]interface{}{}, errs
	}

	// read the current value
	var currentShutdownDelayDuration string
	currentShutdownDelaySlice, _, err := unstructured.NestedStringSlice(existingConfig, shutdownDelayDurationPath...)
	if err != nil {
		errs = append(errs, fmt.Errorf("unable to extract shutdown delay duration from the existing config: %w", err))
		// keep going, we are only interested in the observed value which will overwrite the current configuration anyway
	}
	if len(currentShutdownDelaySlice) > 0 {
		// only the first element of the argument list is compared
		currentShutdownDelayDuration = currentShutdownDelaySlice[0]
	}

	// nothing has changed, return the original configuration
	if currentShutdownDelayDuration == observedShutdownDelayDuration {
		return existingConfig, errs
	}

	// the current and the observed value differ, build a fresh config carrying the observed value
	observedConfig := map[string]interface{}{}
	if err = unstructured.SetNestedStringSlice(observedConfig, []string{observedShutdownDelayDuration}, shutdownDelayDurationPath...); err != nil {
		return existingConfig, append(errs, err)
	}
	return observedConfig, errs
}
// ObserveGracefulTerminationDuration sets the graceful termination duration according to the current platform.
//
// The value is derived from the "cluster" Infrastructure object:
//   - single-replica control plane topology: "15"
//   - AWS platform: "194"
//   - any other platform, or no Infrastructure object at all: nothing is observed and an
//     empty config is returned so that the default value is not overridden
//
// genericListers must be a configobservation.Listers; the type assertion panics otherwise.
// If the Infrastructure object cannot be read for a reason other than "not found", the
// existing config is returned together with the error. Whatever is returned is pruned so
// that it only contains the gracefulTerminationDuration field.
func ObserveGracefulTerminationDuration(genericListers configobserver.Listers, _ events.Recorder, existingConfig map[string]interface{}) (ret map[string]interface{}, errs []error) {
	defer func() {
		// Prune the observed config so that it only contains gracefulTerminationDuration field.
		ret = configobserver.Pruned(ret, gracefulTerminationDurationPath)
	}()

	// read the observed value
	var observedGracefulTerminationDuration string
	listers := genericListers.(configobservation.Listers)
	infra, err := listers.InfrastructureLister().Get("cluster")
	if err != nil && !apierrors.IsNotFound(err) {
		// we got an error so without the infrastructure object we are not able to determine the type of platform we are running on
		return existingConfig, append(errs, err)
	}
	if err != nil || infra == nil {
		// The infrastructure object does not exist, so there is nothing to inspect.
		// Dereferencing it below would panic; treat it like an unknown platform instead
		// and don't override the default value.
		return map[string]interface{}{}, errs
	}

	switch {
	case infra.Status.ControlPlaneTopology == configv1.SingleReplicaTopologyMode:
		// reduce termination duration from 135s (default) to 15s to stay within the maximum downtime for SNO:
		// - the shutdown-delay-duration is set to 0s because there is no load-balancer, and no fallback apiserver
		//   anyway that could benefit from a service network taking out the endpoint gracefully
		// - additional 15s is for in-flight requests
		observedGracefulTerminationDuration = "15"
	case infra.Spec.PlatformSpec.Type == configv1.AWSPlatformType:
		// AWS has a known issue: https://bugzilla.redhat.com/show_bug.cgi?id=1943804
		// We need to extend the shutdown-delay-duration so that an NLB has a chance to notice and remove unhealthy instance.
		// Once the mentioned issue is resolved this code must be removed and default values applied
		//
		// 194s is calculated as follows:
		//   the initial 129s is reserved for the minimal termination period - the time needed for an LB to take an instance out of rotation
		//   additional 60s for finishing all in-flight requests
		//   an extra 5s to make sure the potential SIGTERM will be sent after the server terminates itself
		observedGracefulTerminationDuration = "194"
	default:
		// don't override default value
		return map[string]interface{}{}, errs
	}

	// read the current value
	currentGracefulTerminationDuration, _, err := unstructured.NestedString(existingConfig, gracefulTerminationDurationPath...)
	if err != nil {
		errs = append(errs, fmt.Errorf("unable to extract gracefulTerminationDuration from the existing config: %w, path = %v", err, gracefulTerminationDurationPath))
		// keep going, we are only interested in the observed value which will overwrite the current configuration anyway
	}

	// nothing has changed, return the original configuration
	if currentGracefulTerminationDuration == observedGracefulTerminationDuration {
		return existingConfig, errs
	}

	// the current and the observed value differ, build a fresh config carrying the observed value
	observedConfig := map[string]interface{}{}
	if err = unstructured.SetNestedField(observedConfig, observedGracefulTerminationDuration, gracefulTerminationDurationPath...); err != nil {
		return existingConfig, append(errs, err)
	}
	return observedConfig, errs
}