OCPBUGS-28647: tuned: operand: add support for deferred updates
Users can make tuning changes through NTO/TuneD by updating the
CRs on a running system.
When a tuning change is made, the NTO sends a SIGHUP to the TuneD
process, which makes TuneD roll back ALL the tuning and then
reapply it.
In some key use cases, this disrupts the workload behavior.

We add an annotation that causes an update not to be applied
immediately, but to be deferred until the next TuneD restart.
The NTO operator will continue merging the Tuned objects into the
special `rendered` object as it does today.
The deferred annotation will be "sticky": if even just one of the
current Tuned objects (e.g. just the perfprofile one) has the
annotation, the rendered Tuned will have it too.
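
For illustration, marking an object for deferral could look like
this (a minimal sketch; markDeferred is a hypothetical wrapper
around the util helper added in this commit):

    import (
        tunedv1 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/tuned/v1"
        "github.com/openshift/cluster-node-tuning-operator/pkg/util"
    )

    // markDeferred returns a copy of t carrying the deferred-update
    // annotation ("tuned.openshift.io/deferred"), so the operand will
    // postpone applying its changes until the next TuneD restart.
    func markDeferred(t *tunedv1.Tuned) *tunedv1.Tuned {
        out := t.DeepCopy()
        util.AddTunedDeferredUpdateAnnotation(out)
        return out
    }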

However, if a deferred update is pending but the Tuned objects are
edited in such a way that the final `rendered` update is immediate
(i.e. has no deferred annotation), then the immediate update will
be applied, overwriting the pending deferred update, which will
then be lost.

Deferred updates can stack, meaning a deferred update can be
applied on top of a previous deferred update; the new one will
simply overwrite the previous one, as if the previous one had
never been received. An immediate update can overwrite a deferred
update, and clear it.
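
Concretely, in terms of the pending-state helpers added to
pkg/tuned below (deferredSemanticsExample is illustrative only,
not part of this commit):

    func deferredSemanticsExample(a, b *tunedv1.Tuned) error {
        // Deferred update A becomes pending on disk.
        if err := setDeferredUpdatePending(a); err != nil {
            return err
        }
        // Deferred update B overwrites A, as if A had never been received.
        if err := setDeferredUpdatePending(b); err != nil {
            return err
        }
        // An immediate (non-deferred) update clears any pending deferred one.
        return deleteDeferredUpdatePending()
    }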

As with immediate updates, the NTO operand will take care of
applying the deferred updates, as follows:

The NTO operand will gain another chance to process Tuned objects
at startup, right before the main loops start.

If a Tuned object is detected and it does NOT have the deferred
annotation:
- the regular reconcile code will work as it does today
- the startup reconcile code will not engage

If a Tuned object is detected and it DOES have the deferred
annotation:
- the regular reconcile code will set a filesystem flag, move the
  status to DEGRADED and ignore the object.
- at the first subsequent operand restart, the startup reconcile
  code will notice the filesystem flag, apply the rendered Tuned
  object (provided it DOES have the deferred annotation; otherwise
  it will log and skip) and clear the filesystem flag. The DEGRADED
  state will then be cleared.

In other words, the startup reconcile code will engage if and only
if the rendered Tuned has the deferred annotation and the
filesystem flag exists. Likewise, the regular reconcile loop will
only set the filesystem flag if it detects a rendered Tuned with
the deferred annotation.
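
As a sketch, the engagement rule boils down to the following
(startupReconcileEngages is a hypothetical name; the real checks
live in sync() and changeWatcher()):

    func startupReconcileEngages(rendered *tunedv1.Tuned) (bool, error) {
        pending, err := getDeferredUpdatePending() // reads the filesystem flag
        if err != nil {
            return false, err
        }
        return pending != nil && util.HasDeferredUpdateAnnotation(rendered.Annotations), nil
    }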

The default, of course, remains immediate updates, for backward
compatibility.

Signed-off-by: Francesco Romani <[email protected]>
ffromani committed Apr 4, 2024
1 parent 89b3e39 commit ab08d68
Showing 8 changed files with 192 additions and 13 deletions.
4 changes: 4 additions & 0 deletions pkg/apis/tuned/v1/tuned_types.go
@@ -25,6 +25,10 @@ const (
// TunedBootcmdlineAnnotationKey is a Node-specific annotation denoting kernel command-line parameters
// calculated by TuneD for the current profile applied to that Node.
TunedBootcmdlineAnnotationKey string = "tuned.openshift.io/bootcmdline"

// TunedDeferredUpdate requests the TuneD daemons to defer the update of the rendered profile
// until the next restart.
TunedDeferredUpdate string = "tuned.openshift.io/deferred"
)

/////////////////////////////////////////////////////////////////////////////////
14 changes: 13 additions & 1 deletion pkg/manifests/manifests.go
@@ -12,6 +12,7 @@ import (

tunedv1 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/tuned/v1"
ntoconfig "github.com/openshift/cluster-node-tuning-operator/pkg/config"
"github.com/openshift/cluster-node-tuning-operator/pkg/util"
"k8s.io/klog/v2"
)

@@ -39,12 +40,17 @@ func TunedRenderedResource(tunedSlice []*tunedv1.Tuned) *tunedv1.Tuned {

tunedProfiles := []tunedv1.TunedProfile{}
m := map[string]tunedv1.TunedProfile{}
isDeferred := false

for _, tuned := range tunedSlice {
if tuned.Name == tunedv1.TunedRenderedResourceName {
// Skip the "rendered" Tuned resource itself
continue
}
objDeferred := util.HasDeferredUpdateAnnotation(tuned.Annotations)
if objDeferred {
isDeferred = true
}
klog.Infof("tuned %q deferred=%v", tuned.Name, objDeferred)
tunedRenderedProfiles(tuned, m)
}
for _, tunedProfile := range m {
@@ -63,7 +69,13 @@ func TunedRenderedResource(tunedSlice []*tunedv1.Tuned) *tunedv1.Tuned {
})

cr.Spec.Profile = tunedProfiles

if isDeferred {
klog.Infof("tuned %q will be deferred", cr.Name)
util.AddTunedDeferredUpdateAnnotation(cr)
} else {
klog.Infof("tuned %q will be immediate", cr.Name)
util.DeleteTunedDeferredUpdateAnnotation(cr)
}
return cr
}

3 changes: 2 additions & 1 deletion pkg/operator/controller.go
@@ -551,11 +551,12 @@ func (c *Controller) syncTunedRendered(tuned *tunedv1.Tuned) error {
return fmt.Errorf("failed to get Tuned %s: %v", tunedv1.TunedRenderedResourceName, err)
}

if util.IsTunedEqual(crMf, cr) {
klog.V(2).Infof("syncTunedRendered(): Tuned %s doesn't need updating", crMf.Name)
return nil
}
cr = cr.DeepCopy() // never update the objects from cache
cr.Annotations = crMf.Annotations
cr.Spec = crMf.Spec

klog.V(2).Infof("syncTunedRendered(): updating Tuned %s", cr.Name)
139 changes: 128 additions & 11 deletions pkg/tuned/controller.go
@@ -24,6 +24,7 @@ import (
"k8s.io/client-go/tools/clientcmd"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"
"sigs.k8s.io/yaml"

tunedv1 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/tuned/v1"
ntoclient "github.com/openshift/cluster-node-tuning-operator/pkg/client"
@@ -43,6 +44,7 @@ const (
scError
scSysctlOverride
scUnknown
scDeferred
)

// Constants
@@ -60,17 +62,20 @@ const (
tunedBootcmdlineFile = tunedProfilesDirCustom + "/bootcmdline"
// A couple of seconds should be more than enough for TuneD daemon to gracefully stop;
// be generous and give it 10s.
tunedGracefulExitWait = time.Second * time.Duration(10)
openshiftTunedHome = "/var/lib/ocp-tuned"
openshiftTunedRunDir = "/run/" + programName
openshiftTunedProvider = openshiftTunedHome + "/provider"
openshiftTunedDeferredFile = "deferred-update-pending"
openshiftTunedDeferredPath = "/host" + "/var/lib" + "/" + openshiftTunedDeferredFile // TODO
// With the less aggressive rate limiter, retries will happen at 100ms*2^(retry_n-1):
// 100ms, 200ms, 400ms, 800ms, 1.6s, 3.2s, 6.4s, 12.8s, 25.6s, 51.2s, 102.4s, 3.4m, 6.8m, 13.7m, 27.3m
maxRetries = 15
// workqueue related constants
wqKindDaemon = "daemon"
wqKindTuned = "tuned"
wqKindProfile = "profile"
wqKindDeferred = "deferred"
)

// Types
@@ -118,6 +123,10 @@ type Controller struct {
// Did the command-line parameters to run the TuneD daemon change?
// In other words, is a complete restart of the TuneD daemon needed?
daemon bool
// Did the Tuned object request a deferred update?
deferred bool
// Are we now processing a previously deferred update?
wasdeferred bool
}

daemon Daemon
@@ -237,6 +246,19 @@ func (c *Controller) eventProcessorKube() {

func (c *Controller) sync(key wqKey) error {
switch {
case key.kind == wqKindDeferred:
klog.Infof("sync(): Processing Deferred Tuned")

c.change.rendered = true // the rendered object change was already recorded when the deferred update was frozen
c.change.wasdeferred = true
c.change.deferred = false

klog.Infof("sync(): Deferred Tuned change=%+v", c.change)
c.wqTuneD.Add(wqKey{kind: wqKindDaemon})

klog.Infof("sync(): Deleting Deferred Tuned")
return deleteDeferredUpdatePending()

case key.kind == wqKindTuned:
if key.name != tunedv1.TunedRenderedResourceName {
return nil
@@ -253,6 +275,31 @@
return err
}
c.change.rendered = change

isDeferred := util.HasDeferredUpdateAnnotation(tuned.Annotations)
if isDeferred {
klog.Infof("sync(): Tuned %s has to be deferred", key.name)

tuned = tuned.DeepCopy()
deferredTuned, err := getDeferredUpdatePending()
if err != nil {
klog.Errorf("error loading the pending deferred update: %v")
return err
}
if deferredTuned == nil || !util.IsTunedEqual(deferredTuned, tuned) {
// we always prefer the apiserver data to avoid trusting obsolete
// data on the node.
err = setDeferredUpdatePending(tuned)
if err != nil {
klog.Errorf("error saving the pending deferred update: %v")
return err
}
klog.Infof("sync(): Deferred Tuned %s replaced", key.name)
} else {
klog.Infof("sync(): Deferred Tuned %s skipped, looks the same as the already pending", key.name)
}
}
c.change.deferred = isDeferred
// Notify the event processor that the Tuned k8s object containing TuneD profiles changed.
c.wqTuneD.Add(wqKey{kind: wqKindDaemon})

@@ -655,6 +702,22 @@ func (c *Controller) changeSyncer() (synced bool, err error) {
return false, fmt.Errorf("changeSyncer(): called while the TuneD daemon was reloading")
}

if c.change.wasdeferred {
klog.Infof("changeSyncer(): processing now deferred update")
c.change.wasdeferred = false
c.daemon.status &= ^scDeferred
} else if c.change.deferred {
klog.Infof("changeSyncer(): detected deferred update, skipping change sync")
c.change.deferred = false
c.daemon.status |= scDeferred
klog.Infof("changeSyncer(): detected deferred update, setting conditions accordingly")
if err = c.updateTunedProfile(); err != nil {
klog.Error(err.Error())
return false, nil // retry later
}
return true, nil
}

if c.change.bootcmdline || c.daemon.reloaded {
// One or both of the following happened:
// 1) tunedBootcmdlineFile changed on the filesystem. This is very likely the result of
@@ -681,7 +744,13 @@
if activeProfile, err = getActiveProfile(); err != nil {
return false, err
}
if (c.daemon.status & scApplied) == 0 {
if (c.daemon.status & scDeferred) != 0 {
klog.Infof("deferred update pending; profile change will not trigger profile reload")
if err = c.updateTunedProfile(); err != nil {
klog.Error(err.Error())
return false, nil // retry later
}
} else if (c.daemon.status & scApplied) == 0 {
if len(activeProfile) > 0 {
// activeProfile == "" means we have not started TuneD daemon yet; do not log that case
klog.Infof("re-applying profile (%s) as the previous application did not complete", activeProfile)
@@ -705,8 +774,16 @@
if c.change.rendered {
// The "rendered" Tuned k8s object changed.
c.change.rendered = false
if (c.daemon.status & scDeferred) != 0 {
klog.Infof("deferred update pending; rendered tuned change will not trigger profile reload")
if err = c.updateTunedProfile(); err != nil {
klog.Error(err.Error())
return false, nil // retry later
}
} else {
reload = true
c.daemon.status = scUnknown
}
}

if c.change.daemon {
@@ -991,7 +1068,6 @@ func (c *Controller) changeWatcher() (err error) {
if !ok {
return fmt.Errorf("failed to wait for caches to sync")
}

klog.V(1).Info("starting events processors")
go wait.Until(c.eventProcessorKube, time.Second, c.stopCh)
defer c.wqKube.ShutDown()
@@ -1014,6 +1090,16 @@
}
}

deferredTuned, err := getDeferredUpdatePending()
if err != nil {
klog.Errorf("error detecting pending deferred updates: %v", err)
return err
}
if deferredTuned != nil {
klog.Infof("found deferred update pending, processing")
c.wqKube.Add(wqKey{kind: wqKindDeferred, name: tunedv1.TunedRenderedResourceName})
}

klog.Info("started controller")
for {
select {
@@ -1137,3 +1223,34 @@ func RunOperand(stopCh <-chan struct{}, version string, inCluster bool) error {

return retryLoop(c)
}

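// setDeferredUpdatePending freezes the given Tuned object to disk, making it
// the pending deferred update to be applied at the next operand restart.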
func setDeferredUpdatePending(tuned *tunedv1.Tuned) error {
data, err := yaml.Marshal(tuned)
if err != nil {
return err
}
return os.WriteFile(openshiftTunedDeferredPath, data, 0600)
}

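// getDeferredUpdatePending loads the pending deferred update from disk.
// It returns nil with no error if no deferred update is pending.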
func getDeferredUpdatePending() (*tunedv1.Tuned, error) {
data, err := os.ReadFile(openshiftTunedDeferredPath)
if err != nil {
if os.IsNotExist(err) {
// no pending deferred update exists; this is expected, not an error
return nil, nil
}
return nil, err
}
var tuned tunedv1.Tuned
err = yaml.Unmarshal(data, &tuned)
return &tuned, err
}

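// deleteDeferredUpdatePending clears the pending deferred update, if any.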
func deleteDeferredUpdatePending() error {
err := os.Remove(openshiftTunedDeferredPath)
if err != nil && os.IsNotExist(err) {
// the file is already gone; nothing to do
return nil
}
return err
}
2 changes: 2 additions & 0 deletions pkg/tuned/run.go
@@ -113,6 +113,8 @@ func TunedRun(cmd *exec.Cmd, daemon *Daemon, onDaemonReload func()) error {

if profileApplied {
daemon.status |= scApplied
// a successful profile application also covers any pending deferred update, so we are now synchronized
daemon.status &= ^scDeferred
}

strIndex := strings.Index(l, " WARNING ")
4 changes: 4 additions & 0 deletions pkg/tuned/status.go
@@ -118,6 +118,10 @@ func computeStatusConditions(status Bits, stderr string, conditions []tunedv1.Pr
tunedDegradedCondition.Status = corev1.ConditionTrue // treat overrides as regular errors; users should use "reapply_sysctl: true" or remove conflicting sysctls
tunedDegradedCondition.Reason = "TunedSysctlOverride"
tunedDegradedCondition.Message = "TuneD daemon issued one or more sysctl override message(s) during profile application. Use reapply_sysctl=true or remove conflicting sysctl " + stderr
} else if (status & scDeferred) != 0 {
tunedDegradedCondition.Status = corev1.ConditionTrue
tunedDegradedCondition.Reason = "TunedDeferredUpdate"
tunedDegradedCondition.Message = "Profile will be applied at the next daemon restart"
} else if (status & scWarn) != 0 {
tunedDegradedCondition.Status = corev1.ConditionFalse // consider warnings from TuneD as non-fatal
tunedDegradedCondition.Reason = "TunedWarning"
28 changes: 28 additions & 0 deletions pkg/util/annotations.go
@@ -0,0 +1,28 @@
package util

import (
tunedv1 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/tuned/v1"
)

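// HasDeferredUpdateAnnotation returns true if the given annotation map
// carries the deferred-update key, regardless of its value.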
func HasDeferredUpdateAnnotation(anns map[string]string) bool {
if anns == nil {
return false
}
_, ok := anns[tunedv1.TunedDeferredUpdate]
return ok
}

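// AddTunedDeferredUpdateAnnotation marks the given Tuned object for deferred
// application at the next TuneD restart.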
func AddTunedDeferredUpdateAnnotation(tuned *tunedv1.Tuned) {
if tuned.Annotations == nil {
tuned.Annotations = make(map[string]string)
}
tuned.Annotations[tunedv1.TunedDeferredUpdate] = ""
}

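// DeleteTunedDeferredUpdateAnnotation removes the deferred-update marker
// from the given Tuned object, if present.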
func DeleteTunedDeferredUpdateAnnotation(tuned *tunedv1.Tuned) {
if tuned.Annotations == nil {
// nothing to do
return
}
delete(tuned.Annotations, tunedv1.TunedDeferredUpdate)
}
11 changes: 11 additions & 0 deletions pkg/util/equality.go
@@ -0,0 +1,11 @@
package util

import (
"reflect"

tunedv1 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/tuned/v1"
)

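// IsTunedEqual compares two Tuned objects by annotations and profiles, so
// a change in the deferred marker alone is enough to require a sync.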
func IsTunedEqual(crMf, cr *tunedv1.Tuned) bool {
return reflect.DeepEqual(crMf.Annotations, cr.Annotations) && reflect.DeepEqual(crMf.Spec.Profile, cr.Spec.Profile)
}
