Skip to content

Commit

Permalink
NO-JIRA: deferred updates: fix in-place update handling on reboot (op…
Browse files Browse the repository at this point in the history
…enshift#1162)

* log: make sure to have high verbosity in tests

Signed-off-by: Francesco Romani <[email protected]>

* e2e: deferred: stricter testing

add stricter check for the node condition when
validating OCPBUGS-38795

Signed-off-by: Francesco Romani <[email protected]>

* deferred: tuned: fix reload trigger when inplace update

There are conditions on which we should not set the
reload flag. This avoid regression in the test
"Profile deferred when applied should trigger changes when applied fist,
then deferred when edited, if tuned restart should be kept deferred"

Signed-off-by: Francesco Romani <[email protected]>

* deferred: tuned: clarify comments and logs

document better the logic about processing edits

Signed-off-by: Francesco Romani <[email protected]>

---------

Signed-off-by: Francesco Romani <[email protected]>
  • Loading branch information
ffromani authored and yanirq committed Sep 17, 2024
1 parent 312dc05 commit 16379f7
Show file tree
Hide file tree
Showing 11 changed files with 128 additions and 54 deletions.
26 changes: 19 additions & 7 deletions pkg/tuned/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,12 @@ func (c *Controller) sync(key wqKeyKube) error {
change.provider = profile.Spec.Config.ProviderName
change.recommendedProfile = profile.Spec.Config.TunedProfile
change.debug = profile.Spec.Config.Debug
err = util.SetLogLevel(profile.Spec.Config.Verbosity)
if err != nil {
klog.Errorf("failed to set log level %d: %v", profile.Spec.Config.Verbosity, err)
} else {
klog.Infof("set log level %d", profile.Spec.Config.Verbosity)
}
change.reapplySysctl = true
if profile.Spec.Config.TuneDConfig.ReapplySysctl != nil {
change.reapplySysctl = *profile.Spec.Config.TuneDConfig.ReapplySysctl
Expand Down Expand Up @@ -1025,15 +1031,21 @@ func (c *Controller) changeSyncerTuneD(change Change) (synced bool, err error) {
// Cache the value written to tunedRecommendFile.
c.daemon.recommendedProfile = change.recommendedProfile
klog.V(1).Infof("recommended TuneD profile updated from %q to %q [inplaceUpdate=%v nodeRestart=%v]", prevRecommended, change.recommendedProfile, inplaceUpdate, change.nodeRestart)
changeRecommend = true

if change.deferredMode == util.DeferUpdate && !inplaceUpdate && c.daemon.recoveredRecommendedProfile == change.recommendedProfile {
klog.V(1).Infof("recommended TuneD profile changed; skip TuneD reload [deferred=%v recoveredRecommended=%v]", change.deferredMode, c.daemon.recoveredRecommendedProfile)
// If we get this far, it's either because we detected a node restart or the recommended profile changed.
// If it's a node restart, we just process the change. If it's the latter -- TuneD profile switch, i.e. not an in-place update -- we need further
// logic to distinguish between the "update" and "always" DeferMode.
// For the "update" DeferMode, we only defer profile edits. If we are re-processing a profile which we already applied, then we must
// not reload tuned, otherwise we're missing the point of deferred updates.
// See the test "Profile deferred when applied should trigger changes when applied first, then deferred when edited, if tuned restart should be kept deferred"
// See the commit 3655f22656d4a3aa9f471099305dcd78a9c80320
if !inplaceUpdate && change.deferredMode == util.DeferUpdate && c.daemon.recoveredRecommendedProfile == change.recommendedProfile {
klog.V(1).Infof("reprocessing profile already in effect; this seems a daemon reload. Skip TuneD reload [deferred=%v recoveredRecommended=%v]", change.deferredMode, c.daemon.recoveredRecommendedProfile)
// Reset because we need only once the first time we process the TuneD k8s object. Let's avoid stale data.
c.daemon.recoveredRecommendedProfile = ""
} else {
klog.V(1).Infof("recommended TuneD profile changed; trigger TuneD reload [deferred=%v]", change.deferredMode)
reload = true
changeRecommend = true
}
} else if util.IsImmediateUpdate(change.deferredMode) && (c.daemon.status&scDeferred != 0) {
klog.V(1).Infof("detected deferred update changed to immediate after object update")
Expand Down Expand Up @@ -1723,7 +1735,7 @@ func RunInCluster(stopCh <-chan struct{}, version string) error {
c.daemon.profileFingerprintUnpacked = profileFP
// We should *never* recover `c.daemon.recommendedProfile`.
// NTO operand supervises the TuneD daemon, so we must trigger
// a TuneD restart. Currently, the main trigger for the tuned
// a TuneD restart. Currently, the main reason to trigger a tuned
// reload to be triggered is when the current recommended profile
// is different from the change.recommended profile (see changeSyncerTuneD).
// Hence, we MUST NOT recover the recommended profile name
Expand All @@ -1737,10 +1749,10 @@ func RunInCluster(stopCh <-chan struct{}, version string) error {
deferredFP, isNodeReboot, err := c.recoverAndClearDeferredUpdate()
if err != nil {
klog.ErrorS(err, "unable to recover the pending update")
} else if deferredFP == "" {
klog.Infof("starting: no pending deferred update")
} else if !isNodeReboot {
klog.Infof("starting: does not seem a node reboot, but a daemon restart. Ignoring pending deferred updates (if any)")
} else if deferredFP == "" {
klog.Infof("starting: node reboot, but no pending deferred update")
} else {
klog.Infof("starting: recovered and cleared pending deferred update %q for %s (fingerprint=%q)", recommended, restartReason(isNodeReboot), deferredFP)
c.pendingChange = &Change{
Expand Down
51 changes: 35 additions & 16 deletions test/e2e/deferred/basic.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "profile-st
util.ExecAndLogCommand("oc", "delete", "-n", ntoconfig.WatchNamespace(), "-f", createdTuned)
}
util.ExecAndLogCommand("oc", "label", "node", targetNode.Name, tunedMatchLabelLater+"-")

checkWorkerNodeIsDefaultState(context.Background(), targetNode)
})

ginkgo.It("should not trigger any actual change", func(ctx context.Context) {
Expand All @@ -89,15 +91,15 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "profile-st
tunedMutated := setDeferred(tuned.DeepCopy(), ntoutil.DeferAlways)
ginkgo.By(fmt.Sprintf("creating tuned object %s deferred=%v", tunedMutated.Name, ntoutil.GetDeferredUpdateAnnotation(tunedMutated.Annotations)))

_, err = cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
tunedCreated, err := cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred())

createdTuneds = prepend(createdTuneds, tunedPath) // we need the path, not the name
ginkgo.By(fmt.Sprintf("create tuneds: %v", createdTuneds))

gomega.Expect(tuned.Spec.Recommend).ToNot(gomega.BeEmpty(), "tuned %q has empty recommendations", tuned.Name)
gomega.Expect(tuned.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tuned.Name)
expectedProfile := *tuned.Spec.Recommend[0].Profile
gomega.Expect(tunedCreated.Spec.Recommend).ToNot(gomega.BeEmpty(), "tuned %q has empty recommendations", tunedCreated.Name)
gomega.Expect(tunedCreated.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tunedCreated.Name)
expectedProfile := *tunedCreated.Spec.Recommend[0].Profile
ginkgo.By(fmt.Sprintf("expecting Tuned Profile %q to be picked up", expectedProfile))

gomega.Eventually(func() error {
Expand Down Expand Up @@ -178,15 +180,15 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "profile-st
tunedMutated := setDeferred(tuned.DeepCopy(), ntoutil.DeferAlways)
ginkgo.By(fmt.Sprintf("creating tuned object %s deferred=%v", tunedMutated.Name, ntoutil.GetDeferredUpdateAnnotation(tunedMutated.Annotations)))

_, err = cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
tunedCreated, err := cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred())

createdTuneds = prepend(createdTuneds, tunedPath) // we need the path, not the name
ginkgo.By(fmt.Sprintf("create tuneds: %v", createdTuneds))

gomega.Expect(tuned.Spec.Recommend).ToNot(gomega.BeEmpty(), "tuned %q has empty recommendations", tuned.Name)
gomega.Expect(tuned.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tuned.Name)
expectedProfile := *tuned.Spec.Recommend[0].Profile
gomega.Expect(tunedCreated.Spec.Recommend).ToNot(gomega.BeEmpty(), "tuned %q has empty recommendations", tunedCreated.Name)
gomega.Expect(tunedCreated.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tunedCreated.Name)
expectedProfile := *tunedCreated.Spec.Recommend[0].Profile
ginkgo.By(fmt.Sprintf("expecting Tuned Profile %q to be picked up", expectedProfile))

gomega.Expect(verifData.TargetTunedPod.Status.Phase).To(gomega.Equal(corev1.PodRunning), "TargetTunedPod %s/%s uid %s phase %s", verifData.TargetTunedPod.Namespace, verifData.TargetTunedPod.Name, verifData.TargetTunedPod.UID, verifData.TargetTunedPod.Status.Phase)
Expand Down Expand Up @@ -463,7 +465,12 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "profile-st
}).WithPolling(10 * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())
})

ginkgo.It("should apply defer-on-update immediately after node is labelled", func(ctx context.Context) {
ginkgo.It("should apply defer-on-update immediately after node is labelled", ginkgo.Label("node-label"), func(ctx context.Context) {
nodeLabel := tunedMatchLabelLater
_, ok := targetNode.Labels[nodeLabel]
ginkgo.By(fmt.Sprintf("checking the target node labels: on %q: %q present = %v", targetNode.Name, nodeLabel, ok))
util.Logf("labels on %q: %v", targetNode.Name, targetNode.Labels)

tunedPath := filepath.Join(dirPath, tunedSHMMNI)
ginkgo.By(fmt.Sprintf("loading tuned data from %s (basepath=%s)", tunedPath, dirPath))

Expand All @@ -481,27 +488,39 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "profile-st
// Change the recommendation to use non-role label to test a scenario where the Tuned is created first (and processed by NTO)
// and only later a node is changed to consume it. This results in a flow where no change is detected on the Tuned level (it is already)
// on the disk, but an update must be triggered.
nodeLabel := tunedMatchLabelLater
tunedMutated.Spec.Recommend[0].Match[0].Label = &nodeLabel

ginkgo.By(fmt.Sprintf("creating tuned object %s deferred=%v", tunedMutated.Name, ntoutil.GetDeferredUpdateAnnotation(tunedMutated.Annotations)))
_, err = cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
tunedCreated, err := cs.Tuneds(ntoconfig.WatchNamespace()).Create(ctx, tunedMutated, metav1.CreateOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred())

createdTuneds = prepend(createdTuneds, tunedPath) // we need the path, not the name
ginkgo.By(fmt.Sprintf("create tuneds: %v", createdTuneds))

gomega.Expect(tuned.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tuned.Name)
gomega.Expect(tunedCreated.Spec.Recommend[0].Profile).ToNot(gomega.BeNil(), "tuned %q has empty recommended tuned profile", tunedCreated.Name)
expectedProfile := *tuned.Spec.Recommend[0].Profile

checkCurrentProfileIsDefaultForWorker(ctx, targetNode.Name, verifData.TargetTunedPod)

ginkgo.By(fmt.Sprintf("labelling node %q with %q", targetNode.Name, nodeLabel))
gomega.Eventually(func() error {
nd, err := cs.CoreV1Interface.Nodes().Get(ctx, targetNode.Name, metav1.GetOptions{})
if err != nil {
return err
}
nd = nd.DeepCopy()
gomega.Expect(nd.Labels).ToNot(gomega.HaveKey(nodeLabel), "node %q has label %q too early", nd.Name, nodeLabel)

gomega.Expect(targetNode.Labels).ToNot(gomega.HaveKey(nodeLabel), "node %q has label %q too early", targetNode.Name, nodeLabel)
targetNode.Labels[nodeLabel] = ""
_, err = cs.CoreV1Interface.Nodes().Update(ctx, targetNode, metav1.UpdateOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred())
nd.Labels[nodeLabel] = ""
_, err = cs.CoreV1Interface.Nodes().Update(ctx, nd, metav1.UpdateOptions{})
return err
}).WithPolling(10 * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())

ginkgo.By(fmt.Sprintf("expecting Tuned Profile %q to be picked up", expectedProfile))
// feedback for debugging
tmpNd, err := cs.CoreV1Interface.Nodes().Get(ctx, targetNode.Name, metav1.GetOptions{})
gomega.Expect(err).ToNot(gomega.HaveOccurred(), "cannot get feedback about %q", targetNode.Name)
util.Logf("labels on %q after update: %v", tmpNd.Name, tmpNd.Labels)

gomega.Eventually(func() error {
curProf, err := cs.Profiles(ntoconfig.WatchNamespace()).Get(ctx, targetNode.Name, metav1.GetOptions{})
Expand Down
2 changes: 2 additions & 0 deletions test/e2e/deferred/non_regression.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ var _ = ginkgo.Describe("Profile non-deferred", ginkgo.Label("deferred", "non-re
ginkgo.By(fmt.Sprintf("cluster changes rollback: %q", createdTuned))
util.ExecAndLogCommand("oc", "delete", "-n", ntoconfig.WatchNamespace(), "-f", createdTuned)
}

checkWorkerNodeIsDefaultState(context.Background(), targetNode)
})

ginkgo.It("should trigger changes", func(ctx context.Context) {
Expand Down
53 changes: 53 additions & 0 deletions test/e2e/deferred/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,56 @@ func checkNonTargetWorkerNodesAreUnaffected(ctx context.Context, workerNodes []c
}).WithPolling(10. * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())
}
}

func checkCurrentProfileIsDefaultForWorker(ctx context.Context, targetNodeName string, targetTunedPod *corev1.Pod) {
ginkgo.GinkgoHelper()

gomega.Eventually(func() error {
curProf, err := cs.Profiles(ntoconfig.WatchNamespace()).Get(ctx, targetNodeName, metav1.GetOptions{})
if err != nil {
return err
}
ginkgo.By(fmt.Sprintf("checking profile for target node %q matches expectations about %q", curProf.Name, defaultWorkerProfile))

if len(curProf.Status.Conditions) == 0 {
return fmt.Errorf("missing status conditions")
}
cond := findCondition(curProf.Status.Conditions, tunedv1.TunedProfileApplied)
if cond == nil {
return fmt.Errorf("missing status applied condition")
}
err = checkAppliedConditionOK(cond)
if err != nil {
util.Logf("profile for target node %q does not match expectations about %q: %v", curProf.Name, defaultWorkerProfile, err)
}
recommended, err := getRecommendedProfile(targetTunedPod)
if err != nil {
return err
}
if recommended != defaultWorkerProfile {
return fmt.Errorf("recommended profile is %q expected %q", recommended, defaultWorkerProfile)
}
return err
}).WithPolling(10 * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())
}

func checkWorkerNodeIsDefaultState(ctx context.Context, targetNode *corev1.Node) {
ginkgo.GinkgoHelper()

var targetTunedPod *corev1.Pod
gomega.Eventually(func() error {
tunedPod, err := util.GetTunedForNode(cs, targetNode)
if err != nil {
return err
}
if tunedPod.Status.Phase != corev1.PodRunning {
return fmt.Errorf("pod %s/%s phase %v not running", tunedPod.Namespace, tunedPod.Name, tunedPod.Status.Phase)
}
targetTunedPod = tunedPod
return nil
}).WithPolling(10 * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())

checkCurrentProfileIsDefaultForWorker(context.Background(), targetNode.Name, targetTunedPod)

ginkgo.By(fmt.Sprintf("node %q seems to match default worker state", targetNode.Name))
}
38 changes: 7 additions & 31 deletions test/e2e/deferred/restart.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "slow", "di
if len(createdTuneds) == 0 {
return
}
checkNonTargetWorkerNodesAreUnaffected(context.TODO(), workerNodes, targetNode.Name)

ctx := context.Background()
checkNonTargetWorkerNodesAreUnaffected(ctx, workerNodes, targetNode.Name)
checkWorkerNodeIsDefaultState(ctx, targetNode)
})

ginkgo.Context("the tuned daemon", ginkgo.Label("slow", "pod"), func() {
Expand All @@ -82,6 +85,8 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "slow", "di
verifData := util.MustExtractVerificationOutputAndCommand(cs, targetNode, tunedImmediate)
gomega.Expect(verifData.OutputCurrent).ToNot(gomega.Equal(verifData.OutputExpected), "current output %q already matches expected %q", verifData.OutputCurrent, verifData.OutputExpected)

checkCurrentProfileIsDefaultForWorker(ctx, targetNode.Name, verifData.TargetTunedPod)

tunedMutated := setDeferred(tunedImmediate.DeepCopy(), ntoutil.DeferAlways)
ginkgo.By(fmt.Sprintf("creating tuned object %s deferred=%v", tunedMutated.Name, ntoutil.GetDeferredUpdateAnnotation(tunedMutated.Annotations)))

Expand Down Expand Up @@ -396,37 +401,8 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "slow", "di

ginkgo.By(fmt.Sprintf("checking target node after rollback: %q", targetNode.Name))
_, createdTuneds, _ = popleft(createdTuneds)
gomega.Eventually(func() error {
curProf, err := cs.Profiles(ntoconfig.WatchNamespace()).Get(ctx, targetNode.Name, metav1.GetOptions{})
if err != nil {
return err
}
ginkgo.By(fmt.Sprintf("checking profile for target node %q matches expectations about %q", curProf.Name, expectedProfile))
gomega.Expect(curProf.Name).To(gomega.Equal(expectedProfile), "checking profile for worker node %q matches expectations %q", curProf.Name, expectedProfile)

if len(curProf.Status.Conditions) == 0 {
return fmt.Errorf("missing status conditions")
}
cond := findCondition(curProf.Status.Conditions, tunedv1.TunedProfileApplied)
if cond == nil {
return fmt.Errorf("missing status applied condition")
}
err = checkAppliedConditionOK(cond)
if err != nil {
util.Logf("profile for target node %q does not match expectations about %q: %v", curProf.Name, expectedProfile, err)
}

ginkgo.By(fmt.Sprintf("checking real node conditions are restored pristine: %v -> %s", verifData.CommandArgs, verifData.OutputCurrent))
out, err := util.ExecCmdInPod(targetTunedPod, verifData.CommandArgs...)
if err != nil {
return err
}
out = strings.TrimSpace(out)
if out != verifData.OutputCurrent {
return fmt.Errorf("got: %s; expected: %s", out, verifData.OutputCurrent)
}
return nil
}).WithPolling(10 * time.Second).WithTimeout(1 * time.Minute).Should(gomega.Succeed())
checkWorkerNodeIsDefaultState(ctx, targetNode)

checkNonTargetWorkerNodesAreUnaffected(ctx, workerNodes, targetNode.Name)
})
Expand Down
2 changes: 2 additions & 0 deletions test/e2e/deferred/updates.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ var _ = ginkgo.Describe("Profile deferred", ginkgo.Label("deferred", "inplace-up
ginkgo.By(fmt.Sprintf("cluster changes rollback: %q", createdTuned))
util.ExecAndLogCommand("oc", "delete", "-n", ntoconfig.WatchNamespace(), "-f", createdTuned)
}

checkWorkerNodeIsDefaultState(context.Background(), targetNode)
})

ginkgo.It("should trigger changes when applied fist, then deferred when edited", func(ctx context.Context) {
Expand Down
2 changes: 2 additions & 0 deletions test/e2e/testing_manifests/deferred/tuned-basic-00.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ spec:
- label: node-role.kubernetes.io/worker
priority: 20
profile: test-shmmni
operand:
verbosity: 4
2 changes: 2 additions & 0 deletions test/e2e/testing_manifests/deferred/tuned-basic-10.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ spec:
- label: node-role.kubernetes.io/worker
priority: 15
profile: test-cpu-energy
operand:
verbosity: 4
2 changes: 2 additions & 0 deletions test/e2e/testing_manifests/deferred/tuned-basic-20.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,5 @@ spec:
- label: node-role.kubernetes.io/worker
priority: 15
profile: test-vm-latency
operand:
verbosity: 4
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ spec:
- label: node-role.kubernetes.io/worker
priority: 15
profile: test-vmlat-inplace
operand:
verbosity: 4
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ spec:
- label: node-role.kubernetes.io/worker
priority: 15
profile: test-vmlat-inplace
operand:
verbosity: 4

0 comments on commit 16379f7

Please sign in to comment.