Skip to content

Commit

Permalink
Merge watchdog controller into scheduling controller (#283)
Browse files Browse the repository at this point in the history
Both controllers respond to essentially every Eno CR state transition,
so merging the controllers will help reduce overhead.

This does effectively remove a few metrics that we haven't found useful.
The watchdog controller is profoundly useful for the one metric that we
are keeping, but the others are not well-scoped so the logs are better
anyway.

---------

Co-authored-by: Jordan Olshevski <[email protected]>
  • Loading branch information
jveski and Jordan Olshevski authored Feb 18, 2025
1 parent b423a78 commit 2a298b9
Show file tree
Hide file tree
Showing 12 changed files with 48 additions and 335 deletions.
8 changes: 1 addition & 7 deletions cmd/eno-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import (
"github.com/Azure/eno/internal/controllers/selfhealing"
"github.com/Azure/eno/internal/controllers/synthesis"
"github.com/Azure/eno/internal/controllers/watch"
"github.com/Azure/eno/internal/controllers/watchdog"
"github.com/Azure/eno/internal/execution"
"github.com/Azure/eno/internal/manager"
)
Expand Down Expand Up @@ -117,11 +116,6 @@ func runController() error {
return fmt.Errorf("constructing resource slice cleanup controller: %w", err)
}

err = watchdog.NewController(mgr, watchdogThres)
if err != nil {
return fmt.Errorf("constructing watchdog controller: %w", err)
}

err = replication.NewSymphonyController(mgr)
if err != nil {
return fmt.Errorf("constructing symphony replication controller: %w", err)
Expand All @@ -147,7 +141,7 @@ func runController() error {
return fmt.Errorf("constructing watch controller: %w", err)
}

err = scheduling.NewController(mgr, concurrencyLimit, rolloutCooldown)
err = scheduling.NewController(mgr, concurrencyLimit, rolloutCooldown, watchdogThres)
if err != nil {
return fmt.Errorf("constructing synthesis scheduling controller: %w", err)
}
Expand Down
4 changes: 1 addition & 3 deletions internal/controllers/reconciliation/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import (
"github.com/Azure/eno/internal/controllers/selfhealing"
"github.com/Azure/eno/internal/controllers/synthesis"
"github.com/Azure/eno/internal/controllers/watch"
"github.com/Azure/eno/internal/controllers/watchdog"
"github.com/Azure/eno/internal/testutil"
"github.com/stretchr/testify/require"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -23,11 +22,10 @@ func registerControllers(t *testing.T, mgr *testutil.Manager) {
require.NoError(t, aggregation.NewSliceController(mgr.Manager))
require.NoError(t, synthesis.NewPodLifecycleController(mgr.Manager, defaultConf))
require.NoError(t, synthesis.NewSliceCleanupController(mgr.Manager))
require.NoError(t, watchdog.NewController(mgr.Manager, time.Second*10))
require.NoError(t, replication.NewSymphonyController(mgr.Manager))
require.NoError(t, aggregation.NewSymphonyController(mgr.Manager))
require.NoError(t, aggregation.NewCompositionController(mgr.Manager))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, time.Millisecond))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, time.Millisecond, time.Second))
require.NoError(t, liveness.NewNamespaceController(mgr.Manager, 3, time.Second))
require.NoError(t, watch.NewController(mgr.Manager))
require.NoError(t, selfhealing.NewSliceController(mgr.Manager, time.Minute*5))
Expand Down
25 changes: 16 additions & 9 deletions internal/controllers/scheduling/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,22 @@ const (
// Compositions will not receive the new synthesizer in the same order for every generation, but
// the same generation will always roll out in the same order.
type controller struct {
client client.Client
concurrencyLimit int
cooldownPeriod time.Duration
cacheGracePeriod time.Duration
client client.Client
concurrencyLimit int
cooldownPeriod time.Duration
cacheGracePeriod time.Duration
watchdogThreshold time.Duration

lastApplied *op
}

func NewController(mgr ctrl.Manager, concurrencyLimit int, cooldown time.Duration) error {
func NewController(mgr ctrl.Manager, concurrencyLimit int, cooldown, watchdogThreshold time.Duration) error {
c := &controller{
client: mgr.GetClient(),
concurrencyLimit: concurrencyLimit,
cooldownPeriod: cooldown,
cacheGracePeriod: time.Second,
client: mgr.GetClient(),
concurrencyLimit: concurrencyLimit,
cooldownPeriod: cooldown,
cacheGracePeriod: time.Second,
watchdogThreshold: watchdogThreshold,
}
return ctrl.NewControllerManagedBy(mgr).
Named("schedulingController").
Expand Down Expand Up @@ -101,12 +103,17 @@ func (c *controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu

var inFlight int
var op *op
stuckReconciling.Reset()
for _, comp := range comps.Items {
comp := comp
if comp.Synthesizing() {
inFlight++
}

if missedReconciliation(&comp, c.watchdogThreshold) {
stuckReconciling.WithLabelValues(comp.Spec.Synthesizer.Name).Inc()
}

synth, ok := synthsByName[comp.Spec.Synthesizer.Name]
if !ok {
continue
Expand Down
14 changes: 7 additions & 7 deletions internal/controllers/scheduling/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
func TestBasics(t *testing.T) {
ctx := testutil.NewContext(t)
mgr := testutil.NewManager(t)
require.NoError(t, NewController(mgr.Manager, 100, 2*time.Second))
require.NoError(t, NewController(mgr.Manager, 100, 2*time.Second, 0))
mgr.Start(t)
cli := mgr.GetClient()

Expand Down Expand Up @@ -94,7 +94,7 @@ func TestBasics(t *testing.T) {
func TestSynthRolloutBasics(t *testing.T) {
ctx := testutil.NewContext(t)
mgr := testutil.NewManager(t)
require.NoError(t, NewController(mgr.Manager, 100, 2*time.Second))
require.NoError(t, NewController(mgr.Manager, 100, 2*time.Second, 0))
mgr.Start(t)
cli := mgr.GetClient()

Expand Down Expand Up @@ -187,7 +187,7 @@ func TestSynthRolloutBasics(t *testing.T) {
func TestDeferredInput(t *testing.T) {
ctx := testutil.NewContext(t)
mgr := testutil.NewManager(t)
require.NoError(t, NewController(mgr.Manager, 100, 2*time.Second))
require.NoError(t, NewController(mgr.Manager, 100, 2*time.Second, 0))
mgr.Start(t)
cli := mgr.GetClient()

Expand Down Expand Up @@ -253,7 +253,7 @@ func TestDeferredInput(t *testing.T) {
func TestForcedResynth(t *testing.T) {
ctx := testutil.NewContext(t)
mgr := testutil.NewManager(t)
require.NoError(t, NewController(mgr.Manager, 100, 2*time.Second))
require.NoError(t, NewController(mgr.Manager, 100, 2*time.Second, 0))
mgr.Start(t)
cli := mgr.GetClient()

Expand Down Expand Up @@ -295,7 +295,7 @@ func TestForcedResynth(t *testing.T) {
func TestChaos(t *testing.T) {
t.Run("one leader", func(t *testing.T) {
mgr := testutil.NewManager(t)
require.NoError(t, NewController(mgr.Manager, 5, time.Second))
require.NoError(t, NewController(mgr.Manager, 5, time.Second, 0))
mgr.Start(t)

testChaos(t, mgr)
Expand All @@ -304,8 +304,8 @@ func TestChaos(t *testing.T) {
// Run the same test but with another controller competing for the same resources
t.Run("zombie leader", func(t *testing.T) {
mgr := testutil.NewManager(t)
require.NoError(t, NewController(mgr.Manager, 5, time.Second))
require.NoError(t, NewController(mgr.Manager, 5, time.Second))
require.NoError(t, NewController(mgr.Manager, 5, time.Second, 0))
require.NoError(t, NewController(mgr.Manager, 5, time.Second, 0))
mgr.Start(t)

testChaos(t, mgr)
Expand Down
17 changes: 16 additions & 1 deletion internal/controllers/scheduling/metrics.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package scheduling

import (
"time"

apiv1 "github.com/Azure/eno/api/v1"
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)
Expand All @@ -20,8 +23,20 @@ var (
Buckets: []float64{0.1, 0.25, 0.5, 1, 5},
},
)

stuckReconciling = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "eno_compositions_stuck_reconciling_total",
Help: "Number of compositions that have not been reconciled since a period after their current synthesis was initialized",
}, []string{"synthesizer"},
)
)

func init() {
metrics.Registry.MustRegister(freeSynthesisSlots, schedulingLatency)
metrics.Registry.MustRegister(freeSynthesisSlots, schedulingLatency, stuckReconciling)
}

func missedReconciliation(comp *apiv1.Composition, threshold time.Duration) bool {
syn := comp.Status.CurrentSynthesis
return syn != nil && syn.Reconciled == nil && syn.Initialized != nil && time.Since(syn.Initialized.Time) > threshold
}
2 changes: 1 addition & 1 deletion internal/controllers/selfhealing/slice_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ var testSynthesisConfig = &synthesis.Config{

func registerControllers(t *testing.T, mgr *testutil.Manager) {
require.NoError(t, NewSliceController(mgr.Manager, time.Minute*5))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, time.Microsecond*10))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, time.Microsecond*10, time.Second))
require.NoError(t, synthesis.NewPodLifecycleController(mgr.Manager, testSynthesisConfig))
require.NoError(t, synthesis.NewSliceCleanupController(mgr.Manager))
}
Expand Down
2 changes: 1 addition & 1 deletion internal/controllers/synthesis/backoff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ func TestControllerBackoff(t *testing.T) {
cli := mgr.GetClient()

require.NoError(t, NewPodLifecycleController(mgr.Manager, minimalTestConfig))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second, time.Second))
mgr.Start(t)

syn := &apiv1.Synthesizer{}
Expand Down
6 changes: 3 additions & 3 deletions internal/controllers/synthesis/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func TestControllerHappyPath(t *testing.T) {
mgr := testutil.NewManager(t)
cli := mgr.GetClient()

require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second, time.Second))
require.NoError(t, NewPodLifecycleController(mgr.Manager, minimalTestConfig))

calls := atomic.Int64{}
Expand Down Expand Up @@ -108,7 +108,7 @@ func TestControllerFastCompositionUpdates(t *testing.T) {
mgr := testutil.NewManager(t)
cli := mgr.GetClient()

require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second, time.Second))
require.NoError(t, NewPodLifecycleController(mgr.Manager, minimalTestConfig))
testutil.WithFakeExecutor(t, mgr, func(ctx context.Context, s *apiv1.Synthesizer, input *krmv1.ResourceList) (*krmv1.ResourceList, error) {
output := &krmv1.ResourceList{}
Expand Down Expand Up @@ -186,7 +186,7 @@ func TestControllerSwitchingSynthesizers(t *testing.T) {
return output, nil
})

require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second, time.Second))
require.NoError(t, NewPodLifecycleController(mgr.Manager, minimalTestConfig))
mgr.Start(t)

Expand Down
4 changes: 2 additions & 2 deletions internal/controllers/synthesis/lifecycle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func TestCompositionDeletion(t *testing.T) {

require.NoError(t, NewPodLifecycleController(mgr.Manager, minimalTestConfig))
require.NoError(t, NewSliceCleanupController(mgr.Manager))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second, time.Second))
mgr.Start(t)

syn := &apiv1.Synthesizer{}
Expand Down Expand Up @@ -125,7 +125,7 @@ func TestDeleteCompositionWhenSynthesizerMissing(t *testing.T) {

require.NoError(t, NewPodLifecycleController(mgr.Manager, minimalTestConfig))
require.NoError(t, NewSliceCleanupController(mgr.Manager))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second))
require.NoError(t, scheduling.NewController(mgr.Manager, 10, 2*time.Second, time.Second))
mgr.Start(t)

syn := &apiv1.Synthesizer{}
Expand Down
114 changes: 0 additions & 114 deletions internal/controllers/watchdog/controller.go

This file was deleted.

Loading

0 comments on commit 2a298b9

Please sign in to comment.