From 58fc83d8cc313e7fc65727f7b73604dd81c13528 Mon Sep 17 00:00:00 2001 From: Mateusz Szostok Date: Wed, 4 Aug 2021 12:09:58 +0200 Subject: [PATCH] Remove the ClusterRoleBinding, fix get query for ClusterRoleBinding (#424) * Remove the ClusterRoleBinding, fix get query * Ignore evicted Pods * Run integration tests without waiting for entry-tests * Update images to thoses from PR --- .github/workflows/pr-build.yaml | 2 +- .../charts/engine/templates/deployment.yaml | 4 +- .../capact/templates/tests/test-e2e.yaml | 4 +- .../controller/action_controller.go | 62 ++------------ .../k8s-engine/controller/action_service.go | 81 +++++++++++++++++-- test/e2e/cluster_test.go | 7 +- 6 files changed, 92 insertions(+), 68 deletions(-) diff --git a/.github/workflows/pr-build.yaml b/.github/workflows/pr-build.yaml index 6f31952ee..0b1492195 100644 --- a/.github/workflows/pr-build.yaml +++ b/.github/workflows/pr-build.yaml @@ -242,7 +242,7 @@ jobs: name: Integration tests runs-on: ubuntu-latest if: github.event.pull_request.draft == false - needs: [ entry-tests, build-app, build-tests, build-cli ] + needs: [ build-app, build-tests, build-cli ] permissions: contents: read env: diff --git a/deploy/kubernetes/charts/capact/charts/engine/templates/deployment.yaml b/deploy/kubernetes/charts/capact/charts/engine/templates/deployment.yaml index bf642f528..f7110a7f6 100644 --- a/deploy/kubernetes/charts/capact/charts/engine/templates/deployment.yaml +++ b/deploy/kubernetes/charts/capact/charts/engine/templates/deployment.yaml @@ -31,7 +31,9 @@ spec: - name: {{ .Chart.Name }} securityContext: {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ .Values.global.containerRegistry.path }}/{{ .Values.image.name }}:{{ .Values.global.containerRegistry.overrideTag | default .Chart.AppVersion }}" + # image: "{{ .Values.global.containerRegistry.path }}/{{ .Values.image.name }}:{{ .Values.global.containerRegistry.overrideTag | default .Chart.AppVersion }}" + # Use PR image + image: "ghcr.io/capactio/pr/k8s-engine:PR-424" imagePullPolicy: {{ .Values.image.pullPolicy }} env: - name: APP_GRAPH_QL_ADDR diff --git a/deploy/kubernetes/charts/capact/templates/tests/test-e2e.yaml b/deploy/kubernetes/charts/capact/templates/tests/test-e2e.yaml index 9fb56bc8e..14136d4d7 100644 --- a/deploy/kubernetes/charts/capact/templates/tests/test-e2e.yaml +++ b/deploy/kubernetes/charts/capact/templates/tests/test-e2e.yaml @@ -10,7 +10,9 @@ spec: serviceAccountName: "{{ include "capact.fullname" . }}-test-e2e" containers: - name: tests-runner - image: "{{ .Values.global.containerRegistry.path }}/{{ .Values.integrationTest.image.name }}:{{ .Values.global.containerRegistry.overrideTag | default .Chart.AppVersion }}" + # image: "{{ .Values.global.containerRegistry.path }}/{{ .Values.integrationTest.image.name }}:{{ .Values.global.containerRegistry.overrideTag | default .Chart.AppVersion }}" + # Use PR image + image: "ghcr.io/capactio/pr/e2e-test:PR-424" env: - name: STATUS_ENDPOINTS value: "http://capact-engine-graphql.{{.Release.Namespace}}.svc.cluster.local/healthz,http://capact-gateway.{{.Release.Namespace}}.svc.cluster.local/healthz,http://capact-hub-local.{{.Release.Namespace}}.svc.cluster.local/healthz,http://capact-hub-public.{{.Release.Namespace}}.svc.cluster.local/healthz" diff --git a/internal/k8s-engine/controller/action_controller.go b/internal/k8s-engine/controller/action_controller.go index c68918574..62f368563 100644 --- a/internal/k8s-engine/controller/action_controller.go +++ b/internal/k8s-engine/controller/action_controller.go @@ -3,7 +3,6 @@ package controller import ( "context" "fmt" - "strings" "capact.io/capact/internal/ptr" "capact.io/capact/pkg/engine/k8s/api/v1alpha1" @@ -42,6 +41,9 @@ type ( LockTypeInstances(ctx context.Context, action *v1alpha1.Action) error UnlockTypeInstances(ctx context.Context, action *v1alpha1.Action) error } + actionCleanup interface { + CleanupActionOwnedResources(ctx context.Context, action *v1alpha1.Action) (ignored bool, err error) + } actionRenderer interface { RenderAction(ctx context.Context, action *v1alpha1.Action) (*v1alpha1.RenderingStatus, error) } @@ -57,6 +59,7 @@ type ( actionService interface { actionRenderer actionStarter + actionCleanup actionStatusGetter actionTypeInstanceGetter } @@ -103,7 +106,7 @@ func (r *ActionReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { if action.IsBeingDeleted() { log.Info("Deleting runner action") - ignored, err := r.handleFinalizer(ctx, action, log) + ignored, err := r.svc.CleanupActionOwnedResources(ctx, action) if err != nil { return reportOnError(err, "Delete runner action") } @@ -162,40 +165,6 @@ func (r *ActionReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { return ctrl.Result{}, nil } -func (r *ActionReconciler) handleFinalizer(ctx context.Context, action *v1alpha1.Action, log logr.Logger) (bool, error) { - if !controllerutil.ContainsFinalizer(action, v1alpha1.ActionFinalizer) { - return true, nil // our finalizer was already removed - } - - switch { - case action.IsExecuted(): - - // Current decision: - log.Info("Ignoring delete request. Wait until Action execution will be finished.", "phase", string(action.Status.Phase)) - - // Deletion in this state is complicated as such Action can be in the middle of e.g. data migration, system shutdown, - // creating resources on hyperscaler side, etc. - // In the future we can revisit this approach based on user feedback and e.g. cancel running actions, maybe even rollback - // already executed steps, etc. - return true, nil - - case action.IsCompleted(): - - // Ensure that TypeInstances are unlocked. - err := r.svc.UnlockTypeInstances(ctx, action) - if r.ignoreNotActionableTypeInstanceErrors(err) != nil { - return false, errors.Wrap(err, "while unlocking TypeInstances") - } - - default: - - // Currently, we do not need to clean up anything as we use ownerReference for other resources. - } - - controllerutil.RemoveFinalizer(action, v1alpha1.ActionFinalizer) - return false, r.k8sCli.Update(ctx, action) -} - // initAction can be extracted to mutation webhook. func (r *ActionReconciler) initAction(ctx context.Context, action *v1alpha1.Action) (ctrl.Result, error) { if !controllerutil.ContainsFinalizer(action, v1alpha1.ActionFinalizer) { @@ -439,27 +408,6 @@ func (r *ActionReconciler) newStatusForAction(action *v1alpha1.Action, eventType return *statusCpy } -// ignoreNotActionableTypeInstanceErrors ignores GraphQL error which says that TI are locked by different owner or do not exist. -// In our case it means that TI were already unlocked by a given Action and someone else locked them or deleted. -// -// TODO: Get rid of ridiculous string assertion after adding proper error types to GraphQL responses. -// http://knowyourmeme.com/memes/this-is-fine -func (r *ActionReconciler) ignoreNotActionableTypeInstanceErrors(err error) error { - if err == nil { - return nil - } - - if strings.Contains(err.Error(), "locked by different owner") { - return nil - } - - if strings.Contains(err.Error(), "not found") { - return nil - } - - return err -} - // SetupWithManager sets up Action reconciler with a given controller manager. func (r *ActionReconciler) SetupWithManager(mgr ctrl.Manager, maxConcurrentReconciles int) error { r.k8sCli = mgr.GetClient() diff --git a/internal/k8s-engine/controller/action_service.go b/internal/k8s-engine/controller/action_service.go index b373263f1..c1b6f3c4e 100644 --- a/internal/k8s-engine/controller/action_service.go +++ b/internal/k8s-engine/controller/action_service.go @@ -5,30 +5,30 @@ import ( "context" "encoding/json" "fmt" + "strings" "time" - policypkg "capact.io/capact/internal/k8s-engine/policy" - - "go.uber.org/zap" - - "capact.io/capact/pkg/engine/k8s/policy" - graphqldomain "capact.io/capact/internal/k8s-engine/graphql/domain/action" + policypkg "capact.io/capact/internal/k8s-engine/policy" statusreporter "capact.io/capact/internal/k8s-engine/status-reporter" "capact.io/capact/internal/ptr" "capact.io/capact/pkg/engine/k8s/api/v1alpha1" + "capact.io/capact/pkg/engine/k8s/policy" hublocalapi "capact.io/capact/pkg/hub/api/graphql/local" hubpublicapi "capact.io/capact/pkg/hub/api/graphql/public" "capact.io/capact/pkg/runner" "capact.io/capact/pkg/sdk/apis/0.0.1/types" "capact.io/capact/pkg/sdk/renderer/argo" + "github.com/pkg/errors" + "go.uber.org/zap" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/yaml" ) @@ -150,7 +150,7 @@ func (a *ActionService) EnsureWorkflowSAExists(ctx context.Context, action *v1al case err == nil: case apierrors.IsAlreadyExists(err): old := &rbacv1.ClusterRoleBinding{} - key := client.ObjectKey{Name: binding.Name, Namespace: binding.Namespace} + key := client.ObjectKey{Name: binding.Name} if err := a.k8sCli.Get(ctx, key, old); err != nil { return nil, err } @@ -170,6 +170,73 @@ func (a *ActionService) EnsureWorkflowSAExists(ctx context.Context, action *v1al return sa, nil } +// CleanupActionOwnedResources removes the Action owned resources. +func (a *ActionService) CleanupActionOwnedResources(ctx context.Context, action *v1alpha1.Action) (bool, error) { + isCleanupIgnored := true + if !controllerutil.ContainsFinalizer(action, v1alpha1.ActionFinalizer) { + return isCleanupIgnored, nil // our finalizer was already removed + } + + if action.IsExecuted() { + // Current decision: + a.log.Info("Ignoring delete request. Wait until Action execution will be finished.", zap.String("phase", string(action.Status.Phase))) + + // Deletion in this state is complicated as such Action can be in the middle of e.g. data migration, system shutdown, + // creating resources on hyperscaler side, etc. + // In the future we can revisit this approach based on user feedback and e.g. cancel running actions, maybe even rollback + // already executed steps, etc. + return isCleanupIgnored, nil + } + + // ===== Execute clean-up logic + isCleanupIgnored = false + + // 1. Ensure that TypeInstances are unlocked. + err := a.UnlockTypeInstances(ctx, action) + if a.ignoreNotActionableTypeInstanceErrors(err) != nil { + return isCleanupIgnored, errors.Wrap(err, "while unlocking TypeInstances") + } + + // 2. Ensure ClusterRoleBinding deleted + // We use ownerReference for created resources. But the cluster-scoped resources cannot have namespace-scoped owners. + // As a result, we need to remove it manually. + binding := &rbacv1.ClusterRoleBinding{ + ObjectMeta: a.objectMetaFromAction(action), + } + if err := a.k8sCli.Delete(ctx, binding); client.IgnoreNotFound(err) != nil { + return isCleanupIgnored, errors.Wrapf(err, "while deleting ClusterRoleBinding owned by %s/%s Action", action.GetName(), action.GetNamespace()) + } + + // 3. Remove finalizer + controllerutil.RemoveFinalizer(action, v1alpha1.ActionFinalizer) + if err := a.k8sCli.Update(ctx, action); err != nil { + return isCleanupIgnored, errors.Wrap(err, "while removing Action finalizer") + } + + return isCleanupIgnored, nil +} + +// ignoreNotActionableTypeInstanceErrors ignores GraphQL error which says that TI are locked by different owner or do not exist. +// In our case it means that TI were already unlocked by a given Action and someone else locked them or deleted. +// +// TODO: Get rid of ridiculous string assertion after adding proper error types to GraphQL responses. +// http://knowyourmeme.com/memes/this-is-fine +func (a *ActionService) ignoreNotActionableTypeInstanceErrors(err error) error { + if err == nil { + return nil + } + + if strings.Contains(err.Error(), "locked by different owner") { + return nil + } + + if strings.Contains(err.Error(), "not found") { + return nil + } + + return err +} + // EnsureRunnerInputDataCreated ensures that Kubernetes Secret with input data for a runner is created and up to date. func (a *ActionService) EnsureRunnerInputDataCreated(ctx context.Context, saName string, action *v1alpha1.Action) error { runnerCtx := runner.Context{ diff --git a/test/e2e/cluster_test.go b/test/e2e/cluster_test.go index d1fb6f018..fd939017e 100644 --- a/test/e2e/cluster_test.go +++ b/test/e2e/cluster_test.go @@ -27,7 +27,7 @@ var _ = Describe("Cluster check", func() { Describe("Capact cluster health", func() { Context("Pods in cluster", func() { - It("should be in running phase (ignored kube-system and default)", func() { + It("should be in running phase (ignored Namespace: [kube-system, default], ignored evicted Pods)", func() { k8sCfg, err := config.GetConfig() Expect(err).ToNot(HaveOccurred()) @@ -77,6 +77,11 @@ func podRunningAndReadyOrFinished(pod *v1.Pod) bool { } return ready default: + // Ignore evicted pods which are in Failed state because of graceful node shutdown. + // see: https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown + if strings.EqualFold(pod.Status.Reason, "Shutdown") { + return true + } log("The status of Pod %s/%s is %s, waiting for it to be Running (with Ready = true)", pod.Namespace, pod.Name, pod.Status.Phase) return false }