Skip to content

Commit

Permalink
Remove the ClusterRoleBinding, fix get query for ClusterRoleBinding (#…
Browse files Browse the repository at this point in the history
…424)

* Remove the ClusterRoleBinding, fix get query

* Ignore evicted Pods

* Run integration tests without waiting for entry-tests

* Update images to thoses from PR
  • Loading branch information
mszostok authored Aug 4, 2021
1 parent 1a9e5bd commit 58fc83d
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 68 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pr-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ jobs:
name: Integration tests
runs-on: ubuntu-latest
if: github.event.pull_request.draft == false
needs: [ entry-tests, build-app, build-tests, build-cli ]
needs: [ build-app, build-tests, build-cli ]
permissions:
contents: read
env:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ spec:
- name: {{ .Chart.Name }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.global.containerRegistry.path }}/{{ .Values.image.name }}:{{ .Values.global.containerRegistry.overrideTag | default .Chart.AppVersion }}"
# image: "{{ .Values.global.containerRegistry.path }}/{{ .Values.image.name }}:{{ .Values.global.containerRegistry.overrideTag | default .Chart.AppVersion }}"
# Use PR image
image: "ghcr.io/capactio/pr/k8s-engine:PR-424"
imagePullPolicy: {{ .Values.image.pullPolicy }}
env:
- name: APP_GRAPH_QL_ADDR
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ spec:
serviceAccountName: "{{ include "capact.fullname" . }}-test-e2e"
containers:
- name: tests-runner
image: "{{ .Values.global.containerRegistry.path }}/{{ .Values.integrationTest.image.name }}:{{ .Values.global.containerRegistry.overrideTag | default .Chart.AppVersion }}"
# image: "{{ .Values.global.containerRegistry.path }}/{{ .Values.integrationTest.image.name }}:{{ .Values.global.containerRegistry.overrideTag | default .Chart.AppVersion }}"
# Use PR image
image: "ghcr.io/capactio/pr/e2e-test:PR-424"
env:
- name: STATUS_ENDPOINTS
value: "http://capact-engine-graphql.{{.Release.Namespace}}.svc.cluster.local/healthz,http://capact-gateway.{{.Release.Namespace}}.svc.cluster.local/healthz,http://capact-hub-local.{{.Release.Namespace}}.svc.cluster.local/healthz,http://capact-hub-public.{{.Release.Namespace}}.svc.cluster.local/healthz"
Expand Down
62 changes: 5 additions & 57 deletions internal/k8s-engine/controller/action_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package controller
import (
"context"
"fmt"
"strings"

"capact.io/capact/internal/ptr"
"capact.io/capact/pkg/engine/k8s/api/v1alpha1"
Expand Down Expand Up @@ -42,6 +41,9 @@ type (
LockTypeInstances(ctx context.Context, action *v1alpha1.Action) error
UnlockTypeInstances(ctx context.Context, action *v1alpha1.Action) error
}
actionCleanup interface {
CleanupActionOwnedResources(ctx context.Context, action *v1alpha1.Action) (ignored bool, err error)
}
actionRenderer interface {
RenderAction(ctx context.Context, action *v1alpha1.Action) (*v1alpha1.RenderingStatus, error)
}
Expand All @@ -57,6 +59,7 @@ type (
actionService interface {
actionRenderer
actionStarter
actionCleanup
actionStatusGetter
actionTypeInstanceGetter
}
Expand Down Expand Up @@ -103,7 +106,7 @@ func (r *ActionReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {

if action.IsBeingDeleted() {
log.Info("Deleting runner action")
ignored, err := r.handleFinalizer(ctx, action, log)
ignored, err := r.svc.CleanupActionOwnedResources(ctx, action)
if err != nil {
return reportOnError(err, "Delete runner action")
}
Expand Down Expand Up @@ -162,40 +165,6 @@ func (r *ActionReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
return ctrl.Result{}, nil
}

func (r *ActionReconciler) handleFinalizer(ctx context.Context, action *v1alpha1.Action, log logr.Logger) (bool, error) {
if !controllerutil.ContainsFinalizer(action, v1alpha1.ActionFinalizer) {
return true, nil // our finalizer was already removed
}

switch {
case action.IsExecuted():

// Current decision:
log.Info("Ignoring delete request. Wait until Action execution will be finished.", "phase", string(action.Status.Phase))

// Deletion in this state is complicated as such Action can be in the middle of e.g. data migration, system shutdown,
// creating resources on hyperscaler side, etc.
// In the future we can revisit this approach based on user feedback and e.g. cancel running actions, maybe even rollback
// already executed steps, etc.
return true, nil

case action.IsCompleted():

// Ensure that TypeInstances are unlocked.
err := r.svc.UnlockTypeInstances(ctx, action)
if r.ignoreNotActionableTypeInstanceErrors(err) != nil {
return false, errors.Wrap(err, "while unlocking TypeInstances")
}

default:

// Currently, we do not need to clean up anything as we use ownerReference for other resources.
}

controllerutil.RemoveFinalizer(action, v1alpha1.ActionFinalizer)
return false, r.k8sCli.Update(ctx, action)
}

// initAction can be extracted to mutation webhook.
func (r *ActionReconciler) initAction(ctx context.Context, action *v1alpha1.Action) (ctrl.Result, error) {
if !controllerutil.ContainsFinalizer(action, v1alpha1.ActionFinalizer) {
Expand Down Expand Up @@ -439,27 +408,6 @@ func (r *ActionReconciler) newStatusForAction(action *v1alpha1.Action, eventType
return *statusCpy
}

// ignoreNotActionableTypeInstanceErrors ignores GraphQL error which says that TI are locked by different owner or do not exist.
// In our case it means that TI were already unlocked by a given Action and someone else locked them or deleted.
//
// TODO: Get rid of ridiculous string assertion after adding proper error types to GraphQL responses.
// http://knowyourmeme.com/memes/this-is-fine
func (r *ActionReconciler) ignoreNotActionableTypeInstanceErrors(err error) error {
if err == nil {
return nil
}

if strings.Contains(err.Error(), "locked by different owner") {
return nil
}

if strings.Contains(err.Error(), "not found") {
return nil
}

return err
}

// SetupWithManager sets up Action reconciler with a given controller manager.
func (r *ActionReconciler) SetupWithManager(mgr ctrl.Manager, maxConcurrentReconciles int) error {
r.k8sCli = mgr.GetClient()
Expand Down
81 changes: 74 additions & 7 deletions internal/k8s-engine/controller/action_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,30 @@ import (
"context"
"encoding/json"
"fmt"
"strings"
"time"

policypkg "capact.io/capact/internal/k8s-engine/policy"

"go.uber.org/zap"

"capact.io/capact/pkg/engine/k8s/policy"

graphqldomain "capact.io/capact/internal/k8s-engine/graphql/domain/action"
policypkg "capact.io/capact/internal/k8s-engine/policy"
statusreporter "capact.io/capact/internal/k8s-engine/status-reporter"
"capact.io/capact/internal/ptr"
"capact.io/capact/pkg/engine/k8s/api/v1alpha1"
"capact.io/capact/pkg/engine/k8s/policy"
hublocalapi "capact.io/capact/pkg/hub/api/graphql/local"
hubpublicapi "capact.io/capact/pkg/hub/api/graphql/public"
"capact.io/capact/pkg/runner"
"capact.io/capact/pkg/sdk/apis/0.0.1/types"
"capact.io/capact/pkg/sdk/renderer/argo"

"github.com/pkg/errors"
"go.uber.org/zap"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/yaml"
)

Expand Down Expand Up @@ -150,7 +150,7 @@ func (a *ActionService) EnsureWorkflowSAExists(ctx context.Context, action *v1al
case err == nil:
case apierrors.IsAlreadyExists(err):
old := &rbacv1.ClusterRoleBinding{}
key := client.ObjectKey{Name: binding.Name, Namespace: binding.Namespace}
key := client.ObjectKey{Name: binding.Name}
if err := a.k8sCli.Get(ctx, key, old); err != nil {
return nil, err
}
Expand All @@ -170,6 +170,73 @@ func (a *ActionService) EnsureWorkflowSAExists(ctx context.Context, action *v1al
return sa, nil
}

// CleanupActionOwnedResources removes the Action owned resources.
func (a *ActionService) CleanupActionOwnedResources(ctx context.Context, action *v1alpha1.Action) (bool, error) {
isCleanupIgnored := true
if !controllerutil.ContainsFinalizer(action, v1alpha1.ActionFinalizer) {
return isCleanupIgnored, nil // our finalizer was already removed
}

if action.IsExecuted() {
// Current decision:
a.log.Info("Ignoring delete request. Wait until Action execution will be finished.", zap.String("phase", string(action.Status.Phase)))

// Deletion in this state is complicated as such Action can be in the middle of e.g. data migration, system shutdown,
// creating resources on hyperscaler side, etc.
// In the future we can revisit this approach based on user feedback and e.g. cancel running actions, maybe even rollback
// already executed steps, etc.
return isCleanupIgnored, nil
}

// ===== Execute clean-up logic
isCleanupIgnored = false

// 1. Ensure that TypeInstances are unlocked.
err := a.UnlockTypeInstances(ctx, action)
if a.ignoreNotActionableTypeInstanceErrors(err) != nil {
return isCleanupIgnored, errors.Wrap(err, "while unlocking TypeInstances")
}

// 2. Ensure ClusterRoleBinding deleted
// We use ownerReference for created resources. But the cluster-scoped resources cannot have namespace-scoped owners.
// As a result, we need to remove it manually.
binding := &rbacv1.ClusterRoleBinding{
ObjectMeta: a.objectMetaFromAction(action),
}
if err := a.k8sCli.Delete(ctx, binding); client.IgnoreNotFound(err) != nil {
return isCleanupIgnored, errors.Wrapf(err, "while deleting ClusterRoleBinding owned by %s/%s Action", action.GetName(), action.GetNamespace())
}

// 3. Remove finalizer
controllerutil.RemoveFinalizer(action, v1alpha1.ActionFinalizer)
if err := a.k8sCli.Update(ctx, action); err != nil {
return isCleanupIgnored, errors.Wrap(err, "while removing Action finalizer")
}

return isCleanupIgnored, nil
}

// ignoreNotActionableTypeInstanceErrors ignores GraphQL error which says that TI are locked by different owner or do not exist.
// In our case it means that TI were already unlocked by a given Action and someone else locked them or deleted.
//
// TODO: Get rid of ridiculous string assertion after adding proper error types to GraphQL responses.
// http://knowyourmeme.com/memes/this-is-fine
func (a *ActionService) ignoreNotActionableTypeInstanceErrors(err error) error {
if err == nil {
return nil
}

if strings.Contains(err.Error(), "locked by different owner") {
return nil
}

if strings.Contains(err.Error(), "not found") {
return nil
}

return err
}

// EnsureRunnerInputDataCreated ensures that Kubernetes Secret with input data for a runner is created and up to date.
func (a *ActionService) EnsureRunnerInputDataCreated(ctx context.Context, saName string, action *v1alpha1.Action) error {
runnerCtx := runner.Context{
Expand Down
7 changes: 6 additions & 1 deletion test/e2e/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ var _ = Describe("Cluster check", func() {

Describe("Capact cluster health", func() {
Context("Pods in cluster", func() {
It("should be in running phase (ignored kube-system and default)", func() {
It("should be in running phase (ignored Namespace: [kube-system, default], ignored evicted Pods)", func() {
k8sCfg, err := config.GetConfig()
Expect(err).ToNot(HaveOccurred())

Expand Down Expand Up @@ -77,6 +77,11 @@ func podRunningAndReadyOrFinished(pod *v1.Pod) bool {
}
return ready
default:
// Ignore evicted pods which are in Failed state because of graceful node shutdown.
// see: https://kubernetes.io/docs/concepts/architecture/nodes/#graceful-node-shutdown
if strings.EqualFold(pod.Status.Reason, "Shutdown") {
return true
}
log("The status of Pod %s/%s is %s, waiting for it to be Running (with Ready = true)", pod.Namespace, pod.Name, pod.Status.Phase)
return false
}
Expand Down

0 comments on commit 58fc83d

Please sign in to comment.