Abort consoles with more than one pod #264

Merged
3 commits merged on Mar 14, 2022
VERSION (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
-3.8.0
+3.8.1
controllers/workloads/console/controller.go (104 changes: 81 additions & 23 deletions)
@@ -267,7 +267,13 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 		}
 	}

-	job, err := r.getJob(ctx, req.NamespacedName)
+	var (
+		job     *batchv1.Job
+		pod     *corev1.Pod
+		podList corev1.PodList
+	)
+
+	job, err = r.getJob(ctx, req.NamespacedName)
 	if err != nil {
 		job = nil
 	}
@@ -293,6 +299,36 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 		}
 	}

+	if job != nil {
+		matchLabels := client.MatchingLabels(map[string]string{"job-name": job.ObjectMeta.Name})
+		if err := r.List(ctx, &podList, client.InNamespace(csl.Namespace), matchLabels); err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "failed to list pods for console job")
+		}
+
+		// Check if more than one pod belongs to the console or if the current pod is not the same
+		// as the original pod (which also means that more than one pod was launched). A console's
+		// job object has `restartPolicy=Never` and `backoffLimit=0`. These two settings together
+		// prevent the job from launching a second pod when a failure occurs on the original pod.
+		// However, these settings don't cover situations where a console pod is deleted
+		// (e.g. manual deletion, eviction, preemption). We want consoles to never launch more than
+		// one pod. Launching a subsequent pod is problematic even if there is only one running pod
+		// at any given time. It causes the controller to enter its state machine logic in a way
+		// that it wasn't designed to handle. It also causes the console to remain in a running
+		// state for far longer than users expect.
+		if len(podList.Items) > 1 || (len(podList.Items) == 1 && csl.Status.PodName != "" && csl.Status.PodName != podList.Items[0].Name) {
+			logger.Info("More than one pod observed for console; deleting job and stopping console")
+			if err := r.abort(ctx, logger, csl, job, &podList); err != nil {
+				return ctrl.Result{}, errors.Wrap(err, "failed to abort console")
+			}
+			// No need to requeue after an abort because the deleted job will trigger us again.
+			return ctrl.Result{Requeue: false}, nil
+		}
+
+		if len(podList.Items) == 1 {
+			pod = &podList.Items[0]
+		}
+	}
+
 	// Update the status fields in case they're out of sync, or the console spec
 	// has been updated
 	statusCtx := consoleStatusContext{
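A note on the `restartPolicy=Never` / `backoffLimit=0` comment above: those fields live on the console's Job object, which this diff doesn't show. As a rough sketch (field values assumed from the comment, not taken from this repository's job builder), a job template with these settings might look like:

```go
// Sketch only: a Job whose controller will never create a replacement pod
// after a container failure. batchv1 is k8s.io/api/batch/v1 and corev1 is
// k8s.io/api/core/v1; the name and image are illustrative.
backoffLimit := int32(0) // no retry pods after the first failure
job := &batchv1.Job{
	Spec: batchv1.JobSpec{
		BackoffLimit: &backoffLimit,
		Template: corev1.PodTemplateSpec{
			Spec: corev1.PodSpec{
				// Failed containers are not restarted in place; combined with
				// backoffLimit=0 the job never retries a failed pod.
				RestartPolicy: corev1.RestartPolicyNever,
				Containers:    []corev1.Container{{Name: "console", Image: "alpine:3"}},
			},
		},
	},
}
```

As the in-code comment notes, neither setting stops the job controller from creating a replacement when the original pod is deleted outright (eviction, preemption, manual deletion), which is the gap this PR closes.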
@@ -301,9 +337,10 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 		Authorisation:     authorisation,
 		AuthorisationRule: authRule,
 		Job:               job,
+		Pod:               pod,
 	}

-	csl, err = r.generateStatusAndAuditEvents(ctx, logger, req.NamespacedName, csl, statusCtx)
+	csl, err = r.generateStatusAndAuditEvents(ctx, logger, csl, statusCtx)
 	if err != nil {
 		return ctrl.Result{}, errors.Wrap(err, "failed to generate console status or audit events")
 	}
@@ -339,6 +376,10 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 				return ctrl.Result{}, err
 			}
 		}
+		// Retrigger reconciliation periodically to catch situations where a console pod is deleted
+		// and re-spawned by the console job. Note that this isn't strictly necessary, as Kubernetes
+		// will periodically refresh its caches and queue reconciliation events anyway.
+		res = requeueAfterInterval(logger, 30*time.Second)
 	case csl.PostRunning():
 		// Requeue for when the console has reached its after-finished TTL so it can be deleted
 		res = requeueAfterInterval(logger, time.Until(*csl.GetGCTime()))
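`requeueAfterInterval` is an existing helper in this codebase whose body isn't part of the diff; a plausible shape for it (an assumption, not the repository's actual implementation) would be:

```go
// Assumed sketch of the requeueAfterInterval helper used above: log the
// decision, then return a controller-runtime result asking for this object
// to be reconciled again after the given interval.
func requeueAfterInterval(logger logr.Logger, interval time.Duration) ctrl.Result {
	logger.Info("Requeueing reconciliation", "requeue_after", interval)
	return ctrl.Result{RequeueAfter: interval}
}
```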
@@ -517,27 +558,7 @@ type consoleStatusContext struct {
 	Job *batchv1.Job
 }

-func (r *ConsoleReconciler) generateStatusAndAuditEvents(ctx context.Context, logger logr.Logger, name types.NamespacedName, csl *workloadsv1alpha1.Console, statusCtx consoleStatusContext) (*workloadsv1alpha1.Console, error) {
-	var (
-		pod     *corev1.Pod
-		podList corev1.PodList
-	)
-
-	if statusCtx.Job != nil {
-		inNamespace := client.InNamespace(name.Namespace)
-		matchLabels := client.MatchingLabels(map[string]string{"job-name": statusCtx.Job.ObjectMeta.Name})
-		if err := r.List(ctx, &podList, inNamespace, matchLabels); err != nil {
-			return nil, errors.Wrap(err, "failed to list pods for console job")
-		}
-	}
-	if len(podList.Items) > 0 {
-		pod = &podList.Items[0]
-	} else {
-		pod = nil
-	}
-
-	statusCtx.Pod = pod
-
+func (r *ConsoleReconciler) generateStatusAndAuditEvents(ctx context.Context, logger logr.Logger, csl *workloadsv1alpha1.Console, statusCtx consoleStatusContext) (*workloadsv1alpha1.Console, error) {
 	logger = getAuditLogger(logger, csl, statusCtx)
 	newStatus := calculateStatus(csl, statusCtx)

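One detail that makes both the new listing code in `Reconcile` and the removed block above work: the built-in job controller stamps a `job-name` label on every pod it creates (newer Kubernetes releases also add the prefixed `batch.kubernetes.io/job-name`), so a job's pods can be found with a plain label selector. A minimal, self-contained version of the same lookup (names are illustrative):

```go
// List the pods belonging to a given job by matching the `job-name` label
// that the job controller applies to the pods it creates.
var pods corev1.PodList
err := r.List(ctx, &pods,
	client.InNamespace(job.Namespace),
	client.MatchingLabels{"job-name": job.Name},
)
```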
@@ -635,6 +656,43 @@ func calculateStatus(csl *workloadsv1alpha1.Console, statusCtx consoleStatusCont
 	return newStatus
 }

+func (r *ConsoleReconciler) abort(ctx context.Context, logger logr.Logger, csl *workloadsv1alpha1.Console, job *batchv1.Job, podList *corev1.PodList) error {
+	// Delete job
+	if err := r.Client.Delete(ctx, job); err != nil {
+		return errors.Wrap(err, "failed to delete job")
+	}
+
+	// Delete pods. In theory we shouldn't have to do this. All console pods are owned by
+	// the console job. A delete operation should cascade. However, in our testing we saw
+	// that the second pod launched by the job consistently lingers on after the job is gone.
+	var podDeleteError error
+	for _, pod := range podList.Items {
+		if err := r.Client.Delete(ctx, &pod); err != nil {
+			podDeleteError = err
+		}
+	}
+	if podDeleteError != nil {
+		return errors.Wrap(podDeleteError, "failed to delete pod(s)")
Review thread on this line:

Contributor: I'm not sure we need this, as the Job will own the pods. Deleting the job should remove all of the pods for us.

Contributor: IIRC there's no guarantee that the job controller will get the deletion event prior to being notified of its pods being removed, although this is unlikely.

Contributor (author): Yes indeed, in theory we shouldn't have to delete pods. They are owned by the console job. A delete operation should cascade. However, when I tested manual deletion, for some reason the newly launched second pod lingered on indefinitely after the job was gone.

Contributor (author): As agreed, I've added a comment in the source code about this.
+	}
+
+	// Update console status
+	newStatus := csl.DeepCopy().Status
+	newStatus.Phase = workloadsv1alpha1.ConsoleStopped
+	updatedCsl := csl.DeepCopy()
+	updatedCsl.Status = newStatus
+
+	if err := r.createOrUpdate(ctx, logger, csl, updatedCsl, Console, consoleDiff); err != nil {
+		return err
+	}
+
+	// Publish termination event
+	if err := r.LifecycleRecorder.ConsoleTerminate(ctx, csl, false, nil); err != nil {
+		logging.WithNoRecord(logger).Error(err, "failed to record event", "event", "console.terminate")
+	}
+
+	return nil
+}
+
 func calculatePhase(statusCtx consoleStatusContext) workloadsv1alpha1.ConsolePhase {
 	if !statusCtx.IsAuthorised {
 		return workloadsv1alpha1.ConsolePendingAuthorisation