From 2580e66d99879d147166b68ebcbe17cc66b6ef1c Mon Sep 17 00:00:00 2001 From: Stuart Douglas Date: Tue, 24 Sep 2024 06:33:07 +1000 Subject: [PATCH] fix: rolling deployments race (#2790) If we delete the deployments/runners straight away they may still be in some controllers' route tables. By adding a small delay we make sure that all the controllers will have updated their tables. This is a pretty nasty hack, but will likely be temporary. fixes: #2789 --- .../scaling/k8sscaling/deployment_provisioner.go | 16 +++++++++++----- .../scaling/localscaling/local_scaling.go | 7 ++++++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/backend/controller/scaling/k8sscaling/deployment_provisioner.go b/backend/controller/scaling/k8sscaling/deployment_provisioner.go index 94d877b67f..10063485d4 100644 --- a/backend/controller/scaling/k8sscaling/deployment_provisioner.go +++ b/backend/controller/scaling/k8sscaling/deployment_provisioner.go @@ -131,11 +131,17 @@ func (r *DeploymentProvisioner) handleSchemaChange(ctx context.Context, msg *ftl case ftlv1.DeploymentChangeType_DEPLOYMENT_REMOVED: delete(r.KnownDeployments, msg.DeploymentKey) if deploymentExists { - logger.Infof("deleting deployment %s", msg.ModuleName) - err := deploymentClient.Delete(ctx, msg.DeploymentKey, v1.DeleteOptions{}) - if err != nil { - return fmt.Errorf("failed to delete deployment %s: %w", msg.ModuleName, err) - } + go func() { + + // Nasty hack, we want all the controllers to have updated their route tables before we kill the runner + // so we add a slight delay here + time.Sleep(time.Second * 10) + logger.Infof("deleting deployment %s", msg.ModuleName) + err := deploymentClient.Delete(ctx, msg.DeploymentKey, v1.DeleteOptions{}) + if err != nil { + logger.Errorf(err, "failed to delete deployment %s", msg.ModuleName) + } + }() } } return nil diff --git a/backend/controller/scaling/localscaling/local_scaling.go b/backend/controller/scaling/localscaling/local_scaling.go index 
e88d90fba3..0a96daee1a 100644 --- a/backend/controller/scaling/localscaling/local_scaling.go +++ b/backend/controller/scaling/localscaling/local_scaling.go @@ -139,7 +139,12 @@ func (l *localScaling) reconcileRunners(ctx context.Context, deploymentRunners * return err } } else if deploymentRunners.replicas == 0 && deploymentRunners.runner.Ok() { - deploymentRunners.runner.MustGet().cancelFunc() + go func() { + // Nasty hack, we want all the controllers to have updated their route tables before we kill the runner + // so we add a slight delay here + time.Sleep(time.Second * 10) + deploymentRunners.runner.MustGet().cancelFunc() + }() deploymentRunners.runner = optional.None[runnerInfo]() } return nil