From 7c5a3e509bb110cfce7c953ecdde7f7b8dd360e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Gurhem?= Date: Wed, 30 Oct 2024 09:42:32 +0100 Subject: [PATCH] fix: health check waits for tasks to finish before becoming unhealthy --- Common/src/Pollster/Pollster.cs | 24 +++++++++++++----------- Common/tests/Pollster/PollsterTest.cs | 1 - 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/Common/src/Pollster/Pollster.cs b/Common/src/Pollster/Pollster.cs index bf098cf3a..78fcb7036 100644 --- a/Common/src/Pollster/Pollster.cs +++ b/Common/src/Pollster/Pollster.cs @@ -43,6 +43,8 @@ using Microsoft.Extensions.Diagnostics.HealthChecks; using Microsoft.Extensions.Logging; +using TaskStatus = ArmoniK.Core.Common.Storage.TaskStatus; + namespace ArmoniK.Core.Common.Pollster; public class Pollster : IInitializable @@ -170,12 +172,7 @@ public async Task Init(CancellationToken cancellationToken) public async Task Check(HealthCheckTag tag) { - if (healthCheckFailedResult_ is not null) - { - return healthCheckFailedResult_ ?? HealthCheckResult.Unhealthy("Health Check failed previously so this polling agent should be destroyed."); - } - - if (endLoopReached_) + if (endLoopReached_ && taskProcessingDict_.IsEmpty) { return HealthCheckResult.Unhealthy("End of main loop reached, no more tasks will be executed."); } @@ -234,6 +231,12 @@ public async Task Check(HealthCheckTag tag) healthCheckFailedResult_ = result; } + if (tag == HealthCheckTag.Liveness && taskProcessingDict_.Any(pair => pair.Value.GetAcquiredTaskInfo() + .TaskStatus != TaskStatus.Dispatched)) + { + return HealthCheckResult.Healthy(); + } + if (tag == HealthCheckTag.Readiness && taskProcessingDict_.IsEmpty) { return HealthCheckResult.Unhealthy("No tasks to process"); @@ -264,11 +267,10 @@ await Init(exceptionManager_.EarlyCancellationToken) if (healthCheckFailedResult_ is not null) { var hcr = healthCheckFailedResult_.Value; - exceptionManager_.FatalError(logger_, - hcr.Exception, - "Health Check failed with status {Status} thus no more tasks will be executed:\n{Description}", - hcr.Status, - hcr.Description); + logger_.LogError(hcr.Exception, + "Health Check failed with status {Status} thus no more tasks will be acquired (tasks already acquired will be executed to completion if possible):\n{Description}", + hcr.Status, + hcr.Description); return; } diff --git a/Common/tests/Pollster/PollsterTest.cs b/Common/tests/Pollster/PollsterTest.cs index f9ea55cea..c5e753e8b 100644 --- a/Common/tests/Pollster/PollsterTest.cs +++ b/Common/tests/Pollster/PollsterTest.cs @@ -376,7 +376,6 @@ await testServiceProvider.Pollster.Init(CancellationToken.None) // This test that we return from the mainloop after the health check is unhealthy await testServiceProvider.Pollster.MainLoop() .ConfigureAwait(false); - Assert.True(testServiceProvider.ExceptionManager.Failed); } [Test]