Skip to content

Commit

Permalink
[Issue 9195] Evict silos from cluster if they remain in the Joining o…
Browse files Browse the repository at this point in the history
…r Created state for longer than MaxJoinAttemptTime (#9201)
  • Loading branch information
Chris-Eckhardt authored Nov 7, 2024
1 parent 186e5f8 commit f086d1b
Show file tree
Hide file tree
Showing 4 changed files with 296 additions and 59 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,10 @@ public class ClusterMembershipOptions
/// Gets or sets a value indicating whether to enable probing silos indirectly, via other silos.
/// </summary>
public bool EnableIndirectProbes { get; set; } = true;

/// <summary>
/// Gets or sets a value indicating whether silos that remain in the <c>Joining</c> or <c>Created</c> state for longer than <see cref="MaxJoinAttemptTime"/> are evicted from cluster membership.
/// </summary>
public bool EvictWhenMaxJoinAttemptTimeExceeded { get; set; } = true;
}
}
48 changes: 47 additions & 1 deletion src/Orleans.Runtime/MembershipService/ClusterHealthMonitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,14 @@ private async Task ProcessMembershipUpdates()
if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug("Starting to process membership updates");
await foreach (var tableSnapshot in this.membershipService.MembershipTableUpdates.WithCancellation(this.shutdownCancellation.Token))
{
var newMonitoredSilos = this.UpdateMonitoredSilos(tableSnapshot, this.monitoredSilos, DateTime.UtcNow);
var utcNow = DateTime.UtcNow;

var newMonitoredSilos = this.UpdateMonitoredSilos(tableSnapshot, this.monitoredSilos, utcNow);

if (this.clusterMembershipOptions.CurrentValue.EvictWhenMaxJoinAttemptTimeExceeded)
{
await this.EvictStaleStateSilos(tableSnapshot, utcNow);
}

foreach (var pair in this.monitoredSilos)
{
Expand All @@ -103,6 +110,45 @@ private async Task ProcessMembershipUpdates()
}
}

/// <summary>
/// Asks the membership service to suspect or kill every silo in <paramref name="membership"/> whose status is
/// <see cref="SiloStatus.Created"/> or <see cref="SiloStatus.Joining"/> and whose start time lies more than
/// <c>MaxJoinAttemptTime</c> before <paramref name="utcNow"/>.
/// </summary>
/// <param name="membership">The membership table snapshot whose entries are inspected.</param>
/// <param name="utcNow">The current time, compared against each entry's <c>StartTime</c>.</param>
/// <returns>A task that completes once every matching entry has been processed.</returns>
/// <remarks>
/// An exception thrown by <c>TryToSuspectOrKill</c> for one silo is logged and does not prevent the remaining
/// entries from being processed.
/// </remarks>
private async Task EvictStaleStateSilos(
    MembershipTableSnapshot membership,
    DateTime utcNow)
{
    // Read the option once so that every entry in this pass is judged against the same limit.
    var maxJoinTime = this.clusterMembershipOptions.CurrentValue.MaxJoinAttemptTime;

    foreach (var member in membership.Entries)
    {
        if (!IsCreatedOrJoining(member.Value.Status)
            || !HasExceededMaxJoinTime(
                startTime: member.Value.StartTime,
                now: utcNow,
                maxJoinTime: maxJoinTime))
        {
            continue;
        }

        try
        {
            if (this.log.IsEnabled(LogLevel.Debug)) this.log.LogDebug("Stale silo with a joining or created state found, calling `TryToSuspectOrKill`");
            await this.membershipService.TryToSuspectOrKill(member.Key);
        }
        catch (Exception exception)
        {
            // Best effort: record the failure and carry on with the other entries.
            this.log.LogError(
                exception,
                "Silo {suspectAddress} has had the status `{siloStatus}` for longer than `MaxJoinAttemptTime` but a call to `TryToSuspectOrKill` has failed",
                member.Value.SiloAddress,
                member.Value.Status.ToString());
        }
    }

    static bool IsCreatedOrJoining(SiloStatus status)
    {
        return status == SiloStatus.Created || status == SiloStatus.Joining;
    }

    static bool HasExceededMaxJoinTime(DateTime startTime, DateTime now, TimeSpan maxJoinTime)
    {
        // Compare elapsed time rather than computing startTime + maxJoinTime: DateTime.Add throws
        // ArgumentOutOfRangeException when the sum leaves the DateTime range (e.g. a very large
        // maxJoinTime), whereas the difference of two DateTime values always fits in a TimeSpan.
        return now - startTime > maxJoinTime;
    }
}

[Pure]
private ImmutableDictionary<SiloAddress, SiloHealthMonitor> UpdateMonitoredSilos(
MembershipTableSnapshot membership,
Expand Down
Loading

0 comments on commit f086d1b

Please sign in to comment.