Skip to content

Commit

Permalink
[Bugfix] Allow shards with RF1 in EnforcedResignLeadership action (#1441)
Browse files Browse the repository at this point in the history
  • Loading branch information
ajanikow authored Oct 15, 2023
1 parent 83c5c83 commit ebd0dfd
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
- (Feature) EnforcedResignLeadership action
- (Maintenance) Make scale_down_candidate annotation obsolete
- (Bugfix) Fix ResignJob ID propagation
- (Bugfix) Allow shards with RF1 in EnforcedResignLeadership action

## [1.2.33](https://github.com/arangodb/kube-arangodb/tree/1.2.33) (2023-09-27)
- (Maintenance) Bump golang.org/x/net to v0.13.0
Expand Down
24 changes: 24 additions & 0 deletions pkg/deployment/agency/state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,30 @@ func (s State) PlanLeaderServers() Servers {
return r
}

// PlanLeaderServersWithFailOver returns all servers which are part of the plan as a leader and can fail over
func (s State) PlanLeaderServersWithFailOver() Servers {
	// Collect each shard's leader (first server in the shard's server list),
	// deduplicated via a set. Shards with a single replica (ReplicationFactor 1)
	// are skipped: with no followers there is nothing to fail over to.
	leaders := map[Server]bool{}

	for _, db := range s.Plan.Collections {
		for _, col := range db {
			for _, servers := range col.Shards {
				if len(servers) > 1 {
					leaders[servers[0]] = true
				}
			}
		}
	}

	// Flatten the set into a slice; iteration order over the map is unspecified.
	result := make([]Server, 0, len(leaders))
	for leader := range leaders {
		result = append(result, leader)
	}

	return result
}

type CollectionShardDetails []CollectionShardDetail

type CollectionShardDetail struct {
Expand Down
17 changes: 12 additions & 5 deletions pkg/deployment/reconcile/action_enforce_resign_leadership.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,30 +103,37 @@ func (a *actionEnforceResignLeadership) CheckProgress(ctx context.Context) (bool
}

// Lets start resign job if required
if j, ok := a.actionCtx.Get(a.action, resignLeadershipJobID); ok && j != "" {
if j, ok := a.actionCtx.Get(a.action, resignLeadershipJobID); ok && j != "" && j != "N/A" {
_, jobStatus := agencyState.Target.GetJob(state.JobID(j))
switch jobStatus {
case state.JobPhaseFailed:
a.log.Error("Resign server job failed")
// Remove key
a.actionCtx.Add(resignLeadershipJobID, "", true)
a.actionCtx.Add(resignLeadershipJobID, "N/A", true)
return false, false, nil
case state.JobPhaseFinished:
a.log.Info("Job finished")
// Remove key
a.actionCtx.Add(resignLeadershipJobID, "", true)
a.actionCtx.Add(resignLeadershipJobID, "N/A", true)
case state.JobPhaseUnknown:
a.log.Str("status", string(jobStatus)).Error("Resign server job unknown status")
return false, false, nil
default:
return false, false, nil
}

a.actionCtx.Add(resignLeadershipJobID, "N/A", true)

// Job is Finished, check if we are not a leader anymore
if agencyState.PlanLeaderServers().Contains(state.Server(m.ID)) {
// We are still a leader!
a.log.Warn("DBServers is still a leader for shards")
return false, false, nil
if agencyState.PlanLeaderServersWithFailOver().Contains(state.Server(m.ID)) {
// We need to retry
a.log.Warn("DBServer is still a leader for shards")
return false, false, nil
}
// Nothing to do as RF is set to 1
a.log.Warn("DBServer is still a leader for shards, but ReplicationFactor is set to 1")
}
return true, false, nil
}
Expand Down

0 comments on commit ebd0dfd

Please sign in to comment.