Skip to content

Commit

Permalink
Retry cluster join on "too many learners" error
Browse files (browse the repository at this point in the history)
Signed-off-by: Brad Davidson <[email protected]>
  • Loading branch information
brandond committed Apr 28, 2023
1 parent f1b6a35 commit 91afb38
Showing 1 changed file with 13 additions and 3 deletions.
16 changes: 13 additions & 3 deletions pkg/etcd/etcd.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -419,10 +419,20 @@ func (e *ETCD) Start(ctx context.Context, clientAccessInfo *clientaccess.Info) e
 for {
 select {
 case <-time.After(30 * time.Second):
-logrus.Infof("Waiting for agent to become ready before joining ETCD cluster")
+logrus.Infof("Waiting for agent to become ready before joining etcd cluster")
 case <-e.config.Runtime.AgentReady:
-if err := e.join(ctx, clientAccessInfo); err != nil {
-logrus.Fatalf("ETCD join failed: %v", err)
+if err := wait.PollImmediateUntilWithContext(ctx, time.Second, func(ctx context.Context) (bool, error) {
+if err := e.join(ctx, clientAccessInfo); err != nil {
+// Retry the join if waiting for another member to be promoted, or waiting for peers to connect after promotion
+if errors.Is(err, rpctypes.ErrTooManyLearners) || errors.Is(err, rpctypes.ErrGRPCUnhealthy) {
+logrus.Infof("Waiting for other members to finish joining etcd cluster")
+return false, nil
+}
+return false, err
+}
+return true, nil
+}); err != nil {
+logrus.Fatalf("etcd cluster join failed: %v", err)
 }
 return
 case <-ctx.Done():
Expand Down

0 comments on commit 91afb38

Please sign in to comment.