Skip to content

Commit

Permalink
agent: Fix a bug where retry_join was not retrying. (#24561)
Browse files Browse the repository at this point in the history
The retry_join logic was not allowing for retries to happen and
was exiting after the first failed discovery attempt. This change
fixes that behaviour and adds a test to ensure no further
regressions.
  • Loading branch information
jrasell authored Nov 29, 2024
1 parent 3a18f22 commit 261359f
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .changelog/24561.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
agent: Fixed a bug where `retry_join` gave up after a single failure, rather than retrying until max attempts had been reached
```
11 changes: 7 additions & 4 deletions command/agent/retry_join.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ func (r *retryJoiner) Validate(config *Config) error {
return nil
}

// retryJoin is used to handle retrying a join until it succeeds or all retries
// RetryJoin is used to handle retrying a join until it succeeds or all retries
// are exhausted.
func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) {
if len(serverJoin.RetryJoin) == 0 {
Expand All @@ -176,13 +176,16 @@ func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) {
var err error

for _, addr := range serverJoin.RetryJoin {

// If auto-discovery returns an error, log the error and
// fall-through, so we reach the retry logic and loop back around
// for another go.
servers, err := r.autoDiscover.Addrs(addr, r.logger)
if err != nil {
r.logger.Error("discovering join addresses failed", "join_config", addr, "error", err)
return
} else {
addrs = append(addrs, servers...)
}

addrs = append(addrs, servers...)
}

if len(addrs) > 0 {
Expand Down
63 changes: 63 additions & 0 deletions command/agent/retry_join_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

"github.com/hashicorp/go-netaddrs"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/testutil"
"github.com/mitchellh/cli"
Expand Down Expand Up @@ -261,6 +262,68 @@ func TestRetryJoin_Client(t *testing.T) {
require.Equal(stubAddress, output[0])
}

// MockFailDiscover implements the DiscoverInterface interface and can be used
// for tests that want to purposely fail the discovery process.
type MockFailDiscover struct {
ReceivedConfig string
}

func (m *MockFailDiscover) Addrs(cfg string, _ *golog.Logger) ([]string, error) {
return nil, fmt.Errorf("test: failed discovery %q", cfg)
}
func (m *MockFailDiscover) Help() string { return "" }
func (m *MockFailDiscover) Names() []string {
return []string{""}
}

func TestRetryJoin_RetryMaxAttempts(t *testing.T) {
ci.Parallel(t)

// Create an error channel to pass to the retry joiner. When the retry
// attempts have been exhausted, this channel is closed and our only way
// to test this apart from inspecting log entries.
errCh := make(chan struct{})

// Create a timeout to protect against problems within the test blocking
// for arbitrary long times.
timeout, timeoutStop := helper.NewSafeTimer(2 * time.Second)
defer timeoutStop()

var output []string

joiner := retryJoiner{
autoDiscover: autoDiscover{goDiscover: &MockFailDiscover{}},
clientJoin: func(s []string) (int, error) {
output = s
return 0, nil
},
clientEnabled: true,
logger: testlog.HCLogger(t),
errCh: errCh,
}

// Execute the retry join function in a routine, so we can track whether
// this returns and exits without close the error channel and thus
// indicating retry failure.
doneCh := make(chan struct{})

go func(doneCh chan struct{}) {
joiner.RetryJoin(&ServerJoin{RetryMaxAttempts: 1, RetryJoin: []string{"provider=foo"}})
close(doneCh)
}(doneCh)

// The main test; ensure error channel is closed, indicating the retry
// limit has been reached.
select {
case <-errCh:
must.Len(t, 0, output)
case <-doneCh:
t.Fatal("retry join completed without closing error channel")
case <-timeout.C:
t.Fatal("timeout reached without error channel close")
}
}

func TestRetryJoin_Validate(t *testing.T) {
ci.Parallel(t)
type validateExpect struct {
Expand Down

0 comments on commit 261359f

Please sign in to comment.