Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

agent: Fix a bug where retry_join was not retrying. #24561

Merged
merged 2 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/24561.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
agent: Fixed a bug where `retry_join` gave up after a single failure, rather than retrying until max attempts had been reached
```
11 changes: 7 additions & 4 deletions command/agent/retry_join.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ func (r *retryJoiner) Validate(config *Config) error {
return nil
}

// retryJoin is used to handle retrying a join until it succeeds or all retries
// RetryJoin is used to handle retrying a join until it succeeds or all retries
// are exhausted.
func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) {
if len(serverJoin.RetryJoin) == 0 {
Expand All @@ -176,13 +176,16 @@ func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) {
var err error

for _, addr := range serverJoin.RetryJoin {

// If auto-discovery returns an error, log the error and
// fall-through, so we reach the retry logic and loop back around
// for another go.
servers, err := r.autoDiscover.Addrs(addr, r.logger)
if err != nil {
r.logger.Error("discovering join addresses failed", "join_config", addr, "error", err)
return
} else {
addrs = append(addrs, servers...)
}

addrs = append(addrs, servers...)
}

if len(addrs) > 0 {
Expand Down
63 changes: 63 additions & 0 deletions command/agent/retry_join_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

"github.com/hashicorp/go-netaddrs"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/testutil"
"github.com/mitchellh/cli"
Expand Down Expand Up @@ -261,6 +262,68 @@ func TestRetryJoin_Client(t *testing.T) {
require.Equal(stubAddress, output[0])
}

// MockFailDiscover implements the DiscoverInterface interface and can be used
// for tests that want to purposely fail the discovery process.
type MockFailDiscover struct {
ReceivedConfig string
}

func (m *MockFailDiscover) Addrs(cfg string, _ *golog.Logger) ([]string, error) {
return nil, fmt.Errorf("test: failed discovery %q", cfg)
}
func (m *MockFailDiscover) Help() string { return "" }
func (m *MockFailDiscover) Names() []string {
return []string{""}
}

func TestRetryJoin_RetryMaxAttempts(t *testing.T) {
ci.Parallel(t)

// Create an error channel to pass to the retry joiner. When the retry
// attempts have been exhausted, this channel is closed and our only way
// to test this apart from inspecting log entries.
errCh := make(chan struct{})

// Create a timeout to protect against problems within the test blocking
// for arbitrary long times.
timeout, timeoutStop := helper.NewSafeTimer(2 * time.Second)
defer timeoutStop()

var output []string

joiner := retryJoiner{
autoDiscover: autoDiscover{goDiscover: &MockFailDiscover{}},
clientJoin: func(s []string) (int, error) {
output = s
return 0, nil
},
clientEnabled: true,
logger: testlog.HCLogger(t),
errCh: errCh,
}

// Execute the retry join function in a routine, so we can track whether
// this returns and exits without close the error channel and thus
// indicating retry failure.
doneCh := make(chan struct{})

go func(doneCh chan struct{}) {
joiner.RetryJoin(&ServerJoin{RetryMaxAttempts: 1, RetryJoin: []string{"provider=foo"}})
close(doneCh)
}(doneCh)

// The main test; ensure error channel is closed, indicating the retry
// limit has been reached.
select {
case <-errCh:
must.Len(t, 0, output)
case <-doneCh:
t.Fatal("retry join completed without closing error channel")
case <-timeout.C:
t.Fatal("timeout reached without error channel close")
}
}

func TestRetryJoin_Validate(t *testing.T) {
ci.Parallel(t)
type validateExpect struct {
Expand Down