-
Notifications
You must be signed in to change notification settings - Fork 2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add cluster join command line options and configuration options #527
Changes from 7 commits
0d0fe0a
63bdb79
2fbafc1
b2d5741
c971f7b
53b23cd
fd3765f
07563b9
5336df6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,11 @@ func (c *Command) readConfig() *Config { | |
|
||
// Server-only options | ||
flags.IntVar(&cmdConfig.Server.BootstrapExpect, "bootstrap-expect", 0, "") | ||
flags.BoolVar(&cmdConfig.Server.RejoinAfterLeave, "rejoin", false, "") | ||
flags.Var((*sliceflag.StringFlag)(&cmdConfig.Server.StartJoin), "join", "") | ||
flags.Var((*sliceflag.StringFlag)(&cmdConfig.Server.RetryJoin), "retry-join", "") | ||
flags.IntVar(&cmdConfig.Server.RetryMaxAttempts, "retry-max", 0, "") | ||
flags.StringVar(&cmdConfig.Server.RetryInterval, "retry-interval", "", "") | ||
|
||
// Client-only options | ||
flags.StringVar(&cmdConfig.Client.StateDir, "state-dir", "", "") | ||
|
@@ -100,6 +105,15 @@ func (c *Command) readConfig() *Config { | |
return nil | ||
} | ||
|
||
if cmdConfig.Server.RetryInterval != "" { | ||
dur, err := time.ParseDuration(cmdConfig.Server.RetryInterval) | ||
if err != nil { | ||
c.Ui.Error(fmt.Sprintf("Error: %s", err)) | ||
return nil | ||
} | ||
cmdConfig.Server.retryInterval = dur | ||
} | ||
|
||
// Split the servers. | ||
if servers != "" { | ||
cmdConfig.Client.Servers = strings.Split(servers, ",") | ||
|
@@ -358,6 +372,12 @@ func (c *Command) Run(args []string) int { | |
} | ||
}() | ||
|
||
// Join startup nodes if specified | ||
if err := c.startupJoin(config); err != nil { | ||
c.Ui.Error(err.Error()) | ||
return 1 | ||
} | ||
|
||
// Compile agent information for output later | ||
info := make(map[string]string) | ||
info["client"] = strconv.FormatBool(config.Client.Enabled) | ||
|
@@ -396,12 +416,16 @@ func (c *Command) Run(args []string) int { | |
// Enable log streaming | ||
logGate.Flush() | ||
|
||
// Start retry join process | ||
errCh := make(chan struct{}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add the retryJoinErr channel to the Command struct so we don't have to pass the channel into the functions There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added to the Command struct |
||
go c.retryJoin(config, errCh) | ||
|
||
// Wait for exit | ||
return c.handleSignals(config) | ||
return c.handleSignals(config, errCh) | ||
} | ||
|
||
// handleSignals blocks until we get an exit-causing signal | ||
func (c *Command) handleSignals(config *Config) int { | ||
func (c *Command) handleSignals(config *Config, retryJoin <-chan struct{}) int { | ||
signalCh := make(chan os.Signal, 4) | ||
signal.Notify(signalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGHUP) | ||
|
||
|
@@ -413,6 +437,8 @@ WAIT: | |
sig = s | ||
case <-c.ShutdownCh: | ||
sig = os.Interrupt | ||
case <-retryJoin: | ||
return 1 | ||
} | ||
c.Ui.Output(fmt.Sprintf("Caught signal: %v", sig)) | ||
|
||
|
@@ -559,6 +585,52 @@ func (c *Command) setupSCADA(config *Config) error { | |
return nil | ||
} | ||
|
||
func (c *Command) startupJoin(config *Config) error { | ||
if len(config.Server.StartJoin) == 0 || !config.Server.Enabled { | ||
return nil | ||
} | ||
|
||
c.Ui.Output("Joining cluster...") | ||
n, err := c.agent.server.Join(config.Server.StartJoin) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
c.Ui.Info(fmt.Sprintf("Join completed. Synced with %d initial agents", n)) | ||
return nil | ||
} | ||
|
||
// retryJoin is used to handle retrying a join until it succeeds or all retries | ||
// are exhausted. | ||
func (c *Command) retryJoin(config *Config, errCh chan<- struct{}) { | ||
if len(config.Server.RetryJoin) == 0 || !config.Server.Enabled { | ||
return | ||
} | ||
|
||
logger := c.agent.logger | ||
logger.Printf("[INFO] agent: Joining cluster...") | ||
|
||
attempt := 0 | ||
for { | ||
n, err := c.agent.server.Join(config.Server.RetryJoin) | ||
if err == nil { | ||
logger.Printf("[INFO] agent: Join completed. Synced with %d initial agents", n) | ||
return | ||
} | ||
|
||
attempt++ | ||
if config.Server.RetryMaxAttempts > 0 && attempt > config.Server.RetryMaxAttempts { | ||
logger.Printf("[ERROR] agent: max join retry exhausted, exiting") | ||
close(errCh) | ||
return | ||
} | ||
|
||
logger.Printf("[WARN] agent: Join failed: %v, retrying in %v", err, | ||
config.Server.RetryInterval) | ||
time.Sleep(config.Server.retryInterval) | ||
} | ||
} | ||
|
||
func (c *Command) Synopsis() string { | ||
return "Runs a Nomad agent" | ||
} | ||
|
@@ -632,6 +704,24 @@ Server Options: | |
bootstrapping the cluster. Once <num> servers have joined each other, | ||
Nomad initiates the bootstrap process. | ||
|
||
-join=<address> | ||
Address of an agent to join at start time. Can be specified | ||
multiple times. | ||
|
||
-retry-join=<address> | ||
Address of an agent to join at start time with retries enabled. | ||
Can be specified multiple times. | ||
|
||
-retry-max=<num> | ||
Maximum number of join attempts. Defaults to 0, which will retry | ||
indefinitely. | ||
|
||
-retry-interval=<dur> | ||
Time to wait between join attempts. | ||
|
||
-rejoin | ||
Ignores a previous leave and attempts to rejoin the cluster. | ||
|
||
Client Options: | ||
|
||
-client | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ import ( | |
"path/filepath" | ||
"reflect" | ||
"testing" | ||
"time" | ||
|
||
"github.com/hashicorp/nomad/nomad/structs" | ||
) | ||
|
@@ -114,6 +115,11 @@ func TestConfig_Merge(t *testing.T) { | |
NumSchedulers: 2, | ||
EnabledSchedulers: []string{structs.JobTypeBatch}, | ||
NodeGCThreshold: "12h", | ||
RejoinAfterLeave: true, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a test case that runs without any of these options set? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only tests that use these options are TestConfig_Merge, TestConfig_LoadConfigString, and TestRetryJoin. Is there a specific test case that you are thinking of? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
No, I just like to make sure we test the default case where none of the new options is specified so we don't get a regression at some point. If there is not an existing test case that covers this we should add a new one. I asked because it's hard to see this from the diff; I will look more closely later. |
||
StartJoin: []string{"1.1.1.1"}, | ||
RetryJoin: []string{"1.1.1.1"}, | ||
RetryInterval: "10s", | ||
retryInterval: time.Second * 10, | ||
}, | ||
Ports: &Ports{ | ||
HTTP: 20000, | ||
|
@@ -384,6 +390,11 @@ func TestConfig_LoadConfigString(t *testing.T) { | |
NumSchedulers: 2, | ||
EnabledSchedulers: []string{"test"}, | ||
NodeGCThreshold: "12h", | ||
RetryJoin: []string{"1.1.1.1", "2.2.2.2"}, | ||
StartJoin: []string{"1.1.1.1", "2.2.2.2"}, | ||
RetryInterval: "15s", | ||
RejoinAfterLeave: true, | ||
RetryMaxAttempts: 3, | ||
}, | ||
Telemetry: &Telemetry{ | ||
StatsiteAddr: "127.0.0.1:1234", | ||
|
@@ -457,6 +468,11 @@ server { | |
num_schedulers = 2 | ||
enabled_schedulers = ["test"] | ||
node_gc_threshold = "12h" | ||
retry_join = [ "1.1.1.1", "2.2.2.2" ] | ||
start_join = [ "1.1.1.1", "2.2.2.2" ] | ||
retry_max = 3 | ||
retry_interval = "15s" | ||
rejoin_after_leave = true | ||
} | ||
telemetry { | ||
statsite_address = "127.0.0.1:1234" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we add some extra information in the error message to indicate which field Nomad was not able to parse? I think that would help the user to quickly debug what's wrong in the configuration file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I updated the error message to
Error parsing retry interval