Skip to content

Commit

Permalink
Merge pull request #527 from asteris-llc/master
Browse files Browse the repository at this point in the history
Add cluster join command line options and configuration options
  • Loading branch information
dadgar committed Dec 9, 2015
2 parents dc23448 + 5336df6 commit 2e3fd73
Show file tree
Hide file tree
Showing 5 changed files with 237 additions and 1 deletion.
91 changes: 91 additions & 0 deletions command/agent/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ type Command struct {
httpServer *HTTPServer
logFilter *logutils.LevelFilter
logOutput io.Writer
retryJoinErrCh chan struct{}

scadaProvider *scada.Provider
scadaHttp *HTTPServer
Expand Down Expand Up @@ -72,6 +73,11 @@ func (c *Command) readConfig() *Config {

// Server-only options
flags.IntVar(&cmdConfig.Server.BootstrapExpect, "bootstrap-expect", 0, "")
flags.BoolVar(&cmdConfig.Server.RejoinAfterLeave, "rejoin", false, "")
flags.Var((*sliceflag.StringFlag)(&cmdConfig.Server.StartJoin), "join", "")
flags.Var((*sliceflag.StringFlag)(&cmdConfig.Server.RetryJoin), "retry-join", "")
flags.IntVar(&cmdConfig.Server.RetryMaxAttempts, "retry-max", 0, "")
flags.StringVar(&cmdConfig.Server.RetryInterval, "retry-interval", "", "")

// Client-only options
flags.StringVar(&cmdConfig.Client.StateDir, "state-dir", "", "")
Expand Down Expand Up @@ -101,6 +107,15 @@ func (c *Command) readConfig() *Config {
return nil
}

if cmdConfig.Server.RetryInterval != "" {
dur, err := time.ParseDuration(cmdConfig.Server.RetryInterval)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing retry interval: %s", err))
return nil
}
cmdConfig.Server.retryInterval = dur
}

// Split the servers.
if servers != "" {
cmdConfig.Client.Servers = strings.Split(servers, ",")
Expand Down Expand Up @@ -368,6 +383,12 @@ func (c *Command) Run(args []string) int {
}
}()

// Join startup nodes if specified
if err := c.startupJoin(config); err != nil {
c.Ui.Error(err.Error())
return 1
}

// Compile agent information for output later
info := make(map[string]string)
info["client"] = strconv.FormatBool(config.Client.Enabled)
Expand Down Expand Up @@ -406,6 +427,10 @@ func (c *Command) Run(args []string) int {
// Enable log streaming
logGate.Flush()

// Start retry join process
c.retryJoinErrCh = make(chan struct{})
go c.retryJoin(config)

// Wait for exit
return c.handleSignals(config)
}
Expand All @@ -423,6 +448,8 @@ WAIT:
sig = s
case <-c.ShutdownCh:
sig = os.Interrupt
case <-c.retryJoinErrCh:
return 1
}
c.Ui.Output(fmt.Sprintf("Caught signal: %v", sig))

Expand Down Expand Up @@ -569,6 +596,52 @@ func (c *Command) setupSCADA(config *Config) error {
return nil
}

// startupJoin performs the single, non-retried join against the addresses in
// config.Server.StartJoin. It does nothing when the server is disabled or no
// start-join addresses are configured. Any error from the join is returned
// to the caller unchanged.
func (c *Command) startupJoin(config *Config) error {
	addrs := config.Server.StartJoin
	if !config.Server.Enabled || len(addrs) == 0 {
		return nil
	}

	c.Ui.Output("Joining cluster...")

	joined, joinErr := c.agent.server.Join(addrs)
	if joinErr != nil {
		return joinErr
	}

	msg := fmt.Sprintf("Join completed. Synced with %d initial agents", joined)
	c.Ui.Info(msg)
	return nil
}

// retryJoin is used to handle retrying a join until it succeeds or all retries
// are exhausted.
//
// It is a no-op when the server is disabled or no retry-join addresses are
// configured. On success it logs and returns. When RetryMaxAttempts is
// positive and the number of failed attempts exceeds it, retryJoinErrCh is
// closed to signal the failure and the function returns. A RetryMaxAttempts
// of zero retries indefinitely.
func (c *Command) retryJoin(config *Config) {
	if len(config.Server.RetryJoin) == 0 || !config.Server.Enabled {
		return
	}

	logger := c.agent.logger

	// The parsed retryInterval may be left at zero when the interval comes
	// from the defaults or a config file rather than the -retry-interval
	// flag, since only the string form is set in those cases. Fall back to
	// parsing the string form so the loop below never spins without a delay.
	interval := config.Server.retryInterval
	if interval <= 0 && config.Server.RetryInterval != "" {
		dur, err := time.ParseDuration(config.Server.RetryInterval)
		if err != nil {
			logger.Printf("[ERROR] agent: Error parsing retry interval: %s", err)
			close(c.retryJoinErrCh)
			return
		}
		interval = dur
	}

	logger.Printf("[INFO] agent: Joining cluster...")

	attempt := 0
	for {
		n, err := c.agent.server.Join(config.Server.RetryJoin)
		if err == nil {
			logger.Printf("[INFO] agent: Join completed. Synced with %d initial agents", n)
			return
		}

		attempt++
		if config.Server.RetryMaxAttempts > 0 && attempt > config.Server.RetryMaxAttempts {
			logger.Printf("[ERROR] agent: max join retry exhausted, exiting")
			close(c.retryJoinErrCh)
			return
		}

		// Log the duration that is actually slept, not the raw config string.
		logger.Printf("[WARN] agent: Join failed: %v, retrying in %v", err, interval)
		time.Sleep(interval)
	}
}

// Synopsis returns a short, one-line description of the agent command.
func (c *Command) Synopsis() string {
	const synopsis = "Runs a Nomad agent"
	return synopsis
}
Expand Down Expand Up @@ -642,6 +715,24 @@ Server Options:
bootstrapping the cluster. Once <num> servers have joined eachother,
Nomad initiates the bootstrap process.
-join=<address>
Address of an agent to join at start time. Can be specified
multiple times.
-retry-join=<address>
Address of an agent to join at start time with retries enabled.
Can be specified multiple times.
-retry-max=<num>
Maximum number of join attempts. Defaults to 0, which will retry
indefinitely.
-retry-interval=<dur>
Time to wait between join attempts.
-rejoin
Ignore a previous leave and attempts to rejoin the cluster.
Client Options:
-client
Expand Down
58 changes: 58 additions & 0 deletions command/agent/command_test.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package agent

import (
"fmt"
"io/ioutil"
"log"
"os"
"strings"
"testing"

"github.com/hashicorp/nomad/testutil"
"github.com/mitchellh/cli"
)

Expand Down Expand Up @@ -69,3 +72,58 @@ func TestCommand_Args(t *testing.T) {
}
}
}

// TestRetryJoin starts a test agent, then runs a second agent through
// Command.Run with -retry-join pointed at the first agent's serf address, and
// waits until the first agent's server reports two members.
func TestRetryJoin(t *testing.T) {
	dir, agent := makeAgent(t, nil)
	defer os.RemoveAll(dir)
	defer agent.Shutdown()

	tmpDir, err := ioutil.TempDir("", "nomad")
	if err != nil {
		t.Fatalf("err: %s", err)
	}
	defer os.RemoveAll(tmpDir)

	doneCh := make(chan struct{})
	shutdownCh := make(chan struct{})

	// Ask the command to shut down and wait for Run to return before the
	// test exits, so the goroutine below does not outlive the test.
	defer func() {
		close(shutdownCh)
		<-doneCh
	}()

	cmd := &Command{
		ShutdownCh: shutdownCh,
		Ui:         new(cli.MockUi),
	}

	serfAddr := fmt.Sprintf(
		"%s:%d",
		agent.config.BindAddr,
		agent.config.Ports.Serf)

	args := []string{
		"-server",
		"-data-dir", tmpDir,
		"-node", fmt.Sprintf(`"Node %d"`, getPort()),
		"-retry-join", serfAddr,
		"-retry-interval", "1s",
	}

	go func() {
		if code := cmd.Run(args); code != 0 {
			log.Printf("bad: %d", code)
		}
		close(doneCh)
	}()

	testutil.WaitForResult(func() (bool, error) {
		mem := agent.server.Members()
		if len(mem) != 2 {
			return false, fmt.Errorf("bad :%#v", mem)
		}
		return true, nil
	}, func(err error) {
		// Use Fatal rather than Fatalf: the error text is built from %#v
		// output and must not be interpreted as a format string.
		t.Fatal(err)
	})
}
52 changes: 51 additions & 1 deletion command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"path/filepath"
"runtime"
"strings"
"time"

"github.com/hashicorp/hcl"
client "github.com/hashicorp/nomad/client/config"
Expand Down Expand Up @@ -184,6 +185,31 @@ type ServerConfig struct {

// NodeGCThreshold controls how "old" a node must be to be collected by GC.
NodeGCThreshold string `hcl:"node_gc_threshold"`

// StartJoin is a list of addresses to attempt to join when the
// agent starts. If Serf is unable to communicate with any of these
// addresses, then the agent will error and exit.
StartJoin []string `hcl:"start_join"`

// RetryJoin is a list of addresses to join with retry enabled.
RetryJoin []string `hcl:"retry_join"`

// RetryMaxAttempts specifies the maximum number of times to retry joining a
// host on startup. This is useful for cases where we know the node will be
// online eventually.
RetryMaxAttempts int `hcl:"retry_max"`

// RetryInterval specifies the amount of time to wait in between join
// attempts on agent start. The minimum allowed value is 1 second and
// the default is 30s.
RetryInterval string `hcl:"retry_interval"`
retryInterval time.Duration `hcl:"-"`

// RejoinAfterLeave controls our interaction with the cluster after leave.
// When set to false (default), a leave causes Nomad to not rejoin
// the cluster until an explicit join is received. If this is set to
// true, we ignore the leave, and rejoin the cluster on start.
RejoinAfterLeave bool `hcl:"rejoin_after_leave"`
}

// Telemetry is the telemetry configuration for the server
Expand Down Expand Up @@ -255,7 +281,11 @@ func DefaultConfig() *Config {
NetworkSpeed: 100,
},
Server: &ServerConfig{
Enabled: false,
Enabled: false,
StartJoin: []string{},
RetryJoin: []string{},
RetryInterval: "30s",
RetryMaxAttempts: 0,
},
SyslogFacility: "LOCAL0",
}
Expand Down Expand Up @@ -414,10 +444,30 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
if b.NodeGCThreshold != "" {
result.NodeGCThreshold = b.NodeGCThreshold
}
if b.RetryMaxAttempts != 0 {
result.RetryMaxAttempts = b.RetryMaxAttempts
}
if b.RetryInterval != "" {
result.RetryInterval = b.RetryInterval
result.retryInterval = b.retryInterval
}
if b.RejoinAfterLeave {
result.RejoinAfterLeave = true
}

// Add the schedulers
result.EnabledSchedulers = append(result.EnabledSchedulers, b.EnabledSchedulers...)

// Copy the start join addresses
result.StartJoin = make([]string, 0, len(a.StartJoin)+len(b.StartJoin))
result.StartJoin = append(result.StartJoin, a.StartJoin...)
result.StartJoin = append(result.StartJoin, b.StartJoin...)

// Copy the retry join addresses
result.RetryJoin = make([]string, 0, len(a.RetryJoin)+len(b.RetryJoin))
result.RetryJoin = append(result.RetryJoin, a.RetryJoin...)
result.RetryJoin = append(result.RetryJoin, b.RetryJoin...)

return &result
}

Expand Down
16 changes: 16 additions & 0 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"path/filepath"
"reflect"
"testing"
"time"

"github.com/hashicorp/nomad/nomad/structs"
)
Expand Down Expand Up @@ -114,6 +115,11 @@ func TestConfig_Merge(t *testing.T) {
NumSchedulers: 2,
EnabledSchedulers: []string{structs.JobTypeBatch},
NodeGCThreshold: "12h",
RejoinAfterLeave: true,
StartJoin: []string{"1.1.1.1"},
RetryJoin: []string{"1.1.1.1"},
RetryInterval: "10s",
retryInterval: time.Second * 10,
},
Ports: &Ports{
HTTP: 20000,
Expand Down Expand Up @@ -424,6 +430,11 @@ func TestConfig_LoadConfigString(t *testing.T) {
NumSchedulers: 2,
EnabledSchedulers: []string{"test"},
NodeGCThreshold: "12h",
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
StartJoin: []string{"1.1.1.1", "2.2.2.2"},
RetryInterval: "15s",
RejoinAfterLeave: true,
RetryMaxAttempts: 3,
},
Telemetry: &Telemetry{
StatsiteAddr: "127.0.0.1:1234",
Expand Down Expand Up @@ -497,6 +508,11 @@ server {
num_schedulers = 2
enabled_schedulers = ["test"]
node_gc_threshold = "12h"
retry_join = [ "1.1.1.1", "2.2.2.2" ]
start_join = [ "1.1.1.1", "2.2.2.2" ]
retry_max = 3
retry_interval = "15s"
rejoin_after_leave = true
}
telemetry {
statsite_address = "127.0.0.1:1234"
Expand Down
21 changes: 21 additions & 0 deletions website/source/docs/agent/config.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,21 @@ configured on client nodes.
"1.5h" or "25m". Valid time units are "ns", "us" (or "µs"), "ms", "s",
"m", "h". Controls how long a node must be in a terminal state before it is
garbage collected and purged from the system.
* <a id="rejoin_after_leave">`rejoin_after_leave`</a> When provided, Nomad will ignore a previous leave and
attempt to rejoin the cluster when starting. By default, Nomad treats leave
as a permanent intent and does not attempt to join the cluster again when
starting. This flag allows the previous state to be used to rejoin the
cluster.
* <a id="retry_join">`retry_join`</a> Similar to [`start_join`](#start_join) but allows retrying a join
if the first attempt fails. This is useful for cases where we know the
address will become available eventually.
* <a id="retry_interval">`retry_interval`</a> The time to wait between join attempts. Defaults to 30s.
* <a id="retry_max">`retry_max`</a> The maximum number of join attempts to be made before exiting
with a return code of 1. By default, this is set to 0 which is interpreted
as infinite retries.
* <a id="start_join">`start_join`</a> An array of strings specifying addresses of nodes to join upon startup.
If Nomad is unable to join with any of the specified addresses, agent startup will
fail. By default, the agent won't join any nodes when it starts up.
## Client-specific Options
Expand Down Expand Up @@ -348,6 +363,8 @@ via CLI arguments. The `agent` command accepts the following arguments:
* `-dev`: Start the agent in development mode. This enables a pre-configured
dual-role agent (client + server) which is useful for developing or testing
Nomad. No other configuration is required to start the agent in this mode.
* `-join=<address>`: Address of another agent to join upon starting up. This can
be specified multiple times to specify multiple agents to join.
* `-log-level=<level>`: Equivalent to the [log_level](#log_level) config option.
* `-meta=<key=value>`: Equivalent to the Client [meta](#meta) config option.
* `-network-interface=<interface>`: Equivalent to the Client
Expand All @@ -359,6 +376,10 @@ via CLI arguments. The `agent` command accepts the following arguments:
config option.
* `-node-id=<uuid>`: Equivalent to the Client [node_id](#node_id) config option.
* `-region=<region>`: Equivalent to the [region](#region) config option.
* `-rejoin`: Equivalent to the [rejoin_after_leave](#rejoin_after_leave) config option.
* `-retry-interval`: Equivalent to the [retry_interval](#retry_interval) config option.
* `-retry-join`: Similar to `-join` but allows retrying a join if the first attempt fails.
* `-retry-max`: Similar to the [retry_max](#retry_max) config option.
* `-server`: Enable server mode on the local agent.
* `-servers=<host:port>`: Equivalent to the Client [servers](#servers) config
option.
Expand Down

0 comments on commit 2e3fd73

Please sign in to comment.