diff --git a/client/client.go b/client/client.go index 0ef9cda9f40..0cc2d81112b 100644 --- a/client/client.go +++ b/client/client.go @@ -283,7 +283,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic // Set the preconfigured list of static servers c.configLock.RLock() if len(c.configCopy.Servers) > 0 { - if err := c.setServersImpl(c.configCopy.Servers, true); err != nil { + if _, err := c.setServersImpl(c.configCopy.Servers, true); err != nil { logger.Printf("[WARN] client: None of the configured servers are valid: %v", err) } } @@ -623,7 +623,7 @@ func (c *Client) GetServers() []string { // SetServers sets a new list of nomad servers to connect to. As long as one // server is resolvable no error is returned. -func (c *Client) SetServers(in []string) error { +func (c *Client) SetServers(in []string) (int, error) { return c.setServersImpl(in, false) } @@ -633,7 +633,7 @@ func (c *Client) SetServers(in []string) error { // // Force should be used when setting the servers from the initial configuration // since the server may be starting up in parallel and initial pings may fail. 
-func (c *Client) setServersImpl(in []string, force bool) error { +func (c *Client) setServersImpl(in []string, force bool) (int, error) { var mu sync.Mutex var wg sync.WaitGroup var merr multierror.Error @@ -673,13 +673,13 @@ func (c *Client) setServersImpl(in []string, force bool) error { // Only return errors if no servers are valid if len(endpoints) == 0 { if len(merr.Errors) > 0 { - return merr.ErrorOrNil() + return 0, merr.ErrorOrNil() } - return noServersErr + return 0, noServersErr } c.servers.SetServers(endpoints) - return nil + return len(endpoints), nil } // restoreState is used to restore our state from the data dir diff --git a/client/client_test.go b/client/client_test.go index 7f86b70ba62..f697972d735 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -975,13 +975,13 @@ func TestClient_ServerList(t *testing.T) { if s := client.GetServers(); len(s) != 0 { t.Fatalf("expected server lit to be empty but found: %+q", s) } - if err := client.SetServers(nil); err != noServersErr { + if _, err := client.SetServers(nil); err != noServersErr { t.Fatalf("expected setting an empty list to return a 'no servers' error but received %v", err) } - if err := client.SetServers([]string{"123.456.13123.123.13:80"}); err == nil { + if _, err := client.SetServers([]string{"123.456.13123.123.13:80"}); err == nil { t.Fatalf("expected setting a bad server to return an error") } - if err := client.SetServers([]string{"123.456.13123.123.13:80", "127.0.0.1:1234", "127.0.0.1"}); err == nil { + if _, err := client.SetServers([]string{"123.456.13123.123.13:80", "127.0.0.1:1234", "127.0.0.1"}); err == nil { t.Fatalf("expected setting at least one good server to succeed but received: %v", err) } s := client.GetServers() diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index 54300177529..ade1e0ffe06 100644 --- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -222,7 +222,7 @@ func (s *HTTPServer) updateServers(resp 
http.ResponseWriter, req *http.Request) // Set the servers list into the client s.agent.logger.Printf("[TRACE] Adding servers %+q to the client's primary server list", servers) - if err := client.SetServers(servers); err != nil { + if _, err := client.SetServers(servers); err != nil { s.agent.logger.Printf("[ERR] Attempt to add servers %q to client failed: %v", servers, err) //TODO is this the right error to return? return nil, CodedError(400, err.Error()) diff --git a/command/agent/command.go b/command/agent/command.go index f456c3aeb5a..9b9fb9ad436 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -63,9 +63,11 @@ func (c *Command) readConfig() *Config { Client: &ClientConfig{}, Consul: &config.ConsulConfig{}, Ports: &Ports{}, - Server: &ServerConfig{}, - Vault: &config.VaultConfig{}, - ACL: &ACLConfig{}, + Server: &ServerConfig{ + ServerJoin: &ServerJoin{}, + }, + Vault: &config.VaultConfig{}, + ACL: &ACLConfig{}, } flags := flag.NewFlagSet("agent", flag.ContinueOnError) @@ -78,13 +80,16 @@ func (c *Command) readConfig() *Config { // Server-only options flags.IntVar(&cmdConfig.Server.BootstrapExpect, "bootstrap-expect", 0, "") - flags.BoolVar(&cmdConfig.Server.RejoinAfterLeave, "rejoin", false, "") - flags.Var((*flaghelper.StringFlag)(&cmdConfig.Server.StartJoin), "join", "") - flags.Var((*flaghelper.StringFlag)(&cmdConfig.Server.RetryJoin), "retry-join", "") - flags.IntVar(&cmdConfig.Server.RetryMaxAttempts, "retry-max", 0, "") - flags.StringVar(&cmdConfig.Server.RetryInterval, "retry-interval", "", "") flags.StringVar(&cmdConfig.Server.EncryptKey, "encrypt", "", "gossip encryption key") flags.IntVar(&cmdConfig.Server.RaftProtocol, "raft-protocol", 0, "") + flags.BoolVar(&cmdConfig.Server.RejoinAfterLeave, "rejoin", false, "") + flags.Var((*flaghelper.StringFlag)(&cmdConfig.Server.ServerJoin.StartJoin), "join", "") + flags.Var((*flaghelper.StringFlag)(&cmdConfig.Server.ServerJoin.RetryJoin), "retry-join", "") + 
flags.IntVar(&cmdConfig.Server.ServerJoin.RetryMaxAttempts, "retry-max", 0, "") + flags.Var((flaghelper.FuncDurationVar)(func(d time.Duration) error { + cmdConfig.Server.ServerJoin.RetryInterval = d + return nil + }), "retry-interval", "") // Client-only options flags.StringVar(&cmdConfig.Client.StateDir, "state-dir", "", "") @@ -267,14 +272,6 @@ func (c *Command) readConfig() *Config { } } - // Parse the RetryInterval. - dur, err := time.ParseDuration(config.Server.RetryInterval) - if err != nil { - c.Ui.Error(fmt.Sprintf("Error parsing retry interval: %s", err)) - return nil - } - config.Server.retryInterval = dur - // Check that the server is running in at least one mode. if !(config.Server.Enabled || config.Client.Enabled) { c.Ui.Error("Must specify either server, client or dev mode for the agent.") @@ -547,20 +544,89 @@ func (c *Command) Run(args []string) int { logGate.Flush() // Start retry join process - c.retryJoinErrCh = make(chan struct{}) - - joiner := retryJoiner{ - join: c.agent.server.Join, - discover: &discover.Discover{}, - errCh: c.retryJoinErrCh, - logger: c.agent.logger, + if err := c.handleRetryJoin(config); err != nil { + c.Ui.Error(err.Error()) + return 1 } - go joiner.RetryJoin(config) // Wait for exit return c.handleSignals() } +// handleRetryJoin is used to start retry joining if it is configured. 
+//
+// When the deprecated server retry_join fields are set they are validated,
+// copied into the server_join stanza and cleared, so the joiner only ever
+// reads a ServerJoin. A retry joiner goroutine is then started for the server
+// and/or the client when that mode is enabled and its server_join stanza
+// lists retry_join addresses. An error is returned when validation fails.
+func (c *Command) handleRetryJoin(config *Config) error {
+	// Joiners close this channel once their retries are exhausted.
+	c.retryJoinErrCh = make(chan struct{})
+
+	if config.Server.Enabled && len(config.Server.RetryJoin) != 0 {
+		joiner := retryJoiner{
+			discover:      &discover.Discover{},
+			errCh:         c.retryJoinErrCh,
+			logger:        c.agent.logger,
+			serverJoin:    c.agent.server.Join,
+			serverEnabled: true,
+		}
+
+		// Validate the configuration as written by the operator, before the
+		// deprecated values are moved.
+		if err := joiner.Validate(config); err != nil {
+			return err
+		}
+
+		// The server_join stanza may be absent (the block below treats nil as
+		// possible); allocate it so the copies cannot dereference nil.
+		if config.Server.ServerJoin == nil {
+			config.Server.ServerJoin = &ServerJoin{}
+		}
+
+		// Remove the duplicate fields. RetryJoin is known to be non-empty
+		// from the enclosing condition.
+		config.Server.ServerJoin.RetryJoin = config.Server.RetryJoin
+		config.Server.RetryJoin = nil
+		if config.Server.RetryMaxAttempts != 0 {
+			config.Server.ServerJoin.RetryMaxAttempts = config.Server.RetryMaxAttempts
+			config.Server.RetryMaxAttempts = 0
+		}
+		if config.Server.RetryInterval != 0 {
+			config.Server.ServerJoin.RetryInterval = config.Server.RetryInterval
+			config.Server.RetryInterval = 0
+		}
+
+		c.agent.logger.Printf("[WARN] agent: Using deprecated retry_join fields. Upgrade configuration to use server_join")
+	}
+
+	if config.Server.Enabled &&
+		config.Server.ServerJoin != nil &&
+		len(config.Server.ServerJoin.RetryJoin) != 0 {
+
+		joiner := retryJoiner{
+			discover:      &discover.Discover{},
+			errCh:         c.retryJoinErrCh,
+			logger:        c.agent.logger,
+			serverJoin:    c.agent.server.Join,
+			serverEnabled: true,
+		}
+
+		// NOTE(review): when the migration above ran, this second Validate
+		// sees the migrated retry_join next to a deprecated start_join and
+		// appears to reject it with a server_join error. Confirm that legacy
+		// configs setting both start_join and retry_join should be refused.
+		if err := joiner.Validate(config); err != nil {
+			return err
+		}
+
+		go joiner.RetryJoin(config.Server.ServerJoin)
+	}
+
+	if config.Client.Enabled &&
+		config.Client.ServerJoin != nil &&
+		len(config.Client.ServerJoin.RetryJoin) != 0 {
+
+		joiner := retryJoiner{
+			discover:      &discover.Discover{},
+			errCh:         c.retryJoinErrCh,
+			logger:        c.agent.logger,
+			clientJoin:    c.agent.client.SetServers,
+			clientEnabled: true,
+		}
+
+		if err := joiner.Validate(config); err != nil {
+			return err
+		}
+
+		go joiner.RetryJoin(config.Client.ServerJoin)
+	}
+
+	return nil
+}
+
// handleSignals blocks until we get an exit-causing signal func (c *Command) handleSignals() int { signalCh := make(chan
os.Signal, 4) @@ -831,12 +897,34 @@ func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) { } func (c *Command) startupJoin(config *Config) error { - if len(config.Server.StartJoin) == 0 || !config.Server.Enabled { + // Nothing to do + if !config.Server.Enabled { return nil } + // Validate both old and new aren't being set + old := len(config.Server.StartJoin) + var new int + if config.Server.ServerJoin != nil { + new = len(config.Server.ServerJoin.StartJoin) + } + if old != 0 && new != 0 { + return fmt.Errorf("server_join and start_join cannot both be defined; prefer setting the server_join stanza") + } + + // Nothing to do + if old+new == 0 { + return nil + } + + // Combine the lists and join + joining := config.Server.StartJoin + if new != 0 { + joining = append(joining, config.Server.ServerJoin.StartJoin...) + } + c.Ui.Output("Joining cluster...") - n, err := c.agent.server.Join(config.Server.StartJoin) + n, err := c.agent.server.Join(joining) if err != nil { return err } diff --git a/command/agent/config-test-fixtures/basic.hcl b/command/agent/config-test-fixtures/basic.hcl index 7398dff43ca..b5a3a77cd50 100644 --- a/command/agent/config-test-fixtures/basic.hcl +++ b/command/agent/config-test-fixtures/basic.hcl @@ -19,6 +19,7 @@ advertise { rpc = "127.0.0.3" serf = "127.0.0.4" } + client { enabled = true state_dir = "/tmp/client-state" @@ -29,6 +30,11 @@ client { foo = "bar" baz = "zip" } + server_join { + retry_join = [ "1.1.1.1", "2.2.2.2" ] + retry_max = 3 + retry_interval = "15s" + } options { foo = "bar" baz = "zip" @@ -49,17 +55,17 @@ client { } client_min_port = 1000 client_max_port = 2000 - max_kill_timeout = "10s" - stats { - data_points = 35 - collection_interval = "5s" - } - gc_interval = "6s" - gc_parallel_destroys = 6 - gc_disk_usage_threshold = 82 - gc_inode_usage_threshold = 91 - gc_max_allocs = 50 - no_host_uuid = false + max_kill_timeout = "10s" + stats { + data_points = 35 + collection_interval = "5s" + } + gc_interval 
= "6s" + gc_parallel_destroys = 6 + gc_disk_usage_threshold = 82 + gc_inode_usage_threshold = 91 + gc_max_allocs = 50 + no_host_uuid = false } server { enabled = true @@ -86,23 +92,28 @@ server { redundancy_zone = "foo" upgrade_version = "0.8.0" encrypt = "abc" + server_join { + retry_join = [ "1.1.1.1", "2.2.2.2" ] + retry_max = 3 + retry_interval = "15s" + } } acl { - enabled = true - token_ttl = "60s" - policy_ttl = "60s" - replication_token = "foobar" + enabled = true + token_ttl = "60s" + policy_ttl = "60s" + replication_token = "foobar" } telemetry { statsite_address = "127.0.0.1:1234" statsd_address = "127.0.0.1:2345" prometheus_metrics = true disable_hostname = true - collection_interval = "3s" - publish_allocation_metrics = true - publish_node_metrics = true - disable_tagged_metrics = true - backwards_compatible_metrics = true + collection_interval = "3s" + publish_allocation_metrics = true + publish_node_metrics = true + disable_tagged_metrics = true + backwards_compatible_metrics = true } leave_on_interrupt = true leave_on_terminate = true @@ -114,68 +125,68 @@ http_api_response_headers { Access-Control-Allow-Origin = "*" } consul { - server_service_name = "nomad" - server_http_check_name = "nomad-server-http-health-check" - server_serf_check_name = "nomad-server-serf-health-check" - server_rpc_check_name = "nomad-server-rpc-health-check" - client_service_name = "nomad-client" - client_http_check_name = "nomad-client-http-health-check" - address = "127.0.0.1:9500" - token = "token1" - auth = "username:pass" - ssl = true - verify_ssl = true - ca_file = "/path/to/ca/file" - cert_file = "/path/to/cert/file" - key_file = "/path/to/key/file" - server_auto_join = true - client_auto_join = true - auto_advertise = true - checks_use_advertise = true + server_service_name = "nomad" + server_http_check_name = "nomad-server-http-health-check" + server_serf_check_name = "nomad-server-serf-health-check" + server_rpc_check_name = "nomad-server-rpc-health-check" + 
client_service_name = "nomad-client" + client_http_check_name = "nomad-client-http-health-check" + address = "127.0.0.1:9500" + token = "token1" + auth = "username:pass" + ssl = true + verify_ssl = true + ca_file = "/path/to/ca/file" + cert_file = "/path/to/cert/file" + key_file = "/path/to/key/file" + server_auto_join = true + client_auto_join = true + auto_advertise = true + checks_use_advertise = true } vault { - address = "127.0.0.1:9500" - allow_unauthenticated = true - task_token_ttl = "1s" - enabled = false - token = "12345" - ca_file = "/path/to/ca/file" - ca_path = "/path/to/ca" - cert_file = "/path/to/cert/file" - key_file = "/path/to/key/file" - tls_server_name = "foobar" - tls_skip_verify = true - create_from_role = "test_role" + address = "127.0.0.1:9500" + allow_unauthenticated = true + task_token_ttl = "1s" + enabled = false + token = "12345" + ca_file = "/path/to/ca/file" + ca_path = "/path/to/ca" + cert_file = "/path/to/cert/file" + key_file = "/path/to/key/file" + tls_server_name = "foobar" + tls_skip_verify = true + create_from_role = "test_role" } tls { - http = true - rpc = true - verify_server_hostname = true - ca_file = "foo" - cert_file = "bar" - key_file = "pipe" - rpc_upgrade_mode = true - verify_https_client = true - tls_prefer_server_cipher_suites = true - tls_cipher_suites = "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" - tls_min_version = "tls12" + http = true + rpc = true + verify_server_hostname = true + ca_file = "foo" + cert_file = "bar" + key_file = "pipe" + rpc_upgrade_mode = true + verify_https_client = true + tls_prefer_server_cipher_suites = true + tls_cipher_suites = "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" + tls_min_version = "tls12" } sentinel { - import "foo" { - path = "foo" - args = ["a", "b", "c"] - } - import "bar" { - path = "bar" - args = ["x", "y", "z"] - } + import "foo" { + path = "foo" + args = ["a", "b", "c"] + } + import "bar" { + path = "bar" + args = ["x", "y", "z"] + } } autopilot { - cleanup_dead_servers = true - 
disable_upgrade_migration = true - last_contact_threshold = "12705s" - max_trailing_logs = 17849 - enable_redundancy_zones = true - server_stabilization_time = "23057s" - enable_custom_upgrades = true + cleanup_dead_servers = true + disable_upgrade_migration = true + last_contact_threshold = "12705s" + max_trailing_logs = 17849 + enable_redundancy_zones = true + server_stabilization_time = "23057s" + enable_custom_upgrades = true } diff --git a/command/agent/config.go b/command/agent/config.go index 929d3d74b11..301d31b3cb8 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -217,6 +217,9 @@ type ClientConfig struct { // NoHostUUID disables using the host's UUID and will force generation of a // random UUID. NoHostUUID *bool `mapstructure:"no_host_uuid"` + + // ServerJoin contains information that is used to attempt to join servers + ServerJoin *ServerJoin `mapstructure:"server_join"` } // ACLConfig is configuration specific to the ACL system @@ -311,21 +314,24 @@ type ServerConfig struct { // StartJoin is a list of addresses to attempt to join when the // agent starts. If Serf is unable to communicate with any of these // addresses, then the agent will error and exit. + // Deprecated in Nomad 0.10 StartJoin []string `mapstructure:"start_join"` // RetryJoin is a list of addresses to join with retry enabled. + // Deprecated in Nomad 0.10 RetryJoin []string `mapstructure:"retry_join"` // RetryMaxAttempts specifies the maximum number of times to retry joining a // host on startup. This is useful for cases where we know the node will be // online eventually. + // Deprecated in Nomad 0.10 RetryMaxAttempts int `mapstructure:"retry_max"` // RetryInterval specifies the amount of time to wait in between join // attempts on agent start. The minimum allowed value is 1 second and // the default is 30s. 
- RetryInterval string `mapstructure:"retry_interval"` - retryInterval time.Duration `mapstructure:"-"` + // Deprecated in Nomad 0.10 + RetryInterval time.Duration `mapstructure:"retry_interval"` // RejoinAfterLeave controls our interaction with the cluster after leave. // When set to false (default), a leave causes Consul to not rejoin @@ -346,6 +352,59 @@ type ServerConfig struct { // Encryption key to use for the Serf communication EncryptKey string `mapstructure:"encrypt" json:"-"` + + // ServerJoin contains information that is used to attempt to join servers + ServerJoin *ServerJoin `mapstructure:"server_join"` +} + +// ServerJoin is used in both clients and servers to bootstrap connections to +// servers +type ServerJoin struct { + // StartJoin is a list of addresses to attempt to join when the + // agent starts. If Serf is unable to communicate with any of these + // addresses, then the agent will error and exit. + StartJoin []string `mapstructure:"start_join"` + + // RetryJoin is a list of addresses to join with retry enabled, or a single + // value to find multiple servers using go-discover syntax. + RetryJoin []string `mapstructure:"retry_join"` + + // RetryMaxAttempts specifies the maximum number of times to retry joining a + // host on startup. This is useful for cases where we know the node will be + // online eventually. + RetryMaxAttempts int `mapstructure:"retry_max"` + + // RetryInterval specifies the amount of time to wait in between join + // attempts on agent start. The minimum allowed value is 1 second and + // the default is 30s. 
+ RetryInterval time.Duration `mapstructure:"retry_interval"` +}
+
+// Merge returns a new ServerJoin holding the values of s overridden by every
+// non-zero field of b: a non-empty StartJoin or RetryJoin list and a non-zero
+// RetryMaxAttempts or RetryInterval in b replace the value from s. Either
+// side may be nil; the result is nil only when both are. The returned struct
+// is always a fresh copy, so changing its fields does not affect s or b (the
+// slices themselves are still shared).
+func (s *ServerJoin) Merge(b *ServerJoin) *ServerJoin {
+	if s == nil {
+		if b == nil {
+			return nil
+		}
+		// Copy rather than hand back the caller's pointer, so this path
+		// behaves like the others.
+		result := *b
+		return &result
+	}
+
+	result := *s
+
+	if b == nil {
+		return &result
+	}
+
+	if len(b.StartJoin) != 0 {
+		result.StartJoin = b.StartJoin
+	}
+	if len(b.RetryJoin) != 0 {
+		result.RetryJoin = b.RetryJoin
+	}
+	if b.RetryMaxAttempts != 0 {
+		result.RetryMaxAttempts = b.RetryMaxAttempts
+	}
+	if b.RetryInterval != 0 {
+		result.RetryInterval = b.RetryInterval
+	}
+
+	return &result
 }
// EncryptBytes returns the encryption key configured. @@ -601,13 +660,20 @@ func DefaultConfig() *Config { GCInodeUsageThreshold: 70, GCMaxAllocs: 50, NoHostUUID: helper.BoolToPtr(true), + ServerJoin: &ServerJoin{ + RetryJoin: []string{}, + RetryInterval: 30 * time.Second, + RetryMaxAttempts: 0, + }, }, Server: &ServerConfig{ - Enabled: false, - StartJoin: []string{}, - RetryJoin: []string{}, - RetryInterval: "30s", - RetryMaxAttempts: 0, + Enabled: false, + StartJoin: []string{}, + ServerJoin: &ServerJoin{ + RetryJoin: []string{}, + RetryInterval: 30 * time.Second, + RetryMaxAttempts: 0, + }, }, ACL: &ACLConfig{ Enabled: false, @@ -1036,9 +1102,8 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig { if b.RetryMaxAttempts != 0 { result.RetryMaxAttempts = b.RetryMaxAttempts } - if b.RetryInterval != "" { + if b.RetryInterval != 0 { result.RetryInterval = b.RetryInterval - result.retryInterval = b.retryInterval } if b.RejoinAfterLeave { result.RejoinAfterLeave = true @@ -1055,6 +1120,9 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig { if b.EncryptKey != "" { result.EncryptKey = b.EncryptKey } + if b.ServerJoin != nil { + result.ServerJoin = result.ServerJoin.Merge(b.ServerJoin) + } // Add the schedulers result.EnabledSchedulers = append(result.EnabledSchedulers, b.EnabledSchedulers...)
@@ -1162,6 +1230,10 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig { result.ChrootEnv[k] = v } + if b.ServerJoin != nil { + result.ServerJoin = result.ServerJoin.Merge(b.ServerJoin) + } + return &result } diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 9d82050cbcb..66f704f6ca0 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -370,6 +370,7 @@ func parseClient(result **ClientConfig, list *ast.ObjectList) error { "gc_parallel_destroys", "gc_max_allocs", "no_host_uuid", + "server_join", } if err := helper.CheckHCLKeys(listVal, valid); err != nil { return err @@ -385,6 +386,7 @@ func parseClient(result **ClientConfig, list *ast.ObjectList) error { delete(m, "chroot_env") delete(m, "reserved") delete(m, "stats") + delete(m, "server_join") var config ClientConfig dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ @@ -448,6 +450,13 @@ func parseClient(result **ClientConfig, list *ast.ObjectList) error { } } + // Parse ServerJoin config + if o := listVal.Filter("server_join"); len(o.Items) > 0 { + if err := parseServerJoin(&config.ServerJoin, o); err != nil { + return multierror.Prefix(err, "server_join->") + } + } + *result = &config return nil } @@ -531,16 +540,20 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error { "heartbeat_grace", "min_heartbeat_ttl", "max_heartbeats_per_second", - "start_join", - "retry_join", - "retry_max", - "retry_interval", "rejoin_after_leave", "encrypt", "authoritative_region", "non_voting_server", "redundancy_zone", "upgrade_version", + + "server_join", + + // For backwards compatibility + "start_join", + "retry_join", + "retry_max", + "retry_interval", } if err := helper.CheckHCLKeys(listVal, valid); err != nil { return err @@ -551,6 +564,8 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error { return err } + delete(m, "server_join") + var config ServerConfig dec, err := 
mapstructure.NewDecoder(&mapstructure.DecoderConfig{ DecodeHook: mapstructure.StringToTimeDurationHookFunc(), @@ -570,10 +585,59 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error { } } + // Parse ServerJoin config + if o := listVal.Filter("server_join"); len(o.Items) > 0 { + if err := parseServerJoin(&config.ServerJoin, o); err != nil { + return multierror.Prefix(err, "server_join->") + } + } + *result = &config return nil }
+
+// parseServerJoin decodes a single server_join block into a ServerJoin and
+// stores a pointer to it in *result. list is the object list left after
+// filtering on the "server_join" key. An error is returned when the list does
+// not hold exactly one unlabeled block, when the block contains a key other
+// than start_join, retry_join, retry_max or retry_interval, or when decoding
+// fails. Duration strings such as "15s" are converted by the decode hook.
+func parseServerJoin(result **ServerJoin, list *ast.ObjectList) error {
+	list = list.Elem()
+	// Elem keeps only items with no remaining keys, so a labeled block
+	// (server_join "x" { ... }) leaves the list empty; indexing Items[0]
+	// below would then panic.
+	if len(list.Items) == 0 {
+		return fmt.Errorf("'server_join' must be an unlabeled block")
+	}
+	if len(list.Items) > 1 {
+		return fmt.Errorf("only one 'server_join' block allowed")
+	}
+
+	// Get our object
+	listVal := list.Items[0].Val
+
+	// Check for invalid keys
+	valid := []string{
+		"start_join",
+		"retry_join",
+		"retry_max",
+		"retry_interval",
+	}
+	if err := helper.CheckHCLKeys(listVal, valid); err != nil {
+		return err
+	}
+
+	var m map[string]interface{}
+	if err := hcl.DecodeObject(&m, listVal); err != nil {
+		return err
+	}
+
+	var serverJoinInfo ServerJoin
+	dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
+		DecodeHook:       mapstructure.StringToTimeDurationHookFunc(),
+		WeaklyTypedInput: true,
+		Result:           &serverJoinInfo,
+	})
+	if err != nil {
+		return err
+	}
+	if err := dec.Decode(m); err != nil {
+		return err
+	}
+
+	*result = &serverJoinInfo
+	return nil
+}
+
func parseACL(result **ACLConfig, list *ast.ObjectList) error { list = list.Elem() if len(list.Items) > 1 { diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 994ed1d2a6d..fe8c5c6855b 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -47,6 +47,11 @@ func TestConfig_Parse(t *testing.T) { AllocDir: "/tmp/alloc", Servers: []string{"a.b.c:80", "127.0.0.1:1234"}, NodeClass: "linux-medium-64bit", + ServerJoin: &ServerJoin{ + RetryJoin: []string{"1.1.1.1", "2.2.2.2"}, + RetryInterval: time.Duration(15) * time.Second, + RetryMaxAttempts: 3, + }, Meta:
map[string]string{ "foo": "bar", "baz": "zip", @@ -99,13 +104,18 @@ func TestConfig_Parse(t *testing.T) { MaxHeartbeatsPerSecond: 11.0, RetryJoin: []string{"1.1.1.1", "2.2.2.2"}, StartJoin: []string{"1.1.1.1", "2.2.2.2"}, - RetryInterval: "15s", + RetryInterval: 15 * time.Second, RejoinAfterLeave: true, RetryMaxAttempts: 3, NonVotingServer: true, RedundancyZone: "foo", UpgradeVersion: "0.8.0", EncryptKey: "abc", + ServerJoin: &ServerJoin{ + RetryJoin: []string{"1.1.1.1", "2.2.2.2"}, + RetryInterval: time.Duration(15) * time.Second, + RetryMaxAttempts: 3, + }, }, ACL: &ACLConfig{ Enabled: true, diff --git a/command/agent/config_test.go b/command/agent/config_test.go index d0be57cffcc..d756912094f 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -14,6 +14,7 @@ import ( "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" + "github.com/stretchr/testify/require" ) var ( @@ -264,8 +265,7 @@ func TestConfig_Merge(t *testing.T) { RejoinAfterLeave: true, StartJoin: []string{"1.1.1.1"}, RetryJoin: []string{"1.1.1.1"}, - RetryInterval: "10s", - retryInterval: time.Second * 10, + RetryInterval: time.Second * 10, NonVotingServer: true, RedundancyZone: "bar", UpgradeVersion: "bar", @@ -907,3 +907,109 @@ func TestIsMissingPort(t *testing.T) { t.Errorf("expected no error, but got %v", err) } } + +func TestMergeServerJoin(t *testing.T) { + require := require.New(t) + + { + retryJoin := []string{"127.0.0.1", "127.0.0.2"} + startJoin := []string{"127.0.0.1", "127.0.0.2"} + retryMaxAttempts := 1 + retryInterval := time.Duration(0) + + a := &ServerJoin{ + RetryJoin: retryJoin, + StartJoin: startJoin, + RetryMaxAttempts: retryMaxAttempts, + RetryInterval: time.Duration(retryInterval), + } + b := &ServerJoin{} + + result := a.Merge(b) + require.Equal(result.RetryJoin, retryJoin) + require.Equal(result.StartJoin, startJoin) + require.Equal(result.RetryMaxAttempts, retryMaxAttempts) 
+ require.Equal(result.RetryInterval, retryInterval) + } + { + retryJoin := []string{"127.0.0.1", "127.0.0.2"} + startJoin := []string{"127.0.0.1", "127.0.0.2"} + retryMaxAttempts := 1 + retryInterval := time.Duration(0) + + a := &ServerJoin{} + b := &ServerJoin{ + RetryJoin: retryJoin, + StartJoin: startJoin, + RetryMaxAttempts: retryMaxAttempts, + RetryInterval: time.Duration(retryInterval), + } + + result := a.Merge(b) + require.Equal(result.RetryJoin, retryJoin) + require.Equal(result.StartJoin, startJoin) + require.Equal(result.RetryMaxAttempts, retryMaxAttempts) + require.Equal(result.RetryInterval, retryInterval) + } + { + retryJoin := []string{"127.0.0.1", "127.0.0.2"} + startJoin := []string{"127.0.0.1", "127.0.0.2"} + retryMaxAttempts := 1 + retryInterval := time.Duration(0) + + var a *ServerJoin + b := &ServerJoin{ + RetryJoin: retryJoin, + StartJoin: startJoin, + RetryMaxAttempts: retryMaxAttempts, + RetryInterval: time.Duration(retryInterval), + } + + result := a.Merge(b) + require.Equal(result.RetryJoin, retryJoin) + require.Equal(result.StartJoin, startJoin) + require.Equal(result.RetryMaxAttempts, retryMaxAttempts) + require.Equal(result.RetryInterval, retryInterval) + } + { + retryJoin := []string{"127.0.0.1", "127.0.0.2"} + startJoin := []string{"127.0.0.1", "127.0.0.2"} + retryMaxAttempts := 1 + retryInterval := time.Duration(0) + + a := &ServerJoin{ + RetryJoin: retryJoin, + StartJoin: startJoin, + RetryMaxAttempts: retryMaxAttempts, + RetryInterval: time.Duration(retryInterval), + } + var b *ServerJoin + + result := a.Merge(b) + require.Equal(result.RetryJoin, retryJoin) + require.Equal(result.StartJoin, startJoin) + require.Equal(result.RetryMaxAttempts, retryMaxAttempts) + require.Equal(result.RetryInterval, retryInterval) + } + { + retryJoin := []string{"127.0.0.1", "127.0.0.2"} + startJoin := []string{"127.0.0.1", "127.0.0.2"} + retryMaxAttempts := 1 + retryInterval := time.Duration(0) + + a := &ServerJoin{ + RetryJoin: retryJoin, + 
StartJoin: startJoin, + } + b := &ServerJoin{ + RetryMaxAttempts: retryMaxAttempts, + RetryInterval: time.Duration(retryInterval), + } + + result := a.Merge(b) + require.Equal(result.RetryJoin, retryJoin) + require.Equal(result.StartJoin, startJoin) + require.Equal(result.RetryMaxAttempts, retryMaxAttempts) + require.Equal(result.RetryInterval, retryInterval) + } +} diff --git a/command/agent/retry_join.go b/command/agent/retry_join.go index 1a0aea8581d..2e8735be1d0 100644 --- a/command/agent/retry_join.go +++ b/command/agent/retry_join.go @@ -1,6 +1,7 @@ package agent import ( + "fmt" "log" "strings" "time" @@ -27,8 +28,17 @@ type DiscoverInterface interface { // retryJoiner is used to handle retrying a join until it succeeds or all of // its tries are exhausted. type retryJoiner struct { - // join adds the specified servers to the serf cluster - join func([]string) (int, error) + // serverJoin adds the specified servers to the serf cluster + serverJoin func([]string) (int, error) + + // serverEnabled indicates whether the nomad agent will run in server mode + serverEnabled bool + + // clientJoin adds the specified servers to the serf cluster + clientJoin func([]string) (int, error) + + // clientEnabled indicates whether the nomad agent will run in client mode + clientEnabled bool // discover is of type Discover, where this is either the go-discover // implementation or a mock used for testing @@ -42,23 +52,62 @@ type retryJoiner struct { logger *log.Logger } +// Validate ensures that the configuration passes validity checks for the +// retry_join stanza. If the configuration is not valid, returns an error that +// will be displayed to the operator, otherwise nil. 
+func (r *retryJoiner) Validate(config *Config) error {
+	// If retry_join is defined in the server's server_join stanza, ensure
+	// that none of the deprecated server fields are set alongside it and
+	// that the stanza does not also define start_join.
+	if config.Server != nil && config.Server.ServerJoin != nil && len(config.Server.ServerJoin.RetryJoin) != 0 {
+		if len(config.Server.RetryJoin) != 0 {
+			return fmt.Errorf("server_join and retry_join cannot both be defined; prefer setting the server_join stanza")
+		}
+		if len(config.Server.StartJoin) != 0 {
+			return fmt.Errorf("server_join and start_join cannot both be defined; prefer setting the server_join stanza")
+		}
+		if config.Server.RetryMaxAttempts != 0 {
+			return fmt.Errorf("server_join and retry_max cannot both be defined; prefer setting the server_join stanza")
+		}
+
+		if config.Server.RetryInterval != 0 {
+			return fmt.Errorf("server_join and retry_interval cannot both be defined; prefer setting the server_join stanza")
+		}
+
+		if len(config.Server.ServerJoin.StartJoin) != 0 {
+			return fmt.Errorf("retry_join and start_join cannot both be defined")
+		}
+	}
+
+	// start_join is rejected in the client's server_join stanza whether or
+	// not retry_join is set. The check is by length, like the server checks
+	// above, so an explicitly empty list is treated the same as an absent one.
+	if config.Client != nil && config.Client.ServerJoin != nil {
+		if len(config.Client.ServerJoin.StartJoin) != 0 {
+			return fmt.Errorf("start_join is not supported for Nomad clients")
+		}
+	}
+
+	return nil
+}
+
// retryJoin is used to handle retrying a join until it succeeds or all retries // are exhausted. -func (r *retryJoiner) RetryJoin(config *Config) { - if len(config.Server.RetryJoin) == 0 || !config.Server.Enabled { +func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) { + if len(serverJoin.RetryJoin) == 0 { return } attempt := 0 - addrsToJoin := strings.Join(config.Server.RetryJoin, " ") + addrsToJoin := strings.Join(serverJoin.RetryJoin, " ") r.logger.Printf("[INFO] agent: Joining cluster...
%s", addrsToJoin) for { var addrs []string + var n int var err error - for _, addr := range config.Server.RetryJoin { + for _, addr := range serverJoin.RetryJoin { switch { case strings.HasPrefix(addr, "provider="): servers, err := r.discover.Addrs(addr, r.logger) @@ -73,23 +122,33 @@ func (r *retryJoiner) RetryJoin(config *Config) { } if len(addrs) > 0 { - n, err := r.join(addrs) - if err == nil { - r.logger.Printf("[INFO] agent: Join completed. Synced with %d initial agents", n) + if r.serverEnabled && r.serverJoin != nil { + n, err = r.serverJoin(addrs) + if err == nil { + r.logger.Printf("[INFO] agent: Join completed. Server synced with %d initial servers", n) + return + } + } + if r.clientEnabled && r.clientJoin != nil { + n, err = r.clientJoin(addrs) + if err == nil { + r.logger.Printf("[INFO] agent: Join completed. Client synced with %d initial servers", n) + return + } } } attempt++ - if config.Server.RetryMaxAttempts > 0 && attempt > config.Server.RetryMaxAttempts { + if serverJoin.RetryMaxAttempts > 0 && attempt > serverJoin.RetryMaxAttempts { r.logger.Printf("[ERR] agent: max join retry exhausted, exiting") close(r.errCh) return } if err != nil { - r.logger.Printf("[WARN] agent: Join failed: %v, retrying in %v", err, - config.Server.RetryInterval) + r.logger.Printf("[WARN] agent: Join failed: %q, retrying in %v", err, + serverJoin.RetryInterval) } - time.Sleep(config.Server.retryInterval) + time.Sleep(serverJoin.RetryInterval) } } diff --git a/command/agent/retry_join_test.go b/command/agent/retry_join_test.go index 34b381373ab..b07848d2c9c 100644 --- a/command/agent/retry_join_test.go +++ b/command/agent/retry_join_test.go @@ -6,9 +6,9 @@ import ( "log" "os" "testing" + "time" "github.com/hashicorp/nomad/testutil" - "github.com/hashicorp/nomad/version" "github.com/mitchellh/cli" "github.com/stretchr/testify/require" ) @@ -30,43 +30,37 @@ func (m *MockDiscover) Names() []string { func TestRetryJoin_Integration(t *testing.T) { t.Parallel() + + // Create 
two agents and have one retry join the other agent := NewTestAgent(t, t.Name(), nil) defer agent.Shutdown() - doneCh := make(chan struct{}) - shutdownCh := make(chan struct{}) - - defer func() { - close(shutdownCh) - <-doneCh - }() + agent2 := NewTestAgent(t, t.Name(), func(c *Config) { + c.NodeName = "foo" + if c.Server.ServerJoin == nil { + c.Server.ServerJoin = &ServerJoin{} + } + c.Server.ServerJoin.RetryJoin = []string{agent.Config.normalizedAddrs.Serf} + c.Server.ServerJoin.RetryInterval = 1 * time.Second + }) + defer agent2.Shutdown() + // Create a fake command and have it wrap the second agent and run the retry + // join handler cmd := &Command{ - Version: version.GetVersion(), - ShutdownCh: shutdownCh, Ui: &cli.BasicUi{ Reader: os.Stdin, Writer: os.Stdout, ErrorWriter: os.Stderr, }, + agent: agent2.Agent, } - serfAddr := agent.Config.normalizedAddrs.Serf - - args := []string{ - "-dev", - "-node", "foo", - "-retry-join", serfAddr, - "-retry-interval", "1s", + if err := cmd.handleRetryJoin(agent2.Config); err != nil { + t.Fatalf("handleRetryJoin failed: %v", err) } - go func() { - if code := cmd.Run(args); code != 0 { - t.Logf("bad: %d", code) - } - close(doneCh) - }() - + // Ensure the retry join occured. 
testutil.WaitForResult(func() (bool, error) { mem := agent.server.Members() if len(mem) != 2 { @@ -78,16 +72,13 @@ func TestRetryJoin_Integration(t *testing.T) { }) } -func TestRetryJoin_NonCloud(t *testing.T) { +func TestRetryJoin_Server_NonCloud(t *testing.T) { t.Parallel() require := require.New(t) - newConfig := &Config{ - Server: &ServerConfig{ - RetryMaxAttempts: 1, - RetryJoin: []string{"127.0.0.1"}, - Enabled: true, - }, + serverJoin := &ServerJoin{ + RetryMaxAttempts: 1, + RetryJoin: []string{"127.0.0.1"}, } var output []string @@ -98,28 +89,26 @@ func TestRetryJoin_NonCloud(t *testing.T) { } joiner := retryJoiner{ - discover: &MockDiscover{}, - join: mockJoin, - logger: log.New(ioutil.Discard, "", 0), - errCh: make(chan struct{}), + discover: &MockDiscover{}, + serverJoin: mockJoin, + serverEnabled: true, + logger: log.New(ioutil.Discard, "", 0), + errCh: make(chan struct{}), } - joiner.RetryJoin(newConfig) + joiner.RetryJoin(serverJoin) require.Equal(1, len(output)) require.Equal(stubAddress, output[0]) } -func TestRetryJoin_Cloud(t *testing.T) { +func TestRetryJoin_Server_Cloud(t *testing.T) { t.Parallel() require := require.New(t) - newConfig := &Config{ - Server: &ServerConfig{ - RetryMaxAttempts: 1, - RetryJoin: []string{"provider=aws, tag_value=foo"}, - Enabled: true, - }, + serverJoin := &ServerJoin{ + RetryMaxAttempts: 1, + RetryJoin: []string{"provider=aws, tag_value=foo"}, } var output []string @@ -131,29 +120,27 @@ func TestRetryJoin_Cloud(t *testing.T) { mockDiscover := &MockDiscover{} joiner := retryJoiner{ - discover: mockDiscover, - join: mockJoin, - logger: log.New(ioutil.Discard, "", 0), - errCh: make(chan struct{}), + discover: mockDiscover, + serverJoin: mockJoin, + serverEnabled: true, + logger: log.New(ioutil.Discard, "", 0), + errCh: make(chan struct{}), } - joiner.RetryJoin(newConfig) + joiner.RetryJoin(serverJoin) require.Equal(1, len(output)) require.Equal("provider=aws, tag_value=foo", mockDiscover.ReceivedAddrs) 
require.Equal(stubAddress, output[0]) } -func TestRetryJoin_MixedProvider(t *testing.T) { +func TestRetryJoin_Server_MixedProvider(t *testing.T) { t.Parallel() require := require.New(t) - newConfig := &Config{ - Server: &ServerConfig{ - RetryMaxAttempts: 1, - RetryJoin: []string{"provider=aws, tag_value=foo", "127.0.0.1"}, - Enabled: true, - }, + serverJoin := &ServerJoin{ + RetryMaxAttempts: 1, + RetryJoin: []string{"provider=aws, tag_value=foo", "127.0.0.1"}, } var output []string @@ -165,15 +152,197 @@ func TestRetryJoin_MixedProvider(t *testing.T) { mockDiscover := &MockDiscover{} joiner := retryJoiner{ - discover: mockDiscover, - join: mockJoin, - logger: log.New(ioutil.Discard, "", 0), - errCh: make(chan struct{}), + discover: mockDiscover, + serverJoin: mockJoin, + serverEnabled: true, + logger: log.New(ioutil.Discard, "", 0), + errCh: make(chan struct{}), } - joiner.RetryJoin(newConfig) + joiner.RetryJoin(serverJoin) require.Equal(2, len(output)) require.Equal("provider=aws, tag_value=foo", mockDiscover.ReceivedAddrs) require.Equal(stubAddress, output[0]) } + +func TestRetryJoin_Client(t *testing.T) { + t.Parallel() + require := require.New(t) + + serverJoin := &ServerJoin{ + RetryMaxAttempts: 1, + RetryJoin: []string{"127.0.0.1"}, + } + + var output []string + + mockJoin := func(s []string) (int, error) { + output = s + return 0, nil + } + + joiner := retryJoiner{ + discover: &MockDiscover{}, + clientJoin: mockJoin, + clientEnabled: true, + logger: log.New(ioutil.Discard, "", 0), + errCh: make(chan struct{}), + } + + joiner.RetryJoin(serverJoin) + + require.Equal(1, len(output)) + require.Equal(stubAddress, output[0]) +} + +func TestRetryJoin_Validate(t *testing.T) { + t.Parallel() + type validateExpect struct { + config *Config + isValid bool + reason string + } + + scenarios := []*validateExpect{ + { + config: &Config{ + Server: &ServerConfig{ + ServerJoin: &ServerJoin{ + RetryJoin: []string{"127.0.0.1"}, + RetryMaxAttempts: 0, + RetryInterval: 0, + 
StartJoin: []string{}, + }, + RetryJoin: []string{"127.0.0.1"}, + RetryMaxAttempts: 0, + RetryInterval: 0, + StartJoin: []string{}, + }, + }, + isValid: false, + reason: "server_join cannot be defined if retry_join is defined on the server stanza", + }, + { + config: &Config{ + Server: &ServerConfig{ + ServerJoin: &ServerJoin{ + RetryJoin: []string{"127.0.0.1"}, + RetryMaxAttempts: 0, + RetryInterval: 0, + StartJoin: []string{}, + }, + StartJoin: []string{"127.0.0.1"}, + RetryMaxAttempts: 0, + RetryInterval: 0, + RetryJoin: []string{}, + }, + }, + isValid: false, + reason: "server_join cannot be defined if start_join is defined on the server stanza", + }, + { + config: &Config{ + Server: &ServerConfig{ + ServerJoin: &ServerJoin{ + RetryJoin: []string{"127.0.0.1"}, + RetryMaxAttempts: 0, + RetryInterval: 0, + StartJoin: []string{}, + }, + StartJoin: []string{}, + RetryMaxAttempts: 1, + RetryInterval: 0, + RetryJoin: []string{}, + }, + }, + isValid: false, + reason: "server_join cannot be defined if retry_max_attempts is defined on the server stanza", + }, + { + config: &Config{ + Server: &ServerConfig{ + ServerJoin: &ServerJoin{ + RetryJoin: []string{"127.0.0.1"}, + RetryMaxAttempts: 0, + RetryInterval: time.Duration(1), + StartJoin: []string{}, + }, + StartJoin: []string{}, + RetryMaxAttempts: 0, + RetryInterval: 3 * time.Second, + RetryJoin: []string{}, + }, + }, + isValid: false, + reason: "server_join cannot be defined if retry_interval is defined on the server stanza", + }, + { + config: &Config{ + Server: &ServerConfig{ + ServerJoin: &ServerJoin{ + RetryJoin: []string{"127.0.0.1"}, + RetryMaxAttempts: 0, + RetryInterval: 0, + StartJoin: []string{"127.0.0.1"}, + }, + }, + }, + isValid: false, + reason: "start_join and retry_join should not both be defined", + }, + { + config: &Config{ + Client: &ClientConfig{ + ServerJoin: &ServerJoin{ + RetryJoin: []string{}, + RetryMaxAttempts: 0, + RetryInterval: 0, + StartJoin: []string{"127.0.0.1"}, + }, + }, + }, + 
isValid: false, + reason: "start_join should not be defined on the client", + }, + { + config: &Config{ + Client: &ClientConfig{ + ServerJoin: &ServerJoin{ + RetryJoin: []string{"127.0.0.1"}, + RetryMaxAttempts: 0, + RetryInterval: 0, + }, + }, + }, + isValid: true, + reason: "client server_join should be valid", + }, + { + config: &Config{ + Server: &ServerConfig{ + ServerJoin: &ServerJoin{ + RetryJoin: []string{"127.0.0.1"}, + RetryMaxAttempts: 1, + RetryInterval: 1, + StartJoin: []string{}, + }, + }, + }, + isValid: true, + reason: "server server_join should be valid", + }, + } + + joiner := retryJoiner{} + for _, scenario := range scenarios { + t.Run(scenario.reason, func(t *testing.T) { + err := joiner.Validate(scenario.config) + if scenario.isValid { + require.NoError(t, err) + } else { + require.Error(t, err) + } + }) + } +} diff --git a/website/source/docs/agent/cloud_auto_join.html.md b/website/source/docs/agent/cloud_auto_join.html.md new file mode 100644 index 00000000000..d733fe7066e --- /dev/null +++ b/website/source/docs/agent/cloud_auto_join.html.md @@ -0,0 +1,136 @@ +--- +layout: "docs" +page_title: "Cloud Auto-join" +sidebar_current: "docs-agent-cloud-auto-join" +description: |- + Nomad supports automatic cluster joining using cloud metadata from various cloud providers +--- + +# Cloud Auto-joining + +As of Nomad 0.8.4, +[`retry_join`](/docs/agent/configuration/server_join.html#retry_join) accepts a +unified interface using the +[go-discover](https://github.com/hashicorp/go-discover) library for doing +automatic cluster joining using cloud metadata. To use retry-join with a +supported cloud provider, specify the configuration on the command line or +configuration file as a `key=value key=value ...` string. + +Values are taken literally and must not be URL +encoded. If the values contain spaces, backslashes or double quotes then +they need to be double quoted and the usual escaping rules apply. 
+ +```json +{ + "retry_join": ["provider=my-cloud config=val config2=\"some other val\" ..."] +} +``` + +The cloud provider-specific configurations are detailed below. This can be +combined with static IP or DNS addresses or even multiple configurations +for different providers. + +In order to use discovery behind a proxy, you will need to set +`HTTP_PROXY`, `HTTPS_PROXY` and `NO_PROXY` environment variables per +[Golang `net/http` library](https://golang.org/pkg/net/http/#ProxyFromEnvironment). + +The following sections give the options specific to a subset of supported cloud +providers. For information on all providers, see further documentation in +[go-discover](https://github.com/hashicorp/go-discover). + +### Amazon EC2 + +This returns the first private IP address of all servers in the given +region which have the given `tag_key` and `tag_value`. + + +```json +{ + "retry_join": ["provider=aws tag_key=... tag_value=..."] +} +``` + +- `provider` (required) - the name of the provider ("aws" in this case). +- `tag_key` (required) - the key of the tag to auto-join on. +- `tag_value` (required) - the value of the tag to auto-join on. +- `region` (optional) - the AWS region to authenticate in. +- `addr_type` (optional) - the type of address to discover: `private_v4`, `public_v4`, `public_v6`. Default is `private_v4`. (>= 1.0) +- `access_key_id` (optional) - the AWS access key for authentication (see below for more information about authenticating). +- `secret_access_key` (optional) - the AWS secret access key for authentication (see below for more information about authenticating). + +#### Authentication & Precedence + +- Static credentials `access_key_id=... secret_access_key=...` +- Environment variables (`AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`) +- Shared credentials file (`~/.aws/credentials` or the path specified by `AWS_SHARED_CREDENTIALS_FILE`) +- ECS task role metadata (container-specific). +- EC2 instance role metadata.
+ + The only required IAM permission is `ec2:DescribeInstances`, and it is + recommended that you make a dedicated key used only for auto-joining. If the + region is omitted it will be discovered through the local instance's [EC2 + metadata + endpoint](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-identity-documents.html). + +### Microsoft Azure + + This returns the first private IP address of all servers in the given region + which have the given `tag_name` and `tag_value` in the tenant and subscription, or in + the given `resource_group` of a `vm_scale_set` for Virtual Machine Scale Sets. + + + ```json +{ + "retry_join": ["provider=azure tag_name=... tag_value=... tenant_id=... client_id=... subscription_id=... secret_access_key=..."] +} +``` + +- `provider` (required) - the name of the provider ("azure" in this case). +- `tenant_id` (required) - the tenant to join machines in. +- `client_id` (required) - the client to authenticate with. +- `secret_access_key` (required) - the secret client key. + +Use these configuration parameters when using tags: +- `tag_name` - the name of the tag to auto-join on. +- `tag_value` - the value of the tag to auto-join on. + +Use these configuration parameters when using Virtual Machine Scale Sets (Consul 1.0.3 and later): +- `resource_group` - the name of the resource group to filter on. +- `vm_scale_set` - the name of the virtual machine scale set to filter on. + + When using tags the only permission needed is the `ListAll` method for `NetworkInterfaces`. When using + Virtual Machine Scale Sets the only role action needed is `Microsoft.Compute/virtualMachineScaleSets/*/read`. + +### Google Compute Engine + +This returns the first private IP address of all servers in the given +project which have the given `tag_value`. + + +```json +{ +"retry_join": ["provider=gce project_name=... tag_value=..."] +} +``` + +- `provider` (required) - the name of the provider ("gce" in this case).
+- `tag_value` (required) - the value of the tag to auto-join on. +- `project_name` (optional) - the name of the project to auto-join on. Discovered if not set. +- `zone_pattern` (optional) - the list of zones can be restricted through an RE2 compatible regular expression. If omitted, servers in all zones are returned. +- `credentials_file` (optional) - the credentials file for authentication. See below for more information. + +#### Authentication & Precedence + +- Use credentials from `credentials_file`, if provided. +- Use JSON file from `GOOGLE_APPLICATION_CREDENTIALS` environment variable. +- Use JSON file in a location known to the gcloud command-line tool. +- On Windows, this is `%APPDATA%/gcloud/application_default_credentials.json`. +- On other systems, `$HOME/.config/gcloud/application_default_credentials.json`. +- On Google Compute Engine, use credentials from the metadata +server. In this final case any provided scopes are ignored. + +Discovery requires a [GCE Service +Account](https://cloud.google.com/compute/docs/access/service-accounts). +Credentials are searched using the paths listed above, in order of precedence. + + diff --git a/website/source/docs/agent/configuration/client.html.md b/website/source/docs/agent/configuration/client.html.md index 2f96da892a0..2ba20be5b3a 100644 --- a/website/source/docs/agent/configuration/client.html.md +++ b/website/source/docs/agent/configuration/client.html.md @@ -90,6 +90,12 @@ client { receive work. This may be specified as an IP address or DNS, with or without the port. If the port is omitted, the default port of `4647` is used. +- `server_join` ([server_join][server-join]: nil) - Specifies + how the Nomad client will connect to Nomad servers. The `start_join` field + is not supported on the client. The `retry_join` fields may directly specify + the server address or use go-discover syntax for auto-discovery. See the + [server_join documentation][server-join] for more detail.
+ - `state_dir` `(string: "[data_dir]/client")` - Specifies the directory to use to store client state. By default, this is - the top-level [data_dir](/docs/agent/configuration/index.html#data_dir) suffixed with @@ -307,7 +313,11 @@ cluster. ```hcl client { enabled = true - servers = ["1.2.3.4:4647", "5.6.7.8:4647"] + server_join { + retry_join = [ "1.1.1.1", "2.2.2.2" ] + retry_max = 3 + retry_interval = "15s" + } } ``` @@ -346,3 +356,4 @@ client { } } ``` +[server-join]: /docs/agent/configuration/server_join.html "Server Join" diff --git a/website/source/docs/agent/configuration/server.html.md b/website/source/docs/agent/configuration/server.html.md index 56b5e4abcd6..6da8ab88e85 100644 --- a/website/source/docs/agent/configuration/server.html.md +++ b/website/source/docs/agent/configuration/server.html.md @@ -28,7 +28,11 @@ join failures, and more. server { enabled = true bootstrap_expect = 3 - retry_join = ["1.2.3.4", "5.6.7.8"] + server_join { + retry_join = [ "1.1.1.1", "2.2.2.2" ] + retry_max = 3 + retry_interval = "15s" + } } ``` @@ -102,9 +106,9 @@ server { second is a tradeoff as it lowers failure detection time of nodes at the tradeoff of false positives and increased load on the leader. -- `non_voting_server` `(bool: false)` - (Enterprise-only) Specifies whether - this server will act as a non-voting member of the cluster to help provide - read scalability. +- `non_voting_server` `(bool: false)` - (Enterprise-only) Specifies whether + this server will act as a non-voting member of the cluster to help provide + read scalability. - `num_schedulers` `(int: [num-cores])` - Specifies the number of parallel scheduler threads to run. This can be as many as one per core, or `0` to @@ -131,6 +135,17 @@ server { cluster again when starting. This flag allows the previous state to be used to rejoin the cluster. +- `server_join` ([server_join][server-join]: nil) - Specifies + how the Nomad server will connect to other Nomad servers. 
The `retry_join` + fields may directly specify the server address or use go-discover syntax for + auto-discovery. See the [server_join documentation][server-join] for more detail. + +- `upgrade_version` `(string: "")` - A custom version of the format X.Y.Z to use + in place of the Nomad version when custom upgrades are enabled in Autopilot. + For more information, see the [Autopilot Guide](/guides/cluster/autopilot.html). + +### Deprecated Parameters + - `retry_join` `(array: [])` - Specifies a list of server addresses to retry joining if the first attempt fails. This is similar to [`start_join`](#start_join), but only invokes if the initial join attempt @@ -138,63 +153,25 @@ server { succeeds. After one succeeds, no further addresses will be contacted. This is useful for cases where we know the address will become available eventually. Use `retry_join` with an array as a replacement for `start_join`, **do not use - both options**. See the [server address format](#server-address-format) - section for more information on the format of the string. + both options**. See the [server_join][server-join] + section for more information on the format of the string. This field is + deprecated in favor of the [server_join stanza][server-join]. - `retry_interval` `(string: "30s")` - Specifies the time to wait between retry - join attempts. + join attempts. This field is deprecated in favor of the [server_join + stanza][server-join]. - `retry_max` `(int: 0)` - Specifies the maximum number of join attempts to be made before exiting with a return code of 1. By default, this is set to 0 - which is interpreted as infinite retries. + which is interpreted as infinite retries. This field is deprecated in favor of + the [server_join stanza][server-join]. - `start_join` `(array: [])` - Specifies a list of server addresses to join on startup. If Nomad is unable to join with any of the specified - addresses, agent startup will fail. 
See the - [server address format](#server-address-format) section for more information - on the format of the string. - -- `upgrade_version` `(string: "")` - A custom version of the format X.Y.Z to use - in place of the Nomad version when custom upgrades are enabled in Autopilot. - For more information, see the [Autopilot Guide](/guides/cluster/autopilot.html). - -### Server Address Format - -This section describes the acceptable syntax and format for describing the -location of a Nomad server. There are many ways to reference a Nomad server, -including directly by IP address and resolving through DNS. - -#### Directly via IP Address - -It is possible to address another Nomad server using its IP address. This is -done in the `ip:port` format, such as: - -``` -1.2.3.4:5678 -``` - -If the port option is omitted, it defaults to the Serf port, which is 4648 -unless configured otherwise: - -``` -1.2.3.4 => 1.2.3.4:4648 -``` - -#### Via Domains or DNS - -It is possible to address another Nomad server using its DNS address. This is -done in the `address:port` format, such as: - -``` -nomad-01.company.local:5678 -``` - -If the port option is omitted, it defaults to the Serf port, which is 4648 -unless configured otherwise: - -``` -nomad-01.company.local => nomad-01.company.local:4648 -``` + addresses, agent startup will fail. See the [server address + format](/docs/agent/configuration/server_join.html#server-address-format) + section for more information on the format of the string. This field is + deprecated in favor of the [server_join stanza][server-join]. 
## `server` Examples @@ -242,3 +219,4 @@ server { ``` [encryption]: /docs/agent/encryption.html "Nomad Agent Encryption" +[server-join]: /docs/agent/configuration/server_join.html "Server Join" diff --git a/website/source/docs/agent/configuration/server_join.html.md b/website/source/docs/agent/configuration/server_join.html.md new file mode 100644 index 00000000000..82620a781c2 --- /dev/null +++ b/website/source/docs/agent/configuration/server_join.html.md @@ -0,0 +1,131 @@ +--- +layout: "docs" +page_title: "server_join Stanza - Agent Configuration" +sidebar_current: "docs-agent-configuration--server-join" +description: |- + The "server_join" stanza specifies how the Nomad agent will discover and connect to Nomad servers. +--- + +# `server_join` Stanza + + + + + + +
Placement + server -> **server_join** +
+ client -> **server_join** +
+ +The `server_join` stanza specifies how the Nomad agent will discover and connect +to Nomad servers. + +```hcl +server_join { + retry_join = [ "1.1.1.1", "2.2.2.2" ] + retry_max = 3 + retry_interval = "15s" +} +``` + +## `server_join` Parameters + +- `retry_join` `(array: [])` - Specifies a list of server addresses to + join. This is similar to [`start_join`](#start_join), but will continue to + be attempted even if the initial join attempt fails, up to + [retry_max](#retry_max). Further, `retry_join` is available to + both Nomad servers and clients, while `start_join` is only defined for Nomad + servers. This is useful for cases where we know the address will become + available eventually. Use `retry_join` with an array as a replacement for + `start_join`, **do not use both options**. + + Address format includes both using IP addresses as well as an interface to the + [go-discover](https://github.com/hashicorp/go-discover) library for doing + automated cluster joining using cloud metadata. See [Cloud + Auto-join][cloud_auto_join] for more information. + + ``` + server_join { + retry_join = [ "1.1.1.1", "2.2.2.2" ] + } + ``` + + Using the `go-discover` interface, this can be defined both in a client or + server configuration as well as provided as a command-line argument. + + ``` + server_join { + retry_join = [ "provider=aws tag_key=..." ] + } + ``` + + See the [server address format](#server-address-format) for more information + about expected server address formats. + +- `retry_interval` `(string: "30s")` - Specifies the time to wait between retry + join attempts. + +- `retry_max` `(int: 0)` - Specifies the maximum number of join attempts to be + made before exiting with a return code of 1. By default, this is set to 0 + which is interpreted as infinite retries. + +- `start_join` `(array: [])` - Specifies a list of server addresses to + join on startup. If Nomad is unable to join with any of the specified + addresses, agent startup will fail. 
See the + [server address format](#server-address-format) section for more information + on the format of the string. This field is defined only for Nomad servers and + will result in a configuration parse error if included in a client + configuration. + +## Server Address Format + +This section describes the acceptable syntax and format for describing the +location of a Nomad server. There are many ways to reference a Nomad server, +including directly by IP address and resolving through DNS. + +### Directly via IP Address + +It is possible to address another Nomad server using its IP address. This is +done in the `ip:port` format, such as: + +``` +1.2.3.4:5678 +``` + +If the port option is omitted, it defaults to the Serf port, which is 4648 +unless configured otherwise: + +``` +1.2.3.4 => 1.2.3.4:4648 +``` + +### Via Domains or DNS + +It is possible to address another Nomad server using its DNS address. This is +done in the `address:port` format, such as: + +``` +nomad-01.company.local:5678 +``` + +If the port option is omitted, it defaults to the Serf port, which is 4648 +unless configured otherwise: + +``` +nomad-01.company.local => nomad-01.company.local:4648 +``` + +### Via the go-discover interface + +As of Nomad 0.8.4, `retry_join` accepts a unified interface using the +[go-discover](https://github.com/hashicorp/go-discover) library for doing +automated cluster joining using cloud metadata. See [Cloud +Auto-join][cloud_auto_join] for more information. + +``` +"provider=aws tag_key=..." => 1.2.3.4:4648 +``` + +[cloud_auto_join]: /docs/agent/cloud_auto_join.html "Nomad Cloud Auto-join" diff --git a/website/source/docs/commands/agent.html.md.erb b/website/source/docs/commands/agent.html.md.erb index 58dc9bc0327..faebc7e867a 100644 --- a/website/source/docs/commands/agent.html.md.erb +++ b/website/source/docs/commands/agent.html.md.erb @@ -71,7 +71,15 @@ via CLI arguments. 
The `agent` command accepts the following arguments: * `-region=`: Equivalent to the [region](#region) config option. * `-rejoin`: Equivalent to the [rejoin_after_leave](#rejoin_after_leave) config option. * `-retry-interval`: Equivalent to the [retry_interval](#retry_interval) config option. -* `-retry-join`: Similar to `-join` but allows retrying a join if the first attempt fails. +* `-retry-join`: Similar to `-join` but allows retrying a join if the first attempt fails. + + ```sh + $ nomad agent -retry-join "127.0.0.1:4648" + ``` + + `retry-join` can be defined as a command line flag only for servers. Clients + can configure `retry-join` only in configuration files. + * `-retry-max`: Similar to the [retry_max](#retry_max) config option. * `-server`: Enable server mode on the local agent. * `-servers=`: Equivalent to the Client [servers](#servers) config diff --git a/website/source/guides/cluster/manual.html.md b/website/source/guides/cluster/manual.html.md index e54504beda1..cddd390dce7 100644 --- a/website/source/guides/cluster/manual.html.md +++ b/website/source/guides/cluster/manual.html.md @@ -31,7 +31,9 @@ server { bootstrap_expect = 3 # This is the IP address of the first server we provisioned - retry_join = [":4648"] + server_join { + retry_join = [":4648"] + } } ``` diff --git a/website/source/layouts/docs.erb b/website/source/layouts/docs.erb index b0e3b7250fc..eea874b8f7e 100644 --- a/website/source/layouts/docs.erb +++ b/website/source/layouts/docs.erb @@ -396,6 +396,9 @@ Nomad Agent