From a1cb3f14d06df67f85043b25637284c479b917a1 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 3 May 2016 00:06:59 -0700 Subject: [PATCH 001/166] Use consul/lib's RateScaledInterval --- nomad/heartbeat.go | 4 +- nomad/util.go | 10 ---- nomad/util_test.go | 23 -------- .../hashicorp/consul/lib/cluster.go | 56 +++++++++++++++++++ .../github.com/hashicorp/consul/lib/math.go | 22 ++++++++ .../github.com/hashicorp/consul/lib/rand.go | 34 +++++++++++ .../github.com/hashicorp/consul/lib/string.go | 11 ++++ 7 files changed, 125 insertions(+), 35 deletions(-) create mode 100644 vendor/github.com/hashicorp/consul/lib/cluster.go create mode 100644 vendor/github.com/hashicorp/consul/lib/math.go create mode 100644 vendor/github.com/hashicorp/consul/lib/rand.go create mode 100644 vendor/github.com/hashicorp/consul/lib/string.go diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go index 3f2c037651a..aed47d4bd58 100644 --- a/nomad/heartbeat.go +++ b/nomad/heartbeat.go @@ -4,6 +4,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/hashicorp/consul/lib" "github.com/hashicorp/nomad/nomad/structs" ) @@ -49,8 +50,7 @@ func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) { // Compute the target TTL value n := len(s.heartbeatTimers) - ttl := rateScaledInterval(s.config.MaxHeartbeatsPerSecond, - s.config.MinHeartbeatTTL, n) + ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n) ttl += randomStagger(ttl) // Reset the TTL diff --git a/nomad/util.go b/nomad/util.go index 1c7dba0bd48..961c75ab6d0 100644 --- a/nomad/util.go +++ b/nomad/util.go @@ -122,16 +122,6 @@ func maxUint64(a, b uint64) uint64 { return b } -// rateScaledInterval is used to choose an interval to perform an action in order -// to target an aggregate number of actions per second across the whole cluster. -func rateScaledInterval(rate float64, min time.Duration, n int) time.Duration { - interval := time.Duration(float64(time.Second) * float64(n) / rate) - if interval < min { - return min - } - return interval -} - // seedRandom seeds the global random variable using a cryptographically random // seed. It returns an error if determing the random seed fails. 
func seedRandom() error { diff --git a/nomad/util_test.go b/nomad/util_test.go index d1a399590e9..e71b0efefb8 100644 --- a/nomad/util_test.go +++ b/nomad/util_test.go @@ -98,26 +98,3 @@ func TestMaxUint64(t *testing.T) { t.Fatalf("bad") } } - -func TestRateScaledInterval(t *testing.T) { - min := 1 * time.Second - rate := 200.0 - if v := rateScaledInterval(rate, min, 0); v != min { - t.Fatalf("Bad: %v", v) - } - if v := rateScaledInterval(rate, min, 100); v != min { - t.Fatalf("Bad: %v", v) - } - if v := rateScaledInterval(rate, min, 200); v != 1*time.Second { - t.Fatalf("Bad: %v", v) - } - if v := rateScaledInterval(rate, min, 1000); v != 5*time.Second { - t.Fatalf("Bad: %v", v) - } - if v := rateScaledInterval(rate, min, 5000); v != 25*time.Second { - t.Fatalf("Bad: %v", v) - } - if v := rateScaledInterval(rate, min, 10000); v != 50*time.Second { - t.Fatalf("Bad: %v", v) - } -} diff --git a/vendor/github.com/hashicorp/consul/lib/cluster.go b/vendor/github.com/hashicorp/consul/lib/cluster.go new file mode 100644 index 00000000000..a95232c5737 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/lib/cluster.go @@ -0,0 +1,56 @@ +package lib + +import ( + "math/rand" + "time" +) + +// DurationMinusBuffer returns a duration, minus a buffer and jitter +// subtracted from the duration. This function is used primarily for +// servicing Consul TTL Checks in advance of the TTL. +func DurationMinusBuffer(intv time.Duration, buffer time.Duration, jitter int64) time.Duration { + d := intv - buffer + if jitter == 0 { + d -= RandomStagger(d) + } else { + d -= RandomStagger(time.Duration(int64(d) / jitter)) + } + return d +} + +// DurationMinusBufferDomain returns the domain of valid durations from a +// call to DurationMinusBuffer. This function is used to check user +// specified input values to DurationMinusBuffer. +func DurationMinusBufferDomain(intv time.Duration, buffer time.Duration, jitter int64) (min time.Duration, max time.Duration) { + max = intv - buffer + if jitter == 0 { + min = max + } else { + min = max - time.Duration(int64(max)/jitter) + } + return min, max +} + +// Returns a random stagger interval between 0 and the duration +func RandomStagger(intv time.Duration) time.Duration { + if intv == 0 { + return 0 + } + return time.Duration(uint64(rand.Int63()) % uint64(intv)) +} + +// RateScaledInterval is used to choose an interval to perform an action in +// order to target an aggregate number of actions per second across the whole +// cluster. 
+func RateScaledInterval(rate float64, min time.Duration, n int) time.Duration { + const minRate = 1 / 86400 // 1/(1 * time.Day) + if rate <= minRate { + return min + } + interval := time.Duration(float64(time.Second) * float64(n) / rate) + if interval < min { + return min + } + + return interval +} diff --git a/vendor/github.com/hashicorp/consul/lib/math.go b/vendor/github.com/hashicorp/consul/lib/math.go new file mode 100644 index 00000000000..1d0b6dc0f6b --- /dev/null +++ b/vendor/github.com/hashicorp/consul/lib/math.go @@ -0,0 +1,22 @@ +package lib + +func AbsInt(a int) int { + if a > 0 { + return a + } + return a * -1 +} + +func MaxInt(a, b int) int { + if a > b { + return a + } + return b +} + +func MinInt(a, b int) int { + if a > b { + return b + } + return a +} diff --git a/vendor/github.com/hashicorp/consul/lib/rand.go b/vendor/github.com/hashicorp/consul/lib/rand.go new file mode 100644 index 00000000000..22aa4f3544b --- /dev/null +++ b/vendor/github.com/hashicorp/consul/lib/rand.go @@ -0,0 +1,34 @@ +package lib + +import ( + crand "crypto/rand" + "math" + "math/big" + "math/rand" + "sync" + "time" +) + +var ( + once sync.Once + + // SeededSecurely is set to true if a cryptographically secure seed + // was used to initialize rand. When false, the start time is used + // as a seed. + SeededSecurely bool +) + +// SeedMathRand provides weak, but guaranteed seeding, which is better than +// running with Go's default seed of 1. A call to SeedMathRand() is expected +// to be called via init(), but never a second time. +func SeedMathRand() { + once.Do(func() { + n, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) + if err != nil { + rand.Seed(time.Now().UTC().UnixNano()) + return + } + rand.Seed(n.Int64()) + SeededSecurely = true + }) +} diff --git a/vendor/github.com/hashicorp/consul/lib/string.go b/vendor/github.com/hashicorp/consul/lib/string.go new file mode 100644 index 00000000000..0780abb632c --- /dev/null +++ b/vendor/github.com/hashicorp/consul/lib/string.go @@ -0,0 +1,11 @@ +package lib + +// StrContains checks if a list contains a string +func StrContains(l []string, s string) bool { + for _, v := range l { + if v == s { + return true + } + } + return false +} From 7db2eb03c40ecb4dea100ffb1c60d019ff70dfb1 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 3 May 2016 00:15:29 -0700 Subject: [PATCH 002/166] Use consul/lib's RandomStagger Removes four redundant copies of the method in the process. 
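For reviewers: lib.RandomStagger(intv) returns a random duration in [0, intv), which is the same contract the four deleted local copies had. A minimal, self-contained sketch of the call pattern, assuming the vendored consul/lib package is importable; the base interval and variable names below are illustrative and not taken from this change:

	package main

	import (
		"fmt"
		"time"

		"github.com/hashicorp/consul/lib"
	)

	func main() {
		base := 10 * time.Second
		// lib.RandomStagger(base) yields a value in [0, base), so the
		// total wait lands in [base, 2*base), matching the behaviour
		// of the removed per-package helpers.
		fmt.Println(base + lib.RandomStagger(base))
	}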
--- client/client.go | 8 ++++---- client/consul/check.go | 9 ++------- client/util.go | 6 ------ client/util_test.go | 12 ------------ command/agent/command.go | 3 ++- command/agent/util.go | 7 ------- command/agent/util_test.go | 16 ---------------- nomad/heartbeat.go | 2 +- nomad/rpc.go | 3 ++- nomad/util.go | 6 ------ nomad/util_test.go | 11 ----------- 11 files changed, 11 insertions(+), 72 deletions(-) delete mode 100644 command/agent/util_test.go diff --git a/client/client.go b/client/client.go index 1c32e13e117..eecb27d9fbd 100644 --- a/client/client.go +++ b/client/client.go @@ -13,8 +13,7 @@ import ( "time" "github.com/armon/go-metrics" - "github.com/mitchellh/hashstructure" - + "github.com/hashicorp/consul/lib" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" @@ -24,6 +23,7 @@ import ( "github.com/hashicorp/nomad/client/stats" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" + "github.com/mitchellh/hashstructure" ) const ( @@ -785,7 +785,7 @@ func (c *Client) retryIntv(base time.Duration) time.Duration { if c.config.DevMode { return devModeRetryIntv } - return base + randomStagger(base) + return base + lib.RandomStagger(base) } // registerAndHeartbeat is a long lived goroutine used to register the client @@ -804,7 +804,7 @@ func (c *Client) registerAndHeartbeat() { if c.config.DevMode { heartbeat = time.After(0) } else { - heartbeat = time.After(randomStagger(initialHeartbeatStagger)) + heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger)) } for { diff --git a/client/consul/check.go b/client/consul/check.go index 052c5c78c20..28df291f67f 100644 --- a/client/consul/check.go +++ b/client/consul/check.go @@ -2,10 +2,10 @@ package consul import ( "log" - "math/rand" "sync" "time" + "github.com/hashicorp/consul/lib" cstructs "github.com/hashicorp/nomad/client/driver/structs" ) @@ -60,7 +60,7 @@ func (r *CheckRunner) Stop() { // run is invoked by a goroutine to run until Stop() is called func (r *CheckRunner) run() { // Get the randomized initial pause time - initialPauseTime := randomStagger(r.check.Interval()) + initialPauseTime := lib.RandomStagger(r.check.Interval()) r.logger.Printf("[DEBUG] agent: pausing %v before first invocation of %s", initialPauseTime, r.check.ID()) next := time.NewTimer(initialPauseTime) for { @@ -82,8 +82,3 @@ type Check interface { Interval() time.Duration Timeout() time.Duration } - -// Returns a random stagger interval between 0 and the duration -func randomStagger(intv time.Duration) time.Duration { - return time.Duration(uint64(rand.Int63()) % uint64(intv)) -} diff --git a/client/util.go b/client/util.go index a8afcd17152..b04f173f008 100644 --- a/client/util.go +++ b/client/util.go @@ -7,7 +7,6 @@ import ( "math/rand" "os" "path/filepath" - "time" "github.com/hashicorp/nomad/nomad/structs" ) @@ -69,11 +68,6 @@ func diffAllocs(existing []*structs.Allocation, allocs *allocUpdates) *diffResul return result } -// Returns a random stagger interval between 0 and the duration -func randomStagger(intv time.Duration) time.Duration { - return time.Duration(uint64(rand.Int63()) % uint64(intv)) -} - // shuffleStrings randomly shuffles the list of strings func shuffleStrings(list []string) { for i := range list { diff --git a/client/util_test.go b/client/util_test.go index 431a93f6ba5..c0a8633c329 100644 --- a/client/util_test.go +++ 
b/client/util_test.go @@ -6,7 +6,6 @@ import ( "path/filepath" "reflect" "testing" - "time" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -56,17 +55,6 @@ func TestDiffAllocs(t *testing.T) { } } -func TestRandomStagger(t *testing.T) { - t.Parallel() - intv := time.Minute - for i := 0; i < 10; i++ { - stagger := randomStagger(intv) - if stagger < 0 || stagger >= intv { - t.Fatalf("Bad: %v", stagger) - } - } -} - func TestShuffleStrings(t *testing.T) { t.Parallel() // Generate input diff --git a/command/agent/command.go b/command/agent/command.go index 5c8371b79a9..129bb239863 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -16,6 +16,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/hashicorp/consul/lib" "github.com/hashicorp/go-checkpoint" "github.com/hashicorp/go-syslog" "github.com/hashicorp/logutils" @@ -334,7 +335,7 @@ func (c *Command) setupAgent(config *Config, logOutput io.Writer) error { // Do an immediate check within the next 30 seconds go func() { - time.Sleep(randomStagger(30 * time.Second)) + time.Sleep(lib.RandomStagger(30 * time.Second)) c.checkpointResults(checkpoint.Check(updateParams)) }() } diff --git a/command/agent/util.go b/command/agent/util.go index 2fa4993aeaa..c74fe645cf6 100644 --- a/command/agent/util.go +++ b/command/agent/util.go @@ -2,16 +2,9 @@ package agent import ( "fmt" - "math/rand" "net" - "time" ) -// Returns a random stagger interval between 0 and the duration -func randomStagger(intv time.Duration) time.Duration { - return time.Duration(uint64(rand.Int63()) % uint64(intv)) -} - // IpOfDevice returns a routable ip addr of a device func ipOfDevice(name string) (net.IP, error) { intf, err := net.InterfaceByName(name) diff --git a/command/agent/util_test.go b/command/agent/util_test.go deleted file mode 100644 index e31943a2037..00000000000 --- a/command/agent/util_test.go +++ /dev/null @@ -1,16 +0,0 @@ -package agent - -import ( - "testing" - "time" -) - -func TestRandomStagger(t *testing.T) { - intv := time.Minute - for i := 0; i < 10; i++ { - stagger := randomStagger(intv) - if stagger < 0 || stagger >= intv { - t.Fatalf("Bad: %v", stagger) - } - } -} diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go index aed47d4bd58..3102b73206a 100644 --- a/nomad/heartbeat.go +++ b/nomad/heartbeat.go @@ -51,7 +51,7 @@ func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) { // Compute the target TTL value n := len(s.heartbeatTimers) ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n) - ttl += randomStagger(ttl) + ttl += lib.RandomStagger(ttl) // Reset the TTL s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace) diff --git a/nomad/rpc.go b/nomad/rpc.go index a25566f111f..2080737c325 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -11,6 +11,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/hashicorp/consul/lib" "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -308,7 +309,7 @@ func (s *Server) blockingRPC(opts *blockingOptions) error { } // Apply a small amount of jitter to the request - opts.queryOpts.MaxQueryTime += randomStagger(opts.queryOpts.MaxQueryTime / jitterFraction) + opts.queryOpts.MaxQueryTime += lib.RandomStagger(opts.queryOpts.MaxQueryTime / jitterFraction) // Setup a query timeout timeout = 
time.NewTimer(opts.queryOpts.MaxQueryTime) diff --git a/nomad/util.go b/nomad/util.go index 961c75ab6d0..8bc3fb7d739 100644 --- a/nomad/util.go +++ b/nomad/util.go @@ -10,7 +10,6 @@ import ( "path/filepath" "runtime" "strconv" - "time" crand "crypto/rand" @@ -101,11 +100,6 @@ func isNomadServer(m serf.Member) (bool, *serverParts) { return true, parts } -// Returns a random stagger interval between 0 and the duration -func randomStagger(intv time.Duration) time.Duration { - return time.Duration(uint64(rand.Int63()) % uint64(intv)) -} - // shuffleStrings randomly shuffles the list of strings func shuffleStrings(list []string) { for i := range list { diff --git a/nomad/util_test.go b/nomad/util_test.go index e71b0efefb8..e415bb4c9a0 100644 --- a/nomad/util_test.go +++ b/nomad/util_test.go @@ -4,7 +4,6 @@ import ( "net" "reflect" "testing" - "time" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/serf/serf" @@ -57,16 +56,6 @@ func TestIsNomadServer(t *testing.T) { } } -func TestRandomStagger(t *testing.T) { - intv := time.Minute - for i := 0; i < 10; i++ { - stagger := randomStagger(intv) - if stagger < 0 || stagger >= intv { - t.Fatalf("Bad: %v", stagger) - } - } -} - func TestShuffleStrings(t *testing.T) { // Generate input inp := make([]string, 10) From 2d2392749a69160da1b6ce34dc13250bfd4888f3 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 3 May 2016 00:18:11 -0700 Subject: [PATCH 003/166] Use `rand.Int*n()` where appropriate --- nomad/eval_broker.go | 2 +- nomad/rpc.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nomad/eval_broker.go b/nomad/eval_broker.go index 950c9ac0d3c..96060b6c327 100644 --- a/nomad/eval_broker.go +++ b/nomad/eval_broker.go @@ -348,7 +348,7 @@ func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation default: // Multiple tasks. We pick a random task so that we fairly // distribute work. 
- offset := rand.Int63() % int64(n) + offset := rand.Int63n(n) return b.dequeueForSched(eligibleSched[offset]) } } diff --git a/nomad/rpc.go b/nomad/rpc.go index 2080737c325..3d233921594 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -232,7 +232,7 @@ func (s *Server) forwardRegion(region, method string, args interface{}, reply in } // Select a random addr - offset := rand.Int31() % int32(len(servers)) + offset := rand.Int31n(len(servers)) server := servers[offset] s.peerLock.RUnlock() From 284661410d7cf5947f8732744946346f7f65a463 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 3 May 2016 00:18:48 -0700 Subject: [PATCH 004/166] Fix small typo --- nomad/heartbeat.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go index 3102b73206a..9b2867ecaa3 100644 --- a/nomad/heartbeat.go +++ b/nomad/heartbeat.go @@ -72,7 +72,7 @@ func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) { return } - // Create a new timer to track expiration of thi sheartbeat + // Create a new timer to track expiration of this heartbeat timer := time.AfterFunc(ttl, func() { s.invalidateHeartbeat(id) }) From 3f9d3854dda232736900e01d76a17f58cbb9d95a Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 3 May 2016 00:28:23 -0700 Subject: [PATCH 005/166] Seed random once in main --- main.go | 5 +++++ nomad/server.go | 5 ----- nomad/util.go | 15 --------------- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/main.go b/main.go index 212c28f502a..2c15085504a 100644 --- a/main.go +++ b/main.go @@ -4,9 +4,14 @@ import ( "fmt" "os" + "github.com/hashicorp/consul/lib" "github.com/mitchellh/cli" ) +func init() { + lib.SeedMathRand() +} + func main() { os.Exit(Run(os.Args[1:])) } diff --git a/nomad/server.go b/nomad/server.go index 1aad1394869..8553d0c7c9e 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -249,11 +249,6 @@ func NewServer(config *Config) (*Server, error) { // Emit metrics go s.heartbeatStats() - // Seed the global random. - if err := seedRandom(); err != nil { - return nil, err - } - // Done return s, nil } diff --git a/nomad/util.go b/nomad/util.go index 8bc3fb7d739..7a74c954270 100644 --- a/nomad/util.go +++ b/nomad/util.go @@ -2,8 +2,6 @@ package nomad import ( "fmt" - "math" - "math/big" "math/rand" "net" "os" @@ -11,8 +9,6 @@ import ( "runtime" "strconv" - crand "crypto/rand" - "github.com/hashicorp/serf/serf" ) @@ -115,14 +111,3 @@ func maxUint64(a, b uint64) uint64 { } return b } - -// seedRandom seeds the global random variable using a cryptographically random -// seed. It returns an error if determing the random seed fails. -func seedRandom() error { - n, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) - if err != nil { - return err - } - rand.Seed(n.Int64()) - return nil -} From 4030e380d6198aeae4aea712f8a0696a6b1f17c9 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 3 May 2016 00:29:23 -0700 Subject: [PATCH 006/166] Use the correctly typed `rand.Int*` variant --- nomad/eval_broker.go | 2 +- nomad/rpc.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nomad/eval_broker.go b/nomad/eval_broker.go index 96060b6c327..d91ee824488 100644 --- a/nomad/eval_broker.go +++ b/nomad/eval_broker.go @@ -348,7 +348,7 @@ func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation default: // Multiple tasks. We pick a random task so that we fairly // distribute work. 
- offset := rand.Int63n(n) + offset := rand.Intn(n) return b.dequeueForSched(eligibleSched[offset]) } } diff --git a/nomad/rpc.go b/nomad/rpc.go index 3d233921594..26b94489b12 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -232,7 +232,7 @@ func (s *Server) forwardRegion(region, method string, args interface{}, reply in } // Select a random addr - offset := rand.Int31n(len(servers)) + offset := rand.Intn(len(servers)) server := servers[offset] s.peerLock.RUnlock() From 0e1bdad2a056aac0259e57036b2dac15a46ca073 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sat, 21 May 2016 16:27:21 -0500 Subject: [PATCH 007/166] Rename consul.ConsulConfig to consul.AgentConfig There were two `ConsulConfig` structs running around, one of them needed to go away. Rely on the package's path to provide context for the type of AgentConfig. --- client/client_test.go | 2 +- client/config/config.go | 2 +- client/consul/sync.go | 6 +++--- client/consul/sync_test.go | 4 ++-- client/driver/executor/executor.go | 2 +- client/driver/utils.go | 2 +- command/agent/agent.go | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/client/client_test.go b/client/client_test.go index c429527adbe..e0c7d7177c8 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -72,7 +72,7 @@ func testServer(t *testing.T, cb func(*nomad.Config)) (*nomad.Server, string) { func testClient(t *testing.T, cb func(c *config.Config)) *Client { conf := DefaultConfig() conf.DevMode = true - conf.ConsulConfig = &consul.ConsulConfig{} + conf.ConsulConfig = &consul.AgentConfig{} if cb != nil { cb(conf) } diff --git a/client/config/config.go b/client/config/config.go index 4d606ba2439..e1e2ae892b5 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -111,7 +111,7 @@ type Config struct { Revision string // ConsulConfig is the configuration to connect with Consul Agent - ConsulConfig *consul.ConsulConfig + ConsulConfig *consul.AgentConfig // StatsDataPoints is the number of resource usage data points the Nomad // client keeps in memory diff --git a/client/consul/sync.go b/client/consul/sync.go index 08da6e89d23..f5c45dbf580 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -38,8 +38,8 @@ type ConsulService struct { shutdownLock sync.Mutex } -// ConsulConfig is the configuration used to create a new ConsulService client -type ConsulConfig struct { +// AgentConfig is the configuration used to create a new ConsulService client +type AgentConfig struct { Addr string Token string Auth string @@ -60,7 +60,7 @@ const ( ) // NewConsulService returns a new ConsulService -func NewConsulService(config *ConsulConfig, logger *log.Logger) (*ConsulService, error) { +func NewConsulService(config *AgentConfig, logger *log.Logger) (*ConsulService, error) { var err error var c *consul.Client cfg := consul.DefaultConfig() diff --git a/client/consul/sync_test.go b/client/consul/sync_test.go index 80907c78780..73c8e94a7fd 100644 --- a/client/consul/sync_test.go +++ b/client/consul/sync_test.go @@ -41,7 +41,7 @@ var ( ) func TestConsulServiceRegisterServices(t *testing.T) { - cs, err := NewConsulService(&ConsulConfig{}, logger) + cs, err := NewConsulService(&AgentConfig{}, logger) if err != nil { t.Fatalf("Err: %v", err) } @@ -68,7 +68,7 @@ func TestConsulServiceRegisterServices(t *testing.T) { } func TestConsulServiceUpdateService(t *testing.T) { - cs, err := NewConsulService(&ConsulConfig{}, logger) + cs, err := NewConsulService(&AgentConfig{}, logger) if err != nil { t.Fatalf("Err: %v", err) } diff --git 
a/client/driver/executor/executor.go b/client/driver/executor/executor.go index c6e4b9655fd..a4444aa0520 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -61,7 +61,7 @@ type Executor interface { // ConsulContext holds context to configure the consul client and run checks type ConsulContext struct { // ConsulConfig is the configuration used to create a consul client - ConsulConfig *consul.ConsulConfig + ConsulConfig *consul.AgentConfig // ContainerID is the ID of the container ContainerID string diff --git a/client/driver/utils.go b/client/driver/utils.go index ce1160a79f1..4994671f07d 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -73,7 +73,7 @@ func createLogCollector(config *plugin.ClientConfig, w io.Writer, } func consulContext(clientConfig *config.Config, containerID string) *executor.ConsulContext { - cfg := consul.ConsulConfig{ + cfg := consul.AgentConfig{ Addr: clientConfig.ReadDefault("consul.address", "127.0.0.1:8500"), Token: clientConfig.Read("consul.token"), Auth: clientConfig.Read("consul.auth"), diff --git a/command/agent/agent.go b/command/agent/agent.go index c7e623d8348..37d2580c00e 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -30,7 +30,7 @@ type Agent struct { logOutput io.Writer consulService *consul.ConsulService // consulService registers the Nomad agent with the consul agent - consulConfig *consul.ConsulConfig // consulConfig is the consul configuration the Nomad client uses to connect with Consul agent + consulConfig *consul.AgentConfig // consulConfig is the configuration the Nomad client uses to connect with Consul agent serverHTTPAddr string clientHTTPAddr string @@ -488,7 +488,7 @@ func (a *Agent) Stats() map[string]map[string]string { } func (a *Agent) createConsulConfig() { - cfg := &consul.ConsulConfig{ + cfg := &consul.AgentConfig{ Addr: a.config.ConsulConfig.Addr, Token: a.config.ConsulConfig.Token, Auth: a.config.ConsulConfig.Auth, From 4fed6b0ab5ccd309766ee1bec74e862f80333311 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sat, 21 May 2016 16:37:21 -0500 Subject: [PATCH 008/166] Rename client/config/config's ConsulConfig to ConsulAgentConfig A follow up commit to the previous rename. More to come. 
--- client/client.go | 2 +- client/client_test.go | 2 +- client/config/config.go | 5 +++-- command/agent/agent.go | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/client/client.go b/client/client.go index eecb27d9fbd..91a86a57a0c 100644 --- a/client/client.go +++ b/client/client.go @@ -1268,7 +1268,7 @@ func (c *Client) addAlloc(alloc *structs.Allocation) error { // setupConsulClient creates a ConsulService func (c *Client) setupConsulClient() error { - cs, err := consul.NewConsulService(c.config.ConsulConfig, c.logger) + cs, err := consul.NewConsulService(c.config.ConsulAgentConfig, c.logger) c.consulService = cs return err } diff --git a/client/client_test.go b/client/client_test.go index e0c7d7177c8..6e7553ce3e7 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -72,7 +72,7 @@ func testServer(t *testing.T, cb func(*nomad.Config)) (*nomad.Server, string) { func testClient(t *testing.T, cb func(c *config.Config)) *Client { conf := DefaultConfig() conf.DevMode = true - conf.ConsulConfig = &consul.AgentConfig{} + conf.ConsulAgentConfig = &consul.AgentConfig{} if cb != nil { cb(conf) } diff --git a/client/config/config.go b/client/config/config.go index e1e2ae892b5..4f392ec806b 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -110,8 +110,9 @@ type Config struct { // Revision is the commit number of the Nomad client Revision string - // ConsulConfig is the configuration to connect with Consul Agent - ConsulConfig *consul.AgentConfig + // ConsulAgentConfig is the configuration to connect with Consul + // Agent + ConsulAgentConfig *consul.AgentConfig // StatsDataPoints is the number of resource usage data points the Nomad // client keeps in memory diff --git a/command/agent/agent.go b/command/agent/agent.go index 37d2580c00e..53c821b0c01 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -272,7 +272,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { conf.Version = fmt.Sprintf("%s%s", a.config.Version, a.config.VersionPrerelease) conf.Revision = a.config.Revision - conf.ConsulConfig = a.consulConfig + conf.ConsulAgentConfig = a.consulConfig conf.StatsDataPoints = a.config.Client.StatsConfig.DataPoints conf.StatsCollectionInterval = a.config.Client.StatsConfig.collectionInterval From 1f7bfb30d91a5810f67a17f153ca8948d824f2db Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sat, 21 May 2016 16:51:22 -0500 Subject: [PATCH 009/166] Rename consulConfig to consulAgentConfig --- command/agent/agent.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 53c821b0c01..42ae2dac591 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -30,7 +30,7 @@ type Agent struct { logOutput io.Writer consulService *consul.ConsulService // consulService registers the Nomad agent with the consul agent - consulConfig *consul.AgentConfig // consulConfig is the configuration the Nomad client uses to connect with Consul agent + consulAgentConfig *consul.AgentConfig // consulAgentConfig is the configuration the Nomad client uses to connect with Consul agent serverHTTPAddr string clientHTTPAddr string @@ -58,7 +58,7 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { // creating the consul client configuration that both the server and client // uses - a.createConsulConfig() + a.createAgentConfig() if err := a.setupServer(); err != nil { return nil, err @@ -272,7 +272,7 @@ func (a *Agent) clientConfig() 
(*clientconfig.Config, error) { conf.Version = fmt.Sprintf("%s%s", a.config.Version, a.config.VersionPrerelease) conf.Revision = a.config.Revision - conf.ConsulAgentConfig = a.consulConfig + conf.ConsulAgentConfig = a.consulAgentConfig conf.StatsDataPoints = a.config.Client.StatsConfig.DataPoints conf.StatsCollectionInterval = a.config.Client.StatsConfig.collectionInterval @@ -487,7 +487,7 @@ func (a *Agent) Stats() map[string]map[string]string { return stats } -func (a *Agent) createConsulConfig() { +func (a *Agent) createAgentConfig() { cfg := &consul.AgentConfig{ Addr: a.config.ConsulConfig.Addr, Token: a.config.ConsulConfig.Token, @@ -498,12 +498,12 @@ func (a *Agent) createConsulConfig() { CertFile: a.config.ConsulConfig.CertFile, KeyFile: a.config.ConsulConfig.KeyFile, } - a.consulConfig = cfg + a.consulAgentConfig = cfg } // syncAgentServicesWithConsul syncs the client and server services with Consul func (a *Agent) syncAgentServicesWithConsul(clientHttpAddr string, serverHttpAddr string) error { - cs, err := consul.NewConsulService(a.consulConfig, a.logger) + cs, err := consul.NewConsulService(a.consulAgentConfig, a.logger) if err != nil { return err } From 8e256471a97ac0b2aa17a58a4ca4d670b3f4b63d Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sat, 21 May 2016 17:03:46 -0500 Subject: [PATCH 010/166] Rename ConsulConfig to ConsulAgentConfig --- client/driver/executor/executor.go | 9 +++++---- client/driver/utils.go | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index a4444aa0520..77d94fad6d7 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -58,10 +58,11 @@ type Executor interface { Stats() (*cstructs.TaskResourceUsage, error) } -// ConsulContext holds context to configure the consul client and run checks +// ConsulContext holds context to configure the Consul client and run checks type ConsulContext struct { - // ConsulConfig is the configuration used to create a consul client - ConsulConfig *consul.AgentConfig + // ConsulAgentConfig contains the configuration information for + // talking with this Nomad Agent's Consul Agent. 
+ ConsulAgentConfig *consul.AgentConfig // ContainerID is the ID of the container ContainerID string @@ -470,7 +471,7 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { e.logger.Printf("[INFO] executor: registering services") e.consulCtx = ctx if e.consulService == nil { - cs, err := consul.NewConsulService(ctx.ConsulConfig, e.logger) + cs, err := consul.NewConsulService(ctx.ConsulAgentConfig, e.logger) if err != nil { return err } diff --git a/client/driver/utils.go b/client/driver/utils.go index 4994671f07d..0584d351d3b 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -84,7 +84,7 @@ func consulContext(clientConfig *config.Config, containerID string) *executor.Co KeyFile: clientConfig.Read("consul.tls_key_file"), } return &executor.ConsulContext{ - ConsulConfig: &cfg, + ConsulAgentConfig: &cfg, ContainerID: containerID, DockerEndpoint: clientConfig.Read("docker.endpoint"), TLSCa: clientConfig.Read("docker.tls.ca"), From af72200cf4c0abeef02541d6a07b6cce478c00e6 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sat, 21 May 2016 17:39:45 -0700 Subject: [PATCH 011/166] Distill config.Config.ConsulConfig down to config.Config.Consul The enclosed struct provides the necessary context --- command/agent/agent.go | 24 ++++++++++++------------ command/agent/agent_test.go | 2 +- command/agent/command.go | 2 +- command/agent/config.go | 28 +++++++++++++++++----------- command/agent/config_parse.go | 2 +- command/agent/config_parse_test.go | 2 +- 6 files changed, 33 insertions(+), 27 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 42ae2dac591..935a205bca9 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -489,14 +489,14 @@ func (a *Agent) Stats() map[string]map[string]string { func (a *Agent) createAgentConfig() { cfg := &consul.AgentConfig{ - Addr: a.config.ConsulConfig.Addr, - Token: a.config.ConsulConfig.Token, - Auth: a.config.ConsulConfig.Auth, - EnableSSL: a.config.ConsulConfig.EnableSSL, - VerifySSL: a.config.ConsulConfig.VerifySSL, - CAFile: a.config.ConsulConfig.CAFile, - CertFile: a.config.ConsulConfig.CertFile, - KeyFile: a.config.ConsulConfig.KeyFile, + Addr: a.config.Consul.Addr, + Token: a.config.Consul.Token, + Auth: a.config.Consul.Auth, + EnableSSL: a.config.Consul.EnableSSL, + VerifySSL: a.config.Consul.VerifySSL, + CAFile: a.config.Consul.CAFile, + CertFile: a.config.Consul.CertFile, + KeyFile: a.config.Consul.KeyFile, } a.consulAgentConfig = cfg } @@ -509,20 +509,20 @@ func (a *Agent) syncAgentServicesWithConsul(clientHttpAddr string, serverHttpAdd } a.consulService = cs var services []*structs.Service - if a.client != nil && a.config.ConsulConfig.ClientServiceName != "" { + if a.client != nil && a.config.Consul.ClientServiceName != "" { if err != nil { return err } clientService := &structs.Service{ - Name: a.config.ConsulConfig.ClientServiceName, + Name: a.config.Consul.ClientServiceName, PortLabel: clientHttpAddr, } services = append(services, clientService) cs.SetServiceIdentifier("agent-client") } - if a.server != nil && a.config.ConsulConfig.ServerServiceName != "" { + if a.server != nil && a.config.Consul.ServerServiceName != "" { serverService := &structs.Service{ - Name: a.config.ConsulConfig.ServerServiceName, + Name: a.config.Consul.ServerServiceName, PortLabel: serverHttpAddr, } services = append(services, serverService) diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 23662ef4477..2c864e27683 100644 --- a/command/agent/agent_test.go +++ 
b/command/agent/agent_test.go @@ -42,7 +42,7 @@ func makeAgent(t testing.TB, cb func(*Config)) (string, *Agent) { Serf: getPort(), } conf.NodeName = fmt.Sprintf("Node %d", conf.Ports.RPC) - conf.ConsulConfig = &ConsulConfig{} + conf.Consul = &ConsulConfig{} // Tighten the Serf timing config.SerfConfig.MemberlistConfig.SuspicionMult = 2 diff --git a/command/agent/command.go b/command/agent/command.go index 129bb239863..542565a15b5 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -60,7 +60,7 @@ func (c *Command) readConfig() *Config { // Make a new, empty config. cmdConfig := &Config{ Atlas: &AtlasConfig{}, - ConsulConfig: &ConsulConfig{}, + Consul: &Consul{}, Client: &ClientConfig{}, Ports: &Ports{}, Server: &ServerConfig{}, diff --git a/command/agent/config.go b/command/agent/config.go index be0cd0db26b..dfdf3797a39 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -82,9 +82,10 @@ type Config struct { // AtlasConfig is used to configure Atlas Atlas *AtlasConfig `mapstructure:"atlas"` - // ConsulConfig is used to configure Consul clients and register the nomad - // server and client services with Consul - ConsulConfig *ConsulConfig `mapstructure:"consul"` + // Consul contains the configuration for the Consul Agent and + // parameters necessary to register services, their checks, and + // discover the current Nomad servers. + Consul *ConsulConfig `mapstructure:"consul"` // NomadConfig is used to override the default config. // This is largly used for testing purposes. @@ -128,8 +129,13 @@ type AtlasConfig struct { Endpoint string `mapstructure:"endpoint"` } -// ConsulConfig is used to configure Consul clients and register the nomad -// server and client services with Consul +// ConsulConfig contains the configuration information necessary to +// communicate with a Consul Agent in order to: +// +// - Register services and checks with Consul +// +// - Bootstrap this Nomad Client with the list of Nomad Servers registered +// with Consul type ConsulConfig struct { // ServerServiceName is the name of the service that Nomad uses to register @@ -439,7 +445,7 @@ func DefaultConfig() *Config { Addresses: &Addresses{}, AdvertiseAddrs: &AdvertiseAddrs{}, Atlas: &AtlasConfig{}, - ConsulConfig: &ConsulConfig{ + Consul: &ConsulConfig{ ServerServiceName: "nomad-server", ClientServiceName: "nomad-client", }, @@ -593,11 +599,11 @@ func (c *Config) Merge(b *Config) *Config { } // Apply the Consul Configuration - if result.ConsulConfig == nil && b.ConsulConfig != nil { - consulConfig := *b.ConsulConfig - result.ConsulConfig = &consulConfig - } else if b.ConsulConfig != nil { - result.ConsulConfig = result.ConsulConfig.Merge(b.ConsulConfig) + if result.Consul == nil && b.Consul != nil { + consulConfig := *b.Consul + result.Consul = &consulConfig + } else if b.Consul != nil { + result.Consul = result.Consul.Merge(b.Consul) } // Merge config files lists diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 4ecbab43117..07aecaa1336 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -169,7 +169,7 @@ func parseConfig(result *Config, list *ast.ObjectList) error { // Parse the consul config if o := list.Filter("consul"); len(o.Items) > 0 { - if err := parseConsulConfig(&result.ConsulConfig, o); err != nil { + if err := parseConsulConfig(&result.Consul, o); err != nil { return multierror.Prefix(err, "consul ->") } } diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 
6012ba88126..6153296b33b 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -100,7 +100,7 @@ func TestConfig_Parse(t *testing.T) { Join: true, Endpoint: "127.0.0.1:1234", }, - ConsulConfig: &ConsulConfig{ + Consul: &ConsulConfig{ ServerServiceName: "nomad-server", ClientServiceName: "nomad-client", Addr: "127.0.0.1:9500", From 7bf6af8c2e7ee20201dd9e6220c6a8b9c21622e3 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sat, 21 May 2016 18:06:35 -0700 Subject: [PATCH 012/166] Fix copy pasta comment. These parameters are used to bootstrap Nomad servers, not Consul servers. --- nomad/config.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nomad/config.go b/nomad/config.go index 1773921984f..d998045d9ab 100644 --- a/nomad/config.go +++ b/nomad/config.go @@ -44,14 +44,14 @@ var ( // Config is used to parameterize the server type Config struct { - // Bootstrap mode is used to bring up the first Consul server. - // It is required so that it can elect a leader without any - // other nodes being present + // Bootstrap mode is used to bring up the first Nomad server. It is + // required so that it can elect a leader without any other nodes + // being present Bootstrap bool - // BootstrapExpect mode is used to automatically bring up a collection of - // Consul servers. This can be used to automatically bring up a collection - // of nodes. + // BootstrapExpect mode is used to automatically bring up a + // collection of Nomad servers. This can be used to automatically + // bring up a collection of nodes. BootstrapExpect int // DataDir is the directory to store our state in From 092e07f334046308674f6c368cc331889c4c5c39 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sat, 21 May 2016 20:15:58 -0700 Subject: [PATCH 013/166] Create a `nomad/structs/config` to break an import cycle. Flattening and normalizing the various Consul config structures and services has led to an import cycle. Break this by creating a new package that is intended to be terminal in the import DAG. 
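The pattern is the usual one for breaking a cycle: the new package holds plain data structs, imports nothing else from Nomad, and every former participant in the cycle depends on it rather than on a peer. A simplified sketch of the resulting import direction (two files shown together, field list trimmed; only the ConsulConfig type and one consumer from this patch appear):

	// nomad/structs/config/consul.go: terminal package, no Nomad imports.
	package config

	type ConsulConfig struct {
		Addr  string `mapstructure:"addr"`
		Token string `mapstructure:"token"`
	}

	// command/agent/config.go: depends on the terminal package, never the
	// reverse, so packages that share this struct no longer have to import
	// each other.
	package agent

	import "github.com/hashicorp/nomad/nomad/structs/config"

	type Config struct {
		Consul *config.ConsulConfig `mapstructure:"consul"`
	}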
--- command/agent/agent_test.go | 3 +- command/agent/config.go | 108 ++--------------------------- command/agent/config_parse.go | 5 +- command/agent/config_parse_test.go | 4 +- nomad/structs/config/README.md | 7 ++ nomad/structs/config/consul.go | 103 +++++++++++++++++++++++++++ 6 files changed, 122 insertions(+), 108 deletions(-) create mode 100644 nomad/structs/config/README.md create mode 100644 nomad/structs/config/consul.go diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 2c864e27683..6893c39c888 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -10,6 +10,7 @@ import ( "time" "github.com/hashicorp/nomad/nomad" + cconfig "github.com/hashicorp/nomad/nomad/structs/config" ) var nextPort uint32 = 17000 @@ -42,7 +43,7 @@ func makeAgent(t testing.TB, cb func(*Config)) (string, *Agent) { Serf: getPort(), } conf.NodeName = fmt.Sprintf("Node %d", conf.Ports.RPC) - conf.Consul = &ConsulConfig{} + conf.Consul = &cconfig.ConsulConfig{} // Tighten the Serf timing config.SerfConfig.MemberlistConfig.SuspicionMult = 2 diff --git a/command/agent/config.go b/command/agent/config.go index dfdf3797a39..e1033257b53 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -14,6 +14,7 @@ import ( client "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/nomad" + "github.com/hashicorp/nomad/nomad/structs/config" ) // Config is the configuration for the Nomad agent. @@ -85,7 +86,7 @@ type Config struct { // Consul contains the configuration for the Consul Agent and // parameters necessary to register services, their checks, and // discover the current Nomad servers. - Consul *ConsulConfig `mapstructure:"consul"` + Consul *config.ConsulConfig `mapstructure:"consul"` // NomadConfig is used to override the default config. // This is largly used for testing purposes. 
@@ -129,62 +130,6 @@ type AtlasConfig struct { Endpoint string `mapstructure:"endpoint"` } -// ConsulConfig contains the configuration information necessary to -// communicate with a Consul Agent in order to: -// -// - Register services and checks with Consul -// -// - Bootstrap this Nomad Client with the list of Nomad Servers registered -// with Consul -type ConsulConfig struct { - - // ServerServiceName is the name of the service that Nomad uses to register - // servers with Consul - ServerServiceName string `mapstructure:"server_service_name"` - - // ClientServiceName is the name of the service that Nomad uses to register - // clients with Consul - ClientServiceName string `mapstructure:"client_service_name"` - - // AutoRegister determines if Nomad will register the Nomad client and - // server agents with Consul - AutoRegister bool `mapstructure:"auto_register"` - - // Addr is the address of the local Consul agent - Addr string `mapstructure:"addr"` - - // Token is used to provide a per-request ACL token.This options overrides - // the agent's default token - Token string `mapstructure:"token"` - - // Auth is the information to use for http access to Consul agent - Auth string `mapstructure:"auth"` - - // EnableSSL sets the transport scheme to talk to the Consul agent as https - EnableSSL bool `mapstructure:"ssl"` - - // VerifySSL enables or disables SSL verification when the transport scheme - // for the consul api client is https - VerifySSL bool `mapstructure:"verify_ssl"` - - // CAFile is the path to the ca certificate used for Consul communication - CAFile string `mapstructure:"ca_file"` - - // CertFile is the path to the certificate for Consul communication - CertFile string `mapstructure:"cert_file"` - - // KeyFile is the path to the private key for Consul communication - KeyFile string `mapstructure:"key_file"` - - // ServerAutoJoin enables Nomad servers to find peers by querying Consul and - // joining them - ServerAutoJoin bool `mapstructure:"server_auto_join"` - - // ClientAutoJoin enables Nomad servers to find addresses of Nomad servers - // and register with them - ClientAutoJoin bool `mapstructure:"client_auto_join"` -} - // StatsConfig determines behavior of resource usage stats collections type StatsConfig struct { @@ -445,9 +390,10 @@ func DefaultConfig() *Config { Addresses: &Addresses{}, AdvertiseAddrs: &AdvertiseAddrs{}, Atlas: &AtlasConfig{}, - Consul: &ConsulConfig{ + Consul: &config.ConsulConfig{ ServerServiceName: "nomad-server", ClientServiceName: "nomad-client", + AutoRegister: true, }, Client: &ClientConfig{ Enabled: false, @@ -815,52 +761,6 @@ func (a *AtlasConfig) Merge(b *AtlasConfig) *AtlasConfig { return &result } -// Merge merges two Consul Configurations together. 
-func (a *ConsulConfig) Merge(b *ConsulConfig) *ConsulConfig { - result := *a - - if b.ServerServiceName != "" { - result.ServerServiceName = b.ServerServiceName - } - if b.ClientServiceName != "" { - result.ClientServiceName = b.ClientServiceName - } - if b.AutoRegister { - result.AutoRegister = true - } - if b.Addr != "" { - result.Addr = b.Addr - } - if b.Token != "" { - result.Token = b.Token - } - if b.Auth != "" { - result.Auth = b.Auth - } - if b.EnableSSL { - result.EnableSSL = true - } - if b.VerifySSL { - result.VerifySSL = true - } - if b.CAFile != "" { - result.CAFile = b.CAFile - } - if b.CertFile != "" { - result.CertFile = b.CertFile - } - if b.KeyFile != "" { - result.KeyFile = b.KeyFile - } - if b.ServerAutoJoin { - result.ServerAutoJoin = true - } - if b.ClientAutoJoin { - result.ClientAutoJoin = true - } - return &result -} - func (r *Resources) Merge(b *Resources) *Resources { result := *r if b.CPU != 0 { diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 07aecaa1336..a5a7e713afa 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -10,6 +10,7 @@ import ( "github.com/hashicorp/go-multierror" "github.com/hashicorp/hcl" "github.com/hashicorp/hcl/hcl/ast" + "github.com/hashicorp/nomad/nomad/structs/config" "github.com/mitchellh/mapstructure" ) @@ -586,7 +587,7 @@ func parseAtlas(result **AtlasConfig, list *ast.ObjectList) error { return nil } -func parseConsulConfig(result **ConsulConfig, list *ast.ObjectList) error { +func parseConsulConfig(result **config.ConsulConfig, list *ast.ObjectList) error { list = list.Elem() if len(list.Items) > 1 { return fmt.Errorf("only one 'consul' block allowed") @@ -621,7 +622,7 @@ func parseConsulConfig(result **ConsulConfig, list *ast.ObjectList) error { return err } - var consulConfig ConsulConfig + var consulConfig config.ConsulConfig if err := mapstructure.WeakDecode(m, &consulConfig); err != nil { return err } diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 6153296b33b..5f894ad103c 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -4,6 +4,8 @@ import ( "path/filepath" "reflect" "testing" + + "github.com/hashicorp/nomad/nomad/structs/config" ) func TestConfig_Parse(t *testing.T) { @@ -100,7 +102,7 @@ func TestConfig_Parse(t *testing.T) { Join: true, Endpoint: "127.0.0.1:1234", }, - Consul: &ConsulConfig{ + Consul: &config.ConsulConfig{ ServerServiceName: "nomad-server", ClientServiceName: "nomad-client", Addr: "127.0.0.1:9500", diff --git a/nomad/structs/config/README.md b/nomad/structs/config/README.md new file mode 100644 index 00000000000..c75016932d3 --- /dev/null +++ b/nomad/structs/config/README.md @@ -0,0 +1,7 @@ +# Overview + +`nomad/structs/config` is a package for configuration `struct`s that are +shared among packages that needs the same `struct` definitions, but can't +import each other without creating a cyle. This `config` package must be +terminal in the import graph (or very close to terminal in the dependency +graph). 
diff --git a/nomad/structs/config/consul.go b/nomad/structs/config/consul.go new file mode 100644 index 00000000000..758ef4d0537 --- /dev/null +++ b/nomad/structs/config/consul.go @@ -0,0 +1,103 @@ +package config + +// ConsulConfig contains the configuration information necessary to +// communicate with a Consul Agent in order to: +// +// - Register services and checks with Consul +// +// - Bootstrap this Nomad Client with the list of Nomad Servers registered +// with Consul +type ConsulConfig struct { + + // ServerServiceName is the name of the service that Nomad uses to register + // servers with Consul + ServerServiceName string `mapstructure:"server_service_name"` + + // ClientServiceName is the name of the service that Nomad uses to register + // clients with Consul + ClientServiceName string `mapstructure:"client_service_name"` + + // AutoRegister determines if Nomad will register the Nomad client and + // server agents with Consul + AutoRegister bool `mapstructure:"auto_register"` + + // Addr is the address of the local Consul agent + Addr string `mapstructure:"addr"` + + // Token is used to provide a per-request ACL token.This options overrides + // the agent's default token + Token string `mapstructure:"token"` + + // Auth is the information to use for http access to Consul agent + Auth string `mapstructure:"auth"` + + // EnableSSL sets the transport scheme to talk to the Consul agent as https + EnableSSL bool `mapstructure:"ssl"` + + // VerifySSL enables or disables SSL verification when the transport scheme + // for the consul api client is https + VerifySSL bool `mapstructure:"verify_ssl"` + + // CAFile is the path to the ca certificate used for Consul communication + CAFile string `mapstructure:"ca_file"` + + // CertFile is the path to the certificate for Consul communication + CertFile string `mapstructure:"cert_file"` + + // KeyFile is the path to the private key for Consul communication + KeyFile string `mapstructure:"key_file"` + + // ServerAutoJoin enables Nomad servers to find peers by querying Consul and + // joining them + ServerAutoJoin bool `mapstructure:"server_auto_join"` + + // ClientAutoJoin enables Nomad servers to find addresses of Nomad servers + // and register with them + ClientAutoJoin bool `mapstructure:"client_auto_join"` +} + +// Merge merges two Consul Configurations together. +func (a *ConsulConfig) Merge(b *ConsulConfig) *ConsulConfig { + result := *a + + if b.ServerServiceName != "" { + result.ServerServiceName = b.ServerServiceName + } + if b.ClientServiceName != "" { + result.ClientServiceName = b.ClientServiceName + } + if b.AutoRegister { + result.AutoRegister = true + } + if b.Addr != "" { + result.Addr = b.Addr + } + if b.Token != "" { + result.Token = b.Token + } + if b.Auth != "" { + result.Auth = b.Auth + } + if b.EnableSSL { + result.EnableSSL = true + } + if b.VerifySSL { + result.VerifySSL = true + } + if b.CAFile != "" { + result.CAFile = b.CAFile + } + if b.CertFile != "" { + result.CertFile = b.CertFile + } + if b.KeyFile != "" { + result.KeyFile = b.KeyFile + } + if b.ServerAutoJoin { + result.ServerAutoJoin = true + } + if b.ClientAutoJoin { + result.ClientAutoJoin = true + } + return &result +} From d84d71847ca5ac742a94390c700891871abff873 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sun, 22 May 2016 08:24:54 -0700 Subject: [PATCH 014/166] Rename client/consul/sync.ConsulService to client/consul/sync.Syncer Syncer describes the responsibility and actions of the type. 
--- client/client.go | 7 ++--- client/consul/sync.go | 42 +++++++++++++++--------------- client/consul/sync_test.go | 8 +++--- client/driver/executor/executor.go | 18 ++++++------- command/agent/agent.go | 26 +++++++++--------- 5 files changed, 52 insertions(+), 49 deletions(-) diff --git a/client/client.go b/client/client.go index 91a86a57a0c..fc7c7d97fbb 100644 --- a/client/client.go +++ b/client/client.go @@ -133,7 +133,8 @@ type Client struct { // allocUpdates stores allocations that need to be synced to the server. allocUpdates chan *structs.Allocation - consulService *consul.ConsulService + // consulSyncer advertises this Nomad Agent with Consul + consulSyncer *consul.Syncer // HostStatsCollector collects host resource usage stats hostStatsCollector *stats.HostStatsCollector @@ -1269,7 +1270,7 @@ func (c *Client) addAlloc(alloc *structs.Allocation) error { // setupConsulClient creates a ConsulService func (c *Client) setupConsulClient() error { cs, err := consul.NewConsulService(c.config.ConsulAgentConfig, c.logger) - c.consulService = cs + c.consulSyncer = cs return err } @@ -1311,7 +1312,7 @@ func (c *Client) syncConsul() { } } - if err := c.consulService.KeepServices(services); err != nil { + if err := c.consulSyncer.KeepServices(services); err != nil { c.logger.Printf("[DEBUG] client: error removing services from non-running tasks: %v", err) } case <-c.shutdownCh: diff --git a/client/consul/sync.go b/client/consul/sync.go index f5c45dbf580..507bab9d41c 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -18,7 +18,7 @@ import ( ) // ConsulService allows syncing of services and checks with Consul -type ConsulService struct { +type Syncer struct { client *consul.Client availble bool @@ -60,7 +60,7 @@ const ( ) // NewConsulService returns a new ConsulService -func NewConsulService(config *AgentConfig, logger *log.Logger) (*ConsulService, error) { +func NewConsulService(config *AgentConfig, serverManager *servers.Manager, logger *log.Logger) (*Syncer, error) { var err error var c *consul.Client cfg := consul.DefaultConfig() @@ -112,7 +112,7 @@ func NewConsulService(config *AgentConfig, logger *log.Logger) (*ConsulService, if c, err = consul.NewClient(cfg); err != nil { return nil, err } - consulService := ConsulService{ + consulService := Syncer{ client: c, logger: logger, trackedServices: make(map[string]*consul.AgentService), @@ -126,26 +126,26 @@ func NewConsulService(config *AgentConfig, logger *log.Logger) (*ConsulService, // SetDelegatedChecks sets the checks that nomad is going to run and report the // result back to consul -func (c *ConsulService) SetDelegatedChecks(delegateChecks map[string]struct{}, createCheck func(*structs.ServiceCheck, string) (Check, error)) *ConsulService { +func (c *Syncer) SetDelegatedChecks(delegateChecks map[string]struct{}, createCheck func(*structs.ServiceCheck, string) (Check, error)) *Syncer { c.delegateChecks = delegateChecks c.createCheck = createCheck return c } // SetAddrFinder sets a function to find the host and port for a Service given its port label -func (c *ConsulService) SetAddrFinder(addrFinder func(string) (string, int)) *ConsulService { +func (c *Syncer) SetAddrFinder(addrFinder func(string) (string, int)) *Syncer { c.addrFinder = addrFinder return c } // SetServiceIdentifier sets the identifier of the services we are syncing with Consul -func (c *ConsulService) SetServiceIdentifier(serviceIdentifier string) *ConsulService { +func (c *Syncer) SetServiceIdentifier(serviceIdentifier string) *Syncer { c.serviceIdentifier 
= serviceIdentifier return c } // SyncServices sync the services with consul -func (c *ConsulService) SyncServices(services []*structs.Service) error { +func (c *Syncer) SyncServices(services []*structs.Service) error { var mErr multierror.Error taskServices := make(map[string]*consul.AgentService) taskChecks := make(map[string]*consul.AgentCheckRegistration) @@ -217,7 +217,7 @@ func (c *ConsulService) SyncServices(services []*structs.Service) error { } // Shutdown de-registers the services and checks and shuts down periodic syncing -func (c *ConsulService) Shutdown() error { +func (c *Syncer) Shutdown() error { var mErr multierror.Error c.shutdownLock.Lock() @@ -243,7 +243,7 @@ func (c *ConsulService) Shutdown() error { // KeepServices removes services from consul which are not present in the list // of tasks passed to it -func (c *ConsulService) KeepServices(services map[string]struct{}) error { +func (c *Syncer) KeepServices(services map[string]struct{}) error { var mErr multierror.Error // Get the services from Consul @@ -265,7 +265,7 @@ func (c *ConsulService) KeepServices(services map[string]struct{}) error { } // registerCheck registers a check definition with Consul -func (c *ConsulService) registerCheck(chkReg *consul.AgentCheckRegistration) error { +func (c *Syncer) registerCheck(chkReg *consul.AgentCheckRegistration) error { if cr, ok := c.checkRunners[chkReg.ID]; ok { cr.Start() } @@ -274,7 +274,7 @@ func (c *ConsulService) registerCheck(chkReg *consul.AgentCheckRegistration) err // createCheckReg creates a Check that can be registered with Nomad. It also // creates a Nomad check for the check types that it can handle. -func (c *ConsulService) createCheckReg(check *structs.ServiceCheck, service *consul.AgentService) (*consul.AgentCheckRegistration, error) { +func (c *Syncer) createCheckReg(check *structs.ServiceCheck, service *consul.AgentService) (*consul.AgentCheckRegistration, error) { chkReg := consul.AgentCheckRegistration{ ID: check.Hash(service.ID), Name: check.Name, @@ -304,7 +304,7 @@ func (c *ConsulService) createCheckReg(check *structs.ServiceCheck, service *con } // createService creates a Consul AgentService from a Nomad Service -func (c *ConsulService) createService(service *structs.Service) (*consul.AgentService, error) { +func (c *Syncer) createService(service *structs.Service) (*consul.AgentService, error) { srv := consul.AgentService{ ID: service.ID(c.serviceIdentifier), Service: service.Name, @@ -323,7 +323,7 @@ func (c *ConsulService) createService(service *structs.Service) (*consul.AgentSe } // registerService registers a service with Consul -func (c *ConsulService) registerService(service *consul.AgentService) error { +func (c *Syncer) registerService(service *consul.AgentService) error { srvReg := consul.AgentServiceRegistration{ ID: service.ID, Name: service.Service, @@ -335,12 +335,12 @@ func (c *ConsulService) registerService(service *consul.AgentService) error { } // deregisterService de-registers a service with the given ID from consul -func (c *ConsulService) deregisterService(ID string) error { +func (c *Syncer) deregisterService(ID string) error { return c.client.Agent().ServiceDeregister(ID) } // deregisterCheck de-registers a check with a given ID from Consul. 
-func (c *ConsulService) deregisterCheck(ID string) error { +func (c *Syncer) deregisterCheck(ID string) error { // Deleting the nomad check if cr, ok := c.checkRunners[ID]; ok { cr.Stop() @@ -353,7 +353,7 @@ func (c *ConsulService) deregisterCheck(ID string) error { // PeriodicSync triggers periodic syncing of services and checks with Consul. // This is a long lived go-routine which is stopped during shutdown -func (c *ConsulService) PeriodicSync() { +func (c *Syncer) PeriodicSync() { sync := time.NewTicker(syncInterval) for { select { @@ -375,7 +375,7 @@ func (c *ConsulService) PeriodicSync() { } // performSync sync the services and checks we are tracking with Consul. -func (c *ConsulService) performSync() error { +func (c *Syncer) performSync() error { var mErr multierror.Error cServices, err := c.client.Agent().Services() if err != nil { @@ -408,7 +408,7 @@ func (c *ConsulService) performSync() error { // filterConsulServices prunes out all the service whose ids are not prefixed // with nomad- -func (c *ConsulService) filterConsulServices(srvcs map[string]*consul.AgentService) map[string]*consul.AgentService { +func (c *Syncer) filterConsulServices(srvcs map[string]*consul.AgentService) map[string]*consul.AgentService { nomadServices := make(map[string]*consul.AgentService) for _, srv := range srvcs { if strings.HasPrefix(srv.ID, structs.NomadConsulPrefix) && @@ -421,7 +421,7 @@ func (c *ConsulService) filterConsulServices(srvcs map[string]*consul.AgentServi // filterConsulChecks prunes out all the consul checks which do not have // services with id prefixed with noamd- -func (c *ConsulService) filterConsulChecks(chks map[string]*consul.AgentCheck) map[string]*consul.AgentCheck { +func (c *Syncer) filterConsulChecks(chks map[string]*consul.AgentCheck) map[string]*consul.AgentCheck { nomadChecks := make(map[string]*consul.AgentCheck) for _, chk := range chks { if strings.HasPrefix(chk.ServiceID, structs.NomadConsulPrefix) { @@ -432,13 +432,13 @@ func (c *ConsulService) filterConsulChecks(chks map[string]*consul.AgentCheck) m } // consulPresent indicates whether the consul agent is responding -func (c *ConsulService) consulPresent() bool { +func (c *Syncer) consulPresent() bool { _, err := c.client.Agent().Self() return err == nil } // runCheck runs a check and updates the corresponding ttl check in consul -func (c *ConsulService) runCheck(check Check) { +func (c *Syncer) runCheck(check Check) { res := check.Run() if res.Duration >= check.Timeout() { c.logger.Printf("[DEBUG] consul.sync: check took time: %v, timeout: %v", res.Duration, check.Timeout()) diff --git a/client/consul/sync_test.go b/client/consul/sync_test.go index 73c8e94a7fd..a94fdbceef2 100644 --- a/client/consul/sync_test.go +++ b/client/consul/sync_test.go @@ -112,9 +112,9 @@ func TestConsulServiceUpdateService(t *testing.T) { } } -func servicesPresent(t *testing.T, serviceIDs []string, consulService *ConsulService) error { +func servicesPresent(t *testing.T, serviceIDs []string, syncer *Syncer) error { var mErr multierror.Error - services, err := consulService.client.Agent().Services() + services, err := syncer.client.Agent().Services() if err != nil { t.Fatalf("err: %v", err) } @@ -127,9 +127,9 @@ func servicesPresent(t *testing.T, serviceIDs []string, consulService *ConsulSer return mErr.ErrorOrNil() } -func checksPresent(t *testing.T, checkIDs []string, consulService *ConsulService) error { +func checksPresent(t *testing.T, checkIDs []string, syncer *Syncer) error { var mErr multierror.Error - checks, err := 
consulService.client.Agent().Checks() + checks, err := syncer.client.Agent().Checks() if err != nil { t.Fatalf("err: %v", err) } diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 77d94fad6d7..54f597fee89 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -189,7 +189,7 @@ type UniversalExecutor struct { cgPaths map[string]string cgLock sync.Mutex - consulService *consul.ConsulService + consulSyncer *consul.Syncer consulCtx *ConsulContext totalCpuStats *stats.CpuStats userCpuStats *stats.CpuStats @@ -354,8 +354,8 @@ func (e *UniversalExecutor) UpdateTask(task *structs.Task) error { e.lre.FileSize = fileSize // Re-syncing task with consul service - if e.consulService != nil { - if err := e.consulService.SyncServices(task.Services); err != nil { + if e.consulSyncer != nil { + if err := e.consulSyncer.SyncServices(task.Services); err != nil { return err } } @@ -470,7 +470,7 @@ func (e *UniversalExecutor) ShutDown() error { func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { e.logger.Printf("[INFO] executor: registering services") e.consulCtx = ctx - if e.consulService == nil { + if e.consulSyncer == nil { cs, err := consul.NewConsulService(ctx.ConsulAgentConfig, e.logger) if err != nil { return err @@ -478,13 +478,13 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { cs.SetDelegatedChecks(e.createCheckMap(), e.createCheck) cs.SetServiceIdentifier(consul.GenerateServiceIdentifier(e.ctx.AllocID, e.ctx.Task.Name)) cs.SetAddrFinder(e.ctx.Task.FindHostAndPortFor) - e.consulService = cs + e.consulSyncer = cs } if e.ctx != nil { e.interpolateServices(e.ctx.Task) } - err := e.consulService.SyncServices(e.ctx.Task.Services) - go e.consulService.PeriodicSync() + err := e.consulSyncer.SyncServices(e.ctx.Task.Services) + go e.consulSyncer.PeriodicSync() return err } @@ -492,8 +492,8 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { // running from Consul func (e *UniversalExecutor) DeregisterServices() error { e.logger.Printf("[INFO] executor: de-registering services and shutting down consul service") - if e.consulService != nil { - return e.consulService.Shutdown() + if e.consulSyncer != nil { + return e.consulSyncer.Shutdown() } return nil } diff --git a/command/agent/agent.go b/command/agent/agent.go index 935a205bca9..2aa05e19513 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -29,10 +29,10 @@ type Agent struct { logger *log.Logger logOutput io.Writer - consulService *consul.ConsulService // consulService registers the Nomad agent with the consul agent - consulAgentConfig *consul.AgentConfig // consulAgentConfig is the configuration the Nomad client uses to connect with Consul agent - serverHTTPAddr string - clientHTTPAddr string + consulSyncer *consul.Syncer // consulSyncer registers the Nomad agent with the Consul Agent + consulAgentConfig *consul.AgentConfig // consulAgentConfig is the configuration the Nomad client uses to connect with Consul agent + serverHTTPAddr string + clientHTTPAddr string server *nomad.Server client *client.Client @@ -69,14 +69,15 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { if a.client == nil && a.server == nil { return nil, fmt.Errorf("must have at least client or server mode enabled") } - if a.config.ConsulConfig.AutoRegister { + if a.config.Consul.AutoRegister { if err := a.syncAgentServicesWithConsul(a.serverHTTPAddr, a.clientHTTPAddr); err != nil { a.logger.Printf("[ERR] agent: 
unable to sync agent services with consul: %v", err) } - if a.consulService != nil { - go a.consulService.PeriodicSync() + if a.consulSyncer != nil { + go a.consulSyncer.PeriodicSync() } } + return a, nil } @@ -190,7 +191,7 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { } // clientConfig is used to generate a new client configuration struct -// for initializing a nomad client. +// for initializing a Nomad client. func (a *Agent) clientConfig() (*clientconfig.Config, error) { // Setup the configuration conf := a.config.ClientConfig @@ -438,8 +439,8 @@ func (a *Agent) Shutdown() error { } } - if a.consulService != nil { - if err := a.consulService.Shutdown(); err != nil { + if a.consulSyncer != nil { + if err := a.consulSyncer.Shutdown(); err != nil { a.logger.Printf("[ERR] agent: shutting down consul service failed: %v", err) } } @@ -501,13 +502,14 @@ func (a *Agent) createAgentConfig() { a.consulAgentConfig = cfg } -// syncAgentServicesWithConsul syncs the client and server services with Consul +// syncAgentServicesWithConsul syncs this Nomad Agent's services with Consul +// when running in either Client or Server mode. func (a *Agent) syncAgentServicesWithConsul(clientHttpAddr string, serverHttpAddr string) error { cs, err := consul.NewConsulService(a.consulAgentConfig, a.logger) if err != nil { return err } - a.consulService = cs + a.consulSyncer = cs var services []*structs.Service if a.client != nil && a.config.Consul.ClientServiceName != "" { if err != nil { From f280c596337b19dd370e905fb6aa73821b46091f Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 07:24:00 -0700 Subject: [PATCH 015/166] Rename client/consul/sync.PeriodicSync to Run --- client/consul/sync.go | 6 +++--- client/driver/executor/executor.go | 2 +- command/agent/agent.go | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index 507bab9d41c..5e8cdf36031 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -351,10 +351,10 @@ func (c *Syncer) deregisterCheck(ID string) error { return c.client.Agent().CheckDeregister(ID) } -// PeriodicSync triggers periodic syncing of services and checks with Consul. -// This is a long lived go-routine which is stopped during shutdown -func (c *Syncer) PeriodicSync() { sync := time.NewTicker(syncInterval) +// Run triggers periodic syncing of services and checks with Consul. This is +// a long lived go-routine which is stopped during shutdown. 
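Run keeps the shape PeriodicSync had: a ticker drives each sync pass until the shutdown channel closes. The generic form of that loop, as a minimal sketch with placeholder names (interval, doWork, and shutdownCh are not identifiers from this package):

package example

import "time"

// runPeriodic calls doWork on every tick until shutdownCh is closed,
// mirroring the ticker/select shape of the Run method that follows.
func runPeriodic(interval time.Duration, shutdownCh <-chan struct{}, doWork func() error) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			// The real implementation logs sync errors; this sketch drops them.
			_ = doWork()
		case <-shutdownCh:
			return
		}
	}
}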
+func (c *Syncer) Run() { for { select { case <-sync.C: diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 54f597fee89..3b9ac1b46a4 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -484,7 +484,7 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { e.interpolateServices(e.ctx.Task) } err := e.consulSyncer.SyncServices(e.ctx.Task.Services) - go e.consulSyncer.PeriodicSync() + go e.consulSyncer.Run() return err } diff --git a/command/agent/agent.go b/command/agent/agent.go index 2aa05e19513..8807d8d1966 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -74,7 +74,7 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { a.logger.Printf("[ERR] agent: unable to sync agent services with consul: %v", err) } if a.consulSyncer != nil { - go a.consulSyncer.PeriodicSync() + go a.consulSyncer.Run() } } From 7c6ad53d89792b3d5691f7e3ff4cdeb517a5e78e Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 07:29:08 -0700 Subject: [PATCH 016/166] Rename NewConsulService to NewSyncer --- client/client.go | 4 ++-- client/consul/sync.go | 4 ++-- client/consul/sync_test.go | 4 ++-- client/driver/executor/executor.go | 2 +- command/agent/agent.go | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/client/client.go b/client/client.go index fc7c7d97fbb..8098e16ee99 100644 --- a/client/client.go +++ b/client/client.go @@ -1267,9 +1267,9 @@ func (c *Client) addAlloc(alloc *structs.Allocation) error { return nil } -// setupConsulClient creates a ConsulService +// setupConsulClient creates a consul.Syncer func (c *Client) setupConsulClient() error { - cs, err := consul.NewConsulService(c.config.ConsulAgentConfig, c.logger) + cs, err := consul.NewSyncer(c.config.ConsulAgentConfig, c.logger) c.consulSyncer = cs return err } diff --git a/client/consul/sync.go b/client/consul/sync.go index 5e8cdf36031..2aa79192428 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -59,8 +59,8 @@ const ( ttlCheckBuffer = 31 * time.Second ) -// NewConsulService returns a new ConsulService -func NewConsulService(config *AgentConfig, serverManager *servers.Manager, logger *log.Logger) (*Syncer, error) { +// NewSyncer returns a new consul.Syncer +func NewSyncer(config *AgentConfig, logger *log.Logger) (*Syncer, error) { var err error var c *consul.Client cfg := consul.DefaultConfig() diff --git a/client/consul/sync_test.go b/client/consul/sync_test.go index a94fdbceef2..735b77b0727 100644 --- a/client/consul/sync_test.go +++ b/client/consul/sync_test.go @@ -41,7 +41,7 @@ var ( ) func TestConsulServiceRegisterServices(t *testing.T) { - cs, err := NewConsulService(&AgentConfig{}, logger) + cs, err := NewSyncer(&AgentConfig{}, logger) if err != nil { t.Fatalf("Err: %v", err) } @@ -68,7 +68,7 @@ func TestConsulServiceRegisterServices(t *testing.T) { } func TestConsulServiceUpdateService(t *testing.T) { - cs, err := NewConsulService(&AgentConfig{}, logger) + cs, err := NewSyncer(&AgentConfig{}, logger) if err != nil { t.Fatalf("Err: %v", err) } diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 3b9ac1b46a4..348733dd700 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -471,7 +471,7 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { e.logger.Printf("[INFO] executor: registering services") e.consulCtx = ctx if e.consulSyncer == nil { - cs, err := 
consul.NewConsulService(ctx.ConsulAgentConfig, e.logger) + cs, err := consul.NewSyncer(ctx.ConsulAgentConfig, e.logger) if err != nil { return err } diff --git a/command/agent/agent.go b/command/agent/agent.go index 8807d8d1966..5ff90c6b5de 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -505,7 +505,7 @@ func (a *Agent) createAgentConfig() { // syncAgentServicesWithConsul syncs this Nomad Agent's services with Consul // when running in either Client or Server mode. func (a *Agent) syncAgentServicesWithConsul(clientHttpAddr string, serverHttpAddr string) error { - cs, err := consul.NewConsulService(a.consulAgentConfig, a.logger) + cs, err := consul.NewSyncer(a.consulAgentConfig, a.logger) if err != nil { return err } From ffcd2332d48126d62fcca53699f7d8a7452fe5cf Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 11:09:31 -0700 Subject: [PATCH 017/166] Rebalance Nomad client RPCs among different Nomad servers. Implement client/rpc_proxy.RpcProxy. --- client/client.go | 159 ++--- client/client_test.go | 52 -- client/consul/sync.go | 6 +- client/driver/utils.go | 10 +- client/rpc_proxy/manager_internal_test.go | 353 ++++++++++ client/rpc_proxy/manager_test.go | 387 +++++++++++ client/rpc_proxy/rpc_proxy.go | 761 ++++++++++++++++++++++ client/rpc_proxy/server_endpoint.go | 81 +++ command/agent/agent.go | 28 +- command/agent/agent_endpoint.go | 17 +- command/agent/command.go | 11 +- nomad/heartbeat.go | 12 + nomad/node_endpoint.go | 28 +- nomad/node_endpoint_test.go | 6 + nomad/pool.go | 25 + nomad/server.go | 12 +- nomad/structs/config/consul.go | 3 +- nomad/structs/structs.go | 25 + 18 files changed, 1791 insertions(+), 185 deletions(-) create mode 100644 client/rpc_proxy/manager_internal_test.go create mode 100644 client/rpc_proxy/manager_test.go create mode 100644 client/rpc_proxy/rpc_proxy.go create mode 100644 client/rpc_proxy/server_endpoint.go diff --git a/client/client.go b/client/client.go index 8098e16ee99..f3bd5d2be43 100644 --- a/client/client.go +++ b/client/client.go @@ -4,11 +4,9 @@ import ( "fmt" "io/ioutil" "log" - "net" "os" "path/filepath" "strconv" - "strings" "sync" "time" @@ -20,6 +18,7 @@ import ( "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/driver" "github.com/hashicorp/nomad/client/fingerprint" + "github.com/hashicorp/nomad/client/rpc_proxy" "github.com/hashicorp/nomad/client/stats" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" @@ -46,6 +45,9 @@ const ( // devModeRetryIntv is the retry interval used for development devModeRetryIntv = time.Second + // rpcVersion specifies the RPC version + rpcVersion = 1 + // stateSnapshotIntv is how often the client snapshots state stateSnapshotIntv = 60 * time.Second @@ -113,12 +115,7 @@ type Client struct { logger *log.Logger - lastServer net.Addr - lastRPCTime time.Time - lastServerLock sync.Mutex - - servers []string - serverLock sync.RWMutex + rpcProxy *rpc_proxy.RpcProxy connPool *nomad.ConnPool @@ -192,8 +189,12 @@ func NewClient(cfg *config.Config) (*Client, error) { // Setup the reserved resources c.reservePorts() - // Set up the known servers list - c.SetServers(c.config.Servers) + // Create the RPC Proxy and bootstrap with the preconfigured list of + // static servers + c.rpcProxy = rpc_proxy.NewRpcProxy(c.logger, c.shutdownCh, c, c.connPool) + for _, serverAddr := range c.config.Servers { + c.rpcProxy.AddPrimaryServer(serverAddr) + } // Store the config copy before 
restoring state but after it has been // initialized. @@ -224,6 +225,9 @@ func NewClient(cfg *config.Config) (*Client, error) { // Start collecting stats go c.collectHostStats() + // Start maintenance task for servers + go c.rpcProxy.Run() + // Start the consul sync go c.syncConsul() @@ -273,6 +277,16 @@ func (c *Client) Leave() error { return nil } +// Region returns the region for the given client +func (c *Client) Region() string { + return c.config.Region +} + +// Region returns the rpcVersion in use by the client +func (c *Client) RPCVersion() int { + return rpcVersion +} + // Shutdown is used to tear down the client func (c *Client) Shutdown() error { c.logger.Printf("[INFO] client: shutting down") @@ -299,104 +313,24 @@ func (c *Client) Shutdown() error { // RPC is used to forward an RPC call to a nomad server, or fail if no servers func (c *Client) RPC(method string, args interface{}, reply interface{}) error { - // Invoke the RPCHandle if it exists + // Invoke the RPCHandler if it exists if c.config.RPCHandler != nil { return c.config.RPCHandler.RPC(method, args, reply) } // Pick a server to request from - addr, err := c.pickServer() - if err != nil { - return err + server := c.rpcProxy.FindServer() + if server == nil { + return fmt.Errorf("no known servers") } // Make the RPC request - err = c.connPool.RPC(c.config.Region, addr, 1, method, args, reply) - - // Update the last server information - c.lastServerLock.Lock() - if err != nil { - c.lastServer = nil - c.lastRPCTime = time.Time{} - } else { - c.lastServer = addr - c.lastRPCTime = time.Now() - } - c.lastServerLock.Unlock() - return err -} - -// pickServer is used to pick a target RPC server -func (c *Client) pickServer() (net.Addr, error) { - c.lastServerLock.Lock() - defer c.lastServerLock.Unlock() - - // Check for a valid last-used server - if c.lastServer != nil && time.Now().Sub(c.lastRPCTime) < clientRPCCache { - return c.lastServer, nil - } - - // Bail if we can't find any servers - servers := c.Servers() - if len(servers) == 0 { - return nil, fmt.Errorf("no known servers") - } - - // Shuffle so we don't always use the same server - shuffleStrings(servers) - - // Try to resolve each server - for i := 0; i < len(servers); i++ { - addr, err := net.ResolveTCPAddr("tcp", servers[i]) - if err == nil { - c.lastServer = addr - c.lastRPCTime = time.Now() - return addr, nil - } - c.logger.Printf("[WARN] client: failed to resolve '%s': %s", servers[i], err) - } - - // Bail if we reach this point - return nil, fmt.Errorf("failed to resolve any servers") -} - -// Servers is used to return the current known servers list. When an agent -// is first started, this list comes directly from configuration files. -func (c *Client) Servers() []string { - c.serverLock.RLock() - defer c.serverLock.RUnlock() - return c.servers -} - -// SetServers is used to modify the known servers list. This avoids forcing -// a config rollout + rolling restart and enables auto-join features. The -// full set of servers is passed to support adding and/or removing servers. -func (c *Client) SetServers(servers []string) { - c.serverLock.Lock() - defer c.serverLock.Unlock() - if servers == nil { - servers = make([]string, 0) - } - // net.ResolveTCPAddr requires port to be set, if one is not provided, supply default port - // Using net.SplitHostPort in the event of IPv6 addresses with multiple colons. 
- // IPv6 addresses must be passed in with brackets, - // i.e: [::1]:4647 or [::1] - setServers := make([]string, len(servers)) - copy(setServers, servers) - for i := 0; i < len(setServers); i++ { - if _, _, err := net.SplitHostPort(setServers[i]); err != nil { - // multiple errors can be returned here, only searching for missing - if strings.Contains(err.Error(), "missing port") { - c.logger.Printf("[WARN] client: port not specified, using default port") - setServers[i] = net.JoinHostPort(setServers[i], "4647") - } else { - c.logger.Printf("[WARN] client: server address %q invalid: %v", setServers[i], err) - } - } + if err := c.connPool.RPC(c.Region(), server.Addr, rpcVersion, method, args, reply); err != nil { + c.rpcProxy.NotifyFailedServer(server) + c.logger.Printf("[ERR] client: RPC failed to server %s: %v", server.Addr, err) + return err } - - c.logger.Printf("[INFO] client: setting server address list: %s", setServers) - c.servers = setServers + return nil } // Stats is used to return statistics for debugging and insight @@ -412,7 +346,7 @@ func (c *Client) Stats() map[string]map[string]string { stats := map[string]map[string]string{ "client": map[string]string{ "node_id": c.Node().ID, - "known_servers": toString(uint64(len(c.Servers()))), + "known_servers": toString(uint64(c.rpcProxy.NumServers())), "num_allocations": toString(uint64(numAllocs)), "last_heartbeat": fmt.Sprintf("%v", time.Since(c.lastHeartbeat)), "heartbeat_ttl": fmt.Sprintf("%v", c.heartbeatTTL), @@ -499,6 +433,12 @@ func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) { return ar.ctx.AllocDir, nil } +// AddPrimaryServerToRpcProxy adds serverAddr to the RPC Proxy's primary +// server list. +func (c *Client) AddPrimaryServerToRpcProxy(serverAddr string) { + c.rpcProxy.AddPrimaryServer(serverAddr) +} + // restoreState is used to restore our state from the data dir func (c *Client) restoreState() error { if c.config.DevMode { @@ -905,7 +845,7 @@ func (c *Client) registerNode() error { node := c.Node() req := structs.NodeRegisterRequest{ Node: node, - WriteRequest: structs.WriteRequest{Region: c.config.Region}, + WriteRequest: structs.WriteRequest{Region: c.Region()}, } var resp structs.NodeUpdateResponse err := c.RPC("Node.Register", &req, &resp) @@ -939,7 +879,7 @@ func (c *Client) updateNodeStatus() error { req := structs.NodeUpdateStatusRequest{ NodeID: node.ID, Status: structs.NodeStatusReady, - WriteRequest: structs.WriteRequest{Region: c.config.Region}, + WriteRequest: structs.WriteRequest{Region: c.Region()}, } var resp structs.NodeUpdateResponse err := c.RPC("Node.UpdateStatus", &req, &resp) @@ -958,6 +898,11 @@ func (c *Client) updateNodeStatus() error { defer c.heartbeatLock.Unlock() c.lastHeartbeat = time.Now() c.heartbeatTTL = resp.HeartbeatTTL + + if err := c.rpcProxy.UpdateFromNodeUpdateResponse(&resp); err != nil { + return err + } + return nil } @@ -1004,7 +949,7 @@ func (c *Client) allocSync() { // Send to server. args := structs.AllocUpdateRequest{ Alloc: sync, - WriteRequest: structs.WriteRequest{Region: c.config.Region}, + WriteRequest: structs.WriteRequest{Region: c.Region()}, } var resp structs.GenericResponse @@ -1044,7 +989,7 @@ func (c *Client) watchAllocations(updates chan *allocUpdates) { req := structs.NodeSpecificRequest{ NodeID: c.Node().ID, QueryOptions: structs.QueryOptions{ - Region: c.config.Region, + Region: c.Region(), AllowStale: true, }, } @@ -1054,7 +999,7 @@ func (c *Client) watchAllocations(updates chan *allocUpdates) { // new, or updated server side. 
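Stepping back to the rewritten Client.RPC above: server selection is now delegated entirely to the proxy. Pick the server at the front of the list, attempt the call, and on failure rotate that server to the back so the next attempt lands elsewhere. A reduced sketch of that pattern, with doCall standing in for the connection pool RPC:

package example

import (
	"fmt"

	"github.com/hashicorp/nomad/client/rpc_proxy"
)

// callViaProxy shows the select/attempt/mark-failed shape used by the
// client's RPC path. doCall is a placeholder for the pooled RPC call.
func callViaProxy(p *rpc_proxy.RpcProxy, doCall func(s *rpc_proxy.ServerEndpoint) error) error {
	server := p.FindServer()
	if server == nil {
		return fmt.Errorf("no known servers")
	}
	if err := doCall(server); err != nil {
		// Rotate the failed server to the end of the list so the next
		// attempt goes to a different server.
		p.NotifyFailedServer(server)
		return err
	}
	return nil
}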
allocsReq := structs.AllocsGetRequest{ QueryOptions: structs.QueryOptions{ - Region: c.config.Region, + Region: c.Region(), AllowStale: true, }, } @@ -1374,3 +1319,7 @@ func (c *Client) emitStats(hStats *stats.HostStats) { metrics.EmitKey([]string{"disk", disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent)) } } + +func (c *Client) RpcProxy() *rpc_proxy.RpcProxy { + return c.rpcProxy +} diff --git a/client/client_test.go b/client/client_test.go index 6e7553ce3e7..5a17c07a5b7 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -7,7 +7,6 @@ import ( "net" "os" "path/filepath" - "reflect" "sync/atomic" "testing" "time" @@ -509,54 +508,3 @@ func TestClient_Init(t *testing.T) { t.Fatalf("err: %s", err) } } - -func TestClient_SetServers(t *testing.T) { - client := testClient(t, nil) - - // Sets an empty list - client.SetServers(nil) - if client.servers == nil { - t.Fatalf("should not be nil") - } - - // Set the initial servers list - expect := []string{"foo:4647"} - client.SetServers(expect) - if !reflect.DeepEqual(client.servers, expect) { - t.Fatalf("expect %v, got %v", expect, client.servers) - } - - // Add a server - expect = []string{"foo:5445", "bar:8080"} - client.SetServers(expect) - if !reflect.DeepEqual(client.servers, expect) { - t.Fatalf("expect %v, got %v", expect, client.servers) - } - - // Remove a server - expect = []string{"bar:8080"} - client.SetServers(expect) - if !reflect.DeepEqual(client.servers, expect) { - t.Fatalf("expect %v, got %v", expect, client.servers) - } - - // Add and remove a server - expect = []string{"baz:9090", "zip:4545"} - client.SetServers(expect) - if !reflect.DeepEqual(client.servers, expect) { - t.Fatalf("expect %v, got %v", expect, client.servers) - } - - // Query the servers list - if servers := client.Servers(); !reflect.DeepEqual(servers, expect) { - t.Fatalf("expect %v, got %v", expect, servers) - } - - // Add servers without ports, and remove old servers - servers := []string{"foo", "bar", "baz"} - expect = []string{"foo:4647", "bar:4647", "baz:4647"} - client.SetServers(servers) - if !reflect.DeepEqual(client.servers, expect) { - t.Fatalf("expect %v, got %v", expect, client.servers) - } -} diff --git a/client/consul/sync.go b/client/consul/sync.go index 2aa79192428..223b37faa11 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -17,7 +17,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) -// ConsulService allows syncing of services and checks with Consul +// Syncer allows syncing of services and checks with Consul type Syncer struct { client *consul.Client availble bool @@ -112,7 +112,7 @@ func NewSyncer(config *AgentConfig, logger *log.Logger) (*Syncer, error) { if c, err = consul.NewClient(cfg); err != nil { return nil, err } - consulService := Syncer{ + consulSyncer := Syncer{ client: c, logger: logger, trackedServices: make(map[string]*consul.AgentService), @@ -121,7 +121,7 @@ func NewSyncer(config *AgentConfig, logger *log.Logger) (*Syncer, error) { shutdownCh: make(chan struct{}), } - return &consulService, nil + return &consulSyncer, nil } // SetDelegatedChecks sets the checks that nomad is going to run and report the diff --git a/client/driver/utils.go b/client/driver/utils.go index 0584d351d3b..d6202dd40b0 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -85,11 +85,11 @@ func consulContext(clientConfig *config.Config, containerID string) *executor.Co } return &executor.ConsulContext{ ConsulAgentConfig: &cfg, - ContainerID: containerID, - DockerEndpoint: 
clientConfig.Read("docker.endpoint"), - TLSCa: clientConfig.Read("docker.tls.ca"), - TLSCert: clientConfig.Read("docker.tls.cert"), - TLSKey: clientConfig.Read("docker.tls.key"), + ContainerID: containerID, + DockerEndpoint: clientConfig.Read("docker.endpoint"), + TLSCa: clientConfig.Read("docker.tls.ca"), + TLSCert: clientConfig.Read("docker.tls.cert"), + TLSKey: clientConfig.Read("docker.tls.key"), } } diff --git a/client/rpc_proxy/manager_internal_test.go b/client/rpc_proxy/manager_internal_test.go new file mode 100644 index 00000000000..271d056a375 --- /dev/null +++ b/client/rpc_proxy/manager_internal_test.go @@ -0,0 +1,353 @@ +package rpc_proxy + +import ( + "bytes" + "fmt" + "log" + "math/rand" + "os" + "testing" + "time" +) + +var ( + localLogger *log.Logger + localLogBuffer *bytes.Buffer +) + +func init() { + localLogBuffer = new(bytes.Buffer) + localLogger = log.New(localLogBuffer, "", 0) +} + +func GetBufferedLogger() *log.Logger { + return localLogger +} + +type fauxConnPool struct { + // failPct between 0.0 and 1.0 == pct of time a Ping should fail + failPct float64 +} + +func (cp *fauxConnPool) PingNomadServer(region string, version int, s *ServerEndpoint) (bool, error) { + var success bool + successProb := rand.Float64() + if successProb > cp.failPct { + success = true + } + return success, nil +} + +type fauxSerf struct { + numNodes int +} + +func (s *fauxSerf) NumNodes() int { + return s.numNodes +} + +func (s *fauxSerf) Region() string { + return "global" +} + +func (s *fauxSerf) RPCVersion() int { + return 1 +} + +func testManager() (p *RpcProxy) { + logger := GetBufferedLogger() + shutdownCh := make(chan struct{}) + p = NewRpcProxy(logger, shutdownCh, &fauxSerf{numNodes: 16384}, &fauxConnPool{}) + return p +} + +func testManagerFailProb(failPct float64) (p *RpcProxy) { + logger := GetBufferedLogger() + logger = log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + p = NewRpcProxy(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) + return p +} + +// func (l *serverList) cycleServer() (servers []*Server) { +func TestManagerInternal_cycleServer(t *testing.T) { + m := testManager() + l := m.getServerList() + + server0 := &ServerEndpoint{Name: "server1"} + server1 := &ServerEndpoint{Name: "server2"} + server2 := &ServerEndpoint{Name: "server3"} + l.L = append(l.L, server0, server1, server2) + m.saveServerList(l) + + l = m.getServerList() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server0 && + l.L[1] != server1 && + l.L[2] != server2 { + t.Fatalf("initial server ordering not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server1 && + l.L[1] != server2 && + l.L[2] != server0 { + t.Fatalf("server ordering after one cycle not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server2 && + l.L[1] != server0 && + l.L[2] != server1 { + t.Fatalf("server ordering after two cycles not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server0 && + l.L[1] != server1 && + l.L[2] != server2 { + t.Fatalf("server ordering after three cycles not correct") + } +} + +// func (m *Manager) getServerList() serverList { +func TestManagerInternal_getServerList(t *testing.T) { + m := testManager() + l := m.getServerList() + if l.L == nil { + 
t.Fatalf("serverList.servers nil") + } + + if len(l.L) != 0 { + t.Fatalf("serverList.servers length not zero") + } +} + +func TestManagerInternal_NewManager(t *testing.T) { + m := testManager() + if m == nil { + t.Fatalf("Manager nil") + } + + if m.logger == nil { + t.Fatalf("Manager.logger nil") + } + + if m.shutdownCh == nil { + t.Fatalf("Manager.shutdownCh nil") + } +} + +// func (m *Manager) reconcileServerList(l *serverList) bool { +func TestManagerInternal_reconcileServerList(t *testing.T) { + tests := []int{0, 1, 2, 3, 4, 5, 10, 100} + for _, n := range tests { + ok, err := test_reconcileServerList(n) + if !ok { + t.Errorf("Expected %d to pass: %v", n, err) + } + } +} + +func test_reconcileServerList(maxServers int) (bool, error) { + // Build a server list, reconcile, verify the missing servers are + // missing, the added have been added, and the original server is + // present. + const failPct = 0.5 + m := testManagerFailProb(failPct) + + var failedServers, healthyServers []*ServerEndpoint + for i := 0; i < maxServers; i++ { + nodeName := fmt.Sprintf("s%02d", i) + + node := &ServerEndpoint{Name: nodeName} + // Add 66% of servers to Manager + if rand.Float64() > 0.33 { + m.activateEndpoint(node) + + // Of healthy servers, (ab)use connPoolPinger to + // failPct of the servers for the reconcile. This + // allows for the selected server to no longer be + // healthy for the reconcile below. + if ok, _ := m.connPoolPinger.PingNomadServer(m.configInfo.Region(), m.configInfo.RPCVersion(), node); ok { + // Will still be present + healthyServers = append(healthyServers, node) + } else { + // Will be missing + failedServers = append(failedServers, node) + } + } else { + // Will be added from the call to reconcile + healthyServers = append(healthyServers, node) + } + } + + // Randomize Manager's server list + m.RebalanceServers() + selectedServer := m.FindServer() + + var selectedServerFailed bool + for _, s := range failedServers { + if selectedServer.Key().Equal(s.Key()) { + selectedServerFailed = true + break + } + } + + // Update Manager's server list to be "healthy" based on Serf. + // Reconcile this with origServers, which is shuffled and has a live + // connection, but possibly out of date. + origServers := m.getServerList() + m.saveServerList(serverList{L: healthyServers}) + + // This should always succeed with non-zero server lists + if !selectedServerFailed && !m.reconcileServerList(&origServers) && + len(m.getServerList().L) != 0 && + len(origServers.L) != 0 { + // If the random gods are unfavorable and we end up with zero + // length lists, expect things to fail and retry the test. + return false, fmt.Errorf("Expected reconcile to succeed: %v %d %d", + selectedServerFailed, + len(m.getServerList().L), + len(origServers.L)) + } + + // If we have zero-length server lists, test succeeded in degenerate + // case. + if len(m.getServerList().L) == 0 && + len(origServers.L) == 0 { + // Failed as expected w/ zero length list + return true, nil + } + + resultingServerMap := make(map[EndpointKey]bool) + for _, s := range m.getServerList().L { + resultingServerMap[*s.Key()] = true + } + + // Test to make sure no failed servers are in the Manager's + // list. Error if there are any failedServers in l.servers + for _, s := range failedServers { + _, ok := resultingServerMap[*s.Key()] + if ok { + return false, fmt.Errorf("Found failed server %v in merged list %v", s, resultingServerMap) + } + } + + // Test to make sure all healthy servers are in the healthy list. 
+ if len(healthyServers) != len(m.getServerList().L) { + return false, fmt.Errorf("Expected healthy map and servers to match: %d/%d", len(healthyServers), len(healthyServers)) + } + + // Test to make sure all healthy servers are in the resultingServerMap list. + for _, s := range healthyServers { + _, ok := resultingServerMap[*s.Key()] + if !ok { + return false, fmt.Errorf("Server %v missing from healthy map after merged lists", s) + } + } + return true, nil +} + +// func (l *serverList) refreshServerRebalanceTimer() { +func TestManagerInternal_refreshServerRebalanceTimer(t *testing.T) { + type clusterSizes struct { + numNodes int + numServers int + minRebalance time.Duration + } + clusters := []clusterSizes{ + {0, 3, 2 * time.Minute}, + {1, 0, 2 * time.Minute}, // partitioned cluster + {1, 3, 2 * time.Minute}, + {2, 3, 2 * time.Minute}, + {100, 0, 2 * time.Minute}, // partitioned + {100, 1, 2 * time.Minute}, // partitioned + {100, 3, 2 * time.Minute}, + {1024, 1, 2 * time.Minute}, // partitioned + {1024, 3, 2 * time.Minute}, // partitioned + {1024, 5, 2 * time.Minute}, + {16384, 1, 4 * time.Minute}, // partitioned + {16384, 2, 2 * time.Minute}, // partitioned + {16384, 3, 2 * time.Minute}, // partitioned + {16384, 5, 2 * time.Minute}, + {65535, 0, 2 * time.Minute}, // partitioned + {65535, 1, 8 * time.Minute}, // partitioned + {65535, 2, 3 * time.Minute}, // partitioned + {65535, 3, 5 * time.Minute}, // partitioned + {65535, 5, 3 * time.Minute}, // partitioned + {65535, 7, 2 * time.Minute}, + {1000000, 1, 4 * time.Hour}, // partitioned + {1000000, 2, 2 * time.Hour}, // partitioned + {1000000, 3, 80 * time.Minute}, // partitioned + {1000000, 5, 50 * time.Minute}, // partitioned + {1000000, 11, 20 * time.Minute}, // partitioned + {1000000, 19, 10 * time.Minute}, + } + + logger := log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + + for _, s := range clusters { + m := NewRpcProxy(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) + for i := 0; i < s.numServers; i++ { + nodeName := fmt.Sprintf("s%02d", i) + m.activateEndpoint(&ServerEndpoint{Name: nodeName}) + } + + d := m.refreshServerRebalanceTimer() + if d < s.minRebalance { + t.Errorf("duration too short for cluster of size %d and %d servers (%s < %s)", s.numNodes, s.numServers, d, s.minRebalance) + } + } +} + +// func (m *Manager) saveServerList(l serverList) { +func TestManagerInternal_saveServerList(t *testing.T) { + m := testManager() + + // Initial condition + func() { + l := m.getServerList() + if len(l.L) != 0 { + t.Fatalf("Manager.saveServerList failed to load init config") + } + + newServer := new(ServerEndpoint) + l.L = append(l.L, newServer) + m.saveServerList(l) + }() + + // Test that save works + func() { + l1 := m.getServerList() + t1NumServers := len(l1.L) + if t1NumServers != 1 { + t.Fatalf("Manager.saveServerList failed to save mutated config") + } + }() + + // Verify mutation w/o a save doesn't alter the original + func() { + newServer := new(ServerEndpoint) + l := m.getServerList() + l.L = append(l.L, newServer) + + l_orig := m.getServerList() + origNumServers := len(l_orig.L) + if origNumServers >= len(l.L) { + t.Fatalf("Manager.saveServerList unsaved config overwrote original") + } + }() +} diff --git a/client/rpc_proxy/manager_test.go b/client/rpc_proxy/manager_test.go new file mode 100644 index 00000000000..dc8eed6d23b --- /dev/null +++ b/client/rpc_proxy/manager_test.go @@ -0,0 +1,387 @@ +package rpc_proxy_test + +import ( + "bytes" + "fmt" + "log" + "math/rand" + "os" + 
"strings" + "testing" + + "github.com/hashicorp/consul/consul/agent" + "github.com/hashicorp/consul/consul/servers" +) + +var ( + localLogger *log.Logger + localLogBuffer *bytes.Buffer +) + +func init() { + localLogBuffer = new(bytes.Buffer) + localLogger = log.New(localLogBuffer, "", 0) +} + +func GetBufferedLogger() *log.Logger { + return localLogger +} + +type fauxConnPool struct { + // failPct between 0.0 and 1.0 == pct of time a Ping should fail + failPct float64 +} + +func (cp *fauxConnPool) PingConsulServer(server *agent.Server) (bool, error) { + var success bool + successProb := rand.Float64() + if successProb > cp.failPct { + success = true + } + return success, nil +} + +type fauxSerf struct { +} + +func (s *fauxSerf) NumNodes() int { + return 16384 +} + +func testManager() (m *servers.Manager) { + logger := GetBufferedLogger() + logger = log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + m = servers.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) + return m +} + +func testManagerFailProb(failPct float64) (m *servers.Manager) { + logger := GetBufferedLogger() + logger = log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + m = servers.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) + return m +} + +// func (m *Manager) AddServer(server *agent.Server) { +func TestServers_AddServer(t *testing.T) { + m := testManager() + var num int + num = m.NumServers() + if num != 0 { + t.Fatalf("Expected zero servers to start") + } + + s1 := &agent.Server{Name: "s1"} + m.AddServer(s1) + num = m.NumServers() + if num != 1 { + t.Fatalf("Expected one server") + } + + m.AddServer(s1) + num = m.NumServers() + if num != 1 { + t.Fatalf("Expected one server (still)") + } + + s2 := &agent.Server{Name: "s2"} + m.AddServer(s2) + num = m.NumServers() + if num != 2 { + t.Fatalf("Expected two servers") + } +} + +// func (m *Manager) FindServer() (server *agent.Server) { +func TestServers_FindServer(t *testing.T) { + m := testManager() + + if m.FindServer() != nil { + t.Fatalf("Expected nil return") + } + + m.AddServer(&agent.Server{Name: "s1"}) + if m.NumServers() != 1 { + t.Fatalf("Expected one server") + } + + s1 := m.FindServer() + if s1 == nil { + t.Fatalf("Expected non-nil server") + } + if s1.Name != "s1" { + t.Fatalf("Expected s1 server") + } + + s1 = m.FindServer() + if s1 == nil || s1.Name != "s1" { + t.Fatalf("Expected s1 server (still)") + } + + m.AddServer(&agent.Server{Name: "s2"}) + if m.NumServers() != 2 { + t.Fatalf("Expected two servers") + } + s1 = m.FindServer() + if s1 == nil || s1.Name != "s1" { + t.Fatalf("Expected s1 server (still)") + } + + m.NotifyFailedServer(s1) + s2 := m.FindServer() + if s2 == nil || s2.Name != "s2" { + t.Fatalf("Expected s2 server") + } + + m.NotifyFailedServer(s2) + s1 = m.FindServer() + if s1 == nil || s1.Name != "s1" { + t.Fatalf("Expected s1 server") + } +} + +// func New(logger *log.Logger, shutdownCh chan struct{}) (m *Manager) { +func TestServers_New(t *testing.T) { + logger := GetBufferedLogger() + logger = log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + m := servers.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) + if m == nil { + t.Fatalf("Manager nil") + } +} + +// func (m *Manager) NotifyFailedServer(server *agent.Server) { +func TestServers_NotifyFailedServer(t *testing.T) { + m := testManager() + + if m.NumServers() != 0 { + t.Fatalf("Expected zero servers to start") + } + + s1 := &agent.Server{Name: "s1"} + s2 := 
&agent.Server{Name: "s2"} + + // Try notifying for a server that is not managed by Manager + m.NotifyFailedServer(s1) + if m.NumServers() != 0 { + t.Fatalf("Expected zero servers to start") + } + m.AddServer(s1) + + // Test again w/ a server not in the list + m.NotifyFailedServer(s2) + if m.NumServers() != 1 { + t.Fatalf("Expected one server") + } + + m.AddServer(s2) + if m.NumServers() != 2 { + t.Fatalf("Expected two servers") + } + + s1 = m.FindServer() + if s1 == nil || s1.Name != "s1" { + t.Fatalf("Expected s1 server") + } + + m.NotifyFailedServer(s2) + s1 = m.FindServer() + if s1 == nil || s1.Name != "s1" { + t.Fatalf("Expected s1 server (still)") + } + + m.NotifyFailedServer(s1) + s2 = m.FindServer() + if s2 == nil || s2.Name != "s2" { + t.Fatalf("Expected s2 server") + } + + m.NotifyFailedServer(s2) + s1 = m.FindServer() + if s1 == nil || s1.Name != "s1" { + t.Fatalf("Expected s1 server") + } +} + +// func (m *Manager) NumServers() (numServers int) { +func TestServers_NumServers(t *testing.T) { + m := testManager() + var num int + num = m.NumServers() + if num != 0 { + t.Fatalf("Expected zero servers to start") + } + + s := &agent.Server{} + m.AddServer(s) + num = m.NumServers() + if num != 1 { + t.Fatalf("Expected one server after AddServer") + } +} + +// func (m *Manager) RebalanceServers() { +func TestServers_RebalanceServers(t *testing.T) { + const failPct = 0.5 + m := testManagerFailProb(failPct) + const maxServers = 100 + const numShuffleTests = 100 + const uniquePassRate = 0.5 + + // Make a huge list of nodes. + for i := 0; i < maxServers; i++ { + nodeName := fmt.Sprintf("s%02d", i) + m.AddServer(&agent.Server{Name: nodeName}) + } + + // Keep track of how many unique shuffles we get. + uniques := make(map[string]struct{}, maxServers) + for i := 0; i < numShuffleTests; i++ { + m.RebalanceServers() + + var names []string + for j := 0; j < maxServers; j++ { + server := m.FindServer() + m.NotifyFailedServer(server) + names = append(names, server.Name) + } + key := strings.Join(names, "|") + uniques[key] = struct{}{} + } + + // We have to allow for the fact that there won't always be a unique + // shuffle each pass, so we just look for smell here without the test + // being flaky. 
+ if len(uniques) < int(maxServers*uniquePassRate) { + t.Fatalf("unique shuffle ratio too low: %d/%d", len(uniques), maxServers) + } +} + +// func (m *Manager) RemoveServer(server *agent.Server) { +func TestManager_RemoveServer(t *testing.T) { + const nodeNameFmt = "s%02d" + m := testManager() + + if m.NumServers() != 0 { + t.Fatalf("Expected zero servers to start") + } + + // Test removing server before its added + nodeName := fmt.Sprintf(nodeNameFmt, 1) + s1 := &agent.Server{Name: nodeName} + m.RemoveServer(s1) + m.AddServer(s1) + + nodeName = fmt.Sprintf(nodeNameFmt, 2) + s2 := &agent.Server{Name: nodeName} + m.RemoveServer(s2) + m.AddServer(s2) + + const maxServers = 19 + servers := make([]*agent.Server, maxServers) + // Already added two servers above + for i := maxServers; i > 2; i-- { + nodeName := fmt.Sprintf(nodeNameFmt, i) + server := &agent.Server{Name: nodeName} + servers = append(servers, server) + m.AddServer(server) + } + if m.NumServers() != maxServers { + t.Fatalf("Expected %d servers, received %d", maxServers, m.NumServers()) + } + + m.RebalanceServers() + + if m.NumServers() != maxServers { + t.Fatalf("Expected %d servers, received %d", maxServers, m.NumServers()) + } + + findServer := func(server *agent.Server) bool { + for i := m.NumServers(); i > 0; i-- { + s := m.FindServer() + if s == server { + return true + } + } + return false + } + + expectedNumServers := maxServers + removedServers := make([]*agent.Server, 0, maxServers) + + // Remove servers from the front of the list + for i := 3; i > 0; i-- { + server := m.FindServer() + if server == nil { + t.Fatalf("FindServer returned nil") + } + m.RemoveServer(server) + expectedNumServers-- + if m.NumServers() != expectedNumServers { + t.Fatalf("Expected %d servers (got %d)", expectedNumServers, m.NumServers()) + } + if findServer(server) == true { + t.Fatalf("Did not expect to find server %s after removal from the front", server.Name) + } + removedServers = append(removedServers, server) + } + + // Remove server from the end of the list + for i := 3; i > 0; i-- { + server := m.FindServer() + m.NotifyFailedServer(server) + m.RemoveServer(server) + expectedNumServers-- + if m.NumServers() != expectedNumServers { + t.Fatalf("Expected %d servers (got %d)", expectedNumServers, m.NumServers()) + } + if findServer(server) == true { + t.Fatalf("Did not expect to find server %s", server.Name) + } + removedServers = append(removedServers, server) + } + + // Remove server from the middle of the list + for i := 3; i > 0; i-- { + server := m.FindServer() + m.NotifyFailedServer(server) + server2 := m.FindServer() + m.NotifyFailedServer(server2) // server2 now at end of the list + + m.RemoveServer(server) + expectedNumServers-- + if m.NumServers() != expectedNumServers { + t.Fatalf("Expected %d servers (got %d)", expectedNumServers, m.NumServers()) + } + if findServer(server) == true { + t.Fatalf("Did not expect to find server %s", server.Name) + } + removedServers = append(removedServers, server) + } + + if m.NumServers()+len(removedServers) != maxServers { + t.Fatalf("Expected %d+%d=%d servers", m.NumServers(), len(removedServers), maxServers) + } + + // Drain the remaining servers from the middle + for i := m.NumServers(); i > 0; i-- { + server := m.FindServer() + m.NotifyFailedServer(server) + server2 := m.FindServer() + m.NotifyFailedServer(server2) // server2 now at end of the list + m.RemoveServer(server) + removedServers = append(removedServers, server) + } + + if m.NumServers() != 0 { + t.Fatalf("Expected an empty server 
list") + } + if len(removedServers) != maxServers { + t.Fatalf("Expected all servers to be in removed server list") + } +} + +// func (m *Manager) Start() { diff --git a/client/rpc_proxy/rpc_proxy.go b/client/rpc_proxy/rpc_proxy.go new file mode 100644 index 00000000000..d98ef05057a --- /dev/null +++ b/client/rpc_proxy/rpc_proxy.go @@ -0,0 +1,761 @@ +// Package rpc_proxy provides a proxy interface for Nomad Servers. The +// RpcProxy periodically shuffles which server a Nomad Client communicates +// with in order to redistribute load across Nomad Servers. Nomad Servers +// that fail an RPC request are automatically cycled to the end of the list +// until the server list is reshuffled. +// +// The servers package does not provide any external API guarantees and +// should be called only by `hashicorp/nomad`. +package rpc_proxy + +import ( + "fmt" + "log" + "math/rand" + "sync" + "sync/atomic" + "time" + + "github.com/hashicorp/consul/lib" + "github.com/hashicorp/nomad/nomad/structs" +) + +const ( + // apiMajorVersion is synchronized with `nomad/server.go` and + // represents the API version supported by this client. + // + // TODO(sean@): This symbol should be exported somewhere. + apiMajorVersion = 1 + + // clientRPCJitterFraction determines the amount of jitter added to + // clientRPCMinReuseDuration before a connection is expired and a new + // connection is established in order to rebalance load across Nomad + // servers. The cluster-wide number of connections per second from + // rebalancing is applied after this jitter to ensure the CPU impact + // is always finite. See newRebalanceConnsPerSecPerServer's comment + // for additional commentary. + // + // For example, in a 10K Nomad cluster with 5x servers, this default + // averages out to ~13 new connections from rebalancing per server + // per second (each connection is reused for 120s to 180s). + clientRPCJitterFraction = 2 + + // clientRPCMinReuseDuration controls the minimum amount of time RPC + // queries are sent over an established connection to a single server + clientRPCMinReuseDuration = 120 * time.Second + + // Limit the number of new connections a server receives per second + // for connection rebalancing. This limit caps the load caused by + // continual rebalancing efforts when a cluster is in equilibrium. A + // lower value comes at the cost of increased recovery time after a + // partition. This parameter begins to take effect when there are + // more than ~48K clients querying 5x servers or at lower server + // counts when there is a partition. + // + // For example, in a 100K Nomad cluster with 5x servers, it will take + // ~5min for all servers to rebalance their connections. If 99,995 + // agents are in the minority talking to only one server, it will + // take ~26min for all servers to rebalance. A 10K cluster in the + // same scenario will take ~2.6min to rebalance. + newRebalanceConnsPerSecPerServer = 64 + + // rpcAPIMismatchLogRate determines the rate at which log entries are + // emitted when the client and server's API versions are mismatched. + rpcAPIMismatchLogRate = 3 * time.Hour +) + +// NomadConfigInfo is an interface wrapper around this Nomad Agent's +// configuration to prevents a cyclic import dependency. 
+type NomadConfigInfo interface { + RPCVersion() int + Region() string +} + +// Pinger is an interface wrapping client.ConnPool to prevent a +// cyclic import dependency +type Pinger interface { + PingNomadServer(region string, version int, s *ServerEndpoint) (bool, error) +} + +// serverList is an array of Nomad Servers. The first server in the list is +// the active server. +// +// NOTE(sean@): We are explicitly relying on the fact that serverList will be +// copied onto the stack by atomic.Value. Please keep this structure light. +type serverList struct { + L []*ServerEndpoint +} + +type RpcProxy struct { + // activatedList manages the list of Nomad Servers that are eligible + // to be queried by the Agent + activatedList atomic.Value + listLock sync.Mutex + + // primaryServers is a list of servers found in the last heartbeat. + // primaryServers are periodically reshuffled. Covered by + // serverListLock. + primaryServers serverList + + // backupServers is a list of fallback servers. These servers are + // appended to the RpcProxy's serverList, but are never shuffled with + // the list of servers discovered via the Nomad heartbeat. Covered + // by serverListLock. + backupServers serverList + + // serverListLock covers both backupServers and primaryServers + serverListLock sync.RWMutex + + leaderAddr string + numNodes int + + // rebalanceTimer controls the duration of the rebalance interval + rebalanceTimer *time.Timer + + // shutdownCh is a copy of the channel in nomad.Client + shutdownCh chan struct{} + + logger *log.Logger + + configInfo NomadConfigInfo + + // rpcAPIMismatchThrottle regulates the rate at which warning + // messages are emitted in the event of an API mismatch between the + // clients and servers. + rpcAPIMismatchThrottle map[string]time.Time + + // connPoolPinger is used to test the health of a server in the + // connection pool. Pinger is an interface that wraps + // client.ConnPool. + connPoolPinger Pinger + + // notifyFailedBarrier is acts as a barrier to prevent queuing behind + // serverListLock and acts as a TryLock(). + notifyFailedBarrier int32 + + // consulLock is the lock to prevent concurrent access to Consul from + // an RpcProxy instance. + consulLock int32 +} + +// activateEndpoint adds an endpoint to the RpcProxy's active serverList. +// Returns true if the server was added, returns false if the server already +// existed in the RpcProxy's serverList. +func (p *RpcProxy) activateEndpoint(s *ServerEndpoint) bool { + l := p.getServerList() + + // Check if this server is known + found := false + for idx, existing := range l.L { + if existing.Name == s.Name { + newServers := make([]*ServerEndpoint, len(l.L)) + copy(newServers, l.L) + + // Overwrite the existing server details in order to + // possibly update metadata (e.g. server version) + newServers[idx] = s + + l.L = newServers + found = true + break + } + } + + // Add to the list if not known + if !found { + newServers := make([]*ServerEndpoint, len(l.L), len(l.L)+1) + copy(newServers, l.L) + newServers = append(newServers, s) + l.L = newServers + } + + p.saveServerList(l) + + return !found +} + +// SetBackupServers sets a list of Nomad Servers to be used in the event that +// the Nomad Agent lost contact with the list of Nomad Servers provided via +// the Nomad Agent's heartbeat. If available, the backup servers are +// populated via Consul. 
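Note that activateEndpoint never mutates the slice stored in the atomic.Value in place: it copies the current slice, edits the copy, and stores the copy back, so readers loading the list concurrently always see a consistent snapshot. The same copy-on-write discipline in isolation, as a minimal sketch with a simplified element type (none of these names come from this package):

package example

import (
	"sync"
	"sync/atomic"
)

// snapshot is the immutable value held in the atomic.Value; writers must
// never modify a slice that has already been stored.
type snapshot struct{ names []string }

type registry struct {
	mu   sync.Mutex   // serializes writers, analogous to listLock
	list atomic.Value // current snapshot, loaded lock-free by readers
}

func newRegistry() *registry {
	r := &registry{}
	r.list.Store(snapshot{names: []string{}})
	return r
}

// add copies the current slice, appends to the copy, and stores the copy,
// mirroring the copy-on-write pattern used by activateEndpoint.
func (r *registry) add(name string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	cur := r.list.Load().(snapshot)
	next := make([]string, len(cur.names), len(cur.names)+1)
	copy(next, cur.names)
	next = append(next, name)
	r.list.Store(snapshot{names: next})
}

func (r *registry) all() []string {
	return r.list.Load().(snapshot).names
}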
+func (p *RpcProxy) SetBackupServers(addrs []string) error { + l := make([]*ServerEndpoint, 0, len(addrs)) + for _, s := range addrs { + s, err := newServer(s) + if err != nil { + p.logger.Printf("[WARN] RPC Proxy: unable to create backup server %q: %v", s, err) + return fmt.Errorf("unable to create new backup server from %q: %v", s, err) + } + } + + p.serverListLock.Lock() + p.backupServers.L = l + p.serverListLock.Unlock() + + p.listLock.Lock() + defer p.listLock.Unlock() + for _, s := range l { + p.activateEndpoint(s) + } + + return nil +} + +// AddPrimaryServer takes the RPC address of a Nomad server, creates a new +// endpoint, and adds it to both the primaryServers list and the active +// serverList used in the RPC Proxy. If the endpoint is not known by the +// RpcProxy, appends the endpoint to the list. The new endpoint will begin +// seeing use after the rebalance timer fires (or enough servers fail +// organically). Any values in the primary server list are overridden by the +// next successful heartbeat. +func (p *RpcProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint { + s, err := newServer(rpcAddr) + if err != nil { + p.logger.Printf("[WARN] RPC Proxy: unable to create new primary server from endpoint %q", rpcAddr) + return nil + } + + p.serverListLock.Lock() + p.primaryServers.L = append(p.primaryServers.L, s) + p.serverListLock.Unlock() + + p.listLock.Lock() + p.activateEndpoint(s) + p.listLock.Unlock() + + return s +} + +// cycleServers returns a new list of servers that has dequeued the first +// server and enqueued it at the end of the list. cycleServers assumes the +// caller is holding the listLock. cycleServer does not test or ping +// the next server inline. cycleServer may be called when the environment +// has just entered an unhealthy situation and blocking on a server test is +// less desirable than just returning the next server in the firing line. If +// the next server fails, it will fail fast enough and cycleServer will be +// called again. +func (l *serverList) cycleServer() (servers []*ServerEndpoint) { + numServers := len(l.L) + if numServers < 2 { + return servers // No action required + } + + newServers := make([]*ServerEndpoint, 0, numServers) + newServers = append(newServers, l.L[1:]...) + newServers = append(newServers, l.L[0]) + + return newServers +} + +// removeServerByKey performs an inline removal of the first matching server +func (l *serverList) removeServerByKey(targetKey *EndpointKey) { + for i, s := range l.L { + if targetKey.Equal(s.Key()) { + copy(l.L[i:], l.L[i+1:]) + l.L[len(l.L)-1] = nil + l.L = l.L[:len(l.L)-1] + return + } + } +} + +// shuffleServers shuffles the server list in place +func (l *serverList) shuffleServers() { + for i := len(l.L) - 1; i > 0; i-- { + j := rand.Int31n(int32(i + 1)) + l.L[i], l.L[j] = l.L[j], l.L[i] + } +} + +// FindServer takes out an internal "read lock" and searches through the list +// of servers to find a "healthy" server. If the server is actually +// unhealthy, we rely on heartbeats to detect this and remove the node from +// the server list. If the server at the front of the list has failed or +// fails during an RPC call, it is rotated to the end of the list. If there +// are no servers available, return nil. 
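FindServer and NotifyFailedServer together give callers sticky-until-failure behavior: the same front-of-list server keeps being returned until someone reports it failed, at which point it rotates to the back. A minimal sketch of that behavior from outside the package, assuming stub implementations of NomadConfigInfo and Pinger in the spirit of the fauxSerf and fauxConnPool test doubles above, and placeholder addresses:

package example

import (
	"log"
	"os"

	"github.com/hashicorp/nomad/client/rpc_proxy"
)

// stubConfig satisfies NomadConfigInfo with fixed values.
type stubConfig struct{}

func (stubConfig) Region() string  { return "global" }
func (stubConfig) RPCVersion() int { return 1 }

// stubPinger reports every server as healthy.
type stubPinger struct{}

func (stubPinger) PingNomadServer(region string, version int, s *rpc_proxy.ServerEndpoint) (bool, error) {
	return true, nil
}

func demoRotation() {
	logger := log.New(os.Stderr, "", log.LstdFlags)
	shutdownCh := make(chan struct{})
	p := rpc_proxy.NewRpcProxy(logger, shutdownCh, stubConfig{}, stubPinger{})

	// Placeholder addresses, assumed to parse as host:port endpoints.
	p.AddPrimaryServer("127.0.0.1:4647")
	p.AddPrimaryServer("127.0.0.2:4647")

	first := p.FindServer()     // front of the list
	p.NotifyFailedServer(first) // rotate it to the back
	second := p.FindServer()    // now a different server
	_ = second
}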
+func (p *RpcProxy) FindServer() *ServerEndpoint { + l := p.getServerList() + numServers := len(l.L) + if numServers == 0 { + p.logger.Printf("[WARN] RPC Proxy: No servers available") + return nil + } else { + // Return whatever is at the front of the list because it is + // assumed to be the oldest in the server list (unless - + // hypothetically - the server list was rotated right after a + // server was added). + return l.L[0] + } +} + +// getServerList is a convenience method which hides the locking semantics +// of atomic.Value from the caller. +func (p *RpcProxy) getServerList() serverList { + return p.activatedList.Load().(serverList) +} + +// saveServerList is a convenience method which hides the locking semantics +// of atomic.Value from the caller. +func (p *RpcProxy) saveServerList(l serverList) { + p.activatedList.Store(l) +} + +func (p *RpcProxy) LeaderAddr() string { + p.listLock.Lock() + defer p.listLock.Unlock() + return p.leaderAddr +} + +// NewRpcProxy is the only way to safely create a new RpcProxy. +func NewRpcProxy(logger *log.Logger, shutdownCh chan struct{}, configInfo NomadConfigInfo, connPoolPinger Pinger) (p *RpcProxy) { + p = new(RpcProxy) + p.logger = logger + p.configInfo = configInfo // can't pass *nomad.Client: import cycle + p.connPoolPinger = connPoolPinger // can't pass *nomad.ConnPool: import cycle + p.rebalanceTimer = time.NewTimer(clientRPCMinReuseDuration) + p.shutdownCh = shutdownCh + + l := serverList{} + l.L = make([]*ServerEndpoint, 0) + p.saveServerList(l) + return p +} + +// NotifyFailedServer marks the passed in server as "failed" by rotating it +// to the end of the server list. +func (p *RpcProxy) NotifyFailedServer(s *ServerEndpoint) { + l := p.getServerList() + + // If the server being failed is not the first server on the list, + // this is a noop. If, however, the server is failed and first on + // the list, acquire the lock, retest, and take the penalty of moving + // the server to the end of the list. + + // Only rotate the server list when there is more than one server + if len(l.L) > 1 && l.L[0] == s && + // Use atomic.CAS to emulate a TryLock(). + atomic.CompareAndSwapInt32(&p.notifyFailedBarrier, 0, 1) { + defer atomic.StoreInt32(&p.notifyFailedBarrier, 0) + + // Grab a lock, retest, and take the hit of cycling the first + // server to the end. + p.listLock.Lock() + defer p.listLock.Unlock() + l = p.getServerList() + + if len(l.L) > 1 && l.L[0] == s { + l.L = l.cycleServer() + p.saveServerList(l) + } + } +} + +func (p *RpcProxy) NumNodes() int { + return p.numNodes +} + +// NumServers takes out an internal "read lock" and returns the number of +// servers. numServers includes both healthy and unhealthy servers. +func (p *RpcProxy) NumServers() int { + l := p.getServerList() + return len(l.L) +} + +// RebalanceServers shuffles the list of servers on this agent. The server +// at the front of the list is selected for the next RPC. RPC calls that +// fail for a particular server are rotated to the end of the list. This +// method reshuffles the list periodically in order to redistribute work +// across all known Nomad servers (i.e. guarantee that the order of servers +// in the server list is not positively correlated with the age of a server +// in the Nomad cluster). Periodically shuffling the server list prevents +// long-lived clients from fixating on long-lived servers. +// +// Unhealthy servers are removed from the server list during the next client +// heartbeat. 
Before the newly shuffled server list is saved, the new remote +// endpoint is tested to ensure its responsive. +func (p *RpcProxy) RebalanceServers() { + var serverListLocked bool + p.serverListLock.Lock() + serverListLocked = true + defer func() { + if serverListLocked { + p.serverListLock.Unlock() + } + }() + + // Early abort if there is nothing to shuffle + if (len(p.primaryServers.L) + len(p.backupServers.L)) < 2 { + return + } + + // Shuffle server lists independently + p.primaryServers.shuffleServers() + p.backupServers.shuffleServers() + + // Create a new merged serverList + type targetServer struct { + server *ServerEndpoint + // 'n' == Nomad Server + // 'c' == Consul Server + // 'b' == Both + state byte + } + mergedList := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(p.backupServers.L)) + for _, s := range p.primaryServers.L { + mergedList[*s.Key()] = &targetServer{server: s, state: 'n'} + } + for _, s := range p.backupServers.L { + k := s.Key() + _, found := mergedList[*k] + if found { + mergedList[*k].state = 'b' + } else { + mergedList[*k] = &targetServer{server: s, state: 'c'} + } + } + + l := &serverList{L: make([]*ServerEndpoint, 0, len(mergedList))} + for _, s := range p.primaryServers.L { + l.L = append(l.L, s) + } + for _, v := range mergedList { + if v.state != 'c' { + continue + } + l.L = append(l.L, v.server) + } + + // Release the lock before we begin transition to operations on the + // network timescale and attempt to ping servers. A copy of the + // servers has been made at this point. + p.serverListLock.Unlock() + serverListLocked = false + + // Iterate through the shuffled server list to find an assumed + // healthy server. NOTE: Do not iterate on the list directly because + // this loop mutates the server list in-place. + var foundHealthyServer bool + for i := 0; i < len(l.L); i++ { + // Always test the first server. Failed servers are cycled + // and eventually removed from the list when Nomad heartbeats + // detect the failed node. + selectedServer := l.L[0] + + ok, err := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RPCVersion(), selectedServer) + if ok { + foundHealthyServer = true + break + } + p.logger.Printf(`[DEBUG] RPC Proxy: pinging server "%s" failed: %s`, selectedServer.String(), err) + + l.cycleServer() + } + + // If no healthy servers were found, sleep and wait for the admin to + // join this node to a server and begin receiving heartbeats with an + // updated list of Nomad servers. Or Consul will begin advertising a + // new server in the nomad-servers service. + if !foundHealthyServer { + p.logger.Printf("[DEBUG] RPC Proxy: No healthy servers during rebalance, aborting") + return + } + + // Verify that all servers are present. Reconcile will save the + // final serverList. + if p.reconcileServerList(l) { + p.logger.Printf("[DEBUG] RPC Proxy: Rebalanced %d servers, next active server is %s", len(l.L), l.L[0].String()) + } else { + // reconcileServerList failed because Nomad removed the + // server that was at the front of the list that had + // successfully been Ping'ed. Between the Ping and + // reconcile, a Nomad heartbeat removed the node. + // + // Instead of doing any heroics, "freeze in place" and + // continue to use the existing connection until the next + // rebalance occurs. + } + + return +} + +// reconcileServerList returns true when the first server in serverList (l) +// exists in the receiver's serverList (m). 
If true, the merged serverList +// (l) is stored as the receiver's serverList (m). Returns false if the +// first server in m does not exist in the passed in list (l) (i.e. was +// removed by Nomad during a PingNomadServer() call. Newly added servers are +// appended to the list and other missing servers are removed from the list. +func (p *RpcProxy) reconcileServerList(l *serverList) bool { + p.listLock.Lock() + defer p.listLock.Unlock() + + // newServerList is a serverList that has been kept up-to-date with + // join and leave events. + newServerList := p.getServerList() + + // If a Nomad heartbeat removed all nodes, or there is no selected + // server (zero nodes in serverList), abort early. + if len(newServerList.L) == 0 || len(l.L) == 0 { + return false + } + + type targetServer struct { + server *ServerEndpoint + + // 'b' == both + // 'o' == original + // 'n' == new + state byte + } + mergedList := make(map[EndpointKey]*targetServer, len(l.L)) + for _, s := range l.L { + mergedList[*s.Key()] = &targetServer{server: s, state: 'o'} + } + for _, s := range newServerList.L { + k := s.Key() + _, found := mergedList[*k] + if found { + mergedList[*k].state = 'b' + } else { + mergedList[*k] = &targetServer{server: s, state: 'n'} + } + } + + // Ensure the selected server has not been removed by a heartbeat + selectedServerKey := l.L[0].Key() + if v, found := mergedList[*selectedServerKey]; found && v.state == 'o' { + return false + } + + // Append any new servers and remove any old servers + for k, v := range mergedList { + switch v.state { + case 'b': + // Do nothing, server exists in both + case 'o': + // Server has been removed + l.removeServerByKey(&k) + case 'n': + // Server added + l.L = append(l.L, v.server) + default: + panic("unknown merge list state") + } + } + + p.saveServerList(*l) + return true +} + +// RemoveServer takes out an internal write lock and removes a server from +// the server list. +func (p *RpcProxy) RemoveServer(s *ServerEndpoint) { + p.listLock.Lock() + defer p.listLock.Unlock() + l := p.getServerList() + + // Remove the server if known + for i, _ := range l.L { + if l.L[i].Name == s.Name { + newServers := make([]*ServerEndpoint, 0, len(l.L)-1) + newServers = append(newServers, l.L[:i]...) + newServers = append(newServers, l.L[i+1:]...) + l.L = newServers + + p.saveServerList(l) + return + } + } +} + +// refreshServerRebalanceTimer is only called once m.rebalanceTimer expires. +func (p *RpcProxy) refreshServerRebalanceTimer() time.Duration { + l := p.getServerList() + numServers := len(l.L) + // Limit this connection's life based on the size (and health) of the + // cluster. Never rebalance a connection more frequently than + // connReuseLowWatermarkDuration, and make sure we never exceed + // clusterWideRebalanceConnsPerSec operations/s across numLANMembers. + clusterWideRebalanceConnsPerSec := float64(numServers * newRebalanceConnsPerSecPerServer) + connReuseLowWatermarkDuration := clientRPCMinReuseDuration + lib.RandomStagger(clientRPCMinReuseDuration/clientRPCJitterFraction) + numLANMembers := p.numNodes + connRebalanceTimeout := lib.RateScaledInterval(clusterWideRebalanceConnsPerSec, connReuseLowWatermarkDuration, numLANMembers) + + p.rebalanceTimer.Reset(connRebalanceTimeout) + return connRebalanceTimeout +} + +// ResetRebalanceTimer resets the rebalance timer. This method exists for +// testing and should not be used directly. 
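The arithmetic in refreshServerRebalanceTimer is easier to see in isolation. The sketch below leans on the exported consul/lib helpers already used by this patch; the constant values are placeholders picked for illustration, not the ones defined in the rpc_proxy package.

package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/consul/lib"
)

// Placeholder values for illustration only; the real constants live in
// client/rpc_proxy.
const (
	clientRPCMinReuseDuration        = 5 * time.Minute
	clientRPCJitterFraction          = 2
	newRebalanceConnsPerSecPerServer = 64
)

// rebalanceTimeout mirrors the shape of refreshServerRebalanceTimer: the
// per-client connection lifetime grows with the number of nodes so the
// aggregate rebalance rate across the cluster stays roughly constant.
func rebalanceTimeout(numServers, numNodes int) time.Duration {
	clusterWideConnsPerSec := float64(numServers * newRebalanceConnsPerSecPerServer)
	lowWatermark := clientRPCMinReuseDuration +
		lib.RandomStagger(clientRPCMinReuseDuration/clientRPCJitterFraction)
	return lib.RateScaledInterval(clusterWideConnsPerSec, lowWatermark, numNodes)
}

func main() {
	for _, nodes := range []int{100, 10000, 100000} {
		fmt.Printf("%6d nodes -> %v\n", nodes, rebalanceTimeout(3, nodes))
	}
}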
+func (p *RpcProxy) ResetRebalanceTimer() { + p.listLock.Lock() + defer p.listLock.Unlock() + p.rebalanceTimer.Reset(clientRPCMinReuseDuration) +} + +// ServerRPCAddrs returns one RPC Address per server +func (p *RpcProxy) ServerRPCAddrs() []string { + l := p.getServerList() + serverAddrs := make([]string, 0, len(l.L)) + for _, s := range l.L { + serverAddrs = append(serverAddrs, s.Addr.String()) + } + return serverAddrs +} + +// Run is used to start and manage the task of automatically shuffling and +// rebalancing the list of Nomad servers. This maintenance only happens +// periodically based on the expiration of the timer. Failed servers are +// automatically cycled to the end of the list. New servers are appended to +// the list. The order of the server list must be shuffled periodically to +// distribute load across all known and available Nomad servers. +func (p *RpcProxy) Run() { + for { + select { + case <-p.rebalanceTimer.C: + p.RebalanceServers() + + p.refreshServerRebalanceTimer() + + // Perform Consul operations asynchronously, but in a + // singleton to prevent this task from stacking + // during the next heartbeat if Consul is slow or + // unavailable. + if atomic.CompareAndSwapInt32(&p.consulLock, 0, 1) { + go func() { + // TODO(sean@): Talk w/ Consul and + // append any servers it has to our + // server list. Need access to the + // Consul Config agent out of Client + // in order to poll (or create our + // own parallel client using the + // existing consul config). + p.logger.Printf("[DEBUG] Polling Consul for servers in the nomad-server list") + defer atomic.StoreInt32(&p.consulLock, 0) + }() + } + + case <-p.shutdownCh: + p.logger.Printf("[INFO] RPC Proxy: shutting down") + return + } + } +} + +// UpdateFromNodeUpdateResponse handles heartbeat responses from Nomad +// Servers. Heartbeats contain a list of Nomad Servers that the client +// should talk with for RPC requests. UpdateFromNodeUpdateResponse does not +// rebalance its serverList, that is handled elsewhere. New servers learned +// via the heartbeat are appended to the RpcProxy's serverList. Removed +// servers are removed immediately. Servers speaking a newer RPC version are +// filtered from the serverList. +func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse) error { + // Merge all servers found in the response. Servers in the response + // with newer API versions are filtered from the list. If the list + // is missing an address found in the RpcProxy's server list, remove + // it from the RpcProxy. + // + // FIXME(sean@): This is not true. We rely on an outside pump to set + // these values. In order to catch the orphaned clients where all + // Nomad servers were rolled between the heartbeat interval, the + // rebalance task queries Consul and adds the servers found in Consul + // to the server list in order to reattach an orphan to a server. + + p.serverListLock.Lock() + defer p.serverListLock.Unlock() + + // 1) Create a map to reconcile the difference between + // m.primaryServers and resp.Servers. 
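The reconciliation map mentioned in step 1 boils down to a three-way split: 'o' for entries only in the old list, 'n' for entries only in the response, and 'b' for entries present in both. Below is a stripped-down sketch over plain strings (mergeStates is an illustrative name only); the real map over *ServerEndpoint values follows in the hunk that continues below.

package main

import "fmt"

// mergeStates performs the same three-way split described above: 'o' for
// servers only in the old list, 'n' for servers only in the new list, and
// 'b' for servers present in both.
func mergeStates(old, latest []string) map[string]byte {
	merged := make(map[string]byte, len(old)+len(latest))
	for _, k := range old {
		merged[k] = 'o'
	}
	for _, k := range latest {
		if _, ok := merged[k]; ok {
			merged[k] = 'b'
		} else {
			merged[k] = 'n'
		}
	}
	return merged
}

func main() {
	old := []string{"10.0.0.1:4647", "10.0.0.2:4647"}
	latest := []string{"10.0.0.2:4647", "10.0.0.3:4647"}
	for addr, state := range mergeStates(old, latest) {
		fmt.Printf("%s => %c\n", addr, state)
	}
}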
+ type targetServer struct { + server *ServerEndpoint + + // 'b' == both + // 'o' == original + // 'n' == new + state byte + } + mergedNomadMap := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(resp.Servers)) + numOldServers := 0 + for _, s := range p.primaryServers.L { + mergedNomadMap[*s.Key()] = &targetServer{server: s, state: 'o'} + numOldServers++ + } + numBothServers := 0 + var newServers bool + for _, s := range resp.Servers { + // Filter out servers using a newer API version. Prevent + // spamming the logs every heartbeat. + // + // TODO(sean@): Move the logging throttle logic into a + // dedicated logging package so RpcProxy does not have to + // perform this accounting. + if int32(p.configInfo.RPCVersion()) < s.RPCVersion { + now := time.Now() + t, ok := p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr] + if ok && t.After(now) { + continue + } + + p.logger.Printf("[WARN] API mismatch between client (v%d) and server (v%d), ignoring server %q", apiMajorVersion, s.RPCVersion, s.RPCAdvertiseAddr) + p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr] = now.Add(rpcAPIMismatchLogRate) + continue + } + + server, err := newServer(s.RPCAdvertiseAddr) + if err != nil { + p.logger.Printf("[WARN] Unable to create a server from %q: %v", s.RPCAdvertiseAddr, err) + continue + } + + k := server.Key() + _, found := mergedNomadMap[*k] + if found { + mergedNomadMap[*k].state = 'b' + numBothServers++ + } else { + mergedNomadMap[*k] = &targetServer{server: server, state: 'n'} + newServers = true + } + } + + // Short-circuit acquiring a lock if nothing changed + if !newServers && numOldServers == numBothServers { + return nil + } + + p.listLock.Lock() + defer p.listLock.Unlock() + newServerCfg := p.getServerList() + for k, v := range mergedNomadMap { + switch v.state { + case 'b': + // Do nothing, server exists in both + case 'o': + // Server has been removed + + // TODO(sean@): Teach Nomad servers how to remove + // themselves from their heartbeat in order to + // gracefully drain their clients over the next + // cluster's max rebalanceTimer duration. Without + // this enhancement, if a server being shutdown and + // it is the first in serverList, the client will + // fail its next RPC connection. + p.primaryServers.removeServerByKey(&k) + newServerCfg.removeServerByKey(&k) + case 'n': + // Server added. Append it to both lists + // immediately. The server should only go into + // active use in the event of a failure or after a + // rebalance occurs. + p.primaryServers.L = append(p.primaryServers.L, v.server) + newServerCfg.L = append(newServerCfg.L, v.server) + default: + panic("unknown merge list state") + } + } + + p.numNodes = int(resp.NumNodes) + p.leaderAddr = resp.LeaderRPCAddr + p.saveServerList(newServerCfg) + + return nil +} diff --git a/client/rpc_proxy/server_endpoint.go b/client/rpc_proxy/server_endpoint.go new file mode 100644 index 00000000000..34ae322fd64 --- /dev/null +++ b/client/rpc_proxy/server_endpoint.go @@ -0,0 +1,81 @@ +package rpc_proxy + +import ( + "fmt" + "net" + "strings" +) + +const ( + defaultNomadRPCPort = "4647" +) + +// EndpointKey is used in maps and for equality tests. A key is based on endpoints. +type EndpointKey struct { + name string +} + +// Equal compares two EndpointKey objects +func (k *EndpointKey) Equal(x *EndpointKey) bool { + return k.name == x.name +} + +// ServerEndpoint contains the address information for to connect to a Nomad +// server. 
+// +// TODO(sean@): Server is stubbed out so that in the future it can hold a +// reference to Node (and ultimately Node.ID). +type ServerEndpoint struct { + // Name is the unique lookup key for a Server instance + Name string + Host string + Port string + Addr net.Addr +} + +// Key returns the corresponding Key +func (s *ServerEndpoint) Key() *EndpointKey { + return &EndpointKey{ + name: s.Name, + } +} + +// newServer creates a new Server instance with a resolvable endpoint +func newServer(name string) (s *ServerEndpoint, err error) { + s = &ServerEndpoint{ + Name: name, + } + + var ( + host, port string + ) + host, port, err = net.SplitHostPort(name) + if err == nil { + s.Host = host + s.Port = port + } else { + if strings.Contains(err.Error(), "missing port") { + s.Host = name + s.Port = defaultNomadRPCPort + } else { + return nil, err + } + } + + if s.Addr, err = net.ResolveTCPAddr("tcp", net.JoinHostPort(s.Host, s.Port)); err != nil { + return nil, err + } + + return s, err +} + +// String returns a string representation of Server +func (s *ServerEndpoint) String() string { + var addrStr, networkStr string + if s.Addr != nil { + addrStr = s.Addr.String() + networkStr = s.Addr.Network() + } + + return fmt.Sprintf("%s (%s:%s)", s.Name, networkStr, addrStr) +} diff --git a/command/agent/agent.go b/command/agent/agent.go index 5ff90c6b5de..1e758f3ef50 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -29,10 +29,15 @@ type Agent struct { logger *log.Logger logOutput io.Writer - consulSyncer *consul.Syncer // consulSyncer registers the Nomad agent with the Consul Agent - consulAgentConfig *consul.AgentConfig // consulAgentConfig is the configuration the Nomad client uses to connect with Consul agent - serverHTTPAddr string - clientHTTPAddr string + // consulAgentConfig is a limited subset of the information necessary + // to establish a connection with a Consul agent + consulAgentConfig *consul.AgentConfig + + // consulSyncer registers the Nomad agent with the Consul Agent + consulSyncer *consul.Syncer + + serverHTTPAddr string + clientHTTPAddr string server *nomad.Server client *client.Client @@ -56,10 +61,9 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { shutdownCh: make(chan struct{}), } - // creating the consul client configuration that both the server and client - // uses - a.createAgentConfig() - + if err := a.setupConsulSyncer(shutdownCh); err != nil { + return nil, err + } if err := a.setupServer(); err != nil { return nil, err } @@ -488,7 +492,9 @@ func (a *Agent) Stats() map[string]map[string]string { return stats } -func (a *Agent) createAgentConfig() { +// setupConsulSyncer creates the Consul task used by this Nomad Agent when +// running in either Client and Server mode. 
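Back in server_endpoint.go, newServer's fallback for a bare host with no port can be exercised on its own. The sketch below uses an assumed helper name, splitWithDefault, purely for illustration of the same error-inspection approach.

package main

import (
	"fmt"
	"net"
	"strings"
)

// splitWithDefault mirrors newServer's port handling: a bare host falls back
// to the default Nomad RPC port instead of returning an error.
func splitWithDefault(name, defaultPort string) (host, port string, err error) {
	host, port, err = net.SplitHostPort(name)
	if err != nil {
		if strings.Contains(err.Error(), "missing port") {
			return name, defaultPort, nil
		}
		return "", "", err
	}
	return host, port, nil
}

func main() {
	for _, in := range []string{"10.0.0.5", "10.0.0.5:4700", "[::1]:4647"} {
		host, port, err := splitWithDefault(in, "4647")
		fmt.Println(host, port, err)
	}
}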
+func (a *Agent) setupConsulSyncer(shutdownCh types.ShutdownChannel) (err error) { cfg := &consul.AgentConfig{ Addr: a.config.Consul.Addr, Token: a.config.Consul.Token, @@ -500,6 +506,10 @@ func (a *Agent) createAgentConfig() { KeyFile: a.config.Consul.KeyFile, } a.consulAgentConfig = cfg + + a.consulSyncer, err = consul.NewSyncer(cfg, a.logger) + + return nil } // syncAgentServicesWithConsul syncs this Nomad Agent's services with Consul diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index b2ca095991d..316d95b3d44 100644 --- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -136,8 +136,17 @@ func (s *HTTPServer) listServers(resp http.ResponseWriter, req *http.Request) (i return nil, CodedError(501, ErrInvalidMethod) } - // Get the current list of servers - return client.Servers(), nil + // Get the current list of servers according to Raft. + // + // NOTE(sean@); This could be s.agent.server.localPeers instead. + var err error + var peers []string + peers, err = s.agent.server.RaftPeers() + if err != nil { + return nil, err + } + + return peers, nil } func (s *HTTPServer) updateServers(resp http.ResponseWriter, req *http.Request) (interface{}, error) { @@ -153,7 +162,9 @@ func (s *HTTPServer) updateServers(resp http.ResponseWriter, req *http.Request) } // Set the servers list into the client - client.SetServers(servers) + for _, s := range servers { + client.AddPrimaryServerToRpcProxy(s) + } return nil, nil } diff --git a/command/agent/command.go b/command/agent/command.go index 542565a15b5..42b06569ebd 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -22,6 +22,7 @@ import ( "github.com/hashicorp/logutils" "github.com/hashicorp/nomad/helper/flag-slice" "github.com/hashicorp/nomad/helper/gated-writer" + "github.com/hashicorp/nomad/nomad/structs/config" "github.com/hashicorp/scada-client/scada" "github.com/mitchellh/cli" ) @@ -59,11 +60,11 @@ func (c *Command) readConfig() *Config { // Make a new, empty config. 
cmdConfig := &Config{ - Atlas: &AtlasConfig{}, - Consul: &Consul{}, - Client: &ClientConfig{}, - Ports: &Ports{}, - Server: &ServerConfig{}, + Atlas: &AtlasConfig{}, + Consul: &config.ConsulConfig{}, + Client: &ClientConfig{}, + Ports: &Ports{}, + Server: &ServerConfig{}, } flags := flag.NewFlagSet("agent", flag.ContinueOnError) diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go index 9b2867ecaa3..e4e52ed3042 100644 --- a/nomad/heartbeat.go +++ b/nomad/heartbeat.go @@ -101,6 +101,18 @@ func (s *Server) invalidateHeartbeat(id string) { if err := s.endpoints.Node.UpdateStatus(&req, &resp); err != nil { s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err) } + + if resp.LeaderRPCAddr == "" { + s.logger.Printf("[TRACE] nomad.heartbeat: no leader address returned during heartbeat") + } else { + s.logger.Printf("[TRACE] nomad.heartbeat: current leader address according to server %q is %v", s.rpcAdvertise.String(), resp.LeaderRPCAddr) + } + + if len(resp.Servers) == 0 { + s.logger.Printf("[TRACE] nomad.heartbeat: no servers returned during heartbeat") + } else { + s.logger.Printf("[TRACE] nomad.heartbeat: current servers according to server is %v", resp.Servers) + } } // clearHeartbeatTimer is used to clear the heartbeat time for diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index e115e66599c..6a7cd04c346 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -205,8 +205,34 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct reply.HeartbeatTTL = ttl } - // Set the reply index + // Set the reply index and leader + n.srv.peerLock.RLock() + defer n.srv.peerLock.RUnlock() reply.Index = index + reply.LeaderRPCAddr = n.srv.raft.Leader() + + // Reply with config information required for future RPC requests + reply.Servers = make([]*structs.NodeServerInfo, 0, len(n.srv.localPeers)) + for p := range n.srv.localPeers { + reply.Servers = append(reply.Servers, + &structs.NodeServerInfo{ + RPCAdvertiseAddr: p, + RPCVersion: apiMajorVersion, + }) + } + + // Capture all the nodes to obtain the node count + iter, err := snap.Nodes() + if err == nil { + for { + raw := iter.Next() + if raw == nil { + break + } + reply.NumNodes++ + } + } + return nil } diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 5aede49916e..9d825a182d6 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -259,6 +259,12 @@ func TestClientEndpoint_UpdateStatus_HeartbeatOnly(t *testing.T) { t.Fatalf("bad: %#v", ttl) } + // Check for heartbeat servers + servers := resp.Servers + if len(servers) == 0 { + t.Fatalf("bad: %#v", servers) + } + // Update the status, static state dereg := &structs.NodeUpdateStatusRequest{ NodeID: node.ID, diff --git a/nomad/pool.go b/nomad/pool.go index 96328a3f091..50158b2a133 100644 --- a/nomad/pool.go +++ b/nomad/pool.go @@ -12,6 +12,7 @@ import ( "github.com/hashicorp/consul/tlsutil" "github.com/hashicorp/net-rpc-msgpackrpc" + "github.com/hashicorp/nomad/client/rpc_proxy" "github.com/hashicorp/yamux" ) @@ -373,6 +374,30 @@ func (p *ConnPool) RPC(region string, addr net.Addr, version int, method string, return nil } +// PingNomadServer sends a Status.Ping message to the specified server and +// returns true if healthy, false if an error occurred +func (p *ConnPool) PingNomadServer(region string, version int, s *rpc_proxy.ServerEndpoint) (bool, error) { + // Get a usable client + conn, sc, err := p.getClient(region, s.Addr, version) + if err != nil { + 
return false, err + } + + // Make the RPC call + var out struct{} + err = msgpackrpc.CallWithCodec(sc.codec, "Status.Ping", struct{}{}, &out) + if err != nil { + sc.Close() + p.releaseConn(conn) + return false, err + } + + // Done with the connection + conn.returnClient(sc) + p.releaseConn(conn) + return true, nil +} + // Reap is used to close conns open over maxTime func (p *ConnPool) reap() { for { diff --git a/nomad/server.go b/nomad/server.go index 8553d0c7c9e..424d462aef4 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -701,6 +701,16 @@ func (s *Server) RPC(method string, args interface{}, reply interface{}) error { return codec.err } +// RaftPeers returns the current list of Raft peers +func (s *Server) RaftPeers() ([]string, error) { + if peers, err := s.raftPeers.Peers(); err == nil { + return peers, nil + } else { + s.logger.Printf("[DEBUG] server: error getting raft peers: %v", err) + return nil, err + } +} + // Stats is used to return statistics for debugging and insight // for various sub-systems func (s *Server) Stats() map[string]map[string]string { @@ -719,7 +729,7 @@ func (s *Server) Stats() map[string]map[string]string { "serf": s.serf.Stats(), "runtime": RuntimeStats(), } - if peers, err := s.raftPeers.Peers(); err == nil { + if peers, err := s.RaftPeers(); err == nil { stats["raft"]["raft_peers"] = strings.Join(peers, ",") } else { s.logger.Printf("[DEBUG] server: error getting raft peers: %v", err) diff --git a/nomad/structs/config/consul.go b/nomad/structs/config/consul.go index 758ef4d0537..3b0989c73a6 100644 --- a/nomad/structs/config/consul.go +++ b/nomad/structs/config/consul.go @@ -7,8 +7,9 @@ package config // // - Bootstrap this Nomad Client with the list of Nomad Servers registered // with Consul +// +// Both the Agent and the executor need to be able to import ConsulConfig. type ConsulConfig struct { - // ServerServiceName is the name of the service that Nomad uses to register // servers with Consul ServerServiceName string `mapstructure:"server_service_name"` diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index f624c4bfdde..23b747e0099 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -151,6 +151,17 @@ type NodeDeregisterRequest struct { WriteRequest } +// NodeServerInfo is used to in NodeUpdateResponse to return Nomad server +// information used in RPC server lists. +type NodeServerInfo struct { + // RPCAdvertiseAddr is the IP endpoint that a Nomad Server wishes to + // be contacted at for RPCs. + RPCAdvertiseAddr string + + // RPCVersion is the version number the Nomad Server supports + RPCVersion int32 +} + // NodeUpdateStatusRequest is used for Node.UpdateStatus endpoint // to update the status of a node. type NodeUpdateStatusRequest struct { @@ -351,6 +362,20 @@ type NodeUpdateResponse struct { EvalIDs []string EvalCreateIndex uint64 NodeModifyIndex uint64 + + // LeaderRPCAddr is the RPC address of the current Raft Leader. If + // empty, the current Nomad Server is in the minority of a partition. + LeaderRPCAddr string + + // NumNodes is the number of Nomad nodes attached to this quorum of + // Nomad Servers at the time of the response. This value can + // fluctuate based on the health of the cluster between heartbeats. + NumNodes int32 + + // Servers is the full list of known Nomad servers in the local + // region. 
+ Servers []*NodeServerInfo + QueryMeta } From f6e358732114a8152b3da23a647d8e237ab77fbf Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 18:12:58 -0700 Subject: [PATCH 018/166] Define a type for the PeriodicCallback handlers and ShutdownChannel --- client/consul/sync.go | 4 ++-- command/agent/agent.go | 5 +++-- nomad/types/types.go | 4 ++++ 3 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 nomad/types/types.go diff --git a/client/consul/sync.go b/client/consul/sync.go index 223b37faa11..de952b807d7 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -33,7 +33,7 @@ type Syncer struct { logger *log.Logger - shutdownCh chan struct{} + shutdownCh types.ShutdownChannel shutdown bool shutdownLock sync.Mutex } @@ -119,7 +119,7 @@ func NewSyncer(config *AgentConfig, logger *log.Logger) (*Syncer, error) { trackedChecks: make(map[string]*consul.AgentCheckRegistration), checkRunners: make(map[string]*CheckRunner), - shutdownCh: make(chan struct{}), + shutdownCh: make(types.ShutdownChannel), } return &consulSyncer, nil } diff --git a/command/agent/agent.go b/command/agent/agent.go index 1e758f3ef50..9dfa4bc757c 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -43,7 +43,7 @@ type Agent struct { client *client.Client shutdown bool - shutdownCh chan struct{} + shutdownCh types.ShutdownChannel shutdownLock sync.Mutex } @@ -54,11 +54,12 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { logOutput = os.Stderr } + shutdownCh := make(types.ShutdownChannel) a := &Agent{ config: config, logger: log.New(logOutput, "", log.LstdFlags), logOutput: logOutput, - shutdownCh: make(chan struct{}), + shutdownCh: shutdownCh, } if err := a.setupConsulSyncer(shutdownCh); err != nil { diff --git a/nomad/types/types.go b/nomad/types/types.go new file mode 100644 index 00000000000..bb4ca552568 --- /dev/null +++ b/nomad/types/types.go @@ -0,0 +1,4 @@ +package types + +type PeriodicCallback func() +type ShutdownChannel chan struct{} From d268dcb85d35a25fb68276783794c1960459b619 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 21:28:12 -0700 Subject: [PATCH 019/166] Reduce all forms of ConsulConfig down to a single struct nomad/structs/config/consul.go's ConsulConfig is the canonical definition for all things Consul now. 
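For reference, the PeriodicCallback and ShutdownChannel aliases introduced in the previous commit are ordinary named types; the minimal sketch below shows how they read at call sites (the worker function here is illustrative, not Nomad code).

package main

import "fmt"

// Local aliases shaped like the ones in nomad/types: a no-argument callback
// and a close-only channel used to broadcast shutdown to listeners.
type PeriodicCallback func()
type ShutdownChannel chan struct{}

func worker(shutdownCh ShutdownChannel, onTick PeriodicCallback, ticks chan struct{}, done chan struct{}) {
	defer close(done)
	for {
		select {
		case <-ticks:
			onTick()
		case <-shutdownCh:
			fmt.Println("worker stopping")
			return
		}
	}
}

func main() {
	shutdownCh := make(ShutdownChannel)
	ticks := make(chan struct{})
	done := make(chan struct{})
	go worker(shutdownCh, func() { fmt.Println("tick") }, ticks, done)
	ticks <- struct{}{}
	close(shutdownCh)
	<-done
}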
--- client/client_test.go | 4 ++-- client/config/config.go | 7 +++--- client/consul/sync.go | 13 +---------- client/consul/sync_test.go | 5 +++-- client/driver/executor/executor.go | 9 ++++---- client/driver/utils.go | 34 +++++++++++++++------------- command/agent/agent.go | 36 +++++++++++++++++------------- 7 files changed, 52 insertions(+), 56 deletions(-) diff --git a/client/client_test.go b/client/client_test.go index 5a17c07a5b7..e8126a61193 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -12,10 +12,10 @@ import ( "time" "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + sconfig "github.com/hashicorp/nomad/nomad/structs/config" "github.com/hashicorp/nomad/testutil" "github.com/mitchellh/hashstructure" @@ -71,7 +71,7 @@ func testServer(t *testing.T, cb func(*nomad.Config)) (*nomad.Server, string) { func testClient(t *testing.T, cb func(c *config.Config)) *Client { conf := DefaultConfig() conf.DevMode = true - conf.ConsulAgentConfig = &consul.AgentConfig{} + conf.ConsulConfig = &sconfig.ConsulConfig{} if cb != nil { cb(conf) } diff --git a/client/config/config.go b/client/config/config.go index 4f392ec806b..52bb906c102 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -7,8 +7,8 @@ import ( "strings" "time" - "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/structs/config" ) var ( @@ -110,9 +110,8 @@ type Config struct { // Revision is the commit number of the Nomad client Revision string - // ConsulAgentConfig is the configuration to connect with Consul - // Agent - ConsulAgentConfig *consul.AgentConfig + // ConsulConfig is this Agent's Consul configuration + ConsulConfig *config.ConsulConfig // StatsDataPoints is the number of resource usage data points the Nomad // client keeps in memory diff --git a/client/consul/sync.go b/client/consul/sync.go index de952b807d7..bfaa6d60138 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -36,18 +36,7 @@ type Syncer struct { shutdownCh types.ShutdownChannel shutdown bool shutdownLock sync.Mutex -} -// AgentConfig is the configuration used to create a new ConsulService client -type AgentConfig struct { - Addr string - Token string - Auth string - EnableSSL bool - VerifySSL bool - CAFile string - CertFile string - KeyFile string } const ( @@ -60,7 +49,7 @@ const ( ) // NewSyncer returns a new consul.Syncer -func NewSyncer(config *AgentConfig, logger *log.Logger) (*Syncer, error) { +func NewSyncer(config *config.ConsulConfig, logger *log.Logger) (*Syncer, error) { var err error var c *consul.Client cfg := consul.DefaultConfig() diff --git a/client/consul/sync_test.go b/client/consul/sync_test.go index 735b77b0727..211b0181826 100644 --- a/client/consul/sync_test.go +++ b/client/consul/sync_test.go @@ -10,6 +10,7 @@ import ( "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/structs/config" ) const ( @@ -41,7 +42,7 @@ var ( ) func TestConsulServiceRegisterServices(t *testing.T) { - cs, err := NewSyncer(&AgentConfig{}, logger) + cs, err := NewSyncer(&config.ConsulConfig{}, logger) if err != nil { t.Fatalf("Err: %v", err) } @@ -68,7 +69,7 @@ func TestConsulServiceRegisterServices(t *testing.T) { } func 
TestConsulServiceUpdateService(t *testing.T) { - cs, err := NewSyncer(&AgentConfig{}, logger) + cs, err := NewSyncer(&config.ConsulConfig{}, logger) if err != nil { t.Fatalf("Err: %v", err) } diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 348733dd700..c6018f484e4 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -27,6 +27,7 @@ import ( cstructs "github.com/hashicorp/nomad/client/driver/structs" "github.com/hashicorp/nomad/client/stats" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/structs/config" ) const ( @@ -60,9 +61,9 @@ type Executor interface { // ConsulContext holds context to configure the Consul client and run checks type ConsulContext struct { - // ConsulAgentConfig contains the configuration information for - // talking with this Nomad Agent's Consul Agent. - ConsulAgentConfig *consul.AgentConfig + // ConsulConfig contains the configuration information for talking + // with this Nomad Agent's Consul Agent. + ConsulConfig *config.ConsulConfig // ContainerID is the ID of the container ContainerID string @@ -471,7 +472,7 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { e.logger.Printf("[INFO] executor: registering services") e.consulCtx = ctx if e.consulSyncer == nil { - cs, err := consul.NewSyncer(ctx.ConsulAgentConfig, e.logger) + cs, err := consul.NewSyncer(ctx.ConsulConfig, e.logger) if err != nil { return err } diff --git a/client/driver/utils.go b/client/driver/utils.go index d6202dd40b0..5559e65e542 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -12,11 +12,11 @@ import ( "github.com/hashicorp/go-multierror" "github.com/hashicorp/go-plugin" "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/driver/executor" "github.com/hashicorp/nomad/client/driver/logging" cstructs "github.com/hashicorp/nomad/client/driver/structs" "github.com/hashicorp/nomad/nomad/structs" + sconfig "github.com/hashicorp/nomad/nomad/structs/config" ) // createExecutor launches an executor plugin and returns an instance of the @@ -73,23 +73,25 @@ func createLogCollector(config *plugin.ClientConfig, w io.Writer, } func consulContext(clientConfig *config.Config, containerID string) *executor.ConsulContext { - cfg := consul.AgentConfig{ - Addr: clientConfig.ReadDefault("consul.address", "127.0.0.1:8500"), - Token: clientConfig.Read("consul.token"), - Auth: clientConfig.Read("consul.auth"), - EnableSSL: clientConfig.ReadBoolDefault("consul.ssl", false), - VerifySSL: clientConfig.ReadBoolDefault("consul.verifyssl", true), - CAFile: clientConfig.Read("consul.tls_ca_file"), - CertFile: clientConfig.Read("consul.tls_cert_file"), - KeyFile: clientConfig.Read("consul.tls_key_file"), + cfg := sconfig.ConsulConfig{ + Addr: clientConfig.ReadDefault("consul.address", "127.0.0.1:8500"), + Token: clientConfig.Read("consul.token"), + Auth: clientConfig.Read("consul.auth"), + EnableSSL: clientConfig.ReadBoolDefault("consul.ssl", false), + VerifySSL: clientConfig.ReadBoolDefault("consul.verifyssl", true), + CAFile: clientConfig.Read("consul.tls_ca_file"), + CertFile: clientConfig.Read("consul.tls_cert_file"), + KeyFile: clientConfig.Read("consul.tls_key_file"), + ServerServiceName: clientConfig.ReadDefault("consul.server_service_name", "nomad-server"), + ClientServiceName: 
clientConfig.ReadDefault("consul.client_service_name", "nomad-client"), } return &executor.ConsulContext{ - ConsulAgentConfig: &cfg, - ContainerID: containerID, - DockerEndpoint: clientConfig.Read("docker.endpoint"), - TLSCa: clientConfig.Read("docker.tls.ca"), - TLSCert: clientConfig.Read("docker.tls.cert"), - TLSKey: clientConfig.Read("docker.tls.key"), + ConsulConfig: &cfg, + ContainerID: containerID, + DockerEndpoint: clientConfig.Read("docker.endpoint"), + TLSCa: clientConfig.Read("docker.tls.ca"), + TLSCert: clientConfig.Read("docker.tls.cert"), + TLSKey: clientConfig.Read("docker.tls.key"), } } diff --git a/command/agent/agent.go b/command/agent/agent.go index 9dfa4bc757c..8381029f580 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -17,6 +17,8 @@ import ( "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/structs/config" + "github.com/hashicorp/nomad/nomad/types" ) // Agent is a long running daemon that is used to run both @@ -29,9 +31,9 @@ type Agent struct { logger *log.Logger logOutput io.Writer - // consulAgentConfig is a limited subset of the information necessary - // to establish a connection with a Consul agent - consulAgentConfig *consul.AgentConfig + // consulConfig is a limited subset of the information necessary to + // establish a connection with this Nomad Agent's Consul Agent. + consulConfig *config.ConsulConfig // consulSyncer registers the Nomad agent with the Consul Agent consulSyncer *consul.Syncer @@ -278,7 +280,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { conf.Version = fmt.Sprintf("%s%s", a.config.Version, a.config.VersionPrerelease) conf.Revision = a.config.Revision - conf.ConsulAgentConfig = a.consulAgentConfig + conf.ConsulConfig = a.consulConfig conf.StatsDataPoints = a.config.Client.StatsConfig.DataPoints conf.StatsCollectionInterval = a.config.Client.StatsConfig.collectionInterval @@ -496,17 +498,19 @@ func (a *Agent) Stats() map[string]map[string]string { // setupConsulSyncer creates the Consul task used by this Nomad Agent when // running in either Client and Server mode. func (a *Agent) setupConsulSyncer(shutdownCh types.ShutdownChannel) (err error) { - cfg := &consul.AgentConfig{ - Addr: a.config.Consul.Addr, - Token: a.config.Consul.Token, - Auth: a.config.Consul.Auth, - EnableSSL: a.config.Consul.EnableSSL, - VerifySSL: a.config.Consul.VerifySSL, - CAFile: a.config.Consul.CAFile, - CertFile: a.config.Consul.CertFile, - KeyFile: a.config.Consul.KeyFile, - } - a.consulAgentConfig = cfg + cfg := &config.ConsulConfig{ + Addr: a.config.Consul.Addr, + Token: a.config.Consul.Token, + Auth: a.config.Consul.Auth, + EnableSSL: a.config.Consul.EnableSSL, + VerifySSL: a.config.Consul.VerifySSL, + CAFile: a.config.Consul.CAFile, + CertFile: a.config.Consul.CertFile, + KeyFile: a.config.Consul.KeyFile, + ServerServiceName: a.config.Consul.ServerServiceName, + ClientServiceName: a.config.Consul.ClientServiceName, + } + a.consulConfig = cfg a.consulSyncer, err = consul.NewSyncer(cfg, a.logger) @@ -516,7 +520,7 @@ func (a *Agent) setupConsulSyncer(shutdownCh types.ShutdownChannel) (err error) // syncAgentServicesWithConsul syncs this Nomad Agent's services with Consul // when running in either Client or Server mode. 
func (a *Agent) syncAgentServicesWithConsul(clientHttpAddr string, serverHttpAddr string) error { - cs, err := consul.NewSyncer(a.consulAgentConfig, a.logger) + cs, err := consul.NewSyncer(a.consulConfig, a.logger) if err != nil { return err } From e07e77b79daf42f6fb36e7b861c2cd6959dcd9ea Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 21:35:31 -0700 Subject: [PATCH 020/166] s/availble/runChecks/g --- client/consul/sync.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index bfaa6d60138..556de1b8522 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -19,8 +19,8 @@ import ( // Syncer allows syncing of services and checks with Consul type Syncer struct { - client *consul.Client - availble bool + client *consul.Client + runChecks bool serviceIdentifier string // serviceIdentifier is a token which identifies which task/alloc the service belongs to delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul @@ -348,12 +348,12 @@ func (c *Syncer) Run() { select { case <-sync.C: if err := c.performSync(); err != nil { - if c.availble { + if c.runChecks { c.logger.Printf("[DEBUG] consul: error in syncing services for %q: %v", c.serviceIdentifier, err) } - c.availble = false + c.runChecks = false } else { - c.availble = true + c.runChecks = true } case <-c.shutdownCh: sync.Stop() @@ -447,11 +447,11 @@ func (c *Syncer) runCheck(check Check) { output = res.Err.Error() } if err := c.client.Agent().UpdateTTL(check.ID(), output, state); err != nil { - if c.availble { + if c.runChecks { c.logger.Printf("[DEBUG] consul.sync: error updating ttl check for check %q: %v", check.ID(), err) - c.availble = false + c.runChecks = false } else { - c.availble = true + c.runChecks = true } } } From 8da18b60f3eb69532148a8ab658a0cb1b29ee8b9 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 21:36:23 -0700 Subject: [PATCH 021/166] Alpha sort config keys for Consul --- command/agent/config_parse.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index a5a7e713afa..ee291428f03 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -598,19 +598,19 @@ func parseConsulConfig(result **config.ConsulConfig, list *ast.ObjectList) error // Check for invalid keys valid := []string{ - "server_service_name", - "client_service_name", - "auto_register", "addr", - "token", "auth", - "ssl", - "verify_ssl", + "auto_register", "ca_file", "cert_file", - "key_file", "client_auto_join", + "client_service_name", + "key_file", "server_auto_join", + "server_service_name", + "ssl", + "token", + "verify_ssl", } if err := checkHCLKeys(listVal, valid); err != nil { From 89d48c8e827731d265e491ba851e60c5b851979c Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 21:43:09 -0700 Subject: [PATCH 022/166] Revise Agent.syncAgentServicesWithConsul()'s interface Reduce down to its lowest common helper function denominator. 
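The runChecks flag renamed a few commits back acts as a log-once toggle: the first failed sync is logged, later failures stay quiet, and a successful sync re-enables checks. A small sketch of that behaviour follows (syncState is an illustrative name, not a Nomad type).

package main

import (
	"errors"
	"fmt"
)

// syncState captures the runChecks toggle: log the first failure, stay quiet
// while failures continue, and re-enable checks after a successful sync.
type syncState struct {
	runChecks bool
}

func (s *syncState) afterSync(err error) {
	if err != nil {
		if s.runChecks {
			fmt.Println("sync failed, disabling checks:", err)
		}
		s.runChecks = false
		return
	}
	s.runChecks = true
}

func main() {
	s := &syncState{runChecks: true}
	s.afterSync(errors.New("consul unavailable")) // logged once
	s.afterSync(errors.New("consul unavailable")) // quiet
	s.afterSync(nil)                              // re-enabled
	fmt.Println("checks enabled:", s.runChecks)
}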
--- command/agent/agent.go | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 8381029f580..39476882681 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -38,8 +38,8 @@ type Agent struct { // consulSyncer registers the Nomad agent with the Consul Agent consulSyncer *consul.Syncer - serverHTTPAddr string - clientHTTPAddr string + serverHttpAddr string + clientHttpAddr string server *nomad.Server client *client.Client @@ -77,7 +77,7 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { return nil, fmt.Errorf("must have at least client or server mode enabled") } if a.config.Consul.AutoRegister { - if err := a.syncAgentServicesWithConsul(a.serverHTTPAddr, a.clientHTTPAddr); err != nil { + if err := a.syncAgentServicesWithConsul(); err != nil { a.logger.Printf("[ERR] agent: unable to sync agent services with consul: %v", err) } if a.consulSyncer != nil { @@ -169,13 +169,13 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { } if a.config.AdvertiseAddrs.HTTP != "" { - a.serverHTTPAddr = a.config.AdvertiseAddrs.HTTP + a.serverHttpAddr = a.config.AdvertiseAddrs.HTTP } else if a.config.Addresses.HTTP != "" { - a.serverHTTPAddr = fmt.Sprintf("%v:%v", a.config.Addresses.HTTP, a.config.Ports.HTTP) + a.serverHttpAddr = fmt.Sprintf("%v:%v", a.config.Addresses.HTTP, a.config.Ports.HTTP) } else if a.config.BindAddr != "" { - a.serverHTTPAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.HTTP) + a.serverHttpAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.HTTP) } else { - a.serverHTTPAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) + a.serverHttpAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) } if gcThreshold := a.config.Server.NodeGCThreshold; gcThreshold != "" { @@ -263,7 +263,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { httpAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) } conf.Node.HTTPAddr = httpAddr - a.clientHTTPAddr = httpAddr + a.clientHttpAddr = httpAddr // Reserve resources on the node. r := conf.Node.Reserved @@ -519,34 +519,26 @@ func (a *Agent) setupConsulSyncer(shutdownCh types.ShutdownChannel) (err error) // syncAgentServicesWithConsul syncs this Nomad Agent's services with Consul // when running in either Client or Server mode. 
-func (a *Agent) syncAgentServicesWithConsul(clientHttpAddr string, serverHttpAddr string) error { - cs, err := consul.NewSyncer(a.consulConfig, a.logger) - if err != nil { - return err - } - a.consulSyncer = cs +func (a *Agent) syncAgentServicesWithConsul() error { var services []*structs.Service if a.client != nil && a.config.Consul.ClientServiceName != "" { - if err != nil { - return err - } clientService := &structs.Service{ Name: a.config.Consul.ClientServiceName, - PortLabel: clientHttpAddr, + PortLabel: a.clientHttpAddr, } services = append(services, clientService) - cs.SetServiceIdentifier("agent-client") + a.consulSyncer.SetServiceIdentifier("agent-client") } if a.server != nil && a.config.Consul.ServerServiceName != "" { serverService := &structs.Service{ Name: a.config.Consul.ServerServiceName, - PortLabel: serverHttpAddr, + PortLabel: a.serverHttpAddr, } services = append(services, serverService) - cs.SetServiceIdentifier("agent-server") + a.consulSyncer.SetServiceIdentifier("agent-server") } - cs.SetAddrFinder(func(portLabel string) (string, int) { + a.consulSyncer.SetAddrFinder(func(portLabel string) (string, int) { host, port, err := net.SplitHostPort(portLabel) if err != nil { return "", 0 @@ -564,5 +556,5 @@ func (a *Agent) syncAgentServicesWithConsul(clientHttpAddr string, serverHttpAdd return host, p }) - return cs.SyncServices(services) + return a.consulSyncer.SyncServices(services) } From c159e77fe35eeaecdb224ec9caaf44ce07e1ce3b Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 21:45:13 -0700 Subject: [PATCH 023/166] An Agent's consulSyncer is always not nil now. --- command/agent/agent.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 39476882681..0eb9cd03bfd 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -80,10 +80,8 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { if err := a.syncAgentServicesWithConsul(); err != nil { a.logger.Printf("[ERR] agent: unable to sync agent services with consul: %v", err) } - if a.consulSyncer != nil { - go a.consulSyncer.Run() - } } + go a.consulSyncer.Run() return a, nil } From 17927c8b9cdecfa3332864a008a74acbc4d00f49 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 23:23:57 -0700 Subject: [PATCH 024/166] Only poll Consul for servers when Nomad heartbeats begin to fail When a deadline timer of 2x Server's last requested TTL expires, begin polling Consul for Nomad Servers. --- client/client.go | 154 +++++++++++++++++++++++++++++++----------- client/consul/sync.go | 108 ++++++++++++++++++++++++++--- 2 files changed, 212 insertions(+), 50 deletions(-) diff --git a/client/client.go b/client/client.go index f3bd5d2be43..9e268b8ead5 100644 --- a/client/client.go +++ b/client/client.go @@ -4,13 +4,16 @@ import ( "fmt" "io/ioutil" "log" + "net" "os" "path/filepath" "strconv" "sync" + "sync/atomic" "time" "github.com/armon/go-metrics" + consulapi "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/lib" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/allocdir" @@ -76,6 +79,13 @@ const ( // consulSyncInterval is the interval at which the client syncs with consul // to remove services and checks which are no longer valid consulSyncInterval = 15 * time.Second + + // consulSyncDelay specifies the initial sync delay when starting the + // Nomad Agent's consul.Syncer. 
+ consulSyncDelay = 5 * time.Second + + // Add a little jitter to the agent's consul.Syncer task + consulSyncJitter = 8 ) // DefaultConfig returns the default configuration @@ -113,6 +123,12 @@ type Client struct { configCopy *config.Config configLock sync.RWMutex + // backupServerDeadline is the deadline at which this Nomad Agent + // will begin polling Consul for a list of Nomad Servers. When Nomad + // Clients are heartbeating successfully with Nomad Servers, Nomad + // Clients do not poll Consul for a backup server list. + backupServerDeadline time.Time + logger *log.Logger rpcProxy *rpc_proxy.RpcProxy @@ -132,6 +148,7 @@ type Client struct { // consulSyncer advertises this Nomad Agent with Consul consulSyncer *consul.Syncer + consulLock int64 // HostStatsCollector collects host resource usage stats hostStatsCollector *stats.HostStatsCollector @@ -205,9 +222,9 @@ func NewClient(cfg *config.Config) (*Client, error) { return nil, fmt.Errorf("failed to restore state: %v", err) } - // Setup the consul client - if err := c.setupConsulClient(); err != nil { - return nil, fmt.Errorf("failed to create consul client: %v") + // Setup the Consul syncer + if err := c.setupConsulSyncer(); err != nil { + return nil, fmt.Errorf("failed to create Consul syncer: %v") } // Register and then start heartbeating to the servers. @@ -228,8 +245,8 @@ func NewClient(cfg *config.Config) (*Client, error) { // Start maintenance task for servers go c.rpcProxy.Run() - // Start the consul sync - go c.syncConsul() + // Start the Consul sync + go c.runClientConsulSyncer() return c, nil } @@ -902,6 +919,7 @@ func (c *Client) updateNodeStatus() error { if err := c.rpcProxy.UpdateFromNodeUpdateResponse(&resp); err != nil { return err } + c.backupServerDeadline = time.Now().Add(2 * resp.HeartbeatTTL) return nil } @@ -1212,53 +1230,111 @@ func (c *Client) addAlloc(alloc *structs.Allocation) error { return nil } -// setupConsulClient creates a consul.Syncer -func (c *Client) setupConsulClient() error { - cs, err := consul.NewSyncer(c.config.ConsulAgentConfig, c.logger) +// setupConsulSyncer creates a consul.Syncer +func (c *Client) setupConsulSyncer() error { + cs, err := consul.NewSyncer(c.config.ConsulConfig, c.logger) + if err != nil { + return err + } + c.consulSyncer = cs - return err + + // Callback handler used to periodically poll Consul in the event + // there are no Nomad Servers available and the Nomad Agent is in a + // bootstrap situation. 
+ fn := func() { + now := time.Now() + c.configLock.RLock() + if now.Before(c.backupServerDeadline) { + c.configLock.RUnlock() + return + } + c.configLock.RUnlock() + + nomadServerServiceName := c.config.ConsulConfig.ServerServiceName + services, _, err := c.consulSyncer.ConsulClient().Catalog().Service(nomadServerServiceName, "", &consulapi.QueryOptions{AllowStale: true}) + if err != nil { + c.logger.Printf("[WARN] client: unable to query service %q: %v", nomadServerServiceName, err) + return + } + serverAddrs := make([]string, 0, len(services)) + for _, s := range services { + port := strconv.FormatInt(int64(s.ServicePort), 10) + addr := s.ServiceAddress + if addr == "" { + addr = s.Address + } + serverAddrs = append(serverAddrs, net.JoinHostPort(addr, port)) + } + c.rpcProxy.SetBackupServers(serverAddrs) + } + + const handlerName = "Nomad Client Fallback Server Handler" + c.consulSyncer.AddPeriodicHandler(handlerName, fn) + return nil } -// syncConsul removes services of tasks which are no longer in running state -func (c *Client) syncConsul() { - sync := time.NewTicker(consulSyncInterval) +// runClientConsulSyncer runs the consul.Syncer task in the Nomad Agent's +// context. This is primarily responsible for removing tasks which are no +// longer in running state. +func (c *Client) runClientConsulSyncer() { + d := consulSyncDelay + lib.RandomStagger(consulSyncInterval-consulSyncDelay) + c.logger.Printf("[DEBUG] consul.sync: sleeping %v before first sync", d) + sync := time.NewTimer(d) for { select { case <-sync.C: - // Give up pruning services if we can't fingerprint Consul + fn := func() { + defer atomic.StoreInt64(&c.consulLock, 0) + + d = consulSyncInterval - lib.RandomStagger(consulSyncInterval/consulSyncJitter) + sync.Reset(d) + + // Run syncer handlers regardless of this + // Agent's client or server status. + c.consulSyncer.RunHandlers() + + // Give up pruning services if we can't + // fingerprint our Consul Agent. 
+ c.configLock.RLock() + _, ok := c.configCopy.Node.Attributes["consul.version"] + c.configLock.RUnlock() + if !ok { + return + } - c.configLock.RLock() - _, ok := c.configCopy.Node.Attributes["consul.server"] - c.configLock.RUnlock() - if !ok { - continue - } - services := make(map[string]struct{}) - // Get the existing allocs - c.allocLock.RLock() - allocs := make([]*AllocRunner, 0, len(c.allocs)) - for _, ar := range c.allocs { - allocs = append(allocs, ar) - } - c.allocLock.RUnlock() - for _, ar := range allocs { - ar.taskStatusLock.RLock() - taskStates := copyTaskStates(ar.taskStates) - ar.taskStatusLock.RUnlock() - for taskName, taskState := range taskStates { - if taskState.State == structs.TaskStateRunning { - if tr, ok := ar.tasks[taskName]; ok { - for _, service := range tr.task.Services { - svcIdentifier := consul.GenerateServiceIdentifier(ar.alloc.ID, tr.task.Name) - services[service.ID(svcIdentifier)] = struct{}{} + services := make(map[string]struct{}) + // Get the existing allocs + c.allocLock.RLock() + allocs := make([]*AllocRunner, 0, len(c.allocs)) + for _, ar := range c.allocs { + allocs = append(allocs, ar) + } + c.allocLock.RUnlock() + + for _, ar := range allocs { + ar.taskStatusLock.RLock() + taskStates := copyTaskStates(ar.taskStates) + ar.taskStatusLock.RUnlock() + for taskName, taskState := range taskStates { + if taskState.State == structs.TaskStateRunning { + if tr, ok := ar.tasks[taskName]; ok { + for _, service := range tr.task.Services { + svcIdentifier := fmt.Sprintf("%s-%s", ar.alloc.ID, tr.task.Name) + services[service.ID(svcIdentifier)] = struct{}{} + } } } } } + + if err := c.consulSyncer.KeepServices(services); err != nil { + c.logger.Printf("[DEBUG] client: error removing services from non-running tasks: %v", err) + } } - if err := c.consulSyncer.KeepServices(services); err != nil { - c.logger.Printf("[DEBUG] client: error removing services from non-running tasks: %v", err) + if atomic.CompareAndSwapInt64(&c.consulLock, 0, 1) { + go fn() } case <-c.shutdownCh: sync.Stop() diff --git a/client/consul/sync.go b/client/consul/sync.go index 556de1b8522..f97b3444989 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -12,11 +12,17 @@ import ( "time" consul "github.com/hashicorp/consul/api" + "github.com/hashicorp/consul/lib" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/structs/config" + "github.com/hashicorp/nomad/nomad/types" ) +type notifyEvent struct{} +type notifyChannel chan notifyEvent + // Syncer allows syncing of services and checks with Consul type Syncer struct { client *consul.Client @@ -37,12 +43,28 @@ type Syncer struct { shutdown bool shutdownLock sync.Mutex + // periodicCallbacks is walked sequentially when the timer in Run + // fires. + periodicCallbacks map[string]types.PeriodicCallback + notifySyncCh notifyChannel + periodicLock sync.RWMutex } const ( + // initialSyncBuffer is the max time an initial sync will sleep + // before syncing. + initialSyncBuffer = 30 * time.Second + + // initialSyncDelay is the delay before an initial sync. + initialSyncDelay = 5 * time.Second + // The periodic time interval for syncing services and checks with Consul syncInterval = 5 * time.Second + // syncJitter provides a little variance in the frequency at which + // Syncer polls Consul. 
+ syncJitter = 8 + // ttlCheckBuffer is the time interval that Nomad can take to report Consul // the check result ttlCheckBuffer = 31 * time.Second @@ -102,13 +124,13 @@ func NewSyncer(config *config.ConsulConfig, logger *log.Logger) (*Syncer, error) return nil, err } consulSyncer := Syncer{ - client: c, - logger: logger, - trackedServices: make(map[string]*consul.AgentService), - trackedChecks: make(map[string]*consul.AgentCheckRegistration), - checkRunners: make(map[string]*CheckRunner), - + client: c, + logger: logger, + trackedServices: make(map[string]*consul.AgentService), + trackedChecks: make(map[string]*consul.AgentCheckRegistration), + checkRunners: make(map[string]*CheckRunner), shutdownCh: make(types.ShutdownChannel), + periodicCallbacks: make(map[string]types.PeriodicCallback), } return &consulSyncer, nil } @@ -133,7 +155,16 @@ func (c *Syncer) SetServiceIdentifier(serviceIdentifier string) *Syncer { return c } -// SyncServices sync the services with consul +// SyncNow expires the current timer forcing the list of periodic callbacks +// to be synced immediately. +func (c *Syncer) SyncNow() { + select { + case c.notifySyncCh <- notifyEvent{}: + default: + } +} + +// SyncServices sync the services with the Consul Agent func (c *Syncer) SyncServices(services []*structs.Service) error { var mErr multierror.Error taskServices := make(map[string]*consul.AgentService) @@ -340,31 +371,54 @@ func (c *Syncer) deregisterCheck(ID string) error { return c.client.Agent().CheckDeregister(ID) } - sync := time.NewTicker(syncInterval) // Run triggers periodic syncing of services and checks with Consul. This is // a long lived go-routine which is stopped during shutdown. func (c *Syncer) Run() { + d := initialSyncDelay + lib.RandomStagger(initialSyncBuffer-initialSyncDelay) + sync := time.NewTimer(d) + c.logger.Printf("[DEBUG] consul.sync: sleeping %v before first sync", d) + for { select { case <-sync.C: + d = syncInterval - lib.RandomStagger(syncInterval/syncJitter) + sync.Reset(d) + if err := c.performSync(); err != nil { if c.runChecks { - c.logger.Printf("[DEBUG] consul: error in syncing services for %q: %v", c.serviceIdentifier, err) + c.logger.Printf("[DEBUG] consul.sync: disabling checks until Consul sync completes for %q: %v", c.serviceIdentifier, err) } c.runChecks = false } else { c.runChecks = true } + case <-c.notifySyncCh: + sync.Reset(syncInterval) case <-c.shutdownCh: sync.Stop() - c.logger.Printf("[INFO] consul: shutting down sync for %q", c.serviceIdentifier) + c.logger.Printf("[INFO] consul.sync: shutting down sync for %q", c.serviceIdentifier) return } } } +// RunHandlers executes each handler (randomly) +func (c *Syncer) RunHandlers() { + c.periodicLock.RLock() + handlers := make(map[string]types.PeriodicCallback, len(c.periodicCallbacks)) + for name, fn := range c.periodicCallbacks { + handlers[name] = fn + } + c.periodicLock.RUnlock() + for name, fn := range handlers { + fn() + } +} + // performSync sync the services and checks we are tracking with Consul. 
func (c *Syncer) performSync() error { + c.RunHandlers() + var mErr multierror.Error cServices, err := c.client.Agent().Services() if err != nil { @@ -448,7 +502,7 @@ func (c *Syncer) runCheck(check Check) { } if err := c.client.Agent().UpdateTTL(check.ID(), output, state); err != nil { if c.runChecks { - c.logger.Printf("[DEBUG] consul.sync: error updating ttl check for check %q: %v", check.ID(), err) + c.logger.Printf("[DEBUG] consul.sync: check %q failed, disabling Consul checks until until next successful sync: %v", check.ID(), err) c.runChecks = false } else { c.runChecks = true @@ -461,3 +515,35 @@ func (c *Syncer) runCheck(check Check) { func GenerateServiceIdentifier(allocID string, taskName string) string { return fmt.Sprintf("%s-%s", taskName, allocID) } + +// AddPeriodicHandler adds a uniquely named callback. Returns true if +// successful, false if a handler with the same name already exists. +func (c *Syncer) AddPeriodicHandler(name string, fn types.PeriodicCallback) bool { + c.periodicLock.Lock() + defer c.periodicLock.Unlock() + c.logger.Printf("[DEBUG] consul.sync: adding handler named %s", name) + if _, found := c.periodicCallbacks[name]; found { + c.logger.Printf("[ERROR] consul.sync: failed adding handler %q", name) + return false + } + c.periodicCallbacks[name] = fn + c.logger.Printf("[DEBUG] consul.sync: successfully added handler %q", name) + return true +} + +func (c *Syncer) NumHandlers() int { + c.periodicLock.RLock() + defer c.periodicLock.RUnlock() + return len(c.periodicCallbacks) +} + +// RemovePeriodicHandler removes a handler with a given name. +func (c *Syncer) RemovePeriodicHandler(name string) { + c.periodicLock.Lock() + defer c.periodicLock.Unlock() + delete(c.periodicCallbacks, name) +} + +func (c *Syncer) ConsulClient() *consul.Client { + return c.client +} From 4e4f0a1625dd63a29db6a83ba0c43eb18edcd2d8 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 23:28:22 -0700 Subject: [PATCH 025/166] Remove unused code --- client/rpc_proxy/rpc_proxy.go | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/client/rpc_proxy/rpc_proxy.go b/client/rpc_proxy/rpc_proxy.go index d98ef05057a..586a8923caa 100644 --- a/client/rpc_proxy/rpc_proxy.go +++ b/client/rpc_proxy/rpc_proxy.go @@ -132,10 +132,6 @@ type RpcProxy struct { // notifyFailedBarrier is acts as a barrier to prevent queuing behind // serverListLock and acts as a TryLock(). notifyFailedBarrier int32 - - // consulLock is the lock to prevent concurrent access to Consul from - // an RpcProxy instance. - consulLock int32 } // activateEndpoint adds an endpoint to the RpcProxy's active serverList. @@ -615,25 +611,6 @@ func (p *RpcProxy) Run() { p.RebalanceServers() p.refreshServerRebalanceTimer() - - // Perform Consul operations asynchronously, but in a - // singleton to prevent this task from stacking - // during the next heartbeat if Consul is slow or - // unavailable. - if atomic.CompareAndSwapInt32(&p.consulLock, 0, 1) { - go func() { - // TODO(sean@): Talk w/ Consul and - // append any servers it has to our - // server list. Need access to the - // Consul Config agent out of Client - // in order to poll (or create our - // own parallel client using the - // existing consul config). 
- p.logger.Printf("[DEBUG] Polling Consul for servers in the nomad-server list") - defer atomic.StoreInt32(&p.consulLock, 0) - }() - } - case <-p.shutdownCh: p.logger.Printf("[INFO] RPC Proxy: shutting down") return From fc5658521c8584ae598d1fbe8efbc468979d93b2 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 23:28:42 -0700 Subject: [PATCH 026/166] index on f-dyn-server-list: bd38cb4 Remove unused code --- client/consul/sync.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index f97b3444989..d53c4a869d8 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -410,7 +410,7 @@ func (c *Syncer) RunHandlers() { handlers[name] = fn } c.periodicLock.RUnlock() - for name, fn := range handlers { + for _, fn := range handlers { fn() } } From 05bbbd2d55d4400fafff8a96f44c967e38d7073a Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 24 May 2016 02:17:20 -0700 Subject: [PATCH 027/166] Reconcile renamed structures during rebase --- command/agent/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/agent/config.go b/command/agent/config.go index e1033257b53..e9f2cce42d4 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -362,7 +362,7 @@ func DevConfig() *Config { conf.DevMode = true conf.EnableDebug = true conf.DisableAnonymousSignature = true - conf.ConsulConfig.AutoRegister = true + conf.Consul.AutoRegister = true if runtime.GOOS == "darwin" { conf.Client.NetworkInterface = "lo0" } else if runtime.GOOS == "linux" { From 90023c3639f001d234659c919fec1bec028cf285 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 24 May 2016 02:34:20 -0700 Subject: [PATCH 028/166] Rename RpcProxy's internal filename to match --- .../{manager_internal_test.go => rpc_proxy_internal_test.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename client/rpc_proxy/{manager_internal_test.go => rpc_proxy_internal_test.go} (100%) diff --git a/client/rpc_proxy/manager_internal_test.go b/client/rpc_proxy/rpc_proxy_internal_test.go similarity index 100% rename from client/rpc_proxy/manager_internal_test.go rename to client/rpc_proxy/rpc_proxy_internal_test.go From 681a7d69304e7fc1a43b52b5a40f370c33b604e5 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 24 May 2016 02:34:46 -0700 Subject: [PATCH 029/166] Remove testing for an API that doesn't exist any more. 
--- api/agent_test.go | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/api/agent_test.go b/api/agent_test.go index 09e8c0cadc4..0e764dbd2f4 100644 --- a/api/agent_test.go +++ b/api/agent_test.go @@ -124,39 +124,6 @@ func TestAgent_ForceLeave(t *testing.T) { // TODO: test force-leave on an existing node } -func TestAgent_SetServers(t *testing.T) { - c, s := makeClient(t, nil, func(c *testutil.TestServerConfig) { - c.Client.Enabled = true - c.Server.BootstrapExpect = 0 - }) - defer s.Stop() - a := c.Agent() - - // Attempting to set an empty list errors - err := a.SetServers([]string{}) - if err == nil { - t.Fatalf("expected error, got nothing") - } - - // Setting a valid list works - err = a.SetServers([]string{"foo", "bar"}) - if err != nil { - t.Fatalf("err: %s", err) - } - - // Returns the proper list of servers - out, err := a.Servers() - if err != nil { - t.Fatalf("err: %s", err) - } - if n := len(out); n != 2 { - t.Fatalf("expected 2 servers, got: %d", n) - } - if out[0] != "foo:4647" || out[1] != "bar:4647" { - t.Fatalf("bad server list: %v", out) - } -} - func (a *AgentMember) String() string { return "{Name: " + a.Name + " Region: " + a.Tags["region"] + " DC: " + a.Tags["dc"] + "}" } From 7125b857443d4d8d6610e8fa412cb0b7f403f39a Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 09:17:32 -0700 Subject: [PATCH 030/166] Correct a mismerge Somewhere along the lines when rebasing I mis-merged a patch. --- client/rpc_proxy/rpc_proxy.go | 1 + 1 file changed, 1 insertion(+) diff --git a/client/rpc_proxy/rpc_proxy.go b/client/rpc_proxy/rpc_proxy.go index 586a8923caa..d5796caac8e 100644 --- a/client/rpc_proxy/rpc_proxy.go +++ b/client/rpc_proxy/rpc_proxy.go @@ -182,6 +182,7 @@ func (p *RpcProxy) SetBackupServers(addrs []string) error { p.logger.Printf("[WARN] RPC Proxy: unable to create backup server %q: %v", s, err) return fmt.Errorf("unable to create new backup server from %q: %v", s, err) } + l = append(l, s) } p.serverListLock.Lock() From 71dde1b5844589e65523f34636406ab11225fca0 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 13:26:08 -0700 Subject: [PATCH 031/166] Reconcile consul's address configuration section. There were conflicting directives previously, both consul.addr and consul.address were required to achieve the desired behavior. The documentation said `consul.address` was the canonical name for the parameter, so consolidate configuration parameters to `consul.address`. 
--- command/agent/config_parse.go | 2 +- nomad/structs/config/consul.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index ee291428f03..cc3c1bcb707 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -598,7 +598,7 @@ func parseConsulConfig(result **config.ConsulConfig, list *ast.ObjectList) error // Check for invalid keys valid := []string{ - "addr", + "address", "auth", "auto_register", "ca_file", diff --git a/nomad/structs/config/consul.go b/nomad/structs/config/consul.go index 3b0989c73a6..7727a3c0bb2 100644 --- a/nomad/structs/config/consul.go +++ b/nomad/structs/config/consul.go @@ -23,7 +23,7 @@ type ConsulConfig struct { AutoRegister bool `mapstructure:"auto_register"` // Addr is the address of the local Consul agent - Addr string `mapstructure:"addr"` + Addr string `mapstructure:"address"` // Token is used to provide a per-request ACL token.This options overrides // the agent's default token From bc94ce8aa6f56d1ff8e74981bb6253900664bca5 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 13:28:45 -0700 Subject: [PATCH 032/166] Rename `mergedNomadMap` to `mergedPrimaryMap`. Gratuitous, but more correct. --- client/rpc_proxy/rpc_proxy.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/client/rpc_proxy/rpc_proxy.go b/client/rpc_proxy/rpc_proxy.go index d5796caac8e..efcc881f124 100644 --- a/client/rpc_proxy/rpc_proxy.go +++ b/client/rpc_proxy/rpc_proxy.go @@ -651,10 +651,10 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse // 'n' == new state byte } - mergedNomadMap := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(resp.Servers)) + mergedPrimaryMap := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(resp.Servers)) numOldServers := 0 for _, s := range p.primaryServers.L { - mergedNomadMap[*s.Key()] = &targetServer{server: s, state: 'o'} + mergedPrimaryMap[*s.Key()] = &targetServer{server: s, state: 'o'} numOldServers++ } numBothServers := 0 @@ -685,12 +685,12 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse } k := server.Key() - _, found := mergedNomadMap[*k] + _, found := mergedPrimaryMap[*k] if found { - mergedNomadMap[*k].state = 'b' + mergedPrimaryMap[*k].state = 'b' numBothServers++ } else { - mergedNomadMap[*k] = &targetServer{server: server, state: 'n'} + mergedPrimaryMap[*k] = &targetServer{server: server, state: 'n'} newServers = true } } @@ -703,7 +703,7 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse p.listLock.Lock() defer p.listLock.Unlock() newServerCfg := p.getServerList() - for k, v := range mergedNomadMap { + for k, v := range mergedPrimaryMap { switch v.state { case 'b': // Do nothing, server exists in both From e6397da738e11dfad7f4abe13dfa941c6d5b9ae0 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 13:31:32 -0700 Subject: [PATCH 033/166] Centralize the creation of a consul/api.Config struct. While documented, the consul.timeout parameter wasn't ever set except one-off in the Consul fingerprinter. 
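As a rough usage sketch (not part of this patch; the literal values and error
handling are illustrative only), a caller such as the Consul fingerprinter can
now derive its API client from the shared config instead of assembling a
consul/api.Config by hand:

    package main

    import (
        "log"
        "time"

        consul "github.com/hashicorp/consul/api"
        "github.com/hashicorp/nomad/nomad/structs/config"
    )

    func main() {
        // Example values; DefaultConfig() supplies the real defaults
        // (e.g. the 500ms timeout introduced in this patch).
        cfg := &config.ConsulConfig{
            Addr:    "127.0.0.1:8500",
            Timeout: 500 * time.Millisecond,
        }

        // ApiConfig() maps the Nomad-level settings, including the HTTP
        // client timeout, onto a consul/api.Config.
        apiCfg, err := cfg.ApiConfig()
        if err != nil {
            log.Fatalf("failed to build Consul client config: %v", err)
        }

        if _, err := consul.NewClient(apiCfg); err != nil {
            log.Fatalf("failed to initialize Consul client: %v", err)
        }
    }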
--- client/fingerprint/consul.go | 9 ++------- command/agent/config.go | 1 + command/agent/config_parse.go | 1 + nomad/structs/config/consul.go | 30 ++++++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/client/fingerprint/consul.go b/client/fingerprint/consul.go index b15f7bac399..14932460149 100644 --- a/client/fingerprint/consul.go +++ b/client/fingerprint/consul.go @@ -38,16 +38,11 @@ func (f *ConsulFingerprint) Fingerprint(config *client.Config, node *structs.Nod // Only create the client once to avoid creating too many connections to // Consul. if f.client == nil { - address := config.ReadDefault("consul.address", "127.0.0.1:8500") - timeout, err := time.ParseDuration(config.ReadDefault("consul.timeout", "10ms")) + consulConfig, err := config.ConsulConfig.ApiConfig() if err != nil { - return false, fmt.Errorf("Unable to parse consul.timeout: %s", err) + return false, fmt.Errorf("Failed to initialize the Consul client config: %v", err) } - consulConfig := consul.DefaultConfig() - consulConfig.Address = address - consulConfig.HttpClient.Timeout = timeout - f.client, err = consul.NewClient(consulConfig) if err != nil { return false, fmt.Errorf("Failed to initialize consul client: %s", err) diff --git a/command/agent/config.go b/command/agent/config.go index e9f2cce42d4..8804d0b7ab2 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -394,6 +394,7 @@ func DefaultConfig() *Config { ServerServiceName: "nomad-server", ClientServiceName: "nomad-client", AutoRegister: true, + Timeout: 500 * time.Millisecond, }, Client: &ClientConfig{ Enabled: false, diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index cc3c1bcb707..8b4d1504618 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -609,6 +609,7 @@ func parseConsulConfig(result **config.ConsulConfig, list *ast.ObjectList) error "server_auto_join", "server_service_name", "ssl", + "timeout", "token", "verify_ssl", } diff --git a/nomad/structs/config/consul.go b/nomad/structs/config/consul.go index 7727a3c0bb2..57d5a6a509e 100644 --- a/nomad/structs/config/consul.go +++ b/nomad/structs/config/consul.go @@ -1,5 +1,11 @@ package config +import ( + "time" + + consul "github.com/hashicorp/consul/api" +) + // ConsulConfig contains the configuration information necessary to // communicate with a Consul Agent in order to: // @@ -25,6 +31,9 @@ type ConsulConfig struct { // Addr is the address of the local Consul agent Addr string `mapstructure:"address"` + // Timeout is used by Consul HTTP Client + Timeout time.Duration `mapstructure:"timeout"` + // Token is used to provide a per-request ACL token.This options overrides // the agent's default token Token string `mapstructure:"token"` @@ -73,6 +82,9 @@ func (a *ConsulConfig) Merge(b *ConsulConfig) *ConsulConfig { if b.Addr != "" { result.Addr = b.Addr } + if b.Timeout != 0 { + result.Timeout = b.Timeout + } if b.Token != "" { result.Token = b.Token } @@ -102,3 +114,21 @@ func (a *ConsulConfig) Merge(b *ConsulConfig) *ConsulConfig { } return &result } + +// ApiConfig() returns a usable Consul config that can be passed directly to +// hashicorp/consul/api. 
NOTE: datacenter is not set +func (c *ConsulConfig) ApiConfig() (*consul.Config, error) { + config := consul.DefaultConfig() + if c.Addr != "" { + config.Address = c.Addr + } + if c.Token != "" { + config.Token = c.Token + } + + if c.Timeout != 0 { + config.HttpClient.Timeout = c.Timeout + } + + return config, nil +} From 990a094bed5982af4ee3b25a6564e8a8bcbc28df Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 3 May 2016 00:18:11 -0700 Subject: [PATCH 034/166] Use `rand.Int*n()` where appropriate --- nomad/eval_broker.go | 2 +- nomad/rpc.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nomad/eval_broker.go b/nomad/eval_broker.go index d91ee824488..96060b6c327 100644 --- a/nomad/eval_broker.go +++ b/nomad/eval_broker.go @@ -348,7 +348,7 @@ func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation default: // Multiple tasks. We pick a random task so that we fairly // distribute work. - offset := rand.Intn(n) + offset := rand.Int63n(n) return b.dequeueForSched(eligibleSched[offset]) } } diff --git a/nomad/rpc.go b/nomad/rpc.go index 26b94489b12..3d233921594 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -232,7 +232,7 @@ func (s *Server) forwardRegion(region, method string, args interface{}, reply in } // Select a random addr - offset := rand.Intn(len(servers)) + offset := rand.Int31n(len(servers)) server := servers[offset] s.peerLock.RUnlock() From 07fa0c58d4df2cc4412820da05136619d6fc0735 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 3 May 2016 00:29:23 -0700 Subject: [PATCH 035/166] Use the correctly typed `rand.Int*` variant --- nomad/eval_broker.go | 2 +- nomad/rpc.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nomad/eval_broker.go b/nomad/eval_broker.go index 96060b6c327..d91ee824488 100644 --- a/nomad/eval_broker.go +++ b/nomad/eval_broker.go @@ -348,7 +348,7 @@ func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation default: // Multiple tasks. We pick a random task so that we fairly // distribute work. - offset := rand.Int63n(n) + offset := rand.Intn(n) return b.dequeueForSched(eligibleSched[offset]) } } diff --git a/nomad/rpc.go b/nomad/rpc.go index 3d233921594..26b94489b12 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -232,7 +232,7 @@ func (s *Server) forwardRegion(region, method string, args interface{}, reply in } // Select a random addr - offset := rand.Int31n(len(servers)) + offset := rand.Intn(len(servers)) server := servers[offset] s.peerLock.RUnlock() From 5e0d76f7ce2479db131a913825a02964c58ac02d Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sat, 21 May 2016 16:37:21 -0500 Subject: [PATCH 036/166] Rename client/config/config's ConsulConfig to ConsulAgentConfig A follow up commit to the previous rename. More to come. 
--- command/agent/agent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 0eb9cd03bfd..0efd1b7fa51 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -278,7 +278,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { conf.Version = fmt.Sprintf("%s%s", a.config.Version, a.config.VersionPrerelease) conf.Revision = a.config.Revision - conf.ConsulConfig = a.consulConfig + conf.ConsulAgentConfig = a.consulConfig conf.StatsDataPoints = a.config.Client.StatsConfig.DataPoints conf.StatsCollectionInterval = a.config.Client.StatsConfig.collectionInterval From 4d30c98aa6c734bdce1acf24fa6ce29ece1c8f49 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sat, 21 May 2016 16:51:22 -0500 Subject: [PATCH 037/166] Rename consulConfig to consulAgentConfig --- command/agent/agent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 0efd1b7fa51..903fd996265 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -278,7 +278,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { conf.Version = fmt.Sprintf("%s%s", a.config.Version, a.config.VersionPrerelease) conf.Revision = a.config.Revision - conf.ConsulAgentConfig = a.consulConfig + conf.ConsulAgentConfig = a.consulAgentConfig conf.StatsDataPoints = a.config.Client.StatsConfig.DataPoints conf.StatsCollectionInterval = a.config.Client.StatsConfig.collectionInterval From 6fae3db12e6f6f116573a652f0adb5529fb41eb8 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Sat, 21 May 2016 17:03:46 -0500 Subject: [PATCH 038/166] Rename ConsulConfig to ConsulAgentConfig --- client/driver/utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/driver/utils.go b/client/driver/utils.go index 5559e65e542..db43a2879e3 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -86,7 +86,7 @@ func consulContext(clientConfig *config.Config, containerID string) *executor.Co ClientServiceName: clientConfig.ReadDefault("consul.client_service_name", "nomad-client"), } return &executor.ConsulContext{ - ConsulConfig: &cfg, + ConsulAgentConfig: &cfg, ContainerID: containerID, DockerEndpoint: clientConfig.Read("docker.endpoint"), TLSCa: clientConfig.Read("docker.tls.ca"), From 16b4e5c9bed1f21766ff28fde8bd49279a7b1d9c Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 11:09:31 -0700 Subject: [PATCH 039/166] Rebalance Nomad client RPCs among different Nomad servers. Implement client/rpc_proxy.RpcProxy. 
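For context, the client-side call pattern this converges on looks roughly like
the sketch below (the wrapper name is illustrative; the real code is the RPC
method in client/client.go, updated later in this series): pick a server from
the proxy, issue the RPC over the connection pool, and notify the proxy on
failure so the endpoint is cycled to the back of the list.

    // Fragment of client code, not a standalone program.
    func (c *Client) rpcViaProxy(method string, args, reply interface{}) error {
        server := c.rpcProxy.FindServer()
        if server == nil {
            return fmt.Errorf("no known Nomad servers")
        }
        if err := c.connPool.RPC(c.Region(), server.Addr, c.RPCVersion(), method, args, reply); err != nil {
            // Cycle the failed server so the next attempt hits a different one.
            c.rpcProxy.NotifyFailedServer(server)
            return err
        }
        return nil
    }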
--- client/driver/utils.go | 10 +- client/rpc_proxy/manager_internal_test.go | 353 ++++++++++++++++++++++ client/rpc_proxy/rpc_proxy.go | 23 ++ 3 files changed, 381 insertions(+), 5 deletions(-) create mode 100644 client/rpc_proxy/manager_internal_test.go diff --git a/client/driver/utils.go b/client/driver/utils.go index db43a2879e3..51789685dd6 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -87,11 +87,11 @@ func consulContext(clientConfig *config.Config, containerID string) *executor.Co } return &executor.ConsulContext{ ConsulAgentConfig: &cfg, - ContainerID: containerID, - DockerEndpoint: clientConfig.Read("docker.endpoint"), - TLSCa: clientConfig.Read("docker.tls.ca"), - TLSCert: clientConfig.Read("docker.tls.cert"), - TLSKey: clientConfig.Read("docker.tls.key"), + ContainerID: containerID, + DockerEndpoint: clientConfig.Read("docker.endpoint"), + TLSCa: clientConfig.Read("docker.tls.ca"), + TLSCert: clientConfig.Read("docker.tls.cert"), + TLSKey: clientConfig.Read("docker.tls.key"), } } diff --git a/client/rpc_proxy/manager_internal_test.go b/client/rpc_proxy/manager_internal_test.go new file mode 100644 index 00000000000..271d056a375 --- /dev/null +++ b/client/rpc_proxy/manager_internal_test.go @@ -0,0 +1,353 @@ +package rpc_proxy + +import ( + "bytes" + "fmt" + "log" + "math/rand" + "os" + "testing" + "time" +) + +var ( + localLogger *log.Logger + localLogBuffer *bytes.Buffer +) + +func init() { + localLogBuffer = new(bytes.Buffer) + localLogger = log.New(localLogBuffer, "", 0) +} + +func GetBufferedLogger() *log.Logger { + return localLogger +} + +type fauxConnPool struct { + // failPct between 0.0 and 1.0 == pct of time a Ping should fail + failPct float64 +} + +func (cp *fauxConnPool) PingNomadServer(region string, version int, s *ServerEndpoint) (bool, error) { + var success bool + successProb := rand.Float64() + if successProb > cp.failPct { + success = true + } + return success, nil +} + +type fauxSerf struct { + numNodes int +} + +func (s *fauxSerf) NumNodes() int { + return s.numNodes +} + +func (s *fauxSerf) Region() string { + return "global" +} + +func (s *fauxSerf) RPCVersion() int { + return 1 +} + +func testManager() (p *RpcProxy) { + logger := GetBufferedLogger() + shutdownCh := make(chan struct{}) + p = NewRpcProxy(logger, shutdownCh, &fauxSerf{numNodes: 16384}, &fauxConnPool{}) + return p +} + +func testManagerFailProb(failPct float64) (p *RpcProxy) { + logger := GetBufferedLogger() + logger = log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + p = NewRpcProxy(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) + return p +} + +// func (l *serverList) cycleServer() (servers []*Server) { +func TestManagerInternal_cycleServer(t *testing.T) { + m := testManager() + l := m.getServerList() + + server0 := &ServerEndpoint{Name: "server1"} + server1 := &ServerEndpoint{Name: "server2"} + server2 := &ServerEndpoint{Name: "server3"} + l.L = append(l.L, server0, server1, server2) + m.saveServerList(l) + + l = m.getServerList() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server0 && + l.L[1] != server1 && + l.L[2] != server2 { + t.Fatalf("initial server ordering not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server1 && + l.L[1] != server2 && + l.L[2] != server0 { + t.Fatalf("server ordering after one cycle not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + 
t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server2 && + l.L[1] != server0 && + l.L[2] != server1 { + t.Fatalf("server ordering after two cycles not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server0 && + l.L[1] != server1 && + l.L[2] != server2 { + t.Fatalf("server ordering after three cycles not correct") + } +} + +// func (m *Manager) getServerList() serverList { +func TestManagerInternal_getServerList(t *testing.T) { + m := testManager() + l := m.getServerList() + if l.L == nil { + t.Fatalf("serverList.servers nil") + } + + if len(l.L) != 0 { + t.Fatalf("serverList.servers length not zero") + } +} + +func TestManagerInternal_NewManager(t *testing.T) { + m := testManager() + if m == nil { + t.Fatalf("Manager nil") + } + + if m.logger == nil { + t.Fatalf("Manager.logger nil") + } + + if m.shutdownCh == nil { + t.Fatalf("Manager.shutdownCh nil") + } +} + +// func (m *Manager) reconcileServerList(l *serverList) bool { +func TestManagerInternal_reconcileServerList(t *testing.T) { + tests := []int{0, 1, 2, 3, 4, 5, 10, 100} + for _, n := range tests { + ok, err := test_reconcileServerList(n) + if !ok { + t.Errorf("Expected %d to pass: %v", n, err) + } + } +} + +func test_reconcileServerList(maxServers int) (bool, error) { + // Build a server list, reconcile, verify the missing servers are + // missing, the added have been added, and the original server is + // present. + const failPct = 0.5 + m := testManagerFailProb(failPct) + + var failedServers, healthyServers []*ServerEndpoint + for i := 0; i < maxServers; i++ { + nodeName := fmt.Sprintf("s%02d", i) + + node := &ServerEndpoint{Name: nodeName} + // Add 66% of servers to Manager + if rand.Float64() > 0.33 { + m.activateEndpoint(node) + + // Of healthy servers, (ab)use connPoolPinger to + // failPct of the servers for the reconcile. This + // allows for the selected server to no longer be + // healthy for the reconcile below. + if ok, _ := m.connPoolPinger.PingNomadServer(m.configInfo.Region(), m.configInfo.RPCVersion(), node); ok { + // Will still be present + healthyServers = append(healthyServers, node) + } else { + // Will be missing + failedServers = append(failedServers, node) + } + } else { + // Will be added from the call to reconcile + healthyServers = append(healthyServers, node) + } + } + + // Randomize Manager's server list + m.RebalanceServers() + selectedServer := m.FindServer() + + var selectedServerFailed bool + for _, s := range failedServers { + if selectedServer.Key().Equal(s.Key()) { + selectedServerFailed = true + break + } + } + + // Update Manager's server list to be "healthy" based on Serf. + // Reconcile this with origServers, which is shuffled and has a live + // connection, but possibly out of date. + origServers := m.getServerList() + m.saveServerList(serverList{L: healthyServers}) + + // This should always succeed with non-zero server lists + if !selectedServerFailed && !m.reconcileServerList(&origServers) && + len(m.getServerList().L) != 0 && + len(origServers.L) != 0 { + // If the random gods are unfavorable and we end up with zero + // length lists, expect things to fail and retry the test. + return false, fmt.Errorf("Expected reconcile to succeed: %v %d %d", + selectedServerFailed, + len(m.getServerList().L), + len(origServers.L)) + } + + // If we have zero-length server lists, test succeeded in degenerate + // case. 
+ if len(m.getServerList().L) == 0 && + len(origServers.L) == 0 { + // Failed as expected w/ zero length list + return true, nil + } + + resultingServerMap := make(map[EndpointKey]bool) + for _, s := range m.getServerList().L { + resultingServerMap[*s.Key()] = true + } + + // Test to make sure no failed servers are in the Manager's + // list. Error if there are any failedServers in l.servers + for _, s := range failedServers { + _, ok := resultingServerMap[*s.Key()] + if ok { + return false, fmt.Errorf("Found failed server %v in merged list %v", s, resultingServerMap) + } + } + + // Test to make sure all healthy servers are in the healthy list. + if len(healthyServers) != len(m.getServerList().L) { + return false, fmt.Errorf("Expected healthy map and servers to match: %d/%d", len(healthyServers), len(healthyServers)) + } + + // Test to make sure all healthy servers are in the resultingServerMap list. + for _, s := range healthyServers { + _, ok := resultingServerMap[*s.Key()] + if !ok { + return false, fmt.Errorf("Server %v missing from healthy map after merged lists", s) + } + } + return true, nil +} + +// func (l *serverList) refreshServerRebalanceTimer() { +func TestManagerInternal_refreshServerRebalanceTimer(t *testing.T) { + type clusterSizes struct { + numNodes int + numServers int + minRebalance time.Duration + } + clusters := []clusterSizes{ + {0, 3, 2 * time.Minute}, + {1, 0, 2 * time.Minute}, // partitioned cluster + {1, 3, 2 * time.Minute}, + {2, 3, 2 * time.Minute}, + {100, 0, 2 * time.Minute}, // partitioned + {100, 1, 2 * time.Minute}, // partitioned + {100, 3, 2 * time.Minute}, + {1024, 1, 2 * time.Minute}, // partitioned + {1024, 3, 2 * time.Minute}, // partitioned + {1024, 5, 2 * time.Minute}, + {16384, 1, 4 * time.Minute}, // partitioned + {16384, 2, 2 * time.Minute}, // partitioned + {16384, 3, 2 * time.Minute}, // partitioned + {16384, 5, 2 * time.Minute}, + {65535, 0, 2 * time.Minute}, // partitioned + {65535, 1, 8 * time.Minute}, // partitioned + {65535, 2, 3 * time.Minute}, // partitioned + {65535, 3, 5 * time.Minute}, // partitioned + {65535, 5, 3 * time.Minute}, // partitioned + {65535, 7, 2 * time.Minute}, + {1000000, 1, 4 * time.Hour}, // partitioned + {1000000, 2, 2 * time.Hour}, // partitioned + {1000000, 3, 80 * time.Minute}, // partitioned + {1000000, 5, 50 * time.Minute}, // partitioned + {1000000, 11, 20 * time.Minute}, // partitioned + {1000000, 19, 10 * time.Minute}, + } + + logger := log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + + for _, s := range clusters { + m := NewRpcProxy(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) + for i := 0; i < s.numServers; i++ { + nodeName := fmt.Sprintf("s%02d", i) + m.activateEndpoint(&ServerEndpoint{Name: nodeName}) + } + + d := m.refreshServerRebalanceTimer() + if d < s.minRebalance { + t.Errorf("duration too short for cluster of size %d and %d servers (%s < %s)", s.numNodes, s.numServers, d, s.minRebalance) + } + } +} + +// func (m *Manager) saveServerList(l serverList) { +func TestManagerInternal_saveServerList(t *testing.T) { + m := testManager() + + // Initial condition + func() { + l := m.getServerList() + if len(l.L) != 0 { + t.Fatalf("Manager.saveServerList failed to load init config") + } + + newServer := new(ServerEndpoint) + l.L = append(l.L, newServer) + m.saveServerList(l) + }() + + // Test that save works + func() { + l1 := m.getServerList() + t1NumServers := len(l1.L) + if t1NumServers != 1 { + t.Fatalf("Manager.saveServerList failed to save mutated 
config") + } + }() + + // Verify mutation w/o a save doesn't alter the original + func() { + newServer := new(ServerEndpoint) + l := m.getServerList() + l.L = append(l.L, newServer) + + l_orig := m.getServerList() + origNumServers := len(l_orig.L) + if origNumServers >= len(l.L) { + t.Fatalf("Manager.saveServerList unsaved config overwrote original") + } + }() +} diff --git a/client/rpc_proxy/rpc_proxy.go b/client/rpc_proxy/rpc_proxy.go index efcc881f124..2bf0c171d3d 100644 --- a/client/rpc_proxy/rpc_proxy.go +++ b/client/rpc_proxy/rpc_proxy.go @@ -132,6 +132,10 @@ type RpcProxy struct { // notifyFailedBarrier is acts as a barrier to prevent queuing behind // serverListLock and acts as a TryLock(). notifyFailedBarrier int32 + + // consulLock is the lock to prevent concurrent access to Consul from + // an RpcProxy instance. + consulLock int32 } // activateEndpoint adds an endpoint to the RpcProxy's active serverList. @@ -612,6 +616,25 @@ func (p *RpcProxy) Run() { p.RebalanceServers() p.refreshServerRebalanceTimer() + + // Perform Consul operations asynchronously, but in a + // singleton to prevent this task from stacking + // during the next heartbeat if Consul is slow or + // unavailable. + if atomic.CompareAndSwapInt32(&p.consulLock, 0, 1) { + go func() { + // TODO(sean@): Talk w/ Consul and + // append any servers it has to our + // server list. Need access to the + // Consul Config agent out of Client + // in order to poll (or create our + // own parallel client using the + // existing consul config). + p.logger.Printf("[DEBUG] Polling Consul for servers in the nomad-server list") + defer atomic.StoreInt32(&p.consulLock, 0) + }() + } + case <-p.shutdownCh: p.logger.Printf("[INFO] RPC Proxy: shutting down") return From 1fa43a09ddf244fd2740a5b74512afa874b6f86a Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 21:28:12 -0700 Subject: [PATCH 040/166] Reduce all forms of ConsulConfig down to a single struct nomad/structs/config/consul.go's ConsulConfig is the canonical definition for all things Consul now. 
--- client/driver/utils.go | 12 ++++++------ command/agent/agent.go | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/client/driver/utils.go b/client/driver/utils.go index 51789685dd6..5559e65e542 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -86,12 +86,12 @@ func consulContext(clientConfig *config.Config, containerID string) *executor.Co ClientServiceName: clientConfig.ReadDefault("consul.client_service_name", "nomad-client"), } return &executor.ConsulContext{ - ConsulAgentConfig: &cfg, - ContainerID: containerID, - DockerEndpoint: clientConfig.Read("docker.endpoint"), - TLSCa: clientConfig.Read("docker.tls.ca"), - TLSCert: clientConfig.Read("docker.tls.cert"), - TLSKey: clientConfig.Read("docker.tls.key"), + ConsulConfig: &cfg, + ContainerID: containerID, + DockerEndpoint: clientConfig.Read("docker.endpoint"), + TLSCa: clientConfig.Read("docker.tls.ca"), + TLSCert: clientConfig.Read("docker.tls.cert"), + TLSKey: clientConfig.Read("docker.tls.key"), } } diff --git a/command/agent/agent.go b/command/agent/agent.go index 903fd996265..0eb9cd03bfd 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -278,7 +278,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { conf.Version = fmt.Sprintf("%s%s", a.config.Version, a.config.VersionPrerelease) conf.Revision = a.config.Revision - conf.ConsulAgentConfig = a.consulAgentConfig + conf.ConsulConfig = a.consulConfig conf.StatsDataPoints = a.config.Client.StatsConfig.DataPoints conf.StatsCollectionInterval = a.config.Client.StatsConfig.collectionInterval From 3728771060ce358b2455268bfc27e19d902cd7f7 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Mon, 23 May 2016 23:28:22 -0700 Subject: [PATCH 041/166] Remove unused code --- client/rpc_proxy/rpc_proxy.go | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/client/rpc_proxy/rpc_proxy.go b/client/rpc_proxy/rpc_proxy.go index 2bf0c171d3d..efcc881f124 100644 --- a/client/rpc_proxy/rpc_proxy.go +++ b/client/rpc_proxy/rpc_proxy.go @@ -132,10 +132,6 @@ type RpcProxy struct { // notifyFailedBarrier is acts as a barrier to prevent queuing behind // serverListLock and acts as a TryLock(). notifyFailedBarrier int32 - - // consulLock is the lock to prevent concurrent access to Consul from - // an RpcProxy instance. - consulLock int32 } // activateEndpoint adds an endpoint to the RpcProxy's active serverList. @@ -616,25 +612,6 @@ func (p *RpcProxy) Run() { p.RebalanceServers() p.refreshServerRebalanceTimer() - - // Perform Consul operations asynchronously, but in a - // singleton to prevent this task from stacking - // during the next heartbeat if Consul is slow or - // unavailable. - if atomic.CompareAndSwapInt32(&p.consulLock, 0, 1) { - go func() { - // TODO(sean@): Talk w/ Consul and - // append any servers it has to our - // server list. Need access to the - // Consul Config agent out of Client - // in order to poll (or create our - // own parallel client using the - // existing consul config). 
- p.logger.Printf("[DEBUG] Polling Consul for servers in the nomad-server list") - defer atomic.StoreInt32(&p.consulLock, 0) - }() - } - case <-p.shutdownCh: p.logger.Printf("[INFO] RPC Proxy: shutting down") return From a703c8a42f3fe7d89160087e34f6995f3d778904 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 24 May 2016 02:34:20 -0700 Subject: [PATCH 042/166] Rename RpcProxy's internal filename to match --- client/rpc_proxy/manager_internal_test.go | 353 ---------------------- 1 file changed, 353 deletions(-) delete mode 100644 client/rpc_proxy/manager_internal_test.go diff --git a/client/rpc_proxy/manager_internal_test.go b/client/rpc_proxy/manager_internal_test.go deleted file mode 100644 index 271d056a375..00000000000 --- a/client/rpc_proxy/manager_internal_test.go +++ /dev/null @@ -1,353 +0,0 @@ -package rpc_proxy - -import ( - "bytes" - "fmt" - "log" - "math/rand" - "os" - "testing" - "time" -) - -var ( - localLogger *log.Logger - localLogBuffer *bytes.Buffer -) - -func init() { - localLogBuffer = new(bytes.Buffer) - localLogger = log.New(localLogBuffer, "", 0) -} - -func GetBufferedLogger() *log.Logger { - return localLogger -} - -type fauxConnPool struct { - // failPct between 0.0 and 1.0 == pct of time a Ping should fail - failPct float64 -} - -func (cp *fauxConnPool) PingNomadServer(region string, version int, s *ServerEndpoint) (bool, error) { - var success bool - successProb := rand.Float64() - if successProb > cp.failPct { - success = true - } - return success, nil -} - -type fauxSerf struct { - numNodes int -} - -func (s *fauxSerf) NumNodes() int { - return s.numNodes -} - -func (s *fauxSerf) Region() string { - return "global" -} - -func (s *fauxSerf) RPCVersion() int { - return 1 -} - -func testManager() (p *RpcProxy) { - logger := GetBufferedLogger() - shutdownCh := make(chan struct{}) - p = NewRpcProxy(logger, shutdownCh, &fauxSerf{numNodes: 16384}, &fauxConnPool{}) - return p -} - -func testManagerFailProb(failPct float64) (p *RpcProxy) { - logger := GetBufferedLogger() - logger = log.New(os.Stderr, "", log.LstdFlags) - shutdownCh := make(chan struct{}) - p = NewRpcProxy(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) - return p -} - -// func (l *serverList) cycleServer() (servers []*Server) { -func TestManagerInternal_cycleServer(t *testing.T) { - m := testManager() - l := m.getServerList() - - server0 := &ServerEndpoint{Name: "server1"} - server1 := &ServerEndpoint{Name: "server2"} - server2 := &ServerEndpoint{Name: "server3"} - l.L = append(l.L, server0, server1, server2) - m.saveServerList(l) - - l = m.getServerList() - if len(l.L) != 3 { - t.Fatalf("server length incorrect: %d/3", len(l.L)) - } - if l.L[0] != server0 && - l.L[1] != server1 && - l.L[2] != server2 { - t.Fatalf("initial server ordering not correct") - } - - l.L = l.cycleServer() - if len(l.L) != 3 { - t.Fatalf("server length incorrect: %d/3", len(l.L)) - } - if l.L[0] != server1 && - l.L[1] != server2 && - l.L[2] != server0 { - t.Fatalf("server ordering after one cycle not correct") - } - - l.L = l.cycleServer() - if len(l.L) != 3 { - t.Fatalf("server length incorrect: %d/3", len(l.L)) - } - if l.L[0] != server2 && - l.L[1] != server0 && - l.L[2] != server1 { - t.Fatalf("server ordering after two cycles not correct") - } - - l.L = l.cycleServer() - if len(l.L) != 3 { - t.Fatalf("server length incorrect: %d/3", len(l.L)) - } - if l.L[0] != server0 && - l.L[1] != server1 && - l.L[2] != server2 { - t.Fatalf("server ordering after three cycles not correct") - } -} - -// func (m 
*Manager) getServerList() serverList { -func TestManagerInternal_getServerList(t *testing.T) { - m := testManager() - l := m.getServerList() - if l.L == nil { - t.Fatalf("serverList.servers nil") - } - - if len(l.L) != 0 { - t.Fatalf("serverList.servers length not zero") - } -} - -func TestManagerInternal_NewManager(t *testing.T) { - m := testManager() - if m == nil { - t.Fatalf("Manager nil") - } - - if m.logger == nil { - t.Fatalf("Manager.logger nil") - } - - if m.shutdownCh == nil { - t.Fatalf("Manager.shutdownCh nil") - } -} - -// func (m *Manager) reconcileServerList(l *serverList) bool { -func TestManagerInternal_reconcileServerList(t *testing.T) { - tests := []int{0, 1, 2, 3, 4, 5, 10, 100} - for _, n := range tests { - ok, err := test_reconcileServerList(n) - if !ok { - t.Errorf("Expected %d to pass: %v", n, err) - } - } -} - -func test_reconcileServerList(maxServers int) (bool, error) { - // Build a server list, reconcile, verify the missing servers are - // missing, the added have been added, and the original server is - // present. - const failPct = 0.5 - m := testManagerFailProb(failPct) - - var failedServers, healthyServers []*ServerEndpoint - for i := 0; i < maxServers; i++ { - nodeName := fmt.Sprintf("s%02d", i) - - node := &ServerEndpoint{Name: nodeName} - // Add 66% of servers to Manager - if rand.Float64() > 0.33 { - m.activateEndpoint(node) - - // Of healthy servers, (ab)use connPoolPinger to - // failPct of the servers for the reconcile. This - // allows for the selected server to no longer be - // healthy for the reconcile below. - if ok, _ := m.connPoolPinger.PingNomadServer(m.configInfo.Region(), m.configInfo.RPCVersion(), node); ok { - // Will still be present - healthyServers = append(healthyServers, node) - } else { - // Will be missing - failedServers = append(failedServers, node) - } - } else { - // Will be added from the call to reconcile - healthyServers = append(healthyServers, node) - } - } - - // Randomize Manager's server list - m.RebalanceServers() - selectedServer := m.FindServer() - - var selectedServerFailed bool - for _, s := range failedServers { - if selectedServer.Key().Equal(s.Key()) { - selectedServerFailed = true - break - } - } - - // Update Manager's server list to be "healthy" based on Serf. - // Reconcile this with origServers, which is shuffled and has a live - // connection, but possibly out of date. - origServers := m.getServerList() - m.saveServerList(serverList{L: healthyServers}) - - // This should always succeed with non-zero server lists - if !selectedServerFailed && !m.reconcileServerList(&origServers) && - len(m.getServerList().L) != 0 && - len(origServers.L) != 0 { - // If the random gods are unfavorable and we end up with zero - // length lists, expect things to fail and retry the test. - return false, fmt.Errorf("Expected reconcile to succeed: %v %d %d", - selectedServerFailed, - len(m.getServerList().L), - len(origServers.L)) - } - - // If we have zero-length server lists, test succeeded in degenerate - // case. - if len(m.getServerList().L) == 0 && - len(origServers.L) == 0 { - // Failed as expected w/ zero length list - return true, nil - } - - resultingServerMap := make(map[EndpointKey]bool) - for _, s := range m.getServerList().L { - resultingServerMap[*s.Key()] = true - } - - // Test to make sure no failed servers are in the Manager's - // list. 
Error if there are any failedServers in l.servers - for _, s := range failedServers { - _, ok := resultingServerMap[*s.Key()] - if ok { - return false, fmt.Errorf("Found failed server %v in merged list %v", s, resultingServerMap) - } - } - - // Test to make sure all healthy servers are in the healthy list. - if len(healthyServers) != len(m.getServerList().L) { - return false, fmt.Errorf("Expected healthy map and servers to match: %d/%d", len(healthyServers), len(healthyServers)) - } - - // Test to make sure all healthy servers are in the resultingServerMap list. - for _, s := range healthyServers { - _, ok := resultingServerMap[*s.Key()] - if !ok { - return false, fmt.Errorf("Server %v missing from healthy map after merged lists", s) - } - } - return true, nil -} - -// func (l *serverList) refreshServerRebalanceTimer() { -func TestManagerInternal_refreshServerRebalanceTimer(t *testing.T) { - type clusterSizes struct { - numNodes int - numServers int - minRebalance time.Duration - } - clusters := []clusterSizes{ - {0, 3, 2 * time.Minute}, - {1, 0, 2 * time.Minute}, // partitioned cluster - {1, 3, 2 * time.Minute}, - {2, 3, 2 * time.Minute}, - {100, 0, 2 * time.Minute}, // partitioned - {100, 1, 2 * time.Minute}, // partitioned - {100, 3, 2 * time.Minute}, - {1024, 1, 2 * time.Minute}, // partitioned - {1024, 3, 2 * time.Minute}, // partitioned - {1024, 5, 2 * time.Minute}, - {16384, 1, 4 * time.Minute}, // partitioned - {16384, 2, 2 * time.Minute}, // partitioned - {16384, 3, 2 * time.Minute}, // partitioned - {16384, 5, 2 * time.Minute}, - {65535, 0, 2 * time.Minute}, // partitioned - {65535, 1, 8 * time.Minute}, // partitioned - {65535, 2, 3 * time.Minute}, // partitioned - {65535, 3, 5 * time.Minute}, // partitioned - {65535, 5, 3 * time.Minute}, // partitioned - {65535, 7, 2 * time.Minute}, - {1000000, 1, 4 * time.Hour}, // partitioned - {1000000, 2, 2 * time.Hour}, // partitioned - {1000000, 3, 80 * time.Minute}, // partitioned - {1000000, 5, 50 * time.Minute}, // partitioned - {1000000, 11, 20 * time.Minute}, // partitioned - {1000000, 19, 10 * time.Minute}, - } - - logger := log.New(os.Stderr, "", log.LstdFlags) - shutdownCh := make(chan struct{}) - - for _, s := range clusters { - m := NewRpcProxy(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) - for i := 0; i < s.numServers; i++ { - nodeName := fmt.Sprintf("s%02d", i) - m.activateEndpoint(&ServerEndpoint{Name: nodeName}) - } - - d := m.refreshServerRebalanceTimer() - if d < s.minRebalance { - t.Errorf("duration too short for cluster of size %d and %d servers (%s < %s)", s.numNodes, s.numServers, d, s.minRebalance) - } - } -} - -// func (m *Manager) saveServerList(l serverList) { -func TestManagerInternal_saveServerList(t *testing.T) { - m := testManager() - - // Initial condition - func() { - l := m.getServerList() - if len(l.L) != 0 { - t.Fatalf("Manager.saveServerList failed to load init config") - } - - newServer := new(ServerEndpoint) - l.L = append(l.L, newServer) - m.saveServerList(l) - }() - - // Test that save works - func() { - l1 := m.getServerList() - t1NumServers := len(l1.L) - if t1NumServers != 1 { - t.Fatalf("Manager.saveServerList failed to save mutated config") - } - }() - - // Verify mutation w/o a save doesn't alter the original - func() { - newServer := new(ServerEndpoint) - l := m.getServerList() - l.L = append(l.L, newServer) - - l_orig := m.getServerList() - origNumServers := len(l_orig.L) - if origNumServers >= len(l.L) { - t.Fatalf("Manager.saveServerList unsaved config overwrote 
original") - } - }() -} From 22bd2b5ef2a96464975d72a4324a41e84e2b7db3 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 15:15:54 -0700 Subject: [PATCH 043/166] Rename manager_test.go to rpc_proxy_test.go --- client/rpc_proxy/{manager_test.go => rpc_proxy_test.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename client/rpc_proxy/{manager_test.go => rpc_proxy_test.go} (100%) diff --git a/client/rpc_proxy/manager_test.go b/client/rpc_proxy/rpc_proxy_test.go similarity index 100% rename from client/rpc_proxy/manager_test.go rename to client/rpc_proxy/rpc_proxy_test.go From 7cdf0edcfefe8343d247c6b23fcd13f2d12605f1 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 16:15:40 -0700 Subject: [PATCH 044/166] Rename the package from `client/rpc_proxy` to `client/rpcproxy` Also rename `NewRpcProxy()` to just `New()` to avoid package stutter. --- client/client.go | 8 ++++---- client/{rpc_proxy/rpc_proxy.go => rpcproxy/rpcproxy.go} | 6 +++--- .../rpcproxy_internal_test.go} | 8 ++++---- .../rpc_proxy_test.go => rpcproxy/rpcproxy_test.go} | 2 +- client/{rpc_proxy => rpcproxy}/server_endpoint.go | 2 +- command/agent/agent_test.go | 6 +++--- nomad/pool.go | 4 ++-- 7 files changed, 18 insertions(+), 18 deletions(-) rename client/{rpc_proxy/rpc_proxy.go => rpcproxy/rpcproxy.go} (99%) rename client/{rpc_proxy/rpc_proxy_internal_test.go => rpcproxy/rpcproxy_internal_test.go} (97%) rename client/{rpc_proxy/rpc_proxy_test.go => rpcproxy/rpcproxy_test.go} (99%) rename client/{rpc_proxy => rpcproxy}/server_endpoint.go (98%) diff --git a/client/client.go b/client/client.go index 9e268b8ead5..3a017297205 100644 --- a/client/client.go +++ b/client/client.go @@ -21,7 +21,7 @@ import ( "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/driver" "github.com/hashicorp/nomad/client/fingerprint" - "github.com/hashicorp/nomad/client/rpc_proxy" + "github.com/hashicorp/nomad/client/rpcproxy" "github.com/hashicorp/nomad/client/stats" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" @@ -131,7 +131,7 @@ type Client struct { logger *log.Logger - rpcProxy *rpc_proxy.RpcProxy + rpcProxy *rpcproxy.RpcProxy connPool *nomad.ConnPool @@ -208,7 +208,7 @@ func NewClient(cfg *config.Config) (*Client, error) { // Create the RPC Proxy and bootstrap with the preconfigured list of // static servers - c.rpcProxy = rpc_proxy.NewRpcProxy(c.logger, c.shutdownCh, c, c.connPool) + c.rpcProxy = rpcproxy.New(c.logger, c.shutdownCh, c, c.connPool) for _, serverAddr := range c.config.Servers { c.rpcProxy.AddPrimaryServer(serverAddr) } @@ -1396,6 +1396,6 @@ func (c *Client) emitStats(hStats *stats.HostStats) { } } -func (c *Client) RpcProxy() *rpc_proxy.RpcProxy { +func (c *Client) RpcProxy() *rpcproxy.RpcProxy { return c.rpcProxy } diff --git a/client/rpc_proxy/rpc_proxy.go b/client/rpcproxy/rpcproxy.go similarity index 99% rename from client/rpc_proxy/rpc_proxy.go rename to client/rpcproxy/rpcproxy.go index efcc881f124..4c8d50f6f07 100644 --- a/client/rpc_proxy/rpc_proxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -6,7 +6,7 @@ // // The servers package does not provide any external API guarantees and // should be called only by `hashicorp/nomad`. -package rpc_proxy +package rpcproxy import ( "fmt" @@ -303,8 +303,8 @@ func (p *RpcProxy) LeaderAddr() string { return p.leaderAddr } -// NewRpcProxy is the only way to safely create a new RpcProxy. 
-func NewRpcProxy(logger *log.Logger, shutdownCh chan struct{}, configInfo NomadConfigInfo, connPoolPinger Pinger) (p *RpcProxy) { +// New is the only way to safely create a new RpcProxy. +func New(logger *log.Logger, shutdownCh chan struct{}, configInfo NomadConfigInfo, connPoolPinger Pinger) (p *RpcProxy) { p = new(RpcProxy) p.logger = logger p.configInfo = configInfo // can't pass *nomad.Client: import cycle diff --git a/client/rpc_proxy/rpc_proxy_internal_test.go b/client/rpcproxy/rpcproxy_internal_test.go similarity index 97% rename from client/rpc_proxy/rpc_proxy_internal_test.go rename to client/rpcproxy/rpcproxy_internal_test.go index 271d056a375..473d78e9639 100644 --- a/client/rpc_proxy/rpc_proxy_internal_test.go +++ b/client/rpcproxy/rpcproxy_internal_test.go @@ -1,4 +1,4 @@ -package rpc_proxy +package rpcproxy import ( "bytes" @@ -57,7 +57,7 @@ func (s *fauxSerf) RPCVersion() int { func testManager() (p *RpcProxy) { logger := GetBufferedLogger() shutdownCh := make(chan struct{}) - p = NewRpcProxy(logger, shutdownCh, &fauxSerf{numNodes: 16384}, &fauxConnPool{}) + p = New(logger, shutdownCh, &fauxSerf{numNodes: 16384}, &fauxConnPool{}) return p } @@ -65,7 +65,7 @@ func testManagerFailProb(failPct float64) (p *RpcProxy) { logger := GetBufferedLogger() logger = log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - p = NewRpcProxy(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) + p = New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) return p } @@ -300,7 +300,7 @@ func TestManagerInternal_refreshServerRebalanceTimer(t *testing.T) { shutdownCh := make(chan struct{}) for _, s := range clusters { - m := NewRpcProxy(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) + m := New(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) for i := 0; i < s.numServers; i++ { nodeName := fmt.Sprintf("s%02d", i) m.activateEndpoint(&ServerEndpoint{Name: nodeName}) diff --git a/client/rpc_proxy/rpc_proxy_test.go b/client/rpcproxy/rpcproxy_test.go similarity index 99% rename from client/rpc_proxy/rpc_proxy_test.go rename to client/rpcproxy/rpcproxy_test.go index dc8eed6d23b..b14e90eb870 100644 --- a/client/rpc_proxy/rpc_proxy_test.go +++ b/client/rpcproxy/rpcproxy_test.go @@ -1,4 +1,4 @@ -package rpc_proxy_test +package rpcproxy_test import ( "bytes" diff --git a/client/rpc_proxy/server_endpoint.go b/client/rpcproxy/server_endpoint.go similarity index 98% rename from client/rpc_proxy/server_endpoint.go rename to client/rpcproxy/server_endpoint.go index 34ae322fd64..f7c356c39ee 100644 --- a/client/rpc_proxy/server_endpoint.go +++ b/client/rpcproxy/server_endpoint.go @@ -1,4 +1,4 @@ -package rpc_proxy +package rpcproxy import ( "fmt" diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 6893c39c888..36a4aae9416 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -119,7 +119,7 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.RPCAdvertise; addr.IP.String() != "127.0.0.1" || addr.Port != 4001 { t.Fatalf("bad rpc advertise addr: %#v", addr) } - if addr := a.serverHTTPAddr; addr != "10.10.11.1:4005" { + if addr := a.serverHttpAddr; addr != "10.10.11.1:4005" { t.Fatalf("expect 10.11.11.1:4005, got: %v", addr) } @@ -155,7 +155,7 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.SerfConfig.MemberlistConfig.BindAddr; addr != "127.0.0.2" { t.Fatalf("expect 127.0.0.2, got: %s", addr) } - if addr := a.serverHTTPAddr; addr != "127.0.0.2:4646" { + if 
addr := a.serverHttpAddr; addr != "127.0.0.2:4646" { t.Fatalf("expect 127.0.0.3:4646, got: %s", addr) } @@ -195,7 +195,7 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.SerfConfig.MemberlistConfig.BindAddr; addr != "127.0.0.3" { t.Fatalf("expect 127.0.0.3, got: %s", addr) } - if addr := a.serverHTTPAddr; addr != "127.0.0.3:4646" { + if addr := a.serverHttpAddr; addr != "127.0.0.3:4646" { t.Fatalf("expect 127.0.0.3:4646, got: %s", addr) } diff --git a/nomad/pool.go b/nomad/pool.go index 50158b2a133..2881b6e6dda 100644 --- a/nomad/pool.go +++ b/nomad/pool.go @@ -12,7 +12,7 @@ import ( "github.com/hashicorp/consul/tlsutil" "github.com/hashicorp/net-rpc-msgpackrpc" - "github.com/hashicorp/nomad/client/rpc_proxy" + "github.com/hashicorp/nomad/client/rpcproxy" "github.com/hashicorp/yamux" ) @@ -376,7 +376,7 @@ func (p *ConnPool) RPC(region string, addr net.Addr, version int, method string, // PingNomadServer sends a Status.Ping message to the specified server and // returns true if healthy, false if an error occurred -func (p *ConnPool) PingNomadServer(region string, version int, s *rpc_proxy.ServerEndpoint) (bool, error) { +func (p *ConnPool) PingNomadServer(region string, version int, s *rpcproxy.ServerEndpoint) (bool, error) { // Get a usable client conn, sc, err := p.getClient(region, s.Addr, version) if err != nil { From 0a5fa55e8f652d726d9d4109b3709cde9acaaaff Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 17:03:12 -0700 Subject: [PATCH 045/166] Fix package name in comments --- client/rpcproxy/rpcproxy.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 4c8d50f6f07..eeb901c5dbf 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -1,4 +1,4 @@ -// Package rpc_proxy provides a proxy interface for Nomad Servers. The +// Package rpcproxy provides a proxy interface for Nomad Servers. The // RpcProxy periodically shuffles which server a Nomad Client communicates // with in order to redistribute load across Nomad Servers. Nomad Servers // that fail an RPC request are automatically cycled to the end of the list From cbed88a808d5ccbdcad9526edff849d46916d0fc Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 17:04:15 -0700 Subject: [PATCH 046/166] Move struct member to reduce diff context --- client/client.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/client/client.go b/client/client.go index 3a017297205..a532f4a84c7 100644 --- a/client/client.go +++ b/client/client.go @@ -123,14 +123,14 @@ type Client struct { configCopy *config.Config configLock sync.RWMutex + logger *log.Logger + // backupServerDeadline is the deadline at which this Nomad Agent // will begin polling Consul for a list of Nomad Servers. When Nomad // Clients are heartbeating successfully with Nomad Servers, Nomad // Clients do not poll Consul for a backup server list. 
backupServerDeadline time.Time - logger *log.Logger - rpcProxy *rpcproxy.RpcProxy connPool *nomad.ConnPool @@ -342,7 +342,7 @@ func (c *Client) RPC(method string, args interface{}, reply interface{}) error { } // Make the RPC request - if err := c.connPool.RPC(c.Region(), server.Addr, rpcVersion, method, args, reply); err != nil { + if err := c.connPool.RPC(c.Region(), server.Addr, c.RpcVersion(), method, args, reply); err != nil { c.rpcProxy.NotifyFailedServer(server) c.logger.Printf("[ERR] client: RPC failed to server %s: %v", server.Addr, err) return err From 1e3feae8f93dc4d44bdec400b367207baad7184b Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 17:04:42 -0700 Subject: [PATCH 047/166] Bump shuffle interval per suggestion from Alex --- client/rpcproxy/rpcproxy.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index eeb901c5dbf..d3af5ada3ca 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -42,7 +42,7 @@ const ( // clientRPCMinReuseDuration controls the minimum amount of time RPC // queries are sent over an established connection to a single server - clientRPCMinReuseDuration = 120 * time.Second + clientRPCMinReuseDuration = 600 * time.Second // Limit the number of new connections a server receives per second // for connection rebalancing. This limit caps the load caused by From 1034e766a481b6f4b7fecd8d698756f247ba314e Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 17:16:04 -0700 Subject: [PATCH 048/166] s/RpcVersion/RPCVersion/g --- client/client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/client.go b/client/client.go index a532f4a84c7..313670cd760 100644 --- a/client/client.go +++ b/client/client.go @@ -342,7 +342,7 @@ func (c *Client) RPC(method string, args interface{}, reply interface{}) error { } // Make the RPC request - if err := c.connPool.RPC(c.Region(), server.Addr, c.RpcVersion(), method, args, reply); err != nil { + if err := c.connPool.RPC(c.Region(), server.Addr, c.RPCVersion(), method, args, reply); err != nil { c.rpcProxy.NotifyFailedServer(server) c.logger.Printf("[ERR] client: RPC failed to server %s: %v", server.Addr, err) return err From 916d93c9ab2c4d42eaefb1533279c8f0a0ed781b Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 18:26:31 -0700 Subject: [PATCH 049/166] Bump the cluster test minimums to 10min. These ranges aren't too useful with the default 600s rebalance, but will be useful if that default ever changes in the future. 
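The link between those minimums and the 600s default is mechanical: the rebalance timer grows with cluster size but is clamped to the connection-reuse floor, so once that floor is ten minutes every row in the table bottoms out there. A minimal sketch of that clamp follows; the names and the jitter fraction are illustrative, not the rpcproxy internals.

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// nextRebalance clamps whatever interval the cluster-size math produced
// to the 600s connection-reuse floor, then adds a little jitter so
// clients do not shuffle their server lists in lockstep.
func nextRebalance(scaledInterval time.Duration) time.Duration {
	const minReuse = 600 * time.Second
	if scaledInterval < minReuse {
		scaledInterval = minReuse
	}
	// Up to 1/16th jitter on top of the clamped interval.
	return scaledInterval + time.Duration(rand.Int63n(int64(scaledInterval)/16))
}

func main() {
	// A small cluster's scaled interval is well under the floor, so the
	// timer still fires no sooner than ten minutes after the last shuffle.
	fmt.Println(nextRebalance(2 * time.Minute))
}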
--- client/rpcproxy/rpcproxy_internal_test.go | 54 +++++++++++------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/client/rpcproxy/rpcproxy_internal_test.go b/client/rpcproxy/rpcproxy_internal_test.go index 473d78e9639..494501cfc02 100644 --- a/client/rpcproxy/rpcproxy_internal_test.go +++ b/client/rpcproxy/rpcproxy_internal_test.go @@ -268,38 +268,38 @@ func TestManagerInternal_refreshServerRebalanceTimer(t *testing.T) { minRebalance time.Duration } clusters := []clusterSizes{ - {0, 3, 2 * time.Minute}, - {1, 0, 2 * time.Minute}, // partitioned cluster - {1, 3, 2 * time.Minute}, - {2, 3, 2 * time.Minute}, - {100, 0, 2 * time.Minute}, // partitioned - {100, 1, 2 * time.Minute}, // partitioned - {100, 3, 2 * time.Minute}, - {1024, 1, 2 * time.Minute}, // partitioned - {1024, 3, 2 * time.Minute}, // partitioned - {1024, 5, 2 * time.Minute}, - {16384, 1, 4 * time.Minute}, // partitioned - {16384, 2, 2 * time.Minute}, // partitioned - {16384, 3, 2 * time.Minute}, // partitioned - {16384, 5, 2 * time.Minute}, - {65535, 0, 2 * time.Minute}, // partitioned - {65535, 1, 8 * time.Minute}, // partitioned - {65535, 2, 3 * time.Minute}, // partitioned - {65535, 3, 5 * time.Minute}, // partitioned - {65535, 5, 3 * time.Minute}, // partitioned - {65535, 7, 2 * time.Minute}, - {1000000, 1, 4 * time.Hour}, // partitioned - {1000000, 2, 2 * time.Hour}, // partitioned - {1000000, 3, 80 * time.Minute}, // partitioned - {1000000, 5, 50 * time.Minute}, // partitioned - {1000000, 11, 20 * time.Minute}, // partitioned + {0, 3, 10 * time.Minute}, + {1, 0, 10 * time.Minute}, // partitioned cluster + {1, 3, 10 * time.Minute}, + {2, 3, 10 * time.Minute}, + {100, 0, 10 * time.Minute}, // partitioned + {100, 1, 10 * time.Minute}, // partitioned + {100, 3, 10 * time.Minute}, + {1024, 1, 10 * time.Minute}, // partitioned + {1024, 3, 10 * time.Minute}, // partitioned + {1024, 5, 10 * time.Minute}, + {16384, 1, 10 * time.Minute}, // partitioned + {16384, 2, 10 * time.Minute}, // partitioned + {16384, 3, 10 * time.Minute}, // partitioned + {16384, 5, 10 * time.Minute}, + {65535, 0, 10 * time.Minute}, // partitioned + {65535, 1, 10 * time.Minute}, // partitioned + {65535, 2, 10 * time.Minute}, // partitioned + {65535, 3, 10 * time.Minute}, // partitioned + {65535, 5, 10 * time.Minute}, // partitioned + {65535, 7, 10 * time.Minute}, + {1000000, 1, 10 * time.Minute}, // partitioned + {1000000, 2, 10 * time.Minute}, // partitioned + {1000000, 3, 10 * time.Minute}, // partitioned + {1000000, 5, 10 * time.Minute}, // partitioned + {1000000, 11, 10 * time.Minute}, // partitioned {1000000, 19, 10 * time.Minute}, } logger := log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - for _, s := range clusters { + for i, s := range clusters { m := New(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) for i := 0; i < s.numServers; i++ { nodeName := fmt.Sprintf("s%02d", i) @@ -308,7 +308,7 @@ func TestManagerInternal_refreshServerRebalanceTimer(t *testing.T) { d := m.refreshServerRebalanceTimer() if d < s.minRebalance { - t.Errorf("duration too short for cluster of size %d and %d servers (%s < %s)", s.numNodes, s.numServers, d, s.minRebalance) + t.Errorf("[%d] duration too short for cluster of size %d and %d servers (%s < %s)", i, s.numNodes, s.numServers, d, s.minRebalance) } } } From b293cf33c13b98501c1d93f851ab4c9800a1b488 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 26 May 2016 23:48:34 -0700 Subject: [PATCH 050/166] Fix typo in the comment to reflect the 
actual function name. --- nomad/structs/structs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 23b747e0099..a076a80379e 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -303,7 +303,7 @@ type AllocSpecificRequest struct { QueryOptions } -// AllocsGetcRequest is used to query a set of allocations +// AllocsGetRequest is used to query a set of allocations type AllocsGetRequest struct { AllocIDs []string QueryOptions From ce8c03d70823fc94a431430dd709b02a72828329 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 00:42:31 -0700 Subject: [PATCH 051/166] Advertise the server's RPC endpoint, not its HTTP endpoint. Rename serverHttpAddr to serverRpcAddr. This will be broken out into an additional set of services in a subsequent commit. --- command/agent/agent.go | 21 +++++++++++++-------- command/agent/agent_test.go | 6 +++--- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 0eb9cd03bfd..9ac4036392c 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -38,7 +38,7 @@ type Agent struct { // consulSyncer registers the Nomad agent with the Consul Agent consulSyncer *consul.Syncer - serverHttpAddr string + serverRpcAddr string clientHttpAddr string server *nomad.Server @@ -166,15 +166,20 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { conf.SerfConfig.MemberlistConfig.BindPort = port } - if a.config.AdvertiseAddrs.HTTP != "" { - a.serverHttpAddr = a.config.AdvertiseAddrs.HTTP - } else if a.config.Addresses.HTTP != "" { - a.serverHttpAddr = fmt.Sprintf("%v:%v", a.config.Addresses.HTTP, a.config.Ports.HTTP) + if a.config.AdvertiseAddrs.RPC != "" { + a.serverRpcAddr = a.config.AdvertiseAddrs.RPC + } else if a.config.Addresses.RPC != "" { + a.serverRpcAddr = fmt.Sprintf("%v:%v", a.config.Addresses.RPC, a.config.Ports.RPC) } else if a.config.BindAddr != "" { - a.serverHttpAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.HTTP) + a.serverRpcAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.RPC) } else { - a.serverHttpAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) + a.serverRpcAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.RPC) } + addr, err := net.ResolveTCPAddr("tcp", a.serverRpcAddr) + if err != nil { + return nil, fmt.Errorf("error resolving RPC addr %q: %v:", a.serverRpcAddr, err) + } + a.serverRpcAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) if gcThreshold := a.config.Server.NodeGCThreshold; gcThreshold != "" { dur, err := time.ParseDuration(gcThreshold) @@ -530,7 +535,7 @@ func (a *Agent) syncAgentServicesWithConsul() error { if a.server != nil && a.config.Consul.ServerServiceName != "" { serverService := &structs.Service{ Name: a.config.Consul.ServerServiceName, - PortLabel: a.serverHttpAddr, + PortLabel: a.serverRpcAddr, } services = append(services, serverService) a.consulSyncer.SetServiceIdentifier("agent-server") diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 36a4aae9416..ec3de72a6c7 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -119,7 +119,7 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.RPCAdvertise; addr.IP.String() != "127.0.0.1" || addr.Port != 4001 { t.Fatalf("bad rpc advertise addr: %#v", addr) } - if addr := a.serverHttpAddr; addr != "10.10.11.1:4005" { + if addr := a.serverRpcAddr; addr != "10.10.11.1:4005" { t.Fatalf("expect 10.11.11.1:4005,
got: %v", addr) } @@ -155,7 +155,7 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.SerfConfig.MemberlistConfig.BindAddr; addr != "127.0.0.2" { t.Fatalf("expect 127.0.0.2, got: %s", addr) } - if addr := a.serverHttpAddr; addr != "127.0.0.2:4646" { + if addr := a.serverRpcAddr; addr != "127.0.0.2:4646" { t.Fatalf("expect 127.0.0.3:4646, got: %s", addr) } @@ -195,7 +195,7 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.SerfConfig.MemberlistConfig.BindAddr; addr != "127.0.0.3" { t.Fatalf("expect 127.0.0.3, got: %s", addr) } - if addr := a.serverHttpAddr; addr != "127.0.0.3:4646" { + if addr := a.serverRpcAddr; addr != "127.0.0.3:4646" { t.Fatalf("expect 127.0.0.3:4646, got: %s", addr) } From bc86e897ed606f44f2b9dcc5172797849560cf5c Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 01:35:10 -0700 Subject: [PATCH 052/166] Register two services each for clients and servers, http and rpc. In order to give clients a fighting chance to talk to the right port, differentiate RPC services from HTTP services by registering two services with different tags. This yields `rpc.nomad-server.service.consul` and `http.nomad-server.service.consul` which is immensely more useful to clients attempting to bootstrap their world. --- client/client.go | 2 +- client/consul/sync.go | 6 +++ command/agent/agent.go | 117 +++++++++++++++++++++++++++++++---------- 3 files changed, 95 insertions(+), 30 deletions(-) diff --git a/client/client.go b/client/client.go index 313670cd760..5e688d5dbef 100644 --- a/client/client.go +++ b/client/client.go @@ -1252,7 +1252,7 @@ func (c *Client) setupConsulSyncer() error { c.configLock.RUnlock() nomadServerServiceName := c.config.ConsulConfig.ServerServiceName - services, _, err := c.consulSyncer.ConsulClient().Catalog().Service(nomadServerServiceName, "", &consulapi.QueryOptions{AllowStale: true}) + services, _, err := c.consulSyncer.ConsulClient().Catalog().Service(nomadServerServiceName, consul.ServiceTagRpc, &consulapi.QueryOptions{AllowStale: true}) if err != nil { c.logger.Printf("[WARN] client: unable to query service %q: %v", nomadServerServiceName, err) return diff --git a/client/consul/sync.go b/client/consul/sync.go index d53c4a869d8..ec2a9725f87 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -68,6 +68,12 @@ const ( // ttlCheckBuffer is the time interval that Nomad can take to report Consul // the check result ttlCheckBuffer = 31 * time.Second + + // ServiceTagHttp is the tag assigned to HTTP services + ServiceTagHttp = "http" + + // ServiceTagRpc is the tag assigned to RPC services + ServiceTagRpc = "rpc" ) // NewSyncer returns a new consul.Syncer diff --git a/command/agent/agent.go b/command/agent/agent.go index 9ac4036392c..b07e862e3d5 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -38,11 +38,13 @@ type Agent struct { // consulSyncer registers the Nomad agent with the Consul Agent consulSyncer *consul.Syncer - serverRpcAddr string + client *client.Client clientHttpAddr string + clientRpcAddr string - server *nomad.Server - client *client.Client + server *nomad.Server + serverHttpAddr string + serverRpcAddr string shutdown bool shutdownCh types.ShutdownChannel @@ -166,6 +168,23 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { conf.SerfConfig.MemberlistConfig.BindPort = port } + // Resolve the Server's HTTP Address + if a.config.AdvertiseAddrs.HTTP != "" { + a.serverHttpAddr = a.config.AdvertiseAddrs.HTTP + } else if a.config.Addresses.HTTP != "" { + a.serverHttpAddr = 
fmt.Sprintf("%v:%v", a.config.Addresses.HTTP, a.config.Ports.HTTP) + } else if a.config.BindAddr != "" { + a.serverHttpAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.HTTP) + } else { + a.serverHttpAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) + } + addr, err := net.ResolveTCPAddr("tcp", a.serverHttpAddr) + if err != nil { + return nil, fmt.Errorf("error resolving HTTP addr %q: %v:", a.serverHttpAddr, err) + } + a.serverHttpAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + + // Resolve the Server's RPC Address if a.config.AdvertiseAddrs.RPC != "" { a.serverRpcAddr = a.config.AdvertiseAddrs.RPC } else if a.config.Addresses.RPC != "" { @@ -175,7 +194,7 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { } else { a.serverRpcAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.RPC) } - addr, err := net.ResolveTCPAddr("tcp", a.serverRpcAddr) + addr, err = net.ResolveTCPAddr("tcp", a.serverRpcAddr) if err != nil { return nil, fmt.Errorf("error resolving RPC addr %q: %v:", a.serverRpcAddr, err) } @@ -251,23 +270,40 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { conf.Node.Meta = a.config.Client.Meta conf.Node.NodeClass = a.config.Client.NodeClass - // Setting the proper HTTP Addr - httpAddr := fmt.Sprintf("%s:%d", a.config.BindAddr, a.config.Ports.HTTP) - if a.config.Addresses.HTTP != "" && a.config.AdvertiseAddrs.HTTP == "" { - httpAddr = fmt.Sprintf("%s:%d", a.config.Addresses.HTTP, a.config.Ports.HTTP) - if _, err := net.ResolveTCPAddr("tcp", httpAddr); err != nil { - return nil, fmt.Errorf("error resolving http addr: %v:", err) - } - } else if a.config.AdvertiseAddrs.HTTP != "" { - addr, err := net.ResolveTCPAddr("tcp", a.config.AdvertiseAddrs.HTTP) - if err != nil { - return nil, fmt.Errorf("error resolving advertise http addr: %v", err) - } - httpAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + // Resolve the Client's HTTP address + if a.config.AdvertiseAddrs.HTTP != "" { + a.clientHttpAddr = a.config.AdvertiseAddrs.HTTP + } else if a.config.Addresses.HTTP != "" { + a.clientHttpAddr = fmt.Sprintf("%v:%v", a.config.Addresses.HTTP, a.config.Ports.HTTP) + } else if a.config.BindAddr != "" { + a.clientHttpAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.HTTP) + } else { + a.clientHttpAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) } + addr, err := net.ResolveTCPAddr("tcp", a.clientHttpAddr) + if err != nil { + return nil, fmt.Errorf("error resolving HTTP addr %q: %v:", a.clientHttpAddr, err) + } + httpAddr := fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) conf.Node.HTTPAddr = httpAddr a.clientHttpAddr = httpAddr + // Resolve the Client's RPC address + if a.config.AdvertiseAddrs.RPC != "" { + a.clientRpcAddr = a.config.AdvertiseAddrs.RPC + } else if a.config.Addresses.RPC != "" { + a.clientRpcAddr = fmt.Sprintf("%v:%v", a.config.Addresses.RPC, a.config.Ports.RPC) + } else if a.config.BindAddr != "" { + a.clientRpcAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.RPC) + } else { + a.clientRpcAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.RPC) + } + addr, err = net.ResolveTCPAddr("tcp", a.clientRpcAddr) + if err != nil { + return nil, fmt.Errorf("error resolving RPC addr %q: %v:", a.clientRpcAddr, err) + } + a.clientRpcAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + // Reserve resources on the node. 
r := conf.Node.Reserved if r == nil { @@ -524,21 +560,44 @@ func (a *Agent) setupConsulSyncer(shutdownCh types.ShutdownChannel) (err error) // when running in either Client or Server mode. func (a *Agent) syncAgentServicesWithConsul() error { var services []*structs.Service - if a.client != nil && a.config.Consul.ClientServiceName != "" { - clientService := &structs.Service{ - Name: a.config.Consul.ClientServiceName, - PortLabel: a.clientHttpAddr, + if a.client != nil { + if a.config.Consul.ClientServiceName != "" { + clientRpcService := &structs.Service{ + Name: a.config.Consul.ClientServiceName, + PortLabel: a.clientRpcAddr, + Tags: []string{consul.ServiceTagRpc}, + } + services = append(services, clientRpcService) + + clientHttpService := &structs.Service{ + Name: a.config.Consul.ClientServiceName, + PortLabel: a.clientHttpAddr, + Tags: []string{consul.ServiceTagHttp}, + } + services = append(services, clientHttpService) + + a.consulSyncer.SetServiceIdentifier("agent-client") } - services = append(services, clientService) - a.consulSyncer.SetServiceIdentifier("agent-client") } - if a.server != nil && a.config.Consul.ServerServiceName != "" { - serverService := &structs.Service{ - Name: a.config.Consul.ServerServiceName, - PortLabel: a.serverRpcAddr, + + if a.server != nil { + if a.config.Consul.ServerServiceName != "" { + serverRpcService := &structs.Service{ + Name: a.config.Consul.ServerServiceName, + Tags: []string{consul.ServiceTagRpc}, + PortLabel: a.serverRpcAddr, + } + services = append(services, serverRpcService) + + serverHttpService := &structs.Service{ + Name: a.config.Consul.ServerServiceName, + Tags: []string{consul.ServiceTagHttp}, + PortLabel: a.serverHttpAddr, + } + services = append(services, serverHttpService) + + a.consulSyncer.SetServiceIdentifier("agent-server") } - services = append(services, serverService) - a.consulSyncer.SetServiceIdentifier("agent-server") } a.consulSyncer.SetAddrFinder(func(portLabel string) (string, int) { From 4d47eedd5885dde621690a2604b38342580210c4 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 02:19:01 -0700 Subject: [PATCH 053/166] Teach Client to reuse an Agent's consulSyncer. "There can be only one." 
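The shape of the change that follows is plain constructor injection: the agent builds a single Syncer and hands that one instance to every component that talks to Consul. A stripped-down sketch of the wiring, with hypothetical type names standing in for the real agent plumbing:

package main

import "fmt"

// Syncer stands in for the shared Consul syncer owned by the agent.
type Syncer struct{}

// Client and Server each hold a reference to the agent's Syncer instead
// of constructing their own, so only one sync loop talks to Consul.
type Client struct{ syncer *Syncer }
type Server struct{ syncer *Syncer }

func NewClient(s *Syncer) *Client { return &Client{syncer: s} }
func NewServer(s *Syncer) *Server { return &Server{syncer: s} }

func main() {
	shared := &Syncer{}
	c, s := NewClient(shared), NewServer(shared)
	fmt.Println(c.syncer == s.syncer) // true: "there can be only one"
}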
--- client/client.go | 22 +++++++--------------- client/client_test.go | 15 +++++++++++++-- client/consul/sync.go | 2 -- command/agent/agent.go | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/client/client.go b/client/client.go index 5e688d5dbef..574bd2a8bb1 100644 --- a/client/client.go +++ b/client/client.go @@ -161,7 +161,7 @@ type Client struct { } // NewClient is used to create a new client from the given configuration -func NewClient(cfg *config.Config) (*Client, error) { +func NewClient(cfg *config.Config, consulSyncer *consul.Syncer) (*Client, error) { // Create a logger logger := log.New(cfg.LogOutput, "", log.LstdFlags) @@ -173,6 +173,7 @@ func NewClient(cfg *config.Config) (*Client, error) { // Create the client c := &Client{ config: cfg, + consulSyncer: consulSyncer, start: time.Now(), connPool: nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, nil), logger: logger, @@ -245,9 +246,6 @@ func NewClient(cfg *config.Config) (*Client, error) { // Start maintenance task for servers go c.rpcProxy.Run() - // Start the Consul sync - go c.runClientConsulSyncer() - return c, nil } @@ -1232,17 +1230,10 @@ func (c *Client) addAlloc(alloc *structs.Allocation) error { // setupConsulSyncer creates a consul.Syncer func (c *Client) setupConsulSyncer() error { - cs, err := consul.NewSyncer(c.config.ConsulConfig, c.logger) - if err != nil { - return err - } - - c.consulSyncer = cs - - // Callback handler used to periodically poll Consul in the event - // there are no Nomad Servers available and the Nomad Agent is in a - // bootstrap situation. - fn := func() { + // Callback handler used to periodically poll Consul to look up the + // Nomad Servers in Consul in the event the heartbeat deadline has + // been exceeded and this Agent is in a bootstrap situation. 
+ bootstrapFn := func() { now := time.Now() c.configLock.RLock() if now.Before(c.backupServerDeadline) { @@ -1268,6 +1259,7 @@ func (c *Client) setupConsulSyncer() error { } c.rpcProxy.SetBackupServers(serverAddrs) } + c.consulSyncer.AddPeriodicHandler("Nomad Client Fallback Server Handler", bootstrapFn) const handlerName = "Nomad Client Fallback Server Handler" c.consulSyncer.AddPeriodicHandler(handlerName, fn) diff --git a/client/client_test.go b/client/client_test.go index e8126a61193..57de4a60665 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -12,6 +12,7 @@ import ( "time" "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -76,7 +77,12 @@ func testClient(t *testing.T, cb func(c *config.Config)) *Client { cb(conf) } - client, err := NewClient(conf) + consulSyncer, err := consul.NewSyncer(conf, log.New(os.Stderr, "", log.LstdFlags)) + if err != nil { + t.Fatalf("err: %v", err) + } + + client, err := NewClient(conf, consulSyncer) if err != nil { t.Fatalf("err: %v", err) } @@ -463,7 +469,12 @@ func TestClient_SaveRestoreState(t *testing.T) { } // Create a new client - c2, err := NewClient(c1.config) + consulSyncer, err := consul.NewSyncer(c1.config, log.New(os.Stderr, "", log.LstdFlags)) + if err != nil { + t.Fatalf("err: %v", err) + } + + c2, err := NewClient(c1.config, consulSyncer) if err != nil { t.Fatalf("err: %v", err) } diff --git a/client/consul/sync.go b/client/consul/sync.go index ec2a9725f87..db3d64695cc 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -527,13 +527,11 @@ func GenerateServiceIdentifier(allocID string, taskName string) string { func (c *Syncer) AddPeriodicHandler(name string, fn types.PeriodicCallback) bool { c.periodicLock.Lock() defer c.periodicLock.Unlock() - c.logger.Printf("[DEBUG] consul.sync: adding handler named %s", name) if _, found := c.periodicCallbacks[name]; found { c.logger.Printf("[ERROR] consul.sync: failed adding handler %q", name) return false } c.periodicCallbacks[name] = fn - c.logger.Printf("[DEBUG] consul.sync: successfully added handler %q", name) return true } diff --git a/command/agent/agent.go b/command/agent/agent.go index b07e862e3d5..60361bfd046 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -369,7 +369,7 @@ func (a *Agent) setupClient() error { } // Create the client - client, err := client.NewClient(conf) + client, err := client.NewClient(conf, a.consulSyncer) if err != nil { return fmt.Errorf("client setup failed: %v", err) } From 9a93496475c613cc8a95911a9da10695376b8a33 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 02:21:38 -0700 Subject: [PATCH 054/166] Consolidate all consul sync periodic go routines to handlers. Only one pump and periodic loop now. 
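The consolidation that follows replaces per-concern goroutines with a registry of named callbacks driven by one timer. A minimal sketch of such a pump, with illustrative names; the real Syncer also layers on jitter, error handling, and synchronization of more state:

package main

import (
	"fmt"
	"sync"
	"time"
)

// pump runs every registered handler from a single periodic loop, so
// subsystems register a callback instead of owning their own timer.
type pump struct {
	mu       sync.Mutex
	handlers map[string]func()
}

func (p *pump) AddPeriodicHandler(name string, fn func()) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.handlers[name] = fn
}

func (p *pump) Run(interval time.Duration, shutdownCh <-chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			p.mu.Lock()
			for name, fn := range p.handlers {
				fmt.Printf("running periodic handler %q\n", name)
				fn()
			}
			p.mu.Unlock()
		case <-shutdownCh:
			return
		}
	}
}

func main() {
	p := &pump{handlers: make(map[string]func())}
	p.AddPeriodicHandler("services-sync", func() { /* prune services of stopped tasks */ })
	p.AddPeriodicHandler("fallback-servers", func() { /* query Consul for Nomad servers */ })
	shutdownCh := make(chan struct{})
	go p.Run(500*time.Millisecond, shutdownCh)
	time.Sleep(2 * time.Second)
	close(shutdownCh)
}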
--- client/client.go | 109 +++++++++++++++++------------------------------ 1 file changed, 39 insertions(+), 70 deletions(-) diff --git a/client/client.go b/client/client.go index 574bd2a8bb1..fcf3408b8f4 100644 --- a/client/client.go +++ b/client/client.go @@ -9,7 +9,6 @@ import ( "path/filepath" "strconv" "sync" - "sync/atomic" "time" "github.com/armon/go-metrics" @@ -1230,9 +1229,11 @@ func (c *Client) addAlloc(alloc *structs.Allocation) error { // setupConsulSyncer creates a consul.Syncer func (c *Client) setupConsulSyncer() error { - // Callback handler used to periodically poll Consul to look up the - // Nomad Servers in Consul in the event the heartbeat deadline has - // been exceeded and this Agent is in a bootstrap situation. + // The bootstrapFn callback handler is used to periodically poll + // Consul to look up the Nomad Servers in Consul. In the event the + // heartbeat deadline has been exceeded and this Agent is orphaned + // from its cluster, periodically poll Consul to reattach this Agent + // to its cluster and automatically recover from a detached state. bootstrapFn := func() { now := time.Now() c.configLock.RLock() @@ -1261,80 +1262,48 @@ func (c *Client) setupConsulSyncer() error { } c.consulSyncer.AddPeriodicHandler("Nomad Client Fallback Server Handler", bootstrapFn) - const handlerName = "Nomad Client Fallback Server Handler" - c.consulSyncer.AddPeriodicHandler(handlerName, fn) - return nil -} - -// runClientConsulSyncer runs the consul.Syncer task in the Nomad Agent's -// context. This is primarily responsible for removing tasks which are no -// longer in running state. -func (c *Client) runClientConsulSyncer() { - d := consulSyncDelay + lib.RandomStagger(consulSyncInterval-consulSyncDelay) - c.logger.Printf("[DEBUG] consul.sync: sleeping %v before first sync", d) - sync := time.NewTimer(d) - for { - select { - case <-sync.C: - fn := func() { - defer atomic.StoreInt64(&c.consulLock, 0) - - d = consulSyncInterval - lib.RandomStagger(consulSyncInterval/consulSyncJitter) - sync.Reset(d) - - // Run syncer handlers regardless of this - // Agent's client or server status. - c.consulSyncer.RunHandlers() - - // Give up pruning services if we can't - // fingerprint our Consul Agent. - c.configLock.RLock() - _, ok := c.configCopy.Node.Attributes["consul.version"] - c.configLock.RUnlock() - if !ok { - return - } + consulServicesSyncFn := func() { + // Give up pruning services if we can't fingerprint our + // Consul Agent. 
+ c.configLock.RLock() + _, ok := c.configCopy.Node.Attributes["consul.version"] + c.configLock.RUnlock() + if !ok { + return + } - services := make(map[string]struct{}) - // Get the existing allocs - c.allocLock.RLock() - allocs := make([]*AllocRunner, 0, len(c.allocs)) - for _, ar := range c.allocs { - allocs = append(allocs, ar) - } - c.allocLock.RUnlock() - - for _, ar := range allocs { - ar.taskStatusLock.RLock() - taskStates := copyTaskStates(ar.taskStates) - ar.taskStatusLock.RUnlock() - for taskName, taskState := range taskStates { - if taskState.State == structs.TaskStateRunning { - if tr, ok := ar.tasks[taskName]; ok { - for _, service := range tr.task.Services { - svcIdentifier := fmt.Sprintf("%s-%s", ar.alloc.ID, tr.task.Name) - services[service.ID(svcIdentifier)] = struct{}{} - } - } + services := make(map[string]struct{}) + // Get the existing allocs + c.allocLock.RLock() + allocs := make([]*AllocRunner, 0, len(c.allocs)) + for _, ar := range c.allocs { + allocs = append(allocs, ar) + } + c.allocLock.RUnlock() + + for _, ar := range allocs { + ar.taskStatusLock.RLock() + taskStates := copyTaskStates(ar.taskStates) + ar.taskStatusLock.RUnlock() + for taskName, taskState := range taskStates { + if taskState.State == structs.TaskStateRunning { + if tr, ok := ar.tasks[taskName]; ok { + for _, service := range tr.task.Services { + svcIdentifier := fmt.Sprintf("%s-%s", ar.alloc.ID, tr.task.Name) + services[service.ID(svcIdentifier)] = struct{}{} } } } - - if err := c.consulSyncer.KeepServices(services); err != nil { - c.logger.Printf("[DEBUG] client: error removing services from non-running tasks: %v", err) - } - } - - if atomic.CompareAndSwapInt64(&c.consulLock, 0, 1) { - go fn() } - case <-c.shutdownCh: - sync.Stop() - c.logger.Printf("[INFO] client: shutting down consul sync") - return } + if err := c.consulSyncer.KeepServices(services); err != nil { + c.logger.Printf("[DEBUG] client: error removing services from non-running tasks: %v", err) + } } + c.consulSyncer.AddPeriodicHandler("Nomad Client Services Sync Handler", consulServicesSyncFn) + + return nil } // collectHostStats collects host resource usage stats periodically From bccf131f9a74d9b631446cbf5b43efa33b69b5f7 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 02:50:05 -0700 Subject: [PATCH 055/166] Invert check definition so the error is first --- nomad/server.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nomad/server.go b/nomad/server.go index 424d462aef4..18986bd6797 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -703,12 +703,11 @@ func (s *Server) RPC(method string, args interface{}, reply interface{}) error { // RaftPeers returns the current list of Raft peers func (s *Server) RaftPeers() ([]string, error) { - if peers, err := s.raftPeers.Peers(); err == nil { - return peers, nil - } else { + if peers, err := s.raftPeers.Peers(); err != nil { s.logger.Printf("[DEBUG] server: error getting raft peers: %v", err) return nil, err } + return peers, nil } // Stats is used to return statistics for debugging and insight From 2abd37c741c98fe46f7ae0e5c800c22310fea656 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 03:38:07 -0700 Subject: [PATCH 056/166] Silence unused variable warning --- api/nodes_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/nodes_test.go b/api/nodes_test.go index 0a57321763a..53355c3eb2b 100644 --- a/api/nodes_test.go +++ b/api/nodes_test.go @@ -50,7 +50,7 @@ func TestNodes_PrefixList(t 
*testing.T) { var err error // Get the node ID - var nodeID, dc string + var nodeID string testutil.WaitForResult(func() (bool, error) { out, _, err := nodes.List(nil) if err != nil { @@ -60,7 +60,7 @@ func TestNodes_PrefixList(t *testing.T) { return false, fmt.Errorf("expected 1 node, got: %d", n) } nodeID = out[0].ID - dc = out[0].Datacenter + _ = out[0].Datacenter return true, nil }, func(err error) { t.Fatalf("err: %s", err) From 7034c50d2fa5cca3942a3181327ccf50135c9acf Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 03:45:09 -0700 Subject: [PATCH 057/166] Pass the datacenter name in the heartbeat Servers that are part of a different datacenter are added as backup servers instead of primary servers. --- client/client.go | 5 +++++ client/rpcproxy/rpcproxy.go | 20 ++++++++++++++------ nomad/node_endpoint.go | 7 ++++--- nomad/structs/structs.go | 7 +++++-- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/client/client.go b/client/client.go index fcf3408b8f4..82a18d42253 100644 --- a/client/client.go +++ b/client/client.go @@ -291,6 +291,11 @@ func (c *Client) Leave() error { return nil } +// Datacenter returns the datacenter for the given client +func (c *Client) Datacenter() string { + return c.config.Node.Datacenter +} + // Region returns the region for the given client func (c *Client) Region() string { return c.config.Region diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index d3af5ada3ca..1811e211e2f 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -67,6 +67,7 @@ const ( // NomadConfigInfo is an interface wrapper around this Nomad Agent's // configuration to prevents a cyclic import dependency. type NomadConfigInfo interface { + Datacenter() string RPCVersion() int Region() string } @@ -666,21 +667,28 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse // TODO(sean@): Move the logging throttle logic into a // dedicated logging package so RpcProxy does not have to // perform this accounting. - if int32(p.configInfo.RPCVersion()) < s.RPCVersion { + if int32(p.configInfo.RPCVersion()) < s.RpcVersion { now := time.Now() - t, ok := p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr] + t, ok := p.rpcAPIMismatchThrottle[s.RpcAdvertiseAddr] if ok && t.After(now) { continue } - p.logger.Printf("[WARN] API mismatch between client (v%d) and server (v%d), ignoring server %q", apiMajorVersion, s.RPCVersion, s.RPCAdvertiseAddr) - p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr] = now.Add(rpcAPIMismatchLogRate) + p.logger.Printf("[WARN] API mismatch between client (v%d) and server (v%d), ignoring server %q", apiMajorVersion, s.RpcVersion, s.RpcAdvertiseAddr) + p.rpcAPIMismatchThrottle[s.RpcAdvertiseAddr] = now.Add(rpcAPIMismatchLogRate) continue } - server, err := newServer(s.RPCAdvertiseAddr) + server, err := newServer(s.RpcAdvertiseAddr) if err != nil { - p.logger.Printf("[WARN] Unable to create a server from %q: %v", s.RPCAdvertiseAddr, err) + p.logger.Printf("[WARN] Unable to create a server from %q: %v", s.RpcAdvertiseAddr, err) + continue + } + + // Nomad servers in different datacenters are automatically + // added to the backup server list. 
+ if s.Datacenter != p.configInfo.Datacenter() { + p.backupServers.L = append(p.backupServers.L, server) continue } diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 6a7cd04c346..e78fc87032e 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -213,11 +213,12 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct // Reply with config information required for future RPC requests reply.Servers = make([]*structs.NodeServerInfo, 0, len(n.srv.localPeers)) - for p := range n.srv.localPeers { + for k, v := range n.srv.localPeers { reply.Servers = append(reply.Servers, &structs.NodeServerInfo{ - RPCAdvertiseAddr: p, - RPCVersion: apiMajorVersion, + RpcAdvertiseAddr: k, + RpcVersion: int32(v.Version), + Datacenter: v.Datacenter, }) } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index a076a80379e..b1285a634f2 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -156,10 +156,13 @@ type NodeDeregisterRequest struct { type NodeServerInfo struct { // RPCAdvertiseAddr is the IP endpoint that a Nomad Server wishes to // be contacted at for RPCs. - RPCAdvertiseAddr string + RpcAdvertiseAddr string // RPCVersion is the version number the Nomad Server supports - RPCVersion int32 + RpcVersion int32 + + // Datacenter is the datacenter that a Nomad server belongs to + Datacenter string } // NodeUpdateStatusRequest is used for Node.UpdateStatus endpoint From 57bb37423643fddec255117e507c58044598b6c6 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 03:46:52 -0700 Subject: [PATCH 058/166] Fix unit tests --- client/client_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/client/client_test.go b/client/client_test.go index 57de4a60665..8653c5c3cb1 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -77,7 +77,7 @@ func testClient(t *testing.T, cb func(c *config.Config)) *Client { cb(conf) } - consulSyncer, err := consul.NewSyncer(conf, log.New(os.Stderr, "", log.LstdFlags)) + consulSyncer, err := consul.NewSyncer(conf.ConsulConfig, log.New(os.Stderr, "", log.LstdFlags)) if err != nil { t.Fatalf("err: %v", err) } @@ -469,7 +469,7 @@ func TestClient_SaveRestoreState(t *testing.T) { } // Create a new client - consulSyncer, err := consul.NewSyncer(c1.config, log.New(os.Stderr, "", log.LstdFlags)) + consulSyncer, err := consul.NewSyncer(c1.config.ConsulConfig, log.New(os.Stderr, "", log.LstdFlags)) if err != nil { t.Fatalf("err: %v", err) } From 3d22c22bf5d28f72c06863be8bc884adbd8bb508 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 03:47:49 -0700 Subject: [PATCH 059/166] Remove types.ShutdownChannel and replace with `chan struct{}` --- client/consul/sync.go | 4 ++-- command/agent/agent.go | 5 ++--- nomad/types/types.go | 1 - 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index db3d64695cc..dd8d4b34b87 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -39,7 +39,7 @@ type Syncer struct { logger *log.Logger - shutdownCh types.ShutdownChannel + shutdownCh chan struct{} shutdown bool shutdownLock sync.Mutex @@ -135,7 +135,7 @@ func NewSyncer(config *config.ConsulConfig, logger *log.Logger) (*Syncer, error) trackedServices: make(map[string]*consul.AgentService), trackedChecks: make(map[string]*consul.AgentCheckRegistration), checkRunners: make(map[string]*CheckRunner), - shutdownCh: make(types.ShutdownChannel), + shutdownCh: make(chan struct{}), periodicCallbacks: 
make(map[string]types.PeriodicCallback), } return &consulSyncer, nil diff --git a/command/agent/agent.go b/command/agent/agent.go index 60361bfd046..f92ec4abdf3 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -18,7 +18,6 @@ import ( "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" - "github.com/hashicorp/nomad/nomad/types" ) // Agent is a long running daemon that is used to run both @@ -47,7 +46,7 @@ type Agent struct { serverRpcAddr string shutdown bool - shutdownCh types.ShutdownChannel + shutdownCh chan struct{} shutdownLock sync.Mutex } @@ -58,7 +57,7 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { logOutput = os.Stderr } - shutdownCh := make(types.ShutdownChannel) + shutdownCh := make(chan struct{}) a := &Agent{ config: config, logger: log.New(logOutput, "", log.LstdFlags), diff --git a/nomad/types/types.go b/nomad/types/types.go index bb4ca552568..37196ac3946 100644 --- a/nomad/types/types.go +++ b/nomad/types/types.go @@ -1,4 +1,3 @@ package types type PeriodicCallback func() -type ShutdownChannel chan struct{} From 49266fa29ff8f31b64a8b4be9b4790cf76114bf2 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 03:48:19 -0700 Subject: [PATCH 060/166] Invert error handling logic --- nomad/server.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nomad/server.go b/nomad/server.go index 18986bd6797..c79de16678d 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -703,7 +703,8 @@ func (s *Server) RPC(method string, args interface{}, reply interface{}) error { // RaftPeers returns the current list of Raft peers func (s *Server) RaftPeers() ([]string, error) { - if peers, err := s.raftPeers.Peers(); err != nil { + peers, err := s.raftPeers.Peers() + if err != nil { s.logger.Printf("[DEBUG] server: error getting raft peers: %v", err) return nil, err } From 9bd28824e039c6a510de6cd77de8b6a3e4be51c0 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 03:48:32 -0700 Subject: [PATCH 061/166] Nuke trace-level logging in heartbeats --- nomad/heartbeat.go | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go index e4e52ed3042..9b2867ecaa3 100644 --- a/nomad/heartbeat.go +++ b/nomad/heartbeat.go @@ -101,18 +101,6 @@ func (s *Server) invalidateHeartbeat(id string) { if err := s.endpoints.Node.UpdateStatus(&req, &resp); err != nil { s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err) } - - if resp.LeaderRPCAddr == "" { - s.logger.Printf("[TRACE] nomad.heartbeat: no leader address returned during heartbeat") - } else { - s.logger.Printf("[TRACE] nomad.heartbeat: current leader address according to server %q is %v", s.rpcAdvertise.String(), resp.LeaderRPCAddr) - } - - if len(resp.Servers) == 0 { - s.logger.Printf("[TRACE] nomad.heartbeat: no servers returned during heartbeat") - } else { - s.logger.Printf("[TRACE] nomad.heartbeat: current servers according to server is %v", resp.Servers) - } } // clearHeartbeatTimer is used to clear the heartbeat time for From ca0f2d982d2cf140de100b6f614203e795538b58 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 03:49:29 -0700 Subject: [PATCH 062/166] Nuke a.consulConfig in favor of a.consul.Config --- command/agent/agent.go | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 
f92ec4abdf3..26c4da392ce 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -17,7 +17,6 @@ import ( "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" - "github.com/hashicorp/nomad/nomad/structs/config" ) // Agent is a long running daemon that is used to run both @@ -30,10 +29,6 @@ type Agent struct { logger *log.Logger logOutput io.Writer - // consulConfig is a limited subset of the information necessary to - // establish a connection with this Nomad Agent's Consul Agent. - consulConfig *config.ConsulConfig - // consulSyncer registers the Nomad agent with the Consul Agent consulSyncer *consul.Syncer @@ -318,7 +313,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { conf.Version = fmt.Sprintf("%s%s", a.config.Version, a.config.VersionPrerelease) conf.Revision = a.config.Revision - conf.ConsulConfig = a.consulConfig + conf.ConsulConfig = a.config.Consul conf.StatsDataPoints = a.config.Client.StatsConfig.DataPoints conf.StatsCollectionInterval = a.config.Client.StatsConfig.collectionInterval @@ -535,22 +530,8 @@ func (a *Agent) Stats() map[string]map[string]string { // setupConsulSyncer creates the Consul task used by this Nomad Agent when // running in either Client and Server mode. -func (a *Agent) setupConsulSyncer(shutdownCh types.ShutdownChannel) (err error) { - cfg := &config.ConsulConfig{ - Addr: a.config.Consul.Addr, - Token: a.config.Consul.Token, - Auth: a.config.Consul.Auth, - EnableSSL: a.config.Consul.EnableSSL, - VerifySSL: a.config.Consul.VerifySSL, - CAFile: a.config.Consul.CAFile, - CertFile: a.config.Consul.CertFile, - KeyFile: a.config.Consul.KeyFile, - ServerServiceName: a.config.Consul.ServerServiceName, - ClientServiceName: a.config.Consul.ClientServiceName, - } - a.consulConfig = cfg - - a.consulSyncer, err = consul.NewSyncer(cfg, a.logger) +func (a *Agent) setupConsulSyncer(shutdownCh chan struct{}) (err error) { + a.consulSyncer, err = consul.NewSyncer(a.config.Consul, a.logger) return nil } From 9998573298a2edd0e52ff2b59397cdabcddbfe32 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 03:49:51 -0700 Subject: [PATCH 063/166] Clear the backup server list when a Nomad heartbeat arrives with servers If Nomad is heartbeating during a transition from using backup servers to Nomad servers, make Nomad the canonical source of servers and flush the list of servers populated from Consul. --- client/rpcproxy/rpcproxy.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 1811e211e2f..5dbbf53d219 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -642,6 +642,12 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse p.serverListLock.Lock() defer p.serverListLock.Unlock() + // Clear the backup server list when a heartbeat contains at least + // one server. + if len(resp.Servers) > 0 && len(p.backupServers.L) > 0 { + p.backupServers.L = make([]*ServerEndpoint, len(resp.Servers)) + } + // 1) Create a map to reconcile the difference between // m.primaryServers and resp.Servers. type targetServer struct { From bf4f0310b4558417ec4377793dc82feb82d8d73c Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 03:51:04 -0700 Subject: [PATCH 064/166] Remove unused function.
--- client/consul/sync.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index dd8d4b34b87..3f00e3245cc 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -480,12 +480,6 @@ func (c *Syncer) filterConsulChecks(chks map[string]*consul.AgentCheck) map[stri return nomadChecks } -// consulPresent indicates whether the consul agent is responding -func (c *Syncer) consulPresent() bool { - _, err := c.client.Agent().Self() - return err == nil -} - // runCheck runs a check and updates the corresponding ttl check in consul func (c *Syncer) runCheck(check Check) { res := check.Run() From 8a393d7517989a683fef9c7af73fa2c8bb5edff4 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 03:51:22 -0700 Subject: [PATCH 065/166] Improve language re: fingerprinting --- client/client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/client.go b/client/client.go index 82a18d42253..1c5cdb46379 100644 --- a/client/client.go +++ b/client/client.go @@ -678,7 +678,7 @@ func (c *Client) fingerprint() error { // fingerprintPeriodic runs a fingerprinter at the specified duration. func (c *Client) fingerprintPeriodic(name string, f fingerprint.Fingerprint, d time.Duration) { - c.logger.Printf("[DEBUG] client: periodically fingerprinting %v at duration %v", name, d) + c.logger.Printf("[DEBUG] client: fingerprinting %v every %v", name, d) for { select { case <-time.After(d): From 90f7eb42fb613a811f96c78e6f27c7dcbe170022 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 11:44:18 -0700 Subject: [PATCH 066/166] When clearing the backup servers, set the length to zero. --- client/rpcproxy/rpcproxy.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 5dbbf53d219..34886e4e460 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -645,7 +645,7 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse // Clear the backup server list when a heartbeat contains at least // one server. 
if len(resp.Servers) > 0 && len(p.backupServers.L) > 0 { - p.backupServers.L = make([]*ServerEndpoint, len(resp.Servers)) + p.backupServers.L = make([]*ServerEndpoint, 0, len(resp.Servers)) } // 1) Create a map to reconcile the difference between From 6cdc0556de56d10ee3f9c2800089d2c4d10bf8ea Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 11:47:21 -0700 Subject: [PATCH 067/166] Change the constants used to match the struct definitions --- client/rpcproxy/rpcproxy.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 34886e4e460..167c446f91d 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -393,14 +393,14 @@ func (p *RpcProxy) RebalanceServers() { // Create a new merged serverList type targetServer struct { server *ServerEndpoint - // 'n' == Nomad Server - // 'c' == Consul Server + // 'p' == Primary Server + // 's' == Secondary/Backup Server // 'b' == Both state byte } mergedList := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(p.backupServers.L)) for _, s := range p.primaryServers.L { - mergedList[*s.Key()] = &targetServer{server: s, state: 'n'} + mergedList[*s.Key()] = &targetServer{server: s, state: 'p'} } for _, s := range p.backupServers.L { k := s.Key() @@ -408,7 +408,7 @@ func (p *RpcProxy) RebalanceServers() { if found { mergedList[*k].state = 'b' } else { - mergedList[*k] = &targetServer{server: s, state: 'c'} + mergedList[*k] = &targetServer{server: s, state: 's'} } } @@ -417,7 +417,7 @@ func (p *RpcProxy) RebalanceServers() { l.L = append(l.L, s) } for _, v := range mergedList { - if v.state != 'c' { + if v.state != 's' { continue } l.L = append(l.L, v.server) From d6769f59f7bf56ee895f5c5a4e7c2eb4bb5e644d Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 11:47:52 -0700 Subject: [PATCH 068/166] Create a weak decoder to parse time.Duration. Hat tip to Alex for pointing this out (vs patching mapstructure) --- command/agent/config_parse.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 8b4d1504618..873752c0161 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -624,7 +624,15 @@ func parseConsulConfig(result **config.ConsulConfig, list *ast.ObjectList) error } var consulConfig config.ConsulConfig - if err := mapstructure.WeakDecode(m, &consulConfig); err != nil { + dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ + DecodeHook: mapstructure.StringToTimeDurationHookFunc(), + WeaklyTypedInput: true, + Result: &consulConfig, + }) + if err != nil { + return err + } + if err := dec.Decode(m); err != nil { return err } From 6b8a9b435c18d38891680ad5110c3f55f233de93 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 15:57:04 -0700 Subject: [PATCH 069/166] Update the `nomad_server_service` default from `nomad-server` to just `nomad`. 
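With the default service name now just `nomad` and RPC endpoints carrying the `rpc` tag, the fallback lookup shown earlier boils down to a tag-filtered catalog query. A hedged sketch of that query against the standard Consul API client, assuming the service was registered with a resolvable address and port:

package main

import (
	"fmt"
	"log"

	consulapi "github.com/hashicorp/consul/api"
)

func main() {
	client, err := consulapi.NewClient(consulapi.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}
	// Ask the catalog only for instances of "nomad" carrying the "rpc"
	// tag, mirroring the tag-filtered lookup in the client's fallback
	// handler.
	services, _, err := client.Catalog().Service("nomad", "rpc", &consulapi.QueryOptions{AllowStale: true})
	if err != nil {
		log.Fatal(err)
	}
	for _, s := range services {
		addr := s.ServiceAddress
		if addr == "" {
			addr = s.Address // fall back to the node address
		}
		fmt.Printf("candidate Nomad server RPC endpoint: %s:%d\n", addr, s.ServicePort)
	}
}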
--- client/driver/utils.go | 2 +- client/rpcproxy/rpcproxy.go | 2 +- command/agent/config-test-fixtures/basic.hcl | 2 +- command/agent/config.go | 2 +- command/agent/config_parse_test.go | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/client/driver/utils.go b/client/driver/utils.go index 5559e65e542..62b062a1591 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -82,7 +82,7 @@ func consulContext(clientConfig *config.Config, containerID string) *executor.Co CAFile: clientConfig.Read("consul.tls_ca_file"), CertFile: clientConfig.Read("consul.tls_cert_file"), KeyFile: clientConfig.Read("consul.tls_key_file"), - ServerServiceName: clientConfig.ReadDefault("consul.server_service_name", "nomad-server"), + ServerServiceName: clientConfig.ReadDefault("consul.server_service_name", "nomad"), ClientServiceName: clientConfig.ReadDefault("consul.client_service_name", "nomad-client"), } return &executor.ConsulContext{ diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 167c446f91d..9008e44f23b 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -452,7 +452,7 @@ func (p *RpcProxy) RebalanceServers() { // If no healthy servers were found, sleep and wait for the admin to // join this node to a server and begin receiving heartbeats with an // updated list of Nomad servers. Or Consul will begin advertising a - // new server in the nomad-servers service. + // new server in the nomad service (Nomad server service). if !foundHealthyServer { p.logger.Printf("[DEBUG] RPC Proxy: No healthy servers during rebalance, aborting") return diff --git a/command/agent/config-test-fixtures/basic.hcl b/command/agent/config-test-fixtures/basic.hcl index 38c22ec7814..8778487155a 100644 --- a/command/agent/config-test-fixtures/basic.hcl +++ b/command/agent/config-test-fixtures/basic.hcl @@ -86,7 +86,7 @@ http_api_response_headers { Access-Control-Allow-Origin = "*" } consul { - server_service_name = "nomad-server" + server_service_name = "nomad" client_service_name = "nomad-client" addr = "127.0.0.1:9500" token = "token1" diff --git a/command/agent/config.go b/command/agent/config.go index 8804d0b7ab2..16f4256214d 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -391,7 +391,7 @@ func DefaultConfig() *Config { AdvertiseAddrs: &AdvertiseAddrs{}, Atlas: &AtlasConfig{}, Consul: &config.ConsulConfig{ - ServerServiceName: "nomad-server", + ServerServiceName: "nomad", ClientServiceName: "nomad-client", AutoRegister: true, Timeout: 500 * time.Millisecond, diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 5f894ad103c..2e74e50eed9 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -103,7 +103,7 @@ func TestConfig_Parse(t *testing.T) { Endpoint: "127.0.0.1:1234", }, Consul: &config.ConsulConfig{ - ServerServiceName: "nomad-server", + ServerServiceName: "nomad", ClientServiceName: "nomad-client", Addr: "127.0.0.1:9500", Token: "token1", From b2357598ba023c7a519a564a1eae9e3993f67943 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 15:58:28 -0700 Subject: [PATCH 070/166] Register the serf service with the Nomad server service. This will be unused in this PR. 
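Although the serf-tagged registration stays unused in this series, the tag already buys a discoverable name: Consul can answer for `serf.nomad.service.consul`, so a later bootstrap path could resolve that record set and feed the addresses to a gossip join. A speculative sketch, assuming the host resolves DNS through Consul:

package main

import (
	"fmt"
	"log"
	"net"
)

func main() {
	// With empty service and proto, LookupSRV queries the name as-is,
	// which is how a tag-qualified Consul service name is resolved.
	_, records, err := net.LookupSRV("", "", "serf.nomad.service.consul")
	if err != nil {
		log.Fatal(err)
	}
	for _, srv := range records {
		fmt.Printf("candidate serf peer: %s:%d\n", srv.Target, srv.Port)
	}
}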
--- client/consul/sync.go | 3 +++ command/agent/agent.go | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index 3f00e3245cc..9fde90acaef 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -74,6 +74,9 @@ const ( // ServiceTagRpc is the tag assigned to RPC services ServiceTagRpc = "rpc" + + // ServiceTagSerf is the tag assigned to Serf services + ServiceTagSerf = "serf" ) // NewSyncer returns a new consul.Syncer diff --git a/command/agent/agent.go b/command/agent/agent.go index 26c4da392ce..55319e41873 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -39,6 +39,7 @@ type Agent struct { server *nomad.Server serverHttpAddr string serverRpcAddr string + serverSerfAddr string shutdown bool shutdownCh chan struct{} @@ -562,6 +563,13 @@ func (a *Agent) syncAgentServicesWithConsul() error { if a.server != nil { if a.config.Consul.ServerServiceName != "" { + serverHttpService := &structs.Service{ + Name: a.config.Consul.ServerServiceName, + Tags: []string{consul.ServiceTagHttp}, + PortLabel: a.serverHttpAddr, + } + services = append(services, serverHttpService) + serverRpcService := &structs.Service{ Name: a.config.Consul.ServerServiceName, Tags: []string{consul.ServiceTagRpc}, @@ -569,12 +577,12 @@ func (a *Agent) syncAgentServicesWithConsul() error { } services = append(services, serverRpcService) - serverHttpService := &structs.Service{ + serverSerfService := &structs.Service{ Name: a.config.Consul.ServerServiceName, - Tags: []string{consul.ServiceTagHttp}, - PortLabel: a.serverHttpAddr, + Tags: []string{consul.ServiceTagSerf}, + PortLabel: a.serverSerfAddr, } - services = append(services, serverHttpService) + services = append(services, serverSerfService) a.consulSyncer.SetServiceIdentifier("agent-server") } From ab99e89bd994faae2d5ea2f12847a8ac04e4f79b Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 18:14:34 -0700 Subject: [PATCH 071/166] Reconcile, clean up, and centralize API version numbers (major and minor). Reduce future confusion by introducing a minor version that is gossiped out via the `mvn` Serf tag (Minor Version Number, `vsn` is already being used to communicate `Major Version Number`). Background: hashicorp/consul/issues/1346#issuecomment-151663152
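To make the tag scheme concrete: the major version keeps riding the existing `vsn` tag while the new minor version travels as `mvn`, so a peer can read both off a gossip member before deciding whether its API is compatible. A rough sketch of that parsing, not the exact code from nomad/util.go:

package main

import (
	"fmt"
	"strconv"

	"github.com/hashicorp/serf/serf"
)

// versionsFromTags reads the advertised API versions off a Serf member:
// "vsn" carries the major version, "mvn" the newly introduced minor one.
// Defaulting and validation are simplified for illustration.
func versionsFromTags(m serf.Member) (major, minor int, err error) {
	if major, err = strconv.Atoi(m.Tags["vsn"]); err != nil {
		return 0, 0, fmt.Errorf("bad vsn tag: %v", err)
	}
	if raw, ok := m.Tags["mvn"]; ok {
		if minor, err = strconv.Atoi(raw); err != nil {
			return 0, 0, fmt.Errorf("bad mvn tag: %v", err)
		}
	}
	return major, minor, nil
}

func main() {
	m := serf.Member{Tags: map[string]string{"vsn": "1", "mvn": "1"}}
	major, minor, _ := versionsFromTags(m)
	fmt.Printf("peer advertises API v%d.%d\n", major, minor)
}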
Background: hashicorp/consul/issues/1346#issuecomment-151663152 --- client/client.go | 16 ++++--- client/fingerprint/consul_test.go | 4 +- client/rpcproxy/rpcproxy.go | 19 ++++---- client/rpcproxy/rpcproxy_internal_test.go | 16 +++++-- nomad/node_endpoint.go | 3 +- nomad/pool.go | 4 +- nomad/rpc.go | 4 +- nomad/server.go | 17 ++----- nomad/status_endpoint.go | 4 +- nomad/status_endpoint_test.go | 4 +- nomad/structs/structs.go | 30 ++++++++---- nomad/util.go | 47 ++++++++++++------- nomad/util_test.go | 2 +- .../source/docs/http/agent-members.html.md | 4 +- website/source/docs/http/agent-self.html.md | 4 +- 15 files changed, 98 insertions(+), 80 deletions(-) diff --git a/client/client.go b/client/client.go index 1c5cdb46379..b059e542c4a 100644 --- a/client/client.go +++ b/client/client.go @@ -47,9 +47,6 @@ const ( // devModeRetryIntv is the retry interval used for development devModeRetryIntv = time.Second - // rpcVersion specifies the RPC version - rpcVersion = 1 - // stateSnapshotIntv is how often the client snapshots state stateSnapshotIntv = 60 * time.Second @@ -301,9 +298,14 @@ func (c *Client) Region() string { return c.config.Region } -// Region returns the rpcVersion in use by the client -func (c *Client) RPCVersion() int { - return rpcVersion +// Region returns the structs.ApiMajorVersion in use by the client +func (c *Client) RpcMajorVersion() int { + return structs.ApiMajorVersion +} + +// Region returns the structs.ApiMinorVersion in use by the client +func (c *Client) RpcMinorVersion() int { + return structs.ApiMinorVersion } // Shutdown is used to tear down the client @@ -344,7 +346,7 @@ func (c *Client) RPC(method string, args interface{}, reply interface{}) error { } // Make the RPC request - if err := c.connPool.RPC(c.Region(), server.Addr, c.RPCVersion(), method, args, reply); err != nil { + if err := c.connPool.RPC(c.Region(), server.Addr, c.RpcMajorVersion(), method, args, reply); err != nil { c.rpcProxy.NotifyFailedServer(server) c.logger.Printf("[ERR] client: RPC failed to server %s: %v", server.Addr, err) return err diff --git a/client/fingerprint/consul_test.go b/client/fingerprint/consul_test.go index 2557d0f271e..29278a1d5c2 100644 --- a/client/fingerprint/consul_test.go +++ b/client/fingerprint/consul_test.go @@ -151,9 +151,7 @@ const mockConsulResponse = ` "expect": "3", "port": "8300", "role": "consul", - "vsn": "2", - "vsn_max": "2", - "vsn_min": "1" + "vsn": "2" }, "Status": 1, "ProtocolMin": 1, diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 9008e44f23b..02752ba4d02 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -21,12 +21,6 @@ import ( ) const ( - // apiMajorVersion is synchronized with `nomad/server.go` and - // represents the API version supported by this client. - // - // TODO(sean@): This symbol should be exported somewhere. - apiMajorVersion = 1 - // clientRPCJitterFraction determines the amount of jitter added to // clientRPCMinReuseDuration before a connection is expired and a new // connection is established in order to rebalance load across Nomad @@ -68,14 +62,15 @@ const ( // configuration to prevents a cyclic import dependency. 
type NomadConfigInfo interface { Datacenter() string - RPCVersion() int + RpcMajorVersion() int + RpcMinorVersion() int Region() string } // Pinger is an interface wrapping client.ConnPool to prevent a // cyclic import dependency type Pinger interface { - PingNomadServer(region string, version int, s *ServerEndpoint) (bool, error) + PingNomadServer(region string, apiMajorVersion int, s *ServerEndpoint) (bool, error) } // serverList is an array of Nomad Servers. The first server in the list is @@ -439,7 +434,7 @@ func (p *RpcProxy) RebalanceServers() { // detect the failed node. selectedServer := l.L[0] - ok, err := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RPCVersion(), selectedServer) + ok, err := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RpcMajorVersion(), selectedServer) if ok { foundHealthyServer = true break @@ -673,14 +668,16 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse // TODO(sean@): Move the logging throttle logic into a // dedicated logging package so RpcProxy does not have to // perform this accounting. - if int32(p.configInfo.RPCVersion()) < s.RpcVersion { + if int32(p.configInfo.RpcMajorVersion()) < s.RpcMajorVersion || + (int32(p.configInfo.RpcMajorVersion()) == s.RpcMajorVersion && + int32(p.configInfo.RpcMinorVersion()) < s.RpcMinorVersion) { now := time.Now() t, ok := p.rpcAPIMismatchThrottle[s.RpcAdvertiseAddr] if ok && t.After(now) { continue } - p.logger.Printf("[WARN] API mismatch between client (v%d) and server (v%d), ignoring server %q", apiMajorVersion, s.RpcVersion, s.RpcAdvertiseAddr) + p.logger.Printf("[WARN] API mismatch between client version (v%d.%d) and server version (v%d.%d), ignoring server %q", p.configInfo.RpcMajorVersion(), p.configInfo.RpcMinorVersion(), s.RpcMajorVersion, s.RpcMinorVersion, s.RpcAdvertiseAddr) p.rpcAPIMismatchThrottle[s.RpcAdvertiseAddr] = now.Add(rpcAPIMismatchLogRate) continue } diff --git a/client/rpcproxy/rpcproxy_internal_test.go b/client/rpcproxy/rpcproxy_internal_test.go index 494501cfc02..d5ec99ab341 100644 --- a/client/rpcproxy/rpcproxy_internal_test.go +++ b/client/rpcproxy/rpcproxy_internal_test.go @@ -8,6 +8,8 @@ import ( "os" "testing" "time" + + "github.com/hashicorp/nomad/nomad/structs" ) var ( @@ -50,8 +52,16 @@ func (s *fauxSerf) Region() string { return "global" } -func (s *fauxSerf) RPCVersion() int { - return 1 +func (s *fauxSerf) Datacenter() string { + return "dc1" +} + +func (s *fauxSerf) RpcMajorVersion() int { + return structs.ApiMajorVersion +} + +func (s *fauxSerf) RpcMinorVersion() int { + return structs.ApiMinorVersion } func testManager() (p *RpcProxy) { @@ -180,7 +190,7 @@ func test_reconcileServerList(maxServers int) (bool, error) { // failPct of the servers for the reconcile. This // allows for the selected server to no longer be // healthy for the reconcile below. 
- if ok, _ := m.connPoolPinger.PingNomadServer(m.configInfo.Region(), m.configInfo.RPCVersion(), node); ok { + if ok, _ := m.connPoolPinger.PingNomadServer(m.configInfo.Region(), m.configInfo.RpcMajorVersion(), node); ok { // Will still be present healthyServers = append(healthyServers, node) } else { diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index e78fc87032e..2fb51d74112 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -217,7 +217,8 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct reply.Servers = append(reply.Servers, &structs.NodeServerInfo{ RpcAdvertiseAddr: k, - RpcVersion: int32(v.Version), + RpcMajorVersion: int32(v.MajorVersion), + RpcMinorVersion: int32(v.MinorVersion), Datacenter: v.Datacenter, }) } diff --git a/nomad/pool.go b/nomad/pool.go index 2881b6e6dda..669e788989e 100644 --- a/nomad/pool.go +++ b/nomad/pool.go @@ -376,9 +376,9 @@ func (p *ConnPool) RPC(region string, addr net.Addr, version int, method string, // PingNomadServer sends a Status.Ping message to the specified server and // returns true if healthy, false if an error occurred -func (p *ConnPool) PingNomadServer(region string, version int, s *rpcproxy.ServerEndpoint) (bool, error) { +func (p *ConnPool) PingNomadServer(region string, apiMajorVersion int, s *rpcproxy.ServerEndpoint) (bool, error) { // Get a usable client - conn, sc, err := p.getClient(region, s.Addr, version) + conn, sc, err := p.getClient(region, s.Addr, apiMajorVersion) if err != nil { return false, err } diff --git a/nomad/rpc.go b/nomad/rpc.go index 26b94489b12..d65f65868e5 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -216,7 +216,7 @@ func (s *Server) forwardLeader(method string, args interface{}, reply interface{ if server == nil { return structs.ErrNoLeader } - return s.connPool.RPC(s.config.Region, server.Addr, server.Version, method, args, reply) + return s.connPool.RPC(s.config.Region, server.Addr, server.MajorVersion, method, args, reply) } // forwardRegion is used to forward an RPC call to a remote region, or fail if no servers @@ -238,7 +238,7 @@ func (s *Server) forwardRegion(region, method string, args interface{}, reply in // Forward to remote Nomad metrics.IncrCounter([]string{"nomad", "rpc", "cross-region", region}, 1) - return s.connPool.RPC(region, server.Addr, server.Version, method, args, reply) + return s.connPool.RPC(region, server.Addr, server.MajorVersion, method, args, reply) } // raftApplyFuture is used to encode a message, run it through raft, and return the Raft future. diff --git a/nomad/server.go b/nomad/server.go index c79de16678d..c560e731b5d 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -18,6 +18,7 @@ import ( "github.com/hashicorp/consul/tlsutil" "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" "github.com/hashicorp/raft-boltdb" "github.com/hashicorp/serf/serf" @@ -41,17 +42,6 @@ const ( // raftRemoveGracePeriod is how long we wait to allow a RemovePeer // to replicate to gracefully leave the cluster. raftRemoveGracePeriod = 5 * time.Second - - // apiMajorVersion is returned as part of the Status.Version request. - // It should be incremented anytime the APIs are changed in a way that - // would break clients for sane client versioning. - apiMajorVersion = 1 - - // apiMinorVersion is returned as part of the Status.Version request. 
- // It should be incremented anytime the APIs are changed to allow - // for sane client versioning. Minor changes should be compatible - // within the major version. - apiMinorVersion = 1 ) // Server is Nomad server which manages the job queues, @@ -534,9 +524,8 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( conf.Tags["role"] = "nomad" conf.Tags["region"] = s.config.Region conf.Tags["dc"] = s.config.Datacenter - conf.Tags["vsn"] = fmt.Sprintf("%d", s.config.ProtocolVersion) - conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin) - conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax) + conf.Tags["vsn"] = fmt.Sprintf("%d", structs.ApiMajorVersion) + conf.Tags["mvn"] = fmt.Sprintf("%d", structs.ApiMinorVersion) conf.Tags["build"] = s.config.Build conf.Tags["port"] = fmt.Sprintf("%d", s.rpcAdvertise.(*net.TCPAddr).Port) if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) { diff --git a/nomad/status_endpoint.go b/nomad/status_endpoint.go index 335bd6ed0e7..aaddb220137 100644 --- a/nomad/status_endpoint.go +++ b/nomad/status_endpoint.go @@ -18,8 +18,8 @@ func (s *Status) Version(args *structs.GenericRequest, reply *structs.VersionRes reply.Build = conf.Build reply.Versions = map[string]int{ structs.ProtocolVersion: int(conf.ProtocolVersion), - structs.APIMajorVersion: apiMajorVersion, - structs.APIMinorVersion: apiMinorVersion, + structs.APIMajorVersion: structs.ApiMajorVersion, + structs.APIMinorVersion: structs.ApiMinorVersion, } return nil } diff --git a/nomad/status_endpoint_test.go b/nomad/status_endpoint_test.go index ebbab2ead18..9d4407d2650 100644 --- a/nomad/status_endpoint_test.go +++ b/nomad/status_endpoint_test.go @@ -30,10 +30,10 @@ func TestStatusVersion(t *testing.T) { if out.Versions[structs.ProtocolVersion] != ProtocolVersionMax { t.Fatalf("bad: %#v", out) } - if out.Versions[structs.APIMajorVersion] != apiMajorVersion { + if out.Versions[structs.APIMajorVersion] != structs.ApiMajorVersion { t.Fatalf("bad: %#v", out) } - if out.Versions[structs.APIMinorVersion] != apiMinorVersion { + if out.Versions[structs.APIMinorVersion] != structs.ApiMinorVersion { t.Fatalf("bad: %#v", out) } } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index b1285a634f2..08185de42d0 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -54,6 +54,21 @@ const ( // that new commands can be added in a way that won't cause // old servers to crash when the FSM attempts to process them. IgnoreUnknownTypeFlag MessageType = 128 + + // ApiMajorVersion is returned as part of the Status.Version request. + // It should be incremented anytime the APIs are changed in a way + // that would break clients for sane client versioning. + ApiMajorVersion = 1 + + // ApiMinorVersion is returned as part of the Status.Version request. + // It should be incremented anytime the APIs are changed to allow + // for sane client versioning. Minor changes should be compatible + // within the major version. + ApiMinorVersion = 1 + + ProtocolVersion = "protocol" + APIMajorVersion = "api.major" + APIMinorVersion = "api.minor" ) // RPCInfo is used to describe common information about query @@ -158,8 +173,13 @@ type NodeServerInfo struct { // be contacted at for RPCs. 
RpcAdvertiseAddr string - // RPCVersion is the version number the Nomad Server supports - RpcVersion int32 + // RpcMajorVersion is the major version number the Nomad Server + // supports + RpcMajorVersion int32 + + // RpcMinorVersion is the minor version number the Nomad Server + // supports + RpcMinorVersion int32 // Datacenter is the datacenter that a Nomad server belongs to Datacenter string @@ -330,12 +350,6 @@ type GenericResponse struct { WriteMeta } -const ( - ProtocolVersion = "protocol" - APIMajorVersion = "api.major" - APIMinorVersion = "api.minor" -) - // VersionResponse is used for the Status.Version reseponse type VersionResponse struct { Build string diff --git a/nomad/util.go b/nomad/util.go index 7a74c954270..d2f50bb4861 100644 --- a/nomad/util.go +++ b/nomad/util.go @@ -34,14 +34,15 @@ func RuntimeStats() map[string]string { // serverParts is used to return the parts of a server role type serverParts struct { - Name string - Region string - Datacenter string - Port int - Bootstrap bool - Expect int - Version int - Addr net.Addr + Name string + Region string + Datacenter string + Port int + Bootstrap bool + Expect int + MajorVersion int + MinorVersion int + Addr net.Addr } func (s *serverParts) String() string { @@ -76,22 +77,32 @@ func isNomadServer(m serf.Member) (bool, *serverParts) { return false, nil } - vsn_str := m.Tags["vsn"] - vsn, err := strconv.Atoi(vsn_str) + // The "vsn" tag was Version, which is now the MajorVersion number. + majorVersionStr := m.Tags["vsn"] + majorVersion, err := strconv.Atoi(majorVersionStr) if err != nil { return false, nil } + // To keep some semblance of convention, "mvn" is now the "Minor + // Version Number." + minorVersionStr := m.Tags["mvn"] + minorVersion, err := strconv.Atoi(minorVersionStr) + if err != nil { + minorVersion = 0 + } + addr := &net.TCPAddr{IP: m.Addr, Port: port} parts := &serverParts{ - Name: m.Name, - Region: region, - Datacenter: datacenter, - Port: port, - Bootstrap: bootstrap, - Expect: expect, - Addr: addr, - Version: vsn, + Name: m.Name, + Region: region, + Datacenter: datacenter, + Port: port, + Bootstrap: bootstrap, + Expect: expect, + Addr: addr, + MajorVersion: majorVersion, + MinorVersion: minorVersion, } return true, parts } diff --git a/nomad/util_test.go b/nomad/util_test.go index e415bb4c9a0..60a6e6918ad 100644 --- a/nomad/util_test.go +++ b/nomad/util_test.go @@ -44,7 +44,7 @@ func TestIsNomadServer(t *testing.T) { if parts.Addr.String() != "127.0.0.1:10000" { t.Fatalf("bad addr: %v", parts.Addr) } - if parts.Version != 1 { + if parts.MajorVersion != 1 { t.Fatalf("bad: %v", parts) } diff --git a/website/source/docs/http/agent-members.html.md b/website/source/docs/http/agent-members.html.md index f56b5f9229c..e686b574cda 100644 --- a/website/source/docs/http/agent-members.html.md +++ b/website/source/docs/http/agent-members.html.md @@ -46,9 +46,7 @@ the gossip pool. This is only applicable to servers. "port": "4647", "region": "global", "role": "nomad", - "vsn": "1", - "vsn_max": "1", - "vsn_min": "1" + "vsn": "1" }, "Status": "alive", "ProtocolMin": 1, diff --git a/website/source/docs/http/agent-self.html.md b/website/source/docs/http/agent-self.html.md index a7a03debc24..91fb615deca 100644 --- a/website/source/docs/http/agent-self.html.md +++ b/website/source/docs/http/agent-self.html.md @@ -98,9 +98,7 @@ The `self` endpoint is used to query the state of the target agent. 
"port": "4647", "region": "global", "role": "nomad", - "vsn": "1", - "vsn_max": "1", - "vsn_min": "1" + "vsn": "1" }, "Status": "alive", "ProtocolMin": 1, From 6264a8eff6c9df42922a7bd179c9e44cb1e4a3dd Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 18:17:37 -0700 Subject: [PATCH 072/166] Unused code wasn't as unused as I thought. Restore. --- client/consul/sync.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/client/consul/sync.go b/client/consul/sync.go index 9fde90acaef..75312a36f7b 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -483,6 +483,12 @@ func (c *Syncer) filterConsulChecks(chks map[string]*consul.AgentCheck) map[stri return nomadChecks } +// consulPresent indicates whether the consul agent is responding +func (c *Syncer) consulPresent() bool { + _, err := c.client.Agent().Self() + return err == nil +} + // runCheck runs a check and updates the corresponding ttl check in consul func (c *Syncer) runCheck(check Check) { res := check.Run() From c8bf53b0a806a74aea08d9085f7f77b08a77ef40 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 27 May 2016 18:17:56 -0700 Subject: [PATCH 073/166] Fix a comment to be more correct --- client/rpcproxy/rpcproxy.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 02752ba4d02..a6a2d4dc152 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -706,7 +706,7 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse } } - // Short-circuit acquiring a lock if nothing changed + // Short-circuit acquiring listLock if nothing changed if !newServers && numOldServers == numBothServers { return nil } From b9f230b7b2d914f4553bc320b24db12f52961220 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 31 May 2016 11:58:02 -0700 Subject: [PATCH 074/166] Move client.DefaultConfig() to client/config.DefaultConfig() Resolves an import cycle in testing and is more appropriate because the default should reside next to its struct definition. --- client/client.go | 10 ---------- client/config/config.go | 11 +++++++++++ command/agent/agent.go | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/client/client.go b/client/client.go index b059e542c4a..dcfe19db5af 100644 --- a/client/client.go +++ b/client/client.go @@ -84,16 +84,6 @@ const ( consulSyncJitter = 8 ) -// DefaultConfig returns the default configuration -func DefaultConfig() *config.Config { - return &config.Config{ - LogOutput: os.Stderr, - Region: "global", - StatsDataPoints: 60, - StatsCollectionInterval: 1 * time.Second, - } -} - // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad // Client type ClientStatsReporter interface { diff --git a/client/config/config.go b/client/config/config.go index 52bb906c102..eff9a0442af 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -3,6 +3,7 @@ package config import ( "fmt" "io" + "os" "strconv" "strings" "time" @@ -131,6 +132,16 @@ func (c *Config) Copy() *Config { return nc } +// DefaultConfig returns the default configuration +func DefaultConfig() *Config { + return &Config{ + LogOutput: os.Stderr, + Region: "global", + StatsDataPoints: 60, + StatsCollectionInterval: 1 * time.Second, + } +} + // Read returns the specified configuration value or "". 
func (c *Config) Read(id string) string { return c.Options[id] diff --git a/command/agent/agent.go b/command/agent/agent.go index 55319e41873..cdfb549974e 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -220,7 +220,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { // Setup the configuration conf := a.config.ClientConfig if conf == nil { - conf = client.DefaultConfig() + conf = clientconfig.DefaultConfig() } if a.server != nil { conf.RPCHandler = a.server From 6785e7632c92b6f1e2231d5fe26b983e1a904ff3 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 31 May 2016 13:28:43 -0700 Subject: [PATCH 075/166] Provide a default ConsulConfig for client/config.DefaultConfig() Change the unit test to only test if the consul link exists, not the value of the link. The old test was hostname specific and therefore would always be different based on the environment running the tests. --- client/config/config.go | 6 ++++++ client/fingerprint/consul_test.go | 14 ++++---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/client/config/config.go b/client/config/config.go index eff9a0442af..b0f993d0e6e 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -135,6 +135,12 @@ func (c *Config) Copy() *Config { // DefaultConfig returns the default configuration func DefaultConfig() *Config { return &Config{ + ConsulConfig: &config.ConsulConfig{ + ServerServiceName: "nomad", + ClientServiceName: "nomad-client", + AutoRegister: true, + Timeout: 500 * time.Millisecond, + }, LogOutput: os.Stderr, Region: "global", StatsDataPoints: 60, diff --git a/client/fingerprint/consul_test.go b/client/fingerprint/consul_test.go index 29278a1d5c2..6232869871c 100644 --- a/client/fingerprint/consul_test.go +++ b/client/fingerprint/consul_test.go @@ -22,14 +22,9 @@ func TestConsulFingerprint(t *testing.T) { })) defer ts.Close() - consulConfig := &config.Config{ - Options: map[string]string{ - // Split off "http://" - "consul.address": ts.URL[7:], - }, - } + config := config.DefaultConfig() - ok, err := fp.Fingerprint(consulConfig, node) + ok, err := fp.Fingerprint(config, node) if err != nil { t.Fatalf("Failed to fingerprint: %s", err) } @@ -43,9 +38,8 @@ func TestConsulFingerprint(t *testing.T) { assertNodeAttributeContains(t, node, "unique.consul.name") assertNodeAttributeContains(t, node, "consul.datacenter") - expectedLink := "vagrant.consul2" - if node.Links["consul"] != expectedLink { - t.Errorf("Expected consul link: %s\nFound links: %#v", expectedLink, node.Links) + if _, ok := node.Links["consul"]; !ok { + t.Errorf("Expected a link to consul, none found") } } From ed308761861c370073015b24433daa06ba4c35b7 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 00:34:46 -0700 Subject: [PATCH 076/166] Change the endpoint for `/v1/agent/servers` and fix tests. When an agent is running a server, the list of servers includes the Raft peers. When the agent is running a client (which is always the case?), include a list of the servers found in the Client's RpcProxy. Dedupe and provide a unique list back to the caller. 
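(Illustration only, not part of this patch: a rough sketch of driving the `/v1/agent/servers` endpoint described above. The agent address and server addresses are placeholder values; the repeated `address` query parameter on PUT and the flat list returned by GET follow the handler and tests below.)

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
)

func main() {
	base := "http://127.0.0.1:4646/v1/agent/servers"

	// Point the agent's client at two servers (placeholder addresses).
	q := url.Values{}
	q.Add("address", "10.0.0.1:4647")
	q.Add("address", "10.0.0.2:4647")
	req, err := http.NewRequest("PUT", base+"?"+q.Encode(), nil)
	if err != nil {
		panic(err)
	}
	if _, err := http.DefaultClient.Do(req); err != nil {
		panic(err)
	}

	// Read back the deduplicated union of Raft peers (when the agent runs a
	// server) and RpcProxy-known servers (when it runs a client).
	resp, err := http.Get(base)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := ioutil.ReadAll(resp.Body)
	fmt.Println(string(body))
}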
--- client/client.go | 4 +-- client/rpcproxy/rpcproxy.go | 29 ++++++++-------- command/agent/agent_endpoint.go | 50 ++++++++++++++++++++++------ command/agent/agent_endpoint_test.go | 47 +++++++++++++++++++++----- 4 files changed, 95 insertions(+), 35 deletions(-) diff --git a/client/client.go b/client/client.go index dcfe19db5af..c843f17abd7 100644 --- a/client/client.go +++ b/client/client.go @@ -446,8 +446,8 @@ func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) { // AddPrimaryServerToRpcProxy adds serverAddr to the RPC Proxy's primary // server list. -func (c *Client) AddPrimaryServerToRpcProxy(serverAddr string) { - c.rpcProxy.AddPrimaryServer(serverAddr) +func (c *Client) AddPrimaryServerToRpcProxy(serverAddr string) *rpcproxy.ServerEndpoint { + return c.rpcProxy.AddPrimaryServer(serverAddr) } // restoreState is used to restore our state from the data dir diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index a6a2d4dc152..1f8a7f39d38 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -12,6 +12,7 @@ import ( "fmt" "log" "math/rand" + "strings" "sync" "sync/atomic" "time" @@ -99,7 +100,9 @@ type RpcProxy struct { // by serverListLock. backupServers serverList - // serverListLock covers both backupServers and primaryServers + // serverListLock covers both backupServers and primaryServers. If + // it is necessary to hold serverListLock and listLock, obtain an + // exclusive lock on serverListLock before listLock. serverListLock sync.RWMutex leaderAddr string @@ -204,7 +207,7 @@ func (p *RpcProxy) SetBackupServers(addrs []string) error { func (p *RpcProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint { s, err := newServer(rpcAddr) if err != nil { - p.logger.Printf("[WARN] RPC Proxy: unable to create new primary server from endpoint %q", rpcAddr) + p.logger.Printf("[WARN] RPC Proxy: unable to create new primary server from endpoint %q: %v", rpcAddr, err) return nil } @@ -540,24 +543,22 @@ func (p *RpcProxy) reconcileServerList(l *serverList) bool { } // RemoveServer takes out an internal write lock and removes a server from -// the server list. +// the activated server list. func (p *RpcProxy) RemoveServer(s *ServerEndpoint) { + // Lock hierarchy protocol dictates serverListLock is acquired first. + p.serverListLock.Lock() + defer p.serverListLock.Unlock() + p.listLock.Lock() defer p.listLock.Unlock() l := p.getServerList() - // Remove the server if known - for i, _ := range l.L { - if l.L[i].Name == s.Name { - newServers := make([]*ServerEndpoint, 0, len(l.L)-1) - newServers = append(newServers, l.L[:i]...) - newServers = append(newServers, l.L[i+1:]...) - l.L = newServers + k := s.Key() + l.removeServerByKey(k) + p.saveServerList(l) - p.saveServerList(l) - return - } - } + p.primaryServers.removeServerByKey(k) + p.backupServers.removeServerByKey(k) } // refreshServerRebalanceTimer is only called once m.rebalanceTimer expires. diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index 316d95b3d44..93782b112ce 100644 --- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -119,6 +119,9 @@ func (s *HTTPServer) AgentForceLeaveRequest(resp http.ResponseWriter, req *http. return nil, err } +// AgentServersRequest is used to query the list of servers used by the Nomad +// Client for RPCs. This endpoint can also be used to update the list of +// servers for a given agent. 
func (s *HTTPServer) AgentServersRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) { switch req.Method { case "PUT", "POST": @@ -136,14 +139,38 @@ func (s *HTTPServer) listServers(resp http.ResponseWriter, req *http.Request) (i return nil, CodedError(501, ErrInvalidMethod) } - // Get the current list of servers according to Raft. - // - // NOTE(sean@); This could be s.agent.server.localPeers instead. - var err error - var peers []string - peers, err = s.agent.server.RaftPeers() - if err != nil { - return nil, err + // Preallocate for at least 5x servers + const initialServerListSize = 8 + peers := make([]string, 0, initialServerListSize) + uniquePeers := make(map[string]bool, initialServerListSize) + // When the agent has an active server, get the current list of + // servers according to Raft. + if s.agent.server != nil { + raftPeers, err := s.agent.server.RaftPeers() + if err != nil { + return nil, err + } + for _, peer := range raftPeers { + _, found := uniquePeers[peer] + if !found { + uniquePeers[peer] = true + peers = append(peers, peer) + } + } + } + + // When the agent has an active client, return the union of the list + // of servers according to RpcProxy, which is possibly populated by + // Consul. + if s.agent.client != nil { + clientPeers := s.agent.client.RpcProxy().ServerRPCAddrs() + for _, peer := range clientPeers { + _, found := uniquePeers[peer] + if !found { + uniquePeers[peer] = true + peers = append(peers, peer) + } + } } return peers, nil @@ -162,8 +189,11 @@ func (s *HTTPServer) updateServers(resp http.ResponseWriter, req *http.Request) } // Set the servers list into the client - for _, s := range servers { - client.AddPrimaryServerToRpcProxy(s) + for _, server := range servers { + se := client.AddPrimaryServerToRpcProxy(server) + if se == nil { + s.agent.logger.Printf("[ERR] Attempt to add server %q to client failed", server) + } } return nil, nil } diff --git a/command/agent/agent_endpoint_test.go b/command/agent/agent_endpoint_test.go index 8ab104b119f..03f7b2a7904 100644 --- a/command/agent/agent_endpoint_test.go +++ b/command/agent/agent_endpoint_test.go @@ -107,21 +107,35 @@ func TestHTTP_AgentForceLeave(t *testing.T) { func TestHTTP_AgentSetServers(t *testing.T) { httpTest(t, nil, func(s *TestServer) { + // Establish a baseline number of servers + req, err := http.NewRequest("GET", "/v1/agent/servers", nil) + if err != nil { + t.Fatalf("err: %s", err) + } + respW := httptest.NewRecorder() + + // Make the request and check the result + out, err := s.Server.AgentServersRequest(respW, req) + if err != nil { + t.Fatalf("err: %s", err) + } + numServers := len(out.([]string)) + // Create the request - req, err := http.NewRequest("PUT", "/v1/agent/servers", nil) + req, err = http.NewRequest("PUT", "/v1/agent/servers", nil) if err != nil { t.Fatalf("err: %s", err) } // Send the request - respW := httptest.NewRecorder() + respW = httptest.NewRecorder() _, err = s.Server.AgentServersRequest(respW, req) if err == nil || !strings.Contains(err.Error(), "missing server address") { t.Fatalf("expected missing servers error, got: %#v", err) } // Create a valid request - req, err = http.NewRequest("PUT", "/v1/agent/servers?address=foo&address=bar", nil) + req, err = http.NewRequest("PUT", "/v1/agent/servers?address=127.0.0.1%3A4647&address=127.0.0.2%3A4647", nil) if err != nil { t.Fatalf("err: %s", err) } @@ -141,16 +155,31 @@ func TestHTTP_AgentSetServers(t *testing.T) { respW = httptest.NewRecorder() // Make the request and check the result - out, err 
:= s.Server.AgentServersRequest(respW, req) + expected := map[string]bool{ + "127.0.0.1:4647": true, + "127.0.0.2:4647": true, + } + out, err = s.Server.AgentServersRequest(respW, req) if err != nil { t.Fatalf("err: %s", err) } servers := out.([]string) - if n := len(servers); n != 2 { - t.Fatalf("expected 2 servers, got: %d", n) - } - if servers[0] != "foo:4647" || servers[1] != "bar:4647" { - t.Fatalf("bad servers result: %v", servers) + if n := len(servers); n != numServers+2 { + t.Fatalf("expected %d servers, got: %d: %v", numServers+2, n, servers) + } + received := make(map[string]bool, len(servers)) + for _, server := range servers { + received[server] = true + } + foundCount := 0 + for k, _ := range received { + _, found := expected[k] + if found { + foundCount++ + } + } + if foundCount != len(expected) { + t.Fatalf("bad servers result") } }) } From 5b0a969a3691d876d566feb0a837d101830b9409 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 00:41:08 -0700 Subject: [PATCH 077/166] Fix the client/rpcproxy unit tests. --- client/rpcproxy/rpcproxy.go | 14 + client/rpcproxy/rpcproxy_test.go | 437 +++++++++++++++++++++---------- 2 files changed, 307 insertions(+), 144 deletions(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 1f8a7f39d38..b0feed0282f 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -263,6 +263,20 @@ func (l *serverList) shuffleServers() { } } +// String returns a string representation of serverList +func (l *serverList) String() string { + if len(l.L) == 0 { + return fmt.Sprintf("[empty server list]") + } + + serverStrs := make([]string, 0, len(l.L)) + for _, server := range l.L { + serverStrs = append(serverStrs, server.String()) + } + + return fmt.Sprintf("[%s]", strings.Join(serverStrs, ", ")) +} + // FindServer takes out an internal "read lock" and searches through the list // of servers to find a "healthy" server. 
If the server is actually // unhealthy, we rely on heartbeats to detect this and remove the node from diff --git a/client/rpcproxy/rpcproxy_test.go b/client/rpcproxy/rpcproxy_test.go index b14e90eb870..fc35970032c 100644 --- a/client/rpcproxy/rpcproxy_test.go +++ b/client/rpcproxy/rpcproxy_test.go @@ -2,20 +2,33 @@ package rpcproxy_test import ( "bytes" - "fmt" + "encoding/binary" "log" "math/rand" + "net" "os" "strings" + "sync/atomic" "testing" - "github.com/hashicorp/consul/consul/agent" - "github.com/hashicorp/consul/consul/servers" + "github.com/hashicorp/nomad/client/rpcproxy" +) + +const ( + ipv4len = 4 + nodeNameFmt = "s%03d" + defaultNomadPort = "4647" + + // Poached from RFC2544 and RFC3330 + testingNetworkCidr = "198.18.0.0/15" + testingNetworkUint32 = 3323068416 ) var ( localLogger *log.Logger localLogBuffer *bytes.Buffer + serverCount uint32 + validIp uint32 ) func init() { @@ -23,16 +36,25 @@ func init() { localLogger = log.New(localLogBuffer, "", 0) } +func makeServerEndpointName() string { + serverNum := atomic.AddUint32(&serverCount, 1) + validIp := testingNetworkUint32 + serverNum + ipv4 := make(net.IP, ipv4len) + binary.BigEndian.PutUint32(ipv4, validIp) + return net.JoinHostPort(ipv4.String(), defaultNomadPort) +} + func GetBufferedLogger() *log.Logger { return localLogger } type fauxConnPool struct { // failPct between 0.0 and 1.0 == pct of time a Ping should fail - failPct float64 + failPct float64 + datacenter string } -func (cp *fauxConnPool) PingConsulServer(server *agent.Server) (bool, error) { +func (cp *fauxConnPool) PingNomadServer(region string, majorVersion int, server *rpcproxy.ServerEndpoint) (bool, error) { var success bool successProb := rand.Float64() if successProb > cp.failPct { @@ -42,210 +64,295 @@ func (cp *fauxConnPool) PingConsulServer(server *agent.Server) (bool, error) { } type fauxSerf struct { + datacenter string + numNodes int + region string + rpcMinorVersion int + rpcMajorVersion int +} + +func (s *fauxSerf) Datacenter() string { + return s.datacenter } func (s *fauxSerf) NumNodes() int { - return 16384 + return s.numNodes +} + +func (s *fauxSerf) Region() string { + return s.region +} + +func (s *fauxSerf) RpcMajorVersion() int { + return s.rpcMajorVersion +} + +func (s *fauxSerf) RpcMinorVersion() int { + return s.rpcMinorVersion } -func testManager() (m *servers.Manager) { +func testRpcProxy() (p *rpcproxy.RpcProxy) { logger := GetBufferedLogger() logger = log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - m = servers.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) - return m + p = rpcproxy.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) + return p } -func testManagerFailProb(failPct float64) (m *servers.Manager) { +func testRpcProxyFailProb(failPct float64) (p *rpcproxy.RpcProxy) { logger := GetBufferedLogger() logger = log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - m = servers.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) - return m + p = rpcproxy.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) + return p } -// func (m *Manager) AddServer(server *agent.Server) { -func TestServers_AddServer(t *testing.T) { - m := testManager() +// func (p *RpcProxy) AddPrimaryServer(server *rpcproxy.ServerEndpoint) { +func TestServers_AddPrimaryServer(t *testing.T) { + p := testRpcProxy() var num int - num = m.NumServers() + num = p.NumServers() if num != 0 { t.Fatalf("Expected zero servers to start") } - s1 := 
&agent.Server{Name: "s1"} - m.AddServer(s1) - num = m.NumServers() + s1Endpoint := makeServerEndpointName() + s1 := p.AddPrimaryServer(s1Endpoint) + num = p.NumServers() if num != 1 { t.Fatalf("Expected one server") } + if s1 == nil { + t.Fatalf("bad") + } + if s1.Name != s1Endpoint { + t.Fatalf("bad") + } - m.AddServer(s1) - num = m.NumServers() + s1 = p.AddPrimaryServer(s1Endpoint) + num = p.NumServers() if num != 1 { t.Fatalf("Expected one server (still)") } + if s1 == nil { + t.Fatalf("bad") + } + if s1.Name != s1Endpoint { + t.Fatalf("bad") + } - s2 := &agent.Server{Name: "s2"} - m.AddServer(s2) - num = m.NumServers() + s2Endpoint := makeServerEndpointName() + s2 := p.AddPrimaryServer(s2Endpoint) + num = p.NumServers() if num != 2 { t.Fatalf("Expected two servers") } + if s2 == nil { + t.Fatalf("bad") + } + if s2.Name != s2Endpoint { + t.Fatalf("bad") + } } -// func (m *Manager) FindServer() (server *agent.Server) { +// func (p *RpcProxy) FindServer() (server *rpcproxy.ServerEndpoint) { func TestServers_FindServer(t *testing.T) { - m := testManager() + p := testRpcProxy() - if m.FindServer() != nil { + if p.FindServer() != nil { t.Fatalf("Expected nil return") } - m.AddServer(&agent.Server{Name: "s1"}) - if m.NumServers() != 1 { + s1Endpoint := makeServerEndpointName() + p.AddPrimaryServer(s1Endpoint) + if p.NumServers() != 1 { t.Fatalf("Expected one server") } - s1 := m.FindServer() + s1 := p.FindServer() if s1 == nil { t.Fatalf("Expected non-nil server") } - if s1.Name != "s1" { + if s1.Name != s1Endpoint { t.Fatalf("Expected s1 server") } - s1 = m.FindServer() - if s1 == nil || s1.Name != "s1" { + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { t.Fatalf("Expected s1 server (still)") } - m.AddServer(&agent.Server{Name: "s2"}) - if m.NumServers() != 2 { + s2Endpoint := makeServerEndpointName() + p.AddPrimaryServer(s2Endpoint) + if p.NumServers() != 2 { t.Fatalf("Expected two servers") } - s1 = m.FindServer() - if s1 == nil || s1.Name != "s1" { + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { t.Fatalf("Expected s1 server (still)") } - m.NotifyFailedServer(s1) - s2 := m.FindServer() - if s2 == nil || s2.Name != "s2" { + p.NotifyFailedServer(s1) + s2 := p.FindServer() + if s2 == nil || s2.Name != s2Endpoint { t.Fatalf("Expected s2 server") } - m.NotifyFailedServer(s2) - s1 = m.FindServer() - if s1 == nil || s1.Name != "s1" { + p.NotifyFailedServer(s2) + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { t.Fatalf("Expected s1 server") } } -// func New(logger *log.Logger, shutdownCh chan struct{}) (m *Manager) { +// func New(logger *log.Logger, shutdownCh chan struct{}) (p *RpcProxy) { func TestServers_New(t *testing.T) { logger := GetBufferedLogger() logger = log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - m := servers.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) - if m == nil { - t.Fatalf("Manager nil") + p := rpcproxy.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) + if p == nil { + t.Fatalf("RpcProxy nil") } } -// func (m *Manager) NotifyFailedServer(server *agent.Server) { +// func (p *RpcProxy) NotifyFailedServer(server *rpcproxy.ServerEndpoint) { func TestServers_NotifyFailedServer(t *testing.T) { - m := testManager() + p := testRpcProxy() - if m.NumServers() != 0 { + if p.NumServers() != 0 { t.Fatalf("Expected zero servers to start") } - s1 := &agent.Server{Name: "s1"} - s2 := &agent.Server{Name: "s2"} - - // Try notifying for a server that is not managed by Manager - m.NotifyFailedServer(s1) - if 
m.NumServers() != 0 { - t.Fatalf("Expected zero servers to start") + // Try notifying for a server that is not managed by RpcProxy + s1Endpoint := makeServerEndpointName() + s1 := p.AddPrimaryServer(s1Endpoint) + if s1 == nil { + t.Fatalf("bad") + } + if p.NumServers() != 1 { + t.Fatalf("bad") + } + p.RemoveServer(s1) + if p.NumServers() != 0 { + t.Fatalf("bad") } - m.AddServer(s1) + p.NotifyFailedServer(s1) + s1 = p.AddPrimaryServer(s1Endpoint) // Test again w/ a server not in the list - m.NotifyFailedServer(s2) - if m.NumServers() != 1 { + s2Endpoint := makeServerEndpointName() + s2 := p.AddPrimaryServer(s2Endpoint) + if s2 == nil { + t.Fatalf("bad") + } + if p.NumServers() != 2 { + t.Fatalf("bad") + } + p.RemoveServer(s2) + if p.NumServers() != 1 { + t.Fatalf("bad") + } + p.NotifyFailedServer(s2) + if p.NumServers() != 1 { t.Fatalf("Expected one server") } - m.AddServer(s2) - if m.NumServers() != 2 { + // Re-add s2 so there are two servers in the RpcProxy server list + s2 = p.AddPrimaryServer(s2Endpoint) + if p.NumServers() != 2 { t.Fatalf("Expected two servers") } - s1 = m.FindServer() - if s1 == nil || s1.Name != "s1" { + // Find the first server, it should be s1 + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { t.Fatalf("Expected s1 server") } - m.NotifyFailedServer(s2) - s1 = m.FindServer() - if s1 == nil || s1.Name != "s1" { + // Notify s2 as failed, s1 should still be first + p.NotifyFailedServer(s2) + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { t.Fatalf("Expected s1 server (still)") } - m.NotifyFailedServer(s1) - s2 = m.FindServer() - if s2 == nil || s2.Name != "s2" { + // Fail s1, s2 should be first + p.NotifyFailedServer(s1) + s2 = p.FindServer() + if s2 == nil || s2.Name != s2Endpoint { t.Fatalf("Expected s2 server") } - m.NotifyFailedServer(s2) - s1 = m.FindServer() - if s1 == nil || s1.Name != "s1" { + // Fail s2, s1 should be first + p.NotifyFailedServer(s2) + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { t.Fatalf("Expected s1 server") } } -// func (m *Manager) NumServers() (numServers int) { +// func (p *RpcProxy) NumServers() (numServers int) { func TestServers_NumServers(t *testing.T) { - m := testManager() - var num int - num = m.NumServers() - if num != 0 { - t.Fatalf("Expected zero servers to start") + p := testRpcProxy() + const maxNumServers = 100 + serverList := make([]*rpcproxy.ServerEndpoint, 0, maxNumServers) + + // Add some servers + for i := 0; i < maxNumServers; i++ { + num := p.NumServers() + if num != i { + t.Fatalf("%d: Expected %d servers", i, num) + } + serverName := makeServerEndpointName() + s := p.AddPrimaryServer(serverName) + if s == nil { + t.Fatalf("Expected server from %q", serverName) + } + serverList = append(serverList, s) + + num = p.NumServers() + if num != i+1 { + t.Fatalf("%d: Expected %d servers", i, num+1) + } } - s := &agent.Server{} - m.AddServer(s) - num = m.NumServers() - if num != 1 { - t.Fatalf("Expected one server after AddServer") + // Remove some servers + for i := maxNumServers; i > 0; i-- { + num := p.NumServers() + if num != i { + t.Fatalf("%d: Expected %d servers", i, num) + } + p.RemoveServer(serverList[i-1]) + num = p.NumServers() + if num != i-1 { + t.Fatalf("%d: Expected %d servers", i, num-1) + } } } -// func (m *Manager) RebalanceServers() { +// func (p *RpcProxy) RebalanceServers() { func TestServers_RebalanceServers(t *testing.T) { const failPct = 0.5 - m := testManagerFailProb(failPct) + p := testRpcProxyFailProb(failPct) const maxServers = 100 const numShuffleTests = 
100 const uniquePassRate = 0.5 // Make a huge list of nodes. for i := 0; i < maxServers; i++ { - nodeName := fmt.Sprintf("s%02d", i) - m.AddServer(&agent.Server{Name: nodeName}) + p.AddPrimaryServer(makeServerEndpointName()) } // Keep track of how many unique shuffles we get. uniques := make(map[string]struct{}, maxServers) for i := 0; i < numShuffleTests; i++ { - m.RebalanceServers() + p.RebalanceServers() var names []string for j := 0; j < maxServers; j++ { - server := m.FindServer() - m.NotifyFailedServer(server) + server := p.FindServer() + p.NotifyFailedServer(server) names = append(names, server.Name) } key := strings.Join(names, "|") @@ -260,48 +367,90 @@ func TestServers_RebalanceServers(t *testing.T) { } } -// func (m *Manager) RemoveServer(server *agent.Server) { -func TestManager_RemoveServer(t *testing.T) { - const nodeNameFmt = "s%02d" - m := testManager() - - if m.NumServers() != 0 { +// func (p *RpcProxy) RemoveServer(server *rpcproxy.ServerEndpoint) { +func TestRpcProxy_RemoveServer(t *testing.T) { + p := testRpcProxy() + if p.NumServers() != 0 { t.Fatalf("Expected zero servers to start") } // Test removing server before its added - nodeName := fmt.Sprintf(nodeNameFmt, 1) - s1 := &agent.Server{Name: nodeName} - m.RemoveServer(s1) - m.AddServer(s1) + s1Endpoint := makeServerEndpointName() + s1 := p.AddPrimaryServer(s1Endpoint) + if p.NumServers() != 1 { + t.Fatalf("bad") + } + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server: %q", s1.Name) + } + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server: %q", s1.Name) + } + p.RemoveServer(s1) + if p.NumServers() != 0 { + t.Fatalf("bad") + } + // Remove it a second time now that it doesn't exist + p.RemoveServer(s1) + if p.NumServers() != 0 { + t.Fatalf("bad") + } + p.AddPrimaryServer(s1Endpoint) + if p.NumServers() != 1 { + t.Fatalf("bad") + } - nodeName = fmt.Sprintf(nodeNameFmt, 2) - s2 := &agent.Server{Name: nodeName} - m.RemoveServer(s2) - m.AddServer(s2) + s2Endpoint := makeServerEndpointName() + s2 := p.AddPrimaryServer(s2Endpoint) + if p.NumServers() != 2 { + t.Fatalf("bad") + } + if s2 == nil || s2.Name != s2Endpoint { + t.Fatalf("Expected s2 server: %q", s2.Name) + } + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 to be the front of the list: %q==%q", s1.Name, s1Endpoint) + } + // Move s1 to the back of the server list + p.NotifyFailedServer(s1) + s2 = p.FindServer() + if s2 == nil || s2.Name != s2Endpoint { + t.Fatalf("Expected s2 server: %q", s2Endpoint) + } + p.RemoveServer(s2) + if p.NumServers() != 1 { + t.Fatalf("bad") + } + p.RemoveServer(s2) + if p.NumServers() != 1 { + t.Fatalf("bad") + } + p.AddPrimaryServer(s2Endpoint) const maxServers = 19 - servers := make([]*agent.Server, maxServers) + servers := make([]*rpcproxy.ServerEndpoint, 0, maxServers) + servers = append(servers, s1) + servers = append(servers, s2) // Already added two servers above for i := maxServers; i > 2; i-- { - nodeName := fmt.Sprintf(nodeNameFmt, i) - server := &agent.Server{Name: nodeName} + server := p.AddPrimaryServer(makeServerEndpointName()) servers = append(servers, server) - m.AddServer(server) } - if m.NumServers() != maxServers { - t.Fatalf("Expected %d servers, received %d", maxServers, m.NumServers()) + if p.NumServers() != maxServers { + t.Fatalf("Expected %d servers, received %d", maxServers, p.NumServers()) } - m.RebalanceServers() + p.RebalanceServers() - if m.NumServers() != maxServers { - t.Fatalf("Expected %d servers, 
received %d", maxServers, m.NumServers()) + if p.NumServers() != maxServers { + t.Fatalf("Expected %d servers, received %d", maxServers, p.NumServers()) } - findServer := func(server *agent.Server) bool { - for i := m.NumServers(); i > 0; i-- { - s := m.FindServer() + findServer := func(server *rpcproxy.ServerEndpoint) bool { + for i := p.NumServers(); i > 0; i-- { + s := p.FindServer() if s == server { return true } @@ -310,18 +459,18 @@ func TestManager_RemoveServer(t *testing.T) { } expectedNumServers := maxServers - removedServers := make([]*agent.Server, 0, maxServers) + removedServers := make([]*rpcproxy.ServerEndpoint, 0, maxServers) // Remove servers from the front of the list for i := 3; i > 0; i-- { - server := m.FindServer() + server := p.FindServer() if server == nil { t.Fatalf("FindServer returned nil") } - m.RemoveServer(server) + p.RemoveServer(server) expectedNumServers-- - if m.NumServers() != expectedNumServers { - t.Fatalf("Expected %d servers (got %d)", expectedNumServers, m.NumServers()) + if p.NumServers() != expectedNumServers { + t.Fatalf("Expected %d servers (got %d)", expectedNumServers, p.NumServers()) } if findServer(server) == true { t.Fatalf("Did not expect to find server %s after removal from the front", server.Name) @@ -331,12 +480,12 @@ func TestManager_RemoveServer(t *testing.T) { // Remove server from the end of the list for i := 3; i > 0; i-- { - server := m.FindServer() - m.NotifyFailedServer(server) - m.RemoveServer(server) + server := p.FindServer() + p.NotifyFailedServer(server) + p.RemoveServer(server) expectedNumServers-- - if m.NumServers() != expectedNumServers { - t.Fatalf("Expected %d servers (got %d)", expectedNumServers, m.NumServers()) + if p.NumServers() != expectedNumServers { + t.Fatalf("Expected %d servers (got %d)", expectedNumServers, p.NumServers()) } if findServer(server) == true { t.Fatalf("Did not expect to find server %s", server.Name) @@ -346,15 +495,15 @@ func TestManager_RemoveServer(t *testing.T) { // Remove server from the middle of the list for i := 3; i > 0; i-- { - server := m.FindServer() - m.NotifyFailedServer(server) - server2 := m.FindServer() - m.NotifyFailedServer(server2) // server2 now at end of the list + server := p.FindServer() + p.NotifyFailedServer(server) + server2 := p.FindServer() + p.NotifyFailedServer(server2) // server2 now at end of the list - m.RemoveServer(server) + p.RemoveServer(server) expectedNumServers-- - if m.NumServers() != expectedNumServers { - t.Fatalf("Expected %d servers (got %d)", expectedNumServers, m.NumServers()) + if p.NumServers() != expectedNumServers { + t.Fatalf("Expected %d servers (got %d)", expectedNumServers, p.NumServers()) } if findServer(server) == true { t.Fatalf("Did not expect to find server %s", server.Name) @@ -362,21 +511,21 @@ func TestManager_RemoveServer(t *testing.T) { removedServers = append(removedServers, server) } - if m.NumServers()+len(removedServers) != maxServers { - t.Fatalf("Expected %d+%d=%d servers", m.NumServers(), len(removedServers), maxServers) + if p.NumServers()+len(removedServers) != maxServers { + t.Fatalf("Expected %d+%d=%d servers", p.NumServers(), len(removedServers), maxServers) } // Drain the remaining servers from the middle - for i := m.NumServers(); i > 0; i-- { - server := m.FindServer() - m.NotifyFailedServer(server) - server2 := m.FindServer() - m.NotifyFailedServer(server2) // server2 now at end of the list - m.RemoveServer(server) + for i := p.NumServers(); i > 0; i-- { + server := p.FindServer() + p.NotifyFailedServer(server) + 
server2 := p.FindServer() + p.NotifyFailedServer(server2) // server2 now at end of the list + p.RemoveServer(server) removedServers = append(removedServers, server) } - if m.NumServers() != 0 { + if p.NumServers() != 0 { t.Fatalf("Expected an empty server list") } if len(removedServers) != maxServers { @@ -384,4 +533,4 @@ func TestManager_RemoveServer(t *testing.T) { } } -// func (m *Manager) Start() { +// func (p *RpcProxy) Start() { From a55395678fd34a4dfc5c2cec5a04efa2637135de Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 01:08:15 -0700 Subject: [PATCH 078/166] Fix tests for client.TestAgent_ServerConfig Add similar logic in Agent `serverConfig()` to set up the `serverSerfAddr` the same as `serverHttpAddr` and `serverRpcAddr`. --- command/agent/agent.go | 16 ++++++++++++++++ command/agent/agent_test.go | 32 +++++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index cdfb549974e..ece64d199e8 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -195,6 +195,22 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { } a.serverRpcAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + // Resolve the Server's Serf Address + if a.config.AdvertiseAddrs.Serf != "" { + a.serverSerfAddr = a.config.AdvertiseAddrs.Serf + } else if a.config.Addresses.Serf != "" { + a.serverSerfAddr = fmt.Sprintf("%v:%v", a.config.Addresses.Serf, a.config.Ports.Serf) + } else if a.config.BindAddr != "" { + a.serverSerfAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.Serf) + } else { + a.serverSerfAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.Serf) + } + addr, err = net.ResolveTCPAddr("tcp", a.serverSerfAddr) + if err != nil { + return nil, fmt.Errorf("error resolving Serf addr %q: %v:", a.serverSerfAddr, err) + } + a.serverSerfAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + if gcThreshold := a.config.Server.NodeGCThreshold; gcThreshold != "" { dur, err := time.ParseDuration(gcThreshold) if err != nil { diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index ec3de72a6c7..1d7eb3e1570 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -119,9 +119,12 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.RPCAdvertise; addr.IP.String() != "127.0.0.1" || addr.Port != 4001 { t.Fatalf("bad rpc advertise addr: %#v", addr) } - if addr := a.serverRpcAddr; addr != "10.10.11.1:4005" { + if addr := a.serverHttpAddr; addr != "10.10.11.1:4005" { t.Fatalf("expect 10.11.11.1:4005, got: %v", addr) } + if addr := a.serverRpcAddr; addr != "127.0.0.1:4001" { + t.Fatalf("expect 127.0.0.1:4001, got: %v", addr) + } // Sets up the ports properly conf.Ports.RPC = 4003 @@ -138,7 +141,7 @@ func TestAgent_ServerConfig(t *testing.T) { t.Fatalf("expect 4004, got: %d", port) } - // Prefers the most specific bind addrs + // Prefers advertise over bind addr conf.BindAddr = "127.0.0.3" conf.Addresses.RPC = "127.0.0.2" conf.Addresses.Serf = "127.0.0.2" @@ -155,8 +158,15 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.SerfConfig.MemberlistConfig.BindAddr; addr != "127.0.0.2" { t.Fatalf("expect 127.0.0.2, got: %s", addr) } - if addr := a.serverRpcAddr; addr != "127.0.0.2:4646" { - t.Fatalf("expect 127.0.0.3:4646, got: %s", addr) + if addr := a.serverHttpAddr; addr != "127.0.0.2:4646" { + t.Fatalf("expect 127.0.0.2:4646, got: %s", addr) + } + // NOTE: AdvertiseAddr > Addresses > BindAddr > Defaults + if addr := 
a.serverRpcAddr; addr != "127.0.0.1:4001" { + t.Fatalf("expect 127.0.0.1:4001, got: %s", addr) + } + if addr := a.serverSerfAddr; addr != "127.0.0.1:4000" { + t.Fatalf("expect 127.0.0.1:4000, got: %s", addr) } conf.Server.NodeGCThreshold = "42g" @@ -185,6 +195,12 @@ func TestAgent_ServerConfig(t *testing.T) { conf.Addresses.RPC = "" conf.Addresses.Serf = "" conf.Addresses.HTTP = "" + conf.AdvertiseAddrs.RPC = "" + conf.AdvertiseAddrs.HTTP = "" + conf.AdvertiseAddrs.Serf = "" + conf.Ports.HTTP = 4646 + conf.Ports.RPC = 4647 + conf.Ports.Serf = 4648 out, err = a.serverConfig() if err != nil { t.Fatalf("err: %s", err) @@ -195,9 +211,15 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.SerfConfig.MemberlistConfig.BindAddr; addr != "127.0.0.3" { t.Fatalf("expect 127.0.0.3, got: %s", addr) } - if addr := a.serverRpcAddr; addr != "127.0.0.3:4646" { + if addr := a.serverHttpAddr; addr != "127.0.0.3:4646" { t.Fatalf("expect 127.0.0.3:4646, got: %s", addr) } + if addr := a.serverRpcAddr; addr != "127.0.0.3:4647" { + t.Fatalf("expect 127.0.0.3:4647, got: %s", addr) + } + if addr := a.serverSerfAddr; addr != "127.0.0.3:4648" { + t.Fatalf("expect 127.0.0.3:4648, got: %s", addr) + } // Properly handles the bootstrap flags conf.Server.BootstrapExpect = 1 From 37e7038a27ef8e48b0e6efb9dc27fcb61af3f8f5 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 01:12:37 -0700 Subject: [PATCH 079/166] Fix config_parse_test to reflect that `consul.addr` does not exist. `consul.address` does, but not `consul.addr`. --- command/agent/config-test-fixtures/basic.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/agent/config-test-fixtures/basic.hcl b/command/agent/config-test-fixtures/basic.hcl index 8778487155a..f5f1380e0f0 100644 --- a/command/agent/config-test-fixtures/basic.hcl +++ b/command/agent/config-test-fixtures/basic.hcl @@ -88,7 +88,7 @@ http_api_response_headers { consul { server_service_name = "nomad" client_service_name = "nomad-client" - addr = "127.0.0.1:9500" + address = "127.0.0.1:9500" token = "token1" auth = "username:pass" ssl = true From f03765814e32e39d14ed19377002519451114530 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 01:15:31 -0700 Subject: [PATCH 080/166] Add some trace-level logging for /v1/agent/servers when writing This endpoint shouldn't be hit often, but this could be useful in logs down the road. --- command/agent/agent_endpoint.go | 1 + 1 file changed, 1 insertion(+) diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index 93782b112ce..99b618e737d 100644 --- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -190,6 +190,7 @@ func (s *HTTPServer) updateServers(resp http.ResponseWriter, req *http.Request) // Set the servers list into the client for _, server := range servers { + s.agent.logger.Printf("[TRACE] Adding server %s to the client's primary server list", server) se := client.AddPrimaryServerToRpcProxy(server) if se == nil { s.agent.logger.Printf("[ERR] Attempt to add server %q to client failed", server) From ac174dbb6bd2fbd565c8fe1a3007a7c68f7c68b3 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 01:22:39 -0700 Subject: [PATCH 081/166] Fix building tests that used `DefaultConfig()` but didn't pickup the package move. 
--- client/alloc_runner_test.go | 3 ++- client/client_test.go | 2 +- client/task_runner_test.go | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go index 97b3dc34844..f978447999d 100644 --- a/client/alloc_runner_test.go +++ b/client/alloc_runner_test.go @@ -10,6 +10,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" + "github.com/hashicorp/nomad/client/config" ctestutil "github.com/hashicorp/nomad/client/testutil" ) @@ -25,7 +26,7 @@ func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) { func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) { logger := testLogger() - conf := DefaultConfig() + conf := config.DefaultConfig() conf.StateDir = os.TempDir() conf.AllocDir = os.TempDir() upd := &MockAllocStateUpdater{} diff --git a/client/client_test.go b/client/client_test.go index 8653c5c3cb1..fff1d790a33 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -70,7 +70,7 @@ func testServer(t *testing.T, cb func(*nomad.Config)) (*nomad.Server, string) { } func testClient(t *testing.T, cb func(c *config.Config)) *Client { - conf := DefaultConfig() + conf := nomad.DefaultConfig() conf.DevMode = true conf.ConsulConfig = &sconfig.ConsulConfig{} if cb != nil { diff --git a/client/task_runner_test.go b/client/task_runner_test.go index afcb31d0182..00472fb70be 100644 --- a/client/task_runner_test.go +++ b/client/task_runner_test.go @@ -11,6 +11,7 @@ import ( "time" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/driver" cstructs "github.com/hashicorp/nomad/client/driver/structs" "github.com/hashicorp/nomad/nomad/mock" @@ -46,7 +47,7 @@ func testTaskRunner(restarts bool) (*MockTaskStateUpdater, *TaskRunner) { // the passed allocation. 
func testTaskRunnerFromAlloc(restarts bool, alloc *structs.Allocation) (*MockTaskStateUpdater, *TaskRunner) { logger := testLogger() - conf := DefaultConfig() + conf := config.DefaultConfig() conf.StateDir = os.TempDir() conf.AllocDir = os.TempDir() upd := &MockTaskStateUpdater{} From 69088460860138f37e89127756aa93027de1871d Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 01:35:56 -0700 Subject: [PATCH 082/166] Add a quick set of client/rpcproxy.ServerEndpoint equality tests --- .../rpcproxy/server_endpoint_internal_test.go | 38 +++++++++ client/rpcproxy/server_endpoint_test.go | 79 +++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 client/rpcproxy/server_endpoint_internal_test.go create mode 100644 client/rpcproxy/server_endpoint_test.go diff --git a/client/rpcproxy/server_endpoint_internal_test.go b/client/rpcproxy/server_endpoint_internal_test.go new file mode 100644 index 00000000000..a4e1e9f16da --- /dev/null +++ b/client/rpcproxy/server_endpoint_internal_test.go @@ -0,0 +1,38 @@ +package rpcproxy + +import ( + "testing" +) + +// func (k *EndpointKey) Equal(x *EndpointKey) { +func TestServerEndpointKey_Equal(t *testing.T) { + tests := []struct { + name string + k1 *EndpointKey + k2 *EndpointKey + equal bool + }{ + { + name: "equal", + k1: &EndpointKey{name: "k1"}, + k2: &EndpointKey{name: "k1"}, + equal: true, + }, + { + name: "not equal", + k1: &EndpointKey{name: "k1"}, + k2: &EndpointKey{name: "k2"}, + equal: false, + }, + } + + for _, test := range tests { + if test.k1.Equal(test.k2) != test.equal { + t.Errorf("fixture %s failed forward comparison", test.name) + } + + if test.k2.Equal(test.k1) != test.equal { + t.Errorf("fixture %s failed reverse comparison", test.name) + } + } +} diff --git a/client/rpcproxy/server_endpoint_test.go b/client/rpcproxy/server_endpoint_test.go new file mode 100644 index 00000000000..8b964313ef0 --- /dev/null +++ b/client/rpcproxy/server_endpoint_test.go @@ -0,0 +1,79 @@ +package rpcproxy_test + +import ( + "fmt" + "net" + "testing" + + "github.com/hashicorp/nomad/client/rpcproxy" +) + +// func (k *rpcproxy.EndpointKey) Equal(x *rpcproxy.EndpointKey) { +func TestServerEndpointKey_Equal(t *testing.T) { + tests := []struct { + name string + s1 *rpcproxy.ServerEndpoint + s2 *rpcproxy.ServerEndpoint + equal bool + }{ + { + name: "equal", + s1: &rpcproxy.ServerEndpoint{Name: "k1"}, + s2: &rpcproxy.ServerEndpoint{Name: "k1"}, + equal: true, + }, + { + name: "not equal", + s1: &rpcproxy.ServerEndpoint{Name: "k1"}, + s2: &rpcproxy.ServerEndpoint{Name: "k2"}, + equal: false, + }, + } + + for _, test := range tests { + if test.s1.Key().Equal(test.s2.Key()) != test.equal { + t.Errorf("fixture %s failed forward comparison", test.name) + } + + if test.s2.Key().Equal(test.s1.Key()) != test.equal { + t.Errorf("fixture %s failed reverse comparison", test.name) + } + } +} + +// func (k *rpcproxy.ServerEndpoint) String() { +func TestServerEndpoint_String(t *testing.T) { + tests := []struct { + name string + s *rpcproxy.ServerEndpoint + str string + }{ + { + name: "name", + s: &rpcproxy.ServerEndpoint{Name: "s"}, + str: "s (:)", + }, + { + name: "name, host, port", + s: &rpcproxy.ServerEndpoint{ + Name: "s", + Host: "127.0.0.1", + Port: "4647", + }, + str: "s (tcp:127.0.0.1:4647)", + }, + } + + for _, test := range tests { + if test.s.Addr == nil && (test.s.Host != "" && test.s.Port != "") { + fmt.Printf("Setting addr\n") + addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(test.s.Host, test.s.Port)) + if err == 
nil { + test.s.Addr = addr + } + } + if test.s.String() != test.str { + t.Errorf("fixture %q failed: %q vs %q", test.name, test.s.String(), test.str) + } + } +} From 453a7556b66b72de9adbb00f6013fdcb7b33340d Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 02:08:23 -0700 Subject: [PATCH 083/166] Pick the right `DefaultConfig` from the right package. Overly zealous search && replace at work here. --- client/client_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/client_test.go b/client/client_test.go index fff1d790a33..24721b49b8b 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -70,7 +70,7 @@ func testServer(t *testing.T, cb func(*nomad.Config)) (*nomad.Server, string) { } func testClient(t *testing.T, cb func(c *config.Config)) *Client { - conf := nomad.DefaultConfig() + conf := config.DefaultConfig() conf.DevMode = true conf.ConsulConfig = &sconfig.ConsulConfig{} if cb != nil { From f9862d40439e847ca870103fea6ce94c990a8a4f Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 02:09:05 -0700 Subject: [PATCH 084/166] Update godoc for newServer to reflect DNS and IP-based inputs Requested by: alex --- client/rpcproxy/server_endpoint.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/client/rpcproxy/server_endpoint.go b/client/rpcproxy/server_endpoint.go index f7c356c39ee..f9fef0c0275 100644 --- a/client/rpcproxy/server_endpoint.go +++ b/client/rpcproxy/server_endpoint.go @@ -40,7 +40,11 @@ func (s *ServerEndpoint) Key() *EndpointKey { } } -// newServer creates a new Server instance with a resolvable endpoint +// newServer creates a new Server instance with a resolvable endpoint. +// `name` can be either an IP address or a DNS name. If `name` is a DNS +// name, it must be resolvable to an IP address (most inputs are IP +// addresses, not DNS names, but both work equally well when the name is +// resolvable). func newServer(name string) (s *ServerEndpoint, err error) { s = &ServerEndpoint{ Name: name, From 329057e22b9a3a75a7ffe77ac9ca4064fe0cacab Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 02:12:06 -0700 Subject: [PATCH 085/166] Only actively test Consul when env `CONSUL_HTTP_ADDR` is set --- client/fingerprint/consul_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/client/fingerprint/consul_test.go b/client/fingerprint/consul_test.go index 6232869871c..c1ab3acf483 100644 --- a/client/fingerprint/consul_test.go +++ b/client/fingerprint/consul_test.go @@ -4,6 +4,7 @@ import ( "fmt" "net/http" "net/http/httptest" + "os" "testing" "github.com/hashicorp/nomad/client/config" @@ -11,6 +12,11 @@ import ( ) func TestConsulFingerprint(t *testing.T) { + addr := os.Getenv("CONSUL_HTTP_ADDR") + if addr == "" { + t.Skipf("No consul process running, skipping test") + } + fp := NewConsulFingerprint(testLogger()) node := &structs.Node{ Attributes: make(map[string]string), From 503c6a996f8edc41a622039b9c63f6d0db35e261 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 02:14:59 -0700 Subject: [PATCH 086/166] Fix test TestClientConfigCommand_UpdateServers() Now that hostnames are validated on input, switch to IPs since they bypass DNS resolution. 
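For illustration, a minimal, self-contained sketch of why IP literals sidestep the resolver while hostnames do not; normalizeServer and the fallback port below are hypothetical stand-ins, not the actual Nomad validation code:

package main

import (
	"fmt"
	"net"
)

// normalizeServer mimics the shape of hostname validation on input: a host
// that parses as an IP literal is accepted without any lookup, while a DNS
// name must resolve and therefore can fail on machines without a resolver.
func normalizeServer(addr string) (string, error) {
	host, port, err := net.SplitHostPort(addr)
	if err != nil {
		// Assume the default Nomad RPC port when none was supplied.
		host, port = addr, "4647"
	}
	if ip := net.ParseIP(host); ip != nil {
		return net.JoinHostPort(ip.String(), port), nil // IP literal: no DNS
	}
	addrs, err := net.LookupHost(host) // DNS name: must resolve
	if err != nil {
		return "", fmt.Errorf("unable to resolve %q: %v", host, err)
	}
	return net.JoinHostPort(addrs[0], port), nil
}

func main() {
	fmt.Println(normalizeServer("127.0.0.42:4647")) // ok without DNS
	fmt.Println(normalizeServer("foo"))             // fails unless "foo" resolves
}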
--- command/client_config_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/command/client_config_test.go b/command/client_config_test.go index ab3370db97e..e1e96e001a4 100644 --- a/command/client_config_test.go +++ b/command/client_config_test.go @@ -33,7 +33,7 @@ func TestClientConfigCommand_UpdateServers(t *testing.T) { ui.ErrorWriter.Reset() // Set the servers list - code = cmd.Run([]string{"-address=" + url, "-update-servers", "foo", "bar"}) + code = cmd.Run([]string{"-address=" + url, "-update-servers", "127.0.0.42", "198.18.5.5"}) if code != 0 { t.Fatalf("expected exit 0, got: %d", code) } @@ -44,11 +44,11 @@ func TestClientConfigCommand_UpdateServers(t *testing.T) { t.Fatalf("expect exit 0, got: %d", code) } out := ui.OutputWriter.String() - if !strings.Contains(out, "foo") { - t.Fatalf("missing foo") + if !strings.Contains(out, "127.0.0.42") { + t.Fatalf("missing 127.0.0.42") } - if !strings.Contains(out, "bar") { - t.Fatalf("missing bar") + if !strings.Contains(out, "198.18.5.5") { + t.Fatalf("missing 198.18.5.5") } } From 445386bb3f510c65b778765485aff11ba61952c7 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 02:31:24 -0700 Subject: [PATCH 087/166] Remove unused constants --- client/client.go | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/client/client.go b/client/client.go index c843f17abd7..640e6efbc14 100644 --- a/client/client.go +++ b/client/client.go @@ -71,17 +71,6 @@ const ( // allocSyncRetryIntv is the interval on which we retry updating // the status of the allocation allocSyncRetryIntv = 5 * time.Second - - // consulSyncInterval is the interval at which the client syncs with consul - // to remove services and checks which are no longer valid - consulSyncInterval = 15 * time.Second - - // consulSyncDelay specifies the initial sync delay when starting the - // Nomad Agent's consul.Syncer. - consulSyncDelay = 5 * time.Second - - // Add a little jitter to the agent's consul.Syncer task - consulSyncJitter = 8 ) // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad From 41904f262fd20cc8fb07d2a25b6996219399d98d Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 02:41:04 -0700 Subject: [PATCH 088/166] Clean up some docs and comments to be more accurate --- client/client.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/client/client.go b/client/client.go index 640e6efbc14..2977ff34854 100644 --- a/client/client.go +++ b/client/client.go @@ -200,7 +200,7 @@ func NewClient(cfg *config.Config, consulSyncer *consul.Syncer) (*Client, error) // Setup the Consul syncer if err := c.setupConsulSyncer(); err != nil { - return nil, fmt.Errorf("failed to create Consul syncer: %v") + return nil, fmt.Errorf("failed to create client Consul syncer: %v") } // Register and then start heartbeating to the servers. @@ -1213,12 +1213,13 @@ func (c *Client) addAlloc(alloc *structs.Allocation) error { return nil } -// setupConsulSyncer creates a consul.Syncer +// setupConsulSyncer creates Client-mode consul.Syncer callbacks that are +// executed periodically. func (c *Client) setupConsulSyncer() error { // The bootstrapFn callback handler is used to periodically poll // Consul to look up the Nomad Servers in Consul. 
In the event the - // heartbeat deadline has been exceeded and this Agent is orphaned - // from its cluster, periodically poll Consul to reattach this Agent + // heartbeat deadline has been exceeded and this Client is orphaned + // from its servers, periodically poll Consul to reattach this Client // to its cluster and automatically recover from a detached state. bootstrapFn := func() { now := time.Now() From 4300cb31d92a00d5fceb745c66ce989a83d65da3 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 03:08:11 -0700 Subject: [PATCH 089/166] Remove unused variable --- client/client.go | 1 - 1 file changed, 1 deletion(-) diff --git a/client/client.go b/client/client.go index 2977ff34854..7b3b481761c 100644 --- a/client/client.go +++ b/client/client.go @@ -123,7 +123,6 @@ type Client struct { // consulSyncer advertises this Nomad Agent with Consul consulSyncer *consul.Syncer - consulLock int64 // HostStatsCollector collects host resource usage stats hostStatsCollector *stats.HostStatsCollector From ab434b4079bb23abc9bf0a3c66c139d7d6ee5e33 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 03:08:39 -0700 Subject: [PATCH 090/166] Fix typo in comment --- command/agent/agent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index ece64d199e8..b52587e8b97 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -388,7 +388,7 @@ func (a *Agent) setupClient() error { return nil } -// reservePortsForClient reservers a range of ports for the client to use when +// reservePortsForClient reserves a range of ports for the client to use when // it creates various plugins for log collection, executors, drivers, etc func (a *Agent) reservePortsForClient(conf *clientconfig.Config) error { // finding the device name for loopback From 66dc946852f69de985c692e7ac1d0f7bf4bfc0e4 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 03:09:53 -0700 Subject: [PATCH 091/166] Don't clobber the default consul config in tests --- client/client_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/client/client_test.go b/client/client_test.go index 24721b49b8b..5ffae2b3e33 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -16,7 +16,6 @@ import ( "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" - sconfig "github.com/hashicorp/nomad/nomad/structs/config" "github.com/hashicorp/nomad/testutil" "github.com/mitchellh/hashstructure" @@ -72,7 +71,6 @@ func testServer(t *testing.T, cb func(*nomad.Config)) (*nomad.Server, string) { func testClient(t *testing.T, cb func(c *config.Config)) *Client { conf := config.DefaultConfig() conf.DevMode = true - conf.ConsulConfig = &sconfig.ConsulConfig{} if cb != nil { cb(conf) } From 7538667837d14d143b052253854e20fc8d655a4a Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 03:47:19 -0700 Subject: [PATCH 092/166] Push down the server list even on node registration and evaluation Be mindful of the cost of taking a snapshot from the statestore and reuse the snapshot if one has already been taken. 
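The shape of that reuse is roughly the following sketch, where stateStore and Snapshot are simplified stand-ins for Nomad's state store types: take a fresh snapshot only when the caller has not already paid for one.

package main

import "fmt"

// Snapshot and stateStore are simplified stand-ins for Nomad's state store
// types; only the part relevant to snapshot reuse is modeled here.
type Snapshot struct{ numNodes int }

type stateStore struct{ numNodes int }

func (s *stateStore) Snapshot() (*Snapshot, error) {
	return &Snapshot{numNodes: s.numNodes}, nil
}

// countNodes takes a (potentially expensive) snapshot only when the caller
// passes nil; otherwise the caller's existing snapshot is reused.
func countNodes(store *stateStore, snap *Snapshot) (int, error) {
	if snap == nil {
		ss, err := store.Snapshot()
		if err != nil {
			return 0, err
		}
		snap = ss
	}
	return snap.numNodes, nil
}

func main() {
	store := &stateStore{numNodes: 3}

	n, _ := countNodes(store, nil) // no snapshot on hand: take one
	fmt.Println(n)

	snap, _ := store.Snapshot() // snapshot already taken for other work: reuse it
	n, _ = countNodes(store, snap)
	fmt.Println(n)
}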
--- nomad/node_endpoint.go | 83 ++++++++++++++++++++++++++----------- nomad/node_endpoint_test.go | 28 +++++++++++-- 2 files changed, 83 insertions(+), 28 deletions(-) diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 2fb51d74112..4ba53fa5d27 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -7,6 +7,7 @@ import ( "github.com/armon/go-metrics" "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/watch" ) @@ -101,6 +102,52 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp // Set the reply index reply.Index = index + + n.srv.peerLock.RLock() + defer n.srv.peerLock.RUnlock() + if err := n.updateNodeUpdateResponse(nil, reply); err != nil { + n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + return err + } + + return nil +} + +// updateNodeUpdateResponse assumes the n.srv.peerLock is held for reading. +func (n *Node) updateNodeUpdateResponse(snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error { + reply.LeaderRPCAddr = n.srv.raft.Leader() + + // Reply with config information required for future RPC requests + reply.Servers = make([]*structs.NodeServerInfo, 0, len(n.srv.localPeers)) + for k, v := range n.srv.localPeers { + reply.Servers = append(reply.Servers, + &structs.NodeServerInfo{ + RpcAdvertiseAddr: k, + RpcMajorVersion: int32(v.MajorVersion), + RpcMinorVersion: int32(v.MinorVersion), + Datacenter: v.Datacenter, + }) + } + + // Capture all the nodes to obtain the node count + if snap == nil { + ss, err := n.srv.fsm.State().Snapshot() + if err != nil { + return err + } + snap = ss + } + iter, err := snap.Nodes() + if err == nil { + for { + raw := iter.Next() + if raw == nil { + break + } + reply.NumNodes++ + } + } + return nil } @@ -206,33 +253,12 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct } // Set the reply index and leader + reply.Index = index n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() - reply.Index = index - reply.LeaderRPCAddr = n.srv.raft.Leader() - - // Reply with config information required for future RPC requests - reply.Servers = make([]*structs.NodeServerInfo, 0, len(n.srv.localPeers)) - for k, v := range n.srv.localPeers { - reply.Servers = append(reply.Servers, - &structs.NodeServerInfo{ - RpcAdvertiseAddr: k, - RpcMajorVersion: int32(v.MajorVersion), - RpcMinorVersion: int32(v.MinorVersion), - Datacenter: v.Datacenter, - }) - } - - // Capture all the nodes to obtain the node count - iter, err := snap.Nodes() - if err == nil { - for { - raw := iter.Next() - if raw == nil { - break - } - reply.NumNodes++ - } + if err := n.updateNodeUpdateResponse(snap, reply); err != nil { + n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + return err } return nil @@ -326,6 +352,13 @@ func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUp // Set the reply index reply.Index = evalIndex + + n.srv.peerLock.RLock() + defer n.srv.peerLock.RUnlock() + if err := n.updateNodeUpdateResponse(snap, reply); err != nil { + n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + return err + } return nil } diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 9d825a182d6..e796b194c4c 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -237,6 
+237,28 @@ func TestClientEndpoint_UpdateStatus_GetEvals(t *testing.T) { func TestClientEndpoint_UpdateStatus_HeartbeatOnly(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() + + s2 := testServer(t, func(c *Config) { + c.DevDisableBootstrap = true + }) + defer s2.Shutdown() + + s3 := testServer(t, func(c *Config) { + c.DevDisableBootstrap = true + }) + defer s3.Shutdown() + servers := []*Server{s1, s2, s3} + testJoin(t, s1, s2, s3) + + for _, s := range servers { + testutil.WaitForResult(func() (bool, error) { + peers, _ := s.raftPeers.Peers() + return len(peers) == 3, nil + }, func(err error) { + t.Fatalf("should have 3 peers") + }) + } + codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) @@ -260,9 +282,9 @@ func TestClientEndpoint_UpdateStatus_HeartbeatOnly(t *testing.T) { } // Check for heartbeat servers - servers := resp.Servers - if len(servers) == 0 { - t.Fatalf("bad: %#v", servers) + serverAddrs := resp.Servers + if len(serverAddrs) == 0 { + t.Fatalf("bad: %#v", serverAddrs) } // Update the status, static state From f15d84e5e7dbcdc9b90138ef08bcbe3a0e5ba771 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 03:48:31 -0700 Subject: [PATCH 093/166] Use a monotonically incrementing number to create unique node names. Also remove the space from the "name" of the node --- nomad/server_test.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nomad/server_test.go b/nomad/server_test.go index 1ee4e7a65ff..a44cb88da1f 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -11,7 +11,10 @@ import ( "github.com/hashicorp/nomad/testutil" ) -var nextPort uint32 = 15000 +var ( + nextPort uint32 = 15000 + nodeNumber uint32 = 0 +) func getPort() int { return int(atomic.AddUint32(&nextPort, 1)) @@ -34,7 +37,8 @@ func testServer(t *testing.T, cb func(*Config)) *Server { IP: []byte{127, 0, 0, 1}, Port: getPort(), } - config.NodeName = fmt.Sprintf("Node %d", config.RPCAddr.Port) + nodeNumber = atomic.AddUint32(&nodeNumber, 1) + config.NodeName = fmt.Sprintf("nomad-%03d", nodeNumber) // Tighten the Serf timing config.SerfConfig.MemberlistConfig.BindAddr = "127.0.0.1" From 73c15603a962c902c4bc826d918381485819f11d Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 13:17:46 -0700 Subject: [PATCH 094/166] Rename `backupServerDeadline` to `consulPullHeartbeatDeadline` Suggested by: @alex --- client/client.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/client/client.go b/client/client.go index 7b3b481761c..9a2cbc671e7 100644 --- a/client/client.go +++ b/client/client.go @@ -100,11 +100,12 @@ type Client struct { logger *log.Logger - // backupServerDeadline is the deadline at which this Nomad Agent - // will begin polling Consul for a list of Nomad Servers. When Nomad - // Clients are heartbeating successfully with Nomad Servers, Nomad - // Clients do not poll Consul for a backup server list. - backupServerDeadline time.Time + // consulPullHeartbeatDeadline is the deadline at which this Nomad + // Agent will begin polling Consul for a list of Nomad Servers. When + // Nomad Clients are heartbeating successfully with Nomad Servers, + // Nomad Clients do not poll Consul to populate their backup server + // list. 
+ consulPullHeartbeatDeadline time.Time rpcProxy *rpcproxy.RpcProxy @@ -901,7 +902,7 @@ func (c *Client) updateNodeStatus() error { if err := c.rpcProxy.UpdateFromNodeUpdateResponse(&resp); err != nil { return err } - c.backupServerDeadline = time.Now().Add(2 * resp.HeartbeatTTL) + c.consulPullHeartbeatDeadline = time.Now().Add(2 * resp.HeartbeatTTL) return nil } @@ -1223,7 +1224,7 @@ func (c *Client) setupConsulSyncer() error { bootstrapFn := func() { now := time.Now() c.configLock.RLock() - if now.Before(c.backupServerDeadline) { + if now.Before(c.consulPullHeartbeatDeadline) { c.configLock.RUnlock() return } From 07799b636a826daed7582f08cabc78f47d2c26e0 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 14:13:51 -0700 Subject: [PATCH 095/166] Nuke the last of the explicit types in favor of using language idioms --- client/consul/sync.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index 75312a36f7b..729d859717c 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -20,9 +20,6 @@ import ( "github.com/hashicorp/nomad/nomad/types" ) -type notifyEvent struct{} -type notifyChannel chan notifyEvent - // Syncer allows syncing of services and checks with Consul type Syncer struct { client *consul.Client @@ -46,7 +43,7 @@ type Syncer struct { // periodicCallbacks is walked sequentially when the timer in Run // fires. periodicCallbacks map[string]types.PeriodicCallback - notifySyncCh notifyChannel + notifySyncCh chan struct{} periodicLock sync.RWMutex } @@ -168,7 +165,7 @@ func (c *Syncer) SetServiceIdentifier(serviceIdentifier string) *Syncer { // to be synced immediately. func (c *Syncer) SyncNow() { select { - case c.notifySyncCh <- notifyEvent{}: + case c.notifySyncCh <- struct{}{}: default: } } From 9de963497b038fac3c8543feb54405aa6e544dd8 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 22:51:04 -0700 Subject: [PATCH 096/166] Clean up various comments --- client/rpcproxy/rpcproxy.go | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index b0feed0282f..18ee9053d65 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -1,10 +1,10 @@ -// Package rpcproxy provides a proxy interface for Nomad Servers. The +// Package rpcproxy provides a proxy interface to Nomad Servers. The // RpcProxy periodically shuffles which server a Nomad Client communicates // with in order to redistribute load across Nomad Servers. Nomad Servers // that fail an RPC request are automatically cycled to the end of the list // until the server list is reshuffled. // -// The servers package does not provide any external API guarantees and +// The rpcproxy package does not provide any external API guarantees and // should be called only by `hashicorp/nomad`. package rpcproxy @@ -32,7 +32,7 @@ const ( // // For example, in a 10K Nomad cluster with 5x servers, this default // averages out to ~13 new connections from rebalancing per server - // per second (each connection is reused for 120s to 180s). + // per second. clientRPCJitterFraction = 2 // clientRPCMinReuseDuration controls the minimum amount of time RPC @@ -85,7 +85,7 @@ type serverList struct { type RpcProxy struct { // activatedList manages the list of Nomad Servers that are eligible - // to be queried by the Agent + // to be queried by the Client agent. 
activatedList atomic.Value listLock sync.Mutex @@ -488,12 +488,13 @@ func (p *RpcProxy) RebalanceServers() { return } -// reconcileServerList returns true when the first server in serverList (l) -// exists in the receiver's serverList (m). If true, the merged serverList -// (l) is stored as the receiver's serverList (m). Returns false if the -// first server in m does not exist in the passed in list (l) (i.e. was -// removed by Nomad during a PingNomadServer() call. Newly added servers are -// appended to the list and other missing servers are removed from the list. +// reconcileServerList returns true when the first server in serverList +// (l) exists in the receiver's serverList (p). If true, the merged +// serverList (l) is stored as the receiver's serverList (p). Returns +// false if the first server in p does not exist in the passed in list (l) +// (i.e. was removed by Nomad during a PingNomadServer() call. Newly added +// servers are appended to the list and other missing servers are removed +// from the list. func (p *RpcProxy) reconcileServerList(l *serverList) bool { p.listLock.Lock() defer p.listLock.Unlock() @@ -575,7 +576,7 @@ func (p *RpcProxy) RemoveServer(s *ServerEndpoint) { p.backupServers.removeServerByKey(k) } -// refreshServerRebalanceTimer is only called once m.rebalanceTimer expires. +// refreshServerRebalanceTimer is only called once p.rebalanceTimer expires. func (p *RpcProxy) refreshServerRebalanceTimer() time.Duration { l := p.getServerList() numServers := len(l.L) @@ -659,7 +660,7 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse } // 1) Create a map to reconcile the difference between - // m.primaryServers and resp.Servers. + // p.primaryServers and resp.Servers. type targetServer struct { server *ServerEndpoint From c8b2f7ce33fa290c13ce35eea27ab7dd59ddd252 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 23:26:54 -0700 Subject: [PATCH 097/166] Flesh out the comment re: the client.rpcproxy.Run() task. Requested by: Alex --- client/client.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/client/client.go b/client/client.go index 9a2cbc671e7..67c8002b7b6 100644 --- a/client/client.go +++ b/client/client.go @@ -218,7 +218,14 @@ func NewClient(cfg *config.Config, consulSyncer *consul.Syncer) (*Client, error) // Start collecting stats go c.collectHostStats() - // Start maintenance task for servers + // Start the RpcProxy maintenance task. This task periodically + // shuffles the list of Nomad Server Endpoints this Client will use + // when communicating with Nomad Servers via RPC. This is done in + // order to prevent server fixation in stable Nomad clusters. This + // task actively populates the active list of Nomad Server Endpoints + // from information from the Nomad Client heartbeats. If a heartbeat + // times out and there are no Nomad servers available, this data is + // populated by periodically polling Consul, if available. go c.rpcProxy.Run() return c, nil From 8dd833f9a83470e1a66fb889f7245b489be52bf5 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 23:30:59 -0700 Subject: [PATCH 098/166] Use the client configCopy and lock appropriately. 
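In sketch form, the idiom being applied is the usual RWMutex-guarded copy; Client and Config below are trimmed-down stand-ins for the real client types, not the actual Nomad definitions:

package main

import (
	"fmt"
	"sync"
)

// Config and Client are trimmed-down stand-ins for the real client types.
type Config struct{ Datacenter string }

func (c *Config) Copy() *Config {
	cp := *c
	return &cp
}

type Client struct {
	config     *Config
	configLock sync.RWMutex
	configCopy *Config // guarded by configLock
}

// Writers hold the exclusive lock while (re)publishing the copy...
func (c *Client) storeConfigCopy() {
	c.configLock.Lock()
	c.configCopy = c.config.Copy()
	c.configLock.Unlock()
}

// ...and readers hold the shared lock for the duration of the access.
func (c *Client) Datacenter() string {
	c.configLock.RLock()
	dc := c.configCopy.Datacenter
	c.configLock.RUnlock()
	return dc
}

func main() {
	c := &Client{config: &Config{Datacenter: "dc1"}}
	c.storeConfigCopy()
	fmt.Println(c.Datacenter())
}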
--- client/client.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/client/client.go b/client/client.go index 67c8002b7b6..7ee2840846b 100644 --- a/client/client.go +++ b/client/client.go @@ -276,7 +276,10 @@ func (c *Client) Leave() error { // Datacenter returns the datacenter for the given client func (c *Client) Datacenter() string { - return c.config.Node.Datacenter + c.configLock.RLock() + dc := c.configCopy.Node.Datacenter + c.configLock.RUnlock() + return dc } // Region returns the region for the given client From 8d478b96f7bc55b716b284fe60081c27fb029260 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 1 Jun 2016 23:36:28 -0700 Subject: [PATCH 099/166] Make the locking protocol more explicit in client.NewClient With an over abundance of caution, prevent future copy/pasta by using the right locks when bootstrapping a Client. Strictly speaking this is not necessary, but it makes explicit the locking semantics and guards against future concurrent or parallel initialization. --- client/client.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/client/client.go b/client/client.go index 7ee2840846b..d4eb95537f4 100644 --- a/client/client.go +++ b/client/client.go @@ -182,16 +182,20 @@ func NewClient(cfg *config.Config, consulSyncer *consul.Syncer) (*Client, error) // Setup the reserved resources c.reservePorts() + // Store the config copy before restoring state but after it has been + // initialized. + c.configLock.Lock() + c.configCopy = c.config.Copy() + c.configLock.Unlock() + // Create the RPC Proxy and bootstrap with the preconfigured list of // static servers + c.configLock.RLock() c.rpcProxy = rpcproxy.New(c.logger, c.shutdownCh, c, c.connPool) - for _, serverAddr := range c.config.Servers { + for _, serverAddr := range c.configCopy.Servers { c.rpcProxy.AddPrimaryServer(serverAddr) } - - // Store the config copy before restoring state but after it has been - // initialized. - c.configCopy = c.config.Copy() + c.configLock.RUnlock() // Restore the state if err := c.restoreState(); err != nil { From f0c6b709111408a7ff9278ef73bdfffde492d4cb Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 00:11:21 -0700 Subject: [PATCH 100/166] Fix up the comments Pointed out by: @dadgar --- client/client.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/client/client.go b/client/client.go index d4eb95537f4..a85dad3eea2 100644 --- a/client/client.go +++ b/client/client.go @@ -291,12 +291,14 @@ func (c *Client) Region() string { return c.config.Region } -// Region returns the structs.ApiMajorVersion in use by the client +// RpcMajorVersion returns the structs.ApiMajorVersion supported by the +// client. func (c *Client) RpcMajorVersion() int { return structs.ApiMajorVersion } -// Region returns the structs.ApiMinorVersion in use by the client +// RpcMinorVersion returns the structs.ApiMinorVersion supported by the +// client. func (c *Client) RpcMinorVersion() int { return structs.ApiMinorVersion } From 1ec537450e4f4d48ad12a32d8e136a74fadebd2a Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 00:12:30 -0700 Subject: [PATCH 101/166] Rename rpcproxy.UpdateFromNodeUpdateResponse to RefreshServerLists While breaking the API within this PR, break out the individual arguments to RefreshServerLists. The servers parameter is reusing `structs.NodeServerInfo` for the time being, but this can be revisited if the needs of the structure diverge in the future.
--- client/client.go | 2 +- client/rpcproxy/rpcproxy.go | 32 +++++++++++++++++--------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/client/client.go b/client/client.go index a85dad3eea2..3ac5bce954d 100644 --- a/client/client.go +++ b/client/client.go @@ -915,7 +915,7 @@ func (c *Client) updateNodeStatus() error { c.lastHeartbeat = time.Now() c.heartbeatTTL = resp.HeartbeatTTL - if err := c.rpcProxy.UpdateFromNodeUpdateResponse(&resp); err != nil { + if err := c.rpcProxy.RefreshServerLists(resp.Servers, resp.NumNodes, resp.LeaderRPCAddr); err != nil { return err } c.consulPullHeartbeatDeadline = time.Now().Add(2 * resp.HeartbeatTTL) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 18ee9053d65..899f0434b93 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -631,14 +631,16 @@ func (p *RpcProxy) Run() { } } -// UpdateFromNodeUpdateResponse handles heartbeat responses from Nomad -// Servers. Heartbeats contain a list of Nomad Servers that the client -// should talk with for RPC requests. UpdateFromNodeUpdateResponse does not -// rebalance its serverList, that is handled elsewhere. New servers learned -// via the heartbeat are appended to the RpcProxy's serverList. Removed -// servers are removed immediately. Servers speaking a newer RPC version are -// filtered from the serverList. -func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse) error { +// RefreshServerLists is called when the Client receives an update from a +// Nomad Server. The response from Nomad Client Heartbeats contain a list of +// Nomad Servers that the Nomad Client should use for RPC requests. +// RefreshServerLists does not rebalance its serverLists (that is handled +// elsewhere via a periodic timer). New Nomad Servers learned via the +// heartbeat are appended to the RpcProxy's activated serverList. Servers +// that are no longer present in the Heartbeat are removed immediately from +// all server lists. Nomad Servers speaking a newer major or minor API +// version are filtered from the serverList. +func (p *RpcProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNodes int32, leaderRpcAddr string) error { // Merge all servers found in the response. Servers in the response // with newer API versions are filtered from the list. If the list // is missing an address found in the RpcProxy's server list, remove @@ -655,12 +657,12 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse // Clear the backup server list when a heartbeat contains at least // one server. - if len(resp.Servers) > 0 && len(p.backupServers.L) > 0 { - p.backupServers.L = make([]*ServerEndpoint, 0, len(resp.Servers)) + if len(servers) > 0 && len(p.backupServers.L) > 0 { + p.backupServers.L = make([]*ServerEndpoint, 0, len(servers)) } // 1) Create a map to reconcile the difference between - // p.primaryServers and resp.Servers. + // p.primaryServers and servers. 
type targetServer struct { server *ServerEndpoint @@ -669,7 +671,7 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse // 'n' == new state byte } - mergedPrimaryMap := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(resp.Servers)) + mergedPrimaryMap := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(servers)) numOldServers := 0 for _, s := range p.primaryServers.L { mergedPrimaryMap[*s.Key()] = &targetServer{server: s, state: 'o'} @@ -677,7 +679,7 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse } numBothServers := 0 var newServers bool - for _, s := range resp.Servers { + for _, s := range servers { // Filter out servers using a newer API version. Prevent // spamming the logs every heartbeat. // @@ -758,8 +760,8 @@ func (p *RpcProxy) UpdateFromNodeUpdateResponse(resp *structs.NodeUpdateResponse } } - p.numNodes = int(resp.NumNodes) - p.leaderAddr = resp.LeaderRPCAddr + p.numNodes = int(numNodes) + p.leaderAddr = leaderRpcAddr p.saveServerList(newServerCfg) return nil From cf052e5f294980fbf0926393288cf75a38510fd2 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 00:20:28 -0700 Subject: [PATCH 102/166] Line wrap long line. --- client/client.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/client/client.go b/client/client.go index 3ac5bce954d..1cf568312bc 100644 --- a/client/client.go +++ b/client/client.go @@ -1247,7 +1247,9 @@ func (c *Client) setupConsulSyncer() error { c.configLock.RUnlock() nomadServerServiceName := c.config.ConsulConfig.ServerServiceName - services, _, err := c.consulSyncer.ConsulClient().Catalog().Service(nomadServerServiceName, consul.ServiceTagRpc, &consulapi.QueryOptions{AllowStale: true}) + services, _, err := c.consulSyncer.ConsulClient().Catalog(). + Service(nomadServerServiceName, consul.ServiceTagRpc, + &consulapi.QueryOptions{AllowStale: true}) if err != nil { c.logger.Printf("[WARN] client: unable to query service %q: %v", nomadServerServiceName, err) return From 8bbd76374028261fade1c044350cc5f2fa20fd0b Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 00:42:05 -0700 Subject: [PATCH 103/166] Use client.getAllocRunners() where appropriate. 
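The helper itself is not part of this diff; as a rough, assumed sketch of the shape such an accessor takes, it copies the map under the read lock so callers can range over the result without holding allocLock (AllocRunner and Client below are simplified stand-ins):

package main

import (
	"fmt"
	"sync"
)

// AllocRunner and Client are simplified stand-ins for the real client types.
type AllocRunner struct{ ID string }

type Client struct {
	allocLock sync.RWMutex
	allocs    map[string]*AllocRunner
}

// getAllocRunners returns a point-in-time copy of the alloc map so callers
// can iterate without holding allocLock while they work.
func (c *Client) getAllocRunners() map[string]*AllocRunner {
	c.allocLock.RLock()
	defer c.allocLock.RUnlock()
	runners := make(map[string]*AllocRunner, len(c.allocs))
	for id, ar := range c.allocs {
		runners[id] = ar
	}
	return runners
}

func main() {
	c := &Client{allocs: map[string]*AllocRunner{"a1": {ID: "a1"}}}
	for id := range c.getAllocRunners() {
		fmt.Println(id)
	}
}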
--- client/client.go | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/client/client.go b/client/client.go index 1cf568312bc..52f55b89347 100644 --- a/client/client.go +++ b/client/client.go @@ -389,8 +389,7 @@ func (c *Client) StatsReporter() ClientStatsReporter { // Nomad client func (c *Client) AllocStats() map[string]AllocStatsReporter { res := make(map[string]AllocStatsReporter) - allocRunners := c.getAllocRunners() - for alloc, ar := range allocRunners { + for alloc, ar := range c.getAllocRunners() { res[alloc] = ar } return res @@ -1278,15 +1277,7 @@ func (c *Client) setupConsulSyncer() error { } services := make(map[string]struct{}) - // Get the existing allocs - c.allocLock.RLock() - allocs := make([]*AllocRunner, 0, len(c.allocs)) - for _, ar := range c.allocs { - allocs = append(allocs, ar) - } - c.allocLock.RUnlock() - - for _, ar := range allocs { + for allocId, ar := range c.getAllocRunners() { ar.taskStatusLock.RLock() taskStates := copyTaskStates(ar.taskStates) ar.taskStatusLock.RUnlock() @@ -1294,7 +1285,7 @@ func (c *Client) setupConsulSyncer() error { if taskState.State == structs.TaskStateRunning { if tr, ok := ar.tasks[taskName]; ok { for _, service := range tr.task.Services { - svcIdentifier := fmt.Sprintf("%s-%s", ar.alloc.ID, tr.task.Name) + svcIdentifier := fmt.Sprintf("%s-%s", allocId, tr.task.Name) services[service.ID(svcIdentifier)] = struct{}{} } } From 998f285be0018fadab210e42798f297707b8be80 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 00:52:49 -0700 Subject: [PATCH 104/166] Ensure that all accesses to Client.alloc are wrapped by allocLock. --- client/client.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/client/client.go b/client/client.go index 52f55b89347..59211d9fac1 100644 --- a/client/client.go +++ b/client/client.go @@ -315,10 +315,12 @@ func (c *Client) Shutdown() error { // Destroy all the running allocations. if c.config.DevMode { + c.allocLock.Lock() for _, ar := range c.allocs { ar.Destroy() <-ar.WaitCh() } + c.allocLock.Unlock() } c.shutdown = true @@ -441,6 +443,9 @@ func (c *Client) HostStatsTS(since int64) []*stats.HostStats { // GetAllocFS returns the AllocFS interface for the alloc dir of an allocation func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) { + c.allocLock.RLock() + defer c.allocLock.RUnlock() + ar, ok := c.allocs[allocID] if !ok { return nil, fmt.Errorf("alloc not found") @@ -476,7 +481,9 @@ func (c *Client) restoreState() error { c.configLock.RLock() ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc) c.configLock.RUnlock() + c.allocLock.Lock() c.allocs[id] = ar + c.allocLock.Unlock() if err := ar.RestoreState(); err != nil { c.logger.Printf("[ERR] client: failed to restore state for alloc %s: %v", id, err) mErr.Errors = append(mErr.Errors, err) From 86b5d318f8982f9cb1c0b1dbe55b584c141bccf3 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 00:56:55 -0700 Subject: [PATCH 105/166] Move `const` block to the top of the file. 
Requested by: @dadgar --- client/consul/sync.go | 54 +++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index 729d859717c..2888f23e868 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -20,33 +20,6 @@ import ( "github.com/hashicorp/nomad/nomad/types" ) -// Syncer allows syncing of services and checks with Consul -type Syncer struct { - client *consul.Client - runChecks bool - - serviceIdentifier string // serviceIdentifier is a token which identifies which task/alloc the service belongs to - delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul - createCheck func(*structs.ServiceCheck, string) (Check, error) - addrFinder func(portLabel string) (string, int) - - trackedServices map[string]*consul.AgentService - trackedChecks map[string]*consul.AgentCheckRegistration - checkRunners map[string]*CheckRunner - - logger *log.Logger - - shutdownCh chan struct{} - shutdown bool - shutdownLock sync.Mutex - - // periodicCallbacks is walked sequentially when the timer in Run - // fires. - periodicCallbacks map[string]types.PeriodicCallback - notifySyncCh chan struct{} - periodicLock sync.RWMutex -} - const ( // initialSyncBuffer is the max time an initial sync will sleep // before syncing. @@ -76,6 +49,33 @@ const ( ServiceTagSerf = "serf" ) +// Syncer allows syncing of services and checks with Consul +type Syncer struct { + client *consul.Client + runChecks bool + + serviceIdentifier string // serviceIdentifier is a token which identifies which task/alloc the service belongs to + delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul + createCheck func(*structs.ServiceCheck, string) (Check, error) + addrFinder func(portLabel string) (string, int) + + trackedServices map[string]*consul.AgentService + trackedChecks map[string]*consul.AgentCheckRegistration + checkRunners map[string]*CheckRunner + + logger *log.Logger + + shutdownCh chan struct{} + shutdown bool + shutdownLock sync.Mutex + + // periodicCallbacks is walked sequentially when the timer in Run + // fires. + periodicCallbacks map[string]types.PeriodicCallback + notifySyncCh chan struct{} + periodicLock sync.RWMutex +} + // NewSyncer returns a new consul.Syncer func NewSyncer(config *config.ConsulConfig, logger *log.Logger) (*Syncer, error) { var err error From bbf7348abc8273be4d2956f568508aae8bde2a30 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 00:57:22 -0700 Subject: [PATCH 106/166] Bump the default Consul client timeout from 500ms to 5s. 
Requsted by: @dadgar --- client/config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/config/config.go b/client/config/config.go index b0f993d0e6e..d2ae39e4d9a 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -139,7 +139,7 @@ func DefaultConfig() *Config { ServerServiceName: "nomad", ClientServiceName: "nomad-client", AutoRegister: true, - Timeout: 500 * time.Millisecond, + Timeout: 5 * time.Second, }, LogOutput: os.Stderr, Region: "global", From 273e8cf3c9b50e19f7d9d18fc4bb3a7e06c9ac37 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 01:13:24 -0700 Subject: [PATCH 107/166] Collapse rpcproxy_internal_test.go into rpcproxy_test.go Requested by: @dadgar --- client/rpcproxy/rpcproxy_internal_test.go | 363 ---------------------- client/rpcproxy/rpcproxy_test.go | 340 ++++++++++++++++++-- 2 files changed, 311 insertions(+), 392 deletions(-) delete mode 100644 client/rpcproxy/rpcproxy_internal_test.go diff --git a/client/rpcproxy/rpcproxy_internal_test.go b/client/rpcproxy/rpcproxy_internal_test.go deleted file mode 100644 index d5ec99ab341..00000000000 --- a/client/rpcproxy/rpcproxy_internal_test.go +++ /dev/null @@ -1,363 +0,0 @@ -package rpcproxy - -import ( - "bytes" - "fmt" - "log" - "math/rand" - "os" - "testing" - "time" - - "github.com/hashicorp/nomad/nomad/structs" -) - -var ( - localLogger *log.Logger - localLogBuffer *bytes.Buffer -) - -func init() { - localLogBuffer = new(bytes.Buffer) - localLogger = log.New(localLogBuffer, "", 0) -} - -func GetBufferedLogger() *log.Logger { - return localLogger -} - -type fauxConnPool struct { - // failPct between 0.0 and 1.0 == pct of time a Ping should fail - failPct float64 -} - -func (cp *fauxConnPool) PingNomadServer(region string, version int, s *ServerEndpoint) (bool, error) { - var success bool - successProb := rand.Float64() - if successProb > cp.failPct { - success = true - } - return success, nil -} - -type fauxSerf struct { - numNodes int -} - -func (s *fauxSerf) NumNodes() int { - return s.numNodes -} - -func (s *fauxSerf) Region() string { - return "global" -} - -func (s *fauxSerf) Datacenter() string { - return "dc1" -} - -func (s *fauxSerf) RpcMajorVersion() int { - return structs.ApiMajorVersion -} - -func (s *fauxSerf) RpcMinorVersion() int { - return structs.ApiMinorVersion -} - -func testManager() (p *RpcProxy) { - logger := GetBufferedLogger() - shutdownCh := make(chan struct{}) - p = New(logger, shutdownCh, &fauxSerf{numNodes: 16384}, &fauxConnPool{}) - return p -} - -func testManagerFailProb(failPct float64) (p *RpcProxy) { - logger := GetBufferedLogger() - logger = log.New(os.Stderr, "", log.LstdFlags) - shutdownCh := make(chan struct{}) - p = New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) - return p -} - -// func (l *serverList) cycleServer() (servers []*Server) { -func TestManagerInternal_cycleServer(t *testing.T) { - m := testManager() - l := m.getServerList() - - server0 := &ServerEndpoint{Name: "server1"} - server1 := &ServerEndpoint{Name: "server2"} - server2 := &ServerEndpoint{Name: "server3"} - l.L = append(l.L, server0, server1, server2) - m.saveServerList(l) - - l = m.getServerList() - if len(l.L) != 3 { - t.Fatalf("server length incorrect: %d/3", len(l.L)) - } - if l.L[0] != server0 && - l.L[1] != server1 && - l.L[2] != server2 { - t.Fatalf("initial server ordering not correct") - } - - l.L = l.cycleServer() - if len(l.L) != 3 { - t.Fatalf("server length incorrect: %d/3", len(l.L)) - } - if l.L[0] != server1 
&& - l.L[1] != server2 && - l.L[2] != server0 { - t.Fatalf("server ordering after one cycle not correct") - } - - l.L = l.cycleServer() - if len(l.L) != 3 { - t.Fatalf("server length incorrect: %d/3", len(l.L)) - } - if l.L[0] != server2 && - l.L[1] != server0 && - l.L[2] != server1 { - t.Fatalf("server ordering after two cycles not correct") - } - - l.L = l.cycleServer() - if len(l.L) != 3 { - t.Fatalf("server length incorrect: %d/3", len(l.L)) - } - if l.L[0] != server0 && - l.L[1] != server1 && - l.L[2] != server2 { - t.Fatalf("server ordering after three cycles not correct") - } -} - -// func (m *Manager) getServerList() serverList { -func TestManagerInternal_getServerList(t *testing.T) { - m := testManager() - l := m.getServerList() - if l.L == nil { - t.Fatalf("serverList.servers nil") - } - - if len(l.L) != 0 { - t.Fatalf("serverList.servers length not zero") - } -} - -func TestManagerInternal_NewManager(t *testing.T) { - m := testManager() - if m == nil { - t.Fatalf("Manager nil") - } - - if m.logger == nil { - t.Fatalf("Manager.logger nil") - } - - if m.shutdownCh == nil { - t.Fatalf("Manager.shutdownCh nil") - } -} - -// func (m *Manager) reconcileServerList(l *serverList) bool { -func TestManagerInternal_reconcileServerList(t *testing.T) { - tests := []int{0, 1, 2, 3, 4, 5, 10, 100} - for _, n := range tests { - ok, err := test_reconcileServerList(n) - if !ok { - t.Errorf("Expected %d to pass: %v", n, err) - } - } -} - -func test_reconcileServerList(maxServers int) (bool, error) { - // Build a server list, reconcile, verify the missing servers are - // missing, the added have been added, and the original server is - // present. - const failPct = 0.5 - m := testManagerFailProb(failPct) - - var failedServers, healthyServers []*ServerEndpoint - for i := 0; i < maxServers; i++ { - nodeName := fmt.Sprintf("s%02d", i) - - node := &ServerEndpoint{Name: nodeName} - // Add 66% of servers to Manager - if rand.Float64() > 0.33 { - m.activateEndpoint(node) - - // Of healthy servers, (ab)use connPoolPinger to - // failPct of the servers for the reconcile. This - // allows for the selected server to no longer be - // healthy for the reconcile below. - if ok, _ := m.connPoolPinger.PingNomadServer(m.configInfo.Region(), m.configInfo.RpcMajorVersion(), node); ok { - // Will still be present - healthyServers = append(healthyServers, node) - } else { - // Will be missing - failedServers = append(failedServers, node) - } - } else { - // Will be added from the call to reconcile - healthyServers = append(healthyServers, node) - } - } - - // Randomize Manager's server list - m.RebalanceServers() - selectedServer := m.FindServer() - - var selectedServerFailed bool - for _, s := range failedServers { - if selectedServer.Key().Equal(s.Key()) { - selectedServerFailed = true - break - } - } - - // Update Manager's server list to be "healthy" based on Serf. - // Reconcile this with origServers, which is shuffled and has a live - // connection, but possibly out of date. - origServers := m.getServerList() - m.saveServerList(serverList{L: healthyServers}) - - // This should always succeed with non-zero server lists - if !selectedServerFailed && !m.reconcileServerList(&origServers) && - len(m.getServerList().L) != 0 && - len(origServers.L) != 0 { - // If the random gods are unfavorable and we end up with zero - // length lists, expect things to fail and retry the test. 
- return false, fmt.Errorf("Expected reconcile to succeed: %v %d %d", - selectedServerFailed, - len(m.getServerList().L), - len(origServers.L)) - } - - // If we have zero-length server lists, test succeeded in degenerate - // case. - if len(m.getServerList().L) == 0 && - len(origServers.L) == 0 { - // Failed as expected w/ zero length list - return true, nil - } - - resultingServerMap := make(map[EndpointKey]bool) - for _, s := range m.getServerList().L { - resultingServerMap[*s.Key()] = true - } - - // Test to make sure no failed servers are in the Manager's - // list. Error if there are any failedServers in l.servers - for _, s := range failedServers { - _, ok := resultingServerMap[*s.Key()] - if ok { - return false, fmt.Errorf("Found failed server %v in merged list %v", s, resultingServerMap) - } - } - - // Test to make sure all healthy servers are in the healthy list. - if len(healthyServers) != len(m.getServerList().L) { - return false, fmt.Errorf("Expected healthy map and servers to match: %d/%d", len(healthyServers), len(healthyServers)) - } - - // Test to make sure all healthy servers are in the resultingServerMap list. - for _, s := range healthyServers { - _, ok := resultingServerMap[*s.Key()] - if !ok { - return false, fmt.Errorf("Server %v missing from healthy map after merged lists", s) - } - } - return true, nil -} - -// func (l *serverList) refreshServerRebalanceTimer() { -func TestManagerInternal_refreshServerRebalanceTimer(t *testing.T) { - type clusterSizes struct { - numNodes int - numServers int - minRebalance time.Duration - } - clusters := []clusterSizes{ - {0, 3, 10 * time.Minute}, - {1, 0, 10 * time.Minute}, // partitioned cluster - {1, 3, 10 * time.Minute}, - {2, 3, 10 * time.Minute}, - {100, 0, 10 * time.Minute}, // partitioned - {100, 1, 10 * time.Minute}, // partitioned - {100, 3, 10 * time.Minute}, - {1024, 1, 10 * time.Minute}, // partitioned - {1024, 3, 10 * time.Minute}, // partitioned - {1024, 5, 10 * time.Minute}, - {16384, 1, 10 * time.Minute}, // partitioned - {16384, 2, 10 * time.Minute}, // partitioned - {16384, 3, 10 * time.Minute}, // partitioned - {16384, 5, 10 * time.Minute}, - {65535, 0, 10 * time.Minute}, // partitioned - {65535, 1, 10 * time.Minute}, // partitioned - {65535, 2, 10 * time.Minute}, // partitioned - {65535, 3, 10 * time.Minute}, // partitioned - {65535, 5, 10 * time.Minute}, // partitioned - {65535, 7, 10 * time.Minute}, - {1000000, 1, 10 * time.Minute}, // partitioned - {1000000, 2, 10 * time.Minute}, // partitioned - {1000000, 3, 10 * time.Minute}, // partitioned - {1000000, 5, 10 * time.Minute}, // partitioned - {1000000, 11, 10 * time.Minute}, // partitioned - {1000000, 19, 10 * time.Minute}, - } - - logger := log.New(os.Stderr, "", log.LstdFlags) - shutdownCh := make(chan struct{}) - - for i, s := range clusters { - m := New(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) - for i := 0; i < s.numServers; i++ { - nodeName := fmt.Sprintf("s%02d", i) - m.activateEndpoint(&ServerEndpoint{Name: nodeName}) - } - - d := m.refreshServerRebalanceTimer() - if d < s.minRebalance { - t.Errorf("[%d] duration too short for cluster of size %d and %d servers (%s < %s)", i, s.numNodes, s.numServers, d, s.minRebalance) - } - } -} - -// func (m *Manager) saveServerList(l serverList) { -func TestManagerInternal_saveServerList(t *testing.T) { - m := testManager() - - // Initial condition - func() { - l := m.getServerList() - if len(l.L) != 0 { - t.Fatalf("Manager.saveServerList failed to load init config") - } - - newServer := 
new(ServerEndpoint) - l.L = append(l.L, newServer) - m.saveServerList(l) - }() - - // Test that save works - func() { - l1 := m.getServerList() - t1NumServers := len(l1.L) - if t1NumServers != 1 { - t.Fatalf("Manager.saveServerList failed to save mutated config") - } - }() - - // Verify mutation w/o a save doesn't alter the original - func() { - newServer := new(ServerEndpoint) - l := m.getServerList() - l.L = append(l.L, newServer) - - l_orig := m.getServerList() - origNumServers := len(l_orig.L) - if origNumServers >= len(l.L) { - t.Fatalf("Manager.saveServerList unsaved config overwrote original") - } - }() -} diff --git a/client/rpcproxy/rpcproxy_test.go b/client/rpcproxy/rpcproxy_test.go index fc35970032c..fe900e701b3 100644 --- a/client/rpcproxy/rpcproxy_test.go +++ b/client/rpcproxy/rpcproxy_test.go @@ -1,8 +1,9 @@ -package rpcproxy_test +package rpcproxy import ( "bytes" "encoding/binary" + "fmt" "log" "math/rand" "net" @@ -10,8 +11,7 @@ import ( "strings" "sync/atomic" "testing" - - "github.com/hashicorp/nomad/client/rpcproxy" + "time" ) const ( @@ -50,11 +50,10 @@ func GetBufferedLogger() *log.Logger { type fauxConnPool struct { // failPct between 0.0 and 1.0 == pct of time a Ping should fail - failPct float64 - datacenter string + failPct float64 } -func (cp *fauxConnPool) PingNomadServer(region string, majorVersion int, server *rpcproxy.ServerEndpoint) (bool, error) { +func (cp *fauxConnPool) PingNomadServer(region string, majorVersion int, s *ServerEndpoint) (bool, error) { var success bool successProb := rand.Float64() if successProb > cp.failPct { @@ -71,10 +70,6 @@ type fauxSerf struct { rpcMajorVersion int } -func (s *fauxSerf) Datacenter() string { - return s.datacenter -} - func (s *fauxSerf) NumNodes() int { return s.numNodes } @@ -83,6 +78,10 @@ func (s *fauxSerf) Region() string { return s.region } +func (s *fauxSerf) Datacenter() string { + return s.datacenter +} + func (s *fauxSerf) RpcMajorVersion() int { return s.rpcMajorVersion } @@ -91,24 +90,24 @@ func (s *fauxSerf) RpcMinorVersion() int { return s.rpcMinorVersion } -func testRpcProxy() (p *rpcproxy.RpcProxy) { +func testRpcProxy() (p *RpcProxy) { logger := GetBufferedLogger() logger = log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - p = rpcproxy.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) + p = New(logger, shutdownCh, &fauxSerf{numNodes: 16384}, &fauxConnPool{}) return p } -func testRpcProxyFailProb(failPct float64) (p *rpcproxy.RpcProxy) { +func testRpcProxyFailProb(failPct float64) (p *RpcProxy) { logger := GetBufferedLogger() logger = log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - p = rpcproxy.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) + p = New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) return p } -// func (p *RpcProxy) AddPrimaryServer(server *rpcproxy.ServerEndpoint) { -func TestServers_AddPrimaryServer(t *testing.T) { +// func (p *RpcProxy) AddPrimaryServer(server *ServerEndpoint) { +func TestRpcProxy_AddPrimaryServer(t *testing.T) { p := testRpcProxy() var num int num = p.NumServers() @@ -155,8 +154,8 @@ func TestServers_AddPrimaryServer(t *testing.T) { } } -// func (p *RpcProxy) FindServer() (server *rpcproxy.ServerEndpoint) { -func TestServers_FindServer(t *testing.T) { +// func (p *RpcProxy) FindServer() (server *ServerEndpoint) { +func TestRpcProxy_FindServer(t *testing.T) { p := testRpcProxy() if p.FindServer() != nil { @@ -206,18 +205,18 @@ func TestServers_FindServer(t 
*testing.T) { } // func New(logger *log.Logger, shutdownCh chan struct{}) (p *RpcProxy) { -func TestServers_New(t *testing.T) { +func TestRpcProxy_New(t *testing.T) { logger := GetBufferedLogger() logger = log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - p := rpcproxy.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) + p := New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) if p == nil { t.Fatalf("RpcProxy nil") } } -// func (p *RpcProxy) NotifyFailedServer(server *rpcproxy.ServerEndpoint) { -func TestServers_NotifyFailedServer(t *testing.T) { +// func (p *RpcProxy) NotifyFailedServer(server *ServerEndpoint) { +func TestRpcProxy_NotifyFailedServer(t *testing.T) { p := testRpcProxy() if p.NumServers() != 0 { @@ -293,10 +292,10 @@ func TestServers_NotifyFailedServer(t *testing.T) { } // func (p *RpcProxy) NumServers() (numServers int) { -func TestServers_NumServers(t *testing.T) { +func TestRpcProxy_NumServers(t *testing.T) { p := testRpcProxy() const maxNumServers = 100 - serverList := make([]*rpcproxy.ServerEndpoint, 0, maxNumServers) + serverList := make([]*ServerEndpoint, 0, maxNumServers) // Add some servers for i := 0; i < maxNumServers; i++ { @@ -332,7 +331,7 @@ func TestServers_NumServers(t *testing.T) { } // func (p *RpcProxy) RebalanceServers() { -func TestServers_RebalanceServers(t *testing.T) { +func TestRpcProxy_RebalanceServers(t *testing.T) { const failPct = 0.5 p := testRpcProxyFailProb(failPct) const maxServers = 100 @@ -367,7 +366,7 @@ func TestServers_RebalanceServers(t *testing.T) { } } -// func (p *RpcProxy) RemoveServer(server *rpcproxy.ServerEndpoint) { +// func (p *RpcProxy) RemoveServer(server *ServerEndpoint) { func TestRpcProxy_RemoveServer(t *testing.T) { p := testRpcProxy() if p.NumServers() != 0 { @@ -430,7 +429,7 @@ func TestRpcProxy_RemoveServer(t *testing.T) { p.AddPrimaryServer(s2Endpoint) const maxServers = 19 - servers := make([]*rpcproxy.ServerEndpoint, 0, maxServers) + servers := make([]*ServerEndpoint, 0, maxServers) servers = append(servers, s1) servers = append(servers, s2) // Already added two servers above @@ -448,7 +447,7 @@ func TestRpcProxy_RemoveServer(t *testing.T) { t.Fatalf("Expected %d servers, received %d", maxServers, p.NumServers()) } - findServer := func(server *rpcproxy.ServerEndpoint) bool { + findServer := func(server *ServerEndpoint) bool { for i := p.NumServers(); i > 0; i-- { s := p.FindServer() if s == server { @@ -459,7 +458,7 @@ func TestRpcProxy_RemoveServer(t *testing.T) { } expectedNumServers := maxServers - removedServers := make([]*rpcproxy.ServerEndpoint, 0, maxServers) + removedServers := make([]*ServerEndpoint, 0, maxServers) // Remove servers from the front of the list for i := 3; i > 0; i-- { @@ -534,3 +533,286 @@ func TestRpcProxy_RemoveServer(t *testing.T) { } // func (p *RpcProxy) Start() { + +// func (l *serverList) cycleServer() (servers []*Server) { +func TestRpcProxyInternal_cycleServer(t *testing.T) { + p := testRpcProxy() + l := p.getServerList() + + server0 := &ServerEndpoint{Name: "server1"} + server1 := &ServerEndpoint{Name: "server2"} + server2 := &ServerEndpoint{Name: "server3"} + l.L = append(l.L, server0, server1, server2) + p.saveServerList(l) + + l = p.getServerList() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server0 && + l.L[1] != server1 && + l.L[2] != server2 { + t.Fatalf("initial server ordering not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + 
} + if l.L[0] != server1 && + l.L[1] != server2 && + l.L[2] != server0 { + t.Fatalf("server ordering after one cycle not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server2 && + l.L[1] != server0 && + l.L[2] != server1 { + t.Fatalf("server ordering after two cycles not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server0 && + l.L[1] != server1 && + l.L[2] != server2 { + t.Fatalf("server ordering after three cycles not correct") + } +} + +// func (p *RpcProxy) getServerList() serverList { +func TestRpcProxyInternal_getServerList(t *testing.T) { + p := testRpcProxy() + l := p.getServerList() + if l.L == nil { + t.Fatalf("serverList.servers nil") + } + + if len(l.L) != 0 { + t.Fatalf("serverList.servers length not zero") + } +} + +func TestRpcProxyInternal_New(t *testing.T) { + p := testRpcProxy() + if p == nil { + t.Fatalf("bad") + } + + if p.logger == nil { + t.Fatalf("bad") + } + + if p.shutdownCh == nil { + t.Fatalf("bad") + } +} + +// func (p *RpcProxy) reconcileServerList(l *serverList) bool { +func TestRpcProxyInternal_reconcileServerList(t *testing.T) { + tests := []int{0, 1, 2, 3, 4, 5, 10, 100} + for _, n := range tests { + ok, err := test_reconcileServerList(n) + if !ok { + t.Errorf("Expected %d to pass: %v", n, err) + } + } +} + +func test_reconcileServerList(maxServers int) (bool, error) { + // Build a server list, reconcile, verify the missing servers are + // missing, the added have been added, and the original server is + // present. + const failPct = 0.5 + p := testRpcProxyFailProb(failPct) + + var failedServers, healthyServers []*ServerEndpoint + for i := 0; i < maxServers; i++ { + nodeName := fmt.Sprintf("s%02d", i) + + node := &ServerEndpoint{Name: nodeName} + // Add 66% of servers to RpcProxy + if rand.Float64() > 0.33 { + p.activateEndpoint(node) + + // Of healthy servers, (ab)use connPoolPinger to + // failPct of the servers for the reconcile. This + // allows for the selected server to no longer be + // healthy for the reconcile below. + if ok, _ := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RpcMajorVersion(), node); ok { + // Will still be present + healthyServers = append(healthyServers, node) + } else { + // Will be missing + failedServers = append(failedServers, node) + } + } else { + // Will be added from the call to reconcile + healthyServers = append(healthyServers, node) + } + } + + // Randomize RpcProxy's server list + p.RebalanceServers() + selectedServer := p.FindServer() + + var selectedServerFailed bool + for _, s := range failedServers { + if selectedServer.Key().Equal(s.Key()) { + selectedServerFailed = true + break + } + } + + // Update RpcProxy's server list to be "healthy" based on Serf. + // Reconcile this with origServers, which is shuffled and has a live + // connection, but possibly out of date. + origServers := p.getServerList() + p.saveServerList(serverList{L: healthyServers}) + + // This should always succeed with non-zero server lists + if !selectedServerFailed && !p.reconcileServerList(&origServers) && + len(p.getServerList().L) != 0 && + len(origServers.L) != 0 { + // If the random gods are unfavorable and we end up with zero + // length lists, expect things to fail and retry the test. 
+ return false, fmt.Errorf("Expected reconcile to succeed: %v %d %d", + selectedServerFailed, + len(p.getServerList().L), + len(origServers.L)) + } + + // If we have zero-length server lists, test succeeded in degenerate + // case. + if len(p.getServerList().L) == 0 && + len(origServers.L) == 0 { + // Failed as expected w/ zero length list + return true, nil + } + + resultingServerMap := make(map[EndpointKey]bool) + for _, s := range p.getServerList().L { + resultingServerMap[*s.Key()] = true + } + + // Test to make sure no failed servers are in the RpcProxy's + // list. Error if there are any failedServers in l.servers + for _, s := range failedServers { + _, ok := resultingServerMap[*s.Key()] + if ok { + return false, fmt.Errorf("Found failed server %v in merged list %v", s, resultingServerMap) + } + } + + // Test to make sure all healthy servers are in the healthy list. + if len(healthyServers) != len(p.getServerList().L) { + return false, fmt.Errorf("Expected healthy map and servers to match: %d/%d", len(healthyServers), len(healthyServers)) + } + + // Test to make sure all healthy servers are in the resultingServerMap list. + for _, s := range healthyServers { + _, ok := resultingServerMap[*s.Key()] + if !ok { + return false, fmt.Errorf("Server %v missing from healthy map after merged lists", s) + } + } + return true, nil +} + +// func (l *serverList) refreshServerRebalanceTimer() { +func TestRpcProxyInternal_refreshServerRebalanceTimer(t *testing.T) { + type clusterSizes struct { + numNodes int + numServers int + minRebalance time.Duration + } + clusters := []clusterSizes{ + {0, 3, 10 * time.Minute}, + {1, 0, 10 * time.Minute}, // partitioned cluster + {1, 3, 10 * time.Minute}, + {2, 3, 10 * time.Minute}, + {100, 0, 10 * time.Minute}, // partitioned + {100, 1, 10 * time.Minute}, // partitioned + {100, 3, 10 * time.Minute}, + {1024, 1, 10 * time.Minute}, // partitioned + {1024, 3, 10 * time.Minute}, // partitioned + {1024, 5, 10 * time.Minute}, + {16384, 1, 10 * time.Minute}, // partitioned + {16384, 2, 10 * time.Minute}, // partitioned + {16384, 3, 10 * time.Minute}, // partitioned + {16384, 5, 10 * time.Minute}, + {65535, 0, 10 * time.Minute}, // partitioned + {65535, 1, 10 * time.Minute}, // partitioned + {65535, 2, 10 * time.Minute}, // partitioned + {65535, 3, 10 * time.Minute}, // partitioned + {65535, 5, 10 * time.Minute}, // partitioned + {65535, 7, 10 * time.Minute}, + {1000000, 1, 10 * time.Minute}, // partitioned + {1000000, 2, 10 * time.Minute}, // partitioned + {1000000, 3, 10 * time.Minute}, // partitioned + {1000000, 5, 10 * time.Minute}, // partitioned + {1000000, 11, 10 * time.Minute}, // partitioned + {1000000, 19, 10 * time.Minute}, + } + + logger := log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + + for i, s := range clusters { + p := New(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) + for i := 0; i < s.numServers; i++ { + nodeName := fmt.Sprintf("s%02d", i) + p.activateEndpoint(&ServerEndpoint{Name: nodeName}) + } + + d := p.refreshServerRebalanceTimer() + if d < s.minRebalance { + t.Errorf("[%d] duration too short for cluster of size %d and %d servers (%s < %s)", i, s.numNodes, s.numServers, d, s.minRebalance) + } + } +} + +// func (p *RpcProxy) saveServerList(l serverList) { +func TestRpcProxyInternal_saveServerList(t *testing.T) { + p := testRpcProxy() + + // Initial condition + func() { + l := p.getServerList() + if len(l.L) != 0 { + t.Fatalf("RpcProxy.saveServerList failed to load init config") + } + + 
newServer := new(ServerEndpoint) + l.L = append(l.L, newServer) + p.saveServerList(l) + }() + + // Test that save works + func() { + l1 := p.getServerList() + t1NumServers := len(l1.L) + if t1NumServers != 1 { + t.Fatalf("RpcProxy.saveServerList failed to save mutated config") + } + }() + + // Verify mutation w/o a save doesn't alter the original + func() { + newServer := new(ServerEndpoint) + l := p.getServerList() + l.L = append(l.L, newServer) + + l_orig := p.getServerList() + origNumServers := len(l_orig.L) + if origNumServers >= len(l.L) { + t.Fatalf("RpcProxy.saveServerList unsaved config overwrote original") + } + }() +} From 68f7afcd23e5295398662d4daec086d5e1c0e267 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 01:16:56 -0700 Subject: [PATCH 108/166] Collapse server_endpoint_internal_test.go into server_endpoint_test.go Requested by: @dadgar --- .../rpcproxy/server_endpoint_internal_test.go | 38 ------------------- client/rpcproxy/server_endpoint_test.go | 26 ++++++------- 2 files changed, 12 insertions(+), 52 deletions(-) delete mode 100644 client/rpcproxy/server_endpoint_internal_test.go diff --git a/client/rpcproxy/server_endpoint_internal_test.go b/client/rpcproxy/server_endpoint_internal_test.go deleted file mode 100644 index a4e1e9f16da..00000000000 --- a/client/rpcproxy/server_endpoint_internal_test.go +++ /dev/null @@ -1,38 +0,0 @@ -package rpcproxy - -import ( - "testing" -) - -// func (k *EndpointKey) Equal(x *EndpointKey) { -func TestServerEndpointKey_Equal(t *testing.T) { - tests := []struct { - name string - k1 *EndpointKey - k2 *EndpointKey - equal bool - }{ - { - name: "equal", - k1: &EndpointKey{name: "k1"}, - k2: &EndpointKey{name: "k1"}, - equal: true, - }, - { - name: "not equal", - k1: &EndpointKey{name: "k1"}, - k2: &EndpointKey{name: "k2"}, - equal: false, - }, - } - - for _, test := range tests { - if test.k1.Equal(test.k2) != test.equal { - t.Errorf("fixture %s failed forward comparison", test.name) - } - - if test.k2.Equal(test.k1) != test.equal { - t.Errorf("fixture %s failed reverse comparison", test.name) - } - } -} diff --git a/client/rpcproxy/server_endpoint_test.go b/client/rpcproxy/server_endpoint_test.go index 8b964313ef0..f04494859d3 100644 --- a/client/rpcproxy/server_endpoint_test.go +++ b/client/rpcproxy/server_endpoint_test.go @@ -1,31 +1,29 @@ -package rpcproxy_test +package rpcproxy import ( "fmt" "net" "testing" - - "github.com/hashicorp/nomad/client/rpcproxy" ) -// func (k *rpcproxy.EndpointKey) Equal(x *rpcproxy.EndpointKey) { +// func (k *EndpointKey) Equal(x *EndpointKey) { func TestServerEndpointKey_Equal(t *testing.T) { tests := []struct { name string - s1 *rpcproxy.ServerEndpoint - s2 *rpcproxy.ServerEndpoint + s1 *ServerEndpoint + s2 *ServerEndpoint equal bool }{ { name: "equal", - s1: &rpcproxy.ServerEndpoint{Name: "k1"}, - s2: &rpcproxy.ServerEndpoint{Name: "k1"}, + s1: &ServerEndpoint{Name: "k1"}, + s2: &ServerEndpoint{Name: "k1"}, equal: true, }, { name: "not equal", - s1: &rpcproxy.ServerEndpoint{Name: "k1"}, - s2: &rpcproxy.ServerEndpoint{Name: "k2"}, + s1: &ServerEndpoint{Name: "k1"}, + s2: &ServerEndpoint{Name: "k2"}, equal: false, }, } @@ -41,21 +39,21 @@ func TestServerEndpointKey_Equal(t *testing.T) { } } -// func (k *rpcproxy.ServerEndpoint) String() { +// func (k *ServerEndpoint) String() { func TestServerEndpoint_String(t *testing.T) { tests := []struct { name string - s *rpcproxy.ServerEndpoint + s *ServerEndpoint str string }{ { name: "name", - s: &rpcproxy.ServerEndpoint{Name: "s"}, + s: 
&ServerEndpoint{Name: "s"}, str: "s (:)", }, { name: "name, host, port", - s: &rpcproxy.ServerEndpoint{ + s: &ServerEndpoint{ Name: "s", Host: "127.0.0.1", Port: "4647", From 4e03dc5fbcf373865614c53690ddc821f21bdc11 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 01:19:20 -0700 Subject: [PATCH 109/166] Remove named return parameters --- client/rpcproxy/server_endpoint.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/client/rpcproxy/server_endpoint.go b/client/rpcproxy/server_endpoint.go index f9fef0c0275..07cc3adf342 100644 --- a/client/rpcproxy/server_endpoint.go +++ b/client/rpcproxy/server_endpoint.go @@ -45,13 +45,14 @@ func (s *ServerEndpoint) Key() *EndpointKey { // name, it must be resolvable to an IP address (most inputs are IP // addresses, not DNS names, but both work equally well when the name is // resolvable). -func newServer(name string) (s *ServerEndpoint, err error) { - s = &ServerEndpoint{ +func newServer(name string) (*ServerEndpoint, error) { + s := &ServerEndpoint{ Name: name, } var ( host, port string + err error ) host, port, err = net.SplitHostPort(name) if err == nil { From 1352f7f0e67dd76a190b07864fa23de097152070 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 2 Jun 2016 09:15:30 -0700 Subject: [PATCH 110/166] Change client/consul.NewSyncer() to accept a shutdown channel In addition to the API changing, consul.Syncer can now be signaled to shutdown via the Shutdown() method, which will call the Run()'ing sync task to exit gracefully. --- client/client.go | 7 +++++-- client/client_test.go | 6 ++++-- client/consul/sync.go | 22 +++++++++++++++++++--- client/consul/sync_test.go | 6 ++++-- client/driver/executor/executor.go | 7 ++++++- command/agent/agent.go | 2 +- 6 files changed, 39 insertions(+), 11 deletions(-) diff --git a/client/client.go b/client/client.go index 59211d9fac1..2b183b9f40f 100644 --- a/client/client.go +++ b/client/client.go @@ -1235,8 +1235,11 @@ func (c *Client) addAlloc(alloc *structs.Allocation) error { return nil } -// setupConsulSyncer creates Client-mode consul.Syncer callbacks that are -// executed periodically. +// setupConsulSyncer creates Client-mode consul.Syncer which periodically +// executes callbacks on a fixed interval. +// +// TODO(sean@): this could eventually be moved to a priority queue and give +// each task an interval, but that is not necessary at this time. func (c *Client) setupConsulSyncer() error { // The bootstrapFn callback handler is used to periodically poll // Consul to look up the Nomad Servers in Consul. 
In the event the diff --git a/client/client_test.go b/client/client_test.go index 5ffae2b3e33..207b7e58507 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -75,7 +75,8 @@ func testClient(t *testing.T, cb func(c *config.Config)) *Client { cb(conf) } - consulSyncer, err := consul.NewSyncer(conf.ConsulConfig, log.New(os.Stderr, "", log.LstdFlags)) + shutdownCh := make(chan struct{}) + consulSyncer, err := consul.NewSyncer(conf.ConsulConfig, shutdownCh, log.New(os.Stderr, "", log.LstdFlags)) if err != nil { t.Fatalf("err: %v", err) } @@ -467,7 +468,8 @@ func TestClient_SaveRestoreState(t *testing.T) { } // Create a new client - consulSyncer, err := consul.NewSyncer(c1.config.ConsulConfig, log.New(os.Stderr, "", log.LstdFlags)) + shutdownCh := make(chan struct{}) + consulSyncer, err := consul.NewSyncer(c1.config.ConsulConfig, shutdownCh, log.New(os.Stderr, "", log.LstdFlags)) if err != nil { t.Fatalf("err: %v", err) } diff --git a/client/consul/sync.go b/client/consul/sync.go index 2888f23e868..2d89317c6c6 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -69,6 +69,12 @@ type Syncer struct { shutdown bool shutdownLock sync.Mutex + // notifyShutdownCh is used to notify a Syncer it needs to shutdown. + // This can happen because there was an explicit call to the Syncer's + // Shutdown() method, or because the calling task signaled the + // program is going to exit by closing its shutdownCh. + notifyShutdownCh chan struct{} + // periodicCallbacks is walked sequentially when the timer in Run // fires. periodicCallbacks map[string]types.PeriodicCallback @@ -77,7 +83,7 @@ type Syncer struct { } // NewSyncer returns a new consul.Syncer -func NewSyncer(config *config.ConsulConfig, logger *log.Logger) (*Syncer, error) { +func NewSyncer(config *config.ConsulConfig, shutdownCh chan struct{}, logger *log.Logger) (*Syncer, error) { var err error var c *consul.Client cfg := consul.DefaultConfig() @@ -132,10 +138,10 @@ func NewSyncer(config *config.ConsulConfig, logger *log.Logger) (*Syncer, error) consulSyncer := Syncer{ client: c, logger: logger, + shutdownCh: shutdownCh, trackedServices: make(map[string]*consul.AgentService), trackedChecks: make(map[string]*consul.AgentCheckRegistration), checkRunners: make(map[string]*CheckRunner), - shutdownCh: make(chan struct{}), periodicCallbacks: make(map[string]types.PeriodicCallback), } return &consulSyncer, nil @@ -242,17 +248,25 @@ func (c *Syncer) SyncServices(services []*structs.Service) error { return mErr.ErrorOrNil() } +func (c *Syncer) signalShutdown() { + select { + case c.notifyShutdownCh <- struct{}{}: + default: + } +} + // Shutdown de-registers the services and checks and shuts down periodic syncing func (c *Syncer) Shutdown() error { var mErr multierror.Error c.shutdownLock.Lock() if !c.shutdown { - close(c.shutdownCh) c.shutdown = true } c.shutdownLock.Unlock() + c.signalShutdown() + // Stop all the checks that nomad is running for _, cr := range c.checkRunners { cr.Stop() @@ -401,6 +415,8 @@ func (c *Syncer) Run() { case <-c.notifySyncCh: sync.Reset(syncInterval) case <-c.shutdownCh: + c.Shutdown() + case <-c.notifyShutdownCh: sync.Stop() c.logger.Printf("[INFO] consul.sync: shutting down sync for %q", c.serviceIdentifier) return diff --git a/client/consul/sync_test.go b/client/consul/sync_test.go index 211b0181826..5eec54bcde6 100644 --- a/client/consul/sync_test.go +++ b/client/consul/sync_test.go @@ -42,7 +42,8 @@ var ( ) func TestConsulServiceRegisterServices(t *testing.T) { - cs, err := 
NewSyncer(&config.ConsulConfig{}, logger) + shutdownCh := make(chan struct{}) + cs, err := NewSyncer(&config.ConsulConfig{}, shutdownCh, logger) if err != nil { t.Fatalf("Err: %v", err) } @@ -69,7 +70,8 @@ func TestConsulServiceRegisterServices(t *testing.T) { } func TestConsulServiceUpdateService(t *testing.T) { - cs, err := NewSyncer(&config.ConsulConfig{}, logger) + shutdownCh := make(chan struct{}) + cs, err := NewSyncer(&config.ConsulConfig{}, shutdownCh, logger) if err != nil { t.Fatalf("Err: %v", err) } diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index c6018f484e4..369dfce0b58 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -183,6 +183,8 @@ type UniversalExecutor struct { lro *logging.FileRotator rotatorLock sync.Mutex + shutdownCh chan struct{} + syslogServer *logging.SyslogServer syslogChan chan *logging.SyslogMessage @@ -203,6 +205,7 @@ func NewExecutor(logger *log.Logger) Executor { exec := &UniversalExecutor{ logger: logger, processExited: make(chan interface{}), + shutdownCh: make(chan struct{}), totalCpuStats: stats.NewCpuStats(), userCpuStats: stats.NewCpuStats(), systemCpuStats: stats.NewCpuStats(), @@ -412,6 +415,8 @@ func (e *UniversalExecutor) Exit() error { e.lre.Close() e.lro.Close() + e.consulSyncer.Shutdown() + // If the executor did not launch a process, return. if e.command == nil { return nil @@ -472,7 +477,7 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { e.logger.Printf("[INFO] executor: registering services") e.consulCtx = ctx if e.consulSyncer == nil { - cs, err := consul.NewSyncer(ctx.ConsulConfig, e.logger) + cs, err := consul.NewSyncer(ctx.ConsulConfig, e.shutdownCh, e.logger) if err != nil { return err } diff --git a/command/agent/agent.go b/command/agent/agent.go index b52587e8b97..142aefeba2c 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -548,7 +548,7 @@ func (a *Agent) Stats() map[string]map[string]string { // setupConsulSyncer creates the Consul task used by this Nomad Agent when // running in either Client and Server mode. 
func (a *Agent) setupConsulSyncer(shutdownCh chan struct{}) (err error) { - a.consulSyncer, err = consul.NewSyncer(a.config.Consul, a.logger) + a.consulSyncer, err = consul.NewSyncer(a.config.Consul, shutdownCh, a.logger) return nil } From 107fc1bb81dc4a8d8d50cd4a01f04698955d1920 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 7 Jun 2016 07:01:13 -0700 Subject: [PATCH 111/166] Rename createCheck() to createDelegatedCheck() for clarity --- client/consul/sync.go | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index 2d89317c6c6..3a8ffa1c145 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -55,13 +55,13 @@ type Syncer struct { runChecks bool serviceIdentifier string // serviceIdentifier is a token which identifies which task/alloc the service belongs to - delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul - createCheck func(*structs.ServiceCheck, string) (Check, error) addrFinder func(portLabel string) (string, int) trackedServices map[string]*consul.AgentService trackedChecks map[string]*consul.AgentCheckRegistration checkRunners map[string]*CheckRunner + delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul + createDelegatedCheck func(*structs.ServiceCheck, string) (Check, error) logger *log.Logger @@ -149,9 +149,9 @@ func NewSyncer(config *config.ConsulConfig, shutdownCh chan struct{}, logger *lo // SetDelegatedChecks sets the checks that nomad is going to run and report the // result back to consul -func (c *Syncer) SetDelegatedChecks(delegateChecks map[string]struct{}, createCheck func(*structs.ServiceCheck, string) (Check, error)) *Syncer { +func (c *Syncer) SetDelegatedChecks(delegateChecks map[string]struct{}, createDelegatedCheckFn func(*structs.ServiceCheck, string) (Check, error)) *Syncer { c.delegateChecks = delegateChecks - c.createCheck = createCheck + c.createDelegatedCheck = createDelegatedCheckFn return c } @@ -200,14 +200,14 @@ func (c *Syncer) SyncServices(services []*structs.Service) error { for _, chk := range service.Checks { // Create a consul check registration - chkReg, err := c.createCheckReg(chk, srv) + chkReg, err := c.createDelegatedCheckReg(chk, srv) if err != nil { mErr.Errors = append(mErr.Errors, err) continue } // creating a nomad check if we have to handle this particular check type if _, ok := c.delegateChecks[chk.Type]; ok { - nc, err := c.createCheck(chk, chkReg.ID) + nc, err := c.createDelegatedCheck(chk, chkReg.ID) if err != nil { mErr.Errors = append(mErr.Errors, err) continue @@ -312,9 +312,10 @@ func (c *Syncer) registerCheck(chkReg *consul.AgentCheckRegistration) error { return c.client.Agent().CheckRegister(chkReg) } -// createCheckReg creates a Check that can be registered with Nomad. It also -// creates a Nomad check for the check types that it can handle. -func (c *Syncer) createCheckReg(check *structs.ServiceCheck, service *consul.AgentService) (*consul.AgentCheckRegistration, error) { +// createDelegatedCheckReg creates a Check that can be registered with +// Nomad. It also creates a Nomad check for the check types that it can +// handle. 
+func (c *Syncer) createDelegatedCheckReg(check *structs.ServiceCheck, service *consul.AgentService) (*consul.AgentCheckRegistration, error) { chkReg := consul.AgentCheckRegistration{ ID: check.Hash(service.ID), Name: check.Name,

From acb3d58b828c0efc7b6ebc74c2a9c37c078910f9 Mon Sep 17 00:00:00 2001
From: Sean Chittenden
Date: Tue, 7 Jun 2016 09:55:35 -0500
Subject: [PATCH 112/166] Refine Nomad's Consul `port` handling.

Previously this would immediately default to '127.0.0.1' if the config was
set to `:some-port-number`. Now it uses the BindAddr if available. Also, if
the `port` option is set to just a port number (e.g. '1234'), attempt to
parse the port number by itself to allow statically configured ports to
work, even when no host is specified.
---
 command/agent/agent.go | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/command/agent/agent.go b/command/agent/agent.go
index 142aefeba2c..8582452aabc 100644
--- a/command/agent/agent.go
+++ b/command/agent/agent.go
@@ -607,17 +607,31 @@ func (a *Agent) syncAgentServicesWithConsul() error { a.consulSyncer.SetAddrFinder(func(portLabel string) (string, int) { host, port, err := net.SplitHostPort(portLabel) if err != nil { - return "", 0 + p, err := strconv.Atoi(port) + if err != nil { + return "", 0 + } + return "", p } - // if the addr for the service is ":port", then we default to - // registering the service with ip as the loopback addr + // If the addr for the service is ":port", then we fall back + // to Nomad's default address resolution protocol. + // + // TODO(sean@): This should poll Consul to figure out what + // its advertise address is and use that in order to handle + // the case where there is something funky like NAT on this + // host. For now we just use the BindAddr if set, otherwise + // we fall back to a loopback addr.
if host == "" { - host = "127.0.0.1" + if a.config.BindAddr != "" { + host = a.configBindAddr + } else { + host = "127.0.0.1" + } } p, err := strconv.Atoi(port) if err != nil { - return "", 0 + return host, 0 } return host, p }) From a2081159b454e5e1d58da6a885923d5893f86e33 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 7 Jun 2016 10:54:03 -0500 Subject: [PATCH 113/166] Rename structs.Services to structs.ConsulServices --- client/consul/sync.go | 2 +- client/consul/sync_test.go | 6 +++--- jobspec/parse.go | 6 +++--- jobspec/parse_test.go | 2 +- nomad/mock/mock.go | 2 +- nomad/structs/diff.go | 12 ++++++------ nomad/structs/diff_test.go | 16 ++++++++-------- nomad/structs/structs.go | 21 +++++++++++---------- nomad/structs/structs_test.go | 24 ++++++++++++------------ scheduler/util_test.go | 2 +- 10 files changed, 47 insertions(+), 46 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index 3a8ffa1c145..933e994a759 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -345,11 +345,11 @@ func (c *Syncer) createDelegatedCheckReg(check *structs.ServiceCheck, service *c } // createService creates a Consul AgentService from a Nomad Service -func (c *Syncer) createService(service *structs.Service) (*consul.AgentService, error) { srv := consul.AgentService{ ID: service.ID(c.serviceIdentifier), Service: service.Name, Tags: service.Tags, +func (c *Syncer) createService(service *structs.ConsulService) (*consul.AgentServiceRegistration, error) { } host, port := c.addrFinder(service.PortLabel) if host != "" { diff --git a/client/consul/sync_test.go b/client/consul/sync_test.go index 5eec54bcde6..64aaf3fd0fb 100644 --- a/client/consul/sync_test.go +++ b/client/consul/sync_test.go @@ -25,7 +25,7 @@ var ( Interval: 30 * time.Second, Timeout: 5 * time.Second, } - service1 = structs.Service{ + service1 = structs.ConsulService{ Name: "foo-1", Tags: []string{"tag1", "tag2"}, PortLabel: "port1", @@ -34,7 +34,7 @@ var ( }, } - service2 = structs.Service{ + service2 = structs.ConsulService{ Name: "foo-2", Tags: []string{"tag1", "tag2"}, PortLabel: "port2", @@ -148,7 +148,7 @@ func checksPresent(t *testing.T, checkIDs []string, syncer *Syncer) error { func mockTask() *structs.Task { task := structs.Task{ Name: "foo", - Services: []*structs.Service{&service1, &service2}, + Services: []*structs.ConsulService{&service1, &service2}, Resources: &structs.Resources{ Networks: []*structs.NetworkResource{ &structs.NetworkResource{ diff --git a/jobspec/parse.go b/jobspec/parse.go index 028f4baae2b..c951f270fbb 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -701,7 +701,7 @@ func parseArtifactOption(result map[string]string, list *ast.ObjectList) error { } func parseServices(jobName string, taskGroupName string, task *structs.Task, serviceObjs *ast.ObjectList) error { - task.Services = make([]*structs.Service, len(serviceObjs.Items)) + task.Services = make([]*structs.ConsulService, len(serviceObjs.Items)) var defaultServiceName bool for idx, o := range serviceObjs.Items { // Check for invalid keys @@ -715,7 +715,7 @@ func parseServices(jobName string, taskGroupName string, task *structs.Task, ser return multierror.Prefix(err, fmt.Sprintf("service (%d) ->", idx)) } - var service structs.Service + var service structs.ConsulService var m map[string]interface{} if err := hcl.DecodeObject(&m, o.Val); err != nil { return err @@ -756,7 +756,7 @@ func parseServices(jobName string, taskGroupName string, task *structs.Task, ser return nil } -func parseChecks(service 
*structs.Service, checkObjs *ast.ObjectList) error { +func parseChecks(service *structs.ConsulService, checkObjs *ast.ObjectList) error { service.Checks = make([]*structs.ServiceCheck, len(checkObjs.Items)) for idx, co := range checkObjs.Items { // Check for invalid keys diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 10374146eef..35518dc9e0a 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -97,7 +97,7 @@ func TestParse(t *testing.T) { }, }, }, - Services: []*structs.Service{ + Services: []*structs.ConsulService{ { Name: "binstore-storagelocker-binsl-binstore", Tags: []string{"foo", "bar"}, diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 9e920864150..7e5a5fedcbf 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -94,7 +94,7 @@ func Job() *structs.Job { Env: map[string]string{ "FOO": "bar", }, - Services: []*structs.Service{ + Services: []*structs.ConsulService{ { Name: "${TASK}-frontend", PortLabel: "http", diff --git a/nomad/structs/diff.go b/nomad/structs/diff.go index eabc3ec4b52..812b41adf78 100644 --- a/nomad/structs/diff.go +++ b/nomad/structs/diff.go @@ -458,18 +458,18 @@ func (t TaskDiffs) Less(i, j int) bool { return t[i].Name < t[j].Name } // serviceDiff returns the diff of two service objects. If contextual diff is // enabled, all fields will be returned, even if no diff occurred. -func serviceDiff(old, new *Service, contextual bool) *ObjectDiff { +func serviceDiff(old, new *ConsulService, contextual bool) *ObjectDiff { diff := &ObjectDiff{Type: DiffTypeNone, Name: "Service"} var oldPrimitiveFlat, newPrimitiveFlat map[string]string if reflect.DeepEqual(old, new) { return nil } else if old == nil { - old = &Service{} + old = &ConsulService{} diff.Type = DiffTypeAdded newPrimitiveFlat = flatmap.Flatten(new, nil, true) } else if new == nil { - new = &Service{} + new = &ConsulService{} diff.Type = DiffTypeDeleted oldPrimitiveFlat = flatmap.Flatten(old, nil, true) } else { @@ -491,9 +491,9 @@ func serviceDiff(old, new *Service, contextual bool) *ObjectDiff { // serviceDiffs diffs a set of services. If contextual diff is enabled, unchanged // fields within objects nested in the tasks will be returned. 
-func serviceDiffs(old, new []*Service, contextual bool) []*ObjectDiff { - oldMap := make(map[string]*Service, len(old)) - newMap := make(map[string]*Service, len(new)) +func serviceDiffs(old, new []*ConsulService, contextual bool) []*ObjectDiff { + oldMap := make(map[string]*ConsulService, len(old)) + newMap := make(map[string]*ConsulService, len(new)) for _, o := range old { oldMap[o.Name] = o } diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index a851d891b37..5808cebc9b8 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -2363,7 +2363,7 @@ func TestTaskDiff(t *testing.T) { { // Services edited (no checks) Old: &Task{ - Services: []*Service{ + Services: []*ConsulService{ { Name: "foo", PortLabel: "foo", @@ -2379,7 +2379,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*Service{ + Services: []*ConsulService{ { Name: "bar", PortLabel: "bar", @@ -2452,7 +2452,7 @@ func TestTaskDiff(t *testing.T) { // Services edited (no checks) with context Contextual: true, Old: &Task{ - Services: []*Service{ + Services: []*ConsulService{ { Name: "foo", PortLabel: "foo", @@ -2460,7 +2460,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*Service{ + Services: []*ConsulService{ { Name: "foo", PortLabel: "bar", @@ -2494,7 +2494,7 @@ func TestTaskDiff(t *testing.T) { { // Service Checks edited Old: &Task{ - Services: []*Service{ + Services: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ @@ -2533,7 +2533,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*Service{ + Services: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ @@ -2695,7 +2695,7 @@ func TestTaskDiff(t *testing.T) { // Service Checks edited with context Contextual: true, Old: &Task{ - Services: []*Service{ + Services: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ @@ -2714,7 +2714,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*Service{ + Services: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 08185de42d0..aa74d06ac09 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1552,19 +1552,20 @@ var ( AgentServicePrefix = fmt.Sprintf("%s-%s", NomadConsulPrefix, "agent") ) -// The Service model represents a Consul service definition -type Service struct { Name string // Name of the service, defaults to id Tags []string // List of tags for the service +// The ConsulService model represents a Consul service definition in Nomad +// Agent's Config. +type ConsulService struct { PortLabel string `mapstructure:"port"` // port for the service Checks []*ServiceCheck // List of checks associated with the service } -func (s *Service) Copy() *Service { +func (s *ConsulService) Copy() *ConsulService { if s == nil { return nil } - ns := new(Service) + ns := new(ConsulService) *ns = *s ns.Tags = CopySliceString(ns.Tags) @@ -1581,7 +1582,7 @@ func (s *Service) Copy() *Service { // InitFields interpolates values of Job, Task Group and Task in the Service // Name. This also generates check names, service id and check ids. 
-func (s *Service) InitFields(job string, taskGroup string, task string) { +func (s *ConsulService) InitFields(job string, taskGroup string, task string) { s.Name = args.ReplaceEnv(s.Name, map[string]string{ "JOB": job, "TASKGROUP": taskGroup, @@ -1597,12 +1598,12 @@ func (s *Service) InitFields(job string, taskGroup string, task string) { } } -func (s *Service) ID(identifier string) string { +func (s *ConsulService) ID(identifier string) string { return fmt.Sprintf("%s-%s-%s", NomadConsulPrefix, identifier, s.Hash()) } // Validate checks if the Check definition is valid -func (s *Service) Validate() error { +func (s *ConsulService) Validate() error { var mErr multierror.Error // Ensure the service name is valid per RFC-952 §1 @@ -1628,7 +1629,7 @@ func (s *Service) Validate() error { // Hash calculates the hash of the check based on it's content and the service // which owns it -func (s *Service) Hash() string { +func (s *ConsulService) Hash() string { h := sha1.New() io.WriteString(h, s.Name) io.WriteString(h, strings.Join(s.Tags, "")) @@ -1687,7 +1688,7 @@ type Task struct { Env map[string]string // List of service definitions exposed by the Task - Services []*Service + Services []*ConsulService // Constraints can be specified at a task level and apply only to // the particular task. @@ -1721,7 +1722,7 @@ func (t *Task) Copy() *Task { nt.Env = CopyMapStringString(nt.Env) if t.Services != nil { - services := make([]*Service, len(nt.Services)) + services := make([]*ConsulService, len(nt.Services)) for i, s := range nt.Services { services[i] = s.Copy() } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index e60ceadcfa7..f3b50554488 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -136,7 +136,7 @@ func testJob() *Job { GetterSource: "http://foo.com", }, }, - Services: []*Service{ + Services: []*ConsulService{ { Name: "${TASK}-frontend", PortLabel: "http", @@ -283,7 +283,7 @@ func TestTask_Validate(t *testing.T) { } func TestTask_Validate_Services(t *testing.T) { - s1 := &Service{ + s1 := &ConsulService{ Name: "service-name", PortLabel: "bar", Checks: []*ServiceCheck{ @@ -298,7 +298,7 @@ func TestTask_Validate_Services(t *testing.T) { }, } - s2 := &Service{ + s2 := &ConsulService{ Name: "service-name", } @@ -311,7 +311,7 @@ func TestTask_Validate_Services(t *testing.T) { MemoryMB: 100, IOPS: 10, }, - Services: []*Service{s1, s2}, + Services: []*ConsulService{s1, s2}, } err := task.Validate() if err == nil { @@ -568,7 +568,7 @@ func BenchmarkEncodeDecode(b *testing.B) { } func TestInvalidServiceCheck(t *testing.T) { - s := Service{ + s := ConsulService{ Name: "service-name", PortLabel: "bar", Checks: []*ServiceCheck{ @@ -582,7 +582,7 @@ func TestInvalidServiceCheck(t *testing.T) { t.Fatalf("Service should be invalid (invalid type)") } - s = Service{ + s = ConsulService{ Name: "service.name", PortLabel: "bar", } @@ -590,7 +590,7 @@ func TestInvalidServiceCheck(t *testing.T) { t.Fatalf("Service should be invalid (contains a dot): %v", err) } - s = Service{ + s = ConsulService{ Name: "-my-service", PortLabel: "bar", } @@ -598,7 +598,7 @@ func TestInvalidServiceCheck(t *testing.T) { t.Fatalf("Service should be invalid (begins with a hyphen): %v", err) } - s = Service{ + s = ConsulService{ Name: "abcdef0123456789-abcdef0123456789-abcdef0123456789-abcdef0123456", PortLabel: "bar", } @@ -606,7 +606,7 @@ func TestInvalidServiceCheck(t *testing.T) { t.Fatalf("Service should be invalid (too long): %v", err) } - s = Service{ + s = 
ConsulService{ Name: "service-name", Checks: []*ServiceCheck{ { @@ -628,7 +628,7 @@ func TestInvalidServiceCheck(t *testing.T) { t.Fatalf("service should be invalid (tcp/http checks with no port): %v", err) } - s = Service{ + s = ConsulService{ Name: "service-name", Checks: []*ServiceCheck{ { @@ -684,7 +684,7 @@ func TestService_InitFields(t *testing.T) { taskGroup := "cache" task := "redis" - s := Service{ + s := ConsulService{ Name: "${TASK}-db", } @@ -722,7 +722,7 @@ func TestJob_ExpandServiceNames(t *testing.T) { Tasks: []*Task{ { Name: "frontend", - Services: []*Service{ + Services: []*ConsulService{ { Name: "${BASE}-default", }, diff --git a/scheduler/util_test.go b/scheduler/util_test.go index fc2a1f13324..da9e8706de0 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -679,7 +679,7 @@ func TestInplaceUpdate_Success(t *testing.T) { *tg = *job.TaskGroups[0] resource := &structs.Resources{CPU: 737} tg.Tasks[0].Resources = resource - newServices := []*structs.Service{ + newServices := []*structs.ConsulService{ { Name: "dummy-service", PortLabel: "http", From cf8beb7ba9381fea7890aaff15c0eadd46f07e2b Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Tue, 7 Jun 2016 10:59:17 -0500 Subject: [PATCH 114/166] Change the signature of the PeriodicCallback to return an error I *KNEW* I should have done this when I wrote it, but didn't want to go back and audit the handlers to include the appropriate return handling, but now that the code is taking shape, make this change. --- client/client.go | 19 +++++++++++++------ client/consul/sync.go | 39 +++++++++++++-------------------------- nomad/types/types.go | 2 +- 3 files changed, 27 insertions(+), 33 deletions(-) diff --git a/client/client.go b/client/client.go index 2b183b9f40f..a99faebe7a0 100644 --- a/client/client.go +++ b/client/client.go @@ -1246,12 +1246,12 @@ func (c *Client) setupConsulSyncer() error { // heartbeat deadline has been exceeded and this Client is orphaned // from its servers, periodically poll Consul to reattach this Client // to its cluster and automatically recover from a detached state. - bootstrapFn := func() { + bootstrapFn := func() error { now := time.Now() c.configLock.RLock() if now.Before(c.consulPullHeartbeatDeadline) { c.configLock.RUnlock() - return + return nil } c.configLock.RUnlock() @@ -1261,7 +1261,7 @@ func (c *Client) setupConsulSyncer() error { &consulapi.QueryOptions{AllowStale: true}) if err != nil { c.logger.Printf("[WARN] client: unable to query service %q: %v", nomadServerServiceName, err) - return + return err } serverAddrs := make([]string, 0, len(services)) for _, s := range services { @@ -1272,18 +1272,23 @@ func (c *Client) setupConsulSyncer() error { } serverAddrs = append(serverAddrs, net.JoinHostPort(addr, port)) } - c.rpcProxy.SetBackupServers(serverAddrs) + + if err := c.rpcProxy.SetBackupServers(serverAddrs); err != nil { + return err + } + + return nil } c.consulSyncer.AddPeriodicHandler("Nomad Client Fallback Server Handler", bootstrapFn) - consulServicesSyncFn := func() { + consulServicesSyncFn := func() error { // Give up pruning services if we can't fingerprint our // Consul Agent. 
c.configLock.RLock() _, ok := c.configCopy.Node.Attributes["consul.version"] c.configLock.RUnlock() if !ok { - return + return fmt.Errorf("Consul not running") } services := make(map[string]struct{})
@@ -1305,7 +1310,9 @@ func (c *Client) setupConsulSyncer() error { if err := c.consulSyncer.KeepServices(services); err != nil { c.logger.Printf("[DEBUG] client: error removing services from non-running tasks: %v", err) + return err } + return nil } c.consulSyncer.AddPeriodicHandler("Nomad Client Services Sync Handler", consulServicesSyncFn)
diff --git a/client/consul/sync.go b/client/consul/sync.go
index 933e994a759..a58ade10097 100644
--- a/client/consul/sync.go
+++ b/client/consul/sync.go
@@ -426,47 +426,34 @@ func (c *Syncer) Run() { } // RunHandlers executes each handler (randomly) -func (c *Syncer) RunHandlers() { +func (c *Syncer) RunHandlers() error { c.periodicLock.RLock() handlers := make(map[string]types.PeriodicCallback, len(c.periodicCallbacks)) for name, fn := range c.periodicCallbacks { handlers[name] = fn } c.periodicLock.RUnlock() + + var mErr multierror.Error for _, fn := range handlers { - fn() + if err := fn(); err != nil { + mErr.Errors = append(mErr.Errors, err) + } } + return mErr.ErrorOrNil() } // performSync sync the services and checks we are tracking with Consul. func (c *Syncer) performSync() error { - c.RunHandlers() - var mErr multierror.Error - cServices, err := c.client.Agent().Services() - if err != nil { - return err - } - - cChecks, err := c.client.Agent().Checks() - if err != nil { - return err + if err := c.RunHandlers(); err != nil { + mErr.Errors = append(mErr.Errors, err) } - - // Add services and checks that consul doesn't have but we do - for serviceID, service := range c.trackedServices { - if _, ok := cServices[serviceID]; !ok { - if err := c.registerService(service); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } + if err := c.syncServices(); err != nil { + mErr.Errors = append(mErr.Errors, err) } - for checkID, check := range c.trackedChecks { - if _, ok := cChecks[checkID]; !ok { - if err := c.registerCheck(check); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } + if err := c.syncChecks(); err != nil { + mErr.Errors = append(mErr.Errors, err) } return mErr.ErrorOrNil()
diff --git a/nomad/types/types.go b/nomad/types/types.go
index 37196ac3946..2a05ddbb3ff 100644
--- a/nomad/types/types.go
+++ b/nomad/types/types.go
@@ -1,3 +1,3 @@ package types -type PeriodicCallback func() +type PeriodicCallback func() error

From 74e691cab1f056ebada380f1ab1d87e3b3d3c0a7 Mon Sep 17 00:00:00 2001
From: Sean Chittenden
Date: Tue, 7 Jun 2016 11:03:43 -0500
Subject: [PATCH 115/166] Change the API signature of Syncer.SyncServices().

SyncServices() immediately attempts to sync whatever information the
process has with Consul. Previously this method would take an argument of
the exclusive list of services that should exist; however, this is not
conducive to having a Nomad Client and Nomad Server share the same
consul.Syncer.
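For illustration, the new calling pattern looks roughly like the sketch
below. This is a minimal sketch only, assuming the post-change APIs used
elsewhere in this series (consul.NewSyncer with a shutdown channel,
SetServices, SyncServices); the helper name, the "agent" group label, and
the exact parameter types are assumptions, not part of this commit.

    // Assumed imports: "log", the client/consul package (as consul),
    // nomad/structs, and nomad/structs/config.
    //
    // syncAgentServices registers a named group of services once and then
    // pushes the Syncer's current state to Consul; SyncServices() no
    // longer takes the exclusive service list as an argument.
    func syncAgentServices(consulConfig *config.ConsulConfig, logger *log.Logger, agentServices []*structs.ConsulService) error {
        shutdownCh := make(chan struct{})
        syncer, err := consul.NewSyncer(consulConfig, shutdownCh, logger)
        if err != nil {
            return err
        }
        // The Syncer now owns the service list for this group.
        syncer.SetServices("agent", agentServices)
        // Sync immediately; the periodic Run() loop keeps reconciling.
        return syncer.SyncServices()
    }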
--- client/consul/sync.go | 6 ++++-- client/consul/sync_test.go | 6 +++--- client/driver/executor/executor.go | 15 +++++++++------ command/agent/agent.go | 4 +++- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index a58ade10097..10b8c93dde4 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -177,9 +177,11 @@ func (c *Syncer) SyncNow() { } // SyncServices sync the services with the Consul Agent -func (c *Syncer) SyncServices(services []*structs.Service) error { +func (c *Syncer) SyncServices() error { + services := c.flattenedServices() + var mErr multierror.Error - taskServices := make(map[string]*consul.AgentService) + taskServices := make(map[string]*consul.AgentServiceRegistration) taskChecks := make(map[string]*consul.AgentCheckRegistration) // Register Services and Checks that we don't know about or has changed diff --git a/client/consul/sync_test.go b/client/consul/sync_test.go index 64aaf3fd0fb..976f6ea52a0 100644 --- a/client/consul/sync_test.go +++ b/client/consul/sync_test.go @@ -54,7 +54,7 @@ func TestConsulServiceRegisterServices(t *testing.T) { task := mockTask() cs.SetServiceIdentifier(GenerateServiceIdentifier(allocID, task.Name)) cs.SetAddrFinder(task.FindHostAndPortFor) - if err := cs.SyncServices(task.Services); err != nil { + if err := cs.SyncServices(); err != nil { t.Fatalf("err: %v", err) } defer cs.Shutdown() @@ -83,7 +83,7 @@ func TestConsulServiceUpdateService(t *testing.T) { task := mockTask() cs.SetServiceIdentifier(GenerateServiceIdentifier(allocID, task.Name)) cs.SetAddrFinder(task.FindHostAndPortFor) - if err := cs.SyncServices(task.Services); err != nil { + if err := cs.SyncServices(); err != nil { t.Fatalf("err: %v", err) } defer cs.Shutdown() @@ -91,7 +91,7 @@ func TestConsulServiceUpdateService(t *testing.T) { //Update Service defn 1 newTags := []string{"tag3"} task.Services[0].Tags = newTags - if err := cs.SyncServices(task.Services); err != nil { + if err := cs.SyncServices(); err != nil { t.Fatalf("err: %v", err) } // Make sure all the services and checks are still present diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 369dfce0b58..2f95df84f31 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -357,11 +357,9 @@ func (e *UniversalExecutor) UpdateTask(task *structs.Task) error { e.lre.MaxFiles = task.LogConfig.MaxFiles e.lre.FileSize = fileSize - // Re-syncing task with consul service + // Re-syncing task with Consul agent if e.consulSyncer != nil { - if err := e.consulSyncer.SyncServices(task.Services); err != nil { - return err - } + e.consulSyncer.SetServices(servicesGroupName, task.Services) } return nil } @@ -487,10 +485,15 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { e.consulSyncer = cs } if e.ctx != nil { - e.interpolateServices(e.ctx.Task) + syncerFn := func() error { + e.interpolateServices(e.ctx.Task) + e.consulSyncer.SetServices(e.ctx.AllocID, e.ctx.Task.Services) + return nil + } + e.consulSyncer.AddPeriodicHandler(e.ctx.AllocID, syncerFn) } - err := e.consulSyncer.SyncServices(e.ctx.Task.Services) go e.consulSyncer.Run() + err := e.consulSyncer.SyncServices() // Attempt to register immediately return err } diff --git a/command/agent/agent.go b/command/agent/agent.go index 8582452aabc..0f7c76d302f 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -636,5 +636,7 @@ func (a *Agent) syncAgentServicesWithConsul() error { return host, p }) - 
return a.consulSyncer.SyncServices(services) + a.consulSyncer.SetServices("agent", agentServiceGroup) + + return a.consulSyncer.SyncServices() }

From e858928d68ca93c7ebadd8016ab7e669b3dee3b8 Mon Sep 17 00:00:00 2001
From: Sean Chittenden
Date: Tue, 7 Jun 2016 11:37:39 -0500
Subject: [PATCH 116/166] Rename Syncer.SetServiceIdentifier to SetServiceRegPrefix()

This attribute isn't actually an identifier because it can represent a
collection of services. Rename `serviceIdentifier` to `serviceRegPrefix`,
which more accurately conveys the intention of this Syncer attribute.
While here, also rename `SetServiceIdentifier()` to `SetServiceRegPrefix()`
and `GenerateServiceIdentifier()` to `GenerateServicePrefix()`.
---
 client/consul/sync.go | 65 +++++++++++++++++++-----------
 client/consul/sync_test.go | 12 +++---
 client/driver/executor/executor.go | 2 +-
 3 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/client/consul/sync.go b/client/consul/sync.go
index 10b8c93dde4..a14bf90beed 100644
--- a/client/consul/sync.go
+++ b/client/consul/sync.go
@@ -54,14 +54,26 @@ type Syncer struct { client *consul.Client runChecks bool - serviceIdentifier string // serviceIdentifier is a token which identifies which task/alloc the service belongs to - addrFinder func(portLabel string) (string, int) + // The "Consul Registry" is a collection of Consul Services and + // Checks all guarded by the registryLock. + registryLock sync.RWMutex - trackedServices map[string]*consul.AgentService - trackedChecks map[string]*consul.AgentCheckRegistration checkRunners map[string]*CheckRunner delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul + + // serviceRegPrefix is used to namespace the domain of registered + // Consul Services and Checks belonging to a single Syncer. A given + // Nomad Agent may spawn multiple Syncer tasks between the Agent + // Agent and its Executors, all syncing to a single Consul Agent. + // The serviceRegPrefix allows multiple Syncers to coexist without + // each Syncer clobbering each others Services. The Syncer namespace + // protocol is fmt.Sprintf("nomad-%s-%s", serviceRegPrefix, miscID). + // serviceRegPrefix is guarded by the registryLock. + serviceRegPrefix string + + addrFinder func(portLabel string) (string, int) createDelegatedCheck func(*structs.ServiceCheck, string) (Check, error) + // End registryLock guarded attributes. logger *log.Logger
+func (c *Syncer) SetServiceRegPrefix(servicePrefix string) *Syncer { + c.registryLock.Lock() + defer c.registryLock.Unlock() + c.serviceRegPrefix = servicePrefix return c } @@ -347,11 +362,14 @@ func (c *Syncer) createDelegatedCheckReg(check *structs.ServiceCheck, service *c } // createService creates a Consul AgentService from a Nomad Service - srv := consul.AgentService{ - ID: service.ID(c.serviceIdentifier), - Service: service.Name, - Tags: service.Tags, func (c *Syncer) createService(service *structs.ConsulService) (*consul.AgentServiceRegistration, error) { + c.registryLock.RLock() + defer c.registryLock.RUnlock() + + srv := consul.AgentServiceRegistration{ + ID: service.ID(c.serviceRegPrefix), + Name: service.Name, + Tags: service.Tags, } host, port := c.addrFinder(service.PortLabel) if host != "" { @@ -409,7 +427,7 @@ func (c *Syncer) Run() { if err := c.performSync(); err != nil { if c.runChecks { - c.logger.Printf("[DEBUG] consul.sync: disabling checks until Consul sync completes for %q: %v", c.serviceIdentifier, err) + c.logger.Printf("[DEBUG] consul.sync: disabling checks until Consul sync completes for %q: %v", c.serviceRegPrefix, err) } c.runChecks = false } else { @@ -421,7 +439,7 @@ func (c *Syncer) Run() { c.Shutdown() case <-c.notifyShutdownCh: sync.Stop() - c.logger.Printf("[INFO] consul.sync: shutting down sync for %q", c.serviceIdentifier) + c.logger.Printf("[INFO] consul.sync: shutting down sync for %q", c.serviceRegPrefix) return } } @@ -463,15 +481,16 @@ func (c *Syncer) performSync() error { // filterConsulServices prunes out all the service whose ids are not prefixed // with nomad- -func (c *Syncer) filterConsulServices(srvcs map[string]*consul.AgentService) map[string]*consul.AgentService { - nomadServices := make(map[string]*consul.AgentService) - for _, srv := range srvcs { - if strings.HasPrefix(srv.ID, structs.NomadConsulPrefix) && - !strings.HasPrefix(srv.ID, structs.AgentServicePrefix) { - nomadServices[srv.ID] = srv +func (c *Syncer) filterConsulServices(consulServices map[string]*consul.AgentService) map[string]*consul.AgentService { + localServices := make(map[string]*consul.AgentService, len(consulServices)) + c.registryLock.RLock() + defer c.registryLock.RUnlock() + for serviceID, service := range consulServices { + if strings.HasPrefix(service.ID, c.serviceRegPrefix) { + localServices[serviceID] = service } } - return nomadServices + return localServices } // filterConsulChecks prunes out all the consul checks which do not have @@ -522,9 +541,9 @@ func (c *Syncer) runCheck(check Check) { } } -// GenerateServiceIdentifier returns a service identifier based on an allocation -// id and task name -func GenerateServiceIdentifier(allocID string, taskName string) string { +// GenerateServicePrefix returns a service prefix based on an allocation id +// and task name. 
+func GenerateServicePrefix(allocID string, taskName string) string { return fmt.Sprintf("%s-%s", taskName, allocID) } diff --git a/client/consul/sync_test.go b/client/consul/sync_test.go index 976f6ea52a0..e06d5713a35 100644 --- a/client/consul/sync_test.go +++ b/client/consul/sync_test.go @@ -52,15 +52,15 @@ func TestConsulServiceRegisterServices(t *testing.T) { return } task := mockTask() - cs.SetServiceIdentifier(GenerateServiceIdentifier(allocID, task.Name)) + cs.SetServiceRegPrefix(GenerateServicePrefix(allocID, task.Name)) cs.SetAddrFinder(task.FindHostAndPortFor) if err := cs.SyncServices(); err != nil { t.Fatalf("err: %v", err) } defer cs.Shutdown() - service1ID := service1.ID(GenerateServiceIdentifier(allocID, task.Name)) - service2ID := service2.ID(GenerateServiceIdentifier(allocID, task.Name)) + service1ID := service1.ID(GenerateServicePrefix(allocID, task.Name)) + service2ID := service2.ID(GenerateServicePrefix(allocID, task.Name)) if err := servicesPresent(t, []string{service1ID, service2ID}, cs); err != nil { t.Fatalf("err : %v", err) } @@ -81,7 +81,7 @@ func TestConsulServiceUpdateService(t *testing.T) { } task := mockTask() - cs.SetServiceIdentifier(GenerateServiceIdentifier(allocID, task.Name)) + cs.SetServiceRegPrefix(GenerateServicePrefix(allocID, task.Name)) cs.SetAddrFinder(task.FindHostAndPortFor) if err := cs.SyncServices(); err != nil { t.Fatalf("err: %v", err) @@ -95,8 +95,8 @@ func TestConsulServiceUpdateService(t *testing.T) { t.Fatalf("err: %v", err) } // Make sure all the services and checks are still present - service1ID := service1.ID(GenerateServiceIdentifier(allocID, task.Name)) - service2ID := service2.ID(GenerateServiceIdentifier(allocID, task.Name)) + service1ID := service1.ID(GenerateServicePrefix(allocID, task.Name)) + service2ID := service2.ID(GenerateServicePrefix(allocID, task.Name)) if err := servicesPresent(t, []string{service1ID, service2ID}, cs); err != nil { t.Fatalf("err : %v", err) } diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 2f95df84f31..b6b0fcdca7c 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -480,7 +480,7 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { return err } cs.SetDelegatedChecks(e.createCheckMap(), e.createCheck) - cs.SetServiceIdentifier(consul.GenerateServiceIdentifier(e.ctx.AllocID, e.ctx.Task.Name)) + cs.SetServiceRegPrefix(consul.GenerateServicePrefix(e.ctx.AllocID, e.ctx.Task.Name)) cs.SetAddrFinder(e.ctx.Task.FindHostAndPortFor) e.consulSyncer = cs } From 57c2c819e8c5136c0980565cee8d1307fe142432 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 8 Jun 2016 02:02:37 -0400 Subject: [PATCH 117/166] Move package client/consul/sync to command/agent/consul. This has been done to allow the Server and Client to reuse the same Syncer because the Agent may be running Client, Server, or both simultaneously and we only want one Syncer object alive in the agent. 
--- client/client.go | 2 +- client/client_test.go | 1 + client/driver/executor/executor.go | 2 +- command/agent/agent.go | 2 +- {client => command/agent}/consul/check.go | 0 {client => command/agent}/consul/sync.go | 0 {client => command/agent}/consul/sync_test.go | 0 7 files changed, 4 insertions(+), 3 deletions(-) rename {client => command/agent}/consul/check.go (100%) rename {client => command/agent}/consul/sync.go (100%) rename {client => command/agent}/consul/sync_test.go (100%) diff --git a/client/client.go b/client/client.go index a99faebe7a0..3788fdfd1ad 100644 --- a/client/client.go +++ b/client/client.go @@ -17,11 +17,11 @@ import ( "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/driver" "github.com/hashicorp/nomad/client/fingerprint" "github.com/hashicorp/nomad/client/rpcproxy" "github.com/hashicorp/nomad/client/stats" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" "github.com/mitchellh/hashstructure" diff --git a/client/client_test.go b/client/client_test.go index 207b7e58507..758d3cb7b90 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -13,6 +13,7 @@ import ( "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/consul" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index b6b0fcdca7c..d6901ac9517 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -21,11 +21,11 @@ import ( "github.com/shirou/gopsutil/process" "github.com/hashicorp/nomad/client/allocdir" - "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/driver/env" "github.com/hashicorp/nomad/client/driver/logging" cstructs "github.com/hashicorp/nomad/client/driver/structs" "github.com/hashicorp/nomad/client/stats" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" ) diff --git a/command/agent/agent.go b/command/agent/agent.go index 0f7c76d302f..1ccf77422cd 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -14,7 +14,7 @@ import ( "github.com/hashicorp/nomad/client" clientconfig "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/consul" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" ) diff --git a/client/consul/check.go b/command/agent/consul/check.go similarity index 100% rename from client/consul/check.go rename to command/agent/consul/check.go diff --git a/client/consul/sync.go b/command/agent/consul/sync.go similarity index 100% rename from client/consul/sync.go rename to command/agent/consul/sync.go diff --git a/client/consul/sync_test.go b/command/agent/consul/sync_test.go similarity index 100% rename from client/consul/sync_test.go rename to command/agent/consul/sync_test.go From 
b6a2ec2db8c9871c3c7fa2c40eea2fc9ba8147fd Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 8 Jun 2016 02:08:50 -0400 Subject: [PATCH 118/166] Remove Syncer.registerService() This call is obsolete by a future commit that changes the canonical source of truth to be consul.AgentServiceRegistration structs, which means it is not necessary to construct AgentServiceRegistration objects every time a registration is made, we just reuse the existing object. --- command/agent/consul/sync.go | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/command/agent/consul/sync.go b/command/agent/consul/sync.go index a14bf90beed..6c528ae31cd 100644 --- a/command/agent/consul/sync.go +++ b/command/agent/consul/sync.go @@ -208,7 +208,7 @@ func (c *Syncer) SyncServices() error { } trackedService, ok := c.trackedServices[srv.ID] if (ok && !reflect.DeepEqual(trackedService, srv)) || !ok { - if err := c.registerService(srv); err != nil { + if err := c.client.Agent().ServiceRegister(srv); err != nil { mErr.Errors = append(mErr.Errors, err) } } @@ -383,18 +383,6 @@ func (c *Syncer) createService(service *structs.ConsulService) (*consul.AgentSer return &srv, nil } -// registerService registers a service with Consul -func (c *Syncer) registerService(service *consul.AgentService) error { - srvReg := consul.AgentServiceRegistration{ - ID: service.ID, - Name: service.Service, - Tags: service.Tags, - Port: service.Port, - Address: service.Address, - } - return c.client.Agent().ServiceRegister(&srvReg) -} - // deregisterService de-registers a service with the given ID from consul func (c *Syncer) deregisterService(ID string) error { return c.client.Agent().ServiceDeregister(ID) From 54838b9eba5df8a71e840e72a4711f9b49d2902b Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 8 Jun 2016 02:20:10 -0400 Subject: [PATCH 119/166] Rename structs.Task's `Service` attribute to `ConsulService` --- client/client.go | 2 +- client/client_test.go | 1 - client/driver/executor/executor.go | 6 +++--- client/driver/executor/executor_test.go | 12 ++++++------ command/agent/consul/sync_test.go | 6 +++--- jobspec/parse.go | 4 ++-- jobspec/parse_test.go | 2 +- nomad/job_endpoint_test.go | 10 +++++----- nomad/mock/mock.go | 2 +- nomad/structs/diff.go | 2 +- nomad/structs/diff_test.go | 16 ++++++++-------- nomad/structs/structs.go | 16 ++++++++-------- nomad/structs/structs_test.go | 10 +++++----- scheduler/util_test.go | 4 ++-- 14 files changed, 46 insertions(+), 47 deletions(-) diff --git a/client/client.go b/client/client.go index 3788fdfd1ad..3af148c3d7a 100644 --- a/client/client.go +++ b/client/client.go @@ -1299,7 +1299,7 @@ func (c *Client) setupConsulSyncer() error { for taskName, taskState := range taskStates { if taskState.State == structs.TaskStateRunning { if tr, ok := ar.tasks[taskName]; ok { - for _, service := range tr.task.Services { + for _, service := range tr.task.ConsulServices { svcIdentifier := fmt.Sprintf("%s-%s", allocId, tr.task.Name) services[service.ID(svcIdentifier)] = struct{}{} } diff --git a/client/client_test.go b/client/client_test.go index 758d3cb7b90..36e3ae16928 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -12,7 +12,6 @@ import ( "time" "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/mock" diff --git a/client/driver/executor/executor.go 
b/client/driver/executor/executor.go index d6901ac9517..0196650a834 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -359,7 +359,7 @@ func (e *UniversalExecutor) UpdateTask(task *structs.Task) error { // Re-syncing task with Consul agent if e.consulSyncer != nil { - e.consulSyncer.SetServices(servicesGroupName, task.Services) + e.consulSyncer.SetServices(servicesGroupName, task.ConsulServices) } return nil } @@ -487,7 +487,7 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { if e.ctx != nil { syncerFn := func() error { e.interpolateServices(e.ctx.Task) - e.consulSyncer.SetServices(e.ctx.AllocID, e.ctx.Task.Services) + e.consulSyncer.SetServices(e.ctx.AllocID, e.ctx.Task.ConsulServices) return nil } e.consulSyncer.AddPeriodicHandler(e.ctx.AllocID, syncerFn) @@ -690,7 +690,7 @@ func (e *UniversalExecutor) createCheck(check *structs.ServiceCheck, checkID str // task's environment. func (e *UniversalExecutor) interpolateServices(task *structs.Task) { e.ctx.TaskEnv.Build() - for _, service := range task.Services { + for _, service := range task.ConsulServices { for _, check := range service.Checks { if check.Type == structs.ServiceCheckScript { check.Name = e.ctx.TaskEnv.ReplaceEnv(check.Name) diff --git a/client/driver/executor/executor_test.go b/client/driver/executor/executor_test.go index de479970e5d..ad96a4c5066 100644 --- a/client/driver/executor/executor_test.go +++ b/client/driver/executor/executor_test.go @@ -338,18 +338,18 @@ func TestExecutorInterpolateServices(t *testing.T) { executor.(*UniversalExecutor).ctx = ctx executor.(*UniversalExecutor).interpolateServices(task) expectedTags := []string{"pci:true", "datacenter:dc1"} - if !reflect.DeepEqual(task.Services[0].Tags, expectedTags) { - t.Fatalf("expected: %v, actual: %v", expectedTags, task.Services[0].Tags) + if !reflect.DeepEqual(task.ConsulServices[0].Tags, expectedTags) { + t.Fatalf("expected: %v, actual: %v", expectedTags, task.ConsulServices[0].Tags) } expectedCheckCmd := "/usr/local/check-table-mysql" expectedCheckArgs := []string{"5.6"} - if !reflect.DeepEqual(task.Services[0].Checks[0].Command, expectedCheckCmd) { - t.Fatalf("expected: %v, actual: %v", expectedCheckCmd, task.Services[0].Checks[0].Command) + if !reflect.DeepEqual(task.ConsulServices[0].Checks[0].Command, expectedCheckCmd) { + t.Fatalf("expected: %v, actual: %v", expectedCheckCmd, task.ConsulServices[0].Checks[0].Command) } - if !reflect.DeepEqual(task.Services[0].Checks[0].Args, expectedCheckArgs) { - t.Fatalf("expected: %v, actual: %v", expectedCheckArgs, task.Services[0].Checks[0].Args) + if !reflect.DeepEqual(task.ConsulServices[0].Checks[0].Args, expectedCheckArgs) { + t.Fatalf("expected: %v, actual: %v", expectedCheckArgs, task.ConsulServices[0].Checks[0].Args) } } diff --git a/command/agent/consul/sync_test.go b/command/agent/consul/sync_test.go index e06d5713a35..3773dd00d53 100644 --- a/command/agent/consul/sync_test.go +++ b/command/agent/consul/sync_test.go @@ -90,7 +90,7 @@ func TestConsulServiceUpdateService(t *testing.T) { //Update Service defn 1 newTags := []string{"tag3"} - task.Services[0].Tags = newTags + task.ConsulServices[0].Tags = newTags if err := cs.SyncServices(); err != nil { t.Fatalf("err: %v", err) } @@ -147,8 +147,8 @@ func checksPresent(t *testing.T, checkIDs []string, syncer *Syncer) error { func mockTask() *structs.Task { task := structs.Task{ - Name: "foo", - Services: []*structs.ConsulService{&service1, &service2}, + Name: "foo", + ConsulServices: 
[]*structs.ConsulService{&service1, &service2}, Resources: &structs.Resources{ Networks: []*structs.NetworkResource{ &structs.NetworkResource{ diff --git a/jobspec/parse.go b/jobspec/parse.go index c951f270fbb..99dbd6676ef 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -701,7 +701,7 @@ func parseArtifactOption(result map[string]string, list *ast.ObjectList) error { } func parseServices(jobName string, taskGroupName string, task *structs.Task, serviceObjs *ast.ObjectList) error { - task.Services = make([]*structs.ConsulService, len(serviceObjs.Items)) + task.ConsulServices = make([]*structs.ConsulService, len(serviceObjs.Items)) var defaultServiceName bool for idx, o := range serviceObjs.Items { // Check for invalid keys @@ -750,7 +750,7 @@ func parseServices(jobName string, taskGroupName string, task *structs.Task, ser } } - task.Services[idx] = &service + task.ConsulServices[idx] = &service } return nil diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 35518dc9e0a..e2119668e0a 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -97,7 +97,7 @@ func TestParse(t *testing.T) { }, }, }, - Services: []*structs.ConsulService{ + ConsulServices: []*structs.ConsulService{ { Name: "binstore-storagelocker-binsl-binstore", Tags: []string{"foo", "bar"}, diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index 339b13439b4..5aab5ee89b3 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -48,7 +48,7 @@ func TestJobEndpoint_Register(t *testing.T) { if out.CreateIndex != resp.JobModifyIndex { t.Fatalf("index mis-match") } - serviceName := out.TaskGroups[0].Tasks[0].Services[0].Name + serviceName := out.TaskGroups[0].Tasks[0].ConsulServices[0].Name expectedServiceName := "web-frontend" if serviceName != expectedServiceName { t.Fatalf("Expected Service Name: %s, Actual: %s", expectedServiceName, serviceName) @@ -237,7 +237,7 @@ func TestJobEndpoint_Register_Periodic(t *testing.T) { if out.CreateIndex != resp.JobModifyIndex { t.Fatalf("index mis-match") } - serviceName := out.TaskGroups[0].Tasks[0].Services[0].Name + serviceName := out.TaskGroups[0].Tasks[0].ConsulServices[0].Name expectedServiceName := "web-frontend" if serviceName != expectedServiceName { t.Fatalf("Expected Service Name: %s, Actual: %s", expectedServiceName, serviceName) @@ -573,12 +573,12 @@ func TestJobEndpoint_GetJob(t *testing.T) { // Make a copy of the origin job and change the service name so that we can // do a deep equal with the response from the GET JOB Api j := job - j.TaskGroups[0].Tasks[0].Services[0].Name = "web-frontend" + j.TaskGroups[0].Tasks[0].ConsulServices[0].Name = "web-frontend" for tgix, tg := range j.TaskGroups { for tidx, t := range tg.Tasks { - for sidx, service := range t.Services { + for sidx, service := range t.ConsulServices { for cidx, check := range service.Checks { - check.Name = resp2.Job.TaskGroups[tgix].Tasks[tidx].Services[sidx].Checks[cidx].Name + check.Name = resp2.Job.TaskGroups[tgix].Tasks[tidx].ConsulServices[sidx].Checks[cidx].Name } } } diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 7e5a5fedcbf..1f7b64dee44 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -94,7 +94,7 @@ func Job() *structs.Job { Env: map[string]string{ "FOO": "bar", }, - Services: []*structs.ConsulService{ + ConsulServices: []*structs.ConsulService{ { Name: "${TASK}-frontend", PortLabel: "http", diff --git a/nomad/structs/diff.go b/nomad/structs/diff.go index 812b41adf78..4d59d2b5ac0 100644 --- a/nomad/structs/diff.go +++ 
b/nomad/structs/diff.go @@ -386,7 +386,7 @@ func (t *Task) Diff(other *Task, contextual bool) (*TaskDiff, error) { } // Services diff - if sDiffs := serviceDiffs(t.Services, other.Services, contextual); sDiffs != nil { + if sDiffs := serviceDiffs(t.ConsulServices, other.ConsulServices, contextual); sDiffs != nil { diff.Objects = append(diff.Objects, sDiffs...) } diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index 5808cebc9b8..d56971f72b3 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -2363,7 +2363,7 @@ func TestTaskDiff(t *testing.T) { { // Services edited (no checks) Old: &Task{ - Services: []*ConsulService{ + ConsulServices: []*ConsulService{ { Name: "foo", PortLabel: "foo", @@ -2379,7 +2379,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*ConsulService{ + ConsulServices: []*ConsulService{ { Name: "bar", PortLabel: "bar", @@ -2452,7 +2452,7 @@ func TestTaskDiff(t *testing.T) { // Services edited (no checks) with context Contextual: true, Old: &Task{ - Services: []*ConsulService{ + ConsulServices: []*ConsulService{ { Name: "foo", PortLabel: "foo", @@ -2460,7 +2460,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*ConsulService{ + ConsulServices: []*ConsulService{ { Name: "foo", PortLabel: "bar", @@ -2494,7 +2494,7 @@ func TestTaskDiff(t *testing.T) { { // Service Checks edited Old: &Task{ - Services: []*ConsulService{ + ConsulServices: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ @@ -2533,7 +2533,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*ConsulService{ + ConsulServices: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ @@ -2695,7 +2695,7 @@ func TestTaskDiff(t *testing.T) { // Service Checks edited with context Contextual: true, Old: &Task{ - Services: []*ConsulService{ + ConsulServices: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ @@ -2714,7 +2714,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*ConsulService{ + ConsulServices: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index aa74d06ac09..a6d4df8eeec 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1688,7 +1688,7 @@ type Task struct { Env map[string]string // List of service definitions exposed by the Task - Services []*ConsulService + ConsulServices []*ConsulService // Constraints can be specified at a task level and apply only to // the particular task. @@ -1721,12 +1721,12 @@ func (t *Task) Copy() *Task { *nt = *t nt.Env = CopyMapStringString(nt.Env) - if t.Services != nil { - services := make([]*ConsulService, len(nt.Services)) - for i, s := range nt.Services { + if t.ConsulServices != nil { + services := make([]*ConsulService, len(nt.ConsulServices)) + for i, s := range nt.ConsulServices { services[i] = s.Copy() } - nt.Services = services + nt.ConsulServices = services } nt.Constraints = CopySliceConstraints(nt.Constraints) @@ -1763,7 +1763,7 @@ func (t *Task) InitFields(job *Job, tg *TaskGroup) { // and Tasks in all the service Names of a Task. This also generates the service // id, check id and check names. func (t *Task) InitServiceFields(job string, taskGroup string) { - for _, service := range t.Services { + for _, service := range t.ConsulServices { service.InitFields(job, taskGroup, t.Name) } } @@ -1860,7 +1860,7 @@ func validateServices(t *Task) error { // unique. 
servicePorts := make(map[string][]string) knownServices := make(map[string]struct{}) - for i, service := range t.Services { + for i, service := range t.ConsulServices { if err := service.Validate(); err != nil { outer := fmt.Errorf("service %d validation failed: %s", i, err) mErr.Errors = append(mErr.Errors, outer) @@ -2448,7 +2448,7 @@ func (a *Allocation) PopulateServiceIDs(tg *TaskGroup) { a.Services = make(map[string]string) for _, task := range tg.Tasks { - for _, service := range task.Services { + for _, service := range task.ConsulServices { // Retain the service if an ID is already generated if id, ok := previous[service.Name]; ok { a.Services[service.Name] = id diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index f3b50554488..5c56e9ff10d 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -136,7 +136,7 @@ func testJob() *Job { GetterSource: "http://foo.com", }, }, - Services: []*ConsulService{ + ConsulServices: []*ConsulService{ { Name: "${TASK}-frontend", PortLabel: "http", @@ -311,7 +311,7 @@ func TestTask_Validate_Services(t *testing.T) { MemoryMB: 100, IOPS: 10, }, - Services: []*ConsulService{s1, s2}, + ConsulServices: []*ConsulService{s1, s2}, } err := task.Validate() if err == nil { @@ -722,7 +722,7 @@ func TestJob_ExpandServiceNames(t *testing.T) { Tasks: []*Task{ { Name: "frontend", - Services: []*ConsulService{ + ConsulServices: []*ConsulService{ { Name: "${BASE}-default", }, @@ -746,12 +746,12 @@ func TestJob_ExpandServiceNames(t *testing.T) { j.InitFields() - service1Name := j.TaskGroups[0].Tasks[0].Services[0].Name + service1Name := j.TaskGroups[0].Tasks[0].ConsulServices[0].Name if service1Name != "my-job-web-frontend-default" { t.Fatalf("Expected Service Name: %s, Actual: %s", "my-job-web-frontend-default", service1Name) } - service2Name := j.TaskGroups[0].Tasks[0].Services[1].Name + service2Name := j.TaskGroups[0].Tasks[0].ConsulServices[1].Name if service2Name != "jmx" { t.Fatalf("Expected Service Name: %s, Actual: %s", "jmx", service2Name) } diff --git a/scheduler/util_test.go b/scheduler/util_test.go index da9e8706de0..130edce7641 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -691,10 +691,10 @@ func TestInplaceUpdate_Success(t *testing.T) { } // Delete service 2 - tg.Tasks[0].Services = tg.Tasks[0].Services[:1] + tg.Tasks[0].ConsulServices = tg.Tasks[0].ConsulServices[:1] // Add the new services - tg.Tasks[0].Services = append(tg.Tasks[0].Services, newServices...) + tg.Tasks[0].ConsulServices = append(tg.Tasks[0].ConsulServices, newServices...) updates := []allocTuple{{Alloc: alloc, TaskGroup: tg}} stack := NewGenericStack(false, ctx) From 3052e7477adeb0ee967d192c347ad4def0a9275d Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 8 Jun 2016 02:25:32 -0400 Subject: [PATCH 120/166] Move the start of the UniversalExecutor's consulSyncer to initialize once This should be handled via a sync.Once primative, but I don't want to unpack that atm. 
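For reference, the sync.Once variant would look roughly like the sketch below; the startSyncerOnce field and startSyncer helper are illustrative names, not part of this change:

    type UniversalExecutor struct {
        // ... existing fields elided ...
        consulSyncer    *consul.Syncer
        startSyncerOnce sync.Once // guards the single launch of the syncer's run loop
    }

    // startSyncer starts consulSyncer.Run exactly once, no matter how many
    // times SyncServices is called on this executor.
    func (e *UniversalExecutor) startSyncer() {
        e.startSyncerOnce.Do(func() {
            go e.consulSyncer.Run()
        })
    }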
--- client/driver/executor/executor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 0196650a834..8e22f0fb77e 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -483,6 +483,7 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { cs.SetServiceRegPrefix(consul.GenerateServicePrefix(e.ctx.AllocID, e.ctx.Task.Name)) cs.SetAddrFinder(e.ctx.Task.FindHostAndPortFor) e.consulSyncer = cs + go e.consulSyncer.Run() } if e.ctx != nil { syncerFn := func() error { @@ -492,7 +493,6 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { } e.consulSyncer.AddPeriodicHandler(e.ctx.AllocID, syncerFn) } - go e.consulSyncer.Run() err := e.consulSyncer.SyncServices() // Attempt to register immediately return err } From 7ad5cd571ce6f4c7b5fa0a2ae470e1230dc41429 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 8 Jun 2016 02:31:19 -0400 Subject: [PATCH 121/166] Begin leveraging the Agent-level consul.Syncer --- command/agent/agent.go | 116 +++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 56 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 1ccf77422cd..fe223d64aaa 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -62,7 +62,7 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { } if err := a.setupConsulSyncer(shutdownCh); err != nil { - return nil, err + return nil, fmt.Errorf("Failed to initialize Consul syncer task: %v", err) } if err := a.setupServer(); err != nil { return nil, err @@ -73,11 +73,17 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { if a.client == nil && a.server == nil { return nil, fmt.Errorf("must have at least client or server mode enabled") } - if a.config.Consul.AutoRegister { - if err := a.syncAgentServicesWithConsul(); err != nil { - a.logger.Printf("[ERR] agent: unable to sync agent services with consul: %v", err) - } - } + + // The Nomad Agent runs the consul.Syncer regardless of whether or + // not the Agent is running in Client or Server mode (or both), and + // regardless of the consul.auto_register parameter. The Client and + // Server both reuse the same consul.Syncer instance. This Syncer + // task periodically executes callbacks that update Consul. The + // reason the Syncer is always running is because one of the + // callbacks is attempts to self-bootstrap Nomad using information + // found in Consul. The Syncer's handlers automatically deactivate + // when the Consul Fingerprinter has detected the local Consul Agent + // is missing. go a.consulSyncer.Run() return a, nil @@ -545,63 +551,61 @@ func (a *Agent) Stats() map[string]map[string]string { return stats } -// setupConsulSyncer creates the Consul task used by this Nomad Agent when -// running in either Client and Server mode. -func (a *Agent) setupConsulSyncer(shutdownCh chan struct{}) (err error) { +// setupAgentConsulSyncer creates the Consul tasks used by this Nomad Agent +// (either Client or Server mode). +func (a *Agent) setupConsulSyncer(shutdownCh chan struct{}) error { + var err error a.consulSyncer, err = consul.NewSyncer(a.config.Consul, shutdownCh, a.logger) + if err != nil { + return err + } + + // Create the agent's group of services that this Nomad Agent should + // sync with Consul. 
The list of services that this agent should + // sync depends on whether or not a node is in Client or Server mode, + // but doesn't change after initialization. + var agentServiceGroup []*structs.ConsulService + if a.client != nil && a.config.Consul.ClientServiceName != "" { + clientRpcService := &structs.ConsulService{ + Name: a.config.Consul.ClientServiceName, + PortLabel: a.clientRpcAddr, + Tags: []string{consul.ServiceTagRpc}, + } + agentServiceGroup = append(agentServiceGroup, clientRpcService) - return nil -} - -// syncAgentServicesWithConsul syncs this Nomad Agent's services with Consul -// when running in either Client or Server mode. -func (a *Agent) syncAgentServicesWithConsul() error { - var services []*structs.Service - if a.client != nil { - if a.config.Consul.ClientServiceName != "" { - clientRpcService := &structs.Service{ - Name: a.config.Consul.ClientServiceName, - PortLabel: a.clientRpcAddr, - Tags: []string{consul.ServiceTagRpc}, - } - services = append(services, clientRpcService) - - clientHttpService := &structs.Service{ - Name: a.config.Consul.ClientServiceName, - PortLabel: a.clientHttpAddr, - Tags: []string{consul.ServiceTagHttp}, - } - services = append(services, clientHttpService) - - a.consulSyncer.SetServiceIdentifier("agent-client") + clientHttpService := &structs.ConsulService{ + Name: a.config.Consul.ClientServiceName, + PortLabel: a.clientHttpAddr, + Tags: []string{consul.ServiceTagHttp}, } - } + agentServiceGroup = append(agentServiceGroup, clientHttpService) - if a.server != nil { - if a.config.Consul.ServerServiceName != "" { - serverHttpService := &structs.Service{ - Name: a.config.Consul.ServerServiceName, - Tags: []string{consul.ServiceTagHttp}, - PortLabel: a.serverHttpAddr, - } - services = append(services, serverHttpService) + a.consulSyncer.SetServiceRegPrefix("agent-client") + } - serverRpcService := &structs.Service{ - Name: a.config.Consul.ServerServiceName, - Tags: []string{consul.ServiceTagRpc}, - PortLabel: a.serverRpcAddr, - } - services = append(services, serverRpcService) + if a.server != nil && a.config.Consul.ServerServiceName != "" { + serverHttpService := &structs.ConsulService{ + Name: a.config.Consul.ServerServiceName, + PortLabel: a.serverHttpAddr, + Tags: []string{consul.ServiceTagHttp}, + } + agentServiceGroup = append(agentServiceGroup, serverHttpService) - serverSerfService := &structs.Service{ - Name: a.config.Consul.ServerServiceName, - Tags: []string{consul.ServiceTagSerf}, - PortLabel: a.serverSerfAddr, - } - services = append(services, serverSerfService) + serverRpcService := &structs.ConsulService{ + Name: a.config.Consul.ServerServiceName, + PortLabel: a.serverRpcAddr, + Tags: []string{consul.ServiceTagRpc}, + } + agentServiceGroup = append(agentServiceGroup, serverRpcService) - a.consulSyncer.SetServiceIdentifier("agent-server") + serverSerfService := &structs.ConsulService{ + Name: a.config.Consul.ServerServiceName, + PortLabel: a.serverSerfAddr, + Tags: []string{consul.ServiceTagSerf}, } + agentServiceGroup = append(agentServiceGroup, serverSerfService) + + a.consulSyncer.SetServiceRegPrefix("agent-server") } a.consulSyncer.SetAddrFinder(func(portLabel string) (string, int) { @@ -624,7 +628,7 @@ func (a *Agent) syncAgentServicesWithConsul() error { // we fall back to a loopback addr. 
if host == "" { if a.config.BindAddr != "" { - host = a.configBindAddr + host = a.config.BindAddr } else { host = "127.0.0.1" } From 802a8c459cd8466254b3a63ee3bfe4f671f9eddf Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Wed, 8 Jun 2016 13:38:00 -0400 Subject: [PATCH 122/166] Rename command/agent/consul/sync.go to syncer.go --- command/agent/consul/{sync.go => syncer.go} | 8 ++++---- command/agent/consul/{sync_test.go => syncer_test.go} | 0 2 files changed, 4 insertions(+), 4 deletions(-) rename command/agent/consul/{sync.go => syncer.go} (97%) rename command/agent/consul/{sync_test.go => syncer_test.go} (100%) diff --git a/command/agent/consul/sync.go b/command/agent/consul/syncer.go similarity index 97% rename from command/agent/consul/sync.go rename to command/agent/consul/syncer.go index 6c528ae31cd..ed556a31f4c 100644 --- a/command/agent/consul/sync.go +++ b/command/agent/consul/syncer.go @@ -427,7 +427,7 @@ func (c *Syncer) Run() { c.Shutdown() case <-c.notifyShutdownCh: sync.Stop() - c.logger.Printf("[INFO] consul.sync: shutting down sync for %q", c.serviceRegPrefix) + c.logger.Printf("[INFO] consul.syncer: shutting down sync for %q", c.serviceRegPrefix) return } } @@ -503,7 +503,7 @@ func (c *Syncer) consulPresent() bool { func (c *Syncer) runCheck(check Check) { res := check.Run() if res.Duration >= check.Timeout() { - c.logger.Printf("[DEBUG] consul.sync: check took time: %v, timeout: %v", res.Duration, check.Timeout()) + c.logger.Printf("[DEBUG] consul.syncer: check took time: %v, timeout: %v", res.Duration, check.Timeout()) } state := consul.HealthCritical output := res.Output @@ -521,7 +521,7 @@ func (c *Syncer) runCheck(check Check) { } if err := c.client.Agent().UpdateTTL(check.ID(), output, state); err != nil { if c.runChecks { - c.logger.Printf("[DEBUG] consul.sync: check %q failed, disabling Consul checks until until next successful sync: %v", check.ID(), err) + c.logger.Printf("[DEBUG] consul.syncer: check %q failed, disabling Consul checks until until next successful sync: %v", check.ID(), err) c.runChecks = false } else { c.runChecks = true @@ -541,7 +541,7 @@ func (c *Syncer) AddPeriodicHandler(name string, fn types.PeriodicCallback) bool c.periodicLock.Lock() defer c.periodicLock.Unlock() if _, found := c.periodicCallbacks[name]; found { - c.logger.Printf("[ERROR] consul.sync: failed adding handler %q", name) + c.logger.Printf("[ERROR] consul.syncer: failed adding handler %q", name) return false } c.periodicCallbacks[name] = fn diff --git a/command/agent/consul/sync_test.go b/command/agent/consul/syncer_test.go similarity index 100% rename from command/agent/consul/sync_test.go rename to command/agent/consul/syncer_test.go From 3e95ca61ef4b07972bee6942de40ce518596ab85 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 02:51:12 -0400 Subject: [PATCH 123/166] Per-comment, remove structs.Allocation's Services attribute. Nuke PopulateServiceIDs() now that it's also no longer needed. --- nomad/structs/structs.go | 36 ------------------------------- scheduler/generic_sched.go | 4 ---- scheduler/system_sched.go | 4 ---- scheduler/util.go | 1 - scheduler/util_test.go | 43 ++++++++++++++++++++++---------------- 5 files changed, 25 insertions(+), 63 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index a6d4df8eeec..c9382fdc341 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2311,9 +2311,6 @@ type Allocation struct { // task. These should sum to the total Resources. 
TaskResources map[string]*Resources - // Services is a map of service names to service ids - Services map[string]string - // Metrics associated with this allocation Metrics *AllocMetric @@ -2363,14 +2360,6 @@ func (a *Allocation) Copy() *Allocation { na.TaskResources = tr } - if a.Services != nil { - s := make(map[string]string, len(na.Services)) - for service, id := range na.Services { - s[service] = id - } - na.Services = s - } - na.Metrics = na.Metrics.Copy() if a.TaskStates != nil { @@ -2439,31 +2428,6 @@ func (a *Allocation) Stub() *AllocListStub { } } -// PopulateServiceIDs generates the service IDs for all the service definitions -// in that Allocation -func (a *Allocation) PopulateServiceIDs(tg *TaskGroup) { - // Retain the old services, and re-initialize. We may be removing - // services, so we cannot update the existing map. - previous := a.Services - a.Services = make(map[string]string) - - for _, task := range tg.Tasks { - for _, service := range task.ConsulServices { - // Retain the service if an ID is already generated - if id, ok := previous[service.Name]; ok { - a.Services[service.Name] = id - continue - } - - // If the service hasn't been generated an ID, we generate one. - // We add a prefix to the Service ID so that we can know that this service - // is managed by Nomad since Consul can also have service which are not - // managed by Nomad - a.Services[service.Name] = fmt.Sprintf("%s-%s", NomadConsulPrefix, GenerateUUID()) - } - } -} - var ( // AllocationIndexRegex is a regular expression to find the allocation index. AllocationIndexRegex = regexp.MustCompile(".+\\[(\\d+)\\]$") diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 0a942cd1398..584b8a0176d 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -424,10 +424,6 @@ func (s *GenericScheduler) computePlacements(place []allocTuple) error { ClientStatus: structs.AllocClientStatusPending, } - // Generate service IDs tasks in this allocation - // COMPAT - This is no longer required and would be removed in v0.4 - alloc.PopulateServiceIDs(missing.TaskGroup) - s.plan.AppendAlloc(alloc) } else { // Lazy initialize the failed map diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index 3b8a6f4389c..42f509b395f 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -264,10 +264,6 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error { ClientStatus: structs.AllocClientStatusPending, } - // Generate service IDs tasks in this allocation - // COMPAT - This is no longer required and would be removed in v0.4 - alloc.PopulateServiceIDs(missing.TaskGroup) - s.plan.AppendAlloc(alloc) } else { // Lazy initialize the failed map diff --git a/scheduler/util.go b/scheduler/util.go index 1c1c93275f0..269a4b4f04b 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -455,7 +455,6 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, newAlloc.Metrics = ctx.Metrics() newAlloc.DesiredStatus = structs.AllocDesiredStatusRun newAlloc.ClientStatus = structs.AllocClientStatusPending - newAlloc.PopulateServiceIDs(update.TaskGroup) ctx.Plan().AppendAlloc(newAlloc) // Remove this allocation from the slice diff --git a/scheduler/util_test.go b/scheduler/util_test.go index 130edce7641..0e8a5d5d8db 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -664,16 +664,8 @@ func TestInplaceUpdate_Success(t *testing.T) { DesiredStatus: structs.AllocDesiredStatusRun, } alloc.TaskResources = 
map[string]*structs.Resources{"web": alloc.Resources} - alloc.PopulateServiceIDs(job.TaskGroups[0]) noErr(t, state.UpsertAllocs(1001, []*structs.Allocation{alloc})) - webFeSrvID := alloc.Services["web-frontend"] - adminSrvID := alloc.Services["web-admin"] - - if webFeSrvID == "" || adminSrvID == "" { - t.Fatal("Service ID needs to be generated for service") - } - // Create a new task group that updates the resources. tg := &structs.TaskGroup{} *tg = *job.TaskGroups[0] @@ -716,20 +708,35 @@ func TestInplaceUpdate_Success(t *testing.T) { } // Get the alloc we inserted. - a := ctx.plan.NodeAllocation[alloc.NodeID][0] - if len(a.Services) != 3 { - t.Fatalf("Expected number of services: %v, Actual: %v", 3, len(a.Services)) + a := inplace[0].Alloc // TODO(sean@): Verify this is correct vs: ctx.plan.NodeAllocation[alloc.NodeID][0] + if a.Job == nil { + t.Fatalf("bad") } - // Test that the service id for the old service is still the same - if a.Services["web-frontend"] != webFeSrvID { - t.Fatalf("Expected service ID: %v, Actual: %v", webFeSrvID, a.Services["web-frontend"]) + if len(a.Job.TaskGroups) != 1 { + t.Fatalf("bad") + } + + if len(a.Job.TaskGroups[0].Tasks) != 1 { + t.Fatalf("bad") } - // Test that the map doesn't contain the service ID of the admin Service - // anymore - if _, ok := a.Services["web-admin"]; ok { - t.Fatal("Service shouldn't be present") + if len(a.Job.TaskGroups[0].Tasks[0].ConsulServices) != 3 { + t.Fatalf("Expected number of services: %v, Actual: %v", 3, len(a.Job.TaskGroups[0].Tasks[0].ConsulServices)) + } + + serviceNames := make(map[string]struct{}, 3) + for _, consulService := range a.Job.TaskGroups[0].Tasks[0].ConsulServices { + serviceNames[consulService.Name] = struct{}{} + } + if len(serviceNames) != 3 { + t.Fatalf("bad") + } + + for _, name := range []string{"dummy-service", "dummy-service2", "web-frontend"} { + if _, found := serviceNames[name]; !found { + t.Errorf("Expected consul service name missing: %v", name) + } } } From a4f605a7894c5c2cf660066aa01ef3410013d4f6 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 21:12:02 -0400 Subject: [PATCH 124/166] Initialize Consul for the Nomad Agent in a more uniform way. Decompose Client and Server registration into `setupClient()` and `setupServer()`, respectively. --- command/agent/agent.go | 100 +++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 53 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index fe223d64aaa..119afe08818 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -84,6 +84,9 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { // found in Consul. The Syncer's handlers automatically deactivate // when the Consul Fingerprinter has detected the local Consul Agent // is missing. 
+ if err := a.consulSyncer.SyncServices(); err != nil { + a.logger.Printf("[WARN] agent.consul: Initial sync of Consul failed: %v", err) + } go a.consulSyncer.Run() return a, nil @@ -361,8 +364,30 @@ func (a *Agent) setupServer() error { if err != nil { return fmt.Errorf("server setup failed: %v", err) } - a.server = server + + // Create the Nomad Server services for Consul + if a.config.Consul.ServerServiceName != "" { + const serviceGroupName = "server" + a.consulSyncer.SetServices(serviceGroupName, []*structs.ConsulService{ + &structs.ConsulService{ + Name: a.config.Consul.ServerServiceName, + PortLabel: a.serverHttpAddr, + Tags: []string{consul.ServiceTagHttp}, + }, + &structs.ConsulService{ + Name: a.config.Consul.ServerServiceName, + PortLabel: a.serverRpcAddr, + Tags: []string{consul.ServiceTagRpc}, + }, + &structs.ConsulService{ + PortLabel: a.serverSerfAddr, + Name: a.config.Consul.ServerServiceName, + Tags: []string{consul.ServiceTagSerf}, + }, + }) + } + return nil } @@ -391,6 +416,24 @@ func (a *Agent) setupClient() error { return fmt.Errorf("client setup failed: %v", err) } a.client = client + + // Create the Nomad Server services for Consul + if a.config.Consul.ClientServiceName != "" { + const serviceGroupName = "client" + a.consulSyncer.SetServices(serviceGroupName, []*structs.ConsulService{ + &structs.ConsulService{ + Name: a.config.Consul.ClientServiceName, + PortLabel: a.clientHttpAddr, + Tags: []string{consul.ServiceTagHttp}, + }, + &structs.ConsulService{ + Name: a.config.Consul.ClientServiceName, + PortLabel: a.clientRpcAddr, + Tags: []string{consul.ServiceTagRpc}, + }, + }) + } + return nil } @@ -551,7 +594,7 @@ func (a *Agent) Stats() map[string]map[string]string { return stats } -// setupAgentConsulSyncer creates the Consul tasks used by this Nomad Agent +// setupConsulSyncer creates the Consul tasks used by this Nomad Agent // (either Client or Server mode). func (a *Agent) setupConsulSyncer(shutdownCh chan struct{}) error { var err error @@ -559,54 +602,7 @@ func (a *Agent) setupConsulSyncer(shutdownCh chan struct{}) error { if err != nil { return err } - - // Create the agent's group of services that this Nomad Agent should - // sync with Consul. The list of services that this agent should - // sync depends on whether or not a node is in Client or Server mode, - // but doesn't change after initialization. 
- var agentServiceGroup []*structs.ConsulService - if a.client != nil && a.config.Consul.ClientServiceName != "" { - clientRpcService := &structs.ConsulService{ - Name: a.config.Consul.ClientServiceName, - PortLabel: a.clientRpcAddr, - Tags: []string{consul.ServiceTagRpc}, - } - agentServiceGroup = append(agentServiceGroup, clientRpcService) - - clientHttpService := &structs.ConsulService{ - Name: a.config.Consul.ClientServiceName, - PortLabel: a.clientHttpAddr, - Tags: []string{consul.ServiceTagHttp}, - } - agentServiceGroup = append(agentServiceGroup, clientHttpService) - - a.consulSyncer.SetServiceRegPrefix("agent-client") - } - - if a.server != nil && a.config.Consul.ServerServiceName != "" { - serverHttpService := &structs.ConsulService{ - Name: a.config.Consul.ServerServiceName, - PortLabel: a.serverHttpAddr, - Tags: []string{consul.ServiceTagHttp}, - } - agentServiceGroup = append(agentServiceGroup, serverHttpService) - - serverRpcService := &structs.ConsulService{ - Name: a.config.Consul.ServerServiceName, - PortLabel: a.serverRpcAddr, - Tags: []string{consul.ServiceTagRpc}, - } - agentServiceGroup = append(agentServiceGroup, serverRpcService) - - serverSerfService := &structs.ConsulService{ - Name: a.config.Consul.ServerServiceName, - PortLabel: a.serverSerfAddr, - Tags: []string{consul.ServiceTagSerf}, - } - agentServiceGroup = append(agentServiceGroup, serverSerfService) - - a.consulSyncer.SetServiceRegPrefix("agent-server") - } + a.consulSyncer.SetServiceRegPrefix("agent") a.consulSyncer.SetAddrFinder(func(portLabel string) (string, int) { host, port, err := net.SplitHostPort(portLabel) @@ -640,7 +636,5 @@ func (a *Agent) setupConsulSyncer(shutdownCh chan struct{}) error { return host, p }) - a.consulSyncer.SetServices("agent", agentServiceGroup) - - return a.consulSyncer.SyncServices() + return nil } From 8be79cf28cc4b27b27ee2c64e5ce826b8792bb86 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 21:21:22 -0400 Subject: [PATCH 125/166] Add "Service Groups" to the Syncer. Now the right way to register services with the Syncer is to call `SetServices(groupName, []*services)`. This was required to allow the Syncer to sync either the Client, Server, or Both using a single Syncer. --- command/agent/consul/syncer.go | 99 +++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index ed556a31f4c..efd7463ee7f 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -54,6 +54,14 @@ type Syncer struct { client *consul.Client runChecks bool + // servicesGroups is a named group of services that will be flattened + // and reconciled with Consul when SyncServices() is called. The key + // to the servicesGroups map is unique per handler and is used to + // allow the Agent's services to be maintained independently of the + // Client or Server's services. + servicesGroups map[string][]*consul.AgentServiceRegistration + servicesGroupsLock sync.RWMutex + // The "Consul Registry" is a collection of Consul Services and // Checks all guarded by the registryLock. 
registryLock sync.RWMutex @@ -152,6 +160,7 @@ func NewSyncer(config *config.ConsulConfig, shutdownCh chan struct{}, logger *lo logger: logger, shutdownCh: shutdownCh, trackedServices: make(map[string]*consul.AgentService), + servicesGroups: make(map[string][]*consul.AgentServiceRegistration), trackedChecks: make(map[string]*consul.AgentCheckRegistration), checkRunners: make(map[string]*CheckRunner), periodicCallbacks: make(map[string]types.PeriodicCallback), @@ -191,39 +200,36 @@ func (c *Syncer) SyncNow() { } } -// SyncServices sync the services with the Consul Agent -func (c *Syncer) SyncServices() error { - services := c.flattenedServices() - +// SetServices assigns the slice of Nomad Services to the provided services +// group name. +func (c *Syncer) SetServices(groupName string, services []*structs.ConsulService) error { var mErr multierror.Error - taskServices := make(map[string]*consul.AgentServiceRegistration) - taskChecks := make(map[string]*consul.AgentCheckRegistration) - - // Register Services and Checks that we don't know about or has changed + registeredServices := make([]*consul.AgentServiceRegistration, 0, len(services)) for _, service := range services { - srv, err := c.createService(service) - if err != nil { + if service.ServiceID == "" { + service.ServiceID = c.GenerateServiceID(groupName, service) + } + var serviceReg *consul.AgentServiceRegistration + var err error + if serviceReg, err = c.createService(service); err != nil { mErr.Errors = append(mErr.Errors, err) continue } - trackedService, ok := c.trackedServices[srv.ID] - if (ok && !reflect.DeepEqual(trackedService, srv)) || !ok { - if err := c.client.Agent().ServiceRegister(srv); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - c.trackedServices[srv.ID] = srv - taskServices[srv.ID] = srv + registeredServices = append(registeredServices, serviceReg) + // Register the check(s) for this service for _, chk := range service.Checks { - // Create a consul check registration - chkReg, err := c.createDelegatedCheckReg(chk, srv) + // Create a Consul check registration + chkReg, err := c.createDelegatedCheckReg(chk, serviceReg) if err != nil { mErr.Errors = append(mErr.Errors, err) continue } // creating a nomad check if we have to handle this particular check type if _, ok := c.delegateChecks[chk.Type]; ok { + if _, ok := c.checkRunners[chkReg.ID]; ok { + continue + } nc, err := c.createDelegatedCheck(chk, chkReg.ID) if err != nil { mErr.Errors = append(mErr.Errors, err) @@ -232,37 +238,42 @@ func (c *Syncer) SyncServices() error { cr := NewCheckRunner(nc, c.runCheck, c.logger) c.checkRunners[nc.ID()] = cr } - - if _, ok := c.trackedChecks[chkReg.ID]; !ok { - if err := c.registerCheck(chkReg); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - c.trackedChecks[chkReg.ID] = chkReg - taskChecks[chkReg.ID] = chkReg } } - // Remove services that are not present anymore - for _, service := range c.trackedServices { - if _, ok := taskServices[service.ID]; !ok { - if err := c.deregisterService(service.ID); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - delete(c.trackedServices, service.ID) - } + if len(mErr.Errors) > 0 { + return mErr.ErrorOrNil() } - // Remove the checks that are not present anymore - for checkID, _ := range c.trackedChecks { - if _, ok := taskChecks[checkID]; !ok { - if err := c.deregisterCheck(checkID); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - delete(c.trackedChecks, checkID) + c.servicesGroupsLock.Lock() + c.servicesGroups[groupName] = 
registeredServices + c.servicesGroupsLock.Unlock() + + return nil +} + +// SyncNow expires the current timer forcing the list of periodic callbacks +// to be synced immediately. +func (c *Syncer) SyncNow() { + select { + case c.notifySyncCh <- struct{}{}: + default: + } +} + +// flattenedServices returns a flattened list of services +func (c *Syncer) flattenedServices() []*consul.AgentServiceRegistration { + const initialNumServices = 8 + services := make([]*consul.AgentServiceRegistration, 0, initialNumServices) + c.servicesGroupsLock.RLock() + for _, servicesGroup := range c.servicesGroups { + for _, service := range servicesGroup { + services = append(services, service) } } - return mErr.ErrorOrNil() + c.servicesGroupsLock.RUnlock() + + return services } func (c *Syncer) signalShutdown() { From d87c697c87a9c6e669cee539a968e97f5ce68aef Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 21:29:10 -0400 Subject: [PATCH 126/166] Update Syncer.Run() to call SyncServices(). --- command/agent/consul/syncer.go | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index efd7463ee7f..4ade66ccae9 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -416,7 +416,7 @@ func (c *Syncer) deregisterCheck(ID string) error { func (c *Syncer) Run() { d := initialSyncDelay + lib.RandomStagger(initialSyncBuffer-initialSyncDelay) sync := time.NewTimer(d) - c.logger.Printf("[DEBUG] consul.sync: sleeping %v before first sync", d) + c.logger.Printf("[DEBUG] consul.syncer: sleeping %v before first sync", d) for { select { @@ -424,13 +424,17 @@ func (c *Syncer) Run() { d = syncInterval - lib.RandomStagger(syncInterval/syncJitter) sync.Reset(d) - if err := c.performSync(); err != nil { - if c.runChecks { - c.logger.Printf("[DEBUG] consul.sync: disabling checks until Consul sync completes for %q: %v", c.serviceRegPrefix, err) + if err := c.SyncServices(); err != nil { + if c.consulAvailable { + c.logger.Printf("[DEBUG] consul.syncer: disabling checks until successful sync for %q: %v", c.serviceRegPrefix, err) + } else { + c.consulAvailable = false } - c.runChecks = false } else { - c.runChecks = true + if !c.consulAvailable { + c.logger.Printf("[DEBUG] consul.syncer: re-enabling checks for for %q", c.serviceRegPrefix) + } + c.consulAvailable = true } case <-c.notifySyncCh: sync.Reset(syncInterval) @@ -462,20 +466,19 @@ func (c *Syncer) RunHandlers() error { return mErr.ErrorOrNil() } -// performSync sync the services and checks we are tracking with Consul. -func (c *Syncer) performSync() error { - var mErr multierror.Error +// SyncServices sync the services with the Consul Agent +func (c *Syncer) SyncServices() error { if err := c.RunHandlers(); err != nil { - mErr.Errors = append(mErr.Errors, err) + return err } if err := c.syncServices(); err != nil { - mErr.Errors = append(mErr.Errors, err) + return err } if err := c.syncChecks(); err != nil { - mErr.Errors = append(mErr.Errors, err) + return err } - return mErr.ErrorOrNil() + return nil } // filterConsulServices prunes out all the service whose ids are not prefixed From 5913acfd6944e335852c4463a6dfe1f15d3f8bd7 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 21:32:04 -0400 Subject: [PATCH 127/166] Rename runChecks to consulAvailable Apologies in advance for the variable thrash, the fingerprinter is no longer used to gate whether or not Consul is available any more. 
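Concretely, availability is now inferred from the outcome of each sync rather than from the Consul fingerprint. The intended use of the flag is roughly the sketch below (markSyncResult is an illustrative helper, not part of this diff): flip the flag on sync success or failure, and log only on the transition so the syncer does not spam the logs every interval while Consul is down.

    // markSyncResult is a sketch of the intended consulAvailable usage: the
    // flag flips based on whether the last sync succeeded, and a message is
    // logged only when availability changes.
    func (c *Syncer) markSyncResult(err error) {
        switch {
        case err != nil && c.consulAvailable:
            c.logger.Printf("[DEBUG] consul.syncer: sync failed, disabling checks until next successful sync: %v", err)
            c.consulAvailable = false
        case err == nil && !c.consulAvailable:
            c.logger.Printf("[DEBUG] consul.syncer: sync succeeded, re-enabling checks")
            c.consulAvailable = true
        }
    }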
--- command/agent/consul/syncer.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index 4ade66ccae9..a39ed4427b2 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -51,8 +51,8 @@ const ( // Syncer allows syncing of services and checks with Consul type Syncer struct { - client *consul.Client - runChecks bool + client *consul.Client + consulAvailable bool // servicesGroups is a named group of services that will be flattened // and reconciled with Consul when SyncServices() is called. The key @@ -158,6 +158,7 @@ func NewSyncer(config *config.ConsulConfig, shutdownCh chan struct{}, logger *lo consulSyncer := Syncer{ client: c, logger: logger, + consulAvailable: true, shutdownCh: shutdownCh, trackedServices: make(map[string]*consul.AgentService), servicesGroups: make(map[string][]*consul.AgentServiceRegistration), @@ -534,11 +535,11 @@ func (c *Syncer) runCheck(check Check) { output = res.Err.Error() } if err := c.client.Agent().UpdateTTL(check.ID(), output, state); err != nil { - if c.runChecks { + if c.consulAvailable { c.logger.Printf("[DEBUG] consul.syncer: check %q failed, disabling Consul checks until until next successful sync: %v", check.ID(), err) - c.runChecks = false + c.consulAvailable = false } else { - c.runChecks = true + c.consulAvailable = true } } } From d1ab21f2f777110416bd91d6b6223cd43a9cdab9 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 21:45:51 -0400 Subject: [PATCH 128/166] Generate and sync Consul ServiceIDs consistently --- client/client.go | 27 +++++++++------------ client/driver/executor/executor.go | 8 ++++-- command/agent/consul/syncer.go | 39 ++++++++++++++++++------------ nomad/structs/structs.go | 12 --------- 4 files changed, 40 insertions(+), 46 deletions(-) diff --git a/client/client.go b/client/client.go index 3af148c3d7a..c1ff41ece9f 100644 --- a/client/client.go +++ b/client/client.go @@ -1282,16 +1282,9 @@ func (c *Client) setupConsulSyncer() error { c.consulSyncer.AddPeriodicHandler("Nomad Client Fallback Server Handler", bootstrapFn) consulServicesSyncFn := func() error { - // Give up pruning services if we can't fingerprint our - // Consul Agent. 
- c.configLock.RLock() - _, ok := c.configCopy.Node.Attributes["consul.version"] - c.configLock.RUnlock() - if !ok { - return fmt.Errorf("Consul not running") - } - - services := make(map[string]struct{}) + const estInitialConsulServices = 8 + const serviceGroupName = "executor" + services := make([]*structs.ConsulService, 0, estInitialConsulServices) for allocId, ar := range c.getAllocRunners() { ar.taskStatusLock.RLock() taskStates := copyTaskStates(ar.taskStates) @@ -1300,18 +1293,20 @@ func (c *Client) setupConsulSyncer() error { if taskState.State == structs.TaskStateRunning { if tr, ok := ar.tasks[taskName]; ok { for _, service := range tr.task.ConsulServices { - svcIdentifier := fmt.Sprintf("%s-%s", allocId, tr.task.Name) - services[service.ID(svcIdentifier)] = struct{}{} + if service.Name == "" { + service.Name = fmt.Sprintf("%s-%s", tr.task.Name, allocId) + } + if service.ServiceID == "" { + service.ServiceID = fmt.Sprintf("%s-%s:%s/%s", c.consulSyncer.GenerateServiceID(serviceGroupName, service), tr.task.Name, allocId) + } + services = append(services, service) } } } } } - if err := c.consulSyncer.KeepServices(services); err != nil { - c.logger.Printf("[DEBUG] client: error removing services from non-running tasks: %v", err) - return err - } + c.consulSyncer.SetServices(serviceGroupName, services) return nil } c.consulSyncer.AddPeriodicHandler("Nomad Client Services Sync Handler", consulServicesSyncFn) diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 8e22f0fb77e..b238aa2bc89 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -35,6 +35,9 @@ const ( // tree for finding out the pids that the executor and it's child processes // have forked pidScanInterval = 5 * time.Second + + // serviceRegPrefix is the prefix the entire Executor should use + serviceRegPrefix = "executor" ) var ( @@ -359,7 +362,8 @@ func (e *UniversalExecutor) UpdateTask(task *structs.Task) error { // Re-syncing task with Consul agent if e.consulSyncer != nil { - e.consulSyncer.SetServices(servicesGroupName, task.ConsulServices) + e.interpolateServices(e.ctx.Task) + e.consulSyncer.SetServices(e.ctx.AllocID, task.ConsulServices) } return nil } @@ -480,7 +484,7 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { return err } cs.SetDelegatedChecks(e.createCheckMap(), e.createCheck) - cs.SetServiceRegPrefix(consul.GenerateServicePrefix(e.ctx.AllocID, e.ctx.Task.Name)) + cs.SetServiceRegPrefix(serviceRegPrefix) cs.SetAddrFinder(e.ctx.Task.FindHostAndPortFor) e.consulSyncer = cs go e.consulSyncer.Run() diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index a39ed4427b2..6bb458fbb31 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -28,6 +28,10 @@ const ( // initialSyncDelay is the delay before an initial sync. initialSyncDelay = 5 * time.Second + // nomadServicePrefix is the prefix used when registering a service + // with consul + nomadServicePrefix = "nomad" + // The periodic time interval for syncing services and checks with Consul syncInterval = 5 * time.Second @@ -192,12 +196,25 @@ func (c *Syncer) SetServiceRegPrefix(servicePrefix string) *Syncer { return c } -// SyncNow expires the current timer forcing the list of periodic callbacks -// to be synced immediately. -func (c *Syncer) SyncNow() { - select { - case c.notifySyncCh <- struct{}{}: +// filterPrefix generates a unique prefix that a Syncer can later filter on. 
+func (c *Syncer) filterPrefix() string { + c.registryLock.RLock() + defer c.registryLock.RUnlock() + return fmt.Sprintf("%s-%s", nomadServicePrefix, c.serviceRegPrefix) +} + +// GenerateServiceID creates a unique Consul ServiceID for a given +// ConsulService. +func (c *Syncer) GenerateServiceID(groupName string, service *structs.ConsulService) string { + numTags := len(service.Tags) + switch numTags { + case 0: + return fmt.Sprintf("%s-%s:%s", c.filterPrefix(), groupName, service.Name) + case 1: + return fmt.Sprintf("%s-%s:%s@%s", c.filterPrefix(), groupName, service.Tags[0], service.Name) default: + tags := strings.Join(service.Tags, "|") + return fmt.Sprintf("%s-%s:(%s)@%s", c.filterPrefix(), groupName, tags, service.Name) } } @@ -310,10 +327,6 @@ func (c *Syncer) Shutdown() error { return mErr.ErrorOrNil() } -// KeepServices removes services from consul which are not present in the list -// of tasks passed to it -func (c *Syncer) KeepServices(services map[string]struct{}) error { - var mErr multierror.Error // Get the services from Consul cServices, err := c.client.Agent().Services() @@ -379,7 +392,7 @@ func (c *Syncer) createService(service *structs.ConsulService) (*consul.AgentSer defer c.registryLock.RUnlock() srv := consul.AgentServiceRegistration{ - ID: service.ID(c.serviceRegPrefix), + ID: service.ServiceID, Name: service.Name, Tags: service.Tags, } @@ -544,12 +557,6 @@ func (c *Syncer) runCheck(check Check) { } } -// GenerateServicePrefix returns a service prefix based on an allocation id -// and task name. -func GenerateServicePrefix(allocID string, taskName string) string { - return fmt.Sprintf("%s-%s", taskName, allocID) -} - // AddPeriodicHandler adds a uniquely named callback. Returns true if // successful, false if a handler with the same name already exists. func (c *Syncer) AddPeriodicHandler(name string, fn types.PeriodicCallback) bool { diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index c9382fdc341..0314326b732 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1544,14 +1544,6 @@ func (sc *ServiceCheck) Hash(serviceID string) string { return fmt.Sprintf("%x", h.Sum(nil)) } -const ( - NomadConsulPrefix = "nomad-registered-service" -) - -var ( - AgentServicePrefix = fmt.Sprintf("%s-%s", NomadConsulPrefix, "agent") -) - Name string // Name of the service, defaults to id Tags []string // List of tags for the service // The ConsulService model represents a Consul service definition in Nomad @@ -1598,10 +1590,6 @@ func (s *ConsulService) InitFields(job string, taskGroup string, task string) { } } -func (s *ConsulService) ID(identifier string) string { - return fmt.Sprintf("%s-%s-%s", NomadConsulPrefix, identifier, s.Hash()) -} - // Validate checks if the Check definition is valid func (s *ConsulService) Validate() error { var mErr multierror.Error From d2dcf27b352ea04a0e2b16e0d285677356d267e3 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 22:27:02 -0400 Subject: [PATCH 129/166] Populate the RPC Proxy's server list if heartbeat did not include a leader. It's possible that a Nomad Client is heartbeating with a Nomad server that has become issolated from the quorum of Nomad Servers. When 3x the heartbeatTTL has been exceeded, append the Consul server list to the primary primary server list. When the next RPCProxy rebalance occurs, there is a chance one of the servers discovered from Consul will be in the majority. 
When client reattaches to a Nomad Server in the majority, it will include a heartbeat and will reset the TTLs *AND* will clear the primary server list to include only values from the heartbeat. --- client/client.go | 59 +++++++++++++++++++++++++++++++++---- client/rpcproxy/rpcproxy.go | 6 ---- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/client/client.go b/client/client.go index c1ff41ece9f..340edba2154 100644 --- a/client/client.go +++ b/client/client.go @@ -9,6 +9,7 @@ import ( "path/filepath" "strconv" "sync" + "sync/atomic" "time" "github.com/armon/go-metrics" @@ -111,6 +112,13 @@ type Client struct { connPool *nomad.ConnPool + // lastHeartbeatFromQuorum is an atomic int32 acting as a bool. When + // true, the last heartbeat message had a leader. When false (0), + // the last heartbeat did not include the RPC address of the leader, + // indicating that the server is in the minority or middle of an + // election. + lastHeartbeatFromQuorum int32 + lastHeartbeat time.Time heartbeatTTL time.Duration heartbeatLock sync.Mutex @@ -361,6 +369,8 @@ func (c *Client) Stats() map[string]map[string]string { numAllocs := len(c.allocs) c.allocLock.RUnlock() + c.heartbeatLock.Lock() + defer c.heartbeatLock.Unlock() stats := map[string]map[string]string{ "client": map[string]string{ "node_id": c.Node().ID, @@ -924,8 +934,20 @@ func (c *Client) updateNodeStatus() error { if err := c.rpcProxy.RefreshServerLists(resp.Servers, resp.NumNodes, resp.LeaderRPCAddr); err != nil { return err } - c.consulPullHeartbeatDeadline = time.Now().Add(2 * resp.HeartbeatTTL) + // Begin polling Consul if there is no Nomad leader. We could be + // heartbeating to a Nomad server that is in the minority of a + // partition of the Nomad server quorum, but this Nomad Agent still + // has connectivity to the existing majority of Nomad Servers, but + // only if it queries Consul. + if resp.LeaderRPCAddr == "" { + atomic.CompareAndSwapInt32(&c.lastHeartbeatFromQuorum, 1, 0) + return nil + } + + const heartbeatFallbackFactor = 3 + atomic.CompareAndSwapInt32(&c.lastHeartbeatFromQuorum, 0, 1) + c.consulPullHeartbeatDeadline = time.Now().Add(heartbeatFallbackFactor * resp.HeartbeatTTL) return nil } @@ -1249,20 +1271,29 @@ func (c *Client) setupConsulSyncer() error { bootstrapFn := func() error { now := time.Now() c.configLock.RLock() - if now.Before(c.consulPullHeartbeatDeadline) { + + // If the last heartbeat didn't contain a leader, give the + // Nomad server this Agent is talking to one more attempt at + // providing a heartbeat that does contain a leader. + if atomic.LoadInt32(&c.lastHeartbeatFromQuorum) == 1 && now.Before(c.consulPullHeartbeatDeadline) { c.configLock.RUnlock() return nil } c.configLock.RUnlock() + c.logger.Printf("[TRACE] client.consul: lost heartbeat with Nomad quorum, falling back to Consul for server list") nomadServerServiceName := c.config.ConsulConfig.ServerServiceName services, _, err := c.consulSyncer.ConsulClient().Catalog(). 
Service(nomadServerServiceName, consul.ServiceTagRpc, &consulapi.QueryOptions{AllowStale: true}) if err != nil { - c.logger.Printf("[WARN] client: unable to query service %q: %v", nomadServerServiceName, err) - return err + return fmt.Errorf("client.consul: unable to query service %q: %v", nomadServerServiceName, err) } + + if len(services) == 0 { + return fmt.Errorf("client.consul: no Nomad servers advertising service %q", nomadServerServiceName) + } + serverAddrs := make([]string, 0, len(services)) for _, s := range services { port := strconv.FormatInt(int64(s.ServicePort), 10) @@ -1273,8 +1304,24 @@ func (c *Client) setupConsulSyncer() error { serverAddrs = append(serverAddrs, net.JoinHostPort(addr, port)) } - if err := c.rpcProxy.SetBackupServers(serverAddrs); err != nil { - return err + if atomic.LoadInt32(&c.lastHeartbeatFromQuorum) == 1 && now.Before(c.consulPullHeartbeatDeadline) { + // Common, healthy path + if err := c.rpcProxy.SetBackupServers(serverAddrs); err != nil { + return fmt.Errorf("client.consul: unable to set backup servers: %v", err) + } + } else { + // If this Client is talking with a Server that + // doesn't have a leader, and we have exceeded the + // consulPullHeartbeatDeadline, change the call from + // SetBackupServers() to calling AddPrimaryServer() + // in order to allow the Clients to randomly begin + // considering all known Nomad servers and + // eventually, hopefully, find their way to a Nomad + // Server that has quorum (assuming Consul has a + // server list that is in the majority). + for _, s := range serverAddrs { + c.rpcProxy.AddPrimaryServer(s) + } } return nil diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 899f0434b93..2541df0102b 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -645,12 +645,6 @@ func (p *RpcProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNode // with newer API versions are filtered from the list. If the list // is missing an address found in the RpcProxy's server list, remove // it from the RpcProxy. - // - // FIXME(sean@): This is not true. We rely on an outside pump to set - // these values. In order to catch the orphaned clients where all - // Nomad servers were rolled between the heartbeat interval, the - // rebalance task queries Consul and adds the servers found in Consul - // to the server list in order to reattach an orphan to a server. p.serverListLock.Lock() defer p.serverListLock.Unlock() From 19f765eb056ec11535967a8fcbbe8f8e2ef5f6a9 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 22:55:01 -0400 Subject: [PATCH 130/166] Sync services with Consul by comparing the AgentServiceReg w/ ConsulService The source of truth is the local Nomad Agent. Any services not local that have a matching prefix are removed. Changed services are re-registered and missing services are re-added. 
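In outline, the reconciliation pass this enables looks roughly like the sketch below; reconcileServices is an illustrative name, and in the actual diff the work is split across calcServicesDiff and syncServices:

    // reconcileServices is a sketch of the sync step: the locally tracked
    // services are the source of truth, so services missing from the Consul
    // agent are registered, services whose definitions have drifted are
    // re-registered, and services known only to the agent (within this
    // Syncer's prefix) are deregistered.
    func (c *Syncer) reconcileServices(consulServices map[string]*consul.AgentService) error {
        missing, _, changed, stale := c.calcServicesDiff(consulServices)
        var mErr multierror.Error
        for _, s := range missing {
            if err := c.client.Agent().ServiceRegister(s); err != nil {
                mErr.Errors = append(mErr.Errors, err)
            }
        }
        for _, s := range changed {
            if err := c.client.Agent().ServiceRegister(s); err != nil {
                mErr.Errors = append(mErr.Errors, err)
            }
        }
        for _, s := range stale {
            if err := c.client.Agent().ServiceDeregister(s.ID); err != nil {
                mErr.Errors = append(mErr.Errors, err)
            }
        }
        return mErr.ErrorOrNil()
    }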
--- command/agent/consul/syncer.go | 169 +++++++++++++++++++++++++++++++-- nomad/structs/config/consul.go | 2 +- 2 files changed, 162 insertions(+), 9 deletions(-) diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index 6bb458fbb31..f8ddf2ac4c7 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -6,7 +6,6 @@ import ( "log" "net/http" "net/url" - "reflect" "strings" "sync" "time" @@ -72,6 +71,7 @@ type Syncer struct { checkRunners map[string]*CheckRunner delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul + trackedServices map[string]*consul.AgentServiceRegistration // serviceRegPrefix is used to namespace the domain of registered // Consul Services and Checks belonging to a single Syncer. A given @@ -164,8 +164,8 @@ func NewSyncer(config *config.ConsulConfig, shutdownCh chan struct{}, logger *lo logger: logger, consulAvailable: true, shutdownCh: shutdownCh, - trackedServices: make(map[string]*consul.AgentService), servicesGroups: make(map[string][]*consul.AgentServiceRegistration), + trackedServices: make(map[string]*consul.AgentServiceRegistration), trackedChecks: make(map[string]*consul.AgentCheckRegistration), checkRunners: make(map[string]*CheckRunner), periodicCallbacks: make(map[string]types.PeriodicCallback), @@ -328,8 +328,20 @@ func (c *Syncer) Shutdown() error { } - // Get the services from Consul - cServices, err := c.client.Agent().Services() +// queryAgentServices queries the Consul Agent for a list of Consul services that +// have been registered with this Consul Syncer. +func (c *Syncer) queryAgentServices() (map[string]*consul.AgentService, error) { + services, err := c.client.Agent().Services() + if err != nil { + return nil, err + } + return c.filterConsulServices(services), nil +} + +// syncChecks synchronizes this Syncer's Consul Checks with the Consul Agent. +func (c *Syncer) syncChecks() error { + var mErr multierror.Error + consulChecks, err := c.queryChecks() if err != nil { return err } @@ -341,7 +353,147 @@ func (c *Syncer) Shutdown() error { if err := c.deregisterService(service.ID); err != nil { mErr.Errors = append(mErr.Errors, err) } + +// compareConsulService takes a consul.AgentServiceRegistration instance and +// compares it with a consul.AgentService. Returns true if they are equal +// according to consul.AgentService, otherwise false. +func compareConsulService(localService *consul.AgentServiceRegistration, consulService *consul.AgentService) bool { + if consulService.ID != localService.ID || + consulService.Service != localService.Name || + consulService.Port != localService.Port || + consulService.Address != localService.Address || + consulService.Address != localService.Address || + consulService.EnableTagOverride != localService.EnableTagOverride { + return false + } + + serviceTags := make(map[string]byte, len(localService.Tags)) + for _, tag := range localService.Tags { + serviceTags[tag] = 'l' + } + for _, tag := range consulService.Tags { + if _, found := serviceTags[tag]; !found { + return false + } + serviceTags[tag] = 'b' + } + for _, state := range serviceTags { + if state == 'l' { + return false + } + } + + return true +} + +// calcServicesDiff takes the argument (consulServices) and calculates the +// delta between the consul.Syncer's list of known services +// (c.trackedServices). 
Three arrays are returned: +// +// 1) a slice of services that exist only locally in the Syncer and are +// missing from the Consul Agent (consulServices) and therefore need to be +// registered. +// +// 2) a slice of services that exist in both the local consul.Syncer's +// tracked list and Consul Agent (consulServices) *AND* are identical. +// +// 3) a slice of services that exist in both the local consul.Syncer's +// tracked list and Consul Agent (consulServices) but have diverged state. +// +// 4) a slice of services that exist only in the Consul Agent +// (consulServices) and should be removed because the Consul Agent has +// drifted from the Syncer. +func (c *Syncer) calcServicesDiff(consulServices map[string]*consul.AgentService) (missingServices []*consul.AgentServiceRegistration, equalServices []*consul.AgentServiceRegistration, changedServices []*consul.AgentServiceRegistration, staleServices []*consul.AgentServiceRegistration) { + type mergedService struct { + service *consul.AgentServiceRegistration + // 'l' == Nomad local only + // 'e' == equal + // 'c' == changed + // 'a' == Consul agent only + state byte + } + var ( + localServicesCount = 0 + equalServicesCount = 0 + changedServicesCount = 0 + agentServices = 0 + ) + localServices := make(map[string]*mergedService, len(c.trackedServices)+len(consulServices)) + for _, localService := range c.flattenedServices() { + localServicesCount++ + localServices[localService.ID] = &mergedService{localService, 'l'} + } + for _, consulService := range consulServices { + if localService, found := localServices[consulService.ID]; found { + localServicesCount-- + if compareConsulService(localService.service, consulService) { + equalServicesCount++ + localServices[consulService.ID].state = 'e' + } else { + changedServicesCount++ + localServices[consulService.ID].state = 'c' + } + } else { + agentServices++ + agentServiceReg := &consul.AgentServiceRegistration{ + ID: consulService.ID, + Name: consulService.Service, + Tags: consulService.Tags, + Port: consulService.Port, + Address: consulService.Address, + } + localServices[consulService.ID] = &mergedService{agentServiceReg, 'a'} + } + } + + missingServices = make([]*consul.AgentServiceRegistration, 0, localServicesCount) + equalServices = make([]*consul.AgentServiceRegistration, 0, equalServicesCount) + changedServices = make([]*consul.AgentServiceRegistration, 0, changedServicesCount) + staleServices = make([]*consul.AgentServiceRegistration, 0, agentServices) + for _, service := range localServices { + switch service.state { + case 'l': + missingServices = append(missingServices, service.service) + case 'e': + equalServices = append(equalServices, service.service) + case 'c': + changedServices = append(changedServices, service.service) + case 'a': + staleServices = append(staleServices, service.service) + } + } + + return missingServices, equalServices, changedServices, staleServices +} + +// syncServices synchronizes this Syncer's Consul Services with the Consul +// Agent. 
+func (c *Syncer) syncServices() error { + consulServices, err := c.queryAgentServices() + if err != nil { + return err + } + + // Synchronize services with Consul + var mErr multierror.Error + missingServices, _, changedServices, removedServices := c.calcServicesDiff(consulServices) + for _, service := range missingServices { + if err := c.client.Agent().ServiceRegister(service); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + c.trackedServices[service.ID] = service + } + for _, service := range changedServices { + // Re-register the local service + if err := c.client.Agent().ServiceRegister(service); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + } + for _, service := range removedServices { + if err := c.deregisterService(service.ID); err != nil { + mErr.Errors = append(mErr.Errors, err) } + delete(c.trackedServices, service.ID) } return mErr.ErrorOrNil() } @@ -386,7 +538,7 @@ func (c *Syncer) createDelegatedCheckReg(check *structs.ServiceCheck, service *c return &chkReg, nil } -// createService creates a Consul AgentService from a Nomad Service +// createService creates a Consul AgentService from a Nomad ConsulService. func (c *Syncer) createService(service *structs.ConsulService) (*consul.AgentServiceRegistration, error) { c.registryLock.RLock() defer c.registryLock.RUnlock() @@ -409,8 +561,8 @@ func (c *Syncer) createService(service *structs.ConsulService) (*consul.AgentSer } // deregisterService de-registers a service with the given ID from consul -func (c *Syncer) deregisterService(ID string) error { - return c.client.Agent().ServiceDeregister(ID) +func (c *Syncer) deregisterService(serviceID string) error { + return c.client.Agent().ServiceDeregister(serviceID) } // deregisterCheck de-registers a check with a given ID from Consul. @@ -501,8 +653,9 @@ func (c *Syncer) filterConsulServices(consulServices map[string]*consul.AgentSer localServices := make(map[string]*consul.AgentService, len(consulServices)) c.registryLock.RLock() defer c.registryLock.RUnlock() + filterPrefix := c.filterPrefix() for serviceID, service := range consulServices { - if strings.HasPrefix(service.ID, c.serviceRegPrefix) { + if strings.HasPrefix(service.ID, filterPrefix) { localServices[serviceID] = service } } diff --git a/nomad/structs/config/consul.go b/nomad/structs/config/consul.go index 57d5a6a509e..cccb91903ee 100644 --- a/nomad/structs/config/consul.go +++ b/nomad/structs/config/consul.go @@ -9,7 +9,7 @@ import ( // ConsulConfig contains the configuration information necessary to // communicate with a Consul Agent in order to: // -// - Register services and checks with Consul +// - Register services and their checks with Consul // // - Bootstrap this Nomad Client with the list of Nomad Servers registered // with Consul From 7d060c20d794d0a49a4d41e0859705a36a402269 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 22:58:33 -0400 Subject: [PATCH 131/166] Sync checks with Consul by comparing the AgentCheckReg w/ ConsulService The source of truth is the local Nomad Agent. Any checks are not local that have a matching prefix are removed. Changed checks are re-registered and missing checks are re-added. 
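The check diff uses the same four-way bucketing as the service diff in the previous patch: items known only locally are registered, items known only to the agent are deregistered, and items known to both are either left alone or re-registered depending on whether they still match. A rough generic illustration of that classification (classify and its string-valued maps are stand-ins for the real registration types, not part of this patch):

    // classify buckets IDs into the four states used by the diff:
    // 'l' local only (register), 'e' equal (leave alone),
    // 'c' changed (re-register), 'a' agent only (deregister).
    func classify(local, agent map[string]string) map[string]byte {
        state := make(map[string]byte, len(local)+len(agent))
        for id := range local {
            state[id] = 'l'
        }
        for id, agentVal := range agent {
            localVal, tracked := local[id]
            switch {
            case !tracked:
                state[id] = 'a'
            case localVal == agentVal:
                state[id] = 'e'
            default:
                state[id] = 'c'
            }
        }
        return state
    }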
--- command/agent/consul/syncer.go | 175 +++++++++++++++++++++++++++++---- 1 file changed, 154 insertions(+), 21 deletions(-) diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index f8ddf2ac4c7..59b2dfe9547 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -71,6 +71,7 @@ type Syncer struct { checkRunners map[string]*CheckRunner delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul + trackedChecks map[string]*consul.AgentCheckRegistration trackedServices map[string]*consul.AgentServiceRegistration // serviceRegPrefix is used to namespace the domain of registered @@ -327,6 +328,15 @@ func (c *Syncer) Shutdown() error { return mErr.ErrorOrNil() } +// queryChecks queries the Consul Agent for a list of Consul checks that +// have been registered with this Consul Syncer. +func (c *Syncer) queryChecks() (map[string]*consul.AgentCheck, error) { + checks, err := c.client.Agent().Checks() + if err != nil { + return nil, err + } + return c.filterConsulChecks(checks), nil +} // queryAgentServices queries the Consul Agent for a list of Consul services that // have been registered with this Consul Syncer. @@ -345,14 +355,127 @@ func (c *Syncer) syncChecks() error { if err != nil { return err } - cServices = c.filterConsulServices(cServices) - // Remove the services from consul which are not in any of the tasks - for _, service := range cServices { - if _, validService := services[service.ID]; !validService { - if err := c.deregisterService(service.ID); err != nil { - mErr.Errors = append(mErr.Errors, err) + // Synchronize checks with Consul + missingChecks, _, changedChecks, staleChecks := c.calcChecksDiff(consulChecks) + for _, check := range missingChecks { + if err := c.registerCheck(check); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + c.trackedChecks[check.ID] = check + } + for _, check := range changedChecks { + // NOTE(sean@): Do we need to deregister the check before + // re-registering it? Not deregistering to avoid missing the + // TTL but doesn't correct reconcile any possible drift with + // the check. + // + // if err := c.deregisterCheck(check.ID); err != nil { + // mErr.Errors = append(mErr.Errors, err) + // } + if err := c.registerCheck(check); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + } + for _, check := range staleChecks { + if err := c.deregisterCheck(check.ID); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + delete(c.trackedChecks, check.ID) + } + return mErr.ErrorOrNil() +} + +// compareConsulCheck takes a consul.AgentCheckRegistration instance and +// compares it with a consul.AgentCheck. Returns true if they are equal +// according to consul.AgentCheck, otherwise false. +func compareConsulCheck(localCheck *consul.AgentCheckRegistration, consulCheck *consul.AgentCheck) bool { + if consulCheck.CheckID != localCheck.ID || + consulCheck.Name != localCheck.Name || + consulCheck.Notes != localCheck.Notes || + consulCheck.ServiceID != localCheck.ServiceID { + return false + } + return true +} + +// calcChecksDiff takes the argument (consulChecks) and calculates the delta +// between the consul.Syncer's list of known checks (c.trackedChecks). Three +// arrays are returned: +// +// 1) a slice of checks that exist only locally in the Syncer and are missing +// from the Consul Agent (consulChecks) and therefore need to be registered. 
+// +// 2) a slice of checks that exist in both the local consul.Syncer's +// tracked list and Consul Agent (consulChecks). +// +// 3) a slice of checks that exist in both the local consul.Syncer's +// tracked list and Consul Agent (consulServices) but have diverged state. +// +// 4) a slice of checks that exist only in the Consul Agent (consulChecks) +// and should be removed because the Consul Agent has drifted from the +// Syncer. +func (c *Syncer) calcChecksDiff(consulChecks map[string]*consul.AgentCheck) (missingChecks []*consul.AgentCheckRegistration, equalChecks []*consul.AgentCheckRegistration, changedChecks []*consul.AgentCheckRegistration, staleChecks []*consul.AgentCheckRegistration) { + type mergedCheck struct { + check *consul.AgentCheckRegistration + // 'l' == Nomad local only + // 'e' == equal + // 'c' == changed + // 'a' == Consul agent only + state byte + } + var ( + localChecksCount = 0 + equalChecksCount = 0 + changedChecksCount = 0 + agentChecks = 0 + ) + localChecks := make(map[string]*mergedCheck, len(c.trackedChecks)+len(consulChecks)) + for _, localCheck := range c.trackedChecks { + localChecksCount++ + localChecks[localCheck.ID] = &mergedCheck{localCheck, 'l'} + } + for _, consulCheck := range consulChecks { + if localCheck, found := localChecks[consulCheck.CheckID]; found { + localChecksCount-- + if compareConsulCheck(localCheck.check, consulCheck) { + equalChecksCount++ + localChecks[consulCheck.CheckID].state = 'e' + } else { + changedChecksCount++ + localChecks[consulCheck.CheckID].state = 'c' } + } else { + agentChecks++ + agentCheckReg := &consul.AgentCheckRegistration{ + ID: consulCheck.CheckID, + Name: consulCheck.Name, + Notes: consulCheck.Notes, + ServiceID: consulCheck.ServiceID, + } + localChecks[consulCheck.CheckID] = &mergedCheck{agentCheckReg, 'a'} + } + } + + missingChecks = make([]*consul.AgentCheckRegistration, 0, localChecksCount) + equalChecks = make([]*consul.AgentCheckRegistration, 0, equalChecksCount) + changedChecks = make([]*consul.AgentCheckRegistration, 0, changedChecksCount) + staleChecks = make([]*consul.AgentCheckRegistration, 0, agentChecks) + for _, check := range localChecks { + switch check.state { + case 'l': + missingChecks = append(missingChecks, check.check) + case 'e': + equalChecks = append(equalChecks, check.check) + case 'c': + changedChecks = append(changedChecks, check.check) + case 'a': + staleChecks = append(staleChecks, check.check) + } + } + + return missingChecks, equalChecks, changedChecks, staleChecks +} // compareConsulService takes a consul.AgentServiceRegistration instance and // compares it with a consul.AgentService. Returns true if they are equal @@ -509,7 +632,7 @@ func (c *Syncer) registerCheck(chkReg *consul.AgentCheckRegistration) error { // createDelegatedCheckReg creates a Check that can be registered with // Nomad. It also creates a Nomad check for the check types that it can // handle. -func (c *Syncer) createDelegatedCheckReg(check *structs.ServiceCheck, service *consul.AgentService) (*consul.AgentCheckRegistration, error) { +func (c *Syncer) createDelegatedCheckReg(check *structs.ServiceCheck, service *consul.AgentServiceRegistration) (*consul.AgentCheckRegistration, error) { chkReg := consul.AgentCheckRegistration{ ID: check.Hash(service.ID), Name: check.Name, @@ -565,16 +688,25 @@ func (c *Syncer) deregisterService(serviceID string) error { return c.client.Agent().ServiceDeregister(serviceID) } -// deregisterCheck de-registers a check with a given ID from Consul. 
-func (c *Syncer) deregisterCheck(ID string) error { - // Deleting the nomad check - if cr, ok := c.checkRunners[ID]; ok { +// deregisterCheck de-registers a check from Consul +func (c *Syncer) deregisterCheck(checkID string) error { + c.registryLock.Lock() + defer c.registryLock.Unlock() + + // Deleting from Consul Agent + if err := c.client.Agent().CheckDeregister(checkID); err != nil { + // CheckDeregister() will be reattempted again in a future + // sync. + return err + } + + // Remove the check from the local registry + if cr, ok := c.checkRunners[checkID]; ok { cr.Stop() - delete(c.checkRunners, ID) + delete(c.checkRunners, checkID) } - // Deleting from consul - return c.client.Agent().CheckDeregister(ID) + return nil } // Run triggers periodic syncing of services and checks with Consul. This is @@ -663,15 +795,16 @@ func (c *Syncer) filterConsulServices(consulServices map[string]*consul.AgentSer } // filterConsulChecks prunes out all the consul checks which do not have -// services with id prefixed with noamd- -func (c *Syncer) filterConsulChecks(chks map[string]*consul.AgentCheck) map[string]*consul.AgentCheck { - nomadChecks := make(map[string]*consul.AgentCheck) - for _, chk := range chks { - if strings.HasPrefix(chk.ServiceID, structs.NomadConsulPrefix) { - nomadChecks[chk.CheckID] = chk +// services with Syncer's idPrefix. +func (c *Syncer) filterConsulChecks(consulChecks map[string]*consul.AgentCheck) map[string]*consul.AgentCheck { + localChecks := make(map[string]*consul.AgentCheck, len(consulChecks)) + filterPrefix := c.filterPrefix() + for checkID, check := range consulChecks { + if strings.HasPrefix(check.ServiceID, filterPrefix) { + localChecks[checkID] = check } } - return nomadChecks + return localChecks } // consulPresent indicates whether the consul agent is responding From 9b984d04f20de70747be03f2356283c837035087 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 23:00:32 -0400 Subject: [PATCH 132/166] Update the structure of ConsulService to match reality. ConsulService is the configuration for a Consul Service --- nomad/structs/structs.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 0314326b732..1f6c80bb49d 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1544,12 +1544,23 @@ func (sc *ServiceCheck) Hash(serviceID string) string { return fmt.Sprintf("%x", h.Sum(nil)) } - Name string // Name of the service, defaults to id - Tags []string // List of tags for the service // The ConsulService model represents a Consul service definition in Nomad // Agent's Config. type ConsulService struct { - PortLabel string `mapstructure:"port"` // port for the service + // ServiceID is the calculated Consul ServiceID used for a service. + // This value is not available to be set via configuration. + ServiceID string `mapstructure:"-"` + + // Name of the service registered with Consul. Consul defaults the + // Name to ServiceID if not specified. The Name if specified is used + // as one of the seed values when generating a Consul ServiceID. + Name string + + // PortLabel is either the numeric port number or the `host:port`. + // To specify the port number using the host's Consul Advertise + // address, specify an empty host in the PortLabel (e.g. `:port`). 
+ PortLabel string `mapstructure:"port"` + Tags []string // List of tags for the service Checks []*ServiceCheck // List of checks associated with the service } From 27f6ffd83d15365a726b7e0266d4fff77c3ad57a Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 23:01:55 -0400 Subject: [PATCH 133/166] On Syncer Shutdown, remove all services that match a Syncer's prefix. --- command/agent/consul/syncer.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index 59b2dfe9547..a03288f95a2 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -319,9 +319,14 @@ func (c *Syncer) Shutdown() error { cr.Stop() } - // De-register all the services from consul - for _, service := range c.trackedServices { + // De-register all the services from Consul + services, err := c.queryAgentServices() + if err != nil { + mErr.Errors = append(mErr.Errors, err) + } + for _, service := range services { if err := c.client.Agent().ServiceDeregister(service.ID); err != nil { + c.logger.Printf("[WARN] consul.syncer: failed to deregister service ID %q: %v", service.ID, err) mErr.Errors = append(mErr.Errors, err) } } @@ -807,7 +812,7 @@ func (c *Syncer) filterConsulChecks(consulChecks map[string]*consul.AgentCheck) return localChecks } -// consulPresent indicates whether the consul agent is responding +// consulPresent indicates whether the Consul Agent is responding func (c *Syncer) consulPresent() bool { _, err := c.client.Agent().Self() return err == nil From d810590db9dc8d26a1032ae723617d200d92a68f Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 23:02:44 -0400 Subject: [PATCH 134/166] Create a consulContext using a client's consul config. This is wrong and should be the Agent's Consul Config. This is a step in the right direction, so committing to mark the necessary future change. 
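For reference, a rough sketch of the intended end state (illustrative only; the helper and its signature are not part of this patch, and the Docker-related fields of ConsulContext are omitted): the Consul configuration is parsed once at the Agent level and handed down unchanged, so call sites stop re-reading raw "consul.*" option strings:

    // consulContextFromAgent sketches the direction of travel: the
    // executor's ConsulContext receives the already-parsed Agent-level
    // Consul configuration as-is. sconfig refers to nomad/structs/config,
    // as in the import removed below.
    func consulContextFromAgent(consulConfig *sconfig.ConsulConfig, containerID string) *executor.ConsulContext {
        return &executor.ConsulContext{
            ConsulConfig: consulConfig, // single source of truth
            ContainerID:  containerID,
        }
    }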
--- client/driver/utils.go | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/client/driver/utils.go b/client/driver/utils.go index 62b062a1591..562e3165e5f 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -16,7 +16,6 @@ import ( "github.com/hashicorp/nomad/client/driver/logging" cstructs "github.com/hashicorp/nomad/client/driver/structs" "github.com/hashicorp/nomad/nomad/structs" - sconfig "github.com/hashicorp/nomad/nomad/structs/config" ) // createExecutor launches an executor plugin and returns an instance of the @@ -73,20 +72,8 @@ func createLogCollector(config *plugin.ClientConfig, w io.Writer, } func consulContext(clientConfig *config.Config, containerID string) *executor.ConsulContext { - cfg := sconfig.ConsulConfig{ - Addr: clientConfig.ReadDefault("consul.address", "127.0.0.1:8500"), - Token: clientConfig.Read("consul.token"), - Auth: clientConfig.Read("consul.auth"), - EnableSSL: clientConfig.ReadBoolDefault("consul.ssl", false), - VerifySSL: clientConfig.ReadBoolDefault("consul.verifyssl", true), - CAFile: clientConfig.Read("consul.tls_ca_file"), - CertFile: clientConfig.Read("consul.tls_cert_file"), - KeyFile: clientConfig.Read("consul.tls_key_file"), - ServerServiceName: clientConfig.ReadDefault("consul.server_service_name", "nomad"), - ClientServiceName: clientConfig.ReadDefault("consul.client_service_name", "nomad-client"), - } return &executor.ConsulContext{ - ConsulConfig: &cfg, + ConsulConfig: clientConfig.ConsulConfig, ContainerID: containerID, DockerEndpoint: clientConfig.Read("docker.endpoint"), TLSCa: clientConfig.Read("docker.tls.ca"), From f7b2949f11d74907570b3e0b046ac29cea70e1fd Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 23:31:14 -0400 Subject: [PATCH 135/166] Properly cover Syncer attributes with the registryLock. trackedServices, delegateChecks, trackedChecks, and checkRunners should all be covered. This lock needs to be reasonably narrow and can't use defer due to possible recursive locking concerns further downstream from the call sites. 
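The shape used throughout the diff below is a narrow critical section with an explicit Unlock instead of defer, so that a callee which acquires registryLock again cannot deadlock against the caller. A minimal standalone sketch of that shape (runnerRegistry is illustrative, not the Syncer's API; only the standard sync package is needed):

    import "sync"

    // runnerRegistry guards a map with a narrow, explicitly released lock.
    type runnerRegistry struct {
        mu      sync.RWMutex
        runners map[string]func()
    }

    // runIfPresent reads shared state under the lock and releases it
    // before calling out; the Unlock is deliberately not deferred.
    func (r *runnerRegistry) runIfPresent(id string) {
        r.mu.RLock()
        run, ok := r.runners[id]
        r.mu.RUnlock()

        if ok {
            run() // safe even if this re-enters the registry
        }
    }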
--- command/agent/consul/syncer.go | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index a03288f95a2..3d866d60943 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -245,8 +245,11 @@ func (c *Syncer) SetServices(groupName string, services []*structs.ConsulService continue } // creating a nomad check if we have to handle this particular check type + c.registryLock.RLock() if _, ok := c.delegateChecks[chk.Type]; ok { - if _, ok := c.checkRunners[chkReg.ID]; ok { + _, ok := c.checkRunners[chkReg.ID] + c.registryLock.RUnlock() + if ok { continue } nc, err := c.createDelegatedCheck(chk, chkReg.ID) @@ -255,7 +258,11 @@ func (c *Syncer) SetServices(groupName string, services []*structs.ConsulService continue } cr := NewCheckRunner(nc, c.runCheck, c.logger) + c.registryLock.Lock() c.checkRunners[nc.ID()] = cr + c.registryLock.Unlock() + } else { + c.registryLock.RUnlock() } } } @@ -315,9 +322,11 @@ func (c *Syncer) Shutdown() error { c.signalShutdown() // Stop all the checks that nomad is running + c.registryLock.RLock() for _, cr := range c.checkRunners { cr.Stop() } + c.registryLock.RUnlock() // De-register all the services from Consul services, err := c.queryAgentServices() @@ -367,7 +376,9 @@ func (c *Syncer) syncChecks() error { if err := c.registerCheck(check); err != nil { mErr.Errors = append(mErr.Errors, err) } + c.registryLock.Lock() c.trackedChecks[check.ID] = check + c.registryLock.Unlock() } for _, check := range changedChecks { // NOTE(sean@): Do we need to deregister the check before @@ -386,7 +397,9 @@ func (c *Syncer) syncChecks() error { if err := c.deregisterCheck(check.ID); err != nil { mErr.Errors = append(mErr.Errors, err) } + c.registryLock.Lock() delete(c.trackedChecks, check.ID) + c.registryLock.Unlock() } return mErr.ErrorOrNil() } @@ -435,11 +448,13 @@ func (c *Syncer) calcChecksDiff(consulChecks map[string]*consul.AgentCheck) (mis changedChecksCount = 0 agentChecks = 0 ) + c.registryLock.RLock() localChecks := make(map[string]*mergedCheck, len(c.trackedChecks)+len(consulChecks)) for _, localCheck := range c.trackedChecks { localChecksCount++ localChecks[localCheck.ID] = &mergedCheck{localCheck, 'l'} } + c.registryLock.RUnlock() for _, consulCheck := range consulChecks { if localCheck, found := localChecks[consulCheck.CheckID]; found { localChecksCount-- @@ -546,7 +561,9 @@ func (c *Syncer) calcServicesDiff(consulServices map[string]*consul.AgentService changedServicesCount = 0 agentServices = 0 ) + c.registryLock.RLock() localServices := make(map[string]*mergedService, len(c.trackedServices)+len(consulServices)) + c.registryLock.RUnlock() for _, localService := range c.flattenedServices() { localServicesCount++ localServices[localService.ID] = &mergedService{localService, 'l'} @@ -609,7 +626,9 @@ func (c *Syncer) syncServices() error { if err := c.client.Agent().ServiceRegister(service); err != nil { mErr.Errors = append(mErr.Errors, err) } + c.registryLock.Lock() c.trackedServices[service.ID] = service + c.registryLock.Unlock() } for _, service := range changedServices { // Re-register the local service @@ -621,16 +640,20 @@ func (c *Syncer) syncServices() error { if err := c.deregisterService(service.ID); err != nil { mErr.Errors = append(mErr.Errors, err) } + c.registryLock.Lock() delete(c.trackedServices, service.ID) + c.registryLock.Unlock() } return mErr.ErrorOrNil() } // registerCheck registers a check definition with Consul 
func (c *Syncer) registerCheck(chkReg *consul.AgentCheckRegistration) error { + c.registryLock.RLock() if cr, ok := c.checkRunners[chkReg.ID]; ok { cr.Start() } + c.registryLock.RUnlock() return c.client.Agent().CheckRegister(chkReg) } From 05e713cfca31294d6f510f4006b85e944c102dbc Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 23:35:07 -0400 Subject: [PATCH 136/166] Skip nil check for agent's consulSyncer is always not nil --- command/agent/agent.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 119afe08818..e3548cc1df8 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -545,10 +545,8 @@ func (a *Agent) Shutdown() error { } } - if a.consulSyncer != nil { - if err := a.consulSyncer.Shutdown(); err != nil { - a.logger.Printf("[ERR] agent: shutting down consul service failed: %v", err) - } + if err := a.consulSyncer.Shutdown(); err != nil { + a.logger.Printf("[ERR] agent: shutting down consul service failed: %v", err) } a.logger.Println("[INFO] agent: shutdown complete") From 69aa8e81957061afef42c8fec8c75c86179614d4 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Thu, 9 Jun 2016 23:42:54 -0400 Subject: [PATCH 137/166] Don't spam the consul if Consul is not available. Log once when Consul goes away, and log when Consul comes back. --- command/agent/consul/syncer.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index 3d866d60943..0c06091d042 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -753,9 +753,8 @@ func (c *Syncer) Run() { if err := c.SyncServices(); err != nil { if c.consulAvailable { c.logger.Printf("[DEBUG] consul.syncer: disabling checks until successful sync for %q: %v", c.serviceRegPrefix, err) - } else { - c.consulAvailable = false } + c.consulAvailable = false } else { if !c.consulAvailable { c.logger.Printf("[DEBUG] consul.syncer: re-enabling checks for for %q", c.serviceRegPrefix) From c426b8501c5a18001200c42d1a0d6754855f71f1 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 00:19:06 -0400 Subject: [PATCH 138/166] Hand wave over the syncer tests atm, these will be fixed shortly. 
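These tests talk to a live Consul Agent, so once they are cleaned up the assertions will likely need to poll rather than check a single time, since registration is asynchronous. A standalone sketch of such a polling helper (illustrative only, not part of this patch; uses only the standard fmt and time packages):

    // waitFor polls fn until it reports success or the timeout elapses.
    func waitFor(timeout time.Duration, fn func() (bool, error)) error {
        deadline := time.Now().Add(timeout)
        for {
            ok, err := fn()
            if ok {
                return nil
            }
            if time.Now().After(deadline) {
                return fmt.Errorf("condition not met before timeout: %v", err)
            }
            time.Sleep(50 * time.Millisecond)
        }
    }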
--- command/agent/consul/syncer_test.go | 54 ++++++++++++++++++----------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/command/agent/consul/syncer_test.go b/command/agent/consul/syncer_test.go index 3773dd00d53..b9827e3a174 100644 --- a/command/agent/consul/syncer_test.go +++ b/command/agent/consul/syncer_test.go @@ -14,7 +14,9 @@ import ( ) const ( - allocID = "12" + allocID = "12" + serviceRegPrefix = "test" + serviceGroupName = "executor" ) var ( @@ -52,21 +54,27 @@ func TestConsulServiceRegisterServices(t *testing.T) { return } task := mockTask() - cs.SetServiceRegPrefix(GenerateServicePrefix(allocID, task.Name)) + cs.SetServiceRegPrefix(serviceRegPrefix) cs.SetAddrFinder(task.FindHostAndPortFor) if err := cs.SyncServices(); err != nil { t.Fatalf("err: %v", err) } defer cs.Shutdown() - service1ID := service1.ID(GenerateServicePrefix(allocID, task.Name)) - service2ID := service2.ID(GenerateServicePrefix(allocID, task.Name)) - if err := servicesPresent(t, []string{service1ID, service2ID}, cs); err != nil { - t.Fatalf("err : %v", err) - } - if err := checksPresent(t, []string{check1.Hash(service1ID)}, cs); err != nil { + service1 := &structs.ConsulService{Name: task.Name} + service2 := &structs.ConsulService{Name: task.Name} + services := []*structs.ConsulService{service1, service2} + service1.ServiceID = fmt.Sprintf("%s-%s:%s/%s", cs.GenerateServiceID(serviceGroupName, service1), task.Name, allocID) + service2.ServiceID = fmt.Sprintf("%s-%s:%s/%s", cs.GenerateServiceID(serviceGroupName, service2), task.Name, allocID) + + cs.SetServices(serviceGroupName, services) + if err := servicesPresent(t, services, cs); err != nil { t.Fatalf("err : %v", err) } + // FIXME(sean@) + // if err := checksPresent(t, []string{check1.Hash(service1ID)}, cs); err != nil { + // t.Fatalf("err : %v", err) + // } } func TestConsulServiceUpdateService(t *testing.T) { @@ -81,7 +89,7 @@ func TestConsulServiceUpdateService(t *testing.T) { } task := mockTask() - cs.SetServiceRegPrefix(GenerateServicePrefix(allocID, task.Name)) + cs.SetServiceRegPrefix(serviceRegPrefix) cs.SetAddrFinder(task.FindHostAndPortFor) if err := cs.SyncServices(); err != nil { t.Fatalf("err: %v", err) @@ -95,36 +103,40 @@ func TestConsulServiceUpdateService(t *testing.T) { t.Fatalf("err: %v", err) } // Make sure all the services and checks are still present - service1ID := service1.ID(GenerateServicePrefix(allocID, task.Name)) - service2ID := service2.ID(GenerateServicePrefix(allocID, task.Name)) - if err := servicesPresent(t, []string{service1ID, service2ID}, cs); err != nil { - t.Fatalf("err : %v", err) - } - if err := checksPresent(t, []string{check1.Hash(service1ID)}, cs); err != nil { + service1 := &structs.ConsulService{Name: task.Name} + service2 := &structs.ConsulService{Name: task.Name} + services := []*structs.ConsulService{service1, service2} + service1.ServiceID = fmt.Sprintf("%s-%s:%s/%s", cs.GenerateServiceID(serviceGroupName, service1), task.Name, allocID) + service2.ServiceID = fmt.Sprintf("%s-%s:%s/%s", cs.GenerateServiceID(serviceGroupName, service2), task.Name, allocID) + if err := servicesPresent(t, services, cs); err != nil { t.Fatalf("err : %v", err) } + // FIXME(sean@) + // if err := checksPresent(t, []string{check1.Hash(service1ID)}, cs); err != nil { + // t.Fatalf("err : %v", err) + // } // check if service defn 1 has been updated - services, err := cs.client.Agent().Services() + consulServices, err := cs.client.Agent().Services() if err != nil { t.Fatalf("errL: %v", err) } - srv, _ := 
services[service1ID] + srv, _ := consulServices[service1.ServiceID] if !reflect.DeepEqual(srv.Tags, newTags) { t.Fatalf("expected tags: %v, actual: %v", newTags, srv.Tags) } } -func servicesPresent(t *testing.T, serviceIDs []string, syncer *Syncer) error { +func servicesPresent(t *testing.T, configuredServices []*structs.ConsulService, syncer *Syncer) error { var mErr multierror.Error services, err := syncer.client.Agent().Services() if err != nil { t.Fatalf("err: %v", err) } - for _, serviceID := range serviceIDs { - if _, ok := services[serviceID]; !ok { - mErr.Errors = append(mErr.Errors, fmt.Errorf("service ID %q not synced", serviceID)) + for _, configuredService := range configuredServices { + if _, ok := services[configuredService.ServiceID]; !ok { + mErr.Errors = append(mErr.Errors, fmt.Errorf("service ID %q not synced", configuredService.ServiceID)) } } return mErr.ErrorOrNil() From 8f83c2e8258a25a020ecdf6e5004fdef6e1bb49b Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 00:22:04 -0400 Subject: [PATCH 139/166] Move RPCProxy.New() adjacent to its struct definition --- client/rpcproxy/rpcproxy.go | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 2541df0102b..bc28e7e524e 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -133,6 +133,21 @@ type RpcProxy struct { notifyFailedBarrier int32 } +// New is the only way to safely create a new RpcProxy. +func New(logger *log.Logger, shutdownCh chan struct{}, configInfo NomadConfigInfo, connPoolPinger Pinger) (p *RpcProxy) { + p = new(RpcProxy) + p.logger = logger + p.configInfo = configInfo // can't pass *nomad.Client: import cycle + p.connPoolPinger = connPoolPinger // can't pass *nomad.ConnPool: import cycle + p.rebalanceTimer = time.NewTimer(clientRPCMinReuseDuration) + p.shutdownCh = shutdownCh + + l := serverList{} + l.L = make([]*ServerEndpoint, 0) + p.saveServerList(l) + return p +} + // activateEndpoint adds an endpoint to the RpcProxy's active serverList. // Returns true if the server was added, returns false if the server already // existed in the RpcProxy's serverList. @@ -316,21 +331,6 @@ func (p *RpcProxy) LeaderAddr() string { return p.leaderAddr } -// New is the only way to safely create a new RpcProxy. -func New(logger *log.Logger, shutdownCh chan struct{}, configInfo NomadConfigInfo, connPoolPinger Pinger) (p *RpcProxy) { - p = new(RpcProxy) - p.logger = logger - p.configInfo = configInfo // can't pass *nomad.Client: import cycle - p.connPoolPinger = connPoolPinger // can't pass *nomad.ConnPool: import cycle - p.rebalanceTimer = time.NewTimer(clientRPCMinReuseDuration) - p.shutdownCh = shutdownCh - - l := serverList{} - l.L = make([]*ServerEndpoint, 0) - p.saveServerList(l) - return p -} - // NotifyFailedServer marks the passed in server as "failed" by rotating it // to the end of the server list. 
func (p *RpcProxy) NotifyFailedServer(s *ServerEndpoint) { From 4b5310ea6862207707f575cf4771356187faff44 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 00:31:55 -0400 Subject: [PATCH 140/166] Properly guard consulPullHeartbeatDeadline behind heartbeatLock --- client/client.go | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/client/client.go b/client/client.go index 340edba2154..c8d0044f208 100644 --- a/client/client.go +++ b/client/client.go @@ -101,13 +101,6 @@ type Client struct { logger *log.Logger - // consulPullHeartbeatDeadline is the deadline at which this Nomad - // Agent will begin polling Consul for a list of Nomad Servers. When - // Nomad Clients are heartbeating successfully with Nomad Servers, - // Nomad Clients do not poll Consul to populate their backup server - // list. - consulPullHeartbeatDeadline time.Time - rpcProxy *rpcproxy.RpcProxy connPool *nomad.ConnPool @@ -119,9 +112,15 @@ type Client struct { // election. lastHeartbeatFromQuorum int32 - lastHeartbeat time.Time - heartbeatTTL time.Duration - heartbeatLock sync.Mutex + // consulPullHeartbeatDeadline is the deadline at which this Nomad + // Agent will begin polling Consul for a list of Nomad Servers. When + // Nomad Clients are heartbeating successfully with Nomad Servers, + // Nomad Clients do not poll Consul to populate their backup server + // list. + consulPullHeartbeatDeadline time.Time + lastHeartbeat time.Time + heartbeatTTL time.Duration + heartbeatLock sync.Mutex // allocs is the current set of allocations allocs map[string]*AllocRunner @@ -1270,16 +1269,17 @@ func (c *Client) setupConsulSyncer() error { // to its cluster and automatically recover from a detached state. bootstrapFn := func() error { now := time.Now() - c.configLock.RLock() + c.heartbeatLock.Lock() // If the last heartbeat didn't contain a leader, give the // Nomad server this Agent is talking to one more attempt at // providing a heartbeat that does contain a leader. 
if atomic.LoadInt32(&c.lastHeartbeatFromQuorum) == 1 && now.Before(c.consulPullHeartbeatDeadline) { - c.configLock.RUnlock() + c.heartbeatLock.Unlock() + // c.logger.Printf("[TRACE] client.consul: heartbeat received, sleeping until %v", c.consulPullHeartbeatDeadline) return nil } - c.configLock.RUnlock() + c.heartbeatLock.Unlock() c.logger.Printf("[TRACE] client.consul: lost heartbeat with Nomad quorum, falling back to Consul for server list") nomadServerServiceName := c.config.ConsulConfig.ServerServiceName @@ -1304,12 +1304,15 @@ func (c *Client) setupConsulSyncer() error { serverAddrs = append(serverAddrs, net.JoinHostPort(addr, port)) } + c.heartbeatLock.Lock() if atomic.LoadInt32(&c.lastHeartbeatFromQuorum) == 1 && now.Before(c.consulPullHeartbeatDeadline) { + c.heartbeatLock.Unlock() // Common, healthy path if err := c.rpcProxy.SetBackupServers(serverAddrs); err != nil { return fmt.Errorf("client.consul: unable to set backup servers: %v", err) } } else { + c.heartbeatLock.Unlock() // If this Client is talking with a Server that // doesn't have a leader, and we have exceeded the // consulPullHeartbeatDeadline, change the call from From 5757e50243c0220cebaaa5449db2a435593fd006 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:03:05 -0400 Subject: [PATCH 141/166] Only return the Client's server addresses, never mix-in server peers --- command/agent/agent_endpoint.go | 35 +-------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index 99b618e737d..6dc0872a52a 100644 --- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -139,40 +139,7 @@ func (s *HTTPServer) listServers(resp http.ResponseWriter, req *http.Request) (i return nil, CodedError(501, ErrInvalidMethod) } - // Preallocate for at least 5x servers - const initialServerListSize = 8 - peers := make([]string, 0, initialServerListSize) - uniquePeers := make(map[string]bool, initialServerListSize) - // When the agent has an active server, get the current list of - // servers according to Raft. - if s.agent.server != nil { - raftPeers, err := s.agent.server.RaftPeers() - if err != nil { - return nil, err - } - for _, peer := range raftPeers { - _, found := uniquePeers[peer] - if !found { - uniquePeers[peer] = true - peers = append(peers, peer) - } - } - } - - // When the agent has an active client, return the union of the list - // of servers according to RpcProxy, which is possibly populated by - // Consul. 
- if s.agent.client != nil { - clientPeers := s.agent.client.RpcProxy().ServerRPCAddrs() - for _, peer := range clientPeers { - _, found := uniquePeers[peer] - if !found { - uniquePeers[peer] = true - peers = append(peers, peer) - } - } - } - + peers := s.agent.client.RpcProxy().ServerRPCAddrs() return peers, nil } From a423f07d63ff5d7263bd4e9ce70d494cddebbc0a Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:05:55 -0400 Subject: [PATCH 142/166] Stash client and server registration behind consul.auto_register --- command/agent/agent.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index e3548cc1df8..bf50d93c070 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -367,7 +367,7 @@ func (a *Agent) setupServer() error { a.server = server // Create the Nomad Server services for Consul - if a.config.Consul.ServerServiceName != "" { + if a.config.Consul.AutoRegister && a.config.Consul.ServerServiceName != "" { const serviceGroupName = "server" a.consulSyncer.SetServices(serviceGroupName, []*structs.ConsulService{ &structs.ConsulService{ @@ -418,7 +418,7 @@ func (a *Agent) setupClient() error { a.client = client // Create the Nomad Server services for Consul - if a.config.Consul.ClientServiceName != "" { + if a.config.Consul.AutoRegister && a.config.Consul.ClientServiceName != "" { const serviceGroupName = "client" a.consulSyncer.SetServices(serviceGroupName, []*structs.ConsulService{ &structs.ConsulService{ From a55d3f10d7bfc87260fd6713a3e6c3b45699d003 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:07:21 -0400 Subject: [PATCH 143/166] Rename `updateNodeUpdateResponse` to `constructNodeServerInfoResponse` --- nomad/node_endpoint.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 4ba53fa5d27..5b61961e4b9 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -105,7 +105,7 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() - if err := n.updateNodeUpdateResponse(nil, reply); err != nil { + if err := n.constructNodeServerInfoResponse(nil, reply); err != nil { n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) return err } @@ -114,7 +114,7 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp } // updateNodeUpdateResponse assumes the n.srv.peerLock is held for reading. 
-func (n *Node) updateNodeUpdateResponse(snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error { +func (n *Node) constructNodeServerInfoResponse(snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error { reply.LeaderRPCAddr = n.srv.raft.Leader() // Reply with config information required for future RPC requests @@ -256,7 +256,7 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct reply.Index = index n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() - if err := n.updateNodeUpdateResponse(snap, reply); err != nil { + if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) return err } @@ -355,7 +355,7 @@ func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUp n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() - if err := n.updateNodeUpdateResponse(snap, reply); err != nil { + if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) return err } From 8c8f33da110c9ac50eeb8d69ded2663546b90f19 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:16:02 -0400 Subject: [PATCH 144/166] Always pass in a snapshot before calling constructNodeServerInfoResponse() --- nomad/node_endpoint.go | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 5b61961e4b9..4784a6fe353 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -102,10 +102,14 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp // Set the reply index reply.Index = index + snap, err := n.srv.fsm.State().Snapshot() + if err != nil { + return err + } n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() - if err := n.constructNodeServerInfoResponse(nil, reply); err != nil { + if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) return err } @@ -129,14 +133,11 @@ func (n *Node) constructNodeServerInfoResponse(snap *state.StateSnapshot, reply }) } - // Capture all the nodes to obtain the node count - if snap == nil { - ss, err := n.srv.fsm.State().Snapshot() - if err != nil { - return err - } - snap = ss - } + // TODO(sean@): Use an indexed node count instead + // + // Snapshot is used only to iterate over all nodes to create a node + // count to send back to Nomad Clients in their heartbeat so Clients + // can estimate the size of the cluster. 
iter, err := snap.Nodes() if err == nil { for { From b57f7c97c29a25c516167dc364422b92373c91de Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:17:06 -0400 Subject: [PATCH 145/166] Style nit: remove `var` block --- client/rpcproxy/server_endpoint.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/client/rpcproxy/server_endpoint.go b/client/rpcproxy/server_endpoint.go index 07cc3adf342..5ebca184dfa 100644 --- a/client/rpcproxy/server_endpoint.go +++ b/client/rpcproxy/server_endpoint.go @@ -50,10 +50,8 @@ func newServer(name string) (*ServerEndpoint, error) { Name: name, } - var ( - host, port string - err error - ) + var host, port string + var err error host, port, err = net.SplitHostPort(name) if err == nil { s.Host = host From 709b6bd7872730042a2d23692e79e7819cd814da Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:20:18 -0400 Subject: [PATCH 146/166] Fold RaftPeers() into its only call site now --- nomad/server.go | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/nomad/server.go b/nomad/server.go index c560e731b5d..1932d18b197 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -690,16 +690,6 @@ func (s *Server) RPC(method string, args interface{}, reply interface{}) error { return codec.err } -// RaftPeers returns the current list of Raft peers -func (s *Server) RaftPeers() ([]string, error) { - peers, err := s.raftPeers.Peers() - if err != nil { - s.logger.Printf("[DEBUG] server: error getting raft peers: %v", err) - return nil, err - } - return peers, nil -} - // Stats is used to return statistics for debugging and insight // for various sub-systems func (s *Server) Stats() map[string]map[string]string { @@ -718,7 +708,7 @@ func (s *Server) Stats() map[string]map[string]string { "serf": s.serf.Stats(), "runtime": RuntimeStats(), } - if peers, err := s.RaftPeers(); err == nil { + if peers, err := s.raftPeers.Peers(); err == nil { stats["raft"]["raft_peers"] = strings.Join(peers, ",") } else { s.logger.Printf("[DEBUG] server: error getting raft peers: %v", err) From a7047bb28f3bea0b939a0ecc7f9c2f24a8ba9498 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:23:54 -0400 Subject: [PATCH 147/166] Prefix all log entries in client/rpcproxy with client.rpcproxy --- client/rpcproxy/rpcproxy.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index bc28e7e524e..9bb9214076c 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -193,7 +193,7 @@ func (p *RpcProxy) SetBackupServers(addrs []string) error { for _, s := range addrs { s, err := newServer(s) if err != nil { - p.logger.Printf("[WARN] RPC Proxy: unable to create backup server %q: %v", s, err) + p.logger.Printf("[WARN] client.rpcproxy: unable to create backup server %q: %v", s, err) return fmt.Errorf("unable to create new backup server from %q: %v", s, err) } l = append(l, s) @@ -222,7 +222,7 @@ func (p *RpcProxy) SetBackupServers(addrs []string) error { func (p *RpcProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint { s, err := newServer(rpcAddr) if err != nil { - p.logger.Printf("[WARN] RPC Proxy: unable to create new primary server from endpoint %q: %v", rpcAddr, err) + p.logger.Printf("[WARN] client.rpcproxy: unable to create new primary server from endpoint %q: %v", rpcAddr, err) return nil } @@ -302,7 +302,7 @@ func (p *RpcProxy) FindServer() *ServerEndpoint { l := p.getServerList() 
numServers := len(l.L) if numServers == 0 { - p.logger.Printf("[WARN] RPC Proxy: No servers available") + p.logger.Printf("[WARN] client.rpcproxy: No servers available") return nil } else { // Return whatever is at the front of the list because it is @@ -456,7 +456,7 @@ func (p *RpcProxy) RebalanceServers() { foundHealthyServer = true break } - p.logger.Printf(`[DEBUG] RPC Proxy: pinging server "%s" failed: %s`, selectedServer.String(), err) + p.logger.Printf(`[DEBUG] client.rpcproxy: pinging server "%s" failed: %s`, selectedServer.String(), err) l.cycleServer() } @@ -466,14 +466,14 @@ func (p *RpcProxy) RebalanceServers() { // updated list of Nomad servers. Or Consul will begin advertising a // new server in the nomad service (Nomad server service). if !foundHealthyServer { - p.logger.Printf("[DEBUG] RPC Proxy: No healthy servers during rebalance, aborting") + p.logger.Printf("[DEBUG] client.rpcproxy: No healthy servers during rebalance, aborting") return } // Verify that all servers are present. Reconcile will save the // final serverList. if p.reconcileServerList(l) { - p.logger.Printf("[DEBUG] RPC Proxy: Rebalanced %d servers, next active server is %s", len(l.L), l.L[0].String()) + p.logger.Printf("[DEBUG] client.rpcproxy: Rebalanced %d servers, next active server is %s/%v", len(l.L), l.L[0].String(), l) } else { // reconcileServerList failed because Nomad removed the // server that was at the front of the list that had @@ -625,7 +625,7 @@ func (p *RpcProxy) Run() { p.refreshServerRebalanceTimer() case <-p.shutdownCh: - p.logger.Printf("[INFO] RPC Proxy: shutting down") + p.logger.Printf("[INFO] client.rpcproxy: shutting down") return } } @@ -689,14 +689,14 @@ func (p *RpcProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNode continue } - p.logger.Printf("[WARN] API mismatch between client version (v%d.%d) and server version (v%d.%d), ignoring server %q", p.configInfo.RpcMajorVersion(), p.configInfo.RpcMinorVersion(), s.RpcMajorVersion, s.RpcMinorVersion, s.RpcAdvertiseAddr) + p.logger.Printf("[WARN] client.rpcproxy: API mismatch between client version (v%d.%d) and server version (v%d.%d), ignoring server %q", p.configInfo.RpcMajorVersion(), p.configInfo.RpcMinorVersion(), s.RpcMajorVersion, s.RpcMinorVersion, s.RpcAdvertiseAddr) p.rpcAPIMismatchThrottle[s.RpcAdvertiseAddr] = now.Add(rpcAPIMismatchLogRate) continue } server, err := newServer(s.RpcAdvertiseAddr) if err != nil { - p.logger.Printf("[WARN] Unable to create a server from %q: %v", s.RpcAdvertiseAddr, err) + p.logger.Printf("[WARN] client.rpcproxy: Unable to create a server from %q: %v", s.RpcAdvertiseAddr, err) continue } From f07c91013133c7f70abe6363b7405fff67ff0777 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:25:47 -0400 Subject: [PATCH 148/166] Formatting nit: remove brackets --- client/rpcproxy/rpcproxy.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 9bb9214076c..3198b2fb2c1 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -281,7 +281,7 @@ func (l *serverList) shuffleServers() { // String returns a string representation of serverList func (l *serverList) String() string { if len(l.L) == 0 { - return fmt.Sprintf("[empty server list]") + return fmt.Sprintf("empty server list") } serverStrs := make([]string, 0, len(l.L)) From dbdebcad808e96e73fc3f0fcebfff3dc6bee8233 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:30:05 -0400 Subject: [PATCH 
149/166] golint(1) police --- client/client.go | 12 ++-- client/rpcproxy/rpcproxy.go | 94 +++++++++++++++------------- client/rpcproxy/rpcproxy_test.go | 102 +++++++++++++++---------------- command/agent/agent_endpoint.go | 4 +- 4 files changed, 110 insertions(+), 102 deletions(-) diff --git a/client/client.go b/client/client.go index c8d0044f208..a0503d0fefa 100644 --- a/client/client.go +++ b/client/client.go @@ -101,7 +101,7 @@ type Client struct { logger *log.Logger - rpcProxy *rpcproxy.RpcProxy + rpcProxy *rpcproxy.RPCProxy connPool *nomad.ConnPool @@ -198,7 +198,7 @@ func NewClient(cfg *config.Config, consulSyncer *consul.Syncer) (*Client, error) // Create the RPC Proxy and bootstrap with the preconfigured list of // static servers c.configLock.RLock() - c.rpcProxy = rpcproxy.New(c.logger, c.shutdownCh, c, c.connPool) + c.rpcProxy = rpcproxy.NewRPCProxy(c.logger, c.shutdownCh, c, c.connPool) for _, serverAddr := range c.configCopy.Servers { c.rpcProxy.AddPrimaryServer(serverAddr) } @@ -229,7 +229,7 @@ func NewClient(cfg *config.Config, consulSyncer *consul.Syncer) (*Client, error) // Start collecting stats go c.collectHostStats() - // Start the RpcProxy maintenance task. This task periodically + // Start the RPCProxy maintenance task. This task periodically // shuffles the list of Nomad Server Endpoints this Client will use // when communicating with Nomad Servers via RPC. This is done in // order to prevent server fixation in stable Nomad clusters. This @@ -462,9 +462,9 @@ func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) { return ar.ctx.AllocDir, nil } -// AddPrimaryServerToRpcProxy adds serverAddr to the RPC Proxy's primary +// AddPrimaryServerToRPCProxy adds serverAddr to the RPC Proxy's primary // server list. -func (c *Client) AddPrimaryServerToRpcProxy(serverAddr string) *rpcproxy.ServerEndpoint { +func (c *Client) AddPrimaryServerToRPCProxy(serverAddr string) *rpcproxy.ServerEndpoint { return c.rpcProxy.AddPrimaryServer(serverAddr) } @@ -1415,6 +1415,6 @@ func (c *Client) emitStats(hStats *stats.HostStats) { } } -func (c *Client) RpcProxy() *rpcproxy.RpcProxy { +func (c *Client) RPCProxy() *rpcproxy.RPCProxy { return c.rpcProxy } diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 3198b2fb2c1..240835e6197 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -1,5 +1,5 @@ // Package rpcproxy provides a proxy interface to Nomad Servers. The -// RpcProxy periodically shuffles which server a Nomad Client communicates +// RPCProxy periodically shuffles which server a Nomad Client communicates // with in order to redistribute load across Nomad Servers. Nomad Servers // that fail an RPC request are automatically cycled to the end of the list // until the server list is reshuffled. @@ -83,7 +83,9 @@ type serverList struct { L []*ServerEndpoint } -type RpcProxy struct { +// RPCProxy is the manager type responsible for returning and managing Nomad +// addresses. +type RPCProxy struct { // activatedList manages the list of Nomad Servers that are eligible // to be queried by the Client agent. activatedList atomic.Value @@ -95,7 +97,7 @@ type RpcProxy struct { primaryServers serverList // backupServers is a list of fallback servers. These servers are - // appended to the RpcProxy's serverList, but are never shuffled with + // appended to the RPCProxy's serverList, but are never shuffled with // the list of servers discovered via the Nomad heartbeat. Covered // by serverListLock. 
backupServers serverList @@ -133,14 +135,15 @@ type RpcProxy struct { notifyFailedBarrier int32 } -// New is the only way to safely create a new RpcProxy. -func New(logger *log.Logger, shutdownCh chan struct{}, configInfo NomadConfigInfo, connPoolPinger Pinger) (p *RpcProxy) { - p = new(RpcProxy) - p.logger = logger - p.configInfo = configInfo // can't pass *nomad.Client: import cycle - p.connPoolPinger = connPoolPinger // can't pass *nomad.ConnPool: import cycle - p.rebalanceTimer = time.NewTimer(clientRPCMinReuseDuration) - p.shutdownCh = shutdownCh +// NewRPCProxy is the only way to safely create a new RPCProxy. +func NewRPCProxy(logger *log.Logger, shutdownCh chan struct{}, configInfo NomadConfigInfo, connPoolPinger Pinger) *RPCProxy { + p := &RPCProxy{ + logger: logger, + configInfo: configInfo, // can't pass *nomad.Client: import cycle + connPoolPinger: connPoolPinger, // can't pass *nomad.ConnPool: import cycle + rebalanceTimer: time.NewTimer(clientRPCMinReuseDuration), + shutdownCh: shutdownCh, + } l := serverList{} l.L = make([]*ServerEndpoint, 0) @@ -148,10 +151,10 @@ func New(logger *log.Logger, shutdownCh chan struct{}, configInfo NomadConfigInf return p } -// activateEndpoint adds an endpoint to the RpcProxy's active serverList. +// activateEndpoint adds an endpoint to the RPCProxy's active serverList. // Returns true if the server was added, returns false if the server already -// existed in the RpcProxy's serverList. -func (p *RpcProxy) activateEndpoint(s *ServerEndpoint) bool { +// existed in the RPCProxy's serverList. +func (p *RPCProxy) activateEndpoint(s *ServerEndpoint) bool { l := p.getServerList() // Check if this server is known @@ -188,7 +191,7 @@ func (p *RpcProxy) activateEndpoint(s *ServerEndpoint) bool { // the Nomad Agent lost contact with the list of Nomad Servers provided via // the Nomad Agent's heartbeat. If available, the backup servers are // populated via Consul. -func (p *RpcProxy) SetBackupServers(addrs []string) error { +func (p *RPCProxy) SetBackupServers(addrs []string) error { l := make([]*ServerEndpoint, 0, len(addrs)) for _, s := range addrs { s, err := newServer(s) @@ -215,11 +218,11 @@ func (p *RpcProxy) SetBackupServers(addrs []string) error { // AddPrimaryServer takes the RPC address of a Nomad server, creates a new // endpoint, and adds it to both the primaryServers list and the active // serverList used in the RPC Proxy. If the endpoint is not known by the -// RpcProxy, appends the endpoint to the list. The new endpoint will begin +// RPCProxy, appends the endpoint to the list. The new endpoint will begin // seeing use after the rebalance timer fires (or enough servers fail // organically). Any values in the primary server list are overridden by the // next successful heartbeat. -func (p *RpcProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint { +func (p *RPCProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint { s, err := newServer(rpcAddr) if err != nil { p.logger.Printf("[WARN] client.rpcproxy: unable to create new primary server from endpoint %q: %v", rpcAddr, err) @@ -298,34 +301,37 @@ func (l *serverList) String() string { // the server list. If the server at the front of the list has failed or // fails during an RPC call, it is rotated to the end of the list. If there // are no servers available, return nil. 
-func (p *RpcProxy) FindServer() *ServerEndpoint { +func (p *RPCProxy) FindServer() *ServerEndpoint { l := p.getServerList() numServers := len(l.L) if numServers == 0 { p.logger.Printf("[WARN] client.rpcproxy: No servers available") return nil - } else { - // Return whatever is at the front of the list because it is - // assumed to be the oldest in the server list (unless - - // hypothetically - the server list was rotated right after a - // server was added). - return l.L[0] } + + // Return whatever is at the front of the list because it is + // assumed to be the oldest in the server list (unless - + // hypothetically - the server list was rotated right after a + // server was added). + return l.L[0] } // getServerList is a convenience method which hides the locking semantics // of atomic.Value from the caller. -func (p *RpcProxy) getServerList() serverList { +func (p *RPCProxy) getServerList() serverList { return p.activatedList.Load().(serverList) } // saveServerList is a convenience method which hides the locking semantics // of atomic.Value from the caller. -func (p *RpcProxy) saveServerList(l serverList) { +func (p *RPCProxy) saveServerList(l serverList) { p.activatedList.Store(l) } -func (p *RpcProxy) LeaderAddr() string { +// LeaderAddr returns the current leader address. If an empty string, then +// the Nomad Server for this Nomad Agent is in the minority or the Nomad +// Servers are in the middle of an election. +func (p *RPCProxy) LeaderAddr() string { p.listLock.Lock() defer p.listLock.Unlock() return p.leaderAddr @@ -333,7 +339,7 @@ func (p *RpcProxy) LeaderAddr() string { // NotifyFailedServer marks the passed in server as "failed" by rotating it // to the end of the server list. -func (p *RpcProxy) NotifyFailedServer(s *ServerEndpoint) { +func (p *RPCProxy) NotifyFailedServer(s *ServerEndpoint) { l := p.getServerList() // If the server being failed is not the first server on the list, @@ -360,13 +366,15 @@ func (p *RpcProxy) NotifyFailedServer(s *ServerEndpoint) { } } -func (p *RpcProxy) NumNodes() int { +// NumNodes returns the estimated number of nodes according to the last Nomad +// Heartbeat. +func (p *RPCProxy) NumNodes() int { return p.numNodes } // NumServers takes out an internal "read lock" and returns the number of // servers. numServers includes both healthy and unhealthy servers. -func (p *RpcProxy) NumServers() int { +func (p *RPCProxy) NumServers() int { l := p.getServerList() return len(l.L) } @@ -383,7 +391,7 @@ func (p *RpcProxy) NumServers() int { // Unhealthy servers are removed from the server list during the next client // heartbeat. Before the newly shuffled server list is saved, the new remote // endpoint is tested to ensure its responsive. -func (p *RpcProxy) RebalanceServers() { +func (p *RPCProxy) RebalanceServers() { var serverListLocked bool p.serverListLock.Lock() serverListLocked = true @@ -495,7 +503,7 @@ func (p *RpcProxy) RebalanceServers() { // (i.e. was removed by Nomad during a PingNomadServer() call. Newly added // servers are appended to the list and other missing servers are removed // from the list. -func (p *RpcProxy) reconcileServerList(l *serverList) bool { +func (p *RPCProxy) reconcileServerList(l *serverList) bool { p.listLock.Lock() defer p.listLock.Unlock() @@ -559,7 +567,7 @@ func (p *RpcProxy) reconcileServerList(l *serverList) bool { // RemoveServer takes out an internal write lock and removes a server from // the activated server list. 
-func (p *RpcProxy) RemoveServer(s *ServerEndpoint) { +func (p *RPCProxy) RemoveServer(s *ServerEndpoint) { // Lock hierarchy protocol dictates serverListLock is acquired first. p.serverListLock.Lock() defer p.serverListLock.Unlock() @@ -577,7 +585,7 @@ func (p *RpcProxy) RemoveServer(s *ServerEndpoint) { } // refreshServerRebalanceTimer is only called once p.rebalanceTimer expires. -func (p *RpcProxy) refreshServerRebalanceTimer() time.Duration { +func (p *RPCProxy) refreshServerRebalanceTimer() time.Duration { l := p.getServerList() numServers := len(l.L) // Limit this connection's life based on the size (and health) of the @@ -595,14 +603,14 @@ func (p *RpcProxy) refreshServerRebalanceTimer() time.Duration { // ResetRebalanceTimer resets the rebalance timer. This method exists for // testing and should not be used directly. -func (p *RpcProxy) ResetRebalanceTimer() { +func (p *RPCProxy) ResetRebalanceTimer() { p.listLock.Lock() defer p.listLock.Unlock() p.rebalanceTimer.Reset(clientRPCMinReuseDuration) } // ServerRPCAddrs returns one RPC Address per server -func (p *RpcProxy) ServerRPCAddrs() []string { +func (p *RPCProxy) ServerRPCAddrs() []string { l := p.getServerList() serverAddrs := make([]string, 0, len(l.L)) for _, s := range l.L { @@ -617,7 +625,7 @@ func (p *RpcProxy) ServerRPCAddrs() []string { // automatically cycled to the end of the list. New servers are appended to // the list. The order of the server list must be shuffled periodically to // distribute load across all known and available Nomad servers. -func (p *RpcProxy) Run() { +func (p *RPCProxy) Run() { for { select { case <-p.rebalanceTimer.C: @@ -636,15 +644,15 @@ func (p *RpcProxy) Run() { // Nomad Servers that the Nomad Client should use for RPC requests. // RefreshServerLists does not rebalance its serverLists (that is handled // elsewhere via a periodic timer). New Nomad Servers learned via the -// heartbeat are appended to the RpcProxy's activated serverList. Servers +// heartbeat are appended to the RPCProxy's activated serverList. Servers // that are no longer present in the Heartbeat are removed immediately from // all server lists. Nomad Servers speaking a newer major or minor API // version are filtered from the serverList. -func (p *RpcProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNodes int32, leaderRpcAddr string) error { +func (p *RPCProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNodes int32, leaderRPCAddr string) error { // Merge all servers found in the response. Servers in the response // with newer API versions are filtered from the list. If the list - // is missing an address found in the RpcProxy's server list, remove - // it from the RpcProxy. + // is missing an address found in the RPCProxy's server list, remove + // it from the RPCProxy. p.serverListLock.Lock() defer p.serverListLock.Unlock() @@ -678,7 +686,7 @@ func (p *RpcProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNode // spamming the logs every heartbeat. // // TODO(sean@): Move the logging throttle logic into a - // dedicated logging package so RpcProxy does not have to + // dedicated logging package so RPCProxy does not have to // perform this accounting. 
if int32(p.configInfo.RpcMajorVersion()) < s.RpcMajorVersion || (int32(p.configInfo.RpcMajorVersion()) == s.RpcMajorVersion && @@ -755,7 +763,7 @@ func (p *RpcProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNode } p.numNodes = int(numNodes) - p.leaderAddr = leaderRpcAddr + p.leaderAddr = leaderRPCAddr p.saveServerList(newServerCfg) return nil diff --git a/client/rpcproxy/rpcproxy_test.go b/client/rpcproxy/rpcproxy_test.go index fe900e701b3..c24b24afcbe 100644 --- a/client/rpcproxy/rpcproxy_test.go +++ b/client/rpcproxy/rpcproxy_test.go @@ -90,25 +90,25 @@ func (s *fauxSerf) RpcMinorVersion() int { return s.rpcMinorVersion } -func testRpcProxy() (p *RpcProxy) { +func testRPCProxy() (p *RPCProxy) { logger := GetBufferedLogger() logger = log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - p = New(logger, shutdownCh, &fauxSerf{numNodes: 16384}, &fauxConnPool{}) + p = NewRPCProxy(logger, shutdownCh, &fauxSerf{numNodes: 16384}, &fauxConnPool{}) return p } -func testRpcProxyFailProb(failPct float64) (p *RpcProxy) { +func testRPCProxyFailProb(failPct float64) (p *RPCProxy) { logger := GetBufferedLogger() logger = log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - p = New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) + p = NewRPCProxy(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) return p } -// func (p *RpcProxy) AddPrimaryServer(server *ServerEndpoint) { -func TestRpcProxy_AddPrimaryServer(t *testing.T) { - p := testRpcProxy() +// func (p *RPCProxy) AddPrimaryServer(server *ServerEndpoint) { +func TestRPCProxy_AddPrimaryServer(t *testing.T) { + p := testRPCProxy() var num int num = p.NumServers() if num != 0 { @@ -154,9 +154,9 @@ func TestRpcProxy_AddPrimaryServer(t *testing.T) { } } -// func (p *RpcProxy) FindServer() (server *ServerEndpoint) { -func TestRpcProxy_FindServer(t *testing.T) { - p := testRpcProxy() +// func (p *RPCProxy) FindServer() (server *ServerEndpoint) { +func TestRPCProxy_FindServer(t *testing.T) { + p := testRPCProxy() if p.FindServer() != nil { t.Fatalf("Expected nil return") @@ -204,26 +204,26 @@ func TestRpcProxy_FindServer(t *testing.T) { } } -// func New(logger *log.Logger, shutdownCh chan struct{}) (p *RpcProxy) { -func TestRpcProxy_New(t *testing.T) { +// func New(logger *log.Logger, shutdownCh chan struct{}) (p *RPCProxy) { +func TestRPCProxy_New(t *testing.T) { logger := GetBufferedLogger() logger = log.New(os.Stderr, "", log.LstdFlags) shutdownCh := make(chan struct{}) - p := New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) + p := NewRPCProxy(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) if p == nil { - t.Fatalf("RpcProxy nil") + t.Fatalf("RPCProxy nil") } } -// func (p *RpcProxy) NotifyFailedServer(server *ServerEndpoint) { -func TestRpcProxy_NotifyFailedServer(t *testing.T) { - p := testRpcProxy() +// func (p *RPCProxy) NotifyFailedServer(server *ServerEndpoint) { +func TestRPCProxy_NotifyFailedServer(t *testing.T) { + p := testRPCProxy() if p.NumServers() != 0 { t.Fatalf("Expected zero servers to start") } - // Try notifying for a server that is not managed by RpcProxy + // Try notifying for a server that is not managed by RPCProxy s1Endpoint := makeServerEndpointName() s1 := p.AddPrimaryServer(s1Endpoint) if s1 == nil { @@ -257,7 +257,7 @@ func TestRpcProxy_NotifyFailedServer(t *testing.T) { t.Fatalf("Expected one server") } - // Re-add s2 so there are two servers in the RpcProxy server list + // Re-add s2 so there are two servers in the RPCProxy server 
list s2 = p.AddPrimaryServer(s2Endpoint) if p.NumServers() != 2 { t.Fatalf("Expected two servers") @@ -291,9 +291,9 @@ func TestRpcProxy_NotifyFailedServer(t *testing.T) { } } -// func (p *RpcProxy) NumServers() (numServers int) { -func TestRpcProxy_NumServers(t *testing.T) { - p := testRpcProxy() +// func (p *RPCProxy) NumServers() (numServers int) { +func TestRPCProxy_NumServers(t *testing.T) { + p := testRPCProxy() const maxNumServers = 100 serverList := make([]*ServerEndpoint, 0, maxNumServers) @@ -330,10 +330,10 @@ func TestRpcProxy_NumServers(t *testing.T) { } } -// func (p *RpcProxy) RebalanceServers() { -func TestRpcProxy_RebalanceServers(t *testing.T) { +// func (p *RPCProxy) RebalanceServers() { +func TestRPCProxy_RebalanceServers(t *testing.T) { const failPct = 0.5 - p := testRpcProxyFailProb(failPct) + p := testRPCProxyFailProb(failPct) const maxServers = 100 const numShuffleTests = 100 const uniquePassRate = 0.5 @@ -366,9 +366,9 @@ func TestRpcProxy_RebalanceServers(t *testing.T) { } } -// func (p *RpcProxy) RemoveServer(server *ServerEndpoint) { -func TestRpcProxy_RemoveServer(t *testing.T) { - p := testRpcProxy() +// func (p *RPCProxy) RemoveServer(server *ServerEndpoint) { +func TestRPCProxy_RemoveServer(t *testing.T) { + p := testRPCProxy() if p.NumServers() != 0 { t.Fatalf("Expected zero servers to start") } @@ -532,11 +532,11 @@ func TestRpcProxy_RemoveServer(t *testing.T) { } } -// func (p *RpcProxy) Start() { +// func (p *RPCProxy) Start() { // func (l *serverList) cycleServer() (servers []*Server) { -func TestRpcProxyInternal_cycleServer(t *testing.T) { - p := testRpcProxy() +func TestRPCProxyInternal_cycleServer(t *testing.T) { + p := testRPCProxy() l := p.getServerList() server0 := &ServerEndpoint{Name: "server1"} @@ -586,9 +586,9 @@ func TestRpcProxyInternal_cycleServer(t *testing.T) { } } -// func (p *RpcProxy) getServerList() serverList { -func TestRpcProxyInternal_getServerList(t *testing.T) { - p := testRpcProxy() +// func (p *RPCProxy) getServerList() serverList { +func TestRPCProxyInternal_getServerList(t *testing.T) { + p := testRPCProxy() l := p.getServerList() if l.L == nil { t.Fatalf("serverList.servers nil") @@ -599,8 +599,8 @@ func TestRpcProxyInternal_getServerList(t *testing.T) { } } -func TestRpcProxyInternal_New(t *testing.T) { - p := testRpcProxy() +func TestRPCProxyInternal_New(t *testing.T) { + p := testRPCProxy() if p == nil { t.Fatalf("bad") } @@ -614,8 +614,8 @@ func TestRpcProxyInternal_New(t *testing.T) { } } -// func (p *RpcProxy) reconcileServerList(l *serverList) bool { -func TestRpcProxyInternal_reconcileServerList(t *testing.T) { +// func (p *RPCProxy) reconcileServerList(l *serverList) bool { +func TestRPCProxyInternal_reconcileServerList(t *testing.T) { tests := []int{0, 1, 2, 3, 4, 5, 10, 100} for _, n := range tests { ok, err := test_reconcileServerList(n) @@ -630,14 +630,14 @@ func test_reconcileServerList(maxServers int) (bool, error) { // missing, the added have been added, and the original server is // present. 
const failPct = 0.5 - p := testRpcProxyFailProb(failPct) + p := testRPCProxyFailProb(failPct) var failedServers, healthyServers []*ServerEndpoint for i := 0; i < maxServers; i++ { nodeName := fmt.Sprintf("s%02d", i) node := &ServerEndpoint{Name: nodeName} - // Add 66% of servers to RpcProxy + // Add 66% of servers to RPCProxy if rand.Float64() > 0.33 { p.activateEndpoint(node) @@ -658,7 +658,7 @@ func test_reconcileServerList(maxServers int) (bool, error) { } } - // Randomize RpcProxy's server list + // Randomize RPCProxy's server list p.RebalanceServers() selectedServer := p.FindServer() @@ -670,7 +670,7 @@ func test_reconcileServerList(maxServers int) (bool, error) { } } - // Update RpcProxy's server list to be "healthy" based on Serf. + // Update RPCProxy's server list to be "healthy" based on Serf. // Reconcile this with origServers, which is shuffled and has a live // connection, but possibly out of date. origServers := p.getServerList() @@ -701,7 +701,7 @@ func test_reconcileServerList(maxServers int) (bool, error) { resultingServerMap[*s.Key()] = true } - // Test to make sure no failed servers are in the RpcProxy's + // Test to make sure no failed servers are in the RPCProxy's // list. Error if there are any failedServers in l.servers for _, s := range failedServers { _, ok := resultingServerMap[*s.Key()] @@ -726,7 +726,7 @@ func test_reconcileServerList(maxServers int) (bool, error) { } // func (l *serverList) refreshServerRebalanceTimer() { -func TestRpcProxyInternal_refreshServerRebalanceTimer(t *testing.T) { +func TestRPCProxyInternal_refreshServerRebalanceTimer(t *testing.T) { type clusterSizes struct { numNodes int numServers int @@ -765,7 +765,7 @@ func TestRpcProxyInternal_refreshServerRebalanceTimer(t *testing.T) { shutdownCh := make(chan struct{}) for i, s := range clusters { - p := New(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) + p := NewRPCProxy(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) for i := 0; i < s.numServers; i++ { nodeName := fmt.Sprintf("s%02d", i) p.activateEndpoint(&ServerEndpoint{Name: nodeName}) @@ -778,15 +778,15 @@ func TestRpcProxyInternal_refreshServerRebalanceTimer(t *testing.T) { } } -// func (p *RpcProxy) saveServerList(l serverList) { -func TestRpcProxyInternal_saveServerList(t *testing.T) { - p := testRpcProxy() +// func (p *RPCProxy) saveServerList(l serverList) { +func TestRPCProxyInternal_saveServerList(t *testing.T) { + p := testRPCProxy() // Initial condition func() { l := p.getServerList() if len(l.L) != 0 { - t.Fatalf("RpcProxy.saveServerList failed to load init config") + t.Fatalf("RPCProxy.saveServerList failed to load init config") } newServer := new(ServerEndpoint) @@ -799,7 +799,7 @@ func TestRpcProxyInternal_saveServerList(t *testing.T) { l1 := p.getServerList() t1NumServers := len(l1.L) if t1NumServers != 1 { - t.Fatalf("RpcProxy.saveServerList failed to save mutated config") + t.Fatalf("RPCProxy.saveServerList failed to save mutated config") } }() @@ -812,7 +812,7 @@ func TestRpcProxyInternal_saveServerList(t *testing.T) { l_orig := p.getServerList() origNumServers := len(l_orig.L) if origNumServers >= len(l.L) { - t.Fatalf("RpcProxy.saveServerList unsaved config overwrote original") + t.Fatalf("RPCProxy.saveServerList unsaved config overwrote original") } }() } diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index 6dc0872a52a..aaf466373b0 100644 --- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -139,7 +139,7 @@ func (s 
*HTTPServer) listServers(resp http.ResponseWriter, req *http.Request) (i return nil, CodedError(501, ErrInvalidMethod) } - peers := s.agent.client.RpcProxy().ServerRPCAddrs() + peers := s.agent.client.RPCProxy().ServerRPCAddrs() return peers, nil } @@ -158,7 +158,7 @@ func (s *HTTPServer) updateServers(resp http.ResponseWriter, req *http.Request) // Set the servers list into the client for _, server := range servers { s.agent.logger.Printf("[TRACE] Adding server %s to the client's primary server list", server) - se := client.AddPrimaryServerToRpcProxy(server) + se := client.AddPrimaryServerToRPCProxy(server) if se == nil { s.agent.logger.Printf("[ERR] Attempt to add server %q to client failed", server) } From ecd84f4be7af3c30416b5144531396cf42c1708d Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:54:56 -0400 Subject: [PATCH 150/166] Nomad does not use Serf at the client level. Use a hard lock. --- client/rpcproxy/rpcproxy.go | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 240835e6197..c630ec6365b 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -129,10 +129,6 @@ type RPCProxy struct { // connection pool. Pinger is an interface that wraps // client.ConnPool. connPoolPinger Pinger - - // notifyFailedBarrier is acts as a barrier to prevent queuing behind - // serverListLock and acts as a TryLock(). - notifyFailedBarrier int32 } // NewRPCProxy is the only way to safely create a new RPCProxy. @@ -348,11 +344,7 @@ func (p *RPCProxy) NotifyFailedServer(s *ServerEndpoint) { // the server to the end of the list. // Only rotate the server list when there is more than one server - if len(l.L) > 1 && l.L[0] == s && - // Use atomic.CAS to emulate a TryLock(). - atomic.CompareAndSwapInt32(&p.notifyFailedBarrier, 0, 1) { - defer atomic.StoreInt32(&p.notifyFailedBarrier, 0) - + if len(l.L) > 1 && l.L[0] == s { // Grab a lock, retest, and take the hit of cycling the first // server to the end. p.listLock.Lock() From 83e3df0b6a97fb59088b3c6305ede6c6da7af7c2 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 01:57:02 -0400 Subject: [PATCH 151/166] Rename listLock to activatedListLock --- client/rpcproxy/rpcproxy.go | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index c630ec6365b..a090a672638 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -88,8 +88,8 @@ type serverList struct { type RPCProxy struct { // activatedList manages the list of Nomad Servers that are eligible // to be queried by the Client agent. - activatedList atomic.Value - listLock sync.Mutex + activatedList atomic.Value + activatedListLock sync.Mutex // primaryServers is a list of servers found in the last heartbeat. // primaryServers are periodically reshuffled. 
Covered by @@ -202,8 +202,8 @@ func (p *RPCProxy) SetBackupServers(addrs []string) error { p.backupServers.L = l p.serverListLock.Unlock() - p.listLock.Lock() - defer p.listLock.Unlock() + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() for _, s := range l { p.activateEndpoint(s) } @@ -229,9 +229,9 @@ func (p *RPCProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint { p.primaryServers.L = append(p.primaryServers.L, s) p.serverListLock.Unlock() - p.listLock.Lock() + p.activatedListLock.Lock() p.activateEndpoint(s) - p.listLock.Unlock() + p.activatedListLock.Unlock() return s } @@ -328,8 +328,8 @@ func (p *RPCProxy) saveServerList(l serverList) { // the Nomad Server for this Nomad Agent is in the minority or the Nomad // Servers are in the middle of an election. func (p *RPCProxy) LeaderAddr() string { - p.listLock.Lock() - defer p.listLock.Unlock() + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() return p.leaderAddr } @@ -347,8 +347,8 @@ func (p *RPCProxy) NotifyFailedServer(s *ServerEndpoint) { if len(l.L) > 1 && l.L[0] == s { // Grab a lock, retest, and take the hit of cycling the first // server to the end. - p.listLock.Lock() - defer p.listLock.Unlock() + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() l = p.getServerList() if len(l.L) > 1 && l.L[0] == s { @@ -496,8 +496,8 @@ func (p *RPCProxy) RebalanceServers() { // servers are appended to the list and other missing servers are removed // from the list. func (p *RPCProxy) reconcileServerList(l *serverList) bool { - p.listLock.Lock() - defer p.listLock.Unlock() + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() // newServerList is a serverList that has been kept up-to-date with // join and leave events. @@ -564,8 +564,8 @@ func (p *RPCProxy) RemoveServer(s *ServerEndpoint) { p.serverListLock.Lock() defer p.serverListLock.Unlock() - p.listLock.Lock() - defer p.listLock.Unlock() + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() l := p.getServerList() k := s.Key() @@ -596,8 +596,8 @@ func (p *RPCProxy) refreshServerRebalanceTimer() time.Duration { // ResetRebalanceTimer resets the rebalance timer. This method exists for // testing and should not be used directly. 
func (p *RPCProxy) ResetRebalanceTimer() { - p.listLock.Lock() - defer p.listLock.Unlock() + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() p.rebalanceTimer.Reset(clientRPCMinReuseDuration) } @@ -723,8 +723,8 @@ func (p *RPCProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNode return nil } - p.listLock.Lock() - defer p.listLock.Unlock() + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() newServerCfg := p.getServerList() for k, v := range mergedPrimaryMap { switch v.state { From 2ac7ecb5b8a37b21707b50e8b03aa1209101e6f6 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 02:00:14 -0400 Subject: [PATCH 152/166] Remove useless statement --- api/nodes_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/api/nodes_test.go b/api/nodes_test.go index 53355c3eb2b..d546dd435cc 100644 --- a/api/nodes_test.go +++ b/api/nodes_test.go @@ -60,7 +60,6 @@ func TestNodes_PrefixList(t *testing.T) { return false, fmt.Errorf("expected 1 node, got: %d", n) } nodeID = out[0].ID - _ = out[0].Datacenter return true, nil }, func(err error) { t.Fatalf("err: %s", err) From 61a36a6d5410d53c123276d196860586d07ae031 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 02:16:28 -0400 Subject: [PATCH 153/166] Commit miss, bump to 5s --- command/agent/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/agent/config.go b/command/agent/config.go index 16f4256214d..091e37684d4 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -394,7 +394,7 @@ func DefaultConfig() *Config { ServerServiceName: "nomad", ClientServiceName: "nomad-client", AutoRegister: true, - Timeout: 500 * time.Millisecond, + Timeout: 5 * time.Second, }, Client: &ClientConfig{ Enabled: false, From aff951ca4e170c32eb05fc54de9f8b5d16b69c23 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 11:19:02 -0400 Subject: [PATCH 154/166] Always create a consul.Syncer. Use a default Consul Config if necessary. --- client/driver/executor/executor.go | 7 +++++++ command/agent/consul/syncer.go | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index b238aa2bc89..5f9d7d73d87 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -205,7 +205,14 @@ type UniversalExecutor struct { // NewExecutor returns an Executor func NewExecutor(logger *log.Logger) Executor { + shutdownCh := make(chan struct{}) + cs, err := consul.NewSyncer(nil, shutdownCh, logger) + if err != nil { + return err + } + exec := &UniversalExecutor{ + consulSyncer: cs, logger: logger, processExited: make(chan interface{}), shutdownCh: make(chan struct{}), diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index 0c06091d042..255f4c6addd 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -111,7 +111,14 @@ type Syncer struct { func NewSyncer(config *config.ConsulConfig, shutdownCh chan struct{}, logger *log.Logger) (*Syncer, error) { var err error var c *consul.Client + cfg := consul.DefaultConfig() + + // If a nil config was provided, fall back to the default config + if config == nil { + config = cfg + } + if config.Addr != "" { cfg.Address = config.Addr } From 91582dc87573ae9af7225d87a8432cdb3cceb6ee Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 11:29:57 -0400 Subject: [PATCH 155/166] Always create a consul.Syncer. Use a default Consul Config if necessary. 
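The gist of the change below: NewSyncer no longer assumes its caller hands it a populated *config.ConsulConfig; a nil value now falls back to the client's default Consul configuration, so callers such as the executor can always construct a Syncer. A minimal standalone sketch of that fallback pattern follows, using hypothetical Config/DefaultConfig/Syncer stand-ins rather than the real Nomad types:

    package main

    import (
        "fmt"
        "log"
        "os"
    )

    // Config stands in for the Consul-related configuration (hypothetical type).
    type Config struct {
        Addr string
    }

    // DefaultConfig mirrors the idea of falling back to sane defaults.
    func DefaultConfig() *Config {
        return &Config{Addr: "127.0.0.1:8500"}
    }

    // Syncer is a stand-in for consul.Syncer.
    type Syncer struct {
        cfg    *Config
        logger *log.Logger
    }

    // NewSyncer never requires a non-nil config: a nil value falls back to
    // DefaultConfig, so construction always succeeds for callers that have
    // no Consul settings of their own.
    func NewSyncer(cfg *Config, logger *log.Logger) (*Syncer, error) {
        if cfg == nil {
            cfg = DefaultConfig()
        }
        return &Syncer{cfg: cfg, logger: logger}, nil
    }

    func main() {
        logger := log.New(os.Stderr, "", log.LstdFlags)
        s, err := NewSyncer(nil, logger) // nil config is now valid
        if err != nil {
            logger.Fatalf("failed to create syncer: %v", err)
        }
        fmt.Println(s.cfg.Addr)
    }

The real NewSyncer in the diff below applies the same idea by substituting cconfig.DefaultConfig().ConsulConfig when the passed consulConfig is nil.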
--- client/driver/executor/executor.go | 3 ++- command/agent/consul/syncer.go | 35 +++++++++++++++--------------- nomad/structs/diff_test.go | 8 ++++++- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 5f9d7d73d87..f7d971be961 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -208,7 +208,8 @@ func NewExecutor(logger *log.Logger) Executor { shutdownCh := make(chan struct{}) cs, err := consul.NewSyncer(nil, shutdownCh, logger) if err != nil { - return err + logger.Printf("[ERROR] executor: failed to allocate new Consul Syncer: %v", err) + return nil } exec := &UniversalExecutor{ diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index 255f4c6addd..97ca8ed16bc 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -14,6 +14,7 @@ import ( "github.com/hashicorp/consul/lib" "github.com/hashicorp/go-multierror" + cconfig "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" "github.com/hashicorp/nomad/nomad/types" @@ -108,31 +109,31 @@ type Syncer struct { } // NewSyncer returns a new consul.Syncer -func NewSyncer(config *config.ConsulConfig, shutdownCh chan struct{}, logger *log.Logger) (*Syncer, error) { +func NewSyncer(consulConfig *config.ConsulConfig, shutdownCh chan struct{}, logger *log.Logger) (*Syncer, error) { var err error var c *consul.Client cfg := consul.DefaultConfig() // If a nil config was provided, fall back to the default config - if config == nil { - config = cfg + if consulConfig == nil { + consulConfig = cconfig.DefaultConfig().ConsulConfig } - if config.Addr != "" { - cfg.Address = config.Addr + if consulConfig.Addr != "" { + cfg.Address = consulConfig.Addr } - if config.Token != "" { - cfg.Token = config.Token + if consulConfig.Token != "" { + cfg.Token = consulConfig.Token } - if config.Auth != "" { + if consulConfig.Auth != "" { var username, password string - if strings.Contains(config.Auth, ":") { - split := strings.SplitN(config.Auth, ":", 2) + if strings.Contains(consulConfig.Auth, ":") { + split := strings.SplitN(consulConfig.Auth, ":", 2) username = split[0] password = split[1] } else { - username = config.Auth + username = consulConfig.Auth } cfg.HttpAuth = &consul.HttpBasicAuth{ @@ -140,14 +141,14 @@ func NewSyncer(config *config.ConsulConfig, shutdownCh chan struct{}, logger *lo Password: password, } } - if config.EnableSSL { + if consulConfig.EnableSSL { cfg.Scheme = "https" tlsCfg := consul.TLSConfig{ Address: cfg.Address, - CAFile: config.CAFile, - CertFile: config.CertFile, - KeyFile: config.KeyFile, - InsecureSkipVerify: !config.VerifySSL, + CAFile: consulConfig.CAFile, + CertFile: consulConfig.CertFile, + KeyFile: consulConfig.KeyFile, + InsecureSkipVerify: !consulConfig.VerifySSL, } tlsClientCfg, err := consul.SetupTLSConfig(&tlsCfg) if err != nil { @@ -157,7 +158,7 @@ func NewSyncer(config *config.ConsulConfig, shutdownCh chan struct{}, logger *lo TLSClientConfig: tlsClientCfg, } } - if config.EnableSSL && !config.VerifySSL { + if consulConfig.EnableSSL && !consulConfig.VerifySSL { cfg.HttpClient.Transport = &http.Transport{ TLSClientConfig: &tls.Config{ InsecureSkipVerify: true, diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index d56971f72b3..67d06af65ba 100644 --- a/nomad/structs/diff_test.go +++ 
b/nomad/structs/diff_test.go @@ -2486,6 +2486,12 @@ func TestTaskDiff(t *testing.T) { Old: "foo", New: "bar", }, + { + Type: DiffTypeNone, + Name: "ServiceID", + Old: "", + New: "", + }, }, }, }, @@ -2821,7 +2827,7 @@ func TestTaskDiff(t *testing.T) { } if !reflect.DeepEqual(actual, c.Expected) { - t.Fatalf("case %d: got:\n%#v\n want:\n%#v\n", + t.Errorf("case %d: got:\n%#v\n want:\n%#v\n", i+1, actual, c.Expected) } } From a0902c3f45a6958e06591941f72f187e0803aa56 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 15:45:54 -0400 Subject: [PATCH 156/166] Prevent duplicate servers being added in AddPrimaryServer. This logic was already present elsewhere and was missed in this one place. --- client/rpcproxy/rpcproxy.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index a090a672638..581c1fcd35d 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -225,7 +225,12 @@ func (p *RPCProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint { return nil } + k := s.Key() p.serverListLock.Lock() + if serverExists := p.primaryServers.serverExistByKey(k); serverExists { + p.serverListLock.Unlock() + return nil + } p.primaryServers.L = append(p.primaryServers.L, s) p.serverListLock.Unlock() @@ -257,6 +262,18 @@ func (l *serverList) cycleServer() (servers []*ServerEndpoint) { return newServers } +// serverExistByKey performs a search to see if a server exists in the +// serverList. Assumes the caller is holding at least a read lock. +func (l *serverList) serverExistByKey(targetKey *EndpointKey) bool { + var found bool + for _, server := range l.L { + if targetKey.Equal(server.Key()) { + found = true + } + } + return found +} + // removeServerByKey performs an inline removal of the first matching server func (l *serverList) removeServerByKey(targetKey *EndpointKey) { for i, s := range l.L { From 4e543b6d43bc43cfce7b885be0fdc82329d813f6 Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 16:46:49 -0400 Subject: [PATCH 157/166] Restore old behavior and have AddPrimaryServer() return a pointer to the existing server (vs nil when the server already exists). --- client/rpcproxy/rpcproxy.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 581c1fcd35d..635a190fc20 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -229,7 +229,7 @@ func (p *RPCProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint { p.serverListLock.Lock() if serverExists := p.primaryServers.serverExistByKey(k); serverExists { p.serverListLock.Unlock() - return nil + return s } p.primaryServers.L = append(p.primaryServers.L, s) p.serverListLock.Unlock() @@ -490,7 +490,7 @@ func (p *RPCProxy) RebalanceServers() { // Verify that all servers are present. Reconcile will save the // final serverList. 
if p.reconcileServerList(l) { - p.logger.Printf("[DEBUG] client.rpcproxy: Rebalanced %d servers, next active server is %s/%v", len(l.L), l.L[0].String(), l) + p.logger.Printf("[TRACE] client.rpcproxy: Rebalanced %d servers, next active server is %s", len(l.L), l.L[0].String()) } else { // reconcileServerList failed because Nomad removed the // server that was at the front of the list that had From 4826728bb95c3af2fa578f98c1c7321760ca533c Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 16:50:35 -0400 Subject: [PATCH 158/166] Fix another unit test not expecting ServiceID --- nomad/structs/diff_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index 67d06af65ba..0f033a524eb 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -2757,6 +2757,12 @@ func TestTaskDiff(t *testing.T) { Old: "", New: "", }, + { + Type: DiffTypeNone, + Name: "ServiceID", + Old: "", + New: "", + }, }, Objects: []*ObjectDiff{ { From 5ffd9707b001790372a0f9b4b8c2a23e2c93879c Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 22:14:03 -0400 Subject: [PATCH 159/166] Expose rpcproxy's `ServerEndpoint()` constructor, `newServer()` as `NewServerEndpoint()` --- client/rpcproxy/rpcproxy.go | 6 +++--- client/rpcproxy/server_endpoint.go | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 635a190fc20..342f29d7359 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -190,7 +190,7 @@ func (p *RPCProxy) activateEndpoint(s *ServerEndpoint) bool { func (p *RPCProxy) SetBackupServers(addrs []string) error { l := make([]*ServerEndpoint, 0, len(addrs)) for _, s := range addrs { - s, err := newServer(s) + s, err := NewServerEndpoint(s) if err != nil { p.logger.Printf("[WARN] client.rpcproxy: unable to create backup server %q: %v", s, err) return fmt.Errorf("unable to create new backup server from %q: %v", s, err) @@ -219,7 +219,7 @@ func (p *RPCProxy) SetBackupServers(addrs []string) error { // organically). Any values in the primary server list are overridden by the // next successful heartbeat. func (p *RPCProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint { - s, err := newServer(rpcAddr) + s, err := NewServerEndpoint(rpcAddr) if err != nil { p.logger.Printf("[WARN] client.rpcproxy: unable to create new primary server from endpoint %q: %v", rpcAddr, err) return nil @@ -711,7 +711,7 @@ func (p *RPCProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNode continue } - server, err := newServer(s.RpcAdvertiseAddr) + server, err := NewServerEndpoint(s.RpcAdvertiseAddr) if err != nil { p.logger.Printf("[WARN] client.rpcproxy: Unable to create a server from %q: %v", s.RpcAdvertiseAddr, err) continue diff --git a/client/rpcproxy/server_endpoint.go b/client/rpcproxy/server_endpoint.go index 5ebca184dfa..d9b1add5b6a 100644 --- a/client/rpcproxy/server_endpoint.go +++ b/client/rpcproxy/server_endpoint.go @@ -40,12 +40,12 @@ func (s *ServerEndpoint) Key() *EndpointKey { } } -// newServer creates a new Server instance with a resolvable endpoint. -// `name` can be either an IP address or a DNS name. If `name` is a DNS -// name, it must be resolvable to an IP address (most inputs are IP +// NewServerEndpoint creates a new Server instance with a resolvable +// endpoint. `name` can be either an IP address or a DNS name. 
If `name` is +// a DNS name, it must be resolvable to an IP address (most inputs are IP // addresses, not DNS names, but both work equally well when the name is // resolvable). -func newServer(name string) (*ServerEndpoint, error) { +func NewServerEndpoint(name string) (*ServerEndpoint, error) { s := &ServerEndpoint{ Name: name, } From 6819f2b68d91151df7d492768b67760f2c4f435b Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 23:05:14 -0400 Subject: [PATCH 160/166] Query for the Nomad service across multiple Consul datacenters. --- client/client.go | 78 +++++++++++++++++++++++++++------- command/agent/consul/syncer.go | 4 ++ 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/client/client.go b/client/client.go index a0503d0fefa..35c11834c4a 100644 --- a/client/client.go +++ b/client/client.go @@ -37,6 +37,10 @@ const ( // open to a server clientMaxStreams = 2 + // datacenterQueryLimit searches through up to this many adjacent + // datacenters looking for the Nomad server service. + datacenterQueryLimit = 5 + // registerRetryIntv is minimum interval on which we retry // registration. We pick a value between this and 2x this. registerRetryIntv = 15 * time.Second @@ -1282,33 +1286,75 @@ func (c *Client) setupConsulSyncer() error { c.heartbeatLock.Unlock() c.logger.Printf("[TRACE] client.consul: lost heartbeat with Nomad quorum, falling back to Consul for server list") - nomadServerServiceName := c.config.ConsulConfig.ServerServiceName - services, _, err := c.consulSyncer.ConsulClient().Catalog(). - Service(nomadServerServiceName, consul.ServiceTagRpc, - &consulapi.QueryOptions{AllowStale: true}) + consulCatalog := c.consulSyncer.ConsulClient().Catalog() + dcs, err := consulCatalog.Datacenters() if err != nil { - return fmt.Errorf("client.consul: unable to query service %q: %v", nomadServerServiceName, err) + return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err) } + dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)] + // Walk the list of Consul datacenters randomly in order to + // search for the Nomad server service. 
+ shuffleStrings(dcs) - if len(services) == 0 { - return fmt.Errorf("client.consul: no Nomad servers advertising service %q", nomadServerServiceName) + nomadServerServiceName := c.config.ConsulConfig.ServerServiceName + var mErr multierror.Error + const defaultMaxNumNomadServers = 8 + nomadServerServices := make([]string, 0, defaultMaxNumNomadServers) + for _, dc := range dcs { + opts := &consulapi.QueryOptions{ + AllowStale: true, + Datacenter: dc, + Near: "_agent", + WaitTime: consul.DefaultQueryWaitDuration, + } + consulServices, _, err := consulCatalog.Service(nomadServerServiceName, consul.ServiceTagRpc, opts) + if err != nil { + mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %q from Consul datacenter %q: %v", nomadServerServiceName, dc, err)) + continue + } + + for _, s := range consulServices { + port := strconv.FormatInt(int64(s.ServicePort), 10) + addr := s.ServiceAddress + if addr == "" { + addr = s.Address + } + serverAddr := net.JoinHostPort(addr, port) + serverEndpoint, err := rpcproxy.NewServerEndpoint(serverAddr) + if err != nil { + mErr.Errors = append(mErr.Errors, err) + continue + } + var ok bool + if ok, err = c.connPool.PingNomadServer(c.Region(), c.RpcMajorVersion(), serverEndpoint); err != nil { + mErr.Errors = append(mErr.Errors, err) + continue + } + if ok { + nomadServerServices = append(nomadServerServices, serverAddr) + } + } + // Break if at least one Nomad Server was successfully pinged + if len(nomadServerServices) > 0 { + break + } } + if len(nomadServerServices) == 0 { + if len(mErr.Errors) > 0 { + return mErr.ErrorOrNil() + } - serverAddrs := make([]string, 0, len(services)) - for _, s := range services { - port := strconv.FormatInt(int64(s.ServicePort), 10) - addr := s.ServiceAddress - if addr == "" { - addr = s.Address + for i, _ := range dcs { + dcs[i] = fmt.Sprintf("%q", dcs[i]) } - serverAddrs = append(serverAddrs, net.JoinHostPort(addr, port)) + return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %s", nomadServerServiceName, dcs) } c.heartbeatLock.Lock() if atomic.LoadInt32(&c.lastHeartbeatFromQuorum) == 1 && now.Before(c.consulPullHeartbeatDeadline) { c.heartbeatLock.Unlock() // Common, healthy path - if err := c.rpcProxy.SetBackupServers(serverAddrs); err != nil { + if err := c.rpcProxy.SetBackupServers(nomadServerServices); err != nil { return fmt.Errorf("client.consul: unable to set backup servers: %v", err) } } else { @@ -1322,7 +1368,7 @@ func (c *Client) setupConsulSyncer() error { // eventually, hopefully, find their way to a Nomad // Server that has quorum (assuming Consul has a // server list that is in the majority). - for _, s := range serverAddrs { + for _, s := range nomadServerServices { c.rpcProxy.AddPrimaryServer(s) } } diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index 97ca8ed16bc..c3062ad502d 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -43,6 +43,10 @@ const ( // the check result ttlCheckBuffer = 31 * time.Second + // DefaultQueryWaitDuration is the max duration the Consul Agent will + // spend waiting for a response from a Consul Query. + DefaultQueryWaitDuration = 2 * time.Second + // ServiceTagHttp is the tag assigned to HTTP services ServiceTagHttp = "http" From 5331ea7bc6a07df49df97129007e0109ccbdb5ac Mon Sep 17 00:00:00 2001 From: Sean Chittenden Date: Fri, 10 Jun 2016 23:26:15 -0400 Subject: [PATCH 161/166] goling(1) compliance pass (e.g. 
Rpc* -> RPC) --- client/client.go | 23 ++++----- client/rpcproxy/rpcproxy.go | 22 ++++----- client/rpcproxy/rpcproxy_test.go | 6 +-- command/agent/agent.go | 82 ++++++++++++++++---------------- command/agent/agent_test.go | 12 ++--- command/agent/consul/syncer.go | 10 ++-- nomad/node_endpoint.go | 6 +-- nomad/structs/structs.go | 6 +-- 8 files changed, 85 insertions(+), 82 deletions(-) diff --git a/client/client.go b/client/client.go index 35c11834c4a..89c538f3fe5 100644 --- a/client/client.go +++ b/client/client.go @@ -302,15 +302,15 @@ func (c *Client) Region() string { return c.config.Region } -// RpcMajorVersion returns the structs.ApiMajorVersion supported by the +// RPCMajorVersion returns the structs.ApiMajorVersion supported by the // client. -func (c *Client) RpcMajorVersion() int { +func (c *Client) RPCMajorVersion() int { return structs.ApiMajorVersion } -// RpcMinorVersion returns the structs.ApiMinorVersion supported by the +// RPCMinorVersion returns the structs.ApiMinorVersion supported by the // client. -func (c *Client) RpcMinorVersion() int { +func (c *Client) RPCMinorVersion() int { return structs.ApiMinorVersion } @@ -354,7 +354,7 @@ func (c *Client) RPC(method string, args interface{}, reply interface{}) error { } // Make the RPC request - if err := c.connPool.RPC(c.Region(), server.Addr, c.RpcMajorVersion(), method, args, reply); err != nil { + if err := c.connPool.RPC(c.Region(), server.Addr, c.RPCMajorVersion(), method, args, reply); err != nil { c.rpcProxy.NotifyFailedServer(server) c.logger.Printf("[ERR] client: RPC failed to server %s: %v", server.Addr, err) return err @@ -1307,7 +1307,7 @@ func (c *Client) setupConsulSyncer() error { Near: "_agent", WaitTime: consul.DefaultQueryWaitDuration, } - consulServices, _, err := consulCatalog.Service(nomadServerServiceName, consul.ServiceTagRpc, opts) + consulServices, _, err := consulCatalog.Service(nomadServerServiceName, consul.ServiceTagRPC, opts) if err != nil { mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %q from Consul datacenter %q: %v", nomadServerServiceName, dc, err)) continue @@ -1326,7 +1326,7 @@ func (c *Client) setupConsulSyncer() error { continue } var ok bool - if ok, err = c.connPool.PingNomadServer(c.Region(), c.RpcMajorVersion(), serverEndpoint); err != nil { + if ok, err = c.connPool.PingNomadServer(c.Region(), c.RPCMajorVersion(), serverEndpoint); err != nil { mErr.Errors = append(mErr.Errors, err) continue } @@ -1344,7 +1344,7 @@ func (c *Client) setupConsulSyncer() error { return mErr.ErrorOrNil() } - for i, _ := range dcs { + for i := range dcs { dcs[i] = fmt.Sprintf("%q", dcs[i]) } return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %s", nomadServerServiceName, dcs) @@ -1381,7 +1381,7 @@ func (c *Client) setupConsulSyncer() error { const estInitialConsulServices = 8 const serviceGroupName = "executor" services := make([]*structs.ConsulService, 0, estInitialConsulServices) - for allocId, ar := range c.getAllocRunners() { + for allocID, ar := range c.getAllocRunners() { ar.taskStatusLock.RLock() taskStates := copyTaskStates(ar.taskStates) ar.taskStatusLock.RUnlock() @@ -1390,10 +1390,10 @@ func (c *Client) setupConsulSyncer() error { if tr, ok := ar.tasks[taskName]; ok { for _, service := range tr.task.ConsulServices { if service.Name == "" { - service.Name = fmt.Sprintf("%s-%s", tr.task.Name, allocId) + service.Name = fmt.Sprintf("%s-%s", tr.task.Name, allocID) } if service.ServiceID == "" { - service.ServiceID = fmt.Sprintf("%s-%s:%s/%s", 
c.consulSyncer.GenerateServiceID(serviceGroupName, service), tr.task.Name, allocId) + service.ServiceID = fmt.Sprintf("%s-%s:%s/%s", c.consulSyncer.GenerateServiceID(serviceGroupName, service), tr.task.Name, allocID) } services = append(services, service) } @@ -1461,6 +1461,7 @@ func (c *Client) emitStats(hStats *stats.HostStats) { } } +// RPCProxy returns the Client's RPCProxy instance func (c *Client) RPCProxy() *rpcproxy.RPCProxy { return c.rpcProxy } diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go index 342f29d7359..ee273275569 100644 --- a/client/rpcproxy/rpcproxy.go +++ b/client/rpcproxy/rpcproxy.go @@ -63,8 +63,8 @@ const ( // configuration to prevents a cyclic import dependency. type NomadConfigInfo interface { Datacenter() string - RpcMajorVersion() int - RpcMinorVersion() int + RPCMajorVersion() int + RPCMinorVersion() int Region() string } @@ -468,7 +468,7 @@ func (p *RPCProxy) RebalanceServers() { // detect the failed node. selectedServer := l.L[0] - ok, err := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RpcMajorVersion(), selectedServer) + ok, err := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RPCMajorVersion(), selectedServer) if ok { foundHealthyServer = true break @@ -697,23 +697,23 @@ func (p *RPCProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNode // TODO(sean@): Move the logging throttle logic into a // dedicated logging package so RPCProxy does not have to // perform this accounting. - if int32(p.configInfo.RpcMajorVersion()) < s.RpcMajorVersion || - (int32(p.configInfo.RpcMajorVersion()) == s.RpcMajorVersion && - int32(p.configInfo.RpcMinorVersion()) < s.RpcMinorVersion) { + if int32(p.configInfo.RPCMajorVersion()) < s.RPCMajorVersion || + (int32(p.configInfo.RPCMajorVersion()) == s.RPCMajorVersion && + int32(p.configInfo.RPCMinorVersion()) < s.RPCMinorVersion) { now := time.Now() - t, ok := p.rpcAPIMismatchThrottle[s.RpcAdvertiseAddr] + t, ok := p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr] if ok && t.After(now) { continue } - p.logger.Printf("[WARN] client.rpcproxy: API mismatch between client version (v%d.%d) and server version (v%d.%d), ignoring server %q", p.configInfo.RpcMajorVersion(), p.configInfo.RpcMinorVersion(), s.RpcMajorVersion, s.RpcMinorVersion, s.RpcAdvertiseAddr) - p.rpcAPIMismatchThrottle[s.RpcAdvertiseAddr] = now.Add(rpcAPIMismatchLogRate) + p.logger.Printf("[WARN] client.rpcproxy: API mismatch between client version (v%d.%d) and server version (v%d.%d), ignoring server %q", p.configInfo.RPCMajorVersion(), p.configInfo.RPCMinorVersion(), s.RPCMajorVersion, s.RPCMinorVersion, s.RPCAdvertiseAddr) + p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr] = now.Add(rpcAPIMismatchLogRate) continue } - server, err := NewServerEndpoint(s.RpcAdvertiseAddr) + server, err := NewServerEndpoint(s.RPCAdvertiseAddr) if err != nil { - p.logger.Printf("[WARN] client.rpcproxy: Unable to create a server from %q: %v", s.RpcAdvertiseAddr, err) + p.logger.Printf("[WARN] client.rpcproxy: Unable to create a server from %q: %v", s.RPCAdvertiseAddr, err) continue } diff --git a/client/rpcproxy/rpcproxy_test.go b/client/rpcproxy/rpcproxy_test.go index c24b24afcbe..bfe2ac3cad2 100644 --- a/client/rpcproxy/rpcproxy_test.go +++ b/client/rpcproxy/rpcproxy_test.go @@ -82,11 +82,11 @@ func (s *fauxSerf) Datacenter() string { return s.datacenter } -func (s *fauxSerf) RpcMajorVersion() int { +func (s *fauxSerf) RPCMajorVersion() int { return s.rpcMajorVersion } -func (s *fauxSerf) RpcMinorVersion() int { 
+func (s *fauxSerf) RPCMinorVersion() int { return s.rpcMinorVersion } @@ -645,7 +645,7 @@ func test_reconcileServerList(maxServers int) (bool, error) { // failPct of the servers for the reconcile. This // allows for the selected server to no longer be // healthy for the reconcile below. - if ok, _ := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RpcMajorVersion(), node); ok { + if ok, _ := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RPCMajorVersion(), node); ok { // Will still be present healthyServers = append(healthyServers, node) } else { diff --git a/command/agent/agent.go b/command/agent/agent.go index bf50d93c070..6ebbfdd5c1d 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -33,12 +33,12 @@ type Agent struct { consulSyncer *consul.Syncer client *client.Client - clientHttpAddr string - clientRpcAddr string + clientHTTPAddr string + clientRPCAddr string server *nomad.Server - serverHttpAddr string - serverRpcAddr string + serverHTTPAddr string + serverRPCAddr string serverSerfAddr string shutdown bool @@ -174,35 +174,35 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { // Resolve the Server's HTTP Address if a.config.AdvertiseAddrs.HTTP != "" { - a.serverHttpAddr = a.config.AdvertiseAddrs.HTTP + a.serverHTTPAddr = a.config.AdvertiseAddrs.HTTP } else if a.config.Addresses.HTTP != "" { - a.serverHttpAddr = fmt.Sprintf("%v:%v", a.config.Addresses.HTTP, a.config.Ports.HTTP) + a.serverHTTPAddr = fmt.Sprintf("%v:%v", a.config.Addresses.HTTP, a.config.Ports.HTTP) } else if a.config.BindAddr != "" { - a.serverHttpAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.HTTP) + a.serverHTTPAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.HTTP) } else { - a.serverHttpAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) + a.serverHTTPAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) } - addr, err := net.ResolveTCPAddr("tcp", a.serverHttpAddr) + addr, err := net.ResolveTCPAddr("tcp", a.serverHTTPAddr) if err != nil { - return nil, fmt.Errorf("error resolving HTTP addr %q: %v:", a.serverHttpAddr, err) + return nil, fmt.Errorf("error resolving HTTP addr %q: %v", a.serverHTTPAddr, err) } - a.serverHttpAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + a.serverHTTPAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) // Resolve the Server's RPC Address if a.config.AdvertiseAddrs.RPC != "" { - a.serverRpcAddr = a.config.AdvertiseAddrs.RPC + a.serverRPCAddr = a.config.AdvertiseAddrs.RPC } else if a.config.Addresses.RPC != "" { - a.serverRpcAddr = fmt.Sprintf("%v:%v", a.config.Addresses.RPC, a.config.Ports.RPC) + a.serverRPCAddr = fmt.Sprintf("%v:%v", a.config.Addresses.RPC, a.config.Ports.RPC) } else if a.config.BindAddr != "" { - a.serverRpcAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.RPC) + a.serverRPCAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.RPC) } else { - a.serverRpcAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.RPC) + a.serverRPCAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.RPC) } - addr, err = net.ResolveTCPAddr("tcp", a.serverRpcAddr) + addr, err = net.ResolveTCPAddr("tcp", a.serverRPCAddr) if err != nil { - return nil, fmt.Errorf("error resolving RPC addr %q: %v:", a.serverRpcAddr, err) + return nil, fmt.Errorf("error resolving RPC addr %q: %v", a.serverRPCAddr, err) } - a.serverRpcAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + a.serverRPCAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) // 
Resolve the Server's Serf Address if a.config.AdvertiseAddrs.Serf != "" { @@ -216,7 +216,7 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { } addr, err = net.ResolveTCPAddr("tcp", a.serverSerfAddr) if err != nil { - return nil, fmt.Errorf("error resolving Serf addr %q: %v:", a.serverSerfAddr, err) + return nil, fmt.Errorf("error resolving Serf addr %q: %v", a.serverSerfAddr, err) } a.serverSerfAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) @@ -292,37 +292,37 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { // Resolve the Client's HTTP address if a.config.AdvertiseAddrs.HTTP != "" { - a.clientHttpAddr = a.config.AdvertiseAddrs.HTTP + a.clientHTTPAddr = a.config.AdvertiseAddrs.HTTP } else if a.config.Addresses.HTTP != "" { - a.clientHttpAddr = fmt.Sprintf("%v:%v", a.config.Addresses.HTTP, a.config.Ports.HTTP) + a.clientHTTPAddr = fmt.Sprintf("%v:%v", a.config.Addresses.HTTP, a.config.Ports.HTTP) } else if a.config.BindAddr != "" { - a.clientHttpAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.HTTP) + a.clientHTTPAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.HTTP) } else { - a.clientHttpAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) + a.clientHTTPAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) } - addr, err := net.ResolveTCPAddr("tcp", a.clientHttpAddr) + addr, err := net.ResolveTCPAddr("tcp", a.clientHTTPAddr) if err != nil { - return nil, fmt.Errorf("error resolving HTTP addr %q: %v:", a.clientHttpAddr, err) + return nil, fmt.Errorf("error resolving HTTP addr %q: %v", a.clientHTTPAddr, err) } httpAddr := fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) conf.Node.HTTPAddr = httpAddr - a.clientHttpAddr = httpAddr + a.clientHTTPAddr = httpAddr // Resolve the Client's RPC address if a.config.AdvertiseAddrs.RPC != "" { - a.clientRpcAddr = a.config.AdvertiseAddrs.RPC + a.clientRPCAddr = a.config.AdvertiseAddrs.RPC } else if a.config.Addresses.RPC != "" { - a.clientRpcAddr = fmt.Sprintf("%v:%v", a.config.Addresses.RPC, a.config.Ports.RPC) + a.clientRPCAddr = fmt.Sprintf("%v:%v", a.config.Addresses.RPC, a.config.Ports.RPC) } else if a.config.BindAddr != "" { - a.clientRpcAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.RPC) + a.clientRPCAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.RPC) } else { - a.clientRpcAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.RPC) + a.clientRPCAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.RPC) } - addr, err = net.ResolveTCPAddr("tcp", a.clientRpcAddr) + addr, err = net.ResolveTCPAddr("tcp", a.clientRPCAddr) if err != nil { - return nil, fmt.Errorf("error resolving RPC addr %q: %v:", a.clientRpcAddr, err) + return nil, fmt.Errorf("error resolving RPC addr %q: %v", a.clientRPCAddr, err) } - a.clientRpcAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + a.clientRPCAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) // Reserve resources on the node. 
r := conf.Node.Reserved @@ -372,13 +372,13 @@ func (a *Agent) setupServer() error { a.consulSyncer.SetServices(serviceGroupName, []*structs.ConsulService{ &structs.ConsulService{ Name: a.config.Consul.ServerServiceName, - PortLabel: a.serverHttpAddr, - Tags: []string{consul.ServiceTagHttp}, + PortLabel: a.serverHTTPAddr, + Tags: []string{consul.ServiceTagHTTP}, }, &structs.ConsulService{ Name: a.config.Consul.ServerServiceName, - PortLabel: a.serverRpcAddr, - Tags: []string{consul.ServiceTagRpc}, + PortLabel: a.serverRPCAddr, + Tags: []string{consul.ServiceTagRPC}, }, &structs.ConsulService{ PortLabel: a.serverSerfAddr, @@ -423,13 +423,13 @@ func (a *Agent) setupClient() error { a.consulSyncer.SetServices(serviceGroupName, []*structs.ConsulService{ &structs.ConsulService{ Name: a.config.Consul.ClientServiceName, - PortLabel: a.clientHttpAddr, - Tags: []string{consul.ServiceTagHttp}, + PortLabel: a.clientHTTPAddr, + Tags: []string{consul.ServiceTagHTTP}, }, &structs.ConsulService{ Name: a.config.Consul.ClientServiceName, - PortLabel: a.clientRpcAddr, - Tags: []string{consul.ServiceTagRpc}, + PortLabel: a.clientRPCAddr, + Tags: []string{consul.ServiceTagRPC}, }, }) } diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 1d7eb3e1570..aae664f2114 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -119,10 +119,10 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.RPCAdvertise; addr.IP.String() != "127.0.0.1" || addr.Port != 4001 { t.Fatalf("bad rpc advertise addr: %#v", addr) } - if addr := a.serverHttpAddr; addr != "10.10.11.1:4005" { + if addr := a.serverHTTPAddr; addr != "10.10.11.1:4005" { t.Fatalf("expect 10.11.11.1:4005, got: %v", addr) } - if addr := a.serverRpcAddr; addr != "127.0.0.1:4001" { + if addr := a.serverRPCAddr; addr != "127.0.0.1:4001" { t.Fatalf("expect 127.0.0.1:4001, got: %v", addr) } @@ -158,11 +158,11 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.SerfConfig.MemberlistConfig.BindAddr; addr != "127.0.0.2" { t.Fatalf("expect 127.0.0.2, got: %s", addr) } - if addr := a.serverHttpAddr; addr != "127.0.0.2:4646" { + if addr := a.serverHTTPAddr; addr != "127.0.0.2:4646" { t.Fatalf("expect 127.0.0.2:4646, got: %s", addr) } // NOTE: AdvertiseAddr > Addresses > BindAddr > Defaults - if addr := a.serverRpcAddr; addr != "127.0.0.1:4001" { + if addr := a.serverRPCAddr; addr != "127.0.0.1:4001" { t.Fatalf("expect 127.0.0.1:4001, got: %s", addr) } if addr := a.serverSerfAddr; addr != "127.0.0.1:4000" { @@ -211,10 +211,10 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := out.SerfConfig.MemberlistConfig.BindAddr; addr != "127.0.0.3" { t.Fatalf("expect 127.0.0.3, got: %s", addr) } - if addr := a.serverHttpAddr; addr != "127.0.0.3:4646" { + if addr := a.serverHTTPAddr; addr != "127.0.0.3:4646" { t.Fatalf("expect 127.0.0.3:4646, got: %s", addr) } - if addr := a.serverRpcAddr; addr != "127.0.0.3:4647" { + if addr := a.serverRPCAddr; addr != "127.0.0.3:4647" { t.Fatalf("expect 127.0.0.3:4647, got: %s", addr) } if addr := a.serverSerfAddr; addr != "127.0.0.3:4648" { diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go index c3062ad502d..474651f6e3d 100644 --- a/command/agent/consul/syncer.go +++ b/command/agent/consul/syncer.go @@ -47,11 +47,11 @@ const ( // spend waiting for a response from a Consul Query. 
 	DefaultQueryWaitDuration = 2 * time.Second

-	// ServiceTagHttp is the tag assigned to HTTP services
-	ServiceTagHttp = "http"
+	// ServiceTagHTTP is the tag assigned to HTTP services
+	ServiceTagHTTP = "http"

-	// ServiceTagRpc is the tag assigned to RPC services
-	ServiceTagRpc = "rpc"
+	// ServiceTagRPC is the tag assigned to RPC services
+	ServiceTagRPC = "rpc"

 	// ServiceTagSerf is the tag assigned to Serf services
 	ServiceTagSerf = "serf"
@@ -895,6 +895,7 @@ func (c *Syncer) AddPeriodicHandler(name string, fn types.PeriodicCallback) bool
 	return true
 }

+// NumHandlers returns the number of callbacks registered with the syncer
 func (c *Syncer) NumHandlers() int {
 	c.periodicLock.RLock()
 	defer c.periodicLock.RUnlock()
@@ -908,6 +909,7 @@ func (c *Syncer) RemovePeriodicHandler(name string) {
 	delete(c.periodicCallbacks, name)
 }

+// ConsulClient returns the Consul client used by the Syncer.
 func (c *Syncer) ConsulClient() *consul.Client {
 	return c.client
 }
diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go
index 4784a6fe353..25c8e118ef8 100644
--- a/nomad/node_endpoint.go
+++ b/nomad/node_endpoint.go
@@ -126,9 +126,9 @@ func (n *Node) constructNodeServerInfoResponse(snap *state.StateSnapshot, reply
 	for k, v := range n.srv.localPeers {
 		reply.Servers = append(reply.Servers,
 			&structs.NodeServerInfo{
-				RpcAdvertiseAddr: k,
-				RpcMajorVersion:  int32(v.MajorVersion),
-				RpcMinorVersion:  int32(v.MinorVersion),
+				RPCAdvertiseAddr: k,
+				RPCMajorVersion:  int32(v.MajorVersion),
+				RPCMinorVersion:  int32(v.MinorVersion),
 				Datacenter:       v.Datacenter,
 			})
 	}
diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go
index 1f6c80bb49d..ea323cb1dd2 100644
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@@ -171,15 +171,15 @@ type NodeDeregisterRequest struct {
 type NodeServerInfo struct {
 	// RPCAdvertiseAddr is the IP endpoint that a Nomad Server wishes to
 	// be contacted at for RPCs.
-	RpcAdvertiseAddr string
+	RPCAdvertiseAddr string

 	// RpcMajorVersion is the major version number the Nomad Server
 	// supports
-	RpcMajorVersion int32
+	RPCMajorVersion int32

 	// RpcMinorVersion is the minor version number the Nomad Server
 	// supports
-	RpcMinorVersion int32
+	RPCMinorVersion int32

 	// Datacenter is the datacenter that a Nomad server belongs to
 	Datacenter string

From 2879c33c8fec103fe15c18216bf718bec65f4246 Mon Sep 17 00:00:00 2001
From: Sean Chittenden
Date: Fri, 10 Jun 2016 23:43:54 -0400
Subject: [PATCH 162/166] Perform a nil-check for Executor's consulServices.

Executors can `Shutdown()` before calling `SyncServices()`.
---
 client/driver/executor/executor.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go
index f7d971be961..063db646e7d 100644
--- a/client/driver/executor/executor.go
+++ b/client/driver/executor/executor.go
@@ -425,7 +425,9 @@ func (e *UniversalExecutor) Exit() error {
 		e.lre.Close()
 		e.lro.Close()

-	e.consulSyncer.Shutdown()
+	if e.consulSyncer != nil {
+		e.consulSyncer.Shutdown()
+	}

 	// If the executor did not launch a process, return.
 	if e.command == nil {

From 183b51d8195b7c212e7511583120261c45d100dc Mon Sep 17 00:00:00 2001
From: Sean Chittenden
Date: Sat, 11 Jun 2016 17:52:09 -0400
Subject: [PATCH 163/166] Remove default values and use nil for the executor.

Much better.
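Together with the previous patch (the nil-check in Exit()), this change treats the
executor's Consul syncer as optional state: NewExecutor no longer constructs a
Syncer up front, and Shutdown() is only invoked when one was actually created.
A minimal sketch of that lazy-construction and nil-guard pattern, using
hypothetical executor/syncer types rather than Nomad's real UniversalExecutor
and consul.Syncer:

package main

import (
	"fmt"
	"log"
	"os"
)

// syncer stands in for the Consul syncer; only Shutdown matters here.
type syncer struct {
	logger *log.Logger
}

func (s *syncer) Shutdown() error {
	s.logger.Println("syncer: shutting down")
	return nil
}

// executor keeps the syncer nil until services are actually registered.
type executor struct {
	logger       *log.Logger
	consulSyncer *syncer
}

// newExecutor no longer builds a syncer eagerly; the field stays nil.
func newExecutor(logger *log.Logger) *executor {
	return &executor{logger: logger}
}

// syncServices lazily creates the syncer the first time it is needed.
func (e *executor) syncServices() {
	if e.consulSyncer == nil {
		e.consulSyncer = &syncer{logger: e.logger}
	}
	// ... register services here ...
}

// exit mirrors the guarded Shutdown(): the syncer is only stopped when one
// was ever created, so an executor that exits early cannot dereference nil.
func (e *executor) exit() {
	if e.consulSyncer != nil {
		if err := e.consulSyncer.Shutdown(); err != nil {
			e.logger.Printf("executor: error shutting down syncer: %v", err)
		}
	}
	fmt.Println("executor: exited")
}

func main() {
	logger := log.New(os.Stderr, "", log.LstdFlags)

	// Shutting down without ever syncing services must not panic.
	e := newExecutor(logger)
	e.exit()

	// An executor that did sync services still shuts its syncer down.
	e2 := newExecutor(logger)
	e2.syncServices()
	e2.exit()
}

The nil field doubles as the "never registered services" signal, so no separate
boolean or sentinel value is needed.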
---
 client/driver/executor/executor.go | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go
index 063db646e7d..ba4deb32f4e 100644
--- a/client/driver/executor/executor.go
+++ b/client/driver/executor/executor.go
@@ -205,18 +205,9 @@ type UniversalExecutor struct {

 // NewExecutor returns an Executor
 func NewExecutor(logger *log.Logger) Executor {
-	shutdownCh := make(chan struct{})
-	cs, err := consul.NewSyncer(nil, shutdownCh, logger)
-	if err != nil {
-		logger.Printf("[ERROR] executor: failed to allocate new Consul Syncer: %v", err)
-		return nil
-	}
-
 	exec := &UniversalExecutor{
-		consulSyncer:   cs,
 		logger:         logger,
 		processExited:  make(chan interface{}),
-		shutdownCh:     make(chan struct{}),
 		totalCpuStats:  stats.NewCpuStats(),
 		userCpuStats:   stats.NewCpuStats(),
 		systemCpuStats: stats.NewCpuStats(),

From a8b53b05b481caf41c80335bdcee5298a1c87c39 Mon Sep 17 00:00:00 2001
From: Sean Chittenden
Date: Sat, 11 Jun 2016 18:17:20 -0400
Subject: [PATCH 164/166] Prefer `%+q` over `%q` in log messages.
---
 client/client.go                 |  6 +++---
 client/rpcproxy/rpcproxy.go      | 10 +++++-----
 client/rpcproxy/rpcproxy_test.go | 12 ++++++------
 command/agent/agent.go           | 10 +++++-----
 command/agent/consul/syncer.go   | 14 +++++++-------
 5 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/client/client.go b/client/client.go
index 89c538f3fe5..b882c6b197c 100644
--- a/client/client.go
+++ b/client/client.go
@@ -1309,7 +1309,7 @@ func (c *Client) setupConsulSyncer() error {
 			}
 			consulServices, _, err := consulCatalog.Service(nomadServerServiceName, consul.ServiceTagRPC, opts)
 			if err != nil {
-				mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %q from Consul datacenter %q: %v", nomadServerServiceName, dc, err))
+				mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", nomadServerServiceName, dc, err))
 				continue
 			}
@@ -1345,9 +1345,9 @@ func (c *Client) setupConsulSyncer() error {
 		}

 		for i := range dcs {
-			dcs[i] = fmt.Sprintf("%q", dcs[i])
+			dcs[i] = fmt.Sprintf("%+q", dcs[i])
 		}
-		return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %s", nomadServerServiceName, dcs)
+		return fmt.Errorf("no Nomad Servers advertising service %+q in Consul datacenters: %+q", nomadServerServiceName, dcs)
 	}

 	c.heartbeatLock.Lock()
diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go
index ee273275569..0e8c7604127 100644
--- a/client/rpcproxy/rpcproxy.go
+++ b/client/rpcproxy/rpcproxy.go
@@ -192,8 +192,8 @@ func (p *RPCProxy) SetBackupServers(addrs []string) error {
 	for _, s := range addrs {
 		s, err := NewServerEndpoint(s)
 		if err != nil {
-			p.logger.Printf("[WARN] client.rpcproxy: unable to create backup server %q: %v", s, err)
-			return fmt.Errorf("unable to create new backup server from %q: %v", s, err)
+			p.logger.Printf("[WARN] client.rpcproxy: unable to create backup server %+q: %v", s, err)
+			return fmt.Errorf("unable to create new backup server from %+q: %v", s, err)
 		}
 		l = append(l, s)
 	}
@@ -221,7 +221,7 @@ func (p *RPCProxy) SetBackupServers(addrs []string) error {
 func (p *RPCProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint {
 	s, err := NewServerEndpoint(rpcAddr)
 	if err != nil {
-		p.logger.Printf("[WARN] client.rpcproxy: unable to create new primary server from endpoint %q: %v", rpcAddr, err)
+		p.logger.Printf("[WARN] client.rpcproxy: unable to create new primary server from endpoint %+q: %v", rpcAddr, err)
 		return nil
 	}
@@ -706,14 +706,14 @@
func (p *RPCProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNode
 			continue
 		}
-		p.logger.Printf("[WARN] client.rpcproxy: API mismatch between client version (v%d.%d) and server version (v%d.%d), ignoring server %q", p.configInfo.RPCMajorVersion(), p.configInfo.RPCMinorVersion(), s.RPCMajorVersion, s.RPCMinorVersion, s.RPCAdvertiseAddr)
+		p.logger.Printf("[WARN] client.rpcproxy: API mismatch between client version (v%d.%d) and server version (v%d.%d), ignoring server %+q", p.configInfo.RPCMajorVersion(), p.configInfo.RPCMinorVersion(), s.RPCMajorVersion, s.RPCMinorVersion, s.RPCAdvertiseAddr)
 		p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr] = now.Add(rpcAPIMismatchLogRate)
 		continue
 	}

 	server, err := NewServerEndpoint(s.RPCAdvertiseAddr)
 	if err != nil {
-		p.logger.Printf("[WARN] client.rpcproxy: Unable to create a server from %q: %v", s.RPCAdvertiseAddr, err)
+		p.logger.Printf("[WARN] client.rpcproxy: Unable to create a server from %+q: %v", s.RPCAdvertiseAddr, err)
 		continue
 	}
diff --git a/client/rpcproxy/rpcproxy_test.go b/client/rpcproxy/rpcproxy_test.go
index bfe2ac3cad2..c6b7327e62a 100644
--- a/client/rpcproxy/rpcproxy_test.go
+++ b/client/rpcproxy/rpcproxy_test.go
@@ -306,7 +306,7 @@ func TestRPCProxy_NumServers(t *testing.T) {
 	serverName := makeServerEndpointName()
 	s := p.AddPrimaryServer(serverName)
 	if s == nil {
-		t.Fatalf("Expected server from %q", serverName)
+		t.Fatalf("Expected server from %+q", serverName)
 	}
 	serverList = append(serverList, s)
@@ -380,11 +380,11 @@ func TestRPCProxy_RemoveServer(t *testing.T) {
 		t.Fatalf("bad")
 	}
 	if s1 == nil || s1.Name != s1Endpoint {
-		t.Fatalf("Expected s1 server: %q", s1.Name)
+		t.Fatalf("Expected s1 server: %+q", s1.Name)
 	}
 	s1 = p.FindServer()
 	if s1 == nil || s1.Name != s1Endpoint {
-		t.Fatalf("Expected s1 server: %q", s1.Name)
+		t.Fatalf("Expected s1 server: %+q", s1.Name)
 	}
 	p.RemoveServer(s1)
 	if p.NumServers() != 0 {
@@ -406,17 +406,17 @@ func TestRPCProxy_RemoveServer(t *testing.T) {
 		t.Fatalf("bad")
 	}
 	if s2 == nil || s2.Name != s2Endpoint {
-		t.Fatalf("Expected s2 server: %q", s2.Name)
+		t.Fatalf("Expected s2 server: %+q", s2.Name)
 	}
 	s1 = p.FindServer()
 	if s1 == nil || s1.Name != s1Endpoint {
-		t.Fatalf("Expected s1 to be the front of the list: %q==%q", s1.Name, s1Endpoint)
+		t.Fatalf("Expected s1 to be the front of the list: %+q==%+q", s1.Name, s1Endpoint)
 	}
 	// Move s1 to the back of the server list
 	p.NotifyFailedServer(s1)
 	s2 = p.FindServer()
 	if s2 == nil || s2.Name != s2Endpoint {
-		t.Fatalf("Expected s2 server: %q", s2Endpoint)
+		t.Fatalf("Expected s2 server: %+q", s2Endpoint)
 	}
 	p.RemoveServer(s2)
 	if p.NumServers() != 1 {
diff --git a/command/agent/agent.go b/command/agent/agent.go
index 6ebbfdd5c1d..194e3848aa9 100644
--- a/command/agent/agent.go
+++ b/command/agent/agent.go
@@ -184,7 +184,7 @@ func (a *Agent) serverConfig() (*nomad.Config, error) {
 	}
 	addr, err := net.ResolveTCPAddr("tcp", a.serverHTTPAddr)
 	if err != nil {
-		return nil, fmt.Errorf("error resolving HTTP addr %q: %v", a.serverHTTPAddr, err)
+		return nil, fmt.Errorf("error resolving HTTP addr %+q: %v", a.serverHTTPAddr, err)
 	}
 	a.serverHTTPAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port)
@@ -200,7 +200,7 @@ func (a *Agent) serverConfig() (*nomad.Config, error) {
 	}
 	addr, err = net.ResolveTCPAddr("tcp", a.serverRPCAddr)
 	if err != nil {
-		return nil, fmt.Errorf("error resolving RPC addr %q: %v", a.serverRPCAddr, err)
+		return nil, fmt.Errorf("error resolving RPC addr %+q: %v", a.serverRPCAddr, err)
 	}
 	a.serverRPCAddr = fmt.Sprintf("%s:%d", addr.IP.String(),
addr.Port)
@@ -216,7 +216,7 @@ func (a *Agent) serverConfig() (*nomad.Config, error) {
 	}
 	addr, err = net.ResolveTCPAddr("tcp", a.serverSerfAddr)
 	if err != nil {
-		return nil, fmt.Errorf("error resolving Serf addr %q: %v", a.serverSerfAddr, err)
+		return nil, fmt.Errorf("error resolving Serf addr %+q: %v", a.serverSerfAddr, err)
 	}
 	a.serverSerfAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port)
@@ -302,7 +302,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) {
 	}
 	addr, err := net.ResolveTCPAddr("tcp", a.clientHTTPAddr)
 	if err != nil {
-		return nil, fmt.Errorf("error resolving HTTP addr %q: %v", a.clientHTTPAddr, err)
+		return nil, fmt.Errorf("error resolving HTTP addr %+q: %v", a.clientHTTPAddr, err)
 	}
 	httpAddr := fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port)
 	conf.Node.HTTPAddr = httpAddr
@@ -320,7 +320,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) {
 	}
 	addr, err = net.ResolveTCPAddr("tcp", a.clientRPCAddr)
 	if err != nil {
-		return nil, fmt.Errorf("error resolving RPC addr %q: %v", a.clientRPCAddr, err)
+		return nil, fmt.Errorf("error resolving RPC addr %+q: %v", a.clientRPCAddr, err)
 	}
 	a.clientRPCAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port)
diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go
index 474651f6e3d..fe7cc8181e6 100644
--- a/command/agent/consul/syncer.go
+++ b/command/agent/consul/syncer.go
@@ -347,7 +347,7 @@ func (c *Syncer) Shutdown() error {
 	}
 	for _, service := range services {
 		if err := c.client.Agent().ServiceDeregister(service.ID); err != nil {
-			c.logger.Printf("[WARN] consul.syncer: failed to deregister service ID %q: %v", service.ID, err)
+			c.logger.Printf("[WARN] consul.syncer: failed to deregister service ID %+q: %v", service.ID, err)
 			mErr.Errors = append(mErr.Errors, err)
 		}
 	}
@@ -696,7 +696,7 @@ func (c *Syncer) createDelegatedCheckReg(check *structs.ServiceCheck, service *c
 	case structs.ServiceCheckScript:
 		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
 	default:
-		return nil, fmt.Errorf("check type %q not valid", check.Type)
+		return nil, fmt.Errorf("check type %+q not valid", check.Type)
 	}
 	return &chkReg, nil
 }
@@ -764,12 +764,12 @@ func (c *Syncer) Run() {
 			if err := c.SyncServices(); err != nil {
 				if c.consulAvailable {
-					c.logger.Printf("[DEBUG] consul.syncer: disabling checks until successful sync for %q: %v", c.serviceRegPrefix, err)
+					c.logger.Printf("[DEBUG] consul.syncer: disabling checks until successful sync for %+q: %v", c.serviceRegPrefix, err)
 				}
 				c.consulAvailable = false
 			} else {
 				if !c.consulAvailable {
-					c.logger.Printf("[DEBUG] consul.syncer: re-enabling checks for for %q", c.serviceRegPrefix)
+					c.logger.Printf("[DEBUG] consul.syncer: re-enabling checks for for %+q", c.serviceRegPrefix)
 				}
 				c.consulAvailable = true
 			}
@@ -779,7 +779,7 @@ func (c *Syncer) Run() {
 			c.Shutdown()
 		case <-c.notifyShutdownCh:
 			sync.Stop()
-			c.logger.Printf("[INFO] consul.syncer: shutting down sync for %q", c.serviceRegPrefix)
+			c.logger.Printf("[INFO] consul.syncer: shutting down sync for %+q", c.serviceRegPrefix)
 			return
 		}
 	}
@@ -874,7 +874,7 @@ func (c *Syncer) runCheck(check Check) {
 	}
 	if err := c.client.Agent().UpdateTTL(check.ID(), output, state); err != nil {
 		if c.consulAvailable {
-			c.logger.Printf("[DEBUG] consul.syncer: check %q failed, disabling Consul checks until until next successful sync: %v", check.ID(), err)
+			c.logger.Printf("[DEBUG] consul.syncer: check %+q failed, disabling Consul checks until until next successful sync: %v", check.ID(), err)
 			c.consulAvailable = false
 		} else {
 			c.consulAvailable = true
@@ -888,7 +888,7 @@ func (c *Syncer) AddPeriodicHandler(name string, fn types.PeriodicCallback) bool
 	c.periodicLock.Lock()
 	defer c.periodicLock.Unlock()
 	if _, found := c.periodicCallbacks[name]; found {
-		c.logger.Printf("[ERROR] consul.syncer: failed adding handler %q", name)
+		c.logger.Printf("[ERROR] consul.syncer: failed adding handler %+q", name)
 		return false
 	}
 	c.periodicCallbacks[name] = fn

From 8d0b798f1d55bf0a1381f9f4fd65bfd781ce1af2 Mon Sep 17 00:00:00 2001
From: Sean Chittenden
Date: Sat, 11 Jun 2016 18:23:06 -0400
Subject: [PATCH 165/166] Walk the DCs from nearest to most remote, no limit on the search.
---
 client/client.go | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/client/client.go b/client/client.go
index b882c6b197c..bee92fda15d 100644
--- a/client/client.go
+++ b/client/client.go
@@ -1291,10 +1291,6 @@ func (c *Client) setupConsulSyncer() error {
 		if err != nil {
 			return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err)
 		}
-		dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)]
-		// Walk the list of Consul datacenters randomly in order to
-		// search for the Nomad server service.
-		shuffleStrings(dcs)

 		nomadServerServiceName := c.config.ConsulConfig.ServerServiceName
 		var mErr multierror.Error

From 006d1a32905a32a4c1935f6371bd7b4226e9f71a Mon Sep 17 00:00:00 2001
From: Sean Chittenden
Date: Sat, 11 Jun 2016 18:52:21 -0400
Subject: [PATCH 166/166] Walk the DCs from nearest to most remote.
---
 client/client.go | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/client/client.go b/client/client.go
index bee92fda15d..d0c477ad1ba 100644
--- a/client/client.go
+++ b/client/client.go
@@ -1291,6 +1291,21 @@ func (c *Client) setupConsulSyncer() error {
 		if err != nil {
 			return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err)
 		}
+		if len(dcs) > 2 {
+			// Query the local DC first, then shuffle the
+			// remaining DCs.  Future heartbeats will cause Nomad
+			// Clients to fixate on their local datacenter so
+			// it's okay to talk with remote DCs.  If the no
+			// Nomad servers are available within
+			// datacenterQueryLimit, the next heartbeat will pick
+			// a new set of servers so it's okay.
+			nearestDC := dcs[0]
+			otherDCs := make([]string, 0, len(dcs))
+			otherDCs = dcs[1:lib.MinInt(len(dcs), datacenterQueryLimit)]
+			shuffleStrings(otherDCs)
+
+			dcs = append([]string{nearestDC}, otherDCs...)
+		}

 		nomadServerServiceName := c.config.ConsulConfig.ServerServiceName
 		var mErr multierror.Error