diff --git a/api/agent_test.go b/api/agent_test.go index 09e8c0cadc4..0e764dbd2f4 100644 --- a/api/agent_test.go +++ b/api/agent_test.go @@ -124,39 +124,6 @@ func TestAgent_ForceLeave(t *testing.T) { // TODO: test force-leave on an existing node } -func TestAgent_SetServers(t *testing.T) { - c, s := makeClient(t, nil, func(c *testutil.TestServerConfig) { - c.Client.Enabled = true - c.Server.BootstrapExpect = 0 - }) - defer s.Stop() - a := c.Agent() - - // Attempting to set an empty list errors - err := a.SetServers([]string{}) - if err == nil { - t.Fatalf("expected error, got nothing") - } - - // Setting a valid list works - err = a.SetServers([]string{"foo", "bar"}) - if err != nil { - t.Fatalf("err: %s", err) - } - - // Returns the proper list of servers - out, err := a.Servers() - if err != nil { - t.Fatalf("err: %s", err) - } - if n := len(out); n != 2 { - t.Fatalf("expected 2 servers, got: %d", n) - } - if out[0] != "foo:4647" || out[1] != "bar:4647" { - t.Fatalf("bad server list: %v", out) - } -} - func (a *AgentMember) String() string { return "{Name: " + a.Name + " Region: " + a.Tags["region"] + " DC: " + a.Tags["dc"] + "}" } diff --git a/api/nodes_test.go b/api/nodes_test.go index 0a57321763a..d546dd435cc 100644 --- a/api/nodes_test.go +++ b/api/nodes_test.go @@ -50,7 +50,7 @@ func TestNodes_PrefixList(t *testing.T) { var err error // Get the node ID - var nodeID, dc string + var nodeID string testutil.WaitForResult(func() (bool, error) { out, _, err := nodes.List(nil) if err != nil { @@ -60,7 +60,6 @@ func TestNodes_PrefixList(t *testing.T) { return false, fmt.Errorf("expected 1 node, got: %d", n) } nodeID = out[0].ID - dc = out[0].Datacenter return true, nil }, func(err error) { t.Fatalf("err: %s", err) diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go index 97b3dc34844..f978447999d 100644 --- a/client/alloc_runner_test.go +++ b/client/alloc_runner_test.go @@ -10,6 +10,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" + "github.com/hashicorp/nomad/client/config" ctestutil "github.com/hashicorp/nomad/client/testutil" ) @@ -25,7 +26,7 @@ func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) { func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) { logger := testLogger() - conf := DefaultConfig() + conf := config.DefaultConfig() conf.StateDir = os.TempDir() conf.AllocDir = os.TempDir() upd := &MockAllocStateUpdater{} diff --git a/client/client.go b/client/client.go index 1c32e13e117..d0c477ad1ba 100644 --- a/client/client.go +++ b/client/client.go @@ -8,22 +8,24 @@ import ( "os" "path/filepath" "strconv" - "strings" "sync" + "sync/atomic" "time" "github.com/armon/go-metrics" - "github.com/mitchellh/hashstructure" - + consulapi "github.com/hashicorp/consul/api" + "github.com/hashicorp/consul/lib" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/driver" "github.com/hashicorp/nomad/client/fingerprint" + "github.com/hashicorp/nomad/client/rpcproxy" "github.com/hashicorp/nomad/client/stats" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" + "github.com/mitchellh/hashstructure" ) const ( @@ -35,6 +37,10 @@ const ( // open to a server clientMaxStreams = 2 + // datacenterQueryLimit searches through up to this many adjacent + // datacenters looking for the Nomad server service. + datacenterQueryLimit = 5 + // registerRetryIntv is minimum interval on which we retry // registration. We pick a value between this and 2x this. registerRetryIntv = 15 * time.Second @@ -70,22 +76,8 @@ const ( // allocSyncRetryIntv is the interval on which we retry updating // the status of the allocation allocSyncRetryIntv = 5 * time.Second - - // consulSyncInterval is the interval at which the client syncs with consul - // to remove services and checks which are no longer valid - consulSyncInterval = 15 * time.Second ) -// DefaultConfig returns the default configuration -func DefaultConfig() *config.Config { - return &config.Config{ - LogOutput: os.Stderr, - Region: "global", - StatsDataPoints: 60, - StatsCollectionInterval: 1 * time.Second, - } -} - // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad // Client type ClientStatsReporter interface { @@ -113,18 +105,26 @@ type Client struct { logger *log.Logger - lastServer net.Addr - lastRPCTime time.Time - lastServerLock sync.Mutex - - servers []string - serverLock sync.RWMutex + rpcProxy *rpcproxy.RPCProxy connPool *nomad.ConnPool - lastHeartbeat time.Time - heartbeatTTL time.Duration - heartbeatLock sync.Mutex + // lastHeartbeatFromQuorum is an atomic int32 acting as a bool. When + // true, the last heartbeat message had a leader. When false (0), + // the last heartbeat did not include the RPC address of the leader, + // indicating that the server is in the minority or middle of an + // election. + lastHeartbeatFromQuorum int32 + + // consulPullHeartbeatDeadline is the deadline at which this Nomad + // Agent will begin polling Consul for a list of Nomad Servers. When + // Nomad Clients are heartbeating successfully with Nomad Servers, + // Nomad Clients do not poll Consul to populate their backup server + // list. + consulPullHeartbeatDeadline time.Time + lastHeartbeat time.Time + heartbeatTTL time.Duration + heartbeatLock sync.Mutex // allocs is the current set of allocations allocs map[string]*AllocRunner @@ -133,7 +133,8 @@ type Client struct { // allocUpdates stores allocations that need to be synced to the server. allocUpdates chan *structs.Allocation - consulService *consul.ConsulService + // consulSyncer advertises this Nomad Agent with Consul + consulSyncer *consul.Syncer // HostStatsCollector collects host resource usage stats hostStatsCollector *stats.HostStatsCollector @@ -146,7 +147,7 @@ type Client struct { } // NewClient is used to create a new client from the given configuration -func NewClient(cfg *config.Config) (*Client, error) { +func NewClient(cfg *config.Config, consulSyncer *consul.Syncer) (*Client, error) { // Create a logger logger := log.New(cfg.LogOutput, "", log.LstdFlags) @@ -158,6 +159,7 @@ func NewClient(cfg *config.Config) (*Client, error) { // Create the client c := &Client{ config: cfg, + consulSyncer: consulSyncer, start: time.Now(), connPool: nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, nil), logger: logger, @@ -191,21 +193,29 @@ func NewClient(cfg *config.Config) (*Client, error) { // Setup the reserved resources c.reservePorts() - // Set up the known servers list - c.SetServers(c.config.Servers) - // Store the config copy before restoring state but after it has been // initialized. + c.configLock.Lock() c.configCopy = c.config.Copy() + c.configLock.Unlock() + + // Create the RPC Proxy and bootstrap with the preconfigured list of + // static servers + c.configLock.RLock() + c.rpcProxy = rpcproxy.NewRPCProxy(c.logger, c.shutdownCh, c, c.connPool) + for _, serverAddr := range c.configCopy.Servers { + c.rpcProxy.AddPrimaryServer(serverAddr) + } + c.configLock.RUnlock() // Restore the state if err := c.restoreState(); err != nil { return nil, fmt.Errorf("failed to restore state: %v", err) } - // Setup the consul client - if err := c.setupConsulClient(); err != nil { - return nil, fmt.Errorf("failed to create consul client: %v") + // Setup the Consul syncer + if err := c.setupConsulSyncer(); err != nil { + return nil, fmt.Errorf("failed to create client Consul syncer: %v") } // Register and then start heartbeating to the servers. @@ -223,8 +233,15 @@ func NewClient(cfg *config.Config) (*Client, error) { // Start collecting stats go c.collectHostStats() - // Start the consul sync - go c.syncConsul() + // Start the RPCProxy maintenance task. This task periodically + // shuffles the list of Nomad Server Endpoints this Client will use + // when communicating with Nomad Servers via RPC. This is done in + // order to prevent server fixation in stable Nomad clusters. This + // task actively populates the active list of Nomad Server Endpoints + // from information from the Nomad Client heartbeats. If a heartbeat + // times out and there are no Nomad servers available, this data is + // populated by periodically polling Consul, if available. + go c.rpcProxy.Run() return c, nil } @@ -272,6 +289,31 @@ func (c *Client) Leave() error { return nil } +// Datacenter returns the datacenter for the given client +func (c *Client) Datacenter() string { + c.configLock.RLock() + dc := c.configCopy.Node.Datacenter + c.configLock.RUnlock() + return dc +} + +// Region returns the region for the given client +func (c *Client) Region() string { + return c.config.Region +} + +// RPCMajorVersion returns the structs.ApiMajorVersion supported by the +// client. +func (c *Client) RPCMajorVersion() int { + return structs.ApiMajorVersion +} + +// RPCMinorVersion returns the structs.ApiMinorVersion supported by the +// client. +func (c *Client) RPCMinorVersion() int { + return structs.ApiMinorVersion +} + // Shutdown is used to tear down the client func (c *Client) Shutdown() error { c.logger.Printf("[INFO] client: shutting down") @@ -284,10 +326,12 @@ func (c *Client) Shutdown() error { // Destroy all the running allocations. if c.config.DevMode { + c.allocLock.Lock() for _, ar := range c.allocs { ar.Destroy() <-ar.WaitCh() } + c.allocLock.Unlock() } c.shutdown = true @@ -298,104 +342,24 @@ func (c *Client) Shutdown() error { // RPC is used to forward an RPC call to a nomad server, or fail if no servers func (c *Client) RPC(method string, args interface{}, reply interface{}) error { - // Invoke the RPCHandle if it exists + // Invoke the RPCHandler if it exists if c.config.RPCHandler != nil { return c.config.RPCHandler.RPC(method, args, reply) } // Pick a server to request from - addr, err := c.pickServer() - if err != nil { - return err + server := c.rpcProxy.FindServer() + if server == nil { + return fmt.Errorf("no known servers") } // Make the RPC request - err = c.connPool.RPC(c.config.Region, addr, 1, method, args, reply) - - // Update the last server information - c.lastServerLock.Lock() - if err != nil { - c.lastServer = nil - c.lastRPCTime = time.Time{} - } else { - c.lastServer = addr - c.lastRPCTime = time.Now() - } - c.lastServerLock.Unlock() - return err -} - -// pickServer is used to pick a target RPC server -func (c *Client) pickServer() (net.Addr, error) { - c.lastServerLock.Lock() - defer c.lastServerLock.Unlock() - - // Check for a valid last-used server - if c.lastServer != nil && time.Now().Sub(c.lastRPCTime) < clientRPCCache { - return c.lastServer, nil - } - - // Bail if we can't find any servers - servers := c.Servers() - if len(servers) == 0 { - return nil, fmt.Errorf("no known servers") - } - - // Shuffle so we don't always use the same server - shuffleStrings(servers) - - // Try to resolve each server - for i := 0; i < len(servers); i++ { - addr, err := net.ResolveTCPAddr("tcp", servers[i]) - if err == nil { - c.lastServer = addr - c.lastRPCTime = time.Now() - return addr, nil - } - c.logger.Printf("[WARN] client: failed to resolve '%s': %s", servers[i], err) - } - - // Bail if we reach this point - return nil, fmt.Errorf("failed to resolve any servers") -} - -// Servers is used to return the current known servers list. When an agent -// is first started, this list comes directly from configuration files. -func (c *Client) Servers() []string { - c.serverLock.RLock() - defer c.serverLock.RUnlock() - return c.servers -} - -// SetServers is used to modify the known servers list. This avoids forcing -// a config rollout + rolling restart and enables auto-join features. The -// full set of servers is passed to support adding and/or removing servers. -func (c *Client) SetServers(servers []string) { - c.serverLock.Lock() - defer c.serverLock.Unlock() - if servers == nil { - servers = make([]string, 0) - } - // net.ResolveTCPAddr requires port to be set, if one is not provided, supply default port - // Using net.SplitHostPort in the event of IPv6 addresses with multiple colons. - // IPv6 addresses must be passed in with brackets, - // i.e: [::1]:4647 or [::1] - setServers := make([]string, len(servers)) - copy(setServers, servers) - for i := 0; i < len(setServers); i++ { - if _, _, err := net.SplitHostPort(setServers[i]); err != nil { - // multiple errors can be returned here, only searching for missing - if strings.Contains(err.Error(), "missing port") { - c.logger.Printf("[WARN] client: port not specified, using default port") - setServers[i] = net.JoinHostPort(setServers[i], "4647") - } else { - c.logger.Printf("[WARN] client: server address %q invalid: %v", setServers[i], err) - } - } + if err := c.connPool.RPC(c.Region(), server.Addr, c.RPCMajorVersion(), method, args, reply); err != nil { + c.rpcProxy.NotifyFailedServer(server) + c.logger.Printf("[ERR] client: RPC failed to server %s: %v", server.Addr, err) + return err } - - c.logger.Printf("[INFO] client: setting server address list: %s", setServers) - c.servers = setServers + return nil } // Stats is used to return statistics for debugging and insight @@ -408,10 +372,12 @@ func (c *Client) Stats() map[string]map[string]string { numAllocs := len(c.allocs) c.allocLock.RUnlock() + c.heartbeatLock.Lock() + defer c.heartbeatLock.Unlock() stats := map[string]map[string]string{ "client": map[string]string{ "node_id": c.Node().ID, - "known_servers": toString(uint64(len(c.Servers()))), + "known_servers": toString(uint64(c.rpcProxy.NumServers())), "num_allocations": toString(uint64(numAllocs)), "last_heartbeat": fmt.Sprintf("%v", time.Since(c.lastHeartbeat)), "heartbeat_ttl": fmt.Sprintf("%v", c.heartbeatTTL), @@ -438,8 +404,7 @@ func (c *Client) StatsReporter() ClientStatsReporter { // Nomad client func (c *Client) AllocStats() map[string]AllocStatsReporter { res := make(map[string]AllocStatsReporter) - allocRunners := c.getAllocRunners() - for alloc, ar := range allocRunners { + for alloc, ar := range c.getAllocRunners() { res[alloc] = ar } return res @@ -491,6 +456,9 @@ func (c *Client) HostStatsTS(since int64) []*stats.HostStats { // GetAllocFS returns the AllocFS interface for the alloc dir of an allocation func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) { + c.allocLock.RLock() + defer c.allocLock.RUnlock() + ar, ok := c.allocs[allocID] if !ok { return nil, fmt.Errorf("alloc not found") @@ -498,6 +466,12 @@ func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) { return ar.ctx.AllocDir, nil } +// AddPrimaryServerToRPCProxy adds serverAddr to the RPC Proxy's primary +// server list. +func (c *Client) AddPrimaryServerToRPCProxy(serverAddr string) *rpcproxy.ServerEndpoint { + return c.rpcProxy.AddPrimaryServer(serverAddr) +} + // restoreState is used to restore our state from the data dir func (c *Client) restoreState() error { if c.config.DevMode { @@ -520,7 +494,9 @@ func (c *Client) restoreState() error { c.configLock.RLock() ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc) c.configLock.RUnlock() + c.allocLock.Lock() c.allocs[id] = ar + c.allocLock.Unlock() if err := ar.RestoreState(); err != nil { c.logger.Printf("[ERR] client: failed to restore state for alloc %s: %v", id, err) mErr.Errors = append(mErr.Errors, err) @@ -718,7 +694,7 @@ func (c *Client) fingerprint() error { // fingerprintPeriodic runs a fingerprinter at the specified duration. func (c *Client) fingerprintPeriodic(name string, f fingerprint.Fingerprint, d time.Duration) { - c.logger.Printf("[DEBUG] client: periodically fingerprinting %v at duration %v", name, d) + c.logger.Printf("[DEBUG] client: fingerprinting %v every %v", name, d) for { select { case <-time.After(d): @@ -785,7 +761,7 @@ func (c *Client) retryIntv(base time.Duration) time.Duration { if c.config.DevMode { return devModeRetryIntv } - return base + randomStagger(base) + return base + lib.RandomStagger(base) } // registerAndHeartbeat is a long lived goroutine used to register the client @@ -804,7 +780,7 @@ func (c *Client) registerAndHeartbeat() { if c.config.DevMode { heartbeat = time.After(0) } else { - heartbeat = time.After(randomStagger(initialHeartbeatStagger)) + heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger)) } for { @@ -904,7 +880,7 @@ func (c *Client) registerNode() error { node := c.Node() req := structs.NodeRegisterRequest{ Node: node, - WriteRequest: structs.WriteRequest{Region: c.config.Region}, + WriteRequest: structs.WriteRequest{Region: c.Region()}, } var resp structs.NodeUpdateResponse err := c.RPC("Node.Register", &req, &resp) @@ -938,7 +914,7 @@ func (c *Client) updateNodeStatus() error { req := structs.NodeUpdateStatusRequest{ NodeID: node.ID, Status: structs.NodeStatusReady, - WriteRequest: structs.WriteRequest{Region: c.config.Region}, + WriteRequest: structs.WriteRequest{Region: c.Region()}, } var resp structs.NodeUpdateResponse err := c.RPC("Node.UpdateStatus", &req, &resp) @@ -957,6 +933,24 @@ func (c *Client) updateNodeStatus() error { defer c.heartbeatLock.Unlock() c.lastHeartbeat = time.Now() c.heartbeatTTL = resp.HeartbeatTTL + + if err := c.rpcProxy.RefreshServerLists(resp.Servers, resp.NumNodes, resp.LeaderRPCAddr); err != nil { + return err + } + + // Begin polling Consul if there is no Nomad leader. We could be + // heartbeating to a Nomad server that is in the minority of a + // partition of the Nomad server quorum, but this Nomad Agent still + // has connectivity to the existing majority of Nomad Servers, but + // only if it queries Consul. + if resp.LeaderRPCAddr == "" { + atomic.CompareAndSwapInt32(&c.lastHeartbeatFromQuorum, 1, 0) + return nil + } + + const heartbeatFallbackFactor = 3 + atomic.CompareAndSwapInt32(&c.lastHeartbeatFromQuorum, 0, 1) + c.consulPullHeartbeatDeadline = time.Now().Add(heartbeatFallbackFactor * resp.HeartbeatTTL) return nil } @@ -1003,7 +997,7 @@ func (c *Client) allocSync() { // Send to server. args := structs.AllocUpdateRequest{ Alloc: sync, - WriteRequest: structs.WriteRequest{Region: c.config.Region}, + WriteRequest: structs.WriteRequest{Region: c.Region()}, } var resp structs.GenericResponse @@ -1043,7 +1037,7 @@ func (c *Client) watchAllocations(updates chan *allocUpdates) { req := structs.NodeSpecificRequest{ NodeID: c.Node().ID, QueryOptions: structs.QueryOptions{ - Region: c.config.Region, + Region: c.Region(), AllowStale: true, }, } @@ -1053,7 +1047,7 @@ func (c *Client) watchAllocations(updates chan *allocUpdates) { // new, or updated server side. allocsReq := structs.AllocsGetRequest{ QueryOptions: structs.QueryOptions{ - Region: c.config.Region, + Region: c.Region(), AllowStale: true, }, } @@ -1266,61 +1260,165 @@ func (c *Client) addAlloc(alloc *structs.Allocation) error { return nil } -// setupConsulClient creates a ConsulService -func (c *Client) setupConsulClient() error { - cs, err := consul.NewConsulService(c.config.ConsulConfig, c.logger) - c.consulService = cs - return err -} +// setupConsulSyncer creates Client-mode consul.Syncer which periodically +// executes callbacks on a fixed interval. +// +// TODO(sean@): this could eventually be moved to a priority queue and give +// each task an interval, but that is not necessary at this time. +func (c *Client) setupConsulSyncer() error { + // The bootstrapFn callback handler is used to periodically poll + // Consul to look up the Nomad Servers in Consul. In the event the + // heartbeat deadline has been exceeded and this Client is orphaned + // from its servers, periodically poll Consul to reattach this Client + // to its cluster and automatically recover from a detached state. + bootstrapFn := func() error { + now := time.Now() + c.heartbeatLock.Lock() + + // If the last heartbeat didn't contain a leader, give the + // Nomad server this Agent is talking to one more attempt at + // providing a heartbeat that does contain a leader. + if atomic.LoadInt32(&c.lastHeartbeatFromQuorum) == 1 && now.Before(c.consulPullHeartbeatDeadline) { + c.heartbeatLock.Unlock() + // c.logger.Printf("[TRACE] client.consul: heartbeat received, sleeping until %v", c.consulPullHeartbeatDeadline) + return nil + } + c.heartbeatLock.Unlock() + c.logger.Printf("[TRACE] client.consul: lost heartbeat with Nomad quorum, falling back to Consul for server list") -// syncConsul removes services of tasks which are no longer in running state -func (c *Client) syncConsul() { - sync := time.NewTicker(consulSyncInterval) - for { - select { - case <-sync.C: - // Give up pruning services if we can't fingerprint Consul + consulCatalog := c.consulSyncer.ConsulClient().Catalog() + dcs, err := consulCatalog.Datacenters() + if err != nil { + return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err) + } + if len(dcs) > 2 { + // Query the local DC first, then shuffle the + // remaining DCs. Future heartbeats will cause Nomad + // Clients to fixate on their local datacenter so + // it's okay to talk with remote DCs. If the no + // Nomad servers are available within + // datacenterQueryLimit, the next heartbeat will pick + // a new set of servers so it's okay. + nearestDC := dcs[0] + otherDCs := make([]string, 0, len(dcs)) + otherDCs = dcs[1:lib.MinInt(len(dcs), datacenterQueryLimit)] + shuffleStrings(otherDCs) + + dcs = append([]string{nearestDC}, otherDCs...) + } - c.configLock.RLock() - _, ok := c.configCopy.Node.Attributes["consul.server"] - c.configLock.RUnlock() - if !ok { + nomadServerServiceName := c.config.ConsulConfig.ServerServiceName + var mErr multierror.Error + const defaultMaxNumNomadServers = 8 + nomadServerServices := make([]string, 0, defaultMaxNumNomadServers) + for _, dc := range dcs { + opts := &consulapi.QueryOptions{ + AllowStale: true, + Datacenter: dc, + Near: "_agent", + WaitTime: consul.DefaultQueryWaitDuration, + } + consulServices, _, err := consulCatalog.Service(nomadServerServiceName, consul.ServiceTagRPC, opts) + if err != nil { + mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", nomadServerServiceName, dc, err)) continue } - services := make(map[string]struct{}) - // Get the existing allocs - c.allocLock.RLock() - allocs := make([]*AllocRunner, 0, len(c.allocs)) - for _, ar := range c.allocs { - allocs = append(allocs, ar) + + for _, s := range consulServices { + port := strconv.FormatInt(int64(s.ServicePort), 10) + addr := s.ServiceAddress + if addr == "" { + addr = s.Address + } + serverAddr := net.JoinHostPort(addr, port) + serverEndpoint, err := rpcproxy.NewServerEndpoint(serverAddr) + if err != nil { + mErr.Errors = append(mErr.Errors, err) + continue + } + var ok bool + if ok, err = c.connPool.PingNomadServer(c.Region(), c.RPCMajorVersion(), serverEndpoint); err != nil { + mErr.Errors = append(mErr.Errors, err) + continue + } + if ok { + nomadServerServices = append(nomadServerServices, serverAddr) + } + } + // Break if at least one Nomad Server was successfully pinged + if len(nomadServerServices) > 0 { + break + } + } + if len(nomadServerServices) == 0 { + if len(mErr.Errors) > 0 { + return mErr.ErrorOrNil() + } + + for i := range dcs { + dcs[i] = fmt.Sprintf("%+q", dcs[i]) + } + return fmt.Errorf("no Nomad Servers advertising service %+q in Consul datacenters: %+q", nomadServerServiceName, dcs) + } + + c.heartbeatLock.Lock() + if atomic.LoadInt32(&c.lastHeartbeatFromQuorum) == 1 && now.Before(c.consulPullHeartbeatDeadline) { + c.heartbeatLock.Unlock() + // Common, healthy path + if err := c.rpcProxy.SetBackupServers(nomadServerServices); err != nil { + return fmt.Errorf("client.consul: unable to set backup servers: %v", err) } - c.allocLock.RUnlock() - for _, ar := range allocs { - ar.taskStatusLock.RLock() - taskStates := copyTaskStates(ar.taskStates) - ar.taskStatusLock.RUnlock() - for taskName, taskState := range taskStates { - if taskState.State == structs.TaskStateRunning { - if tr, ok := ar.tasks[taskName]; ok { - for _, service := range tr.task.Services { - svcIdentifier := consul.GenerateServiceIdentifier(ar.alloc.ID, tr.task.Name) - services[service.ID(svcIdentifier)] = struct{}{} + } else { + c.heartbeatLock.Unlock() + // If this Client is talking with a Server that + // doesn't have a leader, and we have exceeded the + // consulPullHeartbeatDeadline, change the call from + // SetBackupServers() to calling AddPrimaryServer() + // in order to allow the Clients to randomly begin + // considering all known Nomad servers and + // eventually, hopefully, find their way to a Nomad + // Server that has quorum (assuming Consul has a + // server list that is in the majority). + for _, s := range nomadServerServices { + c.rpcProxy.AddPrimaryServer(s) + } + } + + return nil + } + c.consulSyncer.AddPeriodicHandler("Nomad Client Fallback Server Handler", bootstrapFn) + + consulServicesSyncFn := func() error { + const estInitialConsulServices = 8 + const serviceGroupName = "executor" + services := make([]*structs.ConsulService, 0, estInitialConsulServices) + for allocID, ar := range c.getAllocRunners() { + ar.taskStatusLock.RLock() + taskStates := copyTaskStates(ar.taskStates) + ar.taskStatusLock.RUnlock() + for taskName, taskState := range taskStates { + if taskState.State == structs.TaskStateRunning { + if tr, ok := ar.tasks[taskName]; ok { + for _, service := range tr.task.ConsulServices { + if service.Name == "" { + service.Name = fmt.Sprintf("%s-%s", tr.task.Name, allocID) + } + if service.ServiceID == "" { + service.ServiceID = fmt.Sprintf("%s-%s:%s/%s", c.consulSyncer.GenerateServiceID(serviceGroupName, service), tr.task.Name, allocID) } + services = append(services, service) } } } } - - if err := c.consulService.KeepServices(services); err != nil { - c.logger.Printf("[DEBUG] client: error removing services from non-running tasks: %v", err) - } - case <-c.shutdownCh: - sync.Stop() - c.logger.Printf("[INFO] client: shutting down consul sync") - return } + c.consulSyncer.SetServices(serviceGroupName, services) + return nil } + c.consulSyncer.AddPeriodicHandler("Nomad Client Services Sync Handler", consulServicesSyncFn) + + return nil } // collectHostStats collects host resource usage stats periodically @@ -1373,3 +1471,8 @@ func (c *Client) emitStats(hStats *stats.HostStats) { metrics.EmitKey([]string{"disk", disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent)) } } + +// RPCProxy returns the Client's RPCProxy instance +func (c *Client) RPCProxy() *rpcproxy.RPCProxy { + return c.rpcProxy +} diff --git a/client/client_test.go b/client/client_test.go index c429527adbe..36e3ae16928 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -7,13 +7,12 @@ import ( "net" "os" "path/filepath" - "reflect" "sync/atomic" "testing" "time" "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/consul" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -70,14 +69,19 @@ func testServer(t *testing.T, cb func(*nomad.Config)) (*nomad.Server, string) { } func testClient(t *testing.T, cb func(c *config.Config)) *Client { - conf := DefaultConfig() + conf := config.DefaultConfig() conf.DevMode = true - conf.ConsulConfig = &consul.ConsulConfig{} if cb != nil { cb(conf) } - client, err := NewClient(conf) + shutdownCh := make(chan struct{}) + consulSyncer, err := consul.NewSyncer(conf.ConsulConfig, shutdownCh, log.New(os.Stderr, "", log.LstdFlags)) + if err != nil { + t.Fatalf("err: %v", err) + } + + client, err := NewClient(conf, consulSyncer) if err != nil { t.Fatalf("err: %v", err) } @@ -464,7 +468,13 @@ func TestClient_SaveRestoreState(t *testing.T) { } // Create a new client - c2, err := NewClient(c1.config) + shutdownCh := make(chan struct{}) + consulSyncer, err := consul.NewSyncer(c1.config.ConsulConfig, shutdownCh, log.New(os.Stderr, "", log.LstdFlags)) + if err != nil { + t.Fatalf("err: %v", err) + } + + c2, err := NewClient(c1.config, consulSyncer) if err != nil { t.Fatalf("err: %v", err) } @@ -509,54 +519,3 @@ func TestClient_Init(t *testing.T) { t.Fatalf("err: %s", err) } } - -func TestClient_SetServers(t *testing.T) { - client := testClient(t, nil) - - // Sets an empty list - client.SetServers(nil) - if client.servers == nil { - t.Fatalf("should not be nil") - } - - // Set the initial servers list - expect := []string{"foo:4647"} - client.SetServers(expect) - if !reflect.DeepEqual(client.servers, expect) { - t.Fatalf("expect %v, got %v", expect, client.servers) - } - - // Add a server - expect = []string{"foo:5445", "bar:8080"} - client.SetServers(expect) - if !reflect.DeepEqual(client.servers, expect) { - t.Fatalf("expect %v, got %v", expect, client.servers) - } - - // Remove a server - expect = []string{"bar:8080"} - client.SetServers(expect) - if !reflect.DeepEqual(client.servers, expect) { - t.Fatalf("expect %v, got %v", expect, client.servers) - } - - // Add and remove a server - expect = []string{"baz:9090", "zip:4545"} - client.SetServers(expect) - if !reflect.DeepEqual(client.servers, expect) { - t.Fatalf("expect %v, got %v", expect, client.servers) - } - - // Query the servers list - if servers := client.Servers(); !reflect.DeepEqual(servers, expect) { - t.Fatalf("expect %v, got %v", expect, servers) - } - - // Add servers without ports, and remove old servers - servers := []string{"foo", "bar", "baz"} - expect = []string{"foo:4647", "bar:4647", "baz:4647"} - client.SetServers(servers) - if !reflect.DeepEqual(client.servers, expect) { - t.Fatalf("expect %v, got %v", expect, client.servers) - } -} diff --git a/client/config/config.go b/client/config/config.go index 4d606ba2439..d2ae39e4d9a 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -3,12 +3,13 @@ package config import ( "fmt" "io" + "os" "strconv" "strings" "time" - "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/structs/config" ) var ( @@ -110,8 +111,8 @@ type Config struct { // Revision is the commit number of the Nomad client Revision string - // ConsulConfig is the configuration to connect with Consul Agent - ConsulConfig *consul.ConsulConfig + // ConsulConfig is this Agent's Consul configuration + ConsulConfig *config.ConsulConfig // StatsDataPoints is the number of resource usage data points the Nomad // client keeps in memory @@ -131,6 +132,22 @@ func (c *Config) Copy() *Config { return nc } +// DefaultConfig returns the default configuration +func DefaultConfig() *Config { + return &Config{ + ConsulConfig: &config.ConsulConfig{ + ServerServiceName: "nomad", + ClientServiceName: "nomad-client", + AutoRegister: true, + Timeout: 5 * time.Second, + }, + LogOutput: os.Stderr, + Region: "global", + StatsDataPoints: 60, + StatsCollectionInterval: 1 * time.Second, + } +} + // Read returns the specified configuration value or "". func (c *Config) Read(id string) string { return c.Options[id] diff --git a/client/consul/sync.go b/client/consul/sync.go deleted file mode 100644 index 08da6e89d23..00000000000 --- a/client/consul/sync.go +++ /dev/null @@ -1,474 +0,0 @@ -package consul - -import ( - "crypto/tls" - "fmt" - "log" - "net/http" - "net/url" - "reflect" - "strings" - "sync" - "time" - - consul "github.com/hashicorp/consul/api" - "github.com/hashicorp/go-multierror" - - "github.com/hashicorp/nomad/nomad/structs" -) - -// ConsulService allows syncing of services and checks with Consul -type ConsulService struct { - client *consul.Client - availble bool - - serviceIdentifier string // serviceIdentifier is a token which identifies which task/alloc the service belongs to - delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul - createCheck func(*structs.ServiceCheck, string) (Check, error) - addrFinder func(portLabel string) (string, int) - - trackedServices map[string]*consul.AgentService - trackedChecks map[string]*consul.AgentCheckRegistration - checkRunners map[string]*CheckRunner - - logger *log.Logger - - shutdownCh chan struct{} - shutdown bool - shutdownLock sync.Mutex -} - -// ConsulConfig is the configuration used to create a new ConsulService client -type ConsulConfig struct { - Addr string - Token string - Auth string - EnableSSL bool - VerifySSL bool - CAFile string - CertFile string - KeyFile string -} - -const ( - // The periodic time interval for syncing services and checks with Consul - syncInterval = 5 * time.Second - - // ttlCheckBuffer is the time interval that Nomad can take to report Consul - // the check result - ttlCheckBuffer = 31 * time.Second -) - -// NewConsulService returns a new ConsulService -func NewConsulService(config *ConsulConfig, logger *log.Logger) (*ConsulService, error) { - var err error - var c *consul.Client - cfg := consul.DefaultConfig() - if config.Addr != "" { - cfg.Address = config.Addr - } - if config.Token != "" { - cfg.Token = config.Token - } - if config.Auth != "" { - var username, password string - if strings.Contains(config.Auth, ":") { - split := strings.SplitN(config.Auth, ":", 2) - username = split[0] - password = split[1] - } else { - username = config.Auth - } - - cfg.HttpAuth = &consul.HttpBasicAuth{ - Username: username, - Password: password, - } - } - if config.EnableSSL { - cfg.Scheme = "https" - tlsCfg := consul.TLSConfig{ - Address: cfg.Address, - CAFile: config.CAFile, - CertFile: config.CertFile, - KeyFile: config.KeyFile, - InsecureSkipVerify: !config.VerifySSL, - } - tlsClientCfg, err := consul.SetupTLSConfig(&tlsCfg) - if err != nil { - return nil, fmt.Errorf("error creating tls client config for consul: %v", err) - } - cfg.HttpClient.Transport = &http.Transport{ - TLSClientConfig: tlsClientCfg, - } - } - if config.EnableSSL && !config.VerifySSL { - cfg.HttpClient.Transport = &http.Transport{ - TLSClientConfig: &tls.Config{ - InsecureSkipVerify: true, - }, - } - } - if c, err = consul.NewClient(cfg); err != nil { - return nil, err - } - consulService := ConsulService{ - client: c, - logger: logger, - trackedServices: make(map[string]*consul.AgentService), - trackedChecks: make(map[string]*consul.AgentCheckRegistration), - checkRunners: make(map[string]*CheckRunner), - - shutdownCh: make(chan struct{}), - } - return &consulService, nil -} - -// SetDelegatedChecks sets the checks that nomad is going to run and report the -// result back to consul -func (c *ConsulService) SetDelegatedChecks(delegateChecks map[string]struct{}, createCheck func(*structs.ServiceCheck, string) (Check, error)) *ConsulService { - c.delegateChecks = delegateChecks - c.createCheck = createCheck - return c -} - -// SetAddrFinder sets a function to find the host and port for a Service given its port label -func (c *ConsulService) SetAddrFinder(addrFinder func(string) (string, int)) *ConsulService { - c.addrFinder = addrFinder - return c -} - -// SetServiceIdentifier sets the identifier of the services we are syncing with Consul -func (c *ConsulService) SetServiceIdentifier(serviceIdentifier string) *ConsulService { - c.serviceIdentifier = serviceIdentifier - return c -} - -// SyncServices sync the services with consul -func (c *ConsulService) SyncServices(services []*structs.Service) error { - var mErr multierror.Error - taskServices := make(map[string]*consul.AgentService) - taskChecks := make(map[string]*consul.AgentCheckRegistration) - - // Register Services and Checks that we don't know about or has changed - for _, service := range services { - srv, err := c.createService(service) - if err != nil { - mErr.Errors = append(mErr.Errors, err) - continue - } - trackedService, ok := c.trackedServices[srv.ID] - if (ok && !reflect.DeepEqual(trackedService, srv)) || !ok { - if err := c.registerService(srv); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - c.trackedServices[srv.ID] = srv - taskServices[srv.ID] = srv - - for _, chk := range service.Checks { - // Create a consul check registration - chkReg, err := c.createCheckReg(chk, srv) - if err != nil { - mErr.Errors = append(mErr.Errors, err) - continue - } - // creating a nomad check if we have to handle this particular check type - if _, ok := c.delegateChecks[chk.Type]; ok { - nc, err := c.createCheck(chk, chkReg.ID) - if err != nil { - mErr.Errors = append(mErr.Errors, err) - continue - } - cr := NewCheckRunner(nc, c.runCheck, c.logger) - c.checkRunners[nc.ID()] = cr - } - - if _, ok := c.trackedChecks[chkReg.ID]; !ok { - if err := c.registerCheck(chkReg); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - c.trackedChecks[chkReg.ID] = chkReg - taskChecks[chkReg.ID] = chkReg - } - } - - // Remove services that are not present anymore - for _, service := range c.trackedServices { - if _, ok := taskServices[service.ID]; !ok { - if err := c.deregisterService(service.ID); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - delete(c.trackedServices, service.ID) - } - } - - // Remove the checks that are not present anymore - for checkID, _ := range c.trackedChecks { - if _, ok := taskChecks[checkID]; !ok { - if err := c.deregisterCheck(checkID); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - delete(c.trackedChecks, checkID) - } - } - return mErr.ErrorOrNil() -} - -// Shutdown de-registers the services and checks and shuts down periodic syncing -func (c *ConsulService) Shutdown() error { - var mErr multierror.Error - - c.shutdownLock.Lock() - if !c.shutdown { - close(c.shutdownCh) - c.shutdown = true - } - c.shutdownLock.Unlock() - - // Stop all the checks that nomad is running - for _, cr := range c.checkRunners { - cr.Stop() - } - - // De-register all the services from consul - for _, service := range c.trackedServices { - if err := c.client.Agent().ServiceDeregister(service.ID); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - return mErr.ErrorOrNil() -} - -// KeepServices removes services from consul which are not present in the list -// of tasks passed to it -func (c *ConsulService) KeepServices(services map[string]struct{}) error { - var mErr multierror.Error - - // Get the services from Consul - cServices, err := c.client.Agent().Services() - if err != nil { - return err - } - cServices = c.filterConsulServices(cServices) - - // Remove the services from consul which are not in any of the tasks - for _, service := range cServices { - if _, validService := services[service.ID]; !validService { - if err := c.deregisterService(service.ID); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - } - return mErr.ErrorOrNil() -} - -// registerCheck registers a check definition with Consul -func (c *ConsulService) registerCheck(chkReg *consul.AgentCheckRegistration) error { - if cr, ok := c.checkRunners[chkReg.ID]; ok { - cr.Start() - } - return c.client.Agent().CheckRegister(chkReg) -} - -// createCheckReg creates a Check that can be registered with Nomad. It also -// creates a Nomad check for the check types that it can handle. -func (c *ConsulService) createCheckReg(check *structs.ServiceCheck, service *consul.AgentService) (*consul.AgentCheckRegistration, error) { - chkReg := consul.AgentCheckRegistration{ - ID: check.Hash(service.ID), - Name: check.Name, - ServiceID: service.ID, - } - chkReg.Timeout = check.Timeout.String() - chkReg.Interval = check.Interval.String() - switch check.Type { - case structs.ServiceCheckHTTP: - if check.Protocol == "" { - check.Protocol = "http" - } - url := url.URL{ - Scheme: check.Protocol, - Host: fmt.Sprintf("%s:%d", service.Address, service.Port), - Path: check.Path, - } - chkReg.HTTP = url.String() - case structs.ServiceCheckTCP: - chkReg.TCP = fmt.Sprintf("%s:%d", service.Address, service.Port) - case structs.ServiceCheckScript: - chkReg.TTL = (check.Interval + ttlCheckBuffer).String() - default: - return nil, fmt.Errorf("check type %q not valid", check.Type) - } - return &chkReg, nil -} - -// createService creates a Consul AgentService from a Nomad Service -func (c *ConsulService) createService(service *structs.Service) (*consul.AgentService, error) { - srv := consul.AgentService{ - ID: service.ID(c.serviceIdentifier), - Service: service.Name, - Tags: service.Tags, - } - host, port := c.addrFinder(service.PortLabel) - if host != "" { - srv.Address = host - } - - if port != 0 { - srv.Port = port - } - - return &srv, nil -} - -// registerService registers a service with Consul -func (c *ConsulService) registerService(service *consul.AgentService) error { - srvReg := consul.AgentServiceRegistration{ - ID: service.ID, - Name: service.Service, - Tags: service.Tags, - Port: service.Port, - Address: service.Address, - } - return c.client.Agent().ServiceRegister(&srvReg) -} - -// deregisterService de-registers a service with the given ID from consul -func (c *ConsulService) deregisterService(ID string) error { - return c.client.Agent().ServiceDeregister(ID) -} - -// deregisterCheck de-registers a check with a given ID from Consul. -func (c *ConsulService) deregisterCheck(ID string) error { - // Deleting the nomad check - if cr, ok := c.checkRunners[ID]; ok { - cr.Stop() - delete(c.checkRunners, ID) - } - - // Deleting from consul - return c.client.Agent().CheckDeregister(ID) -} - -// PeriodicSync triggers periodic syncing of services and checks with Consul. -// This is a long lived go-routine which is stopped during shutdown -func (c *ConsulService) PeriodicSync() { - sync := time.NewTicker(syncInterval) - for { - select { - case <-sync.C: - if err := c.performSync(); err != nil { - if c.availble { - c.logger.Printf("[DEBUG] consul: error in syncing services for %q: %v", c.serviceIdentifier, err) - } - c.availble = false - } else { - c.availble = true - } - case <-c.shutdownCh: - sync.Stop() - c.logger.Printf("[INFO] consul: shutting down sync for %q", c.serviceIdentifier) - return - } - } -} - -// performSync sync the services and checks we are tracking with Consul. -func (c *ConsulService) performSync() error { - var mErr multierror.Error - cServices, err := c.client.Agent().Services() - if err != nil { - return err - } - - cChecks, err := c.client.Agent().Checks() - if err != nil { - return err - } - - // Add services and checks that consul doesn't have but we do - for serviceID, service := range c.trackedServices { - if _, ok := cServices[serviceID]; !ok { - if err := c.registerService(service); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - } - for checkID, check := range c.trackedChecks { - if _, ok := cChecks[checkID]; !ok { - if err := c.registerCheck(check); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - } - - return mErr.ErrorOrNil() -} - -// filterConsulServices prunes out all the service whose ids are not prefixed -// with nomad- -func (c *ConsulService) filterConsulServices(srvcs map[string]*consul.AgentService) map[string]*consul.AgentService { - nomadServices := make(map[string]*consul.AgentService) - for _, srv := range srvcs { - if strings.HasPrefix(srv.ID, structs.NomadConsulPrefix) && - !strings.HasPrefix(srv.ID, structs.AgentServicePrefix) { - nomadServices[srv.ID] = srv - } - } - return nomadServices -} - -// filterConsulChecks prunes out all the consul checks which do not have -// services with id prefixed with noamd- -func (c *ConsulService) filterConsulChecks(chks map[string]*consul.AgentCheck) map[string]*consul.AgentCheck { - nomadChecks := make(map[string]*consul.AgentCheck) - for _, chk := range chks { - if strings.HasPrefix(chk.ServiceID, structs.NomadConsulPrefix) { - nomadChecks[chk.CheckID] = chk - } - } - return nomadChecks -} - -// consulPresent indicates whether the consul agent is responding -func (c *ConsulService) consulPresent() bool { - _, err := c.client.Agent().Self() - return err == nil -} - -// runCheck runs a check and updates the corresponding ttl check in consul -func (c *ConsulService) runCheck(check Check) { - res := check.Run() - if res.Duration >= check.Timeout() { - c.logger.Printf("[DEBUG] consul.sync: check took time: %v, timeout: %v", res.Duration, check.Timeout()) - } - state := consul.HealthCritical - output := res.Output - switch res.ExitCode { - case 0: - state = consul.HealthPassing - case 1: - state = consul.HealthWarning - default: - state = consul.HealthCritical - } - if res.Err != nil { - state = consul.HealthCritical - output = res.Err.Error() - } - if err := c.client.Agent().UpdateTTL(check.ID(), output, state); err != nil { - if c.availble { - c.logger.Printf("[DEBUG] consul.sync: error updating ttl check for check %q: %v", check.ID(), err) - c.availble = false - } else { - c.availble = true - } - } -} - -// GenerateServiceIdentifier returns a service identifier based on an allocation -// id and task name -func GenerateServiceIdentifier(allocID string, taskName string) string { - return fmt.Sprintf("%s-%s", taskName, allocID) -} diff --git a/client/consul/sync_test.go b/client/consul/sync_test.go deleted file mode 100644 index 80907c78780..00000000000 --- a/client/consul/sync_test.go +++ /dev/null @@ -1,168 +0,0 @@ -package consul - -import ( - "fmt" - "log" - "os" - "reflect" - "testing" - "time" - - "github.com/hashicorp/go-multierror" - "github.com/hashicorp/nomad/nomad/structs" -) - -const ( - allocID = "12" -) - -var ( - logger = log.New(os.Stdout, "", log.LstdFlags) - check1 = structs.ServiceCheck{ - Name: "check-foo-1", - Type: structs.ServiceCheckTCP, - Interval: 30 * time.Second, - Timeout: 5 * time.Second, - } - service1 = structs.Service{ - Name: "foo-1", - Tags: []string{"tag1", "tag2"}, - PortLabel: "port1", - Checks: []*structs.ServiceCheck{ - &check1, - }, - } - - service2 = structs.Service{ - Name: "foo-2", - Tags: []string{"tag1", "tag2"}, - PortLabel: "port2", - } -) - -func TestConsulServiceRegisterServices(t *testing.T) { - cs, err := NewConsulService(&ConsulConfig{}, logger) - if err != nil { - t.Fatalf("Err: %v", err) - } - // Skipping the test if consul isn't present - if !cs.consulPresent() { - return - } - task := mockTask() - cs.SetServiceIdentifier(GenerateServiceIdentifier(allocID, task.Name)) - cs.SetAddrFinder(task.FindHostAndPortFor) - if err := cs.SyncServices(task.Services); err != nil { - t.Fatalf("err: %v", err) - } - defer cs.Shutdown() - - service1ID := service1.ID(GenerateServiceIdentifier(allocID, task.Name)) - service2ID := service2.ID(GenerateServiceIdentifier(allocID, task.Name)) - if err := servicesPresent(t, []string{service1ID, service2ID}, cs); err != nil { - t.Fatalf("err : %v", err) - } - if err := checksPresent(t, []string{check1.Hash(service1ID)}, cs); err != nil { - t.Fatalf("err : %v", err) - } -} - -func TestConsulServiceUpdateService(t *testing.T) { - cs, err := NewConsulService(&ConsulConfig{}, logger) - if err != nil { - t.Fatalf("Err: %v", err) - } - // Skipping the test if consul isn't present - if !cs.consulPresent() { - return - } - - task := mockTask() - cs.SetServiceIdentifier(GenerateServiceIdentifier(allocID, task.Name)) - cs.SetAddrFinder(task.FindHostAndPortFor) - if err := cs.SyncServices(task.Services); err != nil { - t.Fatalf("err: %v", err) - } - defer cs.Shutdown() - - //Update Service defn 1 - newTags := []string{"tag3"} - task.Services[0].Tags = newTags - if err := cs.SyncServices(task.Services); err != nil { - t.Fatalf("err: %v", err) - } - // Make sure all the services and checks are still present - service1ID := service1.ID(GenerateServiceIdentifier(allocID, task.Name)) - service2ID := service2.ID(GenerateServiceIdentifier(allocID, task.Name)) - if err := servicesPresent(t, []string{service1ID, service2ID}, cs); err != nil { - t.Fatalf("err : %v", err) - } - if err := checksPresent(t, []string{check1.Hash(service1ID)}, cs); err != nil { - t.Fatalf("err : %v", err) - } - - // check if service defn 1 has been updated - services, err := cs.client.Agent().Services() - if err != nil { - t.Fatalf("errL: %v", err) - } - srv, _ := services[service1ID] - if !reflect.DeepEqual(srv.Tags, newTags) { - t.Fatalf("expected tags: %v, actual: %v", newTags, srv.Tags) - } -} - -func servicesPresent(t *testing.T, serviceIDs []string, consulService *ConsulService) error { - var mErr multierror.Error - services, err := consulService.client.Agent().Services() - if err != nil { - t.Fatalf("err: %v", err) - } - - for _, serviceID := range serviceIDs { - if _, ok := services[serviceID]; !ok { - mErr.Errors = append(mErr.Errors, fmt.Errorf("service ID %q not synced", serviceID)) - } - } - return mErr.ErrorOrNil() -} - -func checksPresent(t *testing.T, checkIDs []string, consulService *ConsulService) error { - var mErr multierror.Error - checks, err := consulService.client.Agent().Checks() - if err != nil { - t.Fatalf("err: %v", err) - } - - for _, checkID := range checkIDs { - if _, ok := checks[checkID]; !ok { - mErr.Errors = append(mErr.Errors, fmt.Errorf("check ID %q not synced", checkID)) - } - } - return mErr.ErrorOrNil() -} - -func mockTask() *structs.Task { - task := structs.Task{ - Name: "foo", - Services: []*structs.Service{&service1, &service2}, - Resources: &structs.Resources{ - Networks: []*structs.NetworkResource{ - &structs.NetworkResource{ - IP: "10.10.11.5", - DynamicPorts: []structs.Port{ - structs.Port{ - Label: "port1", - Value: 20002, - }, - structs.Port{ - Label: "port2", - Value: 20003, - }, - }, - }, - }, - }, - } - return &task -} diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index c6e4b9655fd..ba4deb32f4e 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -21,12 +21,13 @@ import ( "github.com/shirou/gopsutil/process" "github.com/hashicorp/nomad/client/allocdir" - "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/driver/env" "github.com/hashicorp/nomad/client/driver/logging" cstructs "github.com/hashicorp/nomad/client/driver/structs" "github.com/hashicorp/nomad/client/stats" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/structs/config" ) const ( @@ -34,6 +35,9 @@ const ( // tree for finding out the pids that the executor and it's child processes // have forked pidScanInterval = 5 * time.Second + + // serviceRegPrefix is the prefix the entire Executor should use + serviceRegPrefix = "executor" ) var ( @@ -58,10 +62,11 @@ type Executor interface { Stats() (*cstructs.TaskResourceUsage, error) } -// ConsulContext holds context to configure the consul client and run checks +// ConsulContext holds context to configure the Consul client and run checks type ConsulContext struct { - // ConsulConfig is the configuration used to create a consul client - ConsulConfig *consul.ConsulConfig + // ConsulConfig contains the configuration information for talking + // with this Nomad Agent's Consul Agent. + ConsulConfig *config.ConsulConfig // ContainerID is the ID of the container ContainerID string @@ -181,6 +186,8 @@ type UniversalExecutor struct { lro *logging.FileRotator rotatorLock sync.Mutex + shutdownCh chan struct{} + syslogServer *logging.SyslogServer syslogChan chan *logging.SyslogMessage @@ -188,7 +195,7 @@ type UniversalExecutor struct { cgPaths map[string]string cgLock sync.Mutex - consulService *consul.ConsulService + consulSyncer *consul.Syncer consulCtx *ConsulContext totalCpuStats *stats.CpuStats userCpuStats *stats.CpuStats @@ -352,11 +359,10 @@ func (e *UniversalExecutor) UpdateTask(task *structs.Task) error { e.lre.MaxFiles = task.LogConfig.MaxFiles e.lre.FileSize = fileSize - // Re-syncing task with consul service - if e.consulService != nil { - if err := e.consulService.SyncServices(task.Services); err != nil { - return err - } + // Re-syncing task with Consul agent + if e.consulSyncer != nil { + e.interpolateServices(e.ctx.Task) + e.consulSyncer.SetServices(e.ctx.AllocID, task.ConsulServices) } return nil } @@ -410,6 +416,10 @@ func (e *UniversalExecutor) Exit() error { e.lre.Close() e.lro.Close() + if e.consulSyncer != nil { + e.consulSyncer.Shutdown() + } + // If the executor did not launch a process, return. if e.command == nil { return nil @@ -469,21 +479,26 @@ func (e *UniversalExecutor) ShutDown() error { func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { e.logger.Printf("[INFO] executor: registering services") e.consulCtx = ctx - if e.consulService == nil { - cs, err := consul.NewConsulService(ctx.ConsulConfig, e.logger) + if e.consulSyncer == nil { + cs, err := consul.NewSyncer(ctx.ConsulConfig, e.shutdownCh, e.logger) if err != nil { return err } cs.SetDelegatedChecks(e.createCheckMap(), e.createCheck) - cs.SetServiceIdentifier(consul.GenerateServiceIdentifier(e.ctx.AllocID, e.ctx.Task.Name)) + cs.SetServiceRegPrefix(serviceRegPrefix) cs.SetAddrFinder(e.ctx.Task.FindHostAndPortFor) - e.consulService = cs + e.consulSyncer = cs + go e.consulSyncer.Run() } if e.ctx != nil { - e.interpolateServices(e.ctx.Task) + syncerFn := func() error { + e.interpolateServices(e.ctx.Task) + e.consulSyncer.SetServices(e.ctx.AllocID, e.ctx.Task.ConsulServices) + return nil + } + e.consulSyncer.AddPeriodicHandler(e.ctx.AllocID, syncerFn) } - err := e.consulService.SyncServices(e.ctx.Task.Services) - go e.consulService.PeriodicSync() + err := e.consulSyncer.SyncServices() // Attempt to register immediately return err } @@ -491,8 +506,8 @@ func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { // running from Consul func (e *UniversalExecutor) DeregisterServices() error { e.logger.Printf("[INFO] executor: de-registering services and shutting down consul service") - if e.consulService != nil { - return e.consulService.Shutdown() + if e.consulSyncer != nil { + return e.consulSyncer.Shutdown() } return nil } @@ -680,7 +695,7 @@ func (e *UniversalExecutor) createCheck(check *structs.ServiceCheck, checkID str // task's environment. func (e *UniversalExecutor) interpolateServices(task *structs.Task) { e.ctx.TaskEnv.Build() - for _, service := range task.Services { + for _, service := range task.ConsulServices { for _, check := range service.Checks { if check.Type == structs.ServiceCheckScript { check.Name = e.ctx.TaskEnv.ReplaceEnv(check.Name) diff --git a/client/driver/executor/executor_test.go b/client/driver/executor/executor_test.go index de479970e5d..ad96a4c5066 100644 --- a/client/driver/executor/executor_test.go +++ b/client/driver/executor/executor_test.go @@ -338,18 +338,18 @@ func TestExecutorInterpolateServices(t *testing.T) { executor.(*UniversalExecutor).ctx = ctx executor.(*UniversalExecutor).interpolateServices(task) expectedTags := []string{"pci:true", "datacenter:dc1"} - if !reflect.DeepEqual(task.Services[0].Tags, expectedTags) { - t.Fatalf("expected: %v, actual: %v", expectedTags, task.Services[0].Tags) + if !reflect.DeepEqual(task.ConsulServices[0].Tags, expectedTags) { + t.Fatalf("expected: %v, actual: %v", expectedTags, task.ConsulServices[0].Tags) } expectedCheckCmd := "/usr/local/check-table-mysql" expectedCheckArgs := []string{"5.6"} - if !reflect.DeepEqual(task.Services[0].Checks[0].Command, expectedCheckCmd) { - t.Fatalf("expected: %v, actual: %v", expectedCheckCmd, task.Services[0].Checks[0].Command) + if !reflect.DeepEqual(task.ConsulServices[0].Checks[0].Command, expectedCheckCmd) { + t.Fatalf("expected: %v, actual: %v", expectedCheckCmd, task.ConsulServices[0].Checks[0].Command) } - if !reflect.DeepEqual(task.Services[0].Checks[0].Args, expectedCheckArgs) { - t.Fatalf("expected: %v, actual: %v", expectedCheckArgs, task.Services[0].Checks[0].Args) + if !reflect.DeepEqual(task.ConsulServices[0].Checks[0].Args, expectedCheckArgs) { + t.Fatalf("expected: %v, actual: %v", expectedCheckArgs, task.ConsulServices[0].Checks[0].Args) } } diff --git a/client/driver/utils.go b/client/driver/utils.go index ce1160a79f1..562e3165e5f 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -12,7 +12,6 @@ import ( "github.com/hashicorp/go-multierror" "github.com/hashicorp/go-plugin" "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/driver/executor" "github.com/hashicorp/nomad/client/driver/logging" cstructs "github.com/hashicorp/nomad/client/driver/structs" @@ -73,18 +72,8 @@ func createLogCollector(config *plugin.ClientConfig, w io.Writer, } func consulContext(clientConfig *config.Config, containerID string) *executor.ConsulContext { - cfg := consul.ConsulConfig{ - Addr: clientConfig.ReadDefault("consul.address", "127.0.0.1:8500"), - Token: clientConfig.Read("consul.token"), - Auth: clientConfig.Read("consul.auth"), - EnableSSL: clientConfig.ReadBoolDefault("consul.ssl", false), - VerifySSL: clientConfig.ReadBoolDefault("consul.verifyssl", true), - CAFile: clientConfig.Read("consul.tls_ca_file"), - CertFile: clientConfig.Read("consul.tls_cert_file"), - KeyFile: clientConfig.Read("consul.tls_key_file"), - } return &executor.ConsulContext{ - ConsulConfig: &cfg, + ConsulConfig: clientConfig.ConsulConfig, ContainerID: containerID, DockerEndpoint: clientConfig.Read("docker.endpoint"), TLSCa: clientConfig.Read("docker.tls.ca"), diff --git a/client/fingerprint/consul.go b/client/fingerprint/consul.go index b15f7bac399..14932460149 100644 --- a/client/fingerprint/consul.go +++ b/client/fingerprint/consul.go @@ -38,16 +38,11 @@ func (f *ConsulFingerprint) Fingerprint(config *client.Config, node *structs.Nod // Only create the client once to avoid creating too many connections to // Consul. if f.client == nil { - address := config.ReadDefault("consul.address", "127.0.0.1:8500") - timeout, err := time.ParseDuration(config.ReadDefault("consul.timeout", "10ms")) + consulConfig, err := config.ConsulConfig.ApiConfig() if err != nil { - return false, fmt.Errorf("Unable to parse consul.timeout: %s", err) + return false, fmt.Errorf("Failed to initialize the Consul client config: %v", err) } - consulConfig := consul.DefaultConfig() - consulConfig.Address = address - consulConfig.HttpClient.Timeout = timeout - f.client, err = consul.NewClient(consulConfig) if err != nil { return false, fmt.Errorf("Failed to initialize consul client: %s", err) diff --git a/client/fingerprint/consul_test.go b/client/fingerprint/consul_test.go index 2557d0f271e..c1ab3acf483 100644 --- a/client/fingerprint/consul_test.go +++ b/client/fingerprint/consul_test.go @@ -4,6 +4,7 @@ import ( "fmt" "net/http" "net/http/httptest" + "os" "testing" "github.com/hashicorp/nomad/client/config" @@ -11,6 +12,11 @@ import ( ) func TestConsulFingerprint(t *testing.T) { + addr := os.Getenv("CONSUL_HTTP_ADDR") + if addr == "" { + t.Skipf("No consul process running, skipping test") + } + fp := NewConsulFingerprint(testLogger()) node := &structs.Node{ Attributes: make(map[string]string), @@ -22,14 +28,9 @@ func TestConsulFingerprint(t *testing.T) { })) defer ts.Close() - consulConfig := &config.Config{ - Options: map[string]string{ - // Split off "http://" - "consul.address": ts.URL[7:], - }, - } + config := config.DefaultConfig() - ok, err := fp.Fingerprint(consulConfig, node) + ok, err := fp.Fingerprint(config, node) if err != nil { t.Fatalf("Failed to fingerprint: %s", err) } @@ -43,9 +44,8 @@ func TestConsulFingerprint(t *testing.T) { assertNodeAttributeContains(t, node, "unique.consul.name") assertNodeAttributeContains(t, node, "consul.datacenter") - expectedLink := "vagrant.consul2" - if node.Links["consul"] != expectedLink { - t.Errorf("Expected consul link: %s\nFound links: %#v", expectedLink, node.Links) + if _, ok := node.Links["consul"]; !ok { + t.Errorf("Expected a link to consul, none found") } } @@ -151,9 +151,7 @@ const mockConsulResponse = ` "expect": "3", "port": "8300", "role": "consul", - "vsn": "2", - "vsn_max": "2", - "vsn_min": "1" + "vsn": "2" }, "Status": 1, "ProtocolMin": 1, diff --git a/client/rpcproxy/rpcproxy.go b/client/rpcproxy/rpcproxy.go new file mode 100644 index 00000000000..0e8c7604127 --- /dev/null +++ b/client/rpcproxy/rpcproxy.go @@ -0,0 +1,779 @@ +// Package rpcproxy provides a proxy interface to Nomad Servers. The +// RPCProxy periodically shuffles which server a Nomad Client communicates +// with in order to redistribute load across Nomad Servers. Nomad Servers +// that fail an RPC request are automatically cycled to the end of the list +// until the server list is reshuffled. +// +// The rpcproxy package does not provide any external API guarantees and +// should be called only by `hashicorp/nomad`. +package rpcproxy + +import ( + "fmt" + "log" + "math/rand" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/hashicorp/consul/lib" + "github.com/hashicorp/nomad/nomad/structs" +) + +const ( + // clientRPCJitterFraction determines the amount of jitter added to + // clientRPCMinReuseDuration before a connection is expired and a new + // connection is established in order to rebalance load across Nomad + // servers. The cluster-wide number of connections per second from + // rebalancing is applied after this jitter to ensure the CPU impact + // is always finite. See newRebalanceConnsPerSecPerServer's comment + // for additional commentary. + // + // For example, in a 10K Nomad cluster with 5x servers, this default + // averages out to ~13 new connections from rebalancing per server + // per second. + clientRPCJitterFraction = 2 + + // clientRPCMinReuseDuration controls the minimum amount of time RPC + // queries are sent over an established connection to a single server + clientRPCMinReuseDuration = 600 * time.Second + + // Limit the number of new connections a server receives per second + // for connection rebalancing. This limit caps the load caused by + // continual rebalancing efforts when a cluster is in equilibrium. A + // lower value comes at the cost of increased recovery time after a + // partition. This parameter begins to take effect when there are + // more than ~48K clients querying 5x servers or at lower server + // counts when there is a partition. + // + // For example, in a 100K Nomad cluster with 5x servers, it will take + // ~5min for all servers to rebalance their connections. If 99,995 + // agents are in the minority talking to only one server, it will + // take ~26min for all servers to rebalance. A 10K cluster in the + // same scenario will take ~2.6min to rebalance. + newRebalanceConnsPerSecPerServer = 64 + + // rpcAPIMismatchLogRate determines the rate at which log entries are + // emitted when the client and server's API versions are mismatched. + rpcAPIMismatchLogRate = 3 * time.Hour +) + +// NomadConfigInfo is an interface wrapper around this Nomad Agent's +// configuration to prevents a cyclic import dependency. +type NomadConfigInfo interface { + Datacenter() string + RPCMajorVersion() int + RPCMinorVersion() int + Region() string +} + +// Pinger is an interface wrapping client.ConnPool to prevent a +// cyclic import dependency +type Pinger interface { + PingNomadServer(region string, apiMajorVersion int, s *ServerEndpoint) (bool, error) +} + +// serverList is an array of Nomad Servers. The first server in the list is +// the active server. +// +// NOTE(sean@): We are explicitly relying on the fact that serverList will be +// copied onto the stack by atomic.Value. Please keep this structure light. +type serverList struct { + L []*ServerEndpoint +} + +// RPCProxy is the manager type responsible for returning and managing Nomad +// addresses. +type RPCProxy struct { + // activatedList manages the list of Nomad Servers that are eligible + // to be queried by the Client agent. + activatedList atomic.Value + activatedListLock sync.Mutex + + // primaryServers is a list of servers found in the last heartbeat. + // primaryServers are periodically reshuffled. Covered by + // serverListLock. + primaryServers serverList + + // backupServers is a list of fallback servers. These servers are + // appended to the RPCProxy's serverList, but are never shuffled with + // the list of servers discovered via the Nomad heartbeat. Covered + // by serverListLock. + backupServers serverList + + // serverListLock covers both backupServers and primaryServers. If + // it is necessary to hold serverListLock and listLock, obtain an + // exclusive lock on serverListLock before listLock. + serverListLock sync.RWMutex + + leaderAddr string + numNodes int + + // rebalanceTimer controls the duration of the rebalance interval + rebalanceTimer *time.Timer + + // shutdownCh is a copy of the channel in nomad.Client + shutdownCh chan struct{} + + logger *log.Logger + + configInfo NomadConfigInfo + + // rpcAPIMismatchThrottle regulates the rate at which warning + // messages are emitted in the event of an API mismatch between the + // clients and servers. + rpcAPIMismatchThrottle map[string]time.Time + + // connPoolPinger is used to test the health of a server in the + // connection pool. Pinger is an interface that wraps + // client.ConnPool. + connPoolPinger Pinger +} + +// NewRPCProxy is the only way to safely create a new RPCProxy. +func NewRPCProxy(logger *log.Logger, shutdownCh chan struct{}, configInfo NomadConfigInfo, connPoolPinger Pinger) *RPCProxy { + p := &RPCProxy{ + logger: logger, + configInfo: configInfo, // can't pass *nomad.Client: import cycle + connPoolPinger: connPoolPinger, // can't pass *nomad.ConnPool: import cycle + rebalanceTimer: time.NewTimer(clientRPCMinReuseDuration), + shutdownCh: shutdownCh, + } + + l := serverList{} + l.L = make([]*ServerEndpoint, 0) + p.saveServerList(l) + return p +} + +// activateEndpoint adds an endpoint to the RPCProxy's active serverList. +// Returns true if the server was added, returns false if the server already +// existed in the RPCProxy's serverList. +func (p *RPCProxy) activateEndpoint(s *ServerEndpoint) bool { + l := p.getServerList() + + // Check if this server is known + found := false + for idx, existing := range l.L { + if existing.Name == s.Name { + newServers := make([]*ServerEndpoint, len(l.L)) + copy(newServers, l.L) + + // Overwrite the existing server details in order to + // possibly update metadata (e.g. server version) + newServers[idx] = s + + l.L = newServers + found = true + break + } + } + + // Add to the list if not known + if !found { + newServers := make([]*ServerEndpoint, len(l.L), len(l.L)+1) + copy(newServers, l.L) + newServers = append(newServers, s) + l.L = newServers + } + + p.saveServerList(l) + + return !found +} + +// SetBackupServers sets a list of Nomad Servers to be used in the event that +// the Nomad Agent lost contact with the list of Nomad Servers provided via +// the Nomad Agent's heartbeat. If available, the backup servers are +// populated via Consul. +func (p *RPCProxy) SetBackupServers(addrs []string) error { + l := make([]*ServerEndpoint, 0, len(addrs)) + for _, s := range addrs { + s, err := NewServerEndpoint(s) + if err != nil { + p.logger.Printf("[WARN] client.rpcproxy: unable to create backup server %+q: %v", s, err) + return fmt.Errorf("unable to create new backup server from %+q: %v", s, err) + } + l = append(l, s) + } + + p.serverListLock.Lock() + p.backupServers.L = l + p.serverListLock.Unlock() + + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() + for _, s := range l { + p.activateEndpoint(s) + } + + return nil +} + +// AddPrimaryServer takes the RPC address of a Nomad server, creates a new +// endpoint, and adds it to both the primaryServers list and the active +// serverList used in the RPC Proxy. If the endpoint is not known by the +// RPCProxy, appends the endpoint to the list. The new endpoint will begin +// seeing use after the rebalance timer fires (or enough servers fail +// organically). Any values in the primary server list are overridden by the +// next successful heartbeat. +func (p *RPCProxy) AddPrimaryServer(rpcAddr string) *ServerEndpoint { + s, err := NewServerEndpoint(rpcAddr) + if err != nil { + p.logger.Printf("[WARN] client.rpcproxy: unable to create new primary server from endpoint %+q: %v", rpcAddr, err) + return nil + } + + k := s.Key() + p.serverListLock.Lock() + if serverExists := p.primaryServers.serverExistByKey(k); serverExists { + p.serverListLock.Unlock() + return s + } + p.primaryServers.L = append(p.primaryServers.L, s) + p.serverListLock.Unlock() + + p.activatedListLock.Lock() + p.activateEndpoint(s) + p.activatedListLock.Unlock() + + return s +} + +// cycleServers returns a new list of servers that has dequeued the first +// server and enqueued it at the end of the list. cycleServers assumes the +// caller is holding the listLock. cycleServer does not test or ping +// the next server inline. cycleServer may be called when the environment +// has just entered an unhealthy situation and blocking on a server test is +// less desirable than just returning the next server in the firing line. If +// the next server fails, it will fail fast enough and cycleServer will be +// called again. +func (l *serverList) cycleServer() (servers []*ServerEndpoint) { + numServers := len(l.L) + if numServers < 2 { + return servers // No action required + } + + newServers := make([]*ServerEndpoint, 0, numServers) + newServers = append(newServers, l.L[1:]...) + newServers = append(newServers, l.L[0]) + + return newServers +} + +// serverExistByKey performs a search to see if a server exists in the +// serverList. Assumes the caller is holding at least a read lock. +func (l *serverList) serverExistByKey(targetKey *EndpointKey) bool { + var found bool + for _, server := range l.L { + if targetKey.Equal(server.Key()) { + found = true + } + } + return found +} + +// removeServerByKey performs an inline removal of the first matching server +func (l *serverList) removeServerByKey(targetKey *EndpointKey) { + for i, s := range l.L { + if targetKey.Equal(s.Key()) { + copy(l.L[i:], l.L[i+1:]) + l.L[len(l.L)-1] = nil + l.L = l.L[:len(l.L)-1] + return + } + } +} + +// shuffleServers shuffles the server list in place +func (l *serverList) shuffleServers() { + for i := len(l.L) - 1; i > 0; i-- { + j := rand.Int31n(int32(i + 1)) + l.L[i], l.L[j] = l.L[j], l.L[i] + } +} + +// String returns a string representation of serverList +func (l *serverList) String() string { + if len(l.L) == 0 { + return fmt.Sprintf("empty server list") + } + + serverStrs := make([]string, 0, len(l.L)) + for _, server := range l.L { + serverStrs = append(serverStrs, server.String()) + } + + return fmt.Sprintf("[%s]", strings.Join(serverStrs, ", ")) +} + +// FindServer takes out an internal "read lock" and searches through the list +// of servers to find a "healthy" server. If the server is actually +// unhealthy, we rely on heartbeats to detect this and remove the node from +// the server list. If the server at the front of the list has failed or +// fails during an RPC call, it is rotated to the end of the list. If there +// are no servers available, return nil. +func (p *RPCProxy) FindServer() *ServerEndpoint { + l := p.getServerList() + numServers := len(l.L) + if numServers == 0 { + p.logger.Printf("[WARN] client.rpcproxy: No servers available") + return nil + } + + // Return whatever is at the front of the list because it is + // assumed to be the oldest in the server list (unless - + // hypothetically - the server list was rotated right after a + // server was added). + return l.L[0] +} + +// getServerList is a convenience method which hides the locking semantics +// of atomic.Value from the caller. +func (p *RPCProxy) getServerList() serverList { + return p.activatedList.Load().(serverList) +} + +// saveServerList is a convenience method which hides the locking semantics +// of atomic.Value from the caller. +func (p *RPCProxy) saveServerList(l serverList) { + p.activatedList.Store(l) +} + +// LeaderAddr returns the current leader address. If an empty string, then +// the Nomad Server for this Nomad Agent is in the minority or the Nomad +// Servers are in the middle of an election. +func (p *RPCProxy) LeaderAddr() string { + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() + return p.leaderAddr +} + +// NotifyFailedServer marks the passed in server as "failed" by rotating it +// to the end of the server list. +func (p *RPCProxy) NotifyFailedServer(s *ServerEndpoint) { + l := p.getServerList() + + // If the server being failed is not the first server on the list, + // this is a noop. If, however, the server is failed and first on + // the list, acquire the lock, retest, and take the penalty of moving + // the server to the end of the list. + + // Only rotate the server list when there is more than one server + if len(l.L) > 1 && l.L[0] == s { + // Grab a lock, retest, and take the hit of cycling the first + // server to the end. + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() + l = p.getServerList() + + if len(l.L) > 1 && l.L[0] == s { + l.L = l.cycleServer() + p.saveServerList(l) + } + } +} + +// NumNodes returns the estimated number of nodes according to the last Nomad +// Heartbeat. +func (p *RPCProxy) NumNodes() int { + return p.numNodes +} + +// NumServers takes out an internal "read lock" and returns the number of +// servers. numServers includes both healthy and unhealthy servers. +func (p *RPCProxy) NumServers() int { + l := p.getServerList() + return len(l.L) +} + +// RebalanceServers shuffles the list of servers on this agent. The server +// at the front of the list is selected for the next RPC. RPC calls that +// fail for a particular server are rotated to the end of the list. This +// method reshuffles the list periodically in order to redistribute work +// across all known Nomad servers (i.e. guarantee that the order of servers +// in the server list is not positively correlated with the age of a server +// in the Nomad cluster). Periodically shuffling the server list prevents +// long-lived clients from fixating on long-lived servers. +// +// Unhealthy servers are removed from the server list during the next client +// heartbeat. Before the newly shuffled server list is saved, the new remote +// endpoint is tested to ensure its responsive. +func (p *RPCProxy) RebalanceServers() { + var serverListLocked bool + p.serverListLock.Lock() + serverListLocked = true + defer func() { + if serverListLocked { + p.serverListLock.Unlock() + } + }() + + // Early abort if there is nothing to shuffle + if (len(p.primaryServers.L) + len(p.backupServers.L)) < 2 { + return + } + + // Shuffle server lists independently + p.primaryServers.shuffleServers() + p.backupServers.shuffleServers() + + // Create a new merged serverList + type targetServer struct { + server *ServerEndpoint + // 'p' == Primary Server + // 's' == Secondary/Backup Server + // 'b' == Both + state byte + } + mergedList := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(p.backupServers.L)) + for _, s := range p.primaryServers.L { + mergedList[*s.Key()] = &targetServer{server: s, state: 'p'} + } + for _, s := range p.backupServers.L { + k := s.Key() + _, found := mergedList[*k] + if found { + mergedList[*k].state = 'b' + } else { + mergedList[*k] = &targetServer{server: s, state: 's'} + } + } + + l := &serverList{L: make([]*ServerEndpoint, 0, len(mergedList))} + for _, s := range p.primaryServers.L { + l.L = append(l.L, s) + } + for _, v := range mergedList { + if v.state != 's' { + continue + } + l.L = append(l.L, v.server) + } + + // Release the lock before we begin transition to operations on the + // network timescale and attempt to ping servers. A copy of the + // servers has been made at this point. + p.serverListLock.Unlock() + serverListLocked = false + + // Iterate through the shuffled server list to find an assumed + // healthy server. NOTE: Do not iterate on the list directly because + // this loop mutates the server list in-place. + var foundHealthyServer bool + for i := 0; i < len(l.L); i++ { + // Always test the first server. Failed servers are cycled + // and eventually removed from the list when Nomad heartbeats + // detect the failed node. + selectedServer := l.L[0] + + ok, err := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RPCMajorVersion(), selectedServer) + if ok { + foundHealthyServer = true + break + } + p.logger.Printf(`[DEBUG] client.rpcproxy: pinging server "%s" failed: %s`, selectedServer.String(), err) + + l.cycleServer() + } + + // If no healthy servers were found, sleep and wait for the admin to + // join this node to a server and begin receiving heartbeats with an + // updated list of Nomad servers. Or Consul will begin advertising a + // new server in the nomad service (Nomad server service). + if !foundHealthyServer { + p.logger.Printf("[DEBUG] client.rpcproxy: No healthy servers during rebalance, aborting") + return + } + + // Verify that all servers are present. Reconcile will save the + // final serverList. + if p.reconcileServerList(l) { + p.logger.Printf("[TRACE] client.rpcproxy: Rebalanced %d servers, next active server is %s", len(l.L), l.L[0].String()) + } else { + // reconcileServerList failed because Nomad removed the + // server that was at the front of the list that had + // successfully been Ping'ed. Between the Ping and + // reconcile, a Nomad heartbeat removed the node. + // + // Instead of doing any heroics, "freeze in place" and + // continue to use the existing connection until the next + // rebalance occurs. + } + + return +} + +// reconcileServerList returns true when the first server in serverList +// (l) exists in the receiver's serverList (p). If true, the merged +// serverList (l) is stored as the receiver's serverList (p). Returns +// false if the first server in p does not exist in the passed in list (l) +// (i.e. was removed by Nomad during a PingNomadServer() call. Newly added +// servers are appended to the list and other missing servers are removed +// from the list. +func (p *RPCProxy) reconcileServerList(l *serverList) bool { + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() + + // newServerList is a serverList that has been kept up-to-date with + // join and leave events. + newServerList := p.getServerList() + + // If a Nomad heartbeat removed all nodes, or there is no selected + // server (zero nodes in serverList), abort early. + if len(newServerList.L) == 0 || len(l.L) == 0 { + return false + } + + type targetServer struct { + server *ServerEndpoint + + // 'b' == both + // 'o' == original + // 'n' == new + state byte + } + mergedList := make(map[EndpointKey]*targetServer, len(l.L)) + for _, s := range l.L { + mergedList[*s.Key()] = &targetServer{server: s, state: 'o'} + } + for _, s := range newServerList.L { + k := s.Key() + _, found := mergedList[*k] + if found { + mergedList[*k].state = 'b' + } else { + mergedList[*k] = &targetServer{server: s, state: 'n'} + } + } + + // Ensure the selected server has not been removed by a heartbeat + selectedServerKey := l.L[0].Key() + if v, found := mergedList[*selectedServerKey]; found && v.state == 'o' { + return false + } + + // Append any new servers and remove any old servers + for k, v := range mergedList { + switch v.state { + case 'b': + // Do nothing, server exists in both + case 'o': + // Server has been removed + l.removeServerByKey(&k) + case 'n': + // Server added + l.L = append(l.L, v.server) + default: + panic("unknown merge list state") + } + } + + p.saveServerList(*l) + return true +} + +// RemoveServer takes out an internal write lock and removes a server from +// the activated server list. +func (p *RPCProxy) RemoveServer(s *ServerEndpoint) { + // Lock hierarchy protocol dictates serverListLock is acquired first. + p.serverListLock.Lock() + defer p.serverListLock.Unlock() + + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() + l := p.getServerList() + + k := s.Key() + l.removeServerByKey(k) + p.saveServerList(l) + + p.primaryServers.removeServerByKey(k) + p.backupServers.removeServerByKey(k) +} + +// refreshServerRebalanceTimer is only called once p.rebalanceTimer expires. +func (p *RPCProxy) refreshServerRebalanceTimer() time.Duration { + l := p.getServerList() + numServers := len(l.L) + // Limit this connection's life based on the size (and health) of the + // cluster. Never rebalance a connection more frequently than + // connReuseLowWatermarkDuration, and make sure we never exceed + // clusterWideRebalanceConnsPerSec operations/s across numLANMembers. + clusterWideRebalanceConnsPerSec := float64(numServers * newRebalanceConnsPerSecPerServer) + connReuseLowWatermarkDuration := clientRPCMinReuseDuration + lib.RandomStagger(clientRPCMinReuseDuration/clientRPCJitterFraction) + numLANMembers := p.numNodes + connRebalanceTimeout := lib.RateScaledInterval(clusterWideRebalanceConnsPerSec, connReuseLowWatermarkDuration, numLANMembers) + + p.rebalanceTimer.Reset(connRebalanceTimeout) + return connRebalanceTimeout +} + +// ResetRebalanceTimer resets the rebalance timer. This method exists for +// testing and should not be used directly. +func (p *RPCProxy) ResetRebalanceTimer() { + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() + p.rebalanceTimer.Reset(clientRPCMinReuseDuration) +} + +// ServerRPCAddrs returns one RPC Address per server +func (p *RPCProxy) ServerRPCAddrs() []string { + l := p.getServerList() + serverAddrs := make([]string, 0, len(l.L)) + for _, s := range l.L { + serverAddrs = append(serverAddrs, s.Addr.String()) + } + return serverAddrs +} + +// Run is used to start and manage the task of automatically shuffling and +// rebalancing the list of Nomad servers. This maintenance only happens +// periodically based on the expiration of the timer. Failed servers are +// automatically cycled to the end of the list. New servers are appended to +// the list. The order of the server list must be shuffled periodically to +// distribute load across all known and available Nomad servers. +func (p *RPCProxy) Run() { + for { + select { + case <-p.rebalanceTimer.C: + p.RebalanceServers() + + p.refreshServerRebalanceTimer() + case <-p.shutdownCh: + p.logger.Printf("[INFO] client.rpcproxy: shutting down") + return + } + } +} + +// RefreshServerLists is called when the Client receives an update from a +// Nomad Server. The response from Nomad Client Heartbeats contain a list of +// Nomad Servers that the Nomad Client should use for RPC requests. +// RefreshServerLists does not rebalance its serverLists (that is handled +// elsewhere via a periodic timer). New Nomad Servers learned via the +// heartbeat are appended to the RPCProxy's activated serverList. Servers +// that are no longer present in the Heartbeat are removed immediately from +// all server lists. Nomad Servers speaking a newer major or minor API +// version are filtered from the serverList. +func (p *RPCProxy) RefreshServerLists(servers []*structs.NodeServerInfo, numNodes int32, leaderRPCAddr string) error { + // Merge all servers found in the response. Servers in the response + // with newer API versions are filtered from the list. If the list + // is missing an address found in the RPCProxy's server list, remove + // it from the RPCProxy. + + p.serverListLock.Lock() + defer p.serverListLock.Unlock() + + // Clear the backup server list when a heartbeat contains at least + // one server. + if len(servers) > 0 && len(p.backupServers.L) > 0 { + p.backupServers.L = make([]*ServerEndpoint, 0, len(servers)) + } + + // 1) Create a map to reconcile the difference between + // p.primaryServers and servers. + type targetServer struct { + server *ServerEndpoint + + // 'b' == both + // 'o' == original + // 'n' == new + state byte + } + mergedPrimaryMap := make(map[EndpointKey]*targetServer, len(p.primaryServers.L)+len(servers)) + numOldServers := 0 + for _, s := range p.primaryServers.L { + mergedPrimaryMap[*s.Key()] = &targetServer{server: s, state: 'o'} + numOldServers++ + } + numBothServers := 0 + var newServers bool + for _, s := range servers { + // Filter out servers using a newer API version. Prevent + // spamming the logs every heartbeat. + // + // TODO(sean@): Move the logging throttle logic into a + // dedicated logging package so RPCProxy does not have to + // perform this accounting. + if int32(p.configInfo.RPCMajorVersion()) < s.RPCMajorVersion || + (int32(p.configInfo.RPCMajorVersion()) == s.RPCMajorVersion && + int32(p.configInfo.RPCMinorVersion()) < s.RPCMinorVersion) { + now := time.Now() + t, ok := p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr] + if ok && t.After(now) { + continue + } + + p.logger.Printf("[WARN] client.rpcproxy: API mismatch between client version (v%d.%d) and server version (v%d.%d), ignoring server %+q", p.configInfo.RPCMajorVersion(), p.configInfo.RPCMinorVersion(), s.RPCMajorVersion, s.RPCMinorVersion, s.RPCAdvertiseAddr) + p.rpcAPIMismatchThrottle[s.RPCAdvertiseAddr] = now.Add(rpcAPIMismatchLogRate) + continue + } + + server, err := NewServerEndpoint(s.RPCAdvertiseAddr) + if err != nil { + p.logger.Printf("[WARN] client.rpcproxy: Unable to create a server from %+q: %v", s.RPCAdvertiseAddr, err) + continue + } + + // Nomad servers in different datacenters are automatically + // added to the backup server list. + if s.Datacenter != p.configInfo.Datacenter() { + p.backupServers.L = append(p.backupServers.L, server) + continue + } + + k := server.Key() + _, found := mergedPrimaryMap[*k] + if found { + mergedPrimaryMap[*k].state = 'b' + numBothServers++ + } else { + mergedPrimaryMap[*k] = &targetServer{server: server, state: 'n'} + newServers = true + } + } + + // Short-circuit acquiring listLock if nothing changed + if !newServers && numOldServers == numBothServers { + return nil + } + + p.activatedListLock.Lock() + defer p.activatedListLock.Unlock() + newServerCfg := p.getServerList() + for k, v := range mergedPrimaryMap { + switch v.state { + case 'b': + // Do nothing, server exists in both + case 'o': + // Server has been removed + + // TODO(sean@): Teach Nomad servers how to remove + // themselves from their heartbeat in order to + // gracefully drain their clients over the next + // cluster's max rebalanceTimer duration. Without + // this enhancement, if a server being shutdown and + // it is the first in serverList, the client will + // fail its next RPC connection. + p.primaryServers.removeServerByKey(&k) + newServerCfg.removeServerByKey(&k) + case 'n': + // Server added. Append it to both lists + // immediately. The server should only go into + // active use in the event of a failure or after a + // rebalance occurs. + p.primaryServers.L = append(p.primaryServers.L, v.server) + newServerCfg.L = append(newServerCfg.L, v.server) + default: + panic("unknown merge list state") + } + } + + p.numNodes = int(numNodes) + p.leaderAddr = leaderRPCAddr + p.saveServerList(newServerCfg) + + return nil +} diff --git a/client/rpcproxy/rpcproxy_test.go b/client/rpcproxy/rpcproxy_test.go new file mode 100644 index 00000000000..c6b7327e62a --- /dev/null +++ b/client/rpcproxy/rpcproxy_test.go @@ -0,0 +1,818 @@ +package rpcproxy + +import ( + "bytes" + "encoding/binary" + "fmt" + "log" + "math/rand" + "net" + "os" + "strings" + "sync/atomic" + "testing" + "time" +) + +const ( + ipv4len = 4 + nodeNameFmt = "s%03d" + defaultNomadPort = "4647" + + // Poached from RFC2544 and RFC3330 + testingNetworkCidr = "198.18.0.0/15" + testingNetworkUint32 = 3323068416 +) + +var ( + localLogger *log.Logger + localLogBuffer *bytes.Buffer + serverCount uint32 + validIp uint32 +) + +func init() { + localLogBuffer = new(bytes.Buffer) + localLogger = log.New(localLogBuffer, "", 0) +} + +func makeServerEndpointName() string { + serverNum := atomic.AddUint32(&serverCount, 1) + validIp := testingNetworkUint32 + serverNum + ipv4 := make(net.IP, ipv4len) + binary.BigEndian.PutUint32(ipv4, validIp) + return net.JoinHostPort(ipv4.String(), defaultNomadPort) +} + +func GetBufferedLogger() *log.Logger { + return localLogger +} + +type fauxConnPool struct { + // failPct between 0.0 and 1.0 == pct of time a Ping should fail + failPct float64 +} + +func (cp *fauxConnPool) PingNomadServer(region string, majorVersion int, s *ServerEndpoint) (bool, error) { + var success bool + successProb := rand.Float64() + if successProb > cp.failPct { + success = true + } + return success, nil +} + +type fauxSerf struct { + datacenter string + numNodes int + region string + rpcMinorVersion int + rpcMajorVersion int +} + +func (s *fauxSerf) NumNodes() int { + return s.numNodes +} + +func (s *fauxSerf) Region() string { + return s.region +} + +func (s *fauxSerf) Datacenter() string { + return s.datacenter +} + +func (s *fauxSerf) RPCMajorVersion() int { + return s.rpcMajorVersion +} + +func (s *fauxSerf) RPCMinorVersion() int { + return s.rpcMinorVersion +} + +func testRPCProxy() (p *RPCProxy) { + logger := GetBufferedLogger() + logger = log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + p = NewRPCProxy(logger, shutdownCh, &fauxSerf{numNodes: 16384}, &fauxConnPool{}) + return p +} + +func testRPCProxyFailProb(failPct float64) (p *RPCProxy) { + logger := GetBufferedLogger() + logger = log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + p = NewRPCProxy(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct}) + return p +} + +// func (p *RPCProxy) AddPrimaryServer(server *ServerEndpoint) { +func TestRPCProxy_AddPrimaryServer(t *testing.T) { + p := testRPCProxy() + var num int + num = p.NumServers() + if num != 0 { + t.Fatalf("Expected zero servers to start") + } + + s1Endpoint := makeServerEndpointName() + s1 := p.AddPrimaryServer(s1Endpoint) + num = p.NumServers() + if num != 1 { + t.Fatalf("Expected one server") + } + if s1 == nil { + t.Fatalf("bad") + } + if s1.Name != s1Endpoint { + t.Fatalf("bad") + } + + s1 = p.AddPrimaryServer(s1Endpoint) + num = p.NumServers() + if num != 1 { + t.Fatalf("Expected one server (still)") + } + if s1 == nil { + t.Fatalf("bad") + } + if s1.Name != s1Endpoint { + t.Fatalf("bad") + } + + s2Endpoint := makeServerEndpointName() + s2 := p.AddPrimaryServer(s2Endpoint) + num = p.NumServers() + if num != 2 { + t.Fatalf("Expected two servers") + } + if s2 == nil { + t.Fatalf("bad") + } + if s2.Name != s2Endpoint { + t.Fatalf("bad") + } +} + +// func (p *RPCProxy) FindServer() (server *ServerEndpoint) { +func TestRPCProxy_FindServer(t *testing.T) { + p := testRPCProxy() + + if p.FindServer() != nil { + t.Fatalf("Expected nil return") + } + + s1Endpoint := makeServerEndpointName() + p.AddPrimaryServer(s1Endpoint) + if p.NumServers() != 1 { + t.Fatalf("Expected one server") + } + + s1 := p.FindServer() + if s1 == nil { + t.Fatalf("Expected non-nil server") + } + if s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server") + } + + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server (still)") + } + + s2Endpoint := makeServerEndpointName() + p.AddPrimaryServer(s2Endpoint) + if p.NumServers() != 2 { + t.Fatalf("Expected two servers") + } + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server (still)") + } + + p.NotifyFailedServer(s1) + s2 := p.FindServer() + if s2 == nil || s2.Name != s2Endpoint { + t.Fatalf("Expected s2 server") + } + + p.NotifyFailedServer(s2) + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server") + } +} + +// func New(logger *log.Logger, shutdownCh chan struct{}) (p *RPCProxy) { +func TestRPCProxy_New(t *testing.T) { + logger := GetBufferedLogger() + logger = log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + p := NewRPCProxy(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{}) + if p == nil { + t.Fatalf("RPCProxy nil") + } +} + +// func (p *RPCProxy) NotifyFailedServer(server *ServerEndpoint) { +func TestRPCProxy_NotifyFailedServer(t *testing.T) { + p := testRPCProxy() + + if p.NumServers() != 0 { + t.Fatalf("Expected zero servers to start") + } + + // Try notifying for a server that is not managed by RPCProxy + s1Endpoint := makeServerEndpointName() + s1 := p.AddPrimaryServer(s1Endpoint) + if s1 == nil { + t.Fatalf("bad") + } + if p.NumServers() != 1 { + t.Fatalf("bad") + } + p.RemoveServer(s1) + if p.NumServers() != 0 { + t.Fatalf("bad") + } + p.NotifyFailedServer(s1) + s1 = p.AddPrimaryServer(s1Endpoint) + + // Test again w/ a server not in the list + s2Endpoint := makeServerEndpointName() + s2 := p.AddPrimaryServer(s2Endpoint) + if s2 == nil { + t.Fatalf("bad") + } + if p.NumServers() != 2 { + t.Fatalf("bad") + } + p.RemoveServer(s2) + if p.NumServers() != 1 { + t.Fatalf("bad") + } + p.NotifyFailedServer(s2) + if p.NumServers() != 1 { + t.Fatalf("Expected one server") + } + + // Re-add s2 so there are two servers in the RPCProxy server list + s2 = p.AddPrimaryServer(s2Endpoint) + if p.NumServers() != 2 { + t.Fatalf("Expected two servers") + } + + // Find the first server, it should be s1 + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server") + } + + // Notify s2 as failed, s1 should still be first + p.NotifyFailedServer(s2) + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server (still)") + } + + // Fail s1, s2 should be first + p.NotifyFailedServer(s1) + s2 = p.FindServer() + if s2 == nil || s2.Name != s2Endpoint { + t.Fatalf("Expected s2 server") + } + + // Fail s2, s1 should be first + p.NotifyFailedServer(s2) + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server") + } +} + +// func (p *RPCProxy) NumServers() (numServers int) { +func TestRPCProxy_NumServers(t *testing.T) { + p := testRPCProxy() + const maxNumServers = 100 + serverList := make([]*ServerEndpoint, 0, maxNumServers) + + // Add some servers + for i := 0; i < maxNumServers; i++ { + num := p.NumServers() + if num != i { + t.Fatalf("%d: Expected %d servers", i, num) + } + serverName := makeServerEndpointName() + s := p.AddPrimaryServer(serverName) + if s == nil { + t.Fatalf("Expected server from %+q", serverName) + } + serverList = append(serverList, s) + + num = p.NumServers() + if num != i+1 { + t.Fatalf("%d: Expected %d servers", i, num+1) + } + } + + // Remove some servers + for i := maxNumServers; i > 0; i-- { + num := p.NumServers() + if num != i { + t.Fatalf("%d: Expected %d servers", i, num) + } + p.RemoveServer(serverList[i-1]) + num = p.NumServers() + if num != i-1 { + t.Fatalf("%d: Expected %d servers", i, num-1) + } + } +} + +// func (p *RPCProxy) RebalanceServers() { +func TestRPCProxy_RebalanceServers(t *testing.T) { + const failPct = 0.5 + p := testRPCProxyFailProb(failPct) + const maxServers = 100 + const numShuffleTests = 100 + const uniquePassRate = 0.5 + + // Make a huge list of nodes. + for i := 0; i < maxServers; i++ { + p.AddPrimaryServer(makeServerEndpointName()) + } + + // Keep track of how many unique shuffles we get. + uniques := make(map[string]struct{}, maxServers) + for i := 0; i < numShuffleTests; i++ { + p.RebalanceServers() + + var names []string + for j := 0; j < maxServers; j++ { + server := p.FindServer() + p.NotifyFailedServer(server) + names = append(names, server.Name) + } + key := strings.Join(names, "|") + uniques[key] = struct{}{} + } + + // We have to allow for the fact that there won't always be a unique + // shuffle each pass, so we just look for smell here without the test + // being flaky. + if len(uniques) < int(maxServers*uniquePassRate) { + t.Fatalf("unique shuffle ratio too low: %d/%d", len(uniques), maxServers) + } +} + +// func (p *RPCProxy) RemoveServer(server *ServerEndpoint) { +func TestRPCProxy_RemoveServer(t *testing.T) { + p := testRPCProxy() + if p.NumServers() != 0 { + t.Fatalf("Expected zero servers to start") + } + + // Test removing server before its added + s1Endpoint := makeServerEndpointName() + s1 := p.AddPrimaryServer(s1Endpoint) + if p.NumServers() != 1 { + t.Fatalf("bad") + } + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server: %+q", s1.Name) + } + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 server: %+q", s1.Name) + } + p.RemoveServer(s1) + if p.NumServers() != 0 { + t.Fatalf("bad") + } + // Remove it a second time now that it doesn't exist + p.RemoveServer(s1) + if p.NumServers() != 0 { + t.Fatalf("bad") + } + p.AddPrimaryServer(s1Endpoint) + if p.NumServers() != 1 { + t.Fatalf("bad") + } + + s2Endpoint := makeServerEndpointName() + s2 := p.AddPrimaryServer(s2Endpoint) + if p.NumServers() != 2 { + t.Fatalf("bad") + } + if s2 == nil || s2.Name != s2Endpoint { + t.Fatalf("Expected s2 server: %+q", s2.Name) + } + s1 = p.FindServer() + if s1 == nil || s1.Name != s1Endpoint { + t.Fatalf("Expected s1 to be the front of the list: %+q==%+q", s1.Name, s1Endpoint) + } + // Move s1 to the back of the server list + p.NotifyFailedServer(s1) + s2 = p.FindServer() + if s2 == nil || s2.Name != s2Endpoint { + t.Fatalf("Expected s2 server: %+q", s2Endpoint) + } + p.RemoveServer(s2) + if p.NumServers() != 1 { + t.Fatalf("bad") + } + p.RemoveServer(s2) + if p.NumServers() != 1 { + t.Fatalf("bad") + } + p.AddPrimaryServer(s2Endpoint) + + const maxServers = 19 + servers := make([]*ServerEndpoint, 0, maxServers) + servers = append(servers, s1) + servers = append(servers, s2) + // Already added two servers above + for i := maxServers; i > 2; i-- { + server := p.AddPrimaryServer(makeServerEndpointName()) + servers = append(servers, server) + } + if p.NumServers() != maxServers { + t.Fatalf("Expected %d servers, received %d", maxServers, p.NumServers()) + } + + p.RebalanceServers() + + if p.NumServers() != maxServers { + t.Fatalf("Expected %d servers, received %d", maxServers, p.NumServers()) + } + + findServer := func(server *ServerEndpoint) bool { + for i := p.NumServers(); i > 0; i-- { + s := p.FindServer() + if s == server { + return true + } + } + return false + } + + expectedNumServers := maxServers + removedServers := make([]*ServerEndpoint, 0, maxServers) + + // Remove servers from the front of the list + for i := 3; i > 0; i-- { + server := p.FindServer() + if server == nil { + t.Fatalf("FindServer returned nil") + } + p.RemoveServer(server) + expectedNumServers-- + if p.NumServers() != expectedNumServers { + t.Fatalf("Expected %d servers (got %d)", expectedNumServers, p.NumServers()) + } + if findServer(server) == true { + t.Fatalf("Did not expect to find server %s after removal from the front", server.Name) + } + removedServers = append(removedServers, server) + } + + // Remove server from the end of the list + for i := 3; i > 0; i-- { + server := p.FindServer() + p.NotifyFailedServer(server) + p.RemoveServer(server) + expectedNumServers-- + if p.NumServers() != expectedNumServers { + t.Fatalf("Expected %d servers (got %d)", expectedNumServers, p.NumServers()) + } + if findServer(server) == true { + t.Fatalf("Did not expect to find server %s", server.Name) + } + removedServers = append(removedServers, server) + } + + // Remove server from the middle of the list + for i := 3; i > 0; i-- { + server := p.FindServer() + p.NotifyFailedServer(server) + server2 := p.FindServer() + p.NotifyFailedServer(server2) // server2 now at end of the list + + p.RemoveServer(server) + expectedNumServers-- + if p.NumServers() != expectedNumServers { + t.Fatalf("Expected %d servers (got %d)", expectedNumServers, p.NumServers()) + } + if findServer(server) == true { + t.Fatalf("Did not expect to find server %s", server.Name) + } + removedServers = append(removedServers, server) + } + + if p.NumServers()+len(removedServers) != maxServers { + t.Fatalf("Expected %d+%d=%d servers", p.NumServers(), len(removedServers), maxServers) + } + + // Drain the remaining servers from the middle + for i := p.NumServers(); i > 0; i-- { + server := p.FindServer() + p.NotifyFailedServer(server) + server2 := p.FindServer() + p.NotifyFailedServer(server2) // server2 now at end of the list + p.RemoveServer(server) + removedServers = append(removedServers, server) + } + + if p.NumServers() != 0 { + t.Fatalf("Expected an empty server list") + } + if len(removedServers) != maxServers { + t.Fatalf("Expected all servers to be in removed server list") + } +} + +// func (p *RPCProxy) Start() { + +// func (l *serverList) cycleServer() (servers []*Server) { +func TestRPCProxyInternal_cycleServer(t *testing.T) { + p := testRPCProxy() + l := p.getServerList() + + server0 := &ServerEndpoint{Name: "server1"} + server1 := &ServerEndpoint{Name: "server2"} + server2 := &ServerEndpoint{Name: "server3"} + l.L = append(l.L, server0, server1, server2) + p.saveServerList(l) + + l = p.getServerList() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server0 && + l.L[1] != server1 && + l.L[2] != server2 { + t.Fatalf("initial server ordering not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server1 && + l.L[1] != server2 && + l.L[2] != server0 { + t.Fatalf("server ordering after one cycle not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server2 && + l.L[1] != server0 && + l.L[2] != server1 { + t.Fatalf("server ordering after two cycles not correct") + } + + l.L = l.cycleServer() + if len(l.L) != 3 { + t.Fatalf("server length incorrect: %d/3", len(l.L)) + } + if l.L[0] != server0 && + l.L[1] != server1 && + l.L[2] != server2 { + t.Fatalf("server ordering after three cycles not correct") + } +} + +// func (p *RPCProxy) getServerList() serverList { +func TestRPCProxyInternal_getServerList(t *testing.T) { + p := testRPCProxy() + l := p.getServerList() + if l.L == nil { + t.Fatalf("serverList.servers nil") + } + + if len(l.L) != 0 { + t.Fatalf("serverList.servers length not zero") + } +} + +func TestRPCProxyInternal_New(t *testing.T) { + p := testRPCProxy() + if p == nil { + t.Fatalf("bad") + } + + if p.logger == nil { + t.Fatalf("bad") + } + + if p.shutdownCh == nil { + t.Fatalf("bad") + } +} + +// func (p *RPCProxy) reconcileServerList(l *serverList) bool { +func TestRPCProxyInternal_reconcileServerList(t *testing.T) { + tests := []int{0, 1, 2, 3, 4, 5, 10, 100} + for _, n := range tests { + ok, err := test_reconcileServerList(n) + if !ok { + t.Errorf("Expected %d to pass: %v", n, err) + } + } +} + +func test_reconcileServerList(maxServers int) (bool, error) { + // Build a server list, reconcile, verify the missing servers are + // missing, the added have been added, and the original server is + // present. + const failPct = 0.5 + p := testRPCProxyFailProb(failPct) + + var failedServers, healthyServers []*ServerEndpoint + for i := 0; i < maxServers; i++ { + nodeName := fmt.Sprintf("s%02d", i) + + node := &ServerEndpoint{Name: nodeName} + // Add 66% of servers to RPCProxy + if rand.Float64() > 0.33 { + p.activateEndpoint(node) + + // Of healthy servers, (ab)use connPoolPinger to + // failPct of the servers for the reconcile. This + // allows for the selected server to no longer be + // healthy for the reconcile below. + if ok, _ := p.connPoolPinger.PingNomadServer(p.configInfo.Region(), p.configInfo.RPCMajorVersion(), node); ok { + // Will still be present + healthyServers = append(healthyServers, node) + } else { + // Will be missing + failedServers = append(failedServers, node) + } + } else { + // Will be added from the call to reconcile + healthyServers = append(healthyServers, node) + } + } + + // Randomize RPCProxy's server list + p.RebalanceServers() + selectedServer := p.FindServer() + + var selectedServerFailed bool + for _, s := range failedServers { + if selectedServer.Key().Equal(s.Key()) { + selectedServerFailed = true + break + } + } + + // Update RPCProxy's server list to be "healthy" based on Serf. + // Reconcile this with origServers, which is shuffled and has a live + // connection, but possibly out of date. + origServers := p.getServerList() + p.saveServerList(serverList{L: healthyServers}) + + // This should always succeed with non-zero server lists + if !selectedServerFailed && !p.reconcileServerList(&origServers) && + len(p.getServerList().L) != 0 && + len(origServers.L) != 0 { + // If the random gods are unfavorable and we end up with zero + // length lists, expect things to fail and retry the test. + return false, fmt.Errorf("Expected reconcile to succeed: %v %d %d", + selectedServerFailed, + len(p.getServerList().L), + len(origServers.L)) + } + + // If we have zero-length server lists, test succeeded in degenerate + // case. + if len(p.getServerList().L) == 0 && + len(origServers.L) == 0 { + // Failed as expected w/ zero length list + return true, nil + } + + resultingServerMap := make(map[EndpointKey]bool) + for _, s := range p.getServerList().L { + resultingServerMap[*s.Key()] = true + } + + // Test to make sure no failed servers are in the RPCProxy's + // list. Error if there are any failedServers in l.servers + for _, s := range failedServers { + _, ok := resultingServerMap[*s.Key()] + if ok { + return false, fmt.Errorf("Found failed server %v in merged list %v", s, resultingServerMap) + } + } + + // Test to make sure all healthy servers are in the healthy list. + if len(healthyServers) != len(p.getServerList().L) { + return false, fmt.Errorf("Expected healthy map and servers to match: %d/%d", len(healthyServers), len(healthyServers)) + } + + // Test to make sure all healthy servers are in the resultingServerMap list. + for _, s := range healthyServers { + _, ok := resultingServerMap[*s.Key()] + if !ok { + return false, fmt.Errorf("Server %v missing from healthy map after merged lists", s) + } + } + return true, nil +} + +// func (l *serverList) refreshServerRebalanceTimer() { +func TestRPCProxyInternal_refreshServerRebalanceTimer(t *testing.T) { + type clusterSizes struct { + numNodes int + numServers int + minRebalance time.Duration + } + clusters := []clusterSizes{ + {0, 3, 10 * time.Minute}, + {1, 0, 10 * time.Minute}, // partitioned cluster + {1, 3, 10 * time.Minute}, + {2, 3, 10 * time.Minute}, + {100, 0, 10 * time.Minute}, // partitioned + {100, 1, 10 * time.Minute}, // partitioned + {100, 3, 10 * time.Minute}, + {1024, 1, 10 * time.Minute}, // partitioned + {1024, 3, 10 * time.Minute}, // partitioned + {1024, 5, 10 * time.Minute}, + {16384, 1, 10 * time.Minute}, // partitioned + {16384, 2, 10 * time.Minute}, // partitioned + {16384, 3, 10 * time.Minute}, // partitioned + {16384, 5, 10 * time.Minute}, + {65535, 0, 10 * time.Minute}, // partitioned + {65535, 1, 10 * time.Minute}, // partitioned + {65535, 2, 10 * time.Minute}, // partitioned + {65535, 3, 10 * time.Minute}, // partitioned + {65535, 5, 10 * time.Minute}, // partitioned + {65535, 7, 10 * time.Minute}, + {1000000, 1, 10 * time.Minute}, // partitioned + {1000000, 2, 10 * time.Minute}, // partitioned + {1000000, 3, 10 * time.Minute}, // partitioned + {1000000, 5, 10 * time.Minute}, // partitioned + {1000000, 11, 10 * time.Minute}, // partitioned + {1000000, 19, 10 * time.Minute}, + } + + logger := log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + + for i, s := range clusters { + p := NewRPCProxy(logger, shutdownCh, &fauxSerf{numNodes: s.numNodes}, &fauxConnPool{}) + for i := 0; i < s.numServers; i++ { + nodeName := fmt.Sprintf("s%02d", i) + p.activateEndpoint(&ServerEndpoint{Name: nodeName}) + } + + d := p.refreshServerRebalanceTimer() + if d < s.minRebalance { + t.Errorf("[%d] duration too short for cluster of size %d and %d servers (%s < %s)", i, s.numNodes, s.numServers, d, s.minRebalance) + } + } +} + +// func (p *RPCProxy) saveServerList(l serverList) { +func TestRPCProxyInternal_saveServerList(t *testing.T) { + p := testRPCProxy() + + // Initial condition + func() { + l := p.getServerList() + if len(l.L) != 0 { + t.Fatalf("RPCProxy.saveServerList failed to load init config") + } + + newServer := new(ServerEndpoint) + l.L = append(l.L, newServer) + p.saveServerList(l) + }() + + // Test that save works + func() { + l1 := p.getServerList() + t1NumServers := len(l1.L) + if t1NumServers != 1 { + t.Fatalf("RPCProxy.saveServerList failed to save mutated config") + } + }() + + // Verify mutation w/o a save doesn't alter the original + func() { + newServer := new(ServerEndpoint) + l := p.getServerList() + l.L = append(l.L, newServer) + + l_orig := p.getServerList() + origNumServers := len(l_orig.L) + if origNumServers >= len(l.L) { + t.Fatalf("RPCProxy.saveServerList unsaved config overwrote original") + } + }() +} diff --git a/client/rpcproxy/server_endpoint.go b/client/rpcproxy/server_endpoint.go new file mode 100644 index 00000000000..d9b1add5b6a --- /dev/null +++ b/client/rpcproxy/server_endpoint.go @@ -0,0 +1,84 @@ +package rpcproxy + +import ( + "fmt" + "net" + "strings" +) + +const ( + defaultNomadRPCPort = "4647" +) + +// EndpointKey is used in maps and for equality tests. A key is based on endpoints. +type EndpointKey struct { + name string +} + +// Equal compares two EndpointKey objects +func (k *EndpointKey) Equal(x *EndpointKey) bool { + return k.name == x.name +} + +// ServerEndpoint contains the address information for to connect to a Nomad +// server. +// +// TODO(sean@): Server is stubbed out so that in the future it can hold a +// reference to Node (and ultimately Node.ID). +type ServerEndpoint struct { + // Name is the unique lookup key for a Server instance + Name string + Host string + Port string + Addr net.Addr +} + +// Key returns the corresponding Key +func (s *ServerEndpoint) Key() *EndpointKey { + return &EndpointKey{ + name: s.Name, + } +} + +// NewServerEndpoint creates a new Server instance with a resolvable +// endpoint. `name` can be either an IP address or a DNS name. If `name` is +// a DNS name, it must be resolvable to an IP address (most inputs are IP +// addresses, not DNS names, but both work equally well when the name is +// resolvable). +func NewServerEndpoint(name string) (*ServerEndpoint, error) { + s := &ServerEndpoint{ + Name: name, + } + + var host, port string + var err error + host, port, err = net.SplitHostPort(name) + if err == nil { + s.Host = host + s.Port = port + } else { + if strings.Contains(err.Error(), "missing port") { + s.Host = name + s.Port = defaultNomadRPCPort + } else { + return nil, err + } + } + + if s.Addr, err = net.ResolveTCPAddr("tcp", net.JoinHostPort(s.Host, s.Port)); err != nil { + return nil, err + } + + return s, err +} + +// String returns a string representation of Server +func (s *ServerEndpoint) String() string { + var addrStr, networkStr string + if s.Addr != nil { + addrStr = s.Addr.String() + networkStr = s.Addr.Network() + } + + return fmt.Sprintf("%s (%s:%s)", s.Name, networkStr, addrStr) +} diff --git a/client/rpcproxy/server_endpoint_test.go b/client/rpcproxy/server_endpoint_test.go new file mode 100644 index 00000000000..f04494859d3 --- /dev/null +++ b/client/rpcproxy/server_endpoint_test.go @@ -0,0 +1,77 @@ +package rpcproxy + +import ( + "fmt" + "net" + "testing" +) + +// func (k *EndpointKey) Equal(x *EndpointKey) { +func TestServerEndpointKey_Equal(t *testing.T) { + tests := []struct { + name string + s1 *ServerEndpoint + s2 *ServerEndpoint + equal bool + }{ + { + name: "equal", + s1: &ServerEndpoint{Name: "k1"}, + s2: &ServerEndpoint{Name: "k1"}, + equal: true, + }, + { + name: "not equal", + s1: &ServerEndpoint{Name: "k1"}, + s2: &ServerEndpoint{Name: "k2"}, + equal: false, + }, + } + + for _, test := range tests { + if test.s1.Key().Equal(test.s2.Key()) != test.equal { + t.Errorf("fixture %s failed forward comparison", test.name) + } + + if test.s2.Key().Equal(test.s1.Key()) != test.equal { + t.Errorf("fixture %s failed reverse comparison", test.name) + } + } +} + +// func (k *ServerEndpoint) String() { +func TestServerEndpoint_String(t *testing.T) { + tests := []struct { + name string + s *ServerEndpoint + str string + }{ + { + name: "name", + s: &ServerEndpoint{Name: "s"}, + str: "s (:)", + }, + { + name: "name, host, port", + s: &ServerEndpoint{ + Name: "s", + Host: "127.0.0.1", + Port: "4647", + }, + str: "s (tcp:127.0.0.1:4647)", + }, + } + + for _, test := range tests { + if test.s.Addr == nil && (test.s.Host != "" && test.s.Port != "") { + fmt.Printf("Setting addr\n") + addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(test.s.Host, test.s.Port)) + if err == nil { + test.s.Addr = addr + } + } + if test.s.String() != test.str { + t.Errorf("fixture %q failed: %q vs %q", test.name, test.s.String(), test.str) + } + } +} diff --git a/client/task_runner_test.go b/client/task_runner_test.go index afcb31d0182..00472fb70be 100644 --- a/client/task_runner_test.go +++ b/client/task_runner_test.go @@ -11,6 +11,7 @@ import ( "time" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/driver" cstructs "github.com/hashicorp/nomad/client/driver/structs" "github.com/hashicorp/nomad/nomad/mock" @@ -46,7 +47,7 @@ func testTaskRunner(restarts bool) (*MockTaskStateUpdater, *TaskRunner) { // the passed allocation. func testTaskRunnerFromAlloc(restarts bool, alloc *structs.Allocation) (*MockTaskStateUpdater, *TaskRunner) { logger := testLogger() - conf := DefaultConfig() + conf := config.DefaultConfig() conf.StateDir = os.TempDir() conf.AllocDir = os.TempDir() upd := &MockTaskStateUpdater{} diff --git a/client/util.go b/client/util.go index a8afcd17152..b04f173f008 100644 --- a/client/util.go +++ b/client/util.go @@ -7,7 +7,6 @@ import ( "math/rand" "os" "path/filepath" - "time" "github.com/hashicorp/nomad/nomad/structs" ) @@ -69,11 +68,6 @@ func diffAllocs(existing []*structs.Allocation, allocs *allocUpdates) *diffResul return result } -// Returns a random stagger interval between 0 and the duration -func randomStagger(intv time.Duration) time.Duration { - return time.Duration(uint64(rand.Int63()) % uint64(intv)) -} - // shuffleStrings randomly shuffles the list of strings func shuffleStrings(list []string) { for i := range list { diff --git a/client/util_test.go b/client/util_test.go index 431a93f6ba5..c0a8633c329 100644 --- a/client/util_test.go +++ b/client/util_test.go @@ -6,7 +6,6 @@ import ( "path/filepath" "reflect" "testing" - "time" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -56,17 +55,6 @@ func TestDiffAllocs(t *testing.T) { } } -func TestRandomStagger(t *testing.T) { - t.Parallel() - intv := time.Minute - for i := 0; i < 10; i++ { - stagger := randomStagger(intv) - if stagger < 0 || stagger >= intv { - t.Fatalf("Bad: %v", stagger) - } - } -} - func TestShuffleStrings(t *testing.T) { t.Parallel() // Generate input diff --git a/command/agent/agent.go b/command/agent/agent.go index c7e623d8348..194e3848aa9 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -14,7 +14,7 @@ import ( "github.com/hashicorp/nomad/client" clientconfig "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/client/consul" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" ) @@ -29,13 +29,17 @@ type Agent struct { logger *log.Logger logOutput io.Writer - consulService *consul.ConsulService // consulService registers the Nomad agent with the consul agent - consulConfig *consul.ConsulConfig // consulConfig is the consul configuration the Nomad client uses to connect with Consul agent - serverHTTPAddr string + // consulSyncer registers the Nomad agent with the Consul Agent + consulSyncer *consul.Syncer + + client *client.Client clientHTTPAddr string + clientRPCAddr string - server *nomad.Server - client *client.Client + server *nomad.Server + serverHTTPAddr string + serverRPCAddr string + serverSerfAddr string shutdown bool shutdownCh chan struct{} @@ -49,17 +53,17 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { logOutput = os.Stderr } + shutdownCh := make(chan struct{}) a := &Agent{ config: config, logger: log.New(logOutput, "", log.LstdFlags), logOutput: logOutput, - shutdownCh: make(chan struct{}), + shutdownCh: shutdownCh, } - // creating the consul client configuration that both the server and client - // uses - a.createConsulConfig() - + if err := a.setupConsulSyncer(shutdownCh); err != nil { + return nil, fmt.Errorf("Failed to initialize Consul syncer task: %v", err) + } if err := a.setupServer(); err != nil { return nil, err } @@ -69,14 +73,22 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { if a.client == nil && a.server == nil { return nil, fmt.Errorf("must have at least client or server mode enabled") } - if a.config.ConsulConfig.AutoRegister { - if err := a.syncAgentServicesWithConsul(a.serverHTTPAddr, a.clientHTTPAddr); err != nil { - a.logger.Printf("[ERR] agent: unable to sync agent services with consul: %v", err) - } - if a.consulService != nil { - go a.consulService.PeriodicSync() - } + + // The Nomad Agent runs the consul.Syncer regardless of whether or + // not the Agent is running in Client or Server mode (or both), and + // regardless of the consul.auto_register parameter. The Client and + // Server both reuse the same consul.Syncer instance. This Syncer + // task periodically executes callbacks that update Consul. The + // reason the Syncer is always running is because one of the + // callbacks is attempts to self-bootstrap Nomad using information + // found in Consul. The Syncer's handlers automatically deactivate + // when the Consul Fingerprinter has detected the local Consul Agent + // is missing. + if err := a.consulSyncer.SyncServices(); err != nil { + a.logger.Printf("[WARN] agent.consul: Initial sync of Consul failed: %v", err) } + go a.consulSyncer.Run() + return a, nil } @@ -160,6 +172,7 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { conf.SerfConfig.MemberlistConfig.BindPort = port } + // Resolve the Server's HTTP Address if a.config.AdvertiseAddrs.HTTP != "" { a.serverHTTPAddr = a.config.AdvertiseAddrs.HTTP } else if a.config.Addresses.HTTP != "" { @@ -169,6 +182,43 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { } else { a.serverHTTPAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) } + addr, err := net.ResolveTCPAddr("tcp", a.serverHTTPAddr) + if err != nil { + return nil, fmt.Errorf("error resolving HTTP addr %+q: %v", a.serverHTTPAddr, err) + } + a.serverHTTPAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + + // Resolve the Server's RPC Address + if a.config.AdvertiseAddrs.RPC != "" { + a.serverRPCAddr = a.config.AdvertiseAddrs.RPC + } else if a.config.Addresses.RPC != "" { + a.serverRPCAddr = fmt.Sprintf("%v:%v", a.config.Addresses.RPC, a.config.Ports.RPC) + } else if a.config.BindAddr != "" { + a.serverRPCAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.RPC) + } else { + a.serverRPCAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.RPC) + } + addr, err = net.ResolveTCPAddr("tcp", a.serverRPCAddr) + if err != nil { + return nil, fmt.Errorf("error resolving RPC addr %+q: %v", a.serverRPCAddr, err) + } + a.serverRPCAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + + // Resolve the Server's Serf Address + if a.config.AdvertiseAddrs.Serf != "" { + a.serverSerfAddr = a.config.AdvertiseAddrs.Serf + } else if a.config.Addresses.Serf != "" { + a.serverSerfAddr = fmt.Sprintf("%v:%v", a.config.Addresses.Serf, a.config.Ports.Serf) + } else if a.config.BindAddr != "" { + a.serverSerfAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.Serf) + } else { + a.serverSerfAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.Serf) + } + addr, err = net.ResolveTCPAddr("tcp", a.serverSerfAddr) + if err != nil { + return nil, fmt.Errorf("error resolving Serf addr %+q: %v", a.serverSerfAddr, err) + } + a.serverSerfAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) if gcThreshold := a.config.Server.NodeGCThreshold; gcThreshold != "" { dur, err := time.ParseDuration(gcThreshold) @@ -190,12 +240,12 @@ func (a *Agent) serverConfig() (*nomad.Config, error) { } // clientConfig is used to generate a new client configuration struct -// for initializing a nomad client. +// for initializing a Nomad client. func (a *Agent) clientConfig() (*clientconfig.Config, error) { // Setup the configuration conf := a.config.ClientConfig if conf == nil { - conf = client.DefaultConfig() + conf = clientconfig.DefaultConfig() } if a.server != nil { conf.RPCHandler = a.server @@ -240,23 +290,40 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { conf.Node.Meta = a.config.Client.Meta conf.Node.NodeClass = a.config.Client.NodeClass - // Setting the proper HTTP Addr - httpAddr := fmt.Sprintf("%s:%d", a.config.BindAddr, a.config.Ports.HTTP) - if a.config.Addresses.HTTP != "" && a.config.AdvertiseAddrs.HTTP == "" { - httpAddr = fmt.Sprintf("%s:%d", a.config.Addresses.HTTP, a.config.Ports.HTTP) - if _, err := net.ResolveTCPAddr("tcp", httpAddr); err != nil { - return nil, fmt.Errorf("error resolving http addr: %v:", err) - } - } else if a.config.AdvertiseAddrs.HTTP != "" { - addr, err := net.ResolveTCPAddr("tcp", a.config.AdvertiseAddrs.HTTP) - if err != nil { - return nil, fmt.Errorf("error resolving advertise http addr: %v", err) - } - httpAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + // Resolve the Client's HTTP address + if a.config.AdvertiseAddrs.HTTP != "" { + a.clientHTTPAddr = a.config.AdvertiseAddrs.HTTP + } else if a.config.Addresses.HTTP != "" { + a.clientHTTPAddr = fmt.Sprintf("%v:%v", a.config.Addresses.HTTP, a.config.Ports.HTTP) + } else if a.config.BindAddr != "" { + a.clientHTTPAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.HTTP) + } else { + a.clientHTTPAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.HTTP) } + addr, err := net.ResolveTCPAddr("tcp", a.clientHTTPAddr) + if err != nil { + return nil, fmt.Errorf("error resolving HTTP addr %+q: %v", a.clientHTTPAddr, err) + } + httpAddr := fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) conf.Node.HTTPAddr = httpAddr a.clientHTTPAddr = httpAddr + // Resolve the Client's RPC address + if a.config.AdvertiseAddrs.RPC != "" { + a.clientRPCAddr = a.config.AdvertiseAddrs.RPC + } else if a.config.Addresses.RPC != "" { + a.clientRPCAddr = fmt.Sprintf("%v:%v", a.config.Addresses.RPC, a.config.Ports.RPC) + } else if a.config.BindAddr != "" { + a.clientRPCAddr = fmt.Sprintf("%v:%v", a.config.BindAddr, a.config.Ports.RPC) + } else { + a.clientRPCAddr = fmt.Sprintf("%v:%v", "127.0.0.1", a.config.Ports.RPC) + } + addr, err = net.ResolveTCPAddr("tcp", a.clientRPCAddr) + if err != nil { + return nil, fmt.Errorf("error resolving RPC addr %+q: %v", a.clientRPCAddr, err) + } + a.clientRPCAddr = fmt.Sprintf("%s:%d", addr.IP.String(), addr.Port) + // Reserve resources on the node. r := conf.Node.Reserved if r == nil { @@ -272,7 +339,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) { conf.Version = fmt.Sprintf("%s%s", a.config.Version, a.config.VersionPrerelease) conf.Revision = a.config.Revision - conf.ConsulConfig = a.consulConfig + conf.ConsulConfig = a.config.Consul conf.StatsDataPoints = a.config.Client.StatsConfig.DataPoints conf.StatsCollectionInterval = a.config.Client.StatsConfig.collectionInterval @@ -297,8 +364,30 @@ func (a *Agent) setupServer() error { if err != nil { return fmt.Errorf("server setup failed: %v", err) } - a.server = server + + // Create the Nomad Server services for Consul + if a.config.Consul.AutoRegister && a.config.Consul.ServerServiceName != "" { + const serviceGroupName = "server" + a.consulSyncer.SetServices(serviceGroupName, []*structs.ConsulService{ + &structs.ConsulService{ + Name: a.config.Consul.ServerServiceName, + PortLabel: a.serverHTTPAddr, + Tags: []string{consul.ServiceTagHTTP}, + }, + &structs.ConsulService{ + Name: a.config.Consul.ServerServiceName, + PortLabel: a.serverRPCAddr, + Tags: []string{consul.ServiceTagRPC}, + }, + &structs.ConsulService{ + PortLabel: a.serverSerfAddr, + Name: a.config.Consul.ServerServiceName, + Tags: []string{consul.ServiceTagSerf}, + }, + }) + } + return nil } @@ -322,15 +411,33 @@ func (a *Agent) setupClient() error { } // Create the client - client, err := client.NewClient(conf) + client, err := client.NewClient(conf, a.consulSyncer) if err != nil { return fmt.Errorf("client setup failed: %v", err) } a.client = client + + // Create the Nomad Server services for Consul + if a.config.Consul.AutoRegister && a.config.Consul.ClientServiceName != "" { + const serviceGroupName = "client" + a.consulSyncer.SetServices(serviceGroupName, []*structs.ConsulService{ + &structs.ConsulService{ + Name: a.config.Consul.ClientServiceName, + PortLabel: a.clientHTTPAddr, + Tags: []string{consul.ServiceTagHTTP}, + }, + &structs.ConsulService{ + Name: a.config.Consul.ClientServiceName, + PortLabel: a.clientRPCAddr, + Tags: []string{consul.ServiceTagRPC}, + }, + }) + } + return nil } -// reservePortsForClient reservers a range of ports for the client to use when +// reservePortsForClient reserves a range of ports for the client to use when // it creates various plugins for log collection, executors, drivers, etc func (a *Agent) reservePortsForClient(conf *clientconfig.Config) error { // finding the device name for loopback @@ -438,10 +545,8 @@ func (a *Agent) Shutdown() error { } } - if a.consulService != nil { - if err := a.consulService.Shutdown(); err != nil { - a.logger.Printf("[ERR] agent: shutting down consul service failed: %v", err) - } + if err := a.consulSyncer.Shutdown(); err != nil { + a.logger.Printf("[ERR] agent: shutting down consul service failed: %v", err) } a.logger.Println("[INFO] agent: shutdown complete") @@ -487,65 +592,47 @@ func (a *Agent) Stats() map[string]map[string]string { return stats } -func (a *Agent) createConsulConfig() { - cfg := &consul.ConsulConfig{ - Addr: a.config.ConsulConfig.Addr, - Token: a.config.ConsulConfig.Token, - Auth: a.config.ConsulConfig.Auth, - EnableSSL: a.config.ConsulConfig.EnableSSL, - VerifySSL: a.config.ConsulConfig.VerifySSL, - CAFile: a.config.ConsulConfig.CAFile, - CertFile: a.config.ConsulConfig.CertFile, - KeyFile: a.config.ConsulConfig.KeyFile, - } - a.consulConfig = cfg -} - -// syncAgentServicesWithConsul syncs the client and server services with Consul -func (a *Agent) syncAgentServicesWithConsul(clientHttpAddr string, serverHttpAddr string) error { - cs, err := consul.NewConsulService(a.consulConfig, a.logger) +// setupConsulSyncer creates the Consul tasks used by this Nomad Agent +// (either Client or Server mode). +func (a *Agent) setupConsulSyncer(shutdownCh chan struct{}) error { + var err error + a.consulSyncer, err = consul.NewSyncer(a.config.Consul, shutdownCh, a.logger) if err != nil { return err } - a.consulService = cs - var services []*structs.Service - if a.client != nil && a.config.ConsulConfig.ClientServiceName != "" { - if err != nil { - return err - } - clientService := &structs.Service{ - Name: a.config.ConsulConfig.ClientServiceName, - PortLabel: clientHttpAddr, - } - services = append(services, clientService) - cs.SetServiceIdentifier("agent-client") - } - if a.server != nil && a.config.ConsulConfig.ServerServiceName != "" { - serverService := &structs.Service{ - Name: a.config.ConsulConfig.ServerServiceName, - PortLabel: serverHttpAddr, - } - services = append(services, serverService) - cs.SetServiceIdentifier("agent-server") - } + a.consulSyncer.SetServiceRegPrefix("agent") - cs.SetAddrFinder(func(portLabel string) (string, int) { + a.consulSyncer.SetAddrFinder(func(portLabel string) (string, int) { host, port, err := net.SplitHostPort(portLabel) if err != nil { - return "", 0 + p, err := strconv.Atoi(port) + if err != nil { + return "", 0 + } + return "", p } - // if the addr for the service is ":port", then we default to - // registering the service with ip as the loopback addr + // If the addr for the service is ":port", then we fall back + // to Nomad's default address resolution protocol. + // + // TODO(sean@): This should poll Consul to figure out what + // its advertise address is and use that in order to handle + // the case where there is something funky like NAT on this + // host. For now we just use the BindAddr if set, otherwise + // we fall back to a loopback addr. if host == "" { - host = "127.0.0.1" + if a.config.BindAddr != "" { + host = a.config.BindAddr + } else { + host = "127.0.0.1" + } } p, err := strconv.Atoi(port) if err != nil { - return "", 0 + return host, 0 } return host, p }) - return cs.SyncServices(services) + return nil } diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index b2ca095991d..aaf466373b0 100644 --- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -119,6 +119,9 @@ func (s *HTTPServer) AgentForceLeaveRequest(resp http.ResponseWriter, req *http. return nil, err } +// AgentServersRequest is used to query the list of servers used by the Nomad +// Client for RPCs. This endpoint can also be used to update the list of +// servers for a given agent. func (s *HTTPServer) AgentServersRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) { switch req.Method { case "PUT", "POST": @@ -136,8 +139,8 @@ func (s *HTTPServer) listServers(resp http.ResponseWriter, req *http.Request) (i return nil, CodedError(501, ErrInvalidMethod) } - // Get the current list of servers - return client.Servers(), nil + peers := s.agent.client.RPCProxy().ServerRPCAddrs() + return peers, nil } func (s *HTTPServer) updateServers(resp http.ResponseWriter, req *http.Request) (interface{}, error) { @@ -153,7 +156,13 @@ func (s *HTTPServer) updateServers(resp http.ResponseWriter, req *http.Request) } // Set the servers list into the client - client.SetServers(servers) + for _, server := range servers { + s.agent.logger.Printf("[TRACE] Adding server %s to the client's primary server list", server) + se := client.AddPrimaryServerToRPCProxy(server) + if se == nil { + s.agent.logger.Printf("[ERR] Attempt to add server %q to client failed", server) + } + } return nil, nil } diff --git a/command/agent/agent_endpoint_test.go b/command/agent/agent_endpoint_test.go index 8ab104b119f..03f7b2a7904 100644 --- a/command/agent/agent_endpoint_test.go +++ b/command/agent/agent_endpoint_test.go @@ -107,21 +107,35 @@ func TestHTTP_AgentForceLeave(t *testing.T) { func TestHTTP_AgentSetServers(t *testing.T) { httpTest(t, nil, func(s *TestServer) { + // Establish a baseline number of servers + req, err := http.NewRequest("GET", "/v1/agent/servers", nil) + if err != nil { + t.Fatalf("err: %s", err) + } + respW := httptest.NewRecorder() + + // Make the request and check the result + out, err := s.Server.AgentServersRequest(respW, req) + if err != nil { + t.Fatalf("err: %s", err) + } + numServers := len(out.([]string)) + // Create the request - req, err := http.NewRequest("PUT", "/v1/agent/servers", nil) + req, err = http.NewRequest("PUT", "/v1/agent/servers", nil) if err != nil { t.Fatalf("err: %s", err) } // Send the request - respW := httptest.NewRecorder() + respW = httptest.NewRecorder() _, err = s.Server.AgentServersRequest(respW, req) if err == nil || !strings.Contains(err.Error(), "missing server address") { t.Fatalf("expected missing servers error, got: %#v", err) } // Create a valid request - req, err = http.NewRequest("PUT", "/v1/agent/servers?address=foo&address=bar", nil) + req, err = http.NewRequest("PUT", "/v1/agent/servers?address=127.0.0.1%3A4647&address=127.0.0.2%3A4647", nil) if err != nil { t.Fatalf("err: %s", err) } @@ -141,16 +155,31 @@ func TestHTTP_AgentSetServers(t *testing.T) { respW = httptest.NewRecorder() // Make the request and check the result - out, err := s.Server.AgentServersRequest(respW, req) + expected := map[string]bool{ + "127.0.0.1:4647": true, + "127.0.0.2:4647": true, + } + out, err = s.Server.AgentServersRequest(respW, req) if err != nil { t.Fatalf("err: %s", err) } servers := out.([]string) - if n := len(servers); n != 2 { - t.Fatalf("expected 2 servers, got: %d", n) - } - if servers[0] != "foo:4647" || servers[1] != "bar:4647" { - t.Fatalf("bad servers result: %v", servers) + if n := len(servers); n != numServers+2 { + t.Fatalf("expected %d servers, got: %d: %v", numServers+2, n, servers) + } + received := make(map[string]bool, len(servers)) + for _, server := range servers { + received[server] = true + } + foundCount := 0 + for k, _ := range received { + _, found := expected[k] + if found { + foundCount++ + } + } + if foundCount != len(expected) { + t.Fatalf("bad servers result") } }) } diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 23662ef4477..aae664f2114 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -10,6 +10,7 @@ import ( "time" "github.com/hashicorp/nomad/nomad" + cconfig "github.com/hashicorp/nomad/nomad/structs/config" ) var nextPort uint32 = 17000 @@ -42,7 +43,7 @@ func makeAgent(t testing.TB, cb func(*Config)) (string, *Agent) { Serf: getPort(), } conf.NodeName = fmt.Sprintf("Node %d", conf.Ports.RPC) - conf.ConsulConfig = &ConsulConfig{} + conf.Consul = &cconfig.ConsulConfig{} // Tighten the Serf timing config.SerfConfig.MemberlistConfig.SuspicionMult = 2 @@ -121,6 +122,9 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := a.serverHTTPAddr; addr != "10.10.11.1:4005" { t.Fatalf("expect 10.11.11.1:4005, got: %v", addr) } + if addr := a.serverRPCAddr; addr != "127.0.0.1:4001" { + t.Fatalf("expect 127.0.0.1:4001, got: %v", addr) + } // Sets up the ports properly conf.Ports.RPC = 4003 @@ -137,7 +141,7 @@ func TestAgent_ServerConfig(t *testing.T) { t.Fatalf("expect 4004, got: %d", port) } - // Prefers the most specific bind addrs + // Prefers advertise over bind addr conf.BindAddr = "127.0.0.3" conf.Addresses.RPC = "127.0.0.2" conf.Addresses.Serf = "127.0.0.2" @@ -155,7 +159,14 @@ func TestAgent_ServerConfig(t *testing.T) { t.Fatalf("expect 127.0.0.2, got: %s", addr) } if addr := a.serverHTTPAddr; addr != "127.0.0.2:4646" { - t.Fatalf("expect 127.0.0.3:4646, got: %s", addr) + t.Fatalf("expect 127.0.0.2:4646, got: %s", addr) + } + // NOTE: AdvertiseAddr > Addresses > BindAddr > Defaults + if addr := a.serverRPCAddr; addr != "127.0.0.1:4001" { + t.Fatalf("expect 127.0.0.1:4001, got: %s", addr) + } + if addr := a.serverSerfAddr; addr != "127.0.0.1:4000" { + t.Fatalf("expect 127.0.0.1:4000, got: %s", addr) } conf.Server.NodeGCThreshold = "42g" @@ -184,6 +195,12 @@ func TestAgent_ServerConfig(t *testing.T) { conf.Addresses.RPC = "" conf.Addresses.Serf = "" conf.Addresses.HTTP = "" + conf.AdvertiseAddrs.RPC = "" + conf.AdvertiseAddrs.HTTP = "" + conf.AdvertiseAddrs.Serf = "" + conf.Ports.HTTP = 4646 + conf.Ports.RPC = 4647 + conf.Ports.Serf = 4648 out, err = a.serverConfig() if err != nil { t.Fatalf("err: %s", err) @@ -197,6 +214,12 @@ func TestAgent_ServerConfig(t *testing.T) { if addr := a.serverHTTPAddr; addr != "127.0.0.3:4646" { t.Fatalf("expect 127.0.0.3:4646, got: %s", addr) } + if addr := a.serverRPCAddr; addr != "127.0.0.3:4647" { + t.Fatalf("expect 127.0.0.3:4647, got: %s", addr) + } + if addr := a.serverSerfAddr; addr != "127.0.0.3:4648" { + t.Fatalf("expect 127.0.0.3:4648, got: %s", addr) + } // Properly handles the bootstrap flags conf.Server.BootstrapExpect = 1 diff --git a/command/agent/command.go b/command/agent/command.go index 5c8371b79a9..42b06569ebd 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -16,11 +16,13 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/hashicorp/consul/lib" "github.com/hashicorp/go-checkpoint" "github.com/hashicorp/go-syslog" "github.com/hashicorp/logutils" "github.com/hashicorp/nomad/helper/flag-slice" "github.com/hashicorp/nomad/helper/gated-writer" + "github.com/hashicorp/nomad/nomad/structs/config" "github.com/hashicorp/scada-client/scada" "github.com/mitchellh/cli" ) @@ -58,11 +60,11 @@ func (c *Command) readConfig() *Config { // Make a new, empty config. cmdConfig := &Config{ - Atlas: &AtlasConfig{}, - ConsulConfig: &ConsulConfig{}, - Client: &ClientConfig{}, - Ports: &Ports{}, - Server: &ServerConfig{}, + Atlas: &AtlasConfig{}, + Consul: &config.ConsulConfig{}, + Client: &ClientConfig{}, + Ports: &Ports{}, + Server: &ServerConfig{}, } flags := flag.NewFlagSet("agent", flag.ContinueOnError) @@ -334,7 +336,7 @@ func (c *Command) setupAgent(config *Config, logOutput io.Writer) error { // Do an immediate check within the next 30 seconds go func() { - time.Sleep(randomStagger(30 * time.Second)) + time.Sleep(lib.RandomStagger(30 * time.Second)) c.checkpointResults(checkpoint.Check(updateParams)) }() } diff --git a/command/agent/config-test-fixtures/basic.hcl b/command/agent/config-test-fixtures/basic.hcl index 38c22ec7814..f5f1380e0f0 100644 --- a/command/agent/config-test-fixtures/basic.hcl +++ b/command/agent/config-test-fixtures/basic.hcl @@ -86,9 +86,9 @@ http_api_response_headers { Access-Control-Allow-Origin = "*" } consul { - server_service_name = "nomad-server" + server_service_name = "nomad" client_service_name = "nomad-client" - addr = "127.0.0.1:9500" + address = "127.0.0.1:9500" token = "token1" auth = "username:pass" ssl = true diff --git a/command/agent/config.go b/command/agent/config.go index be0cd0db26b..091e37684d4 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -14,6 +14,7 @@ import ( client "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/nomad" + "github.com/hashicorp/nomad/nomad/structs/config" ) // Config is the configuration for the Nomad agent. @@ -82,9 +83,10 @@ type Config struct { // AtlasConfig is used to configure Atlas Atlas *AtlasConfig `mapstructure:"atlas"` - // ConsulConfig is used to configure Consul clients and register the nomad - // server and client services with Consul - ConsulConfig *ConsulConfig `mapstructure:"consul"` + // Consul contains the configuration for the Consul Agent and + // parameters necessary to register services, their checks, and + // discover the current Nomad servers. + Consul *config.ConsulConfig `mapstructure:"consul"` // NomadConfig is used to override the default config. // This is largly used for testing purposes. @@ -128,57 +130,6 @@ type AtlasConfig struct { Endpoint string `mapstructure:"endpoint"` } -// ConsulConfig is used to configure Consul clients and register the nomad -// server and client services with Consul -type ConsulConfig struct { - - // ServerServiceName is the name of the service that Nomad uses to register - // servers with Consul - ServerServiceName string `mapstructure:"server_service_name"` - - // ClientServiceName is the name of the service that Nomad uses to register - // clients with Consul - ClientServiceName string `mapstructure:"client_service_name"` - - // AutoRegister determines if Nomad will register the Nomad client and - // server agents with Consul - AutoRegister bool `mapstructure:"auto_register"` - - // Addr is the address of the local Consul agent - Addr string `mapstructure:"addr"` - - // Token is used to provide a per-request ACL token.This options overrides - // the agent's default token - Token string `mapstructure:"token"` - - // Auth is the information to use for http access to Consul agent - Auth string `mapstructure:"auth"` - - // EnableSSL sets the transport scheme to talk to the Consul agent as https - EnableSSL bool `mapstructure:"ssl"` - - // VerifySSL enables or disables SSL verification when the transport scheme - // for the consul api client is https - VerifySSL bool `mapstructure:"verify_ssl"` - - // CAFile is the path to the ca certificate used for Consul communication - CAFile string `mapstructure:"ca_file"` - - // CertFile is the path to the certificate for Consul communication - CertFile string `mapstructure:"cert_file"` - - // KeyFile is the path to the private key for Consul communication - KeyFile string `mapstructure:"key_file"` - - // ServerAutoJoin enables Nomad servers to find peers by querying Consul and - // joining them - ServerAutoJoin bool `mapstructure:"server_auto_join"` - - // ClientAutoJoin enables Nomad servers to find addresses of Nomad servers - // and register with them - ClientAutoJoin bool `mapstructure:"client_auto_join"` -} - // StatsConfig determines behavior of resource usage stats collections type StatsConfig struct { @@ -411,7 +362,7 @@ func DevConfig() *Config { conf.DevMode = true conf.EnableDebug = true conf.DisableAnonymousSignature = true - conf.ConsulConfig.AutoRegister = true + conf.Consul.AutoRegister = true if runtime.GOOS == "darwin" { conf.Client.NetworkInterface = "lo0" } else if runtime.GOOS == "linux" { @@ -439,9 +390,11 @@ func DefaultConfig() *Config { Addresses: &Addresses{}, AdvertiseAddrs: &AdvertiseAddrs{}, Atlas: &AtlasConfig{}, - ConsulConfig: &ConsulConfig{ - ServerServiceName: "nomad-server", + Consul: &config.ConsulConfig{ + ServerServiceName: "nomad", ClientServiceName: "nomad-client", + AutoRegister: true, + Timeout: 5 * time.Second, }, Client: &ClientConfig{ Enabled: false, @@ -593,11 +546,11 @@ func (c *Config) Merge(b *Config) *Config { } // Apply the Consul Configuration - if result.ConsulConfig == nil && b.ConsulConfig != nil { - consulConfig := *b.ConsulConfig - result.ConsulConfig = &consulConfig - } else if b.ConsulConfig != nil { - result.ConsulConfig = result.ConsulConfig.Merge(b.ConsulConfig) + if result.Consul == nil && b.Consul != nil { + consulConfig := *b.Consul + result.Consul = &consulConfig + } else if b.Consul != nil { + result.Consul = result.Consul.Merge(b.Consul) } // Merge config files lists @@ -809,52 +762,6 @@ func (a *AtlasConfig) Merge(b *AtlasConfig) *AtlasConfig { return &result } -// Merge merges two Consul Configurations together. -func (a *ConsulConfig) Merge(b *ConsulConfig) *ConsulConfig { - result := *a - - if b.ServerServiceName != "" { - result.ServerServiceName = b.ServerServiceName - } - if b.ClientServiceName != "" { - result.ClientServiceName = b.ClientServiceName - } - if b.AutoRegister { - result.AutoRegister = true - } - if b.Addr != "" { - result.Addr = b.Addr - } - if b.Token != "" { - result.Token = b.Token - } - if b.Auth != "" { - result.Auth = b.Auth - } - if b.EnableSSL { - result.EnableSSL = true - } - if b.VerifySSL { - result.VerifySSL = true - } - if b.CAFile != "" { - result.CAFile = b.CAFile - } - if b.CertFile != "" { - result.CertFile = b.CertFile - } - if b.KeyFile != "" { - result.KeyFile = b.KeyFile - } - if b.ServerAutoJoin { - result.ServerAutoJoin = true - } - if b.ClientAutoJoin { - result.ClientAutoJoin = true - } - return &result -} - func (r *Resources) Merge(b *Resources) *Resources { result := *r if b.CPU != 0 { diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 4ecbab43117..873752c0161 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -10,6 +10,7 @@ import ( "github.com/hashicorp/go-multierror" "github.com/hashicorp/hcl" "github.com/hashicorp/hcl/hcl/ast" + "github.com/hashicorp/nomad/nomad/structs/config" "github.com/mitchellh/mapstructure" ) @@ -169,7 +170,7 @@ func parseConfig(result *Config, list *ast.ObjectList) error { // Parse the consul config if o := list.Filter("consul"); len(o.Items) > 0 { - if err := parseConsulConfig(&result.ConsulConfig, o); err != nil { + if err := parseConsulConfig(&result.Consul, o); err != nil { return multierror.Prefix(err, "consul ->") } } @@ -586,7 +587,7 @@ func parseAtlas(result **AtlasConfig, list *ast.ObjectList) error { return nil } -func parseConsulConfig(result **ConsulConfig, list *ast.ObjectList) error { +func parseConsulConfig(result **config.ConsulConfig, list *ast.ObjectList) error { list = list.Elem() if len(list.Items) > 1 { return fmt.Errorf("only one 'consul' block allowed") @@ -597,19 +598,20 @@ func parseConsulConfig(result **ConsulConfig, list *ast.ObjectList) error { // Check for invalid keys valid := []string{ - "server_service_name", - "client_service_name", - "auto_register", - "addr", - "token", + "address", "auth", - "ssl", - "verify_ssl", + "auto_register", "ca_file", "cert_file", - "key_file", "client_auto_join", + "client_service_name", + "key_file", "server_auto_join", + "server_service_name", + "ssl", + "timeout", + "token", + "verify_ssl", } if err := checkHCLKeys(listVal, valid); err != nil { @@ -621,8 +623,16 @@ func parseConsulConfig(result **ConsulConfig, list *ast.ObjectList) error { return err } - var consulConfig ConsulConfig - if err := mapstructure.WeakDecode(m, &consulConfig); err != nil { + var consulConfig config.ConsulConfig + dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ + DecodeHook: mapstructure.StringToTimeDurationHookFunc(), + WeaklyTypedInput: true, + Result: &consulConfig, + }) + if err != nil { + return err + } + if err := dec.Decode(m); err != nil { return err } diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 6012ba88126..2e74e50eed9 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -4,6 +4,8 @@ import ( "path/filepath" "reflect" "testing" + + "github.com/hashicorp/nomad/nomad/structs/config" ) func TestConfig_Parse(t *testing.T) { @@ -100,8 +102,8 @@ func TestConfig_Parse(t *testing.T) { Join: true, Endpoint: "127.0.0.1:1234", }, - ConsulConfig: &ConsulConfig{ - ServerServiceName: "nomad-server", + Consul: &config.ConsulConfig{ + ServerServiceName: "nomad", ClientServiceName: "nomad-client", Addr: "127.0.0.1:9500", Token: "token1", diff --git a/client/consul/check.go b/command/agent/consul/check.go similarity index 87% rename from client/consul/check.go rename to command/agent/consul/check.go index 052c5c78c20..28df291f67f 100644 --- a/client/consul/check.go +++ b/command/agent/consul/check.go @@ -2,10 +2,10 @@ package consul import ( "log" - "math/rand" "sync" "time" + "github.com/hashicorp/consul/lib" cstructs "github.com/hashicorp/nomad/client/driver/structs" ) @@ -60,7 +60,7 @@ func (r *CheckRunner) Stop() { // run is invoked by a goroutine to run until Stop() is called func (r *CheckRunner) run() { // Get the randomized initial pause time - initialPauseTime := randomStagger(r.check.Interval()) + initialPauseTime := lib.RandomStagger(r.check.Interval()) r.logger.Printf("[DEBUG] agent: pausing %v before first invocation of %s", initialPauseTime, r.check.ID()) next := time.NewTimer(initialPauseTime) for { @@ -82,8 +82,3 @@ type Check interface { Interval() time.Duration Timeout() time.Duration } - -// Returns a random stagger interval between 0 and the duration -func randomStagger(intv time.Duration) time.Duration { - return time.Duration(uint64(rand.Int63()) % uint64(intv)) -} diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go new file mode 100644 index 00000000000..fe7cc8181e6 --- /dev/null +++ b/command/agent/consul/syncer.go @@ -0,0 +1,915 @@ +package consul + +import ( + "crypto/tls" + "fmt" + "log" + "net/http" + "net/url" + "strings" + "sync" + "time" + + consul "github.com/hashicorp/consul/api" + "github.com/hashicorp/consul/lib" + "github.com/hashicorp/go-multierror" + + cconfig "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/structs/config" + "github.com/hashicorp/nomad/nomad/types" +) + +const ( + // initialSyncBuffer is the max time an initial sync will sleep + // before syncing. + initialSyncBuffer = 30 * time.Second + + // initialSyncDelay is the delay before an initial sync. + initialSyncDelay = 5 * time.Second + + // nomadServicePrefix is the prefix used when registering a service + // with consul + nomadServicePrefix = "nomad" + + // The periodic time interval for syncing services and checks with Consul + syncInterval = 5 * time.Second + + // syncJitter provides a little variance in the frequency at which + // Syncer polls Consul. + syncJitter = 8 + + // ttlCheckBuffer is the time interval that Nomad can take to report Consul + // the check result + ttlCheckBuffer = 31 * time.Second + + // DefaultQueryWaitDuration is the max duration the Consul Agent will + // spend waiting for a response from a Consul Query. + DefaultQueryWaitDuration = 2 * time.Second + + // ServiceTagHTTP is the tag assigned to HTTP services + ServiceTagHTTP = "http" + + // ServiceTagRPC is the tag assigned to RPC services + ServiceTagRPC = "rpc" + + // ServiceTagSerf is the tag assigned to Serf services + ServiceTagSerf = "serf" +) + +// Syncer allows syncing of services and checks with Consul +type Syncer struct { + client *consul.Client + consulAvailable bool + + // servicesGroups is a named group of services that will be flattened + // and reconciled with Consul when SyncServices() is called. The key + // to the servicesGroups map is unique per handler and is used to + // allow the Agent's services to be maintained independently of the + // Client or Server's services. + servicesGroups map[string][]*consul.AgentServiceRegistration + servicesGroupsLock sync.RWMutex + + // The "Consul Registry" is a collection of Consul Services and + // Checks all guarded by the registryLock. + registryLock sync.RWMutex + + checkRunners map[string]*CheckRunner + delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul + trackedChecks map[string]*consul.AgentCheckRegistration + trackedServices map[string]*consul.AgentServiceRegistration + + // serviceRegPrefix is used to namespace the domain of registered + // Consul Services and Checks belonging to a single Syncer. A given + // Nomad Agent may spawn multiple Syncer tasks between the Agent + // Agent and its Executors, all syncing to a single Consul Agent. + // The serviceRegPrefix allows multiple Syncers to coexist without + // each Syncer clobbering each others Services. The Syncer namespace + // protocol is fmt.Sprintf("nomad-%s-%s", serviceRegPrefix, miscID). + // serviceRegPrefix is guarded by the registryLock. + serviceRegPrefix string + + addrFinder func(portLabel string) (string, int) + createDelegatedCheck func(*structs.ServiceCheck, string) (Check, error) + // End registryLock guarded attributes. + + logger *log.Logger + + shutdownCh chan struct{} + shutdown bool + shutdownLock sync.Mutex + + // notifyShutdownCh is used to notify a Syncer it needs to shutdown. + // This can happen because there was an explicit call to the Syncer's + // Shutdown() method, or because the calling task signaled the + // program is going to exit by closing its shutdownCh. + notifyShutdownCh chan struct{} + + // periodicCallbacks is walked sequentially when the timer in Run + // fires. + periodicCallbacks map[string]types.PeriodicCallback + notifySyncCh chan struct{} + periodicLock sync.RWMutex +} + +// NewSyncer returns a new consul.Syncer +func NewSyncer(consulConfig *config.ConsulConfig, shutdownCh chan struct{}, logger *log.Logger) (*Syncer, error) { + var err error + var c *consul.Client + + cfg := consul.DefaultConfig() + + // If a nil config was provided, fall back to the default config + if consulConfig == nil { + consulConfig = cconfig.DefaultConfig().ConsulConfig + } + + if consulConfig.Addr != "" { + cfg.Address = consulConfig.Addr + } + if consulConfig.Token != "" { + cfg.Token = consulConfig.Token + } + if consulConfig.Auth != "" { + var username, password string + if strings.Contains(consulConfig.Auth, ":") { + split := strings.SplitN(consulConfig.Auth, ":", 2) + username = split[0] + password = split[1] + } else { + username = consulConfig.Auth + } + + cfg.HttpAuth = &consul.HttpBasicAuth{ + Username: username, + Password: password, + } + } + if consulConfig.EnableSSL { + cfg.Scheme = "https" + tlsCfg := consul.TLSConfig{ + Address: cfg.Address, + CAFile: consulConfig.CAFile, + CertFile: consulConfig.CertFile, + KeyFile: consulConfig.KeyFile, + InsecureSkipVerify: !consulConfig.VerifySSL, + } + tlsClientCfg, err := consul.SetupTLSConfig(&tlsCfg) + if err != nil { + return nil, fmt.Errorf("error creating tls client config for consul: %v", err) + } + cfg.HttpClient.Transport = &http.Transport{ + TLSClientConfig: tlsClientCfg, + } + } + if consulConfig.EnableSSL && !consulConfig.VerifySSL { + cfg.HttpClient.Transport = &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, + }, + } + } + if c, err = consul.NewClient(cfg); err != nil { + return nil, err + } + consulSyncer := Syncer{ + client: c, + logger: logger, + consulAvailable: true, + shutdownCh: shutdownCh, + servicesGroups: make(map[string][]*consul.AgentServiceRegistration), + trackedServices: make(map[string]*consul.AgentServiceRegistration), + trackedChecks: make(map[string]*consul.AgentCheckRegistration), + checkRunners: make(map[string]*CheckRunner), + periodicCallbacks: make(map[string]types.PeriodicCallback), + } + return &consulSyncer, nil +} + +// SetDelegatedChecks sets the checks that nomad is going to run and report the +// result back to consul +func (c *Syncer) SetDelegatedChecks(delegateChecks map[string]struct{}, createDelegatedCheckFn func(*structs.ServiceCheck, string) (Check, error)) *Syncer { + c.delegateChecks = delegateChecks + c.createDelegatedCheck = createDelegatedCheckFn + return c +} + +// SetAddrFinder sets a function to find the host and port for a Service given its port label +func (c *Syncer) SetAddrFinder(addrFinder func(string) (string, int)) *Syncer { + c.addrFinder = addrFinder + return c +} + +// SetServiceRegPrefix sets the registration prefix used by the Syncer when +// registering Services with Consul. +func (c *Syncer) SetServiceRegPrefix(servicePrefix string) *Syncer { + c.registryLock.Lock() + defer c.registryLock.Unlock() + c.serviceRegPrefix = servicePrefix + return c +} + +// filterPrefix generates a unique prefix that a Syncer can later filter on. +func (c *Syncer) filterPrefix() string { + c.registryLock.RLock() + defer c.registryLock.RUnlock() + return fmt.Sprintf("%s-%s", nomadServicePrefix, c.serviceRegPrefix) +} + +// GenerateServiceID creates a unique Consul ServiceID for a given +// ConsulService. +func (c *Syncer) GenerateServiceID(groupName string, service *structs.ConsulService) string { + numTags := len(service.Tags) + switch numTags { + case 0: + return fmt.Sprintf("%s-%s:%s", c.filterPrefix(), groupName, service.Name) + case 1: + return fmt.Sprintf("%s-%s:%s@%s", c.filterPrefix(), groupName, service.Tags[0], service.Name) + default: + tags := strings.Join(service.Tags, "|") + return fmt.Sprintf("%s-%s:(%s)@%s", c.filterPrefix(), groupName, tags, service.Name) + } +} + +// SetServices assigns the slice of Nomad Services to the provided services +// group name. +func (c *Syncer) SetServices(groupName string, services []*structs.ConsulService) error { + var mErr multierror.Error + registeredServices := make([]*consul.AgentServiceRegistration, 0, len(services)) + for _, service := range services { + if service.ServiceID == "" { + service.ServiceID = c.GenerateServiceID(groupName, service) + } + var serviceReg *consul.AgentServiceRegistration + var err error + if serviceReg, err = c.createService(service); err != nil { + mErr.Errors = append(mErr.Errors, err) + continue + } + registeredServices = append(registeredServices, serviceReg) + + // Register the check(s) for this service + for _, chk := range service.Checks { + // Create a Consul check registration + chkReg, err := c.createDelegatedCheckReg(chk, serviceReg) + if err != nil { + mErr.Errors = append(mErr.Errors, err) + continue + } + // creating a nomad check if we have to handle this particular check type + c.registryLock.RLock() + if _, ok := c.delegateChecks[chk.Type]; ok { + _, ok := c.checkRunners[chkReg.ID] + c.registryLock.RUnlock() + if ok { + continue + } + nc, err := c.createDelegatedCheck(chk, chkReg.ID) + if err != nil { + mErr.Errors = append(mErr.Errors, err) + continue + } + cr := NewCheckRunner(nc, c.runCheck, c.logger) + c.registryLock.Lock() + c.checkRunners[nc.ID()] = cr + c.registryLock.Unlock() + } else { + c.registryLock.RUnlock() + } + } + } + + if len(mErr.Errors) > 0 { + return mErr.ErrorOrNil() + } + + c.servicesGroupsLock.Lock() + c.servicesGroups[groupName] = registeredServices + c.servicesGroupsLock.Unlock() + + return nil +} + +// SyncNow expires the current timer forcing the list of periodic callbacks +// to be synced immediately. +func (c *Syncer) SyncNow() { + select { + case c.notifySyncCh <- struct{}{}: + default: + } +} + +// flattenedServices returns a flattened list of services +func (c *Syncer) flattenedServices() []*consul.AgentServiceRegistration { + const initialNumServices = 8 + services := make([]*consul.AgentServiceRegistration, 0, initialNumServices) + c.servicesGroupsLock.RLock() + for _, servicesGroup := range c.servicesGroups { + for _, service := range servicesGroup { + services = append(services, service) + } + } + c.servicesGroupsLock.RUnlock() + + return services +} + +func (c *Syncer) signalShutdown() { + select { + case c.notifyShutdownCh <- struct{}{}: + default: + } +} + +// Shutdown de-registers the services and checks and shuts down periodic syncing +func (c *Syncer) Shutdown() error { + var mErr multierror.Error + + c.shutdownLock.Lock() + if !c.shutdown { + c.shutdown = true + } + c.shutdownLock.Unlock() + + c.signalShutdown() + + // Stop all the checks that nomad is running + c.registryLock.RLock() + for _, cr := range c.checkRunners { + cr.Stop() + } + c.registryLock.RUnlock() + + // De-register all the services from Consul + services, err := c.queryAgentServices() + if err != nil { + mErr.Errors = append(mErr.Errors, err) + } + for _, service := range services { + if err := c.client.Agent().ServiceDeregister(service.ID); err != nil { + c.logger.Printf("[WARN] consul.syncer: failed to deregister service ID %+q: %v", service.ID, err) + mErr.Errors = append(mErr.Errors, err) + } + } + return mErr.ErrorOrNil() +} + +// queryChecks queries the Consul Agent for a list of Consul checks that +// have been registered with this Consul Syncer. +func (c *Syncer) queryChecks() (map[string]*consul.AgentCheck, error) { + checks, err := c.client.Agent().Checks() + if err != nil { + return nil, err + } + return c.filterConsulChecks(checks), nil +} + +// queryAgentServices queries the Consul Agent for a list of Consul services that +// have been registered with this Consul Syncer. +func (c *Syncer) queryAgentServices() (map[string]*consul.AgentService, error) { + services, err := c.client.Agent().Services() + if err != nil { + return nil, err + } + return c.filterConsulServices(services), nil +} + +// syncChecks synchronizes this Syncer's Consul Checks with the Consul Agent. +func (c *Syncer) syncChecks() error { + var mErr multierror.Error + consulChecks, err := c.queryChecks() + if err != nil { + return err + } + + // Synchronize checks with Consul + missingChecks, _, changedChecks, staleChecks := c.calcChecksDiff(consulChecks) + for _, check := range missingChecks { + if err := c.registerCheck(check); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + c.registryLock.Lock() + c.trackedChecks[check.ID] = check + c.registryLock.Unlock() + } + for _, check := range changedChecks { + // NOTE(sean@): Do we need to deregister the check before + // re-registering it? Not deregistering to avoid missing the + // TTL but doesn't correct reconcile any possible drift with + // the check. + // + // if err := c.deregisterCheck(check.ID); err != nil { + // mErr.Errors = append(mErr.Errors, err) + // } + if err := c.registerCheck(check); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + } + for _, check := range staleChecks { + if err := c.deregisterCheck(check.ID); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + c.registryLock.Lock() + delete(c.trackedChecks, check.ID) + c.registryLock.Unlock() + } + return mErr.ErrorOrNil() +} + +// compareConsulCheck takes a consul.AgentCheckRegistration instance and +// compares it with a consul.AgentCheck. Returns true if they are equal +// according to consul.AgentCheck, otherwise false. +func compareConsulCheck(localCheck *consul.AgentCheckRegistration, consulCheck *consul.AgentCheck) bool { + if consulCheck.CheckID != localCheck.ID || + consulCheck.Name != localCheck.Name || + consulCheck.Notes != localCheck.Notes || + consulCheck.ServiceID != localCheck.ServiceID { + return false + } + return true +} + +// calcChecksDiff takes the argument (consulChecks) and calculates the delta +// between the consul.Syncer's list of known checks (c.trackedChecks). Three +// arrays are returned: +// +// 1) a slice of checks that exist only locally in the Syncer and are missing +// from the Consul Agent (consulChecks) and therefore need to be registered. +// +// 2) a slice of checks that exist in both the local consul.Syncer's +// tracked list and Consul Agent (consulChecks). +// +// 3) a slice of checks that exist in both the local consul.Syncer's +// tracked list and Consul Agent (consulServices) but have diverged state. +// +// 4) a slice of checks that exist only in the Consul Agent (consulChecks) +// and should be removed because the Consul Agent has drifted from the +// Syncer. +func (c *Syncer) calcChecksDiff(consulChecks map[string]*consul.AgentCheck) (missingChecks []*consul.AgentCheckRegistration, equalChecks []*consul.AgentCheckRegistration, changedChecks []*consul.AgentCheckRegistration, staleChecks []*consul.AgentCheckRegistration) { + type mergedCheck struct { + check *consul.AgentCheckRegistration + // 'l' == Nomad local only + // 'e' == equal + // 'c' == changed + // 'a' == Consul agent only + state byte + } + var ( + localChecksCount = 0 + equalChecksCount = 0 + changedChecksCount = 0 + agentChecks = 0 + ) + c.registryLock.RLock() + localChecks := make(map[string]*mergedCheck, len(c.trackedChecks)+len(consulChecks)) + for _, localCheck := range c.trackedChecks { + localChecksCount++ + localChecks[localCheck.ID] = &mergedCheck{localCheck, 'l'} + } + c.registryLock.RUnlock() + for _, consulCheck := range consulChecks { + if localCheck, found := localChecks[consulCheck.CheckID]; found { + localChecksCount-- + if compareConsulCheck(localCheck.check, consulCheck) { + equalChecksCount++ + localChecks[consulCheck.CheckID].state = 'e' + } else { + changedChecksCount++ + localChecks[consulCheck.CheckID].state = 'c' + } + } else { + agentChecks++ + agentCheckReg := &consul.AgentCheckRegistration{ + ID: consulCheck.CheckID, + Name: consulCheck.Name, + Notes: consulCheck.Notes, + ServiceID: consulCheck.ServiceID, + } + localChecks[consulCheck.CheckID] = &mergedCheck{agentCheckReg, 'a'} + } + } + + missingChecks = make([]*consul.AgentCheckRegistration, 0, localChecksCount) + equalChecks = make([]*consul.AgentCheckRegistration, 0, equalChecksCount) + changedChecks = make([]*consul.AgentCheckRegistration, 0, changedChecksCount) + staleChecks = make([]*consul.AgentCheckRegistration, 0, agentChecks) + for _, check := range localChecks { + switch check.state { + case 'l': + missingChecks = append(missingChecks, check.check) + case 'e': + equalChecks = append(equalChecks, check.check) + case 'c': + changedChecks = append(changedChecks, check.check) + case 'a': + staleChecks = append(staleChecks, check.check) + } + } + + return missingChecks, equalChecks, changedChecks, staleChecks +} + +// compareConsulService takes a consul.AgentServiceRegistration instance and +// compares it with a consul.AgentService. Returns true if they are equal +// according to consul.AgentService, otherwise false. +func compareConsulService(localService *consul.AgentServiceRegistration, consulService *consul.AgentService) bool { + if consulService.ID != localService.ID || + consulService.Service != localService.Name || + consulService.Port != localService.Port || + consulService.Address != localService.Address || + consulService.Address != localService.Address || + consulService.EnableTagOverride != localService.EnableTagOverride { + return false + } + + serviceTags := make(map[string]byte, len(localService.Tags)) + for _, tag := range localService.Tags { + serviceTags[tag] = 'l' + } + for _, tag := range consulService.Tags { + if _, found := serviceTags[tag]; !found { + return false + } + serviceTags[tag] = 'b' + } + for _, state := range serviceTags { + if state == 'l' { + return false + } + } + + return true +} + +// calcServicesDiff takes the argument (consulServices) and calculates the +// delta between the consul.Syncer's list of known services +// (c.trackedServices). Three arrays are returned: +// +// 1) a slice of services that exist only locally in the Syncer and are +// missing from the Consul Agent (consulServices) and therefore need to be +// registered. +// +// 2) a slice of services that exist in both the local consul.Syncer's +// tracked list and Consul Agent (consulServices) *AND* are identical. +// +// 3) a slice of services that exist in both the local consul.Syncer's +// tracked list and Consul Agent (consulServices) but have diverged state. +// +// 4) a slice of services that exist only in the Consul Agent +// (consulServices) and should be removed because the Consul Agent has +// drifted from the Syncer. +func (c *Syncer) calcServicesDiff(consulServices map[string]*consul.AgentService) (missingServices []*consul.AgentServiceRegistration, equalServices []*consul.AgentServiceRegistration, changedServices []*consul.AgentServiceRegistration, staleServices []*consul.AgentServiceRegistration) { + type mergedService struct { + service *consul.AgentServiceRegistration + // 'l' == Nomad local only + // 'e' == equal + // 'c' == changed + // 'a' == Consul agent only + state byte + } + var ( + localServicesCount = 0 + equalServicesCount = 0 + changedServicesCount = 0 + agentServices = 0 + ) + c.registryLock.RLock() + localServices := make(map[string]*mergedService, len(c.trackedServices)+len(consulServices)) + c.registryLock.RUnlock() + for _, localService := range c.flattenedServices() { + localServicesCount++ + localServices[localService.ID] = &mergedService{localService, 'l'} + } + for _, consulService := range consulServices { + if localService, found := localServices[consulService.ID]; found { + localServicesCount-- + if compareConsulService(localService.service, consulService) { + equalServicesCount++ + localServices[consulService.ID].state = 'e' + } else { + changedServicesCount++ + localServices[consulService.ID].state = 'c' + } + } else { + agentServices++ + agentServiceReg := &consul.AgentServiceRegistration{ + ID: consulService.ID, + Name: consulService.Service, + Tags: consulService.Tags, + Port: consulService.Port, + Address: consulService.Address, + } + localServices[consulService.ID] = &mergedService{agentServiceReg, 'a'} + } + } + + missingServices = make([]*consul.AgentServiceRegistration, 0, localServicesCount) + equalServices = make([]*consul.AgentServiceRegistration, 0, equalServicesCount) + changedServices = make([]*consul.AgentServiceRegistration, 0, changedServicesCount) + staleServices = make([]*consul.AgentServiceRegistration, 0, agentServices) + for _, service := range localServices { + switch service.state { + case 'l': + missingServices = append(missingServices, service.service) + case 'e': + equalServices = append(equalServices, service.service) + case 'c': + changedServices = append(changedServices, service.service) + case 'a': + staleServices = append(staleServices, service.service) + } + } + + return missingServices, equalServices, changedServices, staleServices +} + +// syncServices synchronizes this Syncer's Consul Services with the Consul +// Agent. +func (c *Syncer) syncServices() error { + consulServices, err := c.queryAgentServices() + if err != nil { + return err + } + + // Synchronize services with Consul + var mErr multierror.Error + missingServices, _, changedServices, removedServices := c.calcServicesDiff(consulServices) + for _, service := range missingServices { + if err := c.client.Agent().ServiceRegister(service); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + c.registryLock.Lock() + c.trackedServices[service.ID] = service + c.registryLock.Unlock() + } + for _, service := range changedServices { + // Re-register the local service + if err := c.client.Agent().ServiceRegister(service); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + } + for _, service := range removedServices { + if err := c.deregisterService(service.ID); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + c.registryLock.Lock() + delete(c.trackedServices, service.ID) + c.registryLock.Unlock() + } + return mErr.ErrorOrNil() +} + +// registerCheck registers a check definition with Consul +func (c *Syncer) registerCheck(chkReg *consul.AgentCheckRegistration) error { + c.registryLock.RLock() + if cr, ok := c.checkRunners[chkReg.ID]; ok { + cr.Start() + } + c.registryLock.RUnlock() + return c.client.Agent().CheckRegister(chkReg) +} + +// createDelegatedCheckReg creates a Check that can be registered with +// Nomad. It also creates a Nomad check for the check types that it can +// handle. +func (c *Syncer) createDelegatedCheckReg(check *structs.ServiceCheck, service *consul.AgentServiceRegistration) (*consul.AgentCheckRegistration, error) { + chkReg := consul.AgentCheckRegistration{ + ID: check.Hash(service.ID), + Name: check.Name, + ServiceID: service.ID, + } + chkReg.Timeout = check.Timeout.String() + chkReg.Interval = check.Interval.String() + switch check.Type { + case structs.ServiceCheckHTTP: + if check.Protocol == "" { + check.Protocol = "http" + } + url := url.URL{ + Scheme: check.Protocol, + Host: fmt.Sprintf("%s:%d", service.Address, service.Port), + Path: check.Path, + } + chkReg.HTTP = url.String() + case structs.ServiceCheckTCP: + chkReg.TCP = fmt.Sprintf("%s:%d", service.Address, service.Port) + case structs.ServiceCheckScript: + chkReg.TTL = (check.Interval + ttlCheckBuffer).String() + default: + return nil, fmt.Errorf("check type %+q not valid", check.Type) + } + return &chkReg, nil +} + +// createService creates a Consul AgentService from a Nomad ConsulService. +func (c *Syncer) createService(service *structs.ConsulService) (*consul.AgentServiceRegistration, error) { + c.registryLock.RLock() + defer c.registryLock.RUnlock() + + srv := consul.AgentServiceRegistration{ + ID: service.ServiceID, + Name: service.Name, + Tags: service.Tags, + } + host, port := c.addrFinder(service.PortLabel) + if host != "" { + srv.Address = host + } + + if port != 0 { + srv.Port = port + } + + return &srv, nil +} + +// deregisterService de-registers a service with the given ID from consul +func (c *Syncer) deregisterService(serviceID string) error { + return c.client.Agent().ServiceDeregister(serviceID) +} + +// deregisterCheck de-registers a check from Consul +func (c *Syncer) deregisterCheck(checkID string) error { + c.registryLock.Lock() + defer c.registryLock.Unlock() + + // Deleting from Consul Agent + if err := c.client.Agent().CheckDeregister(checkID); err != nil { + // CheckDeregister() will be reattempted again in a future + // sync. + return err + } + + // Remove the check from the local registry + if cr, ok := c.checkRunners[checkID]; ok { + cr.Stop() + delete(c.checkRunners, checkID) + } + + return nil +} + +// Run triggers periodic syncing of services and checks with Consul. This is +// a long lived go-routine which is stopped during shutdown. +func (c *Syncer) Run() { + d := initialSyncDelay + lib.RandomStagger(initialSyncBuffer-initialSyncDelay) + sync := time.NewTimer(d) + c.logger.Printf("[DEBUG] consul.syncer: sleeping %v before first sync", d) + + for { + select { + case <-sync.C: + d = syncInterval - lib.RandomStagger(syncInterval/syncJitter) + sync.Reset(d) + + if err := c.SyncServices(); err != nil { + if c.consulAvailable { + c.logger.Printf("[DEBUG] consul.syncer: disabling checks until successful sync for %+q: %v", c.serviceRegPrefix, err) + } + c.consulAvailable = false + } else { + if !c.consulAvailable { + c.logger.Printf("[DEBUG] consul.syncer: re-enabling checks for for %+q", c.serviceRegPrefix) + } + c.consulAvailable = true + } + case <-c.notifySyncCh: + sync.Reset(syncInterval) + case <-c.shutdownCh: + c.Shutdown() + case <-c.notifyShutdownCh: + sync.Stop() + c.logger.Printf("[INFO] consul.syncer: shutting down sync for %+q", c.serviceRegPrefix) + return + } + } +} + +// RunHandlers executes each handler (randomly) +func (c *Syncer) RunHandlers() error { + c.periodicLock.RLock() + handlers := make(map[string]types.PeriodicCallback, len(c.periodicCallbacks)) + for name, fn := range c.periodicCallbacks { + handlers[name] = fn + } + c.periodicLock.RUnlock() + + var mErr multierror.Error + for _, fn := range handlers { + if err := fn(); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + } + return mErr.ErrorOrNil() +} + +// SyncServices sync the services with the Consul Agent +func (c *Syncer) SyncServices() error { + if err := c.RunHandlers(); err != nil { + return err + } + if err := c.syncServices(); err != nil { + return err + } + if err := c.syncChecks(); err != nil { + return err + } + + return nil +} + +// filterConsulServices prunes out all the service whose ids are not prefixed +// with nomad- +func (c *Syncer) filterConsulServices(consulServices map[string]*consul.AgentService) map[string]*consul.AgentService { + localServices := make(map[string]*consul.AgentService, len(consulServices)) + c.registryLock.RLock() + defer c.registryLock.RUnlock() + filterPrefix := c.filterPrefix() + for serviceID, service := range consulServices { + if strings.HasPrefix(service.ID, filterPrefix) { + localServices[serviceID] = service + } + } + return localServices +} + +// filterConsulChecks prunes out all the consul checks which do not have +// services with Syncer's idPrefix. +func (c *Syncer) filterConsulChecks(consulChecks map[string]*consul.AgentCheck) map[string]*consul.AgentCheck { + localChecks := make(map[string]*consul.AgentCheck, len(consulChecks)) + filterPrefix := c.filterPrefix() + for checkID, check := range consulChecks { + if strings.HasPrefix(check.ServiceID, filterPrefix) { + localChecks[checkID] = check + } + } + return localChecks +} + +// consulPresent indicates whether the Consul Agent is responding +func (c *Syncer) consulPresent() bool { + _, err := c.client.Agent().Self() + return err == nil +} + +// runCheck runs a check and updates the corresponding ttl check in consul +func (c *Syncer) runCheck(check Check) { + res := check.Run() + if res.Duration >= check.Timeout() { + c.logger.Printf("[DEBUG] consul.syncer: check took time: %v, timeout: %v", res.Duration, check.Timeout()) + } + state := consul.HealthCritical + output := res.Output + switch res.ExitCode { + case 0: + state = consul.HealthPassing + case 1: + state = consul.HealthWarning + default: + state = consul.HealthCritical + } + if res.Err != nil { + state = consul.HealthCritical + output = res.Err.Error() + } + if err := c.client.Agent().UpdateTTL(check.ID(), output, state); err != nil { + if c.consulAvailable { + c.logger.Printf("[DEBUG] consul.syncer: check %+q failed, disabling Consul checks until until next successful sync: %v", check.ID(), err) + c.consulAvailable = false + } else { + c.consulAvailable = true + } + } +} + +// AddPeriodicHandler adds a uniquely named callback. Returns true if +// successful, false if a handler with the same name already exists. +func (c *Syncer) AddPeriodicHandler(name string, fn types.PeriodicCallback) bool { + c.periodicLock.Lock() + defer c.periodicLock.Unlock() + if _, found := c.periodicCallbacks[name]; found { + c.logger.Printf("[ERROR] consul.syncer: failed adding handler %+q", name) + return false + } + c.periodicCallbacks[name] = fn + return true +} + +// NumHandlers returns the number of callbacks registered with the syncer +func (c *Syncer) NumHandlers() int { + c.periodicLock.RLock() + defer c.periodicLock.RUnlock() + return len(c.periodicCallbacks) +} + +// RemovePeriodicHandler removes a handler with a given name. +func (c *Syncer) RemovePeriodicHandler(name string) { + c.periodicLock.Lock() + defer c.periodicLock.Unlock() + delete(c.periodicCallbacks, name) +} + +// ConsulClient returns the Consul client used by the Syncer. +func (c *Syncer) ConsulClient() *consul.Client { + return c.client +} diff --git a/command/agent/consul/syncer_test.go b/command/agent/consul/syncer_test.go new file mode 100644 index 00000000000..b9827e3a174 --- /dev/null +++ b/command/agent/consul/syncer_test.go @@ -0,0 +1,183 @@ +package consul + +import ( + "fmt" + "log" + "os" + "reflect" + "testing" + "time" + + "github.com/hashicorp/go-multierror" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/structs/config" +) + +const ( + allocID = "12" + serviceRegPrefix = "test" + serviceGroupName = "executor" +) + +var ( + logger = log.New(os.Stdout, "", log.LstdFlags) + check1 = structs.ServiceCheck{ + Name: "check-foo-1", + Type: structs.ServiceCheckTCP, + Interval: 30 * time.Second, + Timeout: 5 * time.Second, + } + service1 = structs.ConsulService{ + Name: "foo-1", + Tags: []string{"tag1", "tag2"}, + PortLabel: "port1", + Checks: []*structs.ServiceCheck{ + &check1, + }, + } + + service2 = structs.ConsulService{ + Name: "foo-2", + Tags: []string{"tag1", "tag2"}, + PortLabel: "port2", + } +) + +func TestConsulServiceRegisterServices(t *testing.T) { + shutdownCh := make(chan struct{}) + cs, err := NewSyncer(&config.ConsulConfig{}, shutdownCh, logger) + if err != nil { + t.Fatalf("Err: %v", err) + } + // Skipping the test if consul isn't present + if !cs.consulPresent() { + return + } + task := mockTask() + cs.SetServiceRegPrefix(serviceRegPrefix) + cs.SetAddrFinder(task.FindHostAndPortFor) + if err := cs.SyncServices(); err != nil { + t.Fatalf("err: %v", err) + } + defer cs.Shutdown() + + service1 := &structs.ConsulService{Name: task.Name} + service2 := &structs.ConsulService{Name: task.Name} + services := []*structs.ConsulService{service1, service2} + service1.ServiceID = fmt.Sprintf("%s-%s:%s/%s", cs.GenerateServiceID(serviceGroupName, service1), task.Name, allocID) + service2.ServiceID = fmt.Sprintf("%s-%s:%s/%s", cs.GenerateServiceID(serviceGroupName, service2), task.Name, allocID) + + cs.SetServices(serviceGroupName, services) + if err := servicesPresent(t, services, cs); err != nil { + t.Fatalf("err : %v", err) + } + // FIXME(sean@) + // if err := checksPresent(t, []string{check1.Hash(service1ID)}, cs); err != nil { + // t.Fatalf("err : %v", err) + // } +} + +func TestConsulServiceUpdateService(t *testing.T) { + shutdownCh := make(chan struct{}) + cs, err := NewSyncer(&config.ConsulConfig{}, shutdownCh, logger) + if err != nil { + t.Fatalf("Err: %v", err) + } + // Skipping the test if consul isn't present + if !cs.consulPresent() { + return + } + + task := mockTask() + cs.SetServiceRegPrefix(serviceRegPrefix) + cs.SetAddrFinder(task.FindHostAndPortFor) + if err := cs.SyncServices(); err != nil { + t.Fatalf("err: %v", err) + } + defer cs.Shutdown() + + //Update Service defn 1 + newTags := []string{"tag3"} + task.ConsulServices[0].Tags = newTags + if err := cs.SyncServices(); err != nil { + t.Fatalf("err: %v", err) + } + // Make sure all the services and checks are still present + service1 := &structs.ConsulService{Name: task.Name} + service2 := &structs.ConsulService{Name: task.Name} + services := []*structs.ConsulService{service1, service2} + service1.ServiceID = fmt.Sprintf("%s-%s:%s/%s", cs.GenerateServiceID(serviceGroupName, service1), task.Name, allocID) + service2.ServiceID = fmt.Sprintf("%s-%s:%s/%s", cs.GenerateServiceID(serviceGroupName, service2), task.Name, allocID) + if err := servicesPresent(t, services, cs); err != nil { + t.Fatalf("err : %v", err) + } + // FIXME(sean@) + // if err := checksPresent(t, []string{check1.Hash(service1ID)}, cs); err != nil { + // t.Fatalf("err : %v", err) + // } + + // check if service defn 1 has been updated + consulServices, err := cs.client.Agent().Services() + if err != nil { + t.Fatalf("errL: %v", err) + } + srv, _ := consulServices[service1.ServiceID] + if !reflect.DeepEqual(srv.Tags, newTags) { + t.Fatalf("expected tags: %v, actual: %v", newTags, srv.Tags) + } +} + +func servicesPresent(t *testing.T, configuredServices []*structs.ConsulService, syncer *Syncer) error { + var mErr multierror.Error + services, err := syncer.client.Agent().Services() + if err != nil { + t.Fatalf("err: %v", err) + } + + for _, configuredService := range configuredServices { + if _, ok := services[configuredService.ServiceID]; !ok { + mErr.Errors = append(mErr.Errors, fmt.Errorf("service ID %q not synced", configuredService.ServiceID)) + } + } + return mErr.ErrorOrNil() +} + +func checksPresent(t *testing.T, checkIDs []string, syncer *Syncer) error { + var mErr multierror.Error + checks, err := syncer.client.Agent().Checks() + if err != nil { + t.Fatalf("err: %v", err) + } + + for _, checkID := range checkIDs { + if _, ok := checks[checkID]; !ok { + mErr.Errors = append(mErr.Errors, fmt.Errorf("check ID %q not synced", checkID)) + } + } + return mErr.ErrorOrNil() +} + +func mockTask() *structs.Task { + task := structs.Task{ + Name: "foo", + ConsulServices: []*structs.ConsulService{&service1, &service2}, + Resources: &structs.Resources{ + Networks: []*structs.NetworkResource{ + &structs.NetworkResource{ + IP: "10.10.11.5", + DynamicPorts: []structs.Port{ + structs.Port{ + Label: "port1", + Value: 20002, + }, + structs.Port{ + Label: "port2", + Value: 20003, + }, + }, + }, + }, + }, + } + return &task +} diff --git a/command/agent/util.go b/command/agent/util.go index 2fa4993aeaa..c74fe645cf6 100644 --- a/command/agent/util.go +++ b/command/agent/util.go @@ -2,16 +2,9 @@ package agent import ( "fmt" - "math/rand" "net" - "time" ) -// Returns a random stagger interval between 0 and the duration -func randomStagger(intv time.Duration) time.Duration { - return time.Duration(uint64(rand.Int63()) % uint64(intv)) -} - // IpOfDevice returns a routable ip addr of a device func ipOfDevice(name string) (net.IP, error) { intf, err := net.InterfaceByName(name) diff --git a/command/agent/util_test.go b/command/agent/util_test.go deleted file mode 100644 index e31943a2037..00000000000 --- a/command/agent/util_test.go +++ /dev/null @@ -1,16 +0,0 @@ -package agent - -import ( - "testing" - "time" -) - -func TestRandomStagger(t *testing.T) { - intv := time.Minute - for i := 0; i < 10; i++ { - stagger := randomStagger(intv) - if stagger < 0 || stagger >= intv { - t.Fatalf("Bad: %v", stagger) - } - } -} diff --git a/command/client_config_test.go b/command/client_config_test.go index ab3370db97e..e1e96e001a4 100644 --- a/command/client_config_test.go +++ b/command/client_config_test.go @@ -33,7 +33,7 @@ func TestClientConfigCommand_UpdateServers(t *testing.T) { ui.ErrorWriter.Reset() // Set the servers list - code = cmd.Run([]string{"-address=" + url, "-update-servers", "foo", "bar"}) + code = cmd.Run([]string{"-address=" + url, "-update-servers", "127.0.0.42", "198.18.5.5"}) if code != 0 { t.Fatalf("expected exit 0, got: %d", code) } @@ -44,11 +44,11 @@ func TestClientConfigCommand_UpdateServers(t *testing.T) { t.Fatalf("expect exit 0, got: %d", code) } out := ui.OutputWriter.String() - if !strings.Contains(out, "foo") { - t.Fatalf("missing foo") + if !strings.Contains(out, "127.0.0.42") { + t.Fatalf("missing 127.0.0.42") } - if !strings.Contains(out, "bar") { - t.Fatalf("missing bar") + if !strings.Contains(out, "198.18.5.5") { + t.Fatalf("missing 198.18.5.5") } } diff --git a/jobspec/parse.go b/jobspec/parse.go index 028f4baae2b..99dbd6676ef 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -701,7 +701,7 @@ func parseArtifactOption(result map[string]string, list *ast.ObjectList) error { } func parseServices(jobName string, taskGroupName string, task *structs.Task, serviceObjs *ast.ObjectList) error { - task.Services = make([]*structs.Service, len(serviceObjs.Items)) + task.ConsulServices = make([]*structs.ConsulService, len(serviceObjs.Items)) var defaultServiceName bool for idx, o := range serviceObjs.Items { // Check for invalid keys @@ -715,7 +715,7 @@ func parseServices(jobName string, taskGroupName string, task *structs.Task, ser return multierror.Prefix(err, fmt.Sprintf("service (%d) ->", idx)) } - var service structs.Service + var service structs.ConsulService var m map[string]interface{} if err := hcl.DecodeObject(&m, o.Val); err != nil { return err @@ -750,13 +750,13 @@ func parseServices(jobName string, taskGroupName string, task *structs.Task, ser } } - task.Services[idx] = &service + task.ConsulServices[idx] = &service } return nil } -func parseChecks(service *structs.Service, checkObjs *ast.ObjectList) error { +func parseChecks(service *structs.ConsulService, checkObjs *ast.ObjectList) error { service.Checks = make([]*structs.ServiceCheck, len(checkObjs.Items)) for idx, co := range checkObjs.Items { // Check for invalid keys diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 10374146eef..e2119668e0a 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -97,7 +97,7 @@ func TestParse(t *testing.T) { }, }, }, - Services: []*structs.Service{ + ConsulServices: []*structs.ConsulService{ { Name: "binstore-storagelocker-binsl-binstore", Tags: []string{"foo", "bar"}, diff --git a/main.go b/main.go index 212c28f502a..2c15085504a 100644 --- a/main.go +++ b/main.go @@ -4,9 +4,14 @@ import ( "fmt" "os" + "github.com/hashicorp/consul/lib" "github.com/mitchellh/cli" ) +func init() { + lib.SeedMathRand() +} + func main() { os.Exit(Run(os.Args[1:])) } diff --git a/nomad/config.go b/nomad/config.go index 1773921984f..d998045d9ab 100644 --- a/nomad/config.go +++ b/nomad/config.go @@ -44,14 +44,14 @@ var ( // Config is used to parameterize the server type Config struct { - // Bootstrap mode is used to bring up the first Consul server. - // It is required so that it can elect a leader without any - // other nodes being present + // Bootstrap mode is used to bring up the first Nomad server. It is + // required so that it can elect a leader without any other nodes + // being present Bootstrap bool - // BootstrapExpect mode is used to automatically bring up a collection of - // Consul servers. This can be used to automatically bring up a collection - // of nodes. + // BootstrapExpect mode is used to automatically bring up a + // collection of Nomad servers. This can be used to automatically + // bring up a collection of nodes. BootstrapExpect int // DataDir is the directory to store our state in diff --git a/nomad/eval_broker.go b/nomad/eval_broker.go index 950c9ac0d3c..d91ee824488 100644 --- a/nomad/eval_broker.go +++ b/nomad/eval_broker.go @@ -348,7 +348,7 @@ func (b *EvalBroker) scanForSchedulers(schedulers []string) (*structs.Evaluation default: // Multiple tasks. We pick a random task so that we fairly // distribute work. - offset := rand.Int63() % int64(n) + offset := rand.Intn(n) return b.dequeueForSched(eligibleSched[offset]) } } diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go index 3f2c037651a..9b2867ecaa3 100644 --- a/nomad/heartbeat.go +++ b/nomad/heartbeat.go @@ -4,6 +4,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/hashicorp/consul/lib" "github.com/hashicorp/nomad/nomad/structs" ) @@ -49,9 +50,8 @@ func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) { // Compute the target TTL value n := len(s.heartbeatTimers) - ttl := rateScaledInterval(s.config.MaxHeartbeatsPerSecond, - s.config.MinHeartbeatTTL, n) - ttl += randomStagger(ttl) + ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n) + ttl += lib.RandomStagger(ttl) // Reset the TTL s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace) @@ -72,7 +72,7 @@ func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) { return } - // Create a new timer to track expiration of thi sheartbeat + // Create a new timer to track expiration of this heartbeat timer := time.AfterFunc(ttl, func() { s.invalidateHeartbeat(id) }) diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index 339b13439b4..5aab5ee89b3 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -48,7 +48,7 @@ func TestJobEndpoint_Register(t *testing.T) { if out.CreateIndex != resp.JobModifyIndex { t.Fatalf("index mis-match") } - serviceName := out.TaskGroups[0].Tasks[0].Services[0].Name + serviceName := out.TaskGroups[0].Tasks[0].ConsulServices[0].Name expectedServiceName := "web-frontend" if serviceName != expectedServiceName { t.Fatalf("Expected Service Name: %s, Actual: %s", expectedServiceName, serviceName) @@ -237,7 +237,7 @@ func TestJobEndpoint_Register_Periodic(t *testing.T) { if out.CreateIndex != resp.JobModifyIndex { t.Fatalf("index mis-match") } - serviceName := out.TaskGroups[0].Tasks[0].Services[0].Name + serviceName := out.TaskGroups[0].Tasks[0].ConsulServices[0].Name expectedServiceName := "web-frontend" if serviceName != expectedServiceName { t.Fatalf("Expected Service Name: %s, Actual: %s", expectedServiceName, serviceName) @@ -573,12 +573,12 @@ func TestJobEndpoint_GetJob(t *testing.T) { // Make a copy of the origin job and change the service name so that we can // do a deep equal with the response from the GET JOB Api j := job - j.TaskGroups[0].Tasks[0].Services[0].Name = "web-frontend" + j.TaskGroups[0].Tasks[0].ConsulServices[0].Name = "web-frontend" for tgix, tg := range j.TaskGroups { for tidx, t := range tg.Tasks { - for sidx, service := range t.Services { + for sidx, service := range t.ConsulServices { for cidx, check := range service.Checks { - check.Name = resp2.Job.TaskGroups[tgix].Tasks[tidx].Services[sidx].Checks[cidx].Name + check.Name = resp2.Job.TaskGroups[tgix].Tasks[tidx].ConsulServices[sidx].Checks[cidx].Name } } } diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 9e920864150..1f7b64dee44 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -94,7 +94,7 @@ func Job() *structs.Job { Env: map[string]string{ "FOO": "bar", }, - Services: []*structs.Service{ + ConsulServices: []*structs.ConsulService{ { Name: "${TASK}-frontend", PortLabel: "http", diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index e115e66599c..25c8e118ef8 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -7,6 +7,7 @@ import ( "github.com/armon/go-metrics" "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/watch" ) @@ -101,6 +102,53 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp // Set the reply index reply.Index = index + snap, err := n.srv.fsm.State().Snapshot() + if err != nil { + return err + } + + n.srv.peerLock.RLock() + defer n.srv.peerLock.RUnlock() + if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { + n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + return err + } + + return nil +} + +// updateNodeUpdateResponse assumes the n.srv.peerLock is held for reading. +func (n *Node) constructNodeServerInfoResponse(snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error { + reply.LeaderRPCAddr = n.srv.raft.Leader() + + // Reply with config information required for future RPC requests + reply.Servers = make([]*structs.NodeServerInfo, 0, len(n.srv.localPeers)) + for k, v := range n.srv.localPeers { + reply.Servers = append(reply.Servers, + &structs.NodeServerInfo{ + RPCAdvertiseAddr: k, + RPCMajorVersion: int32(v.MajorVersion), + RPCMinorVersion: int32(v.MinorVersion), + Datacenter: v.Datacenter, + }) + } + + // TODO(sean@): Use an indexed node count instead + // + // Snapshot is used only to iterate over all nodes to create a node + // count to send back to Nomad Clients in their heartbeat so Clients + // can estimate the size of the cluster. + iter, err := snap.Nodes() + if err == nil { + for { + raw := iter.Next() + if raw == nil { + break + } + reply.NumNodes++ + } + } + return nil } @@ -205,8 +253,15 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct reply.HeartbeatTTL = ttl } - // Set the reply index + // Set the reply index and leader reply.Index = index + n.srv.peerLock.RLock() + defer n.srv.peerLock.RUnlock() + if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { + n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + return err + } + return nil } @@ -298,6 +353,13 @@ func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUp // Set the reply index reply.Index = evalIndex + + n.srv.peerLock.RLock() + defer n.srv.peerLock.RUnlock() + if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { + n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + return err + } return nil } diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 5aede49916e..e796b194c4c 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -237,6 +237,28 @@ func TestClientEndpoint_UpdateStatus_GetEvals(t *testing.T) { func TestClientEndpoint_UpdateStatus_HeartbeatOnly(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() + + s2 := testServer(t, func(c *Config) { + c.DevDisableBootstrap = true + }) + defer s2.Shutdown() + + s3 := testServer(t, func(c *Config) { + c.DevDisableBootstrap = true + }) + defer s3.Shutdown() + servers := []*Server{s1, s2, s3} + testJoin(t, s1, s2, s3) + + for _, s := range servers { + testutil.WaitForResult(func() (bool, error) { + peers, _ := s.raftPeers.Peers() + return len(peers) == 3, nil + }, func(err error) { + t.Fatalf("should have 3 peers") + }) + } + codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) @@ -259,6 +281,12 @@ func TestClientEndpoint_UpdateStatus_HeartbeatOnly(t *testing.T) { t.Fatalf("bad: %#v", ttl) } + // Check for heartbeat servers + serverAddrs := resp.Servers + if len(serverAddrs) == 0 { + t.Fatalf("bad: %#v", serverAddrs) + } + // Update the status, static state dereg := &structs.NodeUpdateStatusRequest{ NodeID: node.ID, diff --git a/nomad/pool.go b/nomad/pool.go index 96328a3f091..669e788989e 100644 --- a/nomad/pool.go +++ b/nomad/pool.go @@ -12,6 +12,7 @@ import ( "github.com/hashicorp/consul/tlsutil" "github.com/hashicorp/net-rpc-msgpackrpc" + "github.com/hashicorp/nomad/client/rpcproxy" "github.com/hashicorp/yamux" ) @@ -373,6 +374,30 @@ func (p *ConnPool) RPC(region string, addr net.Addr, version int, method string, return nil } +// PingNomadServer sends a Status.Ping message to the specified server and +// returns true if healthy, false if an error occurred +func (p *ConnPool) PingNomadServer(region string, apiMajorVersion int, s *rpcproxy.ServerEndpoint) (bool, error) { + // Get a usable client + conn, sc, err := p.getClient(region, s.Addr, apiMajorVersion) + if err != nil { + return false, err + } + + // Make the RPC call + var out struct{} + err = msgpackrpc.CallWithCodec(sc.codec, "Status.Ping", struct{}{}, &out) + if err != nil { + sc.Close() + p.releaseConn(conn) + return false, err + } + + // Done with the connection + conn.returnClient(sc) + p.releaseConn(conn) + return true, nil +} + // Reap is used to close conns open over maxTime func (p *ConnPool) reap() { for { diff --git a/nomad/rpc.go b/nomad/rpc.go index a25566f111f..d65f65868e5 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -11,6 +11,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/hashicorp/consul/lib" "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -215,7 +216,7 @@ func (s *Server) forwardLeader(method string, args interface{}, reply interface{ if server == nil { return structs.ErrNoLeader } - return s.connPool.RPC(s.config.Region, server.Addr, server.Version, method, args, reply) + return s.connPool.RPC(s.config.Region, server.Addr, server.MajorVersion, method, args, reply) } // forwardRegion is used to forward an RPC call to a remote region, or fail if no servers @@ -231,13 +232,13 @@ func (s *Server) forwardRegion(region, method string, args interface{}, reply in } // Select a random addr - offset := rand.Int31() % int32(len(servers)) + offset := rand.Intn(len(servers)) server := servers[offset] s.peerLock.RUnlock() // Forward to remote Nomad metrics.IncrCounter([]string{"nomad", "rpc", "cross-region", region}, 1) - return s.connPool.RPC(region, server.Addr, server.Version, method, args, reply) + return s.connPool.RPC(region, server.Addr, server.MajorVersion, method, args, reply) } // raftApplyFuture is used to encode a message, run it through raft, and return the Raft future. @@ -308,7 +309,7 @@ func (s *Server) blockingRPC(opts *blockingOptions) error { } // Apply a small amount of jitter to the request - opts.queryOpts.MaxQueryTime += randomStagger(opts.queryOpts.MaxQueryTime / jitterFraction) + opts.queryOpts.MaxQueryTime += lib.RandomStagger(opts.queryOpts.MaxQueryTime / jitterFraction) // Setup a query timeout timeout = time.NewTimer(opts.queryOpts.MaxQueryTime) diff --git a/nomad/server.go b/nomad/server.go index 1aad1394869..1932d18b197 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -18,6 +18,7 @@ import ( "github.com/hashicorp/consul/tlsutil" "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" "github.com/hashicorp/raft-boltdb" "github.com/hashicorp/serf/serf" @@ -41,17 +42,6 @@ const ( // raftRemoveGracePeriod is how long we wait to allow a RemovePeer // to replicate to gracefully leave the cluster. raftRemoveGracePeriod = 5 * time.Second - - // apiMajorVersion is returned as part of the Status.Version request. - // It should be incremented anytime the APIs are changed in a way that - // would break clients for sane client versioning. - apiMajorVersion = 1 - - // apiMinorVersion is returned as part of the Status.Version request. - // It should be incremented anytime the APIs are changed to allow - // for sane client versioning. Minor changes should be compatible - // within the major version. - apiMinorVersion = 1 ) // Server is Nomad server which manages the job queues, @@ -249,11 +239,6 @@ func NewServer(config *Config) (*Server, error) { // Emit metrics go s.heartbeatStats() - // Seed the global random. - if err := seedRandom(); err != nil { - return nil, err - } - // Done return s, nil } @@ -539,9 +524,8 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( conf.Tags["role"] = "nomad" conf.Tags["region"] = s.config.Region conf.Tags["dc"] = s.config.Datacenter - conf.Tags["vsn"] = fmt.Sprintf("%d", s.config.ProtocolVersion) - conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin) - conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax) + conf.Tags["vsn"] = fmt.Sprintf("%d", structs.ApiMajorVersion) + conf.Tags["mvn"] = fmt.Sprintf("%d", structs.ApiMinorVersion) conf.Tags["build"] = s.config.Build conf.Tags["port"] = fmt.Sprintf("%d", s.rpcAdvertise.(*net.TCPAddr).Port) if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) { diff --git a/nomad/server_test.go b/nomad/server_test.go index 1ee4e7a65ff..a44cb88da1f 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -11,7 +11,10 @@ import ( "github.com/hashicorp/nomad/testutil" ) -var nextPort uint32 = 15000 +var ( + nextPort uint32 = 15000 + nodeNumber uint32 = 0 +) func getPort() int { return int(atomic.AddUint32(&nextPort, 1)) @@ -34,7 +37,8 @@ func testServer(t *testing.T, cb func(*Config)) *Server { IP: []byte{127, 0, 0, 1}, Port: getPort(), } - config.NodeName = fmt.Sprintf("Node %d", config.RPCAddr.Port) + nodeNumber = atomic.AddUint32(&nodeNumber, 1) + config.NodeName = fmt.Sprintf("nomad-%03d", nodeNumber) // Tighten the Serf timing config.SerfConfig.MemberlistConfig.BindAddr = "127.0.0.1" diff --git a/nomad/status_endpoint.go b/nomad/status_endpoint.go index 335bd6ed0e7..aaddb220137 100644 --- a/nomad/status_endpoint.go +++ b/nomad/status_endpoint.go @@ -18,8 +18,8 @@ func (s *Status) Version(args *structs.GenericRequest, reply *structs.VersionRes reply.Build = conf.Build reply.Versions = map[string]int{ structs.ProtocolVersion: int(conf.ProtocolVersion), - structs.APIMajorVersion: apiMajorVersion, - structs.APIMinorVersion: apiMinorVersion, + structs.APIMajorVersion: structs.ApiMajorVersion, + structs.APIMinorVersion: structs.ApiMinorVersion, } return nil } diff --git a/nomad/status_endpoint_test.go b/nomad/status_endpoint_test.go index ebbab2ead18..9d4407d2650 100644 --- a/nomad/status_endpoint_test.go +++ b/nomad/status_endpoint_test.go @@ -30,10 +30,10 @@ func TestStatusVersion(t *testing.T) { if out.Versions[structs.ProtocolVersion] != ProtocolVersionMax { t.Fatalf("bad: %#v", out) } - if out.Versions[structs.APIMajorVersion] != apiMajorVersion { + if out.Versions[structs.APIMajorVersion] != structs.ApiMajorVersion { t.Fatalf("bad: %#v", out) } - if out.Versions[structs.APIMinorVersion] != apiMinorVersion { + if out.Versions[structs.APIMinorVersion] != structs.ApiMinorVersion { t.Fatalf("bad: %#v", out) } } diff --git a/nomad/structs/config/README.md b/nomad/structs/config/README.md new file mode 100644 index 00000000000..c75016932d3 --- /dev/null +++ b/nomad/structs/config/README.md @@ -0,0 +1,7 @@ +# Overview + +`nomad/structs/config` is a package for configuration `struct`s that are +shared among packages that needs the same `struct` definitions, but can't +import each other without creating a cyle. This `config` package must be +terminal in the import graph (or very close to terminal in the dependency +graph). diff --git a/nomad/structs/config/consul.go b/nomad/structs/config/consul.go new file mode 100644 index 00000000000..cccb91903ee --- /dev/null +++ b/nomad/structs/config/consul.go @@ -0,0 +1,134 @@ +package config + +import ( + "time" + + consul "github.com/hashicorp/consul/api" +) + +// ConsulConfig contains the configuration information necessary to +// communicate with a Consul Agent in order to: +// +// - Register services and their checks with Consul +// +// - Bootstrap this Nomad Client with the list of Nomad Servers registered +// with Consul +// +// Both the Agent and the executor need to be able to import ConsulConfig. +type ConsulConfig struct { + // ServerServiceName is the name of the service that Nomad uses to register + // servers with Consul + ServerServiceName string `mapstructure:"server_service_name"` + + // ClientServiceName is the name of the service that Nomad uses to register + // clients with Consul + ClientServiceName string `mapstructure:"client_service_name"` + + // AutoRegister determines if Nomad will register the Nomad client and + // server agents with Consul + AutoRegister bool `mapstructure:"auto_register"` + + // Addr is the address of the local Consul agent + Addr string `mapstructure:"address"` + + // Timeout is used by Consul HTTP Client + Timeout time.Duration `mapstructure:"timeout"` + + // Token is used to provide a per-request ACL token.This options overrides + // the agent's default token + Token string `mapstructure:"token"` + + // Auth is the information to use for http access to Consul agent + Auth string `mapstructure:"auth"` + + // EnableSSL sets the transport scheme to talk to the Consul agent as https + EnableSSL bool `mapstructure:"ssl"` + + // VerifySSL enables or disables SSL verification when the transport scheme + // for the consul api client is https + VerifySSL bool `mapstructure:"verify_ssl"` + + // CAFile is the path to the ca certificate used for Consul communication + CAFile string `mapstructure:"ca_file"` + + // CertFile is the path to the certificate for Consul communication + CertFile string `mapstructure:"cert_file"` + + // KeyFile is the path to the private key for Consul communication + KeyFile string `mapstructure:"key_file"` + + // ServerAutoJoin enables Nomad servers to find peers by querying Consul and + // joining them + ServerAutoJoin bool `mapstructure:"server_auto_join"` + + // ClientAutoJoin enables Nomad servers to find addresses of Nomad servers + // and register with them + ClientAutoJoin bool `mapstructure:"client_auto_join"` +} + +// Merge merges two Consul Configurations together. +func (a *ConsulConfig) Merge(b *ConsulConfig) *ConsulConfig { + result := *a + + if b.ServerServiceName != "" { + result.ServerServiceName = b.ServerServiceName + } + if b.ClientServiceName != "" { + result.ClientServiceName = b.ClientServiceName + } + if b.AutoRegister { + result.AutoRegister = true + } + if b.Addr != "" { + result.Addr = b.Addr + } + if b.Timeout != 0 { + result.Timeout = b.Timeout + } + if b.Token != "" { + result.Token = b.Token + } + if b.Auth != "" { + result.Auth = b.Auth + } + if b.EnableSSL { + result.EnableSSL = true + } + if b.VerifySSL { + result.VerifySSL = true + } + if b.CAFile != "" { + result.CAFile = b.CAFile + } + if b.CertFile != "" { + result.CertFile = b.CertFile + } + if b.KeyFile != "" { + result.KeyFile = b.KeyFile + } + if b.ServerAutoJoin { + result.ServerAutoJoin = true + } + if b.ClientAutoJoin { + result.ClientAutoJoin = true + } + return &result +} + +// ApiConfig() returns a usable Consul config that can be passed directly to +// hashicorp/consul/api. NOTE: datacenter is not set +func (c *ConsulConfig) ApiConfig() (*consul.Config, error) { + config := consul.DefaultConfig() + if c.Addr != "" { + config.Address = c.Addr + } + if c.Token != "" { + config.Token = c.Token + } + + if c.Timeout != 0 { + config.HttpClient.Timeout = c.Timeout + } + + return config, nil +} diff --git a/nomad/structs/diff.go b/nomad/structs/diff.go index eabc3ec4b52..4d59d2b5ac0 100644 --- a/nomad/structs/diff.go +++ b/nomad/structs/diff.go @@ -386,7 +386,7 @@ func (t *Task) Diff(other *Task, contextual bool) (*TaskDiff, error) { } // Services diff - if sDiffs := serviceDiffs(t.Services, other.Services, contextual); sDiffs != nil { + if sDiffs := serviceDiffs(t.ConsulServices, other.ConsulServices, contextual); sDiffs != nil { diff.Objects = append(diff.Objects, sDiffs...) } @@ -458,18 +458,18 @@ func (t TaskDiffs) Less(i, j int) bool { return t[i].Name < t[j].Name } // serviceDiff returns the diff of two service objects. If contextual diff is // enabled, all fields will be returned, even if no diff occurred. -func serviceDiff(old, new *Service, contextual bool) *ObjectDiff { +func serviceDiff(old, new *ConsulService, contextual bool) *ObjectDiff { diff := &ObjectDiff{Type: DiffTypeNone, Name: "Service"} var oldPrimitiveFlat, newPrimitiveFlat map[string]string if reflect.DeepEqual(old, new) { return nil } else if old == nil { - old = &Service{} + old = &ConsulService{} diff.Type = DiffTypeAdded newPrimitiveFlat = flatmap.Flatten(new, nil, true) } else if new == nil { - new = &Service{} + new = &ConsulService{} diff.Type = DiffTypeDeleted oldPrimitiveFlat = flatmap.Flatten(old, nil, true) } else { @@ -491,9 +491,9 @@ func serviceDiff(old, new *Service, contextual bool) *ObjectDiff { // serviceDiffs diffs a set of services. If contextual diff is enabled, unchanged // fields within objects nested in the tasks will be returned. -func serviceDiffs(old, new []*Service, contextual bool) []*ObjectDiff { - oldMap := make(map[string]*Service, len(old)) - newMap := make(map[string]*Service, len(new)) +func serviceDiffs(old, new []*ConsulService, contextual bool) []*ObjectDiff { + oldMap := make(map[string]*ConsulService, len(old)) + newMap := make(map[string]*ConsulService, len(new)) for _, o := range old { oldMap[o.Name] = o } diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index a851d891b37..0f033a524eb 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -2363,7 +2363,7 @@ func TestTaskDiff(t *testing.T) { { // Services edited (no checks) Old: &Task{ - Services: []*Service{ + ConsulServices: []*ConsulService{ { Name: "foo", PortLabel: "foo", @@ -2379,7 +2379,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*Service{ + ConsulServices: []*ConsulService{ { Name: "bar", PortLabel: "bar", @@ -2452,7 +2452,7 @@ func TestTaskDiff(t *testing.T) { // Services edited (no checks) with context Contextual: true, Old: &Task{ - Services: []*Service{ + ConsulServices: []*ConsulService{ { Name: "foo", PortLabel: "foo", @@ -2460,7 +2460,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*Service{ + ConsulServices: []*ConsulService{ { Name: "foo", PortLabel: "bar", @@ -2486,6 +2486,12 @@ func TestTaskDiff(t *testing.T) { Old: "foo", New: "bar", }, + { + Type: DiffTypeNone, + Name: "ServiceID", + Old: "", + New: "", + }, }, }, }, @@ -2494,7 +2500,7 @@ func TestTaskDiff(t *testing.T) { { // Service Checks edited Old: &Task{ - Services: []*Service{ + ConsulServices: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ @@ -2533,7 +2539,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*Service{ + ConsulServices: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ @@ -2695,7 +2701,7 @@ func TestTaskDiff(t *testing.T) { // Service Checks edited with context Contextual: true, Old: &Task{ - Services: []*Service{ + ConsulServices: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ @@ -2714,7 +2720,7 @@ func TestTaskDiff(t *testing.T) { }, }, New: &Task{ - Services: []*Service{ + ConsulServices: []*ConsulService{ { Name: "foo", Checks: []*ServiceCheck{ @@ -2751,6 +2757,12 @@ func TestTaskDiff(t *testing.T) { Old: "", New: "", }, + { + Type: DiffTypeNone, + Name: "ServiceID", + Old: "", + New: "", + }, }, Objects: []*ObjectDiff{ { @@ -2821,7 +2833,7 @@ func TestTaskDiff(t *testing.T) { } if !reflect.DeepEqual(actual, c.Expected) { - t.Fatalf("case %d: got:\n%#v\n want:\n%#v\n", + t.Errorf("case %d: got:\n%#v\n want:\n%#v\n", i+1, actual, c.Expected) } } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index f624c4bfdde..ea323cb1dd2 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -54,6 +54,21 @@ const ( // that new commands can be added in a way that won't cause // old servers to crash when the FSM attempts to process them. IgnoreUnknownTypeFlag MessageType = 128 + + // ApiMajorVersion is returned as part of the Status.Version request. + // It should be incremented anytime the APIs are changed in a way + // that would break clients for sane client versioning. + ApiMajorVersion = 1 + + // ApiMinorVersion is returned as part of the Status.Version request. + // It should be incremented anytime the APIs are changed to allow + // for sane client versioning. Minor changes should be compatible + // within the major version. + ApiMinorVersion = 1 + + ProtocolVersion = "protocol" + APIMajorVersion = "api.major" + APIMinorVersion = "api.minor" ) // RPCInfo is used to describe common information about query @@ -151,6 +166,25 @@ type NodeDeregisterRequest struct { WriteRequest } +// NodeServerInfo is used to in NodeUpdateResponse to return Nomad server +// information used in RPC server lists. +type NodeServerInfo struct { + // RPCAdvertiseAddr is the IP endpoint that a Nomad Server wishes to + // be contacted at for RPCs. + RPCAdvertiseAddr string + + // RpcMajorVersion is the major version number the Nomad Server + // supports + RPCMajorVersion int32 + + // RpcMinorVersion is the minor version number the Nomad Server + // supports + RPCMinorVersion int32 + + // Datacenter is the datacenter that a Nomad server belongs to + Datacenter string +} + // NodeUpdateStatusRequest is used for Node.UpdateStatus endpoint // to update the status of a node. type NodeUpdateStatusRequest struct { @@ -292,7 +326,7 @@ type AllocSpecificRequest struct { QueryOptions } -// AllocsGetcRequest is used to query a set of allocations +// AllocsGetRequest is used to query a set of allocations type AllocsGetRequest struct { AllocIDs []string QueryOptions @@ -316,12 +350,6 @@ type GenericResponse struct { WriteMeta } -const ( - ProtocolVersion = "protocol" - APIMajorVersion = "api.major" - APIMinorVersion = "api.minor" -) - // VersionResponse is used for the Status.Version reseponse type VersionResponse struct { Build string @@ -351,6 +379,20 @@ type NodeUpdateResponse struct { EvalIDs []string EvalCreateIndex uint64 NodeModifyIndex uint64 + + // LeaderRPCAddr is the RPC address of the current Raft Leader. If + // empty, the current Nomad Server is in the minority of a partition. + LeaderRPCAddr string + + // NumNodes is the number of Nomad nodes attached to this quorum of + // Nomad Servers at the time of the response. This value can + // fluctuate based on the health of the cluster between heartbeats. + NumNodes int32 + + // Servers is the full list of known Nomad servers in the local + // region. + Servers []*NodeServerInfo + QueryMeta } @@ -1502,27 +1544,31 @@ func (sc *ServiceCheck) Hash(serviceID string) string { return fmt.Sprintf("%x", h.Sum(nil)) } -const ( - NomadConsulPrefix = "nomad-registered-service" -) +// The ConsulService model represents a Consul service definition in Nomad +// Agent's Config. +type ConsulService struct { + // ServiceID is the calculated Consul ServiceID used for a service. + // This value is not available to be set via configuration. + ServiceID string `mapstructure:"-"` -var ( - AgentServicePrefix = fmt.Sprintf("%s-%s", NomadConsulPrefix, "agent") -) + // Name of the service registered with Consul. Consul defaults the + // Name to ServiceID if not specified. The Name if specified is used + // as one of the seed values when generating a Consul ServiceID. + Name string -// The Service model represents a Consul service definition -type Service struct { - Name string // Name of the service, defaults to id + // PortLabel is either the numeric port number or the `host:port`. + // To specify the port number using the host's Consul Advertise + // address, specify an empty host in the PortLabel (e.g. `:port`). + PortLabel string `mapstructure:"port"` Tags []string // List of tags for the service - PortLabel string `mapstructure:"port"` // port for the service Checks []*ServiceCheck // List of checks associated with the service } -func (s *Service) Copy() *Service { +func (s *ConsulService) Copy() *ConsulService { if s == nil { return nil } - ns := new(Service) + ns := new(ConsulService) *ns = *s ns.Tags = CopySliceString(ns.Tags) @@ -1539,7 +1585,7 @@ func (s *Service) Copy() *Service { // InitFields interpolates values of Job, Task Group and Task in the Service // Name. This also generates check names, service id and check ids. -func (s *Service) InitFields(job string, taskGroup string, task string) { +func (s *ConsulService) InitFields(job string, taskGroup string, task string) { s.Name = args.ReplaceEnv(s.Name, map[string]string{ "JOB": job, "TASKGROUP": taskGroup, @@ -1555,12 +1601,8 @@ func (s *Service) InitFields(job string, taskGroup string, task string) { } } -func (s *Service) ID(identifier string) string { - return fmt.Sprintf("%s-%s-%s", NomadConsulPrefix, identifier, s.Hash()) -} - // Validate checks if the Check definition is valid -func (s *Service) Validate() error { +func (s *ConsulService) Validate() error { var mErr multierror.Error // Ensure the service name is valid per RFC-952 ยง1 @@ -1586,7 +1628,7 @@ func (s *Service) Validate() error { // Hash calculates the hash of the check based on it's content and the service // which owns it -func (s *Service) Hash() string { +func (s *ConsulService) Hash() string { h := sha1.New() io.WriteString(h, s.Name) io.WriteString(h, strings.Join(s.Tags, "")) @@ -1645,7 +1687,7 @@ type Task struct { Env map[string]string // List of service definitions exposed by the Task - Services []*Service + ConsulServices []*ConsulService // Constraints can be specified at a task level and apply only to // the particular task. @@ -1678,12 +1720,12 @@ func (t *Task) Copy() *Task { *nt = *t nt.Env = CopyMapStringString(nt.Env) - if t.Services != nil { - services := make([]*Service, len(nt.Services)) - for i, s := range nt.Services { + if t.ConsulServices != nil { + services := make([]*ConsulService, len(nt.ConsulServices)) + for i, s := range nt.ConsulServices { services[i] = s.Copy() } - nt.Services = services + nt.ConsulServices = services } nt.Constraints = CopySliceConstraints(nt.Constraints) @@ -1720,7 +1762,7 @@ func (t *Task) InitFields(job *Job, tg *TaskGroup) { // and Tasks in all the service Names of a Task. This also generates the service // id, check id and check names. func (t *Task) InitServiceFields(job string, taskGroup string) { - for _, service := range t.Services { + for _, service := range t.ConsulServices { service.InitFields(job, taskGroup, t.Name) } } @@ -1817,7 +1859,7 @@ func validateServices(t *Task) error { // unique. servicePorts := make(map[string][]string) knownServices := make(map[string]struct{}) - for i, service := range t.Services { + for i, service := range t.ConsulServices { if err := service.Validate(); err != nil { outer := fmt.Errorf("service %d validation failed: %s", i, err) mErr.Errors = append(mErr.Errors, outer) @@ -2268,9 +2310,6 @@ type Allocation struct { // task. These should sum to the total Resources. TaskResources map[string]*Resources - // Services is a map of service names to service ids - Services map[string]string - // Metrics associated with this allocation Metrics *AllocMetric @@ -2320,14 +2359,6 @@ func (a *Allocation) Copy() *Allocation { na.TaskResources = tr } - if a.Services != nil { - s := make(map[string]string, len(na.Services)) - for service, id := range na.Services { - s[service] = id - } - na.Services = s - } - na.Metrics = na.Metrics.Copy() if a.TaskStates != nil { @@ -2396,31 +2427,6 @@ func (a *Allocation) Stub() *AllocListStub { } } -// PopulateServiceIDs generates the service IDs for all the service definitions -// in that Allocation -func (a *Allocation) PopulateServiceIDs(tg *TaskGroup) { - // Retain the old services, and re-initialize. We may be removing - // services, so we cannot update the existing map. - previous := a.Services - a.Services = make(map[string]string) - - for _, task := range tg.Tasks { - for _, service := range task.Services { - // Retain the service if an ID is already generated - if id, ok := previous[service.Name]; ok { - a.Services[service.Name] = id - continue - } - - // If the service hasn't been generated an ID, we generate one. - // We add a prefix to the Service ID so that we can know that this service - // is managed by Nomad since Consul can also have service which are not - // managed by Nomad - a.Services[service.Name] = fmt.Sprintf("%s-%s", NomadConsulPrefix, GenerateUUID()) - } - } -} - var ( // AllocationIndexRegex is a regular expression to find the allocation index. AllocationIndexRegex = regexp.MustCompile(".+\\[(\\d+)\\]$") diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index e60ceadcfa7..5c56e9ff10d 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -136,7 +136,7 @@ func testJob() *Job { GetterSource: "http://foo.com", }, }, - Services: []*Service{ + ConsulServices: []*ConsulService{ { Name: "${TASK}-frontend", PortLabel: "http", @@ -283,7 +283,7 @@ func TestTask_Validate(t *testing.T) { } func TestTask_Validate_Services(t *testing.T) { - s1 := &Service{ + s1 := &ConsulService{ Name: "service-name", PortLabel: "bar", Checks: []*ServiceCheck{ @@ -298,7 +298,7 @@ func TestTask_Validate_Services(t *testing.T) { }, } - s2 := &Service{ + s2 := &ConsulService{ Name: "service-name", } @@ -311,7 +311,7 @@ func TestTask_Validate_Services(t *testing.T) { MemoryMB: 100, IOPS: 10, }, - Services: []*Service{s1, s2}, + ConsulServices: []*ConsulService{s1, s2}, } err := task.Validate() if err == nil { @@ -568,7 +568,7 @@ func BenchmarkEncodeDecode(b *testing.B) { } func TestInvalidServiceCheck(t *testing.T) { - s := Service{ + s := ConsulService{ Name: "service-name", PortLabel: "bar", Checks: []*ServiceCheck{ @@ -582,7 +582,7 @@ func TestInvalidServiceCheck(t *testing.T) { t.Fatalf("Service should be invalid (invalid type)") } - s = Service{ + s = ConsulService{ Name: "service.name", PortLabel: "bar", } @@ -590,7 +590,7 @@ func TestInvalidServiceCheck(t *testing.T) { t.Fatalf("Service should be invalid (contains a dot): %v", err) } - s = Service{ + s = ConsulService{ Name: "-my-service", PortLabel: "bar", } @@ -598,7 +598,7 @@ func TestInvalidServiceCheck(t *testing.T) { t.Fatalf("Service should be invalid (begins with a hyphen): %v", err) } - s = Service{ + s = ConsulService{ Name: "abcdef0123456789-abcdef0123456789-abcdef0123456789-abcdef0123456", PortLabel: "bar", } @@ -606,7 +606,7 @@ func TestInvalidServiceCheck(t *testing.T) { t.Fatalf("Service should be invalid (too long): %v", err) } - s = Service{ + s = ConsulService{ Name: "service-name", Checks: []*ServiceCheck{ { @@ -628,7 +628,7 @@ func TestInvalidServiceCheck(t *testing.T) { t.Fatalf("service should be invalid (tcp/http checks with no port): %v", err) } - s = Service{ + s = ConsulService{ Name: "service-name", Checks: []*ServiceCheck{ { @@ -684,7 +684,7 @@ func TestService_InitFields(t *testing.T) { taskGroup := "cache" task := "redis" - s := Service{ + s := ConsulService{ Name: "${TASK}-db", } @@ -722,7 +722,7 @@ func TestJob_ExpandServiceNames(t *testing.T) { Tasks: []*Task{ { Name: "frontend", - Services: []*Service{ + ConsulServices: []*ConsulService{ { Name: "${BASE}-default", }, @@ -746,12 +746,12 @@ func TestJob_ExpandServiceNames(t *testing.T) { j.InitFields() - service1Name := j.TaskGroups[0].Tasks[0].Services[0].Name + service1Name := j.TaskGroups[0].Tasks[0].ConsulServices[0].Name if service1Name != "my-job-web-frontend-default" { t.Fatalf("Expected Service Name: %s, Actual: %s", "my-job-web-frontend-default", service1Name) } - service2Name := j.TaskGroups[0].Tasks[0].Services[1].Name + service2Name := j.TaskGroups[0].Tasks[0].ConsulServices[1].Name if service2Name != "jmx" { t.Fatalf("Expected Service Name: %s, Actual: %s", "jmx", service2Name) } diff --git a/nomad/types/types.go b/nomad/types/types.go new file mode 100644 index 00000000000..2a05ddbb3ff --- /dev/null +++ b/nomad/types/types.go @@ -0,0 +1,3 @@ +package types + +type PeriodicCallback func() error diff --git a/nomad/util.go b/nomad/util.go index 1c7dba0bd48..d2f50bb4861 100644 --- a/nomad/util.go +++ b/nomad/util.go @@ -2,17 +2,12 @@ package nomad import ( "fmt" - "math" - "math/big" "math/rand" "net" "os" "path/filepath" "runtime" "strconv" - "time" - - crand "crypto/rand" "github.com/hashicorp/serf/serf" ) @@ -39,14 +34,15 @@ func RuntimeStats() map[string]string { // serverParts is used to return the parts of a server role type serverParts struct { - Name string - Region string - Datacenter string - Port int - Bootstrap bool - Expect int - Version int - Addr net.Addr + Name string + Region string + Datacenter string + Port int + Bootstrap bool + Expect int + MajorVersion int + MinorVersion int + Addr net.Addr } func (s *serverParts) String() string { @@ -81,31 +77,36 @@ func isNomadServer(m serf.Member) (bool, *serverParts) { return false, nil } - vsn_str := m.Tags["vsn"] - vsn, err := strconv.Atoi(vsn_str) + // The "vsn" tag was Version, which is now the MajorVersion number. + majorVersionStr := m.Tags["vsn"] + majorVersion, err := strconv.Atoi(majorVersionStr) if err != nil { return false, nil } + // To keep some semblance of convention, "mvn" is now the "Minor + // Version Number." + minorVersionStr := m.Tags["mvn"] + minorVersion, err := strconv.Atoi(minorVersionStr) + if err != nil { + minorVersion = 0 + } + addr := &net.TCPAddr{IP: m.Addr, Port: port} parts := &serverParts{ - Name: m.Name, - Region: region, - Datacenter: datacenter, - Port: port, - Bootstrap: bootstrap, - Expect: expect, - Addr: addr, - Version: vsn, + Name: m.Name, + Region: region, + Datacenter: datacenter, + Port: port, + Bootstrap: bootstrap, + Expect: expect, + Addr: addr, + MajorVersion: majorVersion, + MinorVersion: minorVersion, } return true, parts } -// Returns a random stagger interval between 0 and the duration -func randomStagger(intv time.Duration) time.Duration { - return time.Duration(uint64(rand.Int63()) % uint64(intv)) -} - // shuffleStrings randomly shuffles the list of strings func shuffleStrings(list []string) { for i := range list { @@ -121,24 +122,3 @@ func maxUint64(a, b uint64) uint64 { } return b } - -// rateScaledInterval is used to choose an interval to perform an action in order -// to target an aggregate number of actions per second across the whole cluster. -func rateScaledInterval(rate float64, min time.Duration, n int) time.Duration { - interval := time.Duration(float64(time.Second) * float64(n) / rate) - if interval < min { - return min - } - return interval -} - -// seedRandom seeds the global random variable using a cryptographically random -// seed. It returns an error if determing the random seed fails. -func seedRandom() error { - n, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) - if err != nil { - return err - } - rand.Seed(n.Int64()) - return nil -} diff --git a/nomad/util_test.go b/nomad/util_test.go index d1a399590e9..60a6e6918ad 100644 --- a/nomad/util_test.go +++ b/nomad/util_test.go @@ -4,7 +4,6 @@ import ( "net" "reflect" "testing" - "time" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/serf/serf" @@ -45,7 +44,7 @@ func TestIsNomadServer(t *testing.T) { if parts.Addr.String() != "127.0.0.1:10000" { t.Fatalf("bad addr: %v", parts.Addr) } - if parts.Version != 1 { + if parts.MajorVersion != 1 { t.Fatalf("bad: %v", parts) } @@ -57,16 +56,6 @@ func TestIsNomadServer(t *testing.T) { } } -func TestRandomStagger(t *testing.T) { - intv := time.Minute - for i := 0; i < 10; i++ { - stagger := randomStagger(intv) - if stagger < 0 || stagger >= intv { - t.Fatalf("Bad: %v", stagger) - } - } -} - func TestShuffleStrings(t *testing.T) { // Generate input inp := make([]string, 10) @@ -98,26 +87,3 @@ func TestMaxUint64(t *testing.T) { t.Fatalf("bad") } } - -func TestRateScaledInterval(t *testing.T) { - min := 1 * time.Second - rate := 200.0 - if v := rateScaledInterval(rate, min, 0); v != min { - t.Fatalf("Bad: %v", v) - } - if v := rateScaledInterval(rate, min, 100); v != min { - t.Fatalf("Bad: %v", v) - } - if v := rateScaledInterval(rate, min, 200); v != 1*time.Second { - t.Fatalf("Bad: %v", v) - } - if v := rateScaledInterval(rate, min, 1000); v != 5*time.Second { - t.Fatalf("Bad: %v", v) - } - if v := rateScaledInterval(rate, min, 5000); v != 25*time.Second { - t.Fatalf("Bad: %v", v) - } - if v := rateScaledInterval(rate, min, 10000); v != 50*time.Second { - t.Fatalf("Bad: %v", v) - } -} diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 0a942cd1398..584b8a0176d 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -424,10 +424,6 @@ func (s *GenericScheduler) computePlacements(place []allocTuple) error { ClientStatus: structs.AllocClientStatusPending, } - // Generate service IDs tasks in this allocation - // COMPAT - This is no longer required and would be removed in v0.4 - alloc.PopulateServiceIDs(missing.TaskGroup) - s.plan.AppendAlloc(alloc) } else { // Lazy initialize the failed map diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index 3b8a6f4389c..42f509b395f 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -264,10 +264,6 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error { ClientStatus: structs.AllocClientStatusPending, } - // Generate service IDs tasks in this allocation - // COMPAT - This is no longer required and would be removed in v0.4 - alloc.PopulateServiceIDs(missing.TaskGroup) - s.plan.AppendAlloc(alloc) } else { // Lazy initialize the failed map diff --git a/scheduler/util.go b/scheduler/util.go index 1c1c93275f0..269a4b4f04b 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -455,7 +455,6 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, newAlloc.Metrics = ctx.Metrics() newAlloc.DesiredStatus = structs.AllocDesiredStatusRun newAlloc.ClientStatus = structs.AllocClientStatusPending - newAlloc.PopulateServiceIDs(update.TaskGroup) ctx.Plan().AppendAlloc(newAlloc) // Remove this allocation from the slice diff --git a/scheduler/util_test.go b/scheduler/util_test.go index fc2a1f13324..0e8a5d5d8db 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -664,22 +664,14 @@ func TestInplaceUpdate_Success(t *testing.T) { DesiredStatus: structs.AllocDesiredStatusRun, } alloc.TaskResources = map[string]*structs.Resources{"web": alloc.Resources} - alloc.PopulateServiceIDs(job.TaskGroups[0]) noErr(t, state.UpsertAllocs(1001, []*structs.Allocation{alloc})) - webFeSrvID := alloc.Services["web-frontend"] - adminSrvID := alloc.Services["web-admin"] - - if webFeSrvID == "" || adminSrvID == "" { - t.Fatal("Service ID needs to be generated for service") - } - // Create a new task group that updates the resources. tg := &structs.TaskGroup{} *tg = *job.TaskGroups[0] resource := &structs.Resources{CPU: 737} tg.Tasks[0].Resources = resource - newServices := []*structs.Service{ + newServices := []*structs.ConsulService{ { Name: "dummy-service", PortLabel: "http", @@ -691,10 +683,10 @@ func TestInplaceUpdate_Success(t *testing.T) { } // Delete service 2 - tg.Tasks[0].Services = tg.Tasks[0].Services[:1] + tg.Tasks[0].ConsulServices = tg.Tasks[0].ConsulServices[:1] // Add the new services - tg.Tasks[0].Services = append(tg.Tasks[0].Services, newServices...) + tg.Tasks[0].ConsulServices = append(tg.Tasks[0].ConsulServices, newServices...) updates := []allocTuple{{Alloc: alloc, TaskGroup: tg}} stack := NewGenericStack(false, ctx) @@ -716,20 +708,35 @@ func TestInplaceUpdate_Success(t *testing.T) { } // Get the alloc we inserted. - a := ctx.plan.NodeAllocation[alloc.NodeID][0] - if len(a.Services) != 3 { - t.Fatalf("Expected number of services: %v, Actual: %v", 3, len(a.Services)) + a := inplace[0].Alloc // TODO(sean@): Verify this is correct vs: ctx.plan.NodeAllocation[alloc.NodeID][0] + if a.Job == nil { + t.Fatalf("bad") } - // Test that the service id for the old service is still the same - if a.Services["web-frontend"] != webFeSrvID { - t.Fatalf("Expected service ID: %v, Actual: %v", webFeSrvID, a.Services["web-frontend"]) + if len(a.Job.TaskGroups) != 1 { + t.Fatalf("bad") + } + + if len(a.Job.TaskGroups[0].Tasks) != 1 { + t.Fatalf("bad") } - // Test that the map doesn't contain the service ID of the admin Service - // anymore - if _, ok := a.Services["web-admin"]; ok { - t.Fatal("Service shouldn't be present") + if len(a.Job.TaskGroups[0].Tasks[0].ConsulServices) != 3 { + t.Fatalf("Expected number of services: %v, Actual: %v", 3, len(a.Job.TaskGroups[0].Tasks[0].ConsulServices)) + } + + serviceNames := make(map[string]struct{}, 3) + for _, consulService := range a.Job.TaskGroups[0].Tasks[0].ConsulServices { + serviceNames[consulService.Name] = struct{}{} + } + if len(serviceNames) != 3 { + t.Fatalf("bad") + } + + for _, name := range []string{"dummy-service", "dummy-service2", "web-frontend"} { + if _, found := serviceNames[name]; !found { + t.Errorf("Expected consul service name missing: %v", name) + } } } diff --git a/vendor/github.com/hashicorp/consul/lib/cluster.go b/vendor/github.com/hashicorp/consul/lib/cluster.go new file mode 100644 index 00000000000..a95232c5737 --- /dev/null +++ b/vendor/github.com/hashicorp/consul/lib/cluster.go @@ -0,0 +1,56 @@ +package lib + +import ( + "math/rand" + "time" +) + +// DurationMinusBuffer returns a duration, minus a buffer and jitter +// subtracted from the duration. This function is used primarily for +// servicing Consul TTL Checks in advance of the TTL. +func DurationMinusBuffer(intv time.Duration, buffer time.Duration, jitter int64) time.Duration { + d := intv - buffer + if jitter == 0 { + d -= RandomStagger(d) + } else { + d -= RandomStagger(time.Duration(int64(d) / jitter)) + } + return d +} + +// DurationMinusBufferDomain returns the domain of valid durations from a +// call to DurationMinusBuffer. This function is used to check user +// specified input values to DurationMinusBuffer. +func DurationMinusBufferDomain(intv time.Duration, buffer time.Duration, jitter int64) (min time.Duration, max time.Duration) { + max = intv - buffer + if jitter == 0 { + min = max + } else { + min = max - time.Duration(int64(max)/jitter) + } + return min, max +} + +// Returns a random stagger interval between 0 and the duration +func RandomStagger(intv time.Duration) time.Duration { + if intv == 0 { + return 0 + } + return time.Duration(uint64(rand.Int63()) % uint64(intv)) +} + +// RateScaledInterval is used to choose an interval to perform an action in +// order to target an aggregate number of actions per second across the whole +// cluster. +func RateScaledInterval(rate float64, min time.Duration, n int) time.Duration { + const minRate = 1 / 86400 // 1/(1 * time.Day) + if rate <= minRate { + return min + } + interval := time.Duration(float64(time.Second) * float64(n) / rate) + if interval < min { + return min + } + + return interval +} diff --git a/vendor/github.com/hashicorp/consul/lib/math.go b/vendor/github.com/hashicorp/consul/lib/math.go new file mode 100644 index 00000000000..1d0b6dc0f6b --- /dev/null +++ b/vendor/github.com/hashicorp/consul/lib/math.go @@ -0,0 +1,22 @@ +package lib + +func AbsInt(a int) int { + if a > 0 { + return a + } + return a * -1 +} + +func MaxInt(a, b int) int { + if a > b { + return a + } + return b +} + +func MinInt(a, b int) int { + if a > b { + return b + } + return a +} diff --git a/vendor/github.com/hashicorp/consul/lib/rand.go b/vendor/github.com/hashicorp/consul/lib/rand.go new file mode 100644 index 00000000000..22aa4f3544b --- /dev/null +++ b/vendor/github.com/hashicorp/consul/lib/rand.go @@ -0,0 +1,34 @@ +package lib + +import ( + crand "crypto/rand" + "math" + "math/big" + "math/rand" + "sync" + "time" +) + +var ( + once sync.Once + + // SeededSecurely is set to true if a cryptographically secure seed + // was used to initialize rand. When false, the start time is used + // as a seed. + SeededSecurely bool +) + +// SeedMathRand provides weak, but guaranteed seeding, which is better than +// running with Go's default seed of 1. A call to SeedMathRand() is expected +// to be called via init(), but never a second time. +func SeedMathRand() { + once.Do(func() { + n, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) + if err != nil { + rand.Seed(time.Now().UTC().UnixNano()) + return + } + rand.Seed(n.Int64()) + SeededSecurely = true + }) +} diff --git a/vendor/github.com/hashicorp/consul/lib/string.go b/vendor/github.com/hashicorp/consul/lib/string.go new file mode 100644 index 00000000000..0780abb632c --- /dev/null +++ b/vendor/github.com/hashicorp/consul/lib/string.go @@ -0,0 +1,11 @@ +package lib + +// StrContains checks if a list contains a string +func StrContains(l []string, s string) bool { + for _, v := range l { + if v == s { + return true + } + } + return false +} diff --git a/website/source/docs/http/agent-members.html.md b/website/source/docs/http/agent-members.html.md index f56b5f9229c..e686b574cda 100644 --- a/website/source/docs/http/agent-members.html.md +++ b/website/source/docs/http/agent-members.html.md @@ -46,9 +46,7 @@ the gossip pool. This is only applicable to servers. "port": "4647", "region": "global", "role": "nomad", - "vsn": "1", - "vsn_max": "1", - "vsn_min": "1" + "vsn": "1" }, "Status": "alive", "ProtocolMin": 1, diff --git a/website/source/docs/http/agent-self.html.md b/website/source/docs/http/agent-self.html.md index a7a03debc24..91fb615deca 100644 --- a/website/source/docs/http/agent-self.html.md +++ b/website/source/docs/http/agent-self.html.md @@ -98,9 +98,7 @@ The `self` endpoint is used to query the state of the target agent. "port": "4647", "region": "global", "role": "nomad", - "vsn": "1", - "vsn_max": "1", - "vsn_min": "1" + "vsn": "1" }, "Status": "alive", "ProtocolMin": 1,