From 10cb924b2cbf82d4f7947aafe99866355e0b9a14 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 31 Jan 2017 16:43:57 -0800 Subject: [PATCH] Refactor Consul Syncer into new ServiceClient Fixes #2478 #2474 #1995 #2294 The new client only handles agent and task service advertisement. Server discovery is mostly unchanged. The Nomad client agent now handles all Consul operations instead of the executor handling task related operations. When upgrading from an earlier version of Nomad existing executors will be told to deregister from Consul so that the Nomad agent can re-register the task's services and checks. Drivers - other than qemu - now support an Exec method for executing abritrary commands in a task's environment. This is used to implement script checks. Interfaces are used extensively to avoid interacting with Consul in tests that don't assert any Consul related behavior. --- client/alloc_runner.go | 36 +- client/alloc_runner_test.go | 10 +- client/allocdir/alloc_dir.go | 8 +- client/client.go | 77 +- client/client_test.go | 30 +- client/consul.go | 13 + client/consul_test.go | 56 + client/driver/docker.go | 50 +- client/driver/driver_test.go | 13 + client/driver/exec.go | 18 +- client/driver/exec_test.go | 64 ++ client/driver/executor/checks.go | 205 ---- client/driver/executor/checks_linux_test.go | 56 - client/driver/executor/checks_test.go | 96 -- client/driver/executor/checks_unix.go | 18 - client/driver/executor/checks_windows.go | 8 - client/driver/executor/executor.go | 148 +-- client/driver/executor/executor_test.go | 26 - client/driver/executor_plugin.go | 16 +- client/driver/java.go | 21 +- client/driver/mock_driver.go | 6 + client/driver/qemu.go | 12 - client/driver/raw_exec.go | 15 +- client/driver/raw_exec_test.go | 62 + client/driver/rkt.go | 60 +- client/driver/rkt_test.go | 74 ++ client/driver/utils.go | 57 +- client/driver/utils_unix.go | 8 + client/task_runner.go | 72 +- client/task_runner_test.go | 5 +- command/agent/agent.go | 102 +- command/agent/consul/chaos_test.go | 193 ---- command/agent/consul/check.go | 91 -- command/agent/consul/client.go | 636 +++++++++++ command/agent/consul/int_test.go | 228 ++++ command/agent/consul/mock.go | 27 + command/agent/consul/script.go | 136 +++ command/agent/consul/script_test.go | 165 +++ command/agent/consul/syncer.go | 1016 ----------------- command/agent/consul/syncer_test.go | 358 ------ command/agent/consul/unit_test.go | 497 ++++++++ nomad/server.go | 70 +- nomad/server_test.go | 8 +- .../docs/job-specification/service.html.md | 2 +- 44 files changed, 2360 insertions(+), 2509 deletions(-) create mode 100644 client/consul.go create mode 100644 client/consul_test.go delete mode 100644 client/driver/executor/checks.go delete mode 100644 client/driver/executor/checks_linux_test.go delete mode 100644 client/driver/executor/checks_test.go delete mode 100644 client/driver/executor/checks_unix.go delete mode 100644 client/driver/executor/checks_windows.go delete mode 100644 command/agent/consul/chaos_test.go delete mode 100644 command/agent/consul/check.go create mode 100644 command/agent/consul/client.go create mode 100644 command/agent/consul/int_test.go create mode 100644 command/agent/consul/mock.go create mode 100644 command/agent/consul/script.go create mode 100644 command/agent/consul/script_test.go delete mode 100644 command/agent/consul/syncer.go delete mode 100644 command/agent/consul/syncer_test.go create mode 100644 command/agent/consul/unit_test.go diff --git a/client/alloc_runner.go b/client/alloc_runner.go index 719a84d4a67..8ca6f671eae 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -59,7 +59,8 @@ type AllocRunner struct { updateCh chan *structs.Allocation - vaultClient vaultclient.VaultClient + vaultClient vaultclient.VaultClient + consulClient ConsulServiceAPI otherAllocDir *allocdir.AllocDir @@ -96,20 +97,23 @@ type allocRunnerState struct { // NewAllocRunner is used to create a new allocation context func NewAllocRunner(logger *log.Logger, config *config.Config, updater AllocStateUpdater, - alloc *structs.Allocation, vaultClient vaultclient.VaultClient) *AllocRunner { + alloc *structs.Allocation, vaultClient vaultclient.VaultClient, + consulClient ConsulServiceAPI) *AllocRunner { + ar := &AllocRunner{ - config: config, - updater: updater, - logger: logger, - alloc: alloc, - dirtyCh: make(chan struct{}, 1), - tasks: make(map[string]*TaskRunner), - taskStates: copyTaskStates(alloc.TaskStates), - restored: make(map[string]struct{}), - updateCh: make(chan *structs.Allocation, 64), - destroyCh: make(chan struct{}), - waitCh: make(chan struct{}), - vaultClient: vaultClient, + config: config, + updater: updater, + logger: logger, + alloc: alloc, + dirtyCh: make(chan struct{}, 1), + tasks: make(map[string]*TaskRunner), + taskStates: copyTaskStates(alloc.TaskStates), + restored: make(map[string]struct{}), + updateCh: make(chan *structs.Allocation, 64), + destroyCh: make(chan struct{}), + waitCh: make(chan struct{}), + vaultClient: vaultClient, + consulClient: consulClient, } return ar } @@ -174,7 +178,7 @@ func (r *AllocRunner) RestoreState() error { } task := &structs.Task{Name: name} - tr := NewTaskRunner(r.logger, r.config, r.setTaskState, td, r.Alloc(), task, r.vaultClient) + tr := NewTaskRunner(r.logger, r.config, r.setTaskState, td, r.Alloc(), task, r.vaultClient, r.consulClient) r.tasks[name] = tr // Skip tasks in terminal states. @@ -512,7 +516,7 @@ func (r *AllocRunner) Run() { taskdir := r.allocDir.NewTaskDir(task.Name) r.allocDirLock.Unlock() - tr := NewTaskRunner(r.logger, r.config, r.setTaskState, taskdir, r.Alloc(), task.Copy(), r.vaultClient) + tr := NewTaskRunner(r.logger, r.config, r.setTaskState, taskdir, r.Alloc(), task.Copy(), r.vaultClient, r.consulClient) r.tasks[task.Name] = tr tr.MarkReceived() diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go index 6bac5e4051b..f1bc828b807 100644 --- a/client/alloc_runner_test.go +++ b/client/alloc_runner_test.go @@ -40,7 +40,7 @@ func testAllocRunnerFromAlloc(alloc *structs.Allocation, restarts bool) (*MockAl alloc.Job.Type = structs.JobTypeBatch } vclient := vaultclient.NewMockVaultClient() - ar := NewAllocRunner(logger, conf, upd.Update, alloc, vclient) + ar := NewAllocRunner(logger, conf, upd.Update, alloc, vclient, newMockConsulServiceClient()) return upd, ar } @@ -323,7 +323,8 @@ func TestAllocRunner_SaveRestoreState(t *testing.T) { // Create a new alloc runner l2 := prefixedTestLogger("----- ar2: ") ar2 := NewAllocRunner(l2, ar.config, upd.Update, - &structs.Allocation{ID: ar.alloc.ID}, ar.vaultClient) + &structs.Allocation{ID: ar.alloc.ID}, ar.vaultClient, + ar.consulClient) err = ar2.RestoreState() if err != nil { t.Fatalf("err: %v", err) @@ -415,7 +416,7 @@ func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) { // Create a new alloc runner ar2 := NewAllocRunner(ar.logger, ar.config, upd.Update, - &structs.Allocation{ID: ar.alloc.ID}, ar.vaultClient) + &structs.Allocation{ID: ar.alloc.ID}, ar.vaultClient, ar.consulClient) ar2.logger = prefixedTestLogger("ar2: ") err = ar2.RestoreState() if err != nil { @@ -573,7 +574,8 @@ func TestAllocRunner_RestoreOldState(t *testing.T) { *alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0} alloc.Job.Type = structs.JobTypeBatch vclient := vaultclient.NewMockVaultClient() - ar := NewAllocRunner(logger, conf, upd.Update, alloc, vclient) + cclient := newMockConsulServiceClient() + ar := NewAllocRunner(logger, conf, upd.Update, alloc, vclient, cclient) defer ar.Destroy() // RestoreState should fail on the task state since we only test the diff --git a/client/allocdir/alloc_dir.go b/client/allocdir/alloc_dir.go index dd665926112..749f5da1793 100644 --- a/client/allocdir/alloc_dir.go +++ b/client/allocdir/alloc_dir.go @@ -34,8 +34,12 @@ var ( // included in snapshots. SharedDataDir = "data" + // TmpDirName is the name of the temporary directory in each alloc and + // task. + TmpDirName = "tmp" + // The set of directories that exist inside eache shared alloc directory. - SharedAllocDirs = []string{LogDirName, "tmp", SharedDataDir} + SharedAllocDirs = []string{LogDirName, TmpDirName, SharedDataDir} // The name of the directory that exists inside each task directory // regardless of driver. @@ -46,7 +50,7 @@ var ( TaskSecrets = "secrets" // TaskDirs is the set of directories created in each tasks directory. - TaskDirs = map[string]os.FileMode{"tmp": os.ModeSticky | 0777} + TaskDirs = map[string]os.FileMode{TmpDirName: os.ModeSticky | 0777} ) type AllocDir struct { diff --git a/client/client.go b/client/client.go index 7183c939f1d..89e14823b85 100644 --- a/client/client.go +++ b/client/client.go @@ -49,10 +49,6 @@ const ( // datacenters looking for the Nomad server service. datacenterQueryLimit = 9 - // consulReaperIntv is the interval at which the Consul reaper will - // run. - consulReaperIntv = 5 * time.Second - // registerRetryIntv is minimum interval on which we retry // registration. We pick a value between this and 2x this. registerRetryIntv = 15 * time.Second @@ -142,8 +138,12 @@ type Client struct { // allocUpdates stores allocations that need to be synced to the server. allocUpdates chan *structs.Allocation - // consulSyncer advertises this Nomad Agent with Consul - consulSyncer *consul.Syncer + // consulService is Nomad's custom Consul client for managing services + // and checks. + consulService ConsulServiceAPI + + // consulCatalog is the subset of Consul's Catalog API Nomad uses. + consulCatalog consul.CatalogAPI // HostStatsCollector collects host resource usage stats hostStatsCollector *stats.HostStatsCollector @@ -196,7 +196,7 @@ var ( ) // NewClient is used to create a new client from the given configuration -func NewClient(cfg *config.Config, consulSyncer *consul.Syncer, logger *log.Logger) (*Client, error) { +func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulService ConsulServiceAPI, logger *log.Logger) (*Client, error) { // Create the tls wrapper var tlsWrap tlsutil.RegionWrapper if cfg.TLSConfig.EnableRPC { @@ -210,7 +210,8 @@ func NewClient(cfg *config.Config, consulSyncer *consul.Syncer, logger *log.Logg // Create the client c := &Client{ config: cfg, - consulSyncer: consulSyncer, + consulCatalog: consulCatalog, + consulService: consulService, start: time.Now(), connPool: nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, tlsWrap), logger: logger, @@ -285,9 +286,6 @@ func NewClient(cfg *config.Config, consulSyncer *consul.Syncer, logger *log.Logg } } - // Start Consul reaper - go c.consulReaper() - // Setup the vault client for token and secret renewals if err := c.setupVaultClient(); err != nil { return nil, fmt.Errorf("failed to setup vault client: %v", err) @@ -606,7 +604,7 @@ func (c *Client) restoreState() error { id := entry.Name() alloc := &structs.Allocation{ID: id} c.configLock.RLock() - ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc, c.vaultClient) + ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc, c.vaultClient, c.consulService) c.configLock.RUnlock() c.allocLock.Lock() c.allocs[id] = ar @@ -1894,7 +1892,7 @@ func (c *Client) addAlloc(alloc *structs.Allocation, prevAllocDir *allocdir.Allo defer c.allocLock.Unlock() c.configLock.RLock() - ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc, c.vaultClient) + ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc, c.vaultClient, c.consulService) ar.SetPreviousAllocDir(prevAllocDir) c.configLock.RUnlock() go ar.Run() @@ -2047,8 +2045,7 @@ func (c *Client) consulDiscoveryImpl() error { c.heartbeatLock.Lock() defer c.heartbeatLock.Unlock() - consulCatalog := c.consulSyncer.ConsulClient().Catalog() - dcs, err := consulCatalog.Datacenters() + dcs, err := c.consulCatalog.Datacenters() if err != nil { return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err) } @@ -2084,7 +2081,7 @@ DISCOLOOP: Near: "_agent", WaitTime: consul.DefaultQueryWaitDuration, } - consulServices, _, err := consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts) + consulServices, _, err := c.consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts) if err != nil { mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err)) continue @@ -2143,54 +2140,6 @@ DISCOLOOP: } } -// consulReaper periodically reaps unmatched domains from Consul. Intended to -// be called in its own goroutine. See consulReaperIntv for interval. -func (c *Client) consulReaper() { - ticker := time.NewTicker(consulReaperIntv) - defer ticker.Stop() - lastok := true - for { - select { - case <-ticker.C: - if err := c.consulReaperImpl(); err != nil { - if lastok { - c.logger.Printf("[ERR] client.consul: error reaping services in consul: %v", err) - lastok = false - } - } else { - lastok = true - } - case <-c.shutdownCh: - return - } - } -} - -// consulReaperImpl reaps unmatched domains from Consul. -func (c *Client) consulReaperImpl() error { - const estInitialExecutorDomains = 8 - - // Create the domains to keep and add the server and client - domains := make([]consul.ServiceDomain, 2, estInitialExecutorDomains) - domains[0] = consul.ServerDomain - domains[1] = consul.ClientDomain - - for allocID, ar := range c.getAllocRunners() { - ar.taskStatusLock.RLock() - taskStates := copyTaskStates(ar.taskStates) - ar.taskStatusLock.RUnlock() - for taskName, taskState := range taskStates { - // Only keep running tasks - if taskState.State == structs.TaskStateRunning { - d := consul.NewExecutorDomain(allocID, taskName) - domains = append(domains, d) - } - } - } - - return c.consulSyncer.ReapUnmatched(domains) -} - // emitStats collects host resource usage stats periodically func (c *Client) emitStats() { // Start collecting host stats right away and then keep collecting every diff --git a/client/client_test.go b/client/client_test.go index 0e79c77e788..abf857da4fc 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -75,15 +75,11 @@ func testServer(t *testing.T, cb func(*nomad.Config)) (*nomad.Server, string) { cb(config) } - shutdownCh := make(chan struct{}) logger := log.New(config.LogOutput, "", log.LstdFlags) - consulSyncer, err := consul.NewSyncer(config.ConsulConfig, shutdownCh, logger) - if err != nil { - t.Fatalf("err: %v", err) - } + catalog := consul.NewMockCatalog(logger) // Create server - server, err := nomad.NewServer(config, consulSyncer, logger) + server, err := nomad.NewServer(config, catalog, logger) if err != nil { t.Fatalf("err: %v", err) } @@ -105,14 +101,11 @@ func testClient(t *testing.T, cb func(c *config.Config)) *Client { cb(conf) } - shutdownCh := make(chan struct{}) - consulSyncer, err := consul.NewSyncer(conf.ConsulConfig, shutdownCh, log.New(os.Stderr, "", log.LstdFlags)) - if err != nil { - t.Fatalf("err: %v", err) - } - logger := log.New(conf.LogOutput, "", log.LstdFlags) - client, err := NewClient(conf, consulSyncer, logger) + catalog := consul.NewMockCatalog(logger) + mockService := newMockConsulServiceClient() + mockService.logger = logger + client, err := NewClient(conf, catalog, mockService, logger) if err != nil { t.Fatalf("err: %v", err) } @@ -754,14 +747,11 @@ func TestClient_SaveRestoreState(t *testing.T) { } // Create a new client - shutdownCh := make(chan struct{}) logger := log.New(c1.config.LogOutput, "", log.LstdFlags) - consulSyncer, err := consul.NewSyncer(c1.config.ConsulConfig, shutdownCh, logger) - if err != nil { - t.Fatalf("err: %v", err) - } - - c2, err := NewClient(c1.config, consulSyncer, logger) + catalog := consul.NewMockCatalog(logger) + mockService := newMockConsulServiceClient() + mockService.logger = logger + c2, err := NewClient(c1.config, catalog, mockService, logger) if err != nil { t.Fatalf("err: %v", err) } diff --git a/client/consul.go b/client/consul.go new file mode 100644 index 00000000000..41da0abebfb --- /dev/null +++ b/client/consul.go @@ -0,0 +1,13 @@ +package client + +import ( + "github.com/hashicorp/nomad/command/agent/consul" + "github.com/hashicorp/nomad/nomad/structs" +) + +// ConsulServiceAPI is the interface the Nomad Client uses to register and +// remove services and checks from Consul. +type ConsulServiceAPI interface { + RegisterTask(allocID string, task *structs.Task, exec consul.ScriptExecutor) error + RemoveTask(allocID string, task *structs.Task) +} diff --git a/client/consul_test.go b/client/consul_test.go new file mode 100644 index 00000000000..f37aec3f9ad --- /dev/null +++ b/client/consul_test.go @@ -0,0 +1,56 @@ +package client + +import ( + "io/ioutil" + "log" + "os" + "sync" + "testing" + + "github.com/hashicorp/nomad/command/agent/consul" + "github.com/hashicorp/nomad/nomad/structs" +) + +// mockConsulOp represents the register/deregister operations. +type mockConsulOp struct { + allocID string + task *structs.Task + exec consul.ScriptExecutor +} + +// mockConsulServiceClient implements the ConsulServiceAPI interface to record +// and log task registration/deregistration. +type mockConsulServiceClient struct { + registers []mockConsulOp + removes []mockConsulOp + mu sync.Mutex + + logger *log.Logger +} + +func newMockConsulServiceClient() *mockConsulServiceClient { + m := mockConsulServiceClient{ + registers: make([]mockConsulOp, 0, 10), + removes: make([]mockConsulOp, 0, 10), + logger: log.New(ioutil.Discard, "", 0), + } + if testing.Verbose() { + m.logger = log.New(os.Stderr, "", log.LstdFlags) + } + return &m +} + +func (m *mockConsulServiceClient) RegisterTask(allocID string, task *structs.Task, exec consul.ScriptExecutor) error { + m.logger.Printf("[TEST] mock_consul: RegisterTask(%q, %q, %T)", allocID, task.Name, exec) + m.mu.Lock() + defer m.mu.Unlock() + m.registers = append(m.registers, mockConsulOp{allocID, task, exec}) + return nil +} + +func (m *mockConsulServiceClient) RemoveTask(allocID string, task *structs.Task) { + m.logger.Printf("[TEST] mock_consul: RemoveTask(%q, %q)", allocID, task.Name) + m.mu.Lock() + defer m.mu.Unlock() + m.removes = append(m.removes, mockConsulOp{allocID, task, nil}) +} diff --git a/client/driver/docker.go b/client/driver/docker.go index b045e3f3bba..171c1ed6df6 100644 --- a/client/driver/docker.go +++ b/client/driver/docker.go @@ -1,6 +1,7 @@ package driver import ( + "context" "encoding/json" "fmt" "log" @@ -14,6 +15,7 @@ import ( "syscall" "time" + "github.com/armon/circbuf" docker "github.com/fsouza/go-dockerclient" "github.com/docker/docker/cli/config/configfile" @@ -564,9 +566,6 @@ func (d *DockerDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle doneCh: make(chan bool), waitCh: make(chan *dstructs.WaitResult, 1), } - if err := exec.SyncServices(consulContext(d.config, container.ID)); err != nil { - d.logger.Printf("[ERR] driver.docker: error registering services with consul for task: %q: %v", task.Name, err) - } go h.collectStats() go h.run() return h, nil @@ -1227,10 +1226,6 @@ func (d *DockerDriver) Open(ctx *ExecContext, handleID string) (DriverHandle, er doneCh: make(chan bool), waitCh: make(chan *dstructs.WaitResult, 1), } - if err := exec.SyncServices(consulContext(d.config, pid.ContainerID)); err != nil { - h.logger.Printf("[ERR] driver.docker: error registering services with consul: %v", err) - } - go h.collectStats() go h.run() return h, nil @@ -1273,6 +1268,42 @@ func (h *DockerHandle) Update(task *structs.Task) error { return nil } +func (h *DockerHandle) Exec(ctx context.Context, cmd string, args []string) ([]byte, int, error) { + fullCmd := make([]string, len(args)+1) + fullCmd[0] = cmd + copy(fullCmd[1:], args) + createExecOpts := docker.CreateExecOptions{ + AttachStdin: false, + AttachStdout: true, + AttachStderr: true, + Tty: false, + Cmd: fullCmd, + Container: h.containerID, + Context: ctx, + } + exec, err := h.client.CreateExec(createExecOpts) + if err != nil { + return nil, 0, err + } + + output, _ := circbuf.NewBuffer(int64(dstructs.CheckBufSize)) + startOpts := docker.StartExecOptions{ + Detach: false, + Tty: false, + OutputStream: output, + ErrorStream: output, + Context: ctx, + } + if err := client.StartExec(exec.ID, startOpts); err != nil { + return nil, 0, err + } + res, err := client.InspectExec(exec.ID) + if err != nil { + return output.Bytes(), 0, err + } + return output.Bytes(), res.ExitCode, nil +} + func (h *DockerHandle) Signal(s os.Signal) error { // Convert types sysSig, ok := s.(syscall.Signal) @@ -1332,11 +1363,6 @@ func (h *DockerHandle) run() { close(h.doneCh) - // Remove services - if err := h.executor.DeregisterServices(); err != nil { - h.logger.Printf("[ERR] driver.docker: error deregistering services: %v", err) - } - // Shutdown the syslog collector if err := h.executor.Exit(); err != nil { h.logger.Printf("[ERR] driver.docker: failed to kill the syslog collector: %v", err) diff --git a/client/driver/driver_test.go b/client/driver/driver_test.go index 8cd44b331ae..b186efac8c1 100644 --- a/client/driver/driver_test.go +++ b/client/driver/driver_test.go @@ -13,6 +13,7 @@ import ( "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/driver/env" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper/testtask" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -421,3 +422,15 @@ func TestCreatedResources_CopyRemove(t *testing.T) { t.Fatalf("res1 should not equal res2: #%v", res1) } } + +// TestHandleExec statically asserts the drivers we expect to implement the +// consul.Executor interface do. +func TestHandleScriptExecutor(t *testing.T) { + _ = []consul.ScriptExecutor{ + &DockerHandle{}, + &execHandle{}, + &javaHandle{}, + &rawExecHandle{}, + &rktHandle{}, + } +} diff --git a/client/driver/exec.go b/client/driver/exec.go index d94c82443ff..7da657e1ed3 100644 --- a/client/driver/exec.go +++ b/client/driver/exec.go @@ -1,6 +1,7 @@ package driver import ( + "context" "encoding/json" "fmt" "log" @@ -163,9 +164,7 @@ func (d *ExecDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, version: d.config.Version, doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), - } - if err := exec.SyncServices(consulContext(d.config, "")); err != nil { - d.logger.Printf("[ERR] driver.exec: error registering services with consul for task: %q: %v", task.Name, err) + taskDir: ctx.TaskDir, } go h.run() return h, nil @@ -222,9 +221,7 @@ func (d *ExecDriver) Open(ctx *ExecContext, handleID string) (DriverHandle, erro maxKillTimeout: id.MaxKillTimeout, doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), - } - if err := exec.SyncServices(consulContext(d.config, "")); err != nil { - d.logger.Printf("[ERR] driver.exec: error registering services with consul: %v", err) + taskDir: ctx.TaskDir, } go h.run() return h, nil @@ -260,6 +257,10 @@ func (h *execHandle) Update(task *structs.Task) error { return nil } +func (h *execHandle) Exec(ctx context.Context, cmd string, args []string) ([]byte, int, error) { + return execChroot(ctx, h.taskDir.Dir, cmd, args) +} + func (h *execHandle) Signal(s os.Signal) error { return h.executor.Signal(s) } @@ -307,11 +308,6 @@ func (h *execHandle) run() { } } - // Remove services - if err := h.executor.DeregisterServices(); err != nil { - h.logger.Printf("[ERR] driver.exec: failed to deregister services: %v", err) - } - // Exit the executor if err := h.executor.Exit(); err != nil { h.logger.Printf("[ERR] driver.exec: error destroying executor: %v", err) diff --git a/client/driver/exec_test.go b/client/driver/exec_test.go index 85976df1aa7..2f35efa8dd6 100644 --- a/client/driver/exec_test.go +++ b/client/driver/exec_test.go @@ -1,6 +1,8 @@ package driver import ( + "bytes" + "context" "fmt" "io/ioutil" "path/filepath" @@ -11,6 +13,7 @@ import ( "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/driver/env" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" @@ -280,3 +283,64 @@ func TestExecDriverUser(t *testing.T) { t.Fatalf("Expecting '%v' in '%v'", msg, err) } } + +// TestExecDriver_HandlerExec ensures the exec driver's handle properly executes commands inside the chroot. +func TestExecDriver_HandlerExec(t *testing.T) { + ctestutils.ExecCompatible(t) + task := &structs.Task{ + Name: "sleep", + Driver: "exec", + Config: map[string]interface{}{ + "command": "/bin/sleep", + "args": []string{"9000"}, + }, + LogConfig: &structs.LogConfig{ + MaxFiles: 10, + MaxFileSizeMB: 10, + }, + Resources: basicResources, + } + + ctx := testDriverContexts(t, task) + defer ctx.AllocDir.Destroy() + d := NewExecDriver(ctx.DriverCtx) + + if _, err := d.Prestart(ctx.ExecCtx, task); err != nil { + t.Fatalf("prestart err: %v", err) + } + handle, err := d.Start(ctx.ExecCtx, task) + if err != nil { + t.Fatalf("err: %v", err) + } + if handle == nil { + t.Fatalf("missing handle") + } + + // Exec a command that should work + out, code, err := handle.(consul.ScriptExecutor).Exec(context.TODO(), "/usr/bin/stat", []string{"/alloc"}) + if err != nil { + t.Fatalf("error exec'ing stat: %v", err) + } + if code != 0 { + t.Fatalf("expected `stat /alloc` to succeed but exit code was: %d", code) + } + if expected := 100; len(out) < expected { + t.Fatalf("expected at least %d bytes of output but found %d:\n%s", expected, len(out), out) + } + + // Exec a command that should fail + out, code, err = handle.(consul.ScriptExecutor).Exec(context.TODO(), "/usr/bin/stat", []string{"lkjhdsaflkjshowaisxmcvnlia"}) + if err != nil { + t.Fatalf("error exec'ing stat: %v", err) + } + if code == 0 { + t.Fatalf("expected `stat` to fail but exit code was: %d", code) + } + if expected := "No such file or directory"; !bytes.Contains(out, []byte(expected)) { + t.Fatalf("expected output to contain %q but found: %q", expected, out) + } + + if err := handle.Kill(); err != nil { + t.Fatalf("error killing exec handle: %v", err) + } +} diff --git a/client/driver/executor/checks.go b/client/driver/executor/checks.go deleted file mode 100644 index de93146ccdf..00000000000 --- a/client/driver/executor/checks.go +++ /dev/null @@ -1,205 +0,0 @@ -package executor - -import ( - "fmt" - "log" - "os/exec" - "sync" - "syscall" - "time" - - "github.com/armon/circbuf" - docker "github.com/fsouza/go-dockerclient" - cstructs "github.com/hashicorp/nomad/client/driver/structs" -) - -var ( - // We store the client globally to cache the connection to the docker daemon. - createClient sync.Once - client *docker.Client -) - -const ( - // The default check timeout - defaultCheckTimeout = 30 * time.Second -) - -// DockerScriptCheck runs nagios compatible scripts in a docker container and -// provides the check result -type DockerScriptCheck struct { - id string // id of the check - interval time.Duration // interval of the check - timeout time.Duration // timeout of the check - containerID string // container id in which the check will be invoked - logger *log.Logger - cmd string // check command - args []string // check command arguments - - dockerEndpoint string // docker endpoint - tlsCert string // path to tls certificate - tlsCa string // path to tls ca - tlsKey string // path to tls key -} - -// dockerClient creates the client to interact with the docker daemon -func (d *DockerScriptCheck) dockerClient() (*docker.Client, error) { - if client != nil { - return client, nil - } - - var err error - createClient.Do(func() { - if d.dockerEndpoint != "" { - if d.tlsCert+d.tlsKey+d.tlsCa != "" { - d.logger.Printf("[DEBUG] executor.checks: using TLS client connection to %s", d.dockerEndpoint) - client, err = docker.NewTLSClient(d.dockerEndpoint, d.tlsCert, d.tlsKey, d.tlsCa) - } else { - d.logger.Printf("[DEBUG] executor.checks: using standard client connection to %s", d.dockerEndpoint) - client, err = docker.NewClient(d.dockerEndpoint) - } - return - } - - d.logger.Println("[DEBUG] executor.checks: using client connection initialized from environment") - client, err = docker.NewClientFromEnv() - }) - return client, err -} - -// Run runs a script check inside a docker container -func (d *DockerScriptCheck) Run() *cstructs.CheckResult { - var ( - exec *docker.Exec - err error - execRes *docker.ExecInspect - time = time.Now() - ) - - if client, err = d.dockerClient(); err != nil { - return &cstructs.CheckResult{Err: err} - } - execOpts := docker.CreateExecOptions{ - AttachStdin: false, - AttachStdout: true, - AttachStderr: true, - Tty: false, - Cmd: append([]string{d.cmd}, d.args...), - Container: d.containerID, - } - if exec, err = client.CreateExec(execOpts); err != nil { - return &cstructs.CheckResult{Err: err} - } - - output, _ := circbuf.NewBuffer(int64(cstructs.CheckBufSize)) - startOpts := docker.StartExecOptions{ - Detach: false, - Tty: false, - OutputStream: output, - ErrorStream: output, - } - - if err = client.StartExec(exec.ID, startOpts); err != nil { - return &cstructs.CheckResult{Err: err} - } - if execRes, err = client.InspectExec(exec.ID); err != nil { - return &cstructs.CheckResult{Err: err} - } - return &cstructs.CheckResult{ - ExitCode: execRes.ExitCode, - Output: string(output.Bytes()), - Timestamp: time, - } -} - -// ID returns the check id -func (d *DockerScriptCheck) ID() string { - return d.id -} - -// Interval returns the interval at which the check has to run -func (d *DockerScriptCheck) Interval() time.Duration { - return d.interval -} - -// Timeout returns the duration after which a check is timed out. -func (d *DockerScriptCheck) Timeout() time.Duration { - if d.timeout == 0 { - return defaultCheckTimeout - } - return d.timeout -} - -// ExecScriptCheck runs a nagios compatible script and returns the check result -type ExecScriptCheck struct { - id string // id of the script check - interval time.Duration // interval at which the check is invoked - timeout time.Duration // timeout duration of the check - cmd string // command of the check - args []string // args passed to the check - taskDir string // the root directory of the check - - FSIsolation bool // indicates whether the check has to be run within a chroot -} - -// Run runs an exec script check -func (e *ExecScriptCheck) Run() *cstructs.CheckResult { - buf, _ := circbuf.NewBuffer(int64(cstructs.CheckBufSize)) - cmd := exec.Command(e.cmd, e.args...) - cmd.Stdout = buf - cmd.Stderr = buf - e.setChroot(cmd) - ts := time.Now() - if err := cmd.Start(); err != nil { - return &cstructs.CheckResult{Err: err} - } - errCh := make(chan error, 2) - go func() { - errCh <- cmd.Wait() - }() - - select { - case err := <-errCh: - endTime := time.Now() - if err == nil { - return &cstructs.CheckResult{ - ExitCode: 0, - Output: string(buf.Bytes()), - Timestamp: ts, - } - } - exitCode := 1 - if exitErr, ok := err.(*exec.ExitError); ok { - if status, ok := exitErr.Sys().(syscall.WaitStatus); ok { - exitCode = status.ExitStatus() - } - } - return &cstructs.CheckResult{ - ExitCode: exitCode, - Output: string(buf.Bytes()), - Timestamp: ts, - Duration: endTime.Sub(ts), - } - case <-time.After(e.Timeout()): - errCh <- fmt.Errorf("timed out after waiting 30s") - } - - return nil -} - -// ID returns the check id -func (e *ExecScriptCheck) ID() string { - return e.id -} - -// Interval returns the interval at which the check has to run -func (e *ExecScriptCheck) Interval() time.Duration { - return e.interval -} - -// Timeout returns the duration after which a check is timed out. -func (e *ExecScriptCheck) Timeout() time.Duration { - if e.timeout == 0 { - return defaultCheckTimeout - } - return e.timeout -} diff --git a/client/driver/executor/checks_linux_test.go b/client/driver/executor/checks_linux_test.go deleted file mode 100644 index 3affd0e08f3..00000000000 --- a/client/driver/executor/checks_linux_test.go +++ /dev/null @@ -1,56 +0,0 @@ -package executor - -import ( - "log" - "os" - "strings" - "testing" - - dstructs "github.com/hashicorp/nomad/client/driver/structs" - "github.com/hashicorp/nomad/client/testutil" -) - -func TestExecScriptCheckWithIsolation(t *testing.T) { - testutil.ExecCompatible(t) - - execCmd := ExecCommand{Cmd: "/bin/echo", Args: []string{"hello world"}} - ctx, allocDir := testExecutorContextWithChroot(t) - defer allocDir.Destroy() - - execCmd.FSIsolation = true - execCmd.ResourceLimits = true - execCmd.User = dstructs.DefaultUnpriviledgedUser - - executor := NewExecutor(log.New(os.Stdout, "", log.LstdFlags)) - - if err := executor.SetContext(ctx); err != nil { - t.Fatalf("Unexpected error") - } - - _, err := executor.LaunchCmd(&execCmd) - if err != nil { - t.Fatalf("error in launching command: %v", err) - } - - check := &ExecScriptCheck{ - id: "foo", - cmd: "/bin/echo", - args: []string{"hello", "world"}, - taskDir: ctx.TaskDir, - FSIsolation: true, - } - - res := check.Run() - expectedOutput := "hello world" - expectedExitCode := 0 - if res.Err != nil { - t.Fatalf("err: %v", res.Err) - } - if strings.TrimSpace(res.Output) != expectedOutput { - t.Fatalf("output expected: %v, actual: %v", expectedOutput, res.Output) - } - - if res.ExitCode != expectedExitCode { - t.Fatalf("exitcode expected: %v, actual: %v", expectedExitCode, res.ExitCode) - } -} diff --git a/client/driver/executor/checks_test.go b/client/driver/executor/checks_test.go deleted file mode 100644 index 9533026fd71..00000000000 --- a/client/driver/executor/checks_test.go +++ /dev/null @@ -1,96 +0,0 @@ -package executor - -import ( - "log" - "os" - "strings" - "testing" - "time" - - docker "github.com/fsouza/go-dockerclient" - - "github.com/hashicorp/nomad/client/testutil" -) - -func TestExecScriptCheckNoIsolation(t *testing.T) { - check := &ExecScriptCheck{ - id: "foo", - cmd: "/bin/echo", - args: []string{"hello", "world"}, - taskDir: "/tmp", - FSIsolation: false, - } - - res := check.Run() - expectedOutput := "hello world" - expectedExitCode := 0 - if res.Err != nil { - t.Fatalf("err: %v", res.Err) - } - if strings.TrimSpace(res.Output) != expectedOutput { - t.Fatalf("output expected: %v, actual: %v", expectedOutput, res.Output) - } - - if res.ExitCode != expectedExitCode { - t.Fatalf("exitcode expected: %v, actual: %v", expectedExitCode, res.ExitCode) - } -} - -func TestDockerScriptCheck(t *testing.T) { - if !testutil.DockerIsConnected(t) { - return - } - client, err := docker.NewClientFromEnv() - if err != nil { - t.Fatalf("error creating docker client: %v", err) - } - - if err := client.PullImage(docker.PullImageOptions{Repository: "busybox", Tag: "latest"}, - docker.AuthConfiguration{}); err != nil { - t.Fatalf("error pulling redis: %v", err) - } - - container, err := client.CreateContainer(docker.CreateContainerOptions{ - Config: &docker.Config{ - Image: "busybox", - Cmd: []string{"/bin/sleep", "1000"}, - }, - }) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer removeContainer(client, container.ID) - - if err := client.StartContainer(container.ID, container.HostConfig); err != nil { - t.Fatalf("error starting container: %v", err) - } - - check := &DockerScriptCheck{ - id: "1", - interval: 5 * time.Second, - containerID: container.ID, - logger: log.New(os.Stdout, "", log.LstdFlags), - cmd: "/bin/echo", - args: []string{"hello", "world"}, - } - - res := check.Run() - expectedOutput := "hello world" - expectedExitCode := 0 - if res.Err != nil { - t.Fatalf("err: %v", res.Err) - } - if strings.TrimSpace(res.Output) != expectedOutput { - t.Fatalf("output expected: %v, actual: %v", expectedOutput, res.Output) - } - - if res.ExitCode != expectedExitCode { - t.Fatalf("exitcode expected: %v, actual: %v", expectedExitCode, res.ExitCode) - } -} - -// removeContainer kills and removes a container -func removeContainer(client *docker.Client, containerID string) { - client.KillContainer(docker.KillContainerOptions{ID: containerID}) - client.RemoveContainer(docker.RemoveContainerOptions{ID: containerID, RemoveVolumes: true, Force: true}) -} diff --git a/client/driver/executor/checks_unix.go b/client/driver/executor/checks_unix.go deleted file mode 100644 index b18812dd8d0..00000000000 --- a/client/driver/executor/checks_unix.go +++ /dev/null @@ -1,18 +0,0 @@ -// +build darwin dragonfly freebsd linux netbsd openbsd solaris - -package executor - -import ( - "os/exec" - "syscall" -) - -func (e *ExecScriptCheck) setChroot(cmd *exec.Cmd) { - if e.FSIsolation { - if cmd.SysProcAttr == nil { - cmd.SysProcAttr = &syscall.SysProcAttr{} - } - cmd.SysProcAttr.Chroot = e.taskDir - } - cmd.Dir = "/" -} diff --git a/client/driver/executor/checks_windows.go b/client/driver/executor/checks_windows.go deleted file mode 100644 index a35c2722d60..00000000000 --- a/client/driver/executor/checks_windows.go +++ /dev/null @@ -1,8 +0,0 @@ -// +build windows - -package executor - -import "os/exec" - -func (e *ExecScriptCheck) setChroot(cmd *exec.Cmd) { -} diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 53b9c37f581..90797fbefa2 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -23,10 +23,8 @@ import ( "github.com/hashicorp/nomad/client/driver/env" "github.com/hashicorp/nomad/client/driver/logging" "github.com/hashicorp/nomad/client/stats" - "github.com/hashicorp/nomad/command/agent/consul" shelpers "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/nomad/structs" - "github.com/hashicorp/nomad/nomad/structs/config" dstructs "github.com/hashicorp/nomad/client/driver/structs" cstructs "github.com/hashicorp/nomad/client/structs" @@ -56,38 +54,11 @@ type Executor interface { Exit() error UpdateLogConfig(logConfig *structs.LogConfig) error UpdateTask(task *structs.Task) error - SyncServices(ctx *ConsulContext) error - DeregisterServices() error Version() (*ExecutorVersion, error) Stats() (*cstructs.TaskResourceUsage, error) Signal(s os.Signal) error } -// ConsulContext holds context to configure the Consul client and run checks -type ConsulContext struct { - // ConsulConfig contains the configuration information for talking - // with this Nomad Agent's Consul Agent. - ConsulConfig *config.ConsulConfig - - // ContainerID is the ID of the container - ContainerID string - - // TLSCert is the cert which docker client uses while interactng with the docker - // daemon over TLS - TLSCert string - - // TLSCa is the CA which the docker client uses while interacting with the docker - // daeemon over TLS - TLSCa string - - // TLSKey is the TLS key which the docker client uses while interacting with - // the docker daemon - TLSKey string - - // DockerEndpoint is the endpoint of the docker daemon - DockerEndpoint string -} - // ExecutorContext holds context to configure the command user // wants to run and isolate it type ExecutorContext struct { @@ -196,8 +167,6 @@ type UniversalExecutor struct { resConCtx resourceContainerContext - consulSyncer *consul.Syncer - consulCtx *ConsulContext totalCpuStats *stats.CpuStats userCpuStats *stats.CpuStats systemCpuStats *stats.CpuStats @@ -224,7 +193,7 @@ func NewExecutor(logger *log.Logger) Executor { // Version returns the api version of the executor func (e *UniversalExecutor) Version() (*ExecutorVersion, error) { - return &ExecutorVersion{Version: "1.0.0"}, nil + return &ExecutorVersion{Version: "1.1.0"}, nil } // SetContext is used to set the executors context and should be the first call @@ -377,28 +346,9 @@ func (e *UniversalExecutor) UpdateTask(task *structs.Task) error { e.lre.FileSize = fileSize } e.rotatorLock.Unlock() - - // Re-syncing task with Consul agent - if e.consulSyncer != nil { - e.interpolateServices(e.ctx.Task) - domain := consul.NewExecutorDomain(e.ctx.AllocID, task.Name) - serviceMap := generateServiceKeys(e.ctx.AllocID, task.Services) - e.consulSyncer.SetServices(domain, serviceMap) - } return nil } -// generateServiceKeys takes a list of interpolated Nomad Services and returns a map -// of ServiceKeys to Nomad Services. -func generateServiceKeys(allocID string, services []*structs.Service) map[consul.ServiceKey]*structs.Service { - keys := make(map[consul.ServiceKey]*structs.Service, len(services)) - for _, service := range services { - key := consul.GenerateServiceKey(service) - keys[key] = service - } - return keys -} - func (e *UniversalExecutor) wait() { defer close(e.processExited) err := e.cmd.Wait() @@ -464,10 +414,6 @@ func (e *UniversalExecutor) Exit() error { e.lro.Close() } - if e.consulSyncer != nil { - e.consulSyncer.Shutdown() - } - // If the executor did not launch a process, return. if e.command == nil { return nil @@ -514,38 +460,6 @@ func (e *UniversalExecutor) ShutDown() error { return nil } -// SyncServices syncs the services of the task that the executor is running with -// Consul -func (e *UniversalExecutor) SyncServices(ctx *ConsulContext) error { - e.logger.Printf("[INFO] executor: registering services") - e.consulCtx = ctx - if e.consulSyncer == nil { - cs, err := consul.NewSyncer(ctx.ConsulConfig, e.shutdownCh, e.logger) - if err != nil { - return err - } - e.consulSyncer = cs - go e.consulSyncer.Run() - } - e.interpolateServices(e.ctx.Task) - e.consulSyncer.SetDelegatedChecks(e.createCheckMap(), e.createCheck) - e.consulSyncer.SetAddrFinder(e.ctx.Task.FindHostAndPortFor) - domain := consul.NewExecutorDomain(e.ctx.AllocID, e.ctx.Task.Name) - serviceMap := generateServiceKeys(e.ctx.AllocID, e.ctx.Task.Services) - e.consulSyncer.SetServices(domain, serviceMap) - return nil -} - -// DeregisterServices removes the services of the task that the executor is -// running from Consul -func (e *UniversalExecutor) DeregisterServices() error { - e.logger.Printf("[INFO] executor: de-registering services and shutting down consul service") - if e.consulSyncer != nil { - return e.consulSyncer.Shutdown() - } - return nil -} - // pidStats returns the resource usage stats per pid func (e *UniversalExecutor) pidStats() (map[string]*cstructs.ResourceUsage, error) { stats := make(map[string]*cstructs.ResourceUsage) @@ -677,66 +591,6 @@ func (e *UniversalExecutor) listenerUnix() (net.Listener, error) { return net.Listen("unix", path) } -// createCheckMap creates a map of checks that the executor will handle on it's -// own -func (e *UniversalExecutor) createCheckMap() map[string]struct{} { - checks := map[string]struct{}{ - "script": struct{}{}, - } - return checks -} - -// createCheck creates NomadCheck from a ServiceCheck -func (e *UniversalExecutor) createCheck(check *structs.ServiceCheck, checkID string) (consul.Check, error) { - if check.Type == structs.ServiceCheckScript && e.ctx.Driver == "docker" { - return &DockerScriptCheck{ - id: checkID, - interval: check.Interval, - timeout: check.Timeout, - containerID: e.consulCtx.ContainerID, - logger: e.logger, - cmd: check.Command, - args: check.Args, - }, nil - } - - if check.Type == structs.ServiceCheckScript && (e.ctx.Driver == "exec" || - e.ctx.Driver == "raw_exec" || e.ctx.Driver == "java") { - return &ExecScriptCheck{ - id: checkID, - interval: check.Interval, - timeout: check.Timeout, - cmd: check.Command, - args: check.Args, - taskDir: e.ctx.TaskDir, - FSIsolation: e.command.FSIsolation, - }, nil - - } - return nil, fmt.Errorf("couldn't create check for %v", check.Name) -} - -// interpolateServices interpolates tags in a service and checks with values from the -// task's environment. -func (e *UniversalExecutor) interpolateServices(task *structs.Task) { - e.ctx.TaskEnv.Build() - for _, service := range task.Services { - for _, check := range service.Checks { - check.Name = e.ctx.TaskEnv.ReplaceEnv(check.Name) - check.Type = e.ctx.TaskEnv.ReplaceEnv(check.Type) - check.Command = e.ctx.TaskEnv.ReplaceEnv(check.Command) - check.Args = e.ctx.TaskEnv.ParseAndReplace(check.Args) - check.Path = e.ctx.TaskEnv.ReplaceEnv(check.Path) - check.Protocol = e.ctx.TaskEnv.ReplaceEnv(check.Protocol) - check.PortLabel = e.ctx.TaskEnv.ReplaceEnv(check.PortLabel) - check.InitialStatus = e.ctx.TaskEnv.ReplaceEnv(check.InitialStatus) - } - service.Name = e.ctx.TaskEnv.ReplaceEnv(service.Name) - service.PortLabel = e.ctx.TaskEnv.ReplaceEnv(service.PortLabel) - service.Tags = e.ctx.TaskEnv.ParseAndReplace(service.Tags) - } -} - // collectPids collects the pids of the child processes that the executor is // running every 5 seconds func (e *UniversalExecutor) collectPids() { diff --git a/client/driver/executor/executor_test.go b/client/driver/executor/executor_test.go index 8508233d6c8..51325e09a9e 100644 --- a/client/driver/executor/executor_test.go +++ b/client/driver/executor/executor_test.go @@ -5,7 +5,6 @@ import ( "log" "os" "path/filepath" - "reflect" "strings" "syscall" "testing" @@ -259,31 +258,6 @@ func TestExecutor_MakeExecutable(t *testing.T) { } } -func TestExecutorInterpolateServices(t *testing.T) { - task := mock.Job().TaskGroups[0].Tasks[0] - // Make a fake exececutor - ctx, allocDir := testExecutorContext(t) - defer allocDir.Destroy() - executor := NewExecutor(log.New(os.Stdout, "", log.LstdFlags)) - - executor.(*UniversalExecutor).ctx = ctx - executor.(*UniversalExecutor).interpolateServices(task) - expectedTags := []string{"pci:true", "datacenter:dc1"} - if !reflect.DeepEqual(task.Services[0].Tags, expectedTags) { - t.Fatalf("expected: %v, actual: %v", expectedTags, task.Services[0].Tags) - } - - expectedCheckCmd := "/usr/local/check-table-mysql" - expectedCheckArgs := []string{"5.6"} - if !reflect.DeepEqual(task.Services[0].Checks[0].Command, expectedCheckCmd) { - t.Fatalf("expected: %v, actual: %v", expectedCheckCmd, task.Services[0].Checks[0].Command) - } - - if !reflect.DeepEqual(task.Services[0].Checks[0].Args, expectedCheckArgs) { - t.Fatalf("expected: %v, actual: %v", expectedCheckArgs, task.Services[0].Checks[0].Args) - } -} - func TestScanPids(t *testing.T) { p1 := NewFakeProcess(2, 5) p2 := NewFakeProcess(10, 2) diff --git a/client/driver/executor_plugin.go b/client/driver/executor_plugin.go index 17f40ac25d4..7c4074feffc 100644 --- a/client/driver/executor_plugin.go +++ b/client/driver/executor_plugin.go @@ -33,11 +33,6 @@ type LaunchCmdArgs struct { Cmd *executor.ExecCommand } -// SyncServicesArgs wraps the consul context for the purposes of RPC -type SyncServicesArgs struct { - Ctx *executor.ConsulContext -} - func (e *ExecutorRPC) LaunchCmd(cmd *executor.ExecCommand) (*executor.ProcessState, error) { var ps *executor.ProcessState err := e.client.Call("Plugin.LaunchCmd", LaunchCmdArgs{Cmd: cmd}, &ps) @@ -76,10 +71,6 @@ func (e *ExecutorRPC) UpdateTask(task *structs.Task) error { return e.client.Call("Plugin.UpdateTask", task, new(interface{})) } -func (e *ExecutorRPC) SyncServices(ctx *executor.ConsulContext) error { - return e.client.Call("Plugin.SyncServices", SyncServicesArgs{Ctx: ctx}, new(interface{})) -} - func (e *ExecutorRPC) DeregisterServices() error { return e.client.Call("Plugin.DeregisterServices", new(interface{}), new(interface{})) } @@ -149,12 +140,9 @@ func (e *ExecutorRPCServer) UpdateTask(args *structs.Task, resp *interface{}) er return e.Impl.UpdateTask(args) } -func (e *ExecutorRPCServer) SyncServices(args SyncServicesArgs, resp *interface{}) error { - return e.Impl.SyncServices(args.Ctx) -} - func (e *ExecutorRPCServer) DeregisterServices(args interface{}, resp *interface{}) error { - return e.Impl.DeregisterServices() + // In 0.6 this is a noop. Goes away in 0.7. + return nil } func (e *ExecutorRPCServer) Version(args interface{}, version *executor.ExecutorVersion) error { diff --git a/client/driver/java.go b/client/driver/java.go index 4a90b4efad6..c684e85bf56 100644 --- a/client/driver/java.go +++ b/client/driver/java.go @@ -2,6 +2,7 @@ package driver import ( "bytes" + "context" "encoding/json" "fmt" "log" @@ -59,6 +60,7 @@ type javaHandle struct { userPid int executor executor.Executor isolationConfig *dstructs.IsolationConfig + taskDir string killTimeout time.Duration maxKillTimeout time.Duration @@ -284,6 +286,7 @@ func (d *JavaDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, executor: execIntf, userPid: ps.Pid, isolationConfig: ps.IsolationConfig, + taskDir: ctx.TaskDir.Dir, killTimeout: GetKillTimeout(task.KillTimeout, maxKill), maxKillTimeout: maxKill, version: d.config.Version, @@ -291,9 +294,6 @@ func (d *JavaDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), } - if err := h.executor.SyncServices(consulContext(d.config, "")); err != nil { - d.logger.Printf("[ERR] driver.java: error registering services with consul for task: %q: %v", task.Name, err) - } go h.run() return h, nil } @@ -306,6 +306,7 @@ type javaId struct { MaxKillTimeout time.Duration PluginConfig *PluginReattachConfig IsolationConfig *dstructs.IsolationConfig + TaskDir string UserPid int } @@ -352,10 +353,6 @@ func (d *JavaDriver) Open(ctx *ExecContext, handleID string) (DriverHandle, erro doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), } - if err := h.executor.SyncServices(consulContext(d.config, "")); err != nil { - d.logger.Printf("[ERR] driver.java: error registering services with consul: %v", err) - } - go h.run() return h, nil } @@ -368,6 +365,7 @@ func (h *javaHandle) ID() string { PluginConfig: NewPluginReattachConfig(h.pluginClient.ReattachConfig()), UserPid: h.userPid, IsolationConfig: h.isolationConfig, + TaskDir: h.taskDir, } data, err := json.Marshal(id) @@ -390,6 +388,10 @@ func (h *javaHandle) Update(task *structs.Task) error { return nil } +func (h *javaHandle) Exec(ctx context.Context, cmd string, args []string) ([]byte, int, error) { + return execChroot(ctx, h.taskDir, cmd, args) +} + func (h *javaHandle) Signal(s os.Signal) error { return h.executor.Signal(s) } @@ -436,11 +438,6 @@ func (h *javaHandle) run() { } } - // Remove services - if err := h.executor.DeregisterServices(); err != nil { - h.logger.Printf("[ERR] driver.java: failed to kill the deregister services: %v", err) - } - // Exit the executor h.executor.Exit() h.pluginClient.Kill() diff --git a/client/driver/mock_driver.go b/client/driver/mock_driver.go index 8f34297e513..518e79cea17 100644 --- a/client/driver/mock_driver.go +++ b/client/driver/mock_driver.go @@ -3,6 +3,7 @@ package driver import ( + "context" "encoding/json" "errors" "fmt" @@ -234,6 +235,11 @@ func (h *mockDriverHandle) WaitCh() chan *dstructs.WaitResult { return h.waitCh } +func (h *mockDriverHandle) Exec(ctx context.Context, cmd string, args []string) ([]byte, int, error) { + h.logger.Printf("[DEBUG] driver.mock: Exec(%q, %q)", cmd, args) + return []byte(fmt.Sprintf("Exec(%q, %q)", cmd, args)), 0, nil +} + // TODO Implement when we need it. func (h *mockDriverHandle) Update(task *structs.Task) error { return nil diff --git a/client/driver/qemu.go b/client/driver/qemu.go index ad90c80383c..4e04f95229b 100644 --- a/client/driver/qemu.go +++ b/client/driver/qemu.go @@ -273,10 +273,6 @@ func (d *QemuDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), } - - if err := h.executor.SyncServices(consulContext(d.config, "")); err != nil { - h.logger.Printf("[ERR] driver.qemu: error registering services for task: %q: %v", task.Name, err) - } go h.run() return h, nil } @@ -322,9 +318,6 @@ func (d *QemuDriver) Open(ctx *ExecContext, handleID string) (DriverHandle, erro doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), } - if err := h.executor.SyncServices(consulContext(d.config, "")); err != nil { - h.logger.Printf("[ERR] driver.qemu: error registering services: %v", err) - } go h.run() return h, nil } @@ -402,11 +395,6 @@ func (h *qemuHandle) run() { } close(h.doneCh) - // Remove services - if err := h.executor.DeregisterServices(); err != nil { - h.logger.Printf("[ERR] driver.qemu: failed to deregister services: %v", err) - } - // Exit the executor h.executor.Exit() h.pluginClient.Kill() diff --git a/client/driver/raw_exec.go b/client/driver/raw_exec.go index 1500cf00f01..e0e86c20bf6 100644 --- a/client/driver/raw_exec.go +++ b/client/driver/raw_exec.go @@ -1,6 +1,7 @@ package driver import ( + "context" "encoding/json" "fmt" "log" @@ -164,9 +165,6 @@ func (d *RawExecDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandl doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), } - if err := h.executor.SyncServices(consulContext(d.config, "")); err != nil { - h.logger.Printf("[ERR] driver.raw_exec: error registering services with consul for task: %q: %v", task.Name, err) - } go h.run() return h, nil } @@ -214,9 +212,6 @@ func (d *RawExecDriver) Open(ctx *ExecContext, handleID string) (DriverHandle, e doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), } - if err := h.executor.SyncServices(consulContext(d.config, "")); err != nil { - h.logger.Printf("[ERR] driver.raw_exec: error registering services with consul: %v", err) - } go h.run() return h, nil } @@ -250,6 +245,10 @@ func (h *rawExecHandle) Update(task *structs.Task) error { return nil } +func (h *rawExecHandle) Exec(ctx context.Context, cmd string, args []string) ([]byte, int, error) { + return execChroot(ctx, "", cmd, args) +} + func (h *rawExecHandle) Signal(s os.Signal) error { return h.executor.Signal(s) } @@ -289,10 +288,6 @@ func (h *rawExecHandle) run() { h.logger.Printf("[ERR] driver.raw_exec: error killing user process: %v", e) } } - // Remove services - if err := h.executor.DeregisterServices(); err != nil { - h.logger.Printf("[ERR] driver.raw_exec: failed to deregister services: %v", err) - } // Exit the executor if err := h.executor.Exit(); err != nil { diff --git a/client/driver/raw_exec_test.go b/client/driver/raw_exec_test.go index 7b96c2218cd..cb098f76488 100644 --- a/client/driver/raw_exec_test.go +++ b/client/driver/raw_exec_test.go @@ -1,6 +1,8 @@ package driver import ( + "bytes" + "context" "fmt" "io/ioutil" "path/filepath" @@ -11,6 +13,7 @@ import ( "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/driver/env" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper/testtask" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" @@ -298,3 +301,62 @@ func TestRawExecDriverUser(t *testing.T) { t.Fatalf("Expecting '%v' in '%v'", msg, err) } } + +func TestRawExecDriver_HandlerExec(t *testing.T) { + task := &structs.Task{ + Name: "sleep", + Driver: "raw_exec", + Config: map[string]interface{}{ + "command": testtask.Path(), + "args": []string{"sleep", "9000"}, + }, + LogConfig: &structs.LogConfig{ + MaxFiles: 10, + MaxFileSizeMB: 10, + }, + Resources: basicResources, + } + testtask.SetTaskEnv(task) + ctx := testDriverContexts(t, task) + defer ctx.AllocDir.Destroy() + d := NewRawExecDriver(ctx.DriverCtx) + + if _, err := d.Prestart(ctx.ExecCtx, task); err != nil { + t.Fatalf("prestart err: %v", err) + } + handle, err := d.Start(ctx.ExecCtx, task) + if err != nil { + t.Fatalf("err: %v", err) + } + if handle == nil { + t.Fatalf("missing handle") + } + + // Exec a command that should work + out, code, err := handle.(consul.ScriptExecutor).Exec(context.TODO(), "/usr/bin/stat", []string{"/tmp"}) + if err != nil { + t.Fatalf("error exec'ing stat: %v", err) + } + if code != 0 { + t.Fatalf("expected `stat /alloc` to succeed but exit code was: %d", code) + } + if expected := 100; len(out) < expected { + t.Fatalf("expected at least %d bytes of output but found %d:\n%s", expected, len(out), out) + } + + // Exec a command that should fail + out, code, err = handle.(consul.ScriptExecutor).Exec(context.TODO(), "/usr/bin/stat", []string{"lkjhdsaflkjshowaisxmcvnlia"}) + if err != nil { + t.Fatalf("error exec'ing stat: %v", err) + } + if code == 0 { + t.Fatalf("expected `stat` to fail but exit code was: %d", code) + } + if expected := "No such file or directory"; !bytes.Contains(out, []byte(expected)) { + t.Fatalf("expected output to contain %q but found: %q", expected, out) + } + + if err := handle.Kill(); err != nil { + t.Fatalf("error killing exec handle: %v", err) + } +} diff --git a/client/driver/rkt.go b/client/driver/rkt.go index 55e2a026c9d..d1e039dd60c 100644 --- a/client/driver/rkt.go +++ b/client/driver/rkt.go @@ -2,8 +2,10 @@ package driver import ( "bytes" + "context" "encoding/json" "fmt" + "io/ioutil" "log" "net" "os" @@ -51,6 +53,9 @@ const ( // rktCmd is the command rkt is installed as. rktCmd = "rkt" + + // rktUuidDeadline is how long to wait for the uuid file to be written + rktUuidDeadline = 5 * time.Second ) // RktDriver is a driver for running images via Rkt @@ -81,6 +86,7 @@ type RktDriverConfig struct { // rktHandle is returned from Start/Open as a handle to the PID type rktHandle struct { + uuid string pluginClient *plugin.Client executorPid int executor executor.Executor @@ -94,6 +100,7 @@ type rktHandle struct { // rktPID is a struct to map the pid running the process to the vm image on // disk type rktPID struct { + UUID string PluginConfig *PluginReattachConfig ExecutorPid int KillTimeout time.Duration @@ -229,7 +236,7 @@ func (d *RktDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, e img := driverConfig.ImageName // Build the command. - var cmdArgs []string + cmdArgs := make([]string, 0, 32) // Add debug option to rkt command. debug := driverConfig.Debug @@ -253,6 +260,11 @@ func (d *RktDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, e } cmdArgs = append(cmdArgs, "run") + // Write the UUID out to a file in the state dir so we can read it back + // in and access the pod by UUID from other commands + uuidPath := filepath.Join(ctx.TaskDir.Dir, "rkt.uuid") + cmdArgs = append(cmdArgs, fmt.Sprintf("--uuid-file-save=%s", uuidPath)) + // Convert underscores to dashes in task names for use in volume names #2358 sanitizedName := strings.Replace(task.Name, "_", "-", -1) @@ -439,9 +451,28 @@ func (d *RktDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, e return nil, err } - d.logger.Printf("[DEBUG] driver.rkt: started ACI %q with: %v", img, cmdArgs) + // Wait for UUID file to get written + uuid := "" + deadline := time.Now().Add(rktUuidDeadline) + var lastErr error + for time.Now().Before(deadline) { + if uuidBytes, err := ioutil.ReadFile(uuidPath); err != nil { + lastErr = err + } else { + uuid = string(uuidBytes) + break + } + time.Sleep(400 * time.Millisecond) + } + if uuid == "" { + d.logger.Printf("[WARN] driver.rkt: reading uuid from %q failed; unable to run script checks for task %q. Last error: %v", + uuidPath, d.taskName, lastErr) + } + + d.logger.Printf("[DEBUG] driver.rkt: started ACI %q (UUID: %s) for task %q with: %v", img, uuid, d.taskName, cmdArgs) maxKill := d.DriverContext.config.MaxKillTimeout h := &rktHandle{ + uuid: uuid, pluginClient: pluginClient, executor: execIntf, executorPid: ps.Pid, @@ -451,9 +482,6 @@ func (d *RktDriver) Start(ctx *ExecContext, task *structs.Task) (DriverHandle, e doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), } - if err := h.executor.SyncServices(consulContext(d.config, "")); err != nil { - h.logger.Printf("[ERR] driver.rkt: error registering services for task: %q: %v", task.Name, err) - } go h.run() return h, nil } @@ -484,6 +512,7 @@ func (d *RktDriver) Open(ctx *ExecContext, handleID string) (DriverHandle, error d.logger.Printf("[DEBUG] driver.rkt: version of executor: %v", ver.Version) // Return a driver handle h := &rktHandle{ + uuid: id.UUID, pluginClient: pluginClient, executorPid: id.ExecutorPid, executor: exec, @@ -493,9 +522,6 @@ func (d *RktDriver) Open(ctx *ExecContext, handleID string) (DriverHandle, error doneCh: make(chan struct{}), waitCh: make(chan *dstructs.WaitResult, 1), } - if err := h.executor.SyncServices(consulContext(d.config, "")); err != nil { - h.logger.Printf("[ERR] driver.rkt: error registering services: %v", err) - } go h.run() return h, nil } @@ -503,6 +529,7 @@ func (d *RktDriver) Open(ctx *ExecContext, handleID string) (DriverHandle, error func (h *rktHandle) ID() string { // Return a handle to the PID pid := &rktPID{ + UUID: h.uuid, PluginConfig: NewPluginReattachConfig(h.pluginClient.ReattachConfig()), KillTimeout: h.killTimeout, MaxKillTimeout: h.maxKillTimeout, @@ -528,6 +555,19 @@ func (h *rktHandle) Update(task *structs.Task) error { return nil } +func (h *rktHandle) Exec(ctx context.Context, cmd string, args []string) ([]byte, int, error) { + if h.uuid == "" { + return nil, 0, fmt.Errorf("unable to find rkt pod UUID") + } + // enter + UUID + cmd + args... + enterArgs := make([]string, 3+len(args)) + enterArgs[0] = "enter" + enterArgs[1] = h.uuid + enterArgs[2] = cmd + copy(enterArgs[3:], args) + return execChroot(ctx, "", rktCmd, enterArgs) +} + func (h *rktHandle) Signal(s os.Signal) error { return fmt.Errorf("Rkt does not support signals") } @@ -556,10 +596,6 @@ func (h *rktHandle) run() { h.logger.Printf("[ERROR] driver.rkt: error killing user process: %v", e) } } - // Remove services - if err := h.executor.DeregisterServices(); err != nil { - h.logger.Printf("[ERR] driver.rkt: failed to deregister services: %v", err) - } // Exit the executor if err := h.executor.Exit(); err != nil { diff --git a/client/driver/rkt_test.go b/client/driver/rkt_test.go index b54160becdc..34403cb1086 100644 --- a/client/driver/rkt_test.go +++ b/client/driver/rkt_test.go @@ -1,6 +1,8 @@ package driver import ( + "bytes" + "context" "fmt" "io/ioutil" "os" @@ -12,6 +14,7 @@ import ( "time" "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" @@ -489,3 +492,74 @@ func TestRktDriver_PortsMapping(t *testing.T) { t.Fatalf("timeout") } } + +func TestRktDriver_HandlerExec(t *testing.T) { + if os.Getenv("NOMAD_TEST_RKT") == "" { + t.Skip("skipping rkt tests") + } + + ctestutils.RktCompatible(t) + task := &structs.Task{ + Name: "etcd", + Driver: "rkt", + Config: map[string]interface{}{ + "trust_prefix": "coreos.com/etcd", + "image": "coreos.com/etcd:v2.0.4", + "command": "/etcd", + }, + LogConfig: &structs.LogConfig{ + MaxFiles: 10, + MaxFileSizeMB: 10, + }, + Resources: &structs.Resources{ + MemoryMB: 128, + CPU: 100, + }, + } + + ctx := testDriverContexts(t, task) + defer ctx.AllocDir.Destroy() + d := NewRktDriver(ctx.DriverCtx) + + if _, err := d.Prestart(ctx.ExecCtx, task); err != nil { + t.Fatalf("error in prestart: %v", err) + } + handle, err := d.Start(ctx.ExecCtx, task) + if err != nil { + t.Fatalf("err: %v", err) + } + if handle == nil { + t.Fatalf("missing handle") + } + + // Give the pod a second to start + time.Sleep(time.Second) + + // Exec a command that should work + out, code, err := handle.(consul.ScriptExecutor).Exec(context.TODO(), "/etcd", []string{"--version"}) + if err != nil { + t.Fatalf("error exec'ing etcd --version: %v", err) + } + if code != 0 { + t.Fatalf("expected `etcd --version` to succeed but exit code was: %d\n%s", code, string(out)) + } + if expected := []byte("etcd version "); !bytes.HasPrefix(out, expected) { + t.Fatalf("expected output to start with %q but found:\n%q", expected, out) + } + + // Exec a command that should fail + out, code, err = handle.(consul.ScriptExecutor).Exec(context.TODO(), "/etcd", []string{"--kaljdshf"}) + if err != nil { + t.Fatalf("error exec'ing bad command: %v", err) + } + if code == 0 { + t.Fatalf("expected `stat` to fail but exit code was: %d", code) + } + if expected := "flag provided but not defined"; !bytes.Contains(out, []byte(expected)) { + t.Fatalf("expected output to contain %q but found: %q", expected, out) + } + + if err := handle.Kill(); err != nil { + t.Fatalf("error killing handle: %v", err) + } +} diff --git a/client/driver/utils.go b/client/driver/utils.go index 7e1c79890bb..def668548ff 100644 --- a/client/driver/utils.go +++ b/client/driver/utils.go @@ -1,6 +1,7 @@ package driver import ( + "context" "encoding/json" "fmt" "io" @@ -8,8 +9,10 @@ import ( "os/exec" "path/filepath" "strings" + "syscall" "time" + "github.com/armon/circbuf" "github.com/hashicorp/go-multierror" "github.com/hashicorp/go-plugin" "github.com/hashicorp/nomad/client/config" @@ -85,19 +88,16 @@ func createExecutorWithConfig(config *plugin.ClientConfig, w io.Writer) (executo if err != nil { return nil, nil, fmt.Errorf("unable to dispense the executor plugin: %v", err) } - executorPlugin := raw.(executor.Executor) - return executorPlugin, executorClient, nil -} - -func consulContext(clientConfig *config.Config, containerID string) *executor.ConsulContext { - return &executor.ConsulContext{ - ConsulConfig: clientConfig.ConsulConfig, - ContainerID: containerID, - DockerEndpoint: clientConfig.Read("docker.endpoint"), - TLSCa: clientConfig.Read("docker.tls.ca"), - TLSCert: clientConfig.Read("docker.tls.cert"), - TLSKey: clientConfig.Read("docker.tls.key"), + executorPlugin, ok := raw.(*ExecutorRPC) + if !ok { + return nil, nil, fmt.Errorf("unexpected executor rpc type: %T", raw) } + // 0.6 Upgrade path: Deregister services from the executor as the Nomad + // client agent now handles all Consul interactions. + if err := executorPlugin.DeregisterServices(); err != nil { + return nil, nil, err + } + return executorPlugin, executorClient, nil } // killProcess kills a process with the given pid @@ -181,3 +181,36 @@ func getExecutorUser(task *structs.Task) string { } return task.User } + +// execChroot executes cmd with args inside chroot if set and returns the +// output, exit code, and error. If chroot is an empty string the command is +// executed on the host. +func execChroot(ctx context.Context, chroot, name string, args []string) ([]byte, int, error) { + buf, _ := circbuf.NewBuffer(int64(cstructs.CheckBufSize)) + cmd := exec.CommandContext(ctx, name, args...) + cmd.Dir = "/" + cmd.Stdout = buf + cmd.Stderr = buf + if chroot != "" { + setChroot(cmd, chroot) + } + if err := cmd.Run(); err != nil { + exitErr, ok := err.(*exec.ExitError) + if !ok { + // Non-exit error, return it and let the caller treat + // it as a critical failure + return nil, 0, err + } + + // Some kind of error happened; default to critical + exitCode := 2 + if status, ok := exitErr.Sys().(syscall.WaitStatus); ok { + exitCode = status.ExitStatus() + } + + // Don't return the exitError as the caller only needs the + // output and code. + return buf.Bytes(), exitCode, nil + } + return buf.Bytes(), 0, nil +} diff --git a/client/driver/utils_unix.go b/client/driver/utils_unix.go index 474cdcf17f1..397641e3e79 100644 --- a/client/driver/utils_unix.go +++ b/client/driver/utils_unix.go @@ -16,3 +16,11 @@ func isolateCommand(cmd *exec.Cmd) { } cmd.SysProcAttr.Setsid = true } + +// setChroot on a command +func setChroot(cmd *exec.Cmd, chroot string) { + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.SysProcAttr.Chroot = chroot +} diff --git a/client/task_runner.go b/client/task_runner.go index 55176c0e705..64eb4570f1f 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -21,6 +21,7 @@ import ( "github.com/hashicorp/nomad/client/driver" "github.com/hashicorp/nomad/client/getter" "github.com/hashicorp/nomad/client/vaultclient" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/client/driver/env" @@ -61,6 +62,7 @@ type TaskRunner struct { logger *log.Logger alloc *structs.Allocation restartTracker *RestartTracker + consul ConsulServiceAPI // running marks whether the task is running running bool @@ -173,7 +175,7 @@ type SignalEvent struct { func NewTaskRunner(logger *log.Logger, config *config.Config, updater TaskStateUpdater, taskDir *allocdir.TaskDir, alloc *structs.Allocation, task *structs.Task, - vaultClient vaultclient.VaultClient) *TaskRunner { + vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI) *TaskRunner { // Merge in the task resources task.Resources = alloc.TaskResources[task.Name] @@ -195,6 +197,7 @@ func NewTaskRunner(logger *log.Logger, config *config.Config, task: task, taskDir: taskDir, createdResources: driver.NewCreatedResources(), + consul: consulClient, vaultClient: vaultClient, vaultFuture: NewTokenFuture().Set(""), updateCh: make(chan *structs.Allocation, 64), @@ -289,6 +292,19 @@ func (r *TaskRunner) RestoreState() error { r.task.Name, r.alloc.ID, err) return nil } + + //FIXME is there a better place to do this? used to be in executor + // Prepare services + interpolateServices(r.getTaskEnv(), r.task) + + // Ensure the service is registered + scriptExec, _ := handle.(consul.ScriptExecutor) + if err := r.consul.RegisterTask(r.alloc.ID, r.task, scriptExec); err != nil { + //FIXME What to do if this fails? + r.logger.Printf("[WARN] client: failed to register services and checks for task %q alloc %q: %v", + r.task.Name, r.alloc.ID, err) + } + r.handleLock.Lock() r.handle = handle r.handleLock.Unlock() @@ -1220,9 +1236,43 @@ func (r *TaskRunner) startTask() error { r.handleLock.Lock() r.handle = handle r.handleLock.Unlock() + + //FIXME is there a better place to do this? used to be in executor + // Prepare services + interpolateServices(r.getTaskEnv(), r.task) + + // RegisterTask properly handles scriptExec being nil, so it just + // ignore the ok value. + scriptExec, _ := handle.(consul.ScriptExecutor) + if err := r.consul.RegisterTask(r.alloc.ID, r.task, scriptExec); err != nil { + //FIXME handle errors?! + //FIXME could break into prepare & submit steps as only preperation can error... + r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err) + } + return nil } +// interpolateServices interpolates tags in a service and checks with values from the +// task's environment. +func interpolateServices(taskEnv *env.TaskEnvironment, task *structs.Task) { + for _, service := range task.Services { + for _, check := range service.Checks { + check.Name = taskEnv.ReplaceEnv(check.Name) + check.Type = taskEnv.ReplaceEnv(check.Type) + check.Command = taskEnv.ReplaceEnv(check.Command) + check.Args = taskEnv.ParseAndReplace(check.Args) + check.Path = taskEnv.ReplaceEnv(check.Path) + check.Protocol = taskEnv.ReplaceEnv(check.Protocol) + check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel) + check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus) + } + service.Name = taskEnv.ReplaceEnv(service.Name) + service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel) + service.Tags = taskEnv.ParseAndReplace(service.Tags) + } +} + // buildTaskDir creates the task directory before driver.Prestart. It is safe // to call multiple times as its state is persisted. func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error { @@ -1335,13 +1385,16 @@ func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { // Merge in the task resources updatedTask.Resources = update.TaskResources[updatedTask.Name] - // Update will update resources and store the new kill timeout. var mErr multierror.Error + var scriptExec consul.ScriptExecutor r.handleLock.Lock() if r.handle != nil { + // Update will update resources and store the new kill timeout. if err := r.handle.Update(updatedTask); err != nil { mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err)) } + // Not all drivers support Exec (eg QEMU) + scriptExec, _ = r.handle.(consul.ScriptExecutor) } r.handleLock.Unlock() @@ -1350,9 +1403,21 @@ func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { r.restartTracker.SetPolicy(tg.RestartPolicy) } + // Deregister the old service+checks + r.consul.RemoveTask(r.alloc.ID, r.task) + // Store the updated alloc. r.alloc = update r.task = updatedTask + + //FIXME is there a better place to do this? used to be in executor + // Prepare services + interpolateServices(r.getTaskEnv(), r.task) + + // Register the new service+checks + if err := r.consul.RegisterTask(r.alloc.ID, r.task, scriptExec); err != nil { + mErr.Errors = append(mErr.Errors, fmt.Errorf("error registering updated task with consul: %v", err)) + } return mErr.ErrorOrNil() } @@ -1361,6 +1426,9 @@ func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { // given limit. It returns whether the task was destroyed and the error // associated with the last kill attempt. func (r *TaskRunner) handleDestroy() (destroyed bool, err error) { + // Remove from Consul + r.consul.RemoveTask(r.alloc.ID, r.task) + // Cap the number of times we attempt to kill the task. for i := 0; i < killFailureLimit; i++ { if err = r.handle.Kill(); err != nil { diff --git a/client/task_runner_test.go b/client/task_runner_test.go index 6baa14f9734..ec511e93dfb 100644 --- a/client/task_runner_test.go +++ b/client/task_runner_test.go @@ -104,7 +104,8 @@ func testTaskRunnerFromAlloc(t *testing.T, restarts bool, alloc *structs.Allocat } vclient := vaultclient.NewMockVaultClient() - tr := NewTaskRunner(logger, conf, upd.Update, taskDir, alloc, task, vclient) + cclient := newMockConsulServiceClient() + tr := NewTaskRunner(logger, conf, upd.Update, taskDir, alloc, task, vclient, cclient) if !restarts { tr.restartTracker = noRestartsTracker() } @@ -366,7 +367,7 @@ func TestTaskRunner_SaveRestoreState(t *testing.T) { // Create a new task runner task2 := &structs.Task{Name: ctx.tr.task.Name, Driver: ctx.tr.task.Driver} tr2 := NewTaskRunner(ctx.tr.logger, ctx.tr.config, ctx.upd.Update, - ctx.tr.taskDir, ctx.tr.alloc, task2, ctx.tr.vaultClient) + ctx.tr.taskDir, ctx.tr.alloc, task2, ctx.tr.vaultClient, ctx.tr.consul) tr2.restartTracker = noRestartsTracker() if err := tr2.RestoreState(); err != nil { t.Fatalf("err: %v", err) diff --git a/command/agent/agent.go b/command/agent/agent.go index 142dfe2e34e..8472a98f1da 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -8,17 +8,18 @@ import ( "os" "path/filepath" "runtime" - "strconv" "strings" "sync" "sync/atomic" "time" + "github.com/hashicorp/consul/api" "github.com/hashicorp/nomad/client" clientconfig "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/nomad/structs/config" ) const ( @@ -30,6 +31,10 @@ const ( serverRpcCheckTimeout = 3 * time.Second serverSerfCheckInterval = 10 * time.Second serverSerfCheckTimeout = 3 * time.Second + + // roles used in identifying Consul entries for Nomad agents + consulRoleServer = "server" + consulRoleClient = "client" ) // Agent is a long running daemon that is used to run both @@ -42,8 +47,12 @@ type Agent struct { logger *log.Logger logOutput io.Writer - // consulSyncer registers the Nomad agent with the Consul Agent - consulSyncer *consul.Syncer + // consulService is Nomad's custom Consul client for managing services + // and checks. + consulService *consul.ServiceClient + + // consulCatalog is the subset of Consul's Catalog API Nomad uses. + consulCatalog consul.CatalogAPI client *client.Client @@ -63,8 +72,8 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { shutdownCh: make(chan struct{}), } - if err := a.setupConsulSyncer(); err != nil { - return nil, fmt.Errorf("Failed to initialize Consul syncer task: %v", err) + if err := a.setupConsul(config.Consul); err != nil { + return nil, fmt.Errorf("Failed to initialize Consul client: %v", err) } if err := a.setupServer(); err != nil { return nil, err @@ -76,15 +85,6 @@ func NewAgent(config *Config, logOutput io.Writer) (*Agent, error) { return nil, fmt.Errorf("must have at least client or server mode enabled") } - // The Nomad Agent runs the consul.Syncer regardless of whether or not the - // Agent is running in Client or Server mode (or both), and regardless of - // the consul.auto_advertise parameter. The Client and Server both reuse the - // same consul.Syncer instance. This Syncer task periodically executes - // callbacks that update Consul. The reason the Syncer is always running is - // because one of the callbacks is attempts to self-bootstrap Nomad using - // information found in Consul. - go a.consulSyncer.Run() - return a, nil } @@ -339,7 +339,7 @@ func (a *Agent) setupServer() error { } // Create the server - server, err := nomad.NewServer(conf, a.consulSyncer, a.logger) + server, err := nomad.NewServer(conf, a.consulCatalog, a.logger) if err != nil { return fmt.Errorf("server setup failed: %v", err) } @@ -405,14 +405,16 @@ func (a *Agent) setupServer() error { // Add the http port check if TLS isn't enabled // TODO Add TLS check when Consul 0.7.1 comes out. - consulServices := map[consul.ServiceKey]*structs.Service{ - consul.GenerateServiceKey(rpcServ): rpcServ, - consul.GenerateServiceKey(serfServ): serfServ, + consulServices := []*structs.Service{ + rpcServ, + serfServ, } if !conf.TLSConfig.EnableHTTP { - consulServices[consul.GenerateServiceKey(httpServ)] = httpServ + consulServices = append(consulServices, httpServ) + } + if err := a.consulService.RegisterAgent(consulRoleServer, consulServices); err != nil { + return err } - a.consulSyncer.SetServices(consul.ServerDomain, consulServices) } return nil @@ -462,7 +464,7 @@ func (a *Agent) setupClient() error { } // Create the client - client, err := client.NewClient(conf, a.consulSyncer, a.logger) + client, err := client.NewClient(conf, a.consulCatalog, a.consulService, a.logger) if err != nil { return fmt.Errorf("client setup failed: %v", err) } @@ -495,9 +497,9 @@ func (a *Agent) setupClient() error { }, } if !conf.TLSConfig.EnableHTTP { - a.consulSyncer.SetServices(consul.ClientDomain, map[consul.ServiceKey]*structs.Service{ - consul.GenerateServiceKey(httpServ): httpServ, - }) + if err := a.consulService.RegisterAgent(consulRoleClient, []*structs.Service{httpServ}); err != nil { + return err + } } } @@ -612,8 +614,8 @@ func (a *Agent) Shutdown() error { } } - if err := a.consulSyncer.Shutdown(); err != nil { - a.logger.Printf("[ERR] agent: shutting down consul service failed: %v", err) + if err := a.consulService.Shutdown(); err != nil { + a.logger.Printf("[ERR] agent: shutting down Consul client failed: %v", err) } a.logger.Println("[INFO] agent: shutdown complete") @@ -659,46 +661,22 @@ func (a *Agent) Stats() map[string]map[string]string { return stats } -// setupConsulSyncer creates the Consul tasks used by this Nomad Agent -// (either Client or Server mode). -func (a *Agent) setupConsulSyncer() error { - var err error - a.consulSyncer, err = consul.NewSyncer(a.config.Consul, a.shutdownCh, a.logger) +// setupConsul creates the Consul client and starts its main Run loop. +func (a *Agent) setupConsul(consulConfig *config.ConsulConfig) error { + apiConf, err := consulConfig.ApiConfig() + if err != nil { + return err + } + client, err := api.NewClient(apiConf) if err != nil { return err } - a.consulSyncer.SetAddrFinder(func(portLabel string) (string, int) { - host, port, err := net.SplitHostPort(portLabel) - if err != nil { - p, err := strconv.Atoi(port) - if err != nil { - return "", 0 - } - return "", p - } - - // If the addr for the service is ":port", then we fall back - // to Nomad's default address resolution protocol. - // - // TODO(sean@): This should poll Consul to figure out what - // its advertise address is and use that in order to handle - // the case where there is something funky like NAT on this - // host. For now we just use the BindAddr if set, otherwise - // we fall back to a loopback addr. - if host == "" { - if a.config.BindAddr != "" { - host = a.config.BindAddr - } else { - host = "127.0.0.1" - } - } - p, err := strconv.Atoi(port) - if err != nil { - return host, 0 - } - return host, p - }) + // Create Consul Catalog client for service discovery. + a.consulCatalog = client.Catalog() + // Create Consul Service client for service advertisement and checks. + a.consulService = consul.NewServiceClient(client.Agent(), a.logger) + go a.consulService.Run() return nil } diff --git a/command/agent/consul/chaos_test.go b/command/agent/consul/chaos_test.go deleted file mode 100644 index 89b69ea2cb6..00000000000 --- a/command/agent/consul/chaos_test.go +++ /dev/null @@ -1,193 +0,0 @@ -// +build chaos - -package consul - -import ( - "fmt" - "io/ioutil" - "sort" - "strings" - "sync" - "testing" - "time" - - "github.com/hashicorp/consul/testutil" - "github.com/hashicorp/nomad/nomad/structs" - "github.com/hashicorp/nomad/nomad/structs/config" -) - -func TestSyncerChaos(t *testing.T) { - // Create an embedded Consul server - testconsul := testutil.NewTestServerConfig(t, func(c *testutil.TestServerConfig) { - // If -v wasn't specified squelch consul logging - if !testing.Verbose() { - c.Stdout = ioutil.Discard - c.Stderr = ioutil.Discard - } - }) - defer testconsul.Stop() - - // Configure Syncer to talk to the test server - cconf := config.DefaultConsulConfig() - cconf.Addr = testconsul.HTTPAddr - - clientSyncer, err := NewSyncer(cconf, nil, logger) - if err != nil { - t.Fatalf("Error creating Syncer: %v", err) - } - defer clientSyncer.Shutdown() - - execSyncer, err := NewSyncer(cconf, nil, logger) - if err != nil { - t.Fatalf("Error creating Syncer: %v", err) - } - defer execSyncer.Shutdown() - - clientService := &structs.Service{Name: "nomad-client"} - services := map[ServiceKey]*structs.Service{ - GenerateServiceKey(clientService): clientService, - } - if err := clientSyncer.SetServices("client", services); err != nil { - t.Fatalf("error setting client service: %v", err) - } - - const execn = 100 - const reapern = 2 - errors := make(chan error, 100) - wg := sync.WaitGroup{} - - // Start goroutines to concurrently SetServices - for i := 0; i < execn; i++ { - wg.Add(1) - go func(i int) { - defer wg.Done() - domain := ServiceDomain(fmt.Sprintf("exec-%d", i)) - services := map[ServiceKey]*structs.Service{} - for ii := 0; ii < 10; ii++ { - s := &structs.Service{Name: fmt.Sprintf("exec-%d-%d", i, ii)} - services[GenerateServiceKey(s)] = s - if err := execSyncer.SetServices(domain, services); err != nil { - select { - case errors <- err: - default: - } - return - } - time.Sleep(1) - } - }(i) - } - - // SyncServices runs a timer started by Syncer.Run which we don't use - // in this test, so run SyncServices concurrently - wg.Add(1) - go func() { - defer wg.Done() - for i := 0; i < execn; i++ { - if err := execSyncer.SyncServices(); err != nil { - select { - case errors <- err: - default: - } - return - } - time.Sleep(100) - } - }() - - wg.Add(1) - go func() { - defer wg.Done() - if err := clientSyncer.ReapUnmatched([]ServiceDomain{"nomad-client"}); err != nil { - select { - case errors <- err: - default: - } - return - } - }() - - // Reap all but exec-0-* - wg.Add(1) - go func() { - defer wg.Done() - for i := 0; i < execn; i++ { - if err := execSyncer.ReapUnmatched([]ServiceDomain{"exec-0", ServiceDomain(fmt.Sprintf("exec-%d", i))}); err != nil { - select { - case errors <- err: - default: - } - } - time.Sleep(100) - } - }() - - go func() { - wg.Wait() - close(errors) - }() - - for err := range errors { - if err != nil { - t.Errorf("error setting service from executor goroutine: %v", err) - } - } - - // Do a final ReapUnmatched to get consul back into a deterministic state - if err := execSyncer.ReapUnmatched([]ServiceDomain{"exec-0"}); err != nil { - t.Fatalf("error doing final reap: %v", err) - } - - // flattenedServices should be fully populated as ReapUnmatched doesn't - // touch Syncer's internal state - expected := map[string]struct{}{} - for i := 0; i < execn; i++ { - for ii := 0; ii < 10; ii++ { - expected[fmt.Sprintf("exec-%d-%d", i, ii)] = struct{}{} - } - } - - for _, s := range execSyncer.flattenedServices() { - _, ok := expected[s.Name] - if !ok { - t.Errorf("%s unexpected", s.Name) - } - delete(expected, s.Name) - } - if len(expected) > 0 { - left := []string{} - for s := range expected { - left = append(left, s) - } - sort.Strings(left) - t.Errorf("Couldn't find %d names in flattened services:\n%s", len(expected), strings.Join(left, "\n")) - } - - // All but exec-0 and possibly some of exec-99 should have been reaped - { - services, err := execSyncer.client.Agent().Services() - if err != nil { - t.Fatalf("Error getting services: %v", err) - } - expected := []int{} - for k, service := range services { - if service.Service == "consul" { - continue - } - i := -1 - ii := -1 - fmt.Sscanf(service.Service, "exec-%d-%d", &i, &ii) - switch { - case i == -1 || ii == -1: - t.Errorf("invalid service: %s -> %s", k, service.Service) - case i != 0 || ii > 9: - t.Errorf("unexpected service: %s -> %s", k, service.Service) - default: - expected = append(expected, ii) - } - } - if len(expected) != 10 { - t.Errorf("expected 0-9 but found: %#q", expected) - } - } -} diff --git a/command/agent/consul/check.go b/command/agent/consul/check.go deleted file mode 100644 index 551f94b6fc3..00000000000 --- a/command/agent/consul/check.go +++ /dev/null @@ -1,91 +0,0 @@ -package consul - -import ( - "log" - "sync" - "time" - - "github.com/hashicorp/consul/lib" - cstructs "github.com/hashicorp/nomad/client/driver/structs" -) - -// CheckRunner runs a given check in a specific interval and update a -// corresponding Consul TTL check -type CheckRunner struct { - check Check - runCheck func(Check) - logger *log.Logger - stop bool - stopCh chan struct{} - stopLock sync.Mutex - - started bool - startedLock sync.Mutex -} - -// NewCheckRunner configures and returns a CheckRunner -func NewCheckRunner(check Check, runCheck func(Check), logger *log.Logger) *CheckRunner { - cr := CheckRunner{ - check: check, - runCheck: runCheck, - logger: logger, - stopCh: make(chan struct{}), - } - return &cr -} - -// Start is used to start the check. The check runs until stop is called -func (r *CheckRunner) Start() { - r.startedLock.Lock() - defer r.startedLock.Unlock() - if r.started { - return - } - r.stopLock.Lock() - defer r.stopLock.Unlock() - go r.run() - r.started = true -} - -// Started returns if the check runner has started running -func (r *CheckRunner) Started() bool { - r.startedLock.Lock() - defer r.startedLock.Unlock() - return r.started -} - -// Stop is used to stop the check. -func (r *CheckRunner) Stop() { - r.stopLock.Lock() - defer r.stopLock.Unlock() - if !r.stop { - r.stop = true - close(r.stopCh) - } -} - -// run is invoked by a goroutine to run until Stop() is called -func (r *CheckRunner) run() { - // Get the randomized initial pause time - initialPauseTime := lib.RandomStagger(r.check.Interval()) - r.logger.Printf("[DEBUG] agent: pausing %v before first invocation of %s", initialPauseTime, r.check.ID()) - next := time.NewTimer(initialPauseTime) - for { - select { - case <-next.C: - r.runCheck(r.check) - next.Reset(r.check.Interval()) - case <-r.stopCh: - next.Stop() - return - } - } -} - -// Check is an interface which check providers can implement for Nomad to run -type Check interface { - Run() *cstructs.CheckResult - ID() string - Interval() time.Duration - Timeout() time.Duration -} diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go new file mode 100644 index 00000000000..6365eb0b831 --- /dev/null +++ b/command/agent/consul/client.go @@ -0,0 +1,636 @@ +package consul + +import ( + "context" + "fmt" + "log" + "net" + "net/url" + "strconv" + "strings" + "sync" + "time" + + "github.com/hashicorp/consul/api" + "github.com/hashicorp/go-multierror" + "github.com/hashicorp/nomad/nomad/structs" +) + +var mark = struct{}{} + +const ( + // nomadServicePrefix is the first prefix that scopes all Nomad registered + // services + nomadServicePrefix = "_nomad" + + // The periodic time interval for syncing services and checks with Consul + defaultSyncInterval = 6 * time.Second + + // ttlCheckBuffer is the time interval that Nomad can take to report Consul + // the check result + ttlCheckBuffer = 31 * time.Second + + // defaultShutdownWait is how long Shutdown() should block waiting for + // enqueued operations to sync to Consul by default. + defaultShutdownWait = time.Minute + + // DefaultQueryWaitDuration is the max duration the Consul Agent will + // spend waiting for a response from a Consul Query. + DefaultQueryWaitDuration = 2 * time.Second + + // ServiceTagHTTP is the tag assigned to HTTP services + ServiceTagHTTP = "http" + + // ServiceTagRPC is the tag assigned to RPC services + ServiceTagRPC = "rpc" + + // ServiceTagSerf is the tag assigned to Serf services + ServiceTagSerf = "serf" +) + +// ScriptExecutor is the interface the ServiceClient uses to execute script +// checks inside a container. +type ScriptExecutor interface { + Exec(ctx context.Context, cmd string, args []string) ([]byte, int, error) +} + +// CatalogAPI is the consul/api.Catalog API used by Nomad. +type CatalogAPI interface { + Datacenters() ([]string, error) + Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error) +} + +// AgentAPI is the consul/api.Agent API used by Nomad. +type AgentAPI interface { + CheckRegister(check *api.AgentCheckRegistration) error + CheckDeregister(checkID string) error + ServiceRegister(service *api.AgentServiceRegistration) error + ServiceDeregister(serviceID string) error + UpdateTTL(id, output, status string) error +} + +// ServiceClient handles task and agent service registration with Consul. +type ServiceClient struct { + client AgentAPI + logger *log.Logger + retryInterval time.Duration + syncInterval time.Duration + + // runningCh is closed when the main Run loop exits + runningCh chan struct{} + + // shutdownCh is closed when the client should shutdown + shutdownCh chan struct{} + + // shutdownWait is how long Shutdown() blocks waiting for the final + // sync() to finish. Defaults to defaultShutdownWait + shutdownWait time.Duration + + // syncCh triggers a sync in the main Run loop + syncCh chan struct{} + + // services and checks to be registered + regServices map[string]*api.AgentServiceRegistration + regChecks map[string]*api.AgentCheckRegistration + + // services and checks to be unregisterd + deregServices map[string]struct{} + deregChecks map[string]struct{} + + // script checks to be run() after their corresponding check is + // registered + regScripts map[string]*scriptCheck + + // script check cancel funcs to be called before their corresponding + // check is removed. Only accessed in sync() so not covered by regLock + runningScripts map[string]*scriptHandle + + // regLock must be held while accessing reg and dereg maps + regLock sync.Mutex + + // Registered agent services and checks + agentServices map[string]struct{} + agentChecks map[string]struct{} + + // agentLock must be held while accessing agent maps + agentLock sync.Mutex +} + +// NewServiceClient creates a new Consul ServiceClient from an existing Consul API +// Client and logger. +func NewServiceClient(consulClient AgentAPI, logger *log.Logger) *ServiceClient { + return &ServiceClient{ + client: consulClient, + logger: logger, + retryInterval: defaultSyncInterval, //TODO what should this default to?! + syncInterval: defaultSyncInterval, + runningCh: make(chan struct{}), + shutdownCh: make(chan struct{}), + shutdownWait: defaultShutdownWait, + syncCh: make(chan struct{}, 1), + regServices: make(map[string]*api.AgentServiceRegistration), + regChecks: make(map[string]*api.AgentCheckRegistration), + deregServices: make(map[string]struct{}), + deregChecks: make(map[string]struct{}), + regScripts: make(map[string]*scriptCheck), + runningScripts: make(map[string]*scriptHandle), + agentServices: make(map[string]struct{}, 8), + agentChecks: make(map[string]struct{}, 8), + } +} + +// Run the Consul main loop which retries operations against Consul. It should +// be called exactly once. +func (c *ServiceClient) Run() { + defer close(c.runningCh) + timer := time.NewTimer(0) + defer timer.Stop() + + // Drain the initial tick so we don't sync until instructed + <-timer.C + + lastOk := true + for { + select { + case <-c.syncCh: + timer.Reset(0) + case <-timer.C: + if err := c.sync(); err != nil { + if lastOk { + lastOk = false + c.logger.Printf("[WARN] consul: failed to update services in Consul: %v", err) + } + //TODO Log? and jitter/backoff + timer.Reset(c.retryInterval) + } else { + if !lastOk { + c.logger.Printf("[INFO] consul: successfully updated services in Consul") + lastOk = true + } + } + case <-c.shutdownCh: + return + } + } +} + +// forceSync asynchronously causes a sync to happen. Any operations enqueued +// prior to calling forceSync will be synced. +func (c *ServiceClient) forceSync() { + select { + case c.syncCh <- mark: + default: + } +} + +// sync enqueued operations. +func (c *ServiceClient) sync() error { + // Shallow copy and reset the pending operations fields + c.regLock.Lock() + regServices := make(map[string]*api.AgentServiceRegistration, len(c.regServices)) + for k, v := range c.regServices { + regServices[k] = v + } + c.regServices = map[string]*api.AgentServiceRegistration{} + + regChecks := make(map[string]*api.AgentCheckRegistration, len(c.regChecks)) + for k, v := range c.regChecks { + regChecks[k] = v + } + c.regChecks = map[string]*api.AgentCheckRegistration{} + + regScripts := make(map[string]*scriptCheck, len(c.regScripts)) + for k, v := range c.regScripts { + regScripts[k] = v + } + c.regScripts = map[string]*scriptCheck{} + + deregServices := make(map[string]struct{}, len(c.deregServices)) + for k := range c.deregServices { + deregServices[k] = mark + } + c.deregServices = map[string]struct{}{} + + deregChecks := make(map[string]struct{}, len(c.deregChecks)) + for k := range c.deregChecks { + deregChecks[k] = mark + } + c.deregChecks = map[string]struct{}{} + c.regLock.Unlock() + + var err error + + regServiceN, regCheckN, deregServiceN, deregCheckN := len(regServices), len(regChecks), len(deregServices), len(deregChecks) + + // Register Services + for id, service := range regServices { + if err = c.client.ServiceRegister(service); err != nil { + goto ERROR + } + delete(regServices, id) + } + + // Register Checks + for id, check := range regChecks { + if err = c.client.CheckRegister(check); err != nil { + goto ERROR + } + delete(regChecks, id) + + // Run the script for this check if one exists + if script, ok := regScripts[id]; ok { + // This check is a script check; run it + c.runningScripts[id] = script.run() + } + } + + // Deregister Checks + for id := range deregChecks { + if h, ok := c.runningScripts[id]; ok { + // This check is a script check; stop it + h.cancel() + delete(c.runningScripts, id) + } + + if err = c.client.CheckDeregister(id); err != nil { + goto ERROR + } + delete(deregChecks, id) + } + + // Deregister Services + for id := range deregServices { + if err = c.client.ServiceDeregister(id); err != nil { + goto ERROR + } + delete(deregServices, id) + } + + c.logger.Printf("[DEBUG] consul: registered %d services / %d checks; deregisterd %d services / %d checks", regServiceN, regCheckN, deregServiceN, deregCheckN) + return nil + + //TODO Labels and gotos are nasty; move to a function? +ERROR: + // An error occurred, repopulate the operation maps omitting any keys + // that have been updated while sync() ran. + c.regLock.Lock() + for id, service := range regServices { + if _, ok := c.regServices[id]; ok { + continue + } + if _, ok := c.deregServices[id]; ok { + continue + } + c.regServices[id] = service + } + for id, check := range regChecks { + if _, ok := c.regChecks[id]; ok { + continue + } + if _, ok := c.deregChecks[id]; ok { + continue + } + c.regChecks[id] = check + } + for id, script := range regScripts { + if _, ok := c.regScripts[id]; ok { + // a new version of this script was added, drop this one + continue + } + c.regScripts[id] = script + } + for id, _ := range deregServices { + if _, ok := c.regServices[id]; ok { + continue + } + c.deregServices[id] = mark + } + for id, _ := range deregChecks { + if _, ok := c.regChecks[id]; ok { + continue + } + c.deregChecks[id] = mark + } + c.regLock.Unlock() + return err +} + +// RegisterAgent registers Nomad agents (client or server). Script checks are +// not supported and will return an error. Registration is asynchronous. +// +// Agents will be deregistered when Shutdown is called. +func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error { + regs := make([]*api.AgentServiceRegistration, len(services)) + checks := make([]*api.AgentCheckRegistration, 0, len(services)) + + for i, service := range services { + id := makeAgentServiceID(role, service) + host, rawport, err := net.SplitHostPort(service.PortLabel) + if err != nil { + return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err) + } + port, err := strconv.Atoi(rawport) + if err != nil { + return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err) + } + serviceReg := &api.AgentServiceRegistration{ + ID: id, + Name: service.Name, + Tags: service.Tags, + Address: host, + Port: port, + } + regs[i] = serviceReg + + for _, check := range service.Checks { + checkID := createCheckID(id, check) + if check.Type == structs.ServiceCheckScript { + return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name) + } + checkHost, checkPort := serviceReg.Address, serviceReg.Port + if check.PortLabel != "" { + host, rawport, err := net.SplitHostPort(check.PortLabel) + if err != nil { + return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err) + } + port, err := strconv.Atoi(rawport) + if err != nil { + return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err) + } + checkHost, checkPort = host, port + } + checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort) + if err != nil { + return fmt.Errorf("failed to add check %q: %v", check.Name, err) + } + checks = append(checks, checkReg) + } + } + + // Now add them to the registration queue + c.enqueueRegs(regs, checks, nil) + + // Record IDs for deregistering on shutdown + c.agentLock.Lock() + for _, s := range regs { + c.agentServices[s.ID] = mark + } + for _, ch := range checks { + c.agentChecks[ch.ID] = mark + } + c.agentLock.Unlock() + return nil +} + +// RegisterTask with Consul. Adds all sevice entries and checks to Consul. If +// exec is nil and a script check exists an error is returned. +// +// Actual communication with Consul is done asynchrously (see Run). +func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, exec ScriptExecutor) error { + regs := make([]*api.AgentServiceRegistration, len(task.Services)) + checks := make([]*api.AgentCheckRegistration, 0, len(task.Services)*2) // just guess at size + var scriptChecks []*scriptCheck + + for i, service := range task.Services { + id := makeTaskServiceID(allocID, task.Name, service) + host, port := task.FindHostAndPortFor(service.PortLabel) + serviceReg := &api.AgentServiceRegistration{ + ID: id, + Name: service.Name, + Tags: make([]string, len(service.Tags)), + Address: host, + Port: port, + } + // copy isn't strictly necessary but can avoid bugs especially + // with tests that may reuse Tasks + copy(serviceReg.Tags, service.Tags) + regs[i] = serviceReg + + for _, check := range service.Checks { + checkID := createCheckID(id, check) + if check.Type == structs.ServiceCheckScript { + if exec == nil { + return fmt.Errorf("driver %q doesn't support script checks", task.Driver) + } + scriptChecks = append(scriptChecks, newScriptCheck(checkID, check, exec, c.client, c.logger, c.shutdownCh)) + } + host, port := serviceReg.Address, serviceReg.Port + if check.PortLabel != "" { + host, port = task.FindHostAndPortFor(check.PortLabel) + } + checkReg, err := createCheckReg(id, checkID, check, host, port) + if err != nil { + return fmt.Errorf("failed to add check %q: %v", check.Name, err) + } + checks = append(checks, checkReg) + } + + } + + // Now add them to the registration queue + c.enqueueRegs(regs, checks, scriptChecks) + return nil +} + +// RemoveTask from Consul. Removes all service entries and checks. +// +// Actual communication with Consul is done asynchrously (see Run). +func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) { + deregs := make([]string, len(task.Services)) + checks := make([]string, 0, len(task.Services)*2) // just guess at size + + for i, service := range task.Services { + id := makeTaskServiceID(allocID, task.Name, service) + deregs[i] = id + + for _, check := range service.Checks { + checks = append(checks, createCheckID(id, check)) + } + } + + // Now add them to the deregistration fields; main Run loop will update + c.enqueueDeregs(deregs, checks) +} + +// enqueueRegs enqueues service and check registrations for the next time +// operations are sync'd to Consul. +func (c *ServiceClient) enqueueRegs(regs []*api.AgentServiceRegistration, checks []*api.AgentCheckRegistration, scriptChecks []*scriptCheck) { + c.regLock.Lock() + for _, reg := range regs { + // Add reg + c.regServices[reg.ID] = reg + // Make sure it's not being removed + delete(c.deregServices, reg.ID) + } + for _, check := range checks { + // Add check + c.regChecks[check.ID] = check + // Make sure it's not being removed + delete(c.deregChecks, check.ID) + } + for _, script := range scriptChecks { + c.regScripts[script.id] = script + } + c.regLock.Unlock() + + c.forceSync() +} + +// enqueueDeregs enqueues service and check removals for the next time +// operations are sync'd to Consul. +func (c *ServiceClient) enqueueDeregs(deregs []string, checks []string) { + c.regLock.Lock() + for _, dereg := range deregs { + // Add dereg + c.deregServices[dereg] = mark + // Make sure it's not being added + delete(c.regServices, dereg) + } + for _, check := range checks { + // Add check for removal + c.deregChecks[check] = mark + // Make sure it's not being added + delete(c.regChecks, check) + } + c.regLock.Unlock() + + c.forceSync() +} + +// Shutdown the Consul client. Update running task registations and deregister +// agent from Consul. Blocks up to shutdownWait before giving up on syncing +// operations. +func (c *ServiceClient) Shutdown() error { + select { + case <-c.shutdownCh: + return nil + default: + close(c.shutdownCh) + } + + var mErr multierror.Error + + // Don't let Shutdown block indefinitely + deadline := time.After(c.shutdownWait) + + // Deregister agent services and checks + c.agentLock.Lock() + for id := range c.agentServices { + if err := c.client.ServiceDeregister(id); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + } + + // Deregister Checks + for id := range c.agentChecks { + if err := c.client.CheckDeregister(id); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + } + c.agentLock.Unlock() + + // Wait for Run to finish any outstanding sync() calls and exit + select { + case <-c.runningCh: + // sync one last time to ensure all enqueued operations are applied + if err := c.sync(); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + case <-deadline: + // Don't wait forever though + mErr.Errors = append(mErr.Errors, fmt.Errorf("timed out waiting for Consul operations to complete")) + return mErr.ErrorOrNil() + } + + // Give script checks time to exit (no need to lock as Run() has exited) + for _, h := range c.runningScripts { + select { + case <-h.wait(): + case <-deadline: + mErr.Errors = append(mErr.Errors, fmt.Errorf("timed out waiting for script checks to run")) + return mErr.ErrorOrNil() + } + } + return mErr.ErrorOrNil() +} + +// makeAgentServiceID creates a unique ID for identifying an agent service in +// Consul. +// +// Agent service IDs are of the form: +// +// {nomadServicePrefix}-{ROLE}-{Service.Name}-{Service.Tags...} +// Example Server ID: _nomad-server-nomad-serf +// Example Client ID: _nomad-client-nomad-client-http +// +func makeAgentServiceID(role string, service *structs.Service) string { + parts := make([]string, len(service.Tags)+3) + parts[0] = nomadServicePrefix + parts[1] = role + parts[2] = service.Name + copy(parts[3:], service.Tags) + return strings.Join(parts, "-") +} + +// makeTaskServiceID creates a unique ID for identifying a task service in +// Consul. +// +// Task service IDs are of the form: +// +// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} +// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 +// +func makeTaskServiceID(allocID, taskName string, service *structs.Service) string { + parts := make([]string, len(service.Tags)+5) + parts[0] = nomadServicePrefix + parts[1] = "executor" + parts[2] = allocID + parts[3] = taskName + parts[4] = service.Name + copy(parts[5:], service.Tags) + return strings.Join(parts, "-") +} + +// createCheckID creates a unique ID for a check. +func createCheckID(serviceID string, check *structs.ServiceCheck) string { + return check.Hash(serviceID) +} + +// createCheckReg creates a Check that can be registered with Consul. +// +// Only supports HTTP(S) and TCP checks. Script checks must be handled +// externally. +func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) { + chkReg := api.AgentCheckRegistration{ + ID: checkID, + Name: check.Name, + ServiceID: serviceID, + } + chkReg.Status = check.InitialStatus + chkReg.Timeout = check.Timeout.String() + chkReg.Interval = check.Interval.String() + + switch check.Type { + case structs.ServiceCheckHTTP: + if check.Protocol == "" { + check.Protocol = "http" + } + base := url.URL{ + Scheme: check.Protocol, + Host: net.JoinHostPort(host, strconv.Itoa(port)), + } + relative, err := url.Parse(check.Path) + if err != nil { + return nil, err + } + url := base.ResolveReference(relative) + chkReg.HTTP = url.String() + case structs.ServiceCheckTCP: + chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port)) + case structs.ServiceCheckScript: + chkReg.TTL = (check.Interval + ttlCheckBuffer).String() + default: + return nil, fmt.Errorf("check type %+q not valid", check.Type) + } + return &chkReg, nil +} diff --git a/command/agent/consul/int_test.go b/command/agent/consul/int_test.go new file mode 100644 index 00000000000..2f0239d5f78 --- /dev/null +++ b/command/agent/consul/int_test.go @@ -0,0 +1,228 @@ +package consul_test + +import ( + "io/ioutil" + "log" + "os" + "path/filepath" + "testing" + "time" + + "golang.org/x/sys/unix" + + consulapi "github.com/hashicorp/consul/api" + "github.com/hashicorp/consul/testutil" + "github.com/hashicorp/nomad/client" + "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/client/driver" + "github.com/hashicorp/nomad/client/vaultclient" + "github.com/hashicorp/nomad/command/agent/consul" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" +) + +func testLogger() *log.Logger { + if testing.Verbose() { + return log.New(os.Stderr, "", log.LstdFlags) + } + return log.New(ioutil.Discard, "", 0) +} + +// TestConsul_Integration asserts TaskRunner properly registers and deregisters +// services and checks with Consul using an embedded Consul agent. +func TestConsul_Integration(t *testing.T) { + if _, ok := driver.BuiltinDrivers["mock_driver"]; !ok { + t.Skip(`test requires mock_driver; run with "-tags nomad_test"`) + } + if testing.Short() { + t.Skip("-short set; skipping") + } + if unix.Geteuid() != 0 { + t.Skip("Must be run as root") + } + // Create an embedded Consul server + testconsul := testutil.NewTestServerConfig(t, func(c *testutil.TestServerConfig) { + // If -v wasn't specified squelch consul logging + if !testing.Verbose() { + c.Stdout = ioutil.Discard + c.Stderr = ioutil.Discard + } + }) + defer testconsul.Stop() + + conf := config.DefaultConfig() + conf.ConsulConfig.Addr = testconsul.HTTPAddr + consulConfig, err := conf.ConsulConfig.ApiConfig() + if err != nil { + t.Fatalf("error generating consul config: %v", err) + } + + conf.StateDir, err = ioutil.TempDir("", "nomadtest-consulstate") + if err != nil { + t.Fatalf("error creating temp dir: %v", err) + } + defer os.RemoveAll(conf.StateDir) + conf.AllocDir, err = ioutil.TempDir("", "nomdtest-consulalloc") + if err != nil { + t.Fatalf("error creating temp dir: %v", err) + } + defer os.RemoveAll(conf.AllocDir) + + alloc := mock.Alloc() + task := alloc.Job.TaskGroups[0].Tasks[0] + task.Driver = "mock_driver" + task.Config = map[string]interface{}{ + "run_for": "1h", + } + // Choose a port that shouldn't be in use + task.Resources.Networks[0].ReservedPorts = []structs.Port{{Label: "http", Value: 3}} + task.Services = []*structs.Service{ + { + Name: "httpd", + PortLabel: "http", + Tags: []string{"nomad", "test", "http"}, + Checks: []*structs.ServiceCheck{ + { + Name: "httpd-http-check", + Type: "http", + Path: "/", + Protocol: "http", + PortLabel: "http", + Interval: 9000 * time.Hour, + Timeout: 1, // fail as fast as possible + }, + { + Name: "httpd-script-check", + Type: "script", + Command: "/bin/true", + Interval: 10 * time.Second, + Timeout: 10 * time.Second, + }, + }, + }, + { + Name: "httpd2", + PortLabel: "http", + Tags: []string{"test", "http2"}, + }, + } + + logger := testLogger() + logUpdate := func(name, state string, event *structs.TaskEvent) { + logger.Printf("[TEST] test.updater: name=%q state=%q event=%v", name, state, event) + } + allocDir := allocdir.NewAllocDir(logger, filepath.Join(conf.AllocDir, alloc.ID)) + if err := allocDir.Build(); err != nil { + t.Fatalf("error building alloc dir: %v", err) + } + taskDir := allocDir.NewTaskDir(task.Name) + vclient := vaultclient.NewMockVaultClient() + consulClient, err := consulapi.NewClient(consulConfig) + if err != nil { + t.Fatalf("error creating consul client: %v", err) + } + serviceClient := consul.NewServiceClient(consulClient.Agent(), logger) + defer serviceClient.Shutdown() // just-in-case cleanup + consulRan := make(chan struct{}) + go func() { + serviceClient.Run() + close(consulRan) + }() + tr := client.NewTaskRunner(logger, conf, logUpdate, taskDir, alloc, task, vclient, serviceClient) + tr.MarkReceived() + go tr.Run() + defer func() { + // Just in case cleanup + select { + case <-tr.WaitCh(): + // Exited cleanly, no need to kill + default: + tr.Kill("", "", true) // just in case + } + }() + + // Block waiting for the service to appear + catalog := consulClient.Catalog() + res, meta, err := catalog.Service("httpd2", "test", nil) + for len(res) == 0 { + //Expected initial request to fail, do a blocking query + res, meta, err = catalog.Service("httpd2", "test", &consulapi.QueryOptions{WaitIndex: meta.LastIndex + 1, WaitTime: 3 * time.Second}) + if err != nil { + t.Fatalf("error querying for service: %v", err) + } + } + if len(res) != 1 { + t.Fatalf("expected 1 service but found %d:\n%#v", len(res), res) + } + res = res[:] + + // Assert the service with the checks exists + for len(res) == 0 { + res, meta, err = catalog.Service("httpd", "http", &consulapi.QueryOptions{WaitIndex: meta.LastIndex + 1, WaitTime: 3 * time.Second}) + if err != nil { + t.Fatalf("error querying for service: %v", err) + } + } + if len(res) != 1 { + t.Fatalf("exepcted 1 service but found %d:\n%#v", len(res), res) + } + + // Assert the script check passes (mock_driver script checks always + // pass) after having time to run once + time.Sleep(2 * time.Second) + checks, _, err := consulClient.Health().Checks("httpd", nil) + if err != nil { + t.Fatalf("error querying checks: %v", err) + } + if expected := 2; len(checks) != expected { + t.Fatalf("expected %d checks but found %d:\n%#v", expected, len(checks), checks) + } + for _, check := range checks { + if expected := "httpd"; check.ServiceName != expected { + t.Fatalf("expected checks to be for %q but found service name = %q", expected, check.ServiceName) + } + switch check.Name { + case "httpd-http-check": + // Port check should fail + if expected := consulapi.HealthCritical; check.Status != expected { + t.Errorf("expected %q status to be %q but found %q", check.Name, expected, check.Status) + } + case "httpd-script-check": + // mock_driver script checks always succeed + if expected := consulapi.HealthPassing; check.Status != expected { + t.Errorf("expected %q status to be %q but found %q", check.Name, expected, check.Status) + } + default: + t.Errorf("unexpected check %q with status %q", check.Name, check.Status) + } + } + + logger.Printf("[TEST] consul.test: killing task") + + // Kill the task + tr.Kill("", "", false) + + select { + case <-tr.WaitCh(): + case <-time.After(10 * time.Second): + t.Fatalf("timed out waiting for Run() to exit") + } + + // Shutdown Consul ServiceClient to ensure all pending operations complete + if err := serviceClient.Shutdown(); err != nil { + t.Errorf("error shutting down Consul ServiceClient: %v", err) + } + + // Ensure Consul is clean + services, _, err := catalog.Services(nil) + if err != nil { + t.Fatalf("error query services: %v", err) + } + if len(services) != 1 { + t.Fatalf("expected only 1 service in Consul but found %d:\n%#v", len(services), services) + } + if _, ok := services["consul"]; !ok { + t.Fatalf(`expected only the "consul" key in Consul but found: %#v`, services) + } +} diff --git a/command/agent/consul/mock.go b/command/agent/consul/mock.go new file mode 100644 index 00000000000..f0dd0326ce0 --- /dev/null +++ b/command/agent/consul/mock.go @@ -0,0 +1,27 @@ +package consul + +import ( + "log" + + "github.com/hashicorp/consul/api" +) + +// MockCatalog can be used for testing where the CatalogAPI is needed. +type MockCatalog struct { + logger *log.Logger +} + +func NewMockCatalog(l *log.Logger) *MockCatalog { + return &MockCatalog{logger: l} +} + +func (m *MockCatalog) Datacenters() ([]string, error) { + dcs := []string{"dc1"} + m.logger.Printf("[DEBUG] mock_consul: Datacenters() -> (%q, nil)", dcs) + return dcs, nil +} + +func (m *MockCatalog) Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error) { + m.logger.Printf("[DEBUG] mock_consul: Service(%q, %q, %#v) -> (nil, nil, nil)", service, tag, q) + return nil, nil, nil +} diff --git a/command/agent/consul/script.go b/command/agent/consul/script.go new file mode 100644 index 00000000000..1c63877a3a7 --- /dev/null +++ b/command/agent/consul/script.go @@ -0,0 +1,136 @@ +package consul + +import ( + "context" + "log" + "time" + + "github.com/hashicorp/consul/api" + "github.com/hashicorp/nomad/nomad/structs" +) + +// heartbeater is the subset of consul agent functionality needed by script +// checks to heartbeat +type heartbeater interface { + UpdateTTL(id, output, status string) error +} + +type scriptHandle struct { + // cancel the script + cancel func() + done chan struct{} +} + +// wait returns a chan that's closed when the script exits +func (s *scriptHandle) wait() <-chan struct{} { + return s.done +} + +type scriptCheck struct { + id string + check *structs.ServiceCheck + exec ScriptExecutor + agent heartbeater + + // lastCheckOk is true if the last check was ok; otherwise false + lastCheckOk bool + + logger *log.Logger + shutdownCh <-chan struct{} +} + +func newScriptCheck(id string, check *structs.ServiceCheck, exec ScriptExecutor, agent heartbeater, logger *log.Logger, shutdownCh <-chan struct{}) *scriptCheck { + return &scriptCheck{ + id: id, + check: check, + exec: exec, + agent: agent, + lastCheckOk: true, // start logging on first failure + logger: logger, + shutdownCh: shutdownCh, + } +} + +// run this script check and return its cancel func. If the shutdownCh is +// closed the check will be run once more before exiting. +func (s *scriptCheck) run() *scriptHandle { + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + defer close(done) + timer := time.NewTimer(0) + defer timer.Stop() + for { + // Block until check is removed, Nomad is shutting + // down, or the check interval is up + select { + case <-ctx.Done(): + // check has been removed + return + case <-s.shutdownCh: + // unblock but don't exit until after we heartbeat once more + case <-timer.C: + timer.Reset(s.check.Interval) + } + + // Execute check script with timeout + execctx, cancel := context.WithTimeout(ctx, s.check.Timeout) + output, code, err := s.exec.Exec(execctx, s.check.Command, s.check.Args) + switch execctx.Err() { + case context.Canceled: + // check removed during execution; exit + return + case context.DeadlineExceeded: + // Log deadline exceeded every time, but flip last check to false + s.lastCheckOk = false + s.logger.Printf("[WARN] consul.checks: check %q timed out (%s)", s.check.Name, s.check.Timeout) + } + // cleanup context + cancel() + + state := api.HealthCritical + switch code { + case 0: + state = api.HealthPassing + case 1: + state = api.HealthWarning + } + if err != nil { + state = api.HealthCritical + output = []byte(err.Error()) + } + + // Actually heartbeat the check + err = s.agent.UpdateTTL(s.id, string(output), state) + select { + case <-ctx.Done(): + // check has been removed; don't report errors + return + default: + } + + if err != nil { + //FIXME Backoff? Retry faster? + if s.lastCheckOk { + s.lastCheckOk = false + s.logger.Printf("[WARN] consul.checks: update for check %q failed: %v", s.check.Name, err) + } else { + s.logger.Printf("[DEBUG] consul.checks: update for check %q still failing: %v", s.check.Name, err) + } + + } else if !s.lastCheckOk { + // Succeeded for the first time or after failing; log + s.lastCheckOk = true + s.logger.Printf("[INFO] consul.checks: update for check %q succeeded", s.check.Name) + } + + select { + case <-s.shutdownCh: + // We've been told to exit + return + default: + } + } + }() + return &scriptHandle{cancel: cancel, done: done} +} diff --git a/command/agent/consul/script_test.go b/command/agent/consul/script_test.go new file mode 100644 index 00000000000..a9bda3e4449 --- /dev/null +++ b/command/agent/consul/script_test.go @@ -0,0 +1,165 @@ +package consul + +import ( + "context" + "os/exec" + "testing" + "time" + + "github.com/hashicorp/consul/api" + "github.com/hashicorp/nomad/nomad/structs" +) + +// blockingScriptExec implements ScriptExec by running a subcommand that never +// exits. +type blockingScriptExec struct { + // running is ticked before blocking to allow synchronizing operations + running chan struct{} + + // set to true if Exec is called and has exited + exited bool +} + +func newBlockingScriptExec() *blockingScriptExec { + return &blockingScriptExec{running: make(chan struct{})} +} + +func (b *blockingScriptExec) Exec(ctx context.Context, _ string, _ []string) ([]byte, int, error) { + b.running <- mark + cmd := exec.CommandContext(ctx, "/bin/sleep", "9000") + err := cmd.Run() + code := 0 + if exitErr, ok := err.(*exec.ExitError); ok { + if !exitErr.Success() { + code = 1 + } + } + b.exited = true + return []byte{}, code, err +} + +// TestConsulScript_Exec_Cancel asserts cancelling a script check shortcircuits +// any running scripts. +func TestConsulScript_Exec_Cancel(t *testing.T) { + serviceCheck := structs.ServiceCheck{ + Name: "sleeper", + Interval: time.Hour, + Timeout: time.Hour, + } + exec := newBlockingScriptExec() + + // pass nil for heartbeater as it shouldn't be called + check := newScriptCheck("checkid", &serviceCheck, exec, nil, testLogger(), nil) + handle := check.run() + + // wait until Exec is called + <-exec.running + + // cancel now that we're blocked in exec + handle.cancel() + + select { + case <-handle.wait(): + case <-time.After(3 * time.Second): + t.Fatalf("timed out waiting for script check to exit") + } + if !exec.exited { + t.Errorf("expected script executor to run and exit but it has not") + } +} + +type fakeHeartbeater struct { + updates chan string +} + +func (f *fakeHeartbeater) UpdateTTL(checkID, output, status string) error { + f.updates <- status + return nil +} + +func newFakeHeartbeater() *fakeHeartbeater { + return &fakeHeartbeater{updates: make(chan string)} +} + +// TestConsulScript_Exec_Timeout asserts a script will be killed when the +// timeout is reached. +func TestConsulScript_Exec_Timeout(t *testing.T) { + t.Parallel() // run the slow tests in parallel + serviceCheck := structs.ServiceCheck{ + Name: "sleeper", + Interval: time.Hour, + Timeout: time.Second, + } + exec := newBlockingScriptExec() + + hb := newFakeHeartbeater() + check := newScriptCheck("checkid", &serviceCheck, exec, hb, testLogger(), nil) + handle := check.run() + defer handle.cancel() // just-in-case cleanup + <-exec.running + + // Check for UpdateTTL call + select { + case update := <-hb.updates: + if update != api.HealthCritical { + t.Error("expected %q due to timeout but received %q", api.HealthCritical, update) + } + case <-time.After(3 * time.Second): + t.Fatalf("timed out waiting for script check to exit") + } + if !exec.exited { + t.Errorf("expected script executor to run and exit but it has not") + } + + // Cancel and watch for exit + handle.cancel() + select { + case <-handle.wait(): + // ok! + case update := <-hb.updates: + t.Errorf("unexpected UpdateTTL call on exit with status=%q", update) + case <-time.After(3 * time.Second): + t.Fatalf("timed out waiting for script check to exit") + } +} + +type noopExec struct{} + +func (noopExec) Exec(context.Context, string, []string) ([]byte, int, error) { + return []byte{}, 0, nil +} + +// TestConsulScript_Exec_Shutdown asserts a script will be executed once more +// when told to shutdown. +func TestConsulScript_Exec_Shutdown(t *testing.T) { + serviceCheck := structs.ServiceCheck{ + Name: "sleeper", + Interval: time.Hour, + Timeout: 3 * time.Second, + } + + hb := newFakeHeartbeater() + shutdown := make(chan struct{}) + check := newScriptCheck("checkid", &serviceCheck, noopExec{}, hb, testLogger(), shutdown) + handle := check.run() + defer handle.cancel() // just-in-case cleanup + + // Tell scriptCheck to exit + close(shutdown) + + select { + case update := <-hb.updates: + if update != api.HealthPassing { + t.Error("expected %q due to timeout but received %q", api.HealthPassing, update) + } + case <-time.After(3 * time.Second): + t.Fatalf("timed out waiting for script check to exit") + } + + select { + case <-handle.wait(): + // ok! + case <-time.After(3 * time.Second): + t.Fatalf("timed out waiting for script check to exit") + } +} diff --git a/command/agent/consul/syncer.go b/command/agent/consul/syncer.go deleted file mode 100644 index c111f3aafcb..00000000000 --- a/command/agent/consul/syncer.go +++ /dev/null @@ -1,1016 +0,0 @@ -// Package consul is used by Nomad to register all services both static services -// and dynamic via allocations. -// -// Consul Service IDs have the following format: ${nomadServicePrefix}-${groupName}-${serviceKey} -// groupName takes on one of the following values: -// - server -// - client -// - executor-${alloc-id}-${task-name} -// -// serviceKey should be generated by service registrators. -// If the serviceKey is being generated by the executor for a Nomad Task.Services -// the following helper should be used: -// NOTE: Executor should interpolate the service prior to calling -// func GenerateTaskServiceKey(service *structs.Service) string -// -// The Nomad Client reaps services registered from dead allocations that were -// not properly cleaned up by the executor (this is not the expected case). -// -// TODO fix this comment -// The Consul ServiceIDs generated by the executor will contain the allocation -// ID. Thus the client can generate the list of Consul ServiceIDs to keep by -// calling the following method on all running allocations the client is aware -// of: -// func GenerateExecutorServiceKeyPrefixFromAlloc(allocID string) string -package consul - -import ( - "fmt" - "log" - "net" - "net/url" - "strconv" - "strings" - "sync" - "time" - - consul "github.com/hashicorp/consul/api" - "github.com/hashicorp/go-multierror" - - "github.com/hashicorp/nomad/nomad/structs" - "github.com/hashicorp/nomad/nomad/structs/config" - "github.com/hashicorp/nomad/nomad/types" -) - -const ( - // initialSyncBuffer is the max time an initial sync will sleep - // before syncing. - initialSyncBuffer = 30 * time.Second - - // initialSyncDelay is the delay before an initial sync. - initialSyncDelay = 5 * time.Second - - // nomadServicePrefix is the first prefix that scopes all Nomad registered - // services - nomadServicePrefix = "_nomad" - - // The periodic time interval for syncing services and checks with Consul - defaultSyncInterval = 6 * time.Second - - // defaultSyncJitter provides a little variance in the frequency at which - // Syncer polls Consul. - defaultSyncJitter = time.Second - - // ttlCheckBuffer is the time interval that Nomad can take to report Consul - // the check result - ttlCheckBuffer = 31 * time.Second - - // DefaultQueryWaitDuration is the max duration the Consul Agent will - // spend waiting for a response from a Consul Query. - DefaultQueryWaitDuration = 2 * time.Second - - // ServiceTagHTTP is the tag assigned to HTTP services - ServiceTagHTTP = "http" - - // ServiceTagRPC is the tag assigned to RPC services - ServiceTagRPC = "rpc" - - // ServiceTagSerf is the tag assigned to Serf services - ServiceTagSerf = "serf" -) - -// consulServiceID and consulCheckID are the IDs registered with Consul -type consulServiceID string -type consulCheckID string - -// ServiceKey is the generated service key that is used to build the Consul -// ServiceID -type ServiceKey string - -// ServiceDomain is the domain of services registered by Nomad -type ServiceDomain string - -const ( - ClientDomain ServiceDomain = "client" - ServerDomain ServiceDomain = "server" -) - -// NewExecutorDomain returns a domain specific to the alloc ID and task -func NewExecutorDomain(allocID, task string) ServiceDomain { - return ServiceDomain(fmt.Sprintf("executor-%s-%s", allocID, task)) -} - -// Syncer allows syncing of services and checks with Consul -type Syncer struct { - client *consul.Client - consulAvailable bool - - // servicesGroups and checkGroups are named groups of services and checks - // respectively that will be flattened and reconciled with Consul when - // SyncServices() is called. The key to the servicesGroups map is unique - // per handler and is used to allow the Agent's services to be maintained - // independently of the Client or Server's services. - servicesGroups map[ServiceDomain]map[ServiceKey]*consul.AgentServiceRegistration - checkGroups map[ServiceDomain]map[ServiceKey][]*consul.AgentCheckRegistration - groupsLock sync.RWMutex - - // The "Consul Registry" is a collection of Consul Services and - // Checks all guarded by the registryLock. - registryLock sync.RWMutex - - // checkRunners are delegated Consul checks being ran by the Syncer - checkRunners map[consulCheckID]*CheckRunner - - addrFinder func(portLabel string) (string, int) - createDelegatedCheck func(*structs.ServiceCheck, string) (Check, error) - delegateChecks map[string]struct{} // delegateChecks are the checks that the Nomad client runs and reports to Consul - // End registryLock guarded attributes. - - logger *log.Logger - - shutdownCh chan struct{} - shutdown bool - shutdownLock sync.Mutex - - // notifyShutdownCh is used to notify a Syncer it needs to shutdown. - // This can happen because there was an explicit call to the Syncer's - // Shutdown() method, or because the calling task signaled the - // program is going to exit by closing its shutdownCh. - notifyShutdownCh chan struct{} - - // periodicCallbacks is walked sequentially when the timer in Run - // fires. - periodicCallbacks map[string]types.PeriodicCallback - notifySyncCh chan struct{} - periodicLock sync.RWMutex - - // The periodic time interval for syncing services and checks with Consul - syncInterval time.Duration - - // syncJitter provides a little variance in the frequency at which - // Syncer polls Consul. - syncJitter time.Duration -} - -// NewSyncer returns a new consul.Syncer -func NewSyncer(consulConfig *config.ConsulConfig, shutdownCh chan struct{}, logger *log.Logger) (*Syncer, error) { - var consulClientConfig *consul.Config - var err error - consulClientConfig, err = consulConfig.ApiConfig() - if err != nil { - return nil, err - } - - var consulClient *consul.Client - if consulClient, err = consul.NewClient(consulClientConfig); err != nil { - return nil, err - } - consulSyncer := Syncer{ - client: consulClient, - logger: logger, - consulAvailable: true, - shutdownCh: shutdownCh, - servicesGroups: make(map[ServiceDomain]map[ServiceKey]*consul.AgentServiceRegistration), - checkGroups: make(map[ServiceDomain]map[ServiceKey][]*consul.AgentCheckRegistration), - checkRunners: make(map[consulCheckID]*CheckRunner), - periodicCallbacks: make(map[string]types.PeriodicCallback), - notifySyncCh: make(chan struct{}, 1), - // default noop implementation of addrFinder - addrFinder: func(string) (string, int) { return "", 0 }, - syncInterval: defaultSyncInterval, - syncJitter: defaultSyncJitter, - } - - return &consulSyncer, nil -} - -// SetDelegatedChecks sets the checks that nomad is going to run and report the -// result back to consul -func (c *Syncer) SetDelegatedChecks(delegateChecks map[string]struct{}, createDelegatedCheckFn func(*structs.ServiceCheck, string) (Check, error)) *Syncer { - c.delegateChecks = delegateChecks - c.createDelegatedCheck = createDelegatedCheckFn - return c -} - -// SetAddrFinder sets a function to find the host and port for a Service given its port label -func (c *Syncer) SetAddrFinder(addrFinder func(string) (string, int)) *Syncer { - c.addrFinder = addrFinder - return c -} - -// GenerateServiceKey should be called to generate a serviceKey based on the -// Service. -func GenerateServiceKey(service *structs.Service) ServiceKey { - var key string - numTags := len(service.Tags) - switch numTags { - case 0: - key = fmt.Sprintf("%s", service.Name) - default: - tags := strings.Join(service.Tags, "-") - key = fmt.Sprintf("%s-%s", service.Name, tags) - } - return ServiceKey(key) -} - -// SetServices stores the map of Nomad Services to the provided service -// domain name. -func (c *Syncer) SetServices(domain ServiceDomain, services map[ServiceKey]*structs.Service) error { - var mErr multierror.Error - numServ := len(services) - registeredServices := make(map[ServiceKey]*consul.AgentServiceRegistration, numServ) - registeredChecks := make(map[ServiceKey][]*consul.AgentCheckRegistration, numServ) - for serviceKey, service := range services { - serviceReg, err := c.createService(service, domain, serviceKey) - if err != nil { - mErr.Errors = append(mErr.Errors, err) - continue - } - registeredServices[serviceKey] = serviceReg - - // Register the check(s) for this service - for _, chk := range service.Checks { - // Create a Consul check registration - chkReg, err := c.createCheckReg(chk, serviceReg) - if err != nil { - mErr.Errors = append(mErr.Errors, err) - continue - } - - // creating a nomad check if we have to handle this particular check type - c.registryLock.RLock() - if _, ok := c.delegateChecks[chk.Type]; ok { - _, ok := c.checkRunners[consulCheckID(chkReg.ID)] - c.registryLock.RUnlock() - if ok { - continue - } - - nc, err := c.createDelegatedCheck(chk, chkReg.ID) - if err != nil { - mErr.Errors = append(mErr.Errors, err) - continue - } - - cr := NewCheckRunner(nc, c.runCheck, c.logger) - c.registryLock.Lock() - // TODO type the CheckRunner - c.checkRunners[consulCheckID(nc.ID())] = cr - c.registryLock.Unlock() - } else { - c.registryLock.RUnlock() - } - - registeredChecks[serviceKey] = append(registeredChecks[serviceKey], chkReg) - } - } - - if len(mErr.Errors) > 0 { - return mErr.ErrorOrNil() - } - - // Update the services and checks groups for this domain - c.groupsLock.Lock() - - // Create map for service group if it doesn't exist - serviceKeys, ok := c.servicesGroups[domain] - if !ok { - serviceKeys = make(map[ServiceKey]*consul.AgentServiceRegistration, len(registeredServices)) - c.servicesGroups[domain] = serviceKeys - } - - // Remove stale services - for existingServiceKey := range serviceKeys { - if _, ok := registeredServices[existingServiceKey]; !ok { - // Exisitng service needs to be removed - delete(serviceKeys, existingServiceKey) - } - } - - // Add registered services - for serviceKey, service := range registeredServices { - serviceKeys[serviceKey] = service - } - - // Create map for check group if it doesn't exist - checkKeys, ok := c.checkGroups[domain] - if !ok { - checkKeys = make(map[ServiceKey][]*consul.AgentCheckRegistration, len(registeredChecks)) - c.checkGroups[domain] = checkKeys - } - - // Remove stale checks - for existingCheckKey := range checkKeys { - if _, ok := registeredChecks[existingCheckKey]; !ok { - // Exisitng check needs to be removed - delete(checkKeys, existingCheckKey) - } - } - - // Add registered checks - for checkKey, checks := range registeredChecks { - checkKeys[checkKey] = checks - } - c.groupsLock.Unlock() - - // Sync immediately - c.SyncNow() - - return nil -} - -// SyncNow expires the current timer forcing the list of periodic callbacks -// to be synced immediately. -func (c *Syncer) SyncNow() { - select { - case c.notifySyncCh <- struct{}{}: - default: - } -} - -// flattenedServices returns a flattened list of services that are registered -// locally -func (c *Syncer) flattenedServices() []*consul.AgentServiceRegistration { - const initialNumServices = 8 - services := make([]*consul.AgentServiceRegistration, 0, initialNumServices) - c.groupsLock.RLock() - defer c.groupsLock.RUnlock() - for _, servicesGroup := range c.servicesGroups { - for _, service := range servicesGroup { - services = append(services, service) - } - } - return services -} - -// flattenedChecks returns a flattened list of checks that are registered -// locally -func (c *Syncer) flattenedChecks() []*consul.AgentCheckRegistration { - const initialNumChecks = 8 - checks := make([]*consul.AgentCheckRegistration, 0, initialNumChecks) - c.groupsLock.RLock() - for _, checkGroup := range c.checkGroups { - for _, check := range checkGroup { - checks = append(checks, check...) - } - } - c.groupsLock.RUnlock() - return checks -} - -func (c *Syncer) signalShutdown() { - select { - case c.notifyShutdownCh <- struct{}{}: - default: - } -} - -// Shutdown de-registers the services and checks and shuts down periodic syncing -func (c *Syncer) Shutdown() error { - var mErr multierror.Error - - c.shutdownLock.Lock() - if !c.shutdown { - c.shutdown = true - } - c.shutdownLock.Unlock() - - c.signalShutdown() - - // Stop all the checks that nomad is running - c.registryLock.RLock() - defer c.registryLock.RUnlock() - for _, cr := range c.checkRunners { - cr.Stop() - } - - // De-register all the services registered by this syncer from Consul - services, err := c.queryAgentServices() - if err != nil { - mErr.Errors = append(mErr.Errors, err) - } - for serviceID := range services { - convertedID := string(serviceID) - if err := c.client.Agent().ServiceDeregister(convertedID); err != nil { - c.logger.Printf("[WARN] consul.syncer: failed to deregister service ID %+q: %v", convertedID, err) - mErr.Errors = append(mErr.Errors, err) - } - } - return mErr.ErrorOrNil() -} - -// queryChecks queries the Consul Agent for a list of Consul checks that -// have been registered with this Consul Syncer. -func (c *Syncer) queryChecks() (map[consulCheckID]*consul.AgentCheck, error) { - checks, err := c.client.Agent().Checks() - if err != nil { - return nil, err - } - return c.filterConsulChecks(checks), nil -} - -// queryAgentServices queries the Consul Agent for a list of Consul services that -// have been registered with this Consul Syncer. -func (c *Syncer) queryAgentServices() (map[consulServiceID]*consul.AgentService, error) { - services, err := c.client.Agent().Services() - if err != nil { - return nil, err - } - return c.filterConsulServices(services), nil -} - -// syncChecks synchronizes this Syncer's Consul Checks with the Consul Agent. -func (c *Syncer) syncChecks() error { - var mErr multierror.Error - consulChecks, err := c.queryChecks() - if err != nil { - return err - } - - // Synchronize checks with Consul - missingChecks, existingChecks, changedChecks, staleChecks := c.calcChecksDiff(consulChecks) - for _, check := range missingChecks { - if err := c.registerCheck(check); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - for _, check := range existingChecks { - c.ensureCheckRunning(check) - } - for _, check := range changedChecks { - // NOTE(sean@): Do we need to deregister the check before - // re-registering it? Not deregistering to avoid missing the - // TTL but doesn't correct reconcile any possible drift with - // the check. - // - // if err := c.deregisterCheck(check.ID); err != nil { - // mErr.Errors = append(mErr.Errors, err) - // } - if err := c.registerCheck(check); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - for _, check := range staleChecks { - if err := c.deregisterCheck(consulCheckID(check.ID)); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - return mErr.ErrorOrNil() -} - -// compareConsulCheck takes a consul.AgentCheckRegistration instance and -// compares it with a consul.AgentCheck. Returns true if they are equal -// according to consul.AgentCheck, otherwise false. -func compareConsulCheck(localCheck *consul.AgentCheckRegistration, consulCheck *consul.AgentCheck) bool { - if consulCheck.CheckID != localCheck.ID || - consulCheck.Name != localCheck.Name || - consulCheck.Notes != localCheck.Notes || - consulCheck.ServiceID != localCheck.ServiceID { - return false - } - return true -} - -// calcChecksDiff takes the argument (consulChecks) and calculates the delta -// between the consul.Syncer's list of known checks (c.flattenedChecks()). -// Four arrays are returned: -// -// 1) a slice of checks that exist only locally in the Syncer and are missing -// from the Consul Agent (consulChecks) and therefore need to be registered. -// -// 2) a slice of checks that exist in both the local consul.Syncer's -// tracked list and Consul Agent (consulChecks). -// -// 3) a slice of checks that exist in both the local consul.Syncer's -// tracked list and Consul Agent (consulServices) but have diverged state. -// -// 4) a slice of checks that exist only in the Consul Agent (consulChecks) -// and should be removed because the Consul Agent has drifted from the -// Syncer. -func (c *Syncer) calcChecksDiff(consulChecks map[consulCheckID]*consul.AgentCheck) ( - missingChecks []*consul.AgentCheckRegistration, - equalChecks []*consul.AgentCheckRegistration, - changedChecks []*consul.AgentCheckRegistration, - staleChecks []*consul.AgentCheckRegistration) { - - type mergedCheck struct { - check *consul.AgentCheckRegistration - // 'l' == Nomad local only - // 'e' == equal - // 'c' == changed - // 'a' == Consul agent only - state byte - } - var ( - localChecksCount = 0 - equalChecksCount = 0 - changedChecksCount = 0 - agentChecks = 0 - ) - flattenedChecks := c.flattenedChecks() - localChecks := make(map[string]*mergedCheck, len(flattenedChecks)+len(consulChecks)) - for _, localCheck := range flattenedChecks { - localChecksCount++ - localChecks[localCheck.ID] = &mergedCheck{localCheck, 'l'} - } - for _, consulCheck := range consulChecks { - if localCheck, found := localChecks[consulCheck.CheckID]; found { - localChecksCount-- - if compareConsulCheck(localCheck.check, consulCheck) { - equalChecksCount++ - localChecks[consulCheck.CheckID].state = 'e' - } else { - changedChecksCount++ - localChecks[consulCheck.CheckID].state = 'c' - } - } else { - agentChecks++ - agentCheckReg := &consul.AgentCheckRegistration{ - ID: consulCheck.CheckID, - Name: consulCheck.Name, - Notes: consulCheck.Notes, - ServiceID: consulCheck.ServiceID, - } - localChecks[consulCheck.CheckID] = &mergedCheck{agentCheckReg, 'a'} - } - } - - missingChecks = make([]*consul.AgentCheckRegistration, 0, localChecksCount) - equalChecks = make([]*consul.AgentCheckRegistration, 0, equalChecksCount) - changedChecks = make([]*consul.AgentCheckRegistration, 0, changedChecksCount) - staleChecks = make([]*consul.AgentCheckRegistration, 0, agentChecks) - for _, check := range localChecks { - switch check.state { - case 'l': - missingChecks = append(missingChecks, check.check) - case 'e': - equalChecks = append(equalChecks, check.check) - case 'c': - changedChecks = append(changedChecks, check.check) - case 'a': - staleChecks = append(staleChecks, check.check) - } - } - - return missingChecks, equalChecks, changedChecks, staleChecks -} - -// compareConsulService takes a consul.AgentServiceRegistration instance and -// compares it with a consul.AgentService. Returns true if they are equal -// according to consul.AgentService, otherwise false. -func compareConsulService(localService *consul.AgentServiceRegistration, consulService *consul.AgentService) bool { - if consulService.ID != localService.ID || - consulService.Service != localService.Name || - consulService.Port != localService.Port || - consulService.Address != localService.Address || - consulService.EnableTagOverride != localService.EnableTagOverride { - return false - } - - serviceTags := make(map[string]byte, len(localService.Tags)) - for _, tag := range localService.Tags { - serviceTags[tag] = 'l' - } - for _, tag := range consulService.Tags { - if _, found := serviceTags[tag]; !found { - return false - } - serviceTags[tag] = 'b' - } - for _, state := range serviceTags { - if state == 'l' { - return false - } - } - - return true -} - -// calcServicesDiff takes the argument (consulServices) and calculates the -// delta between the consul.Syncer's list of known services -// (c.flattenedServices()). Four arrays are returned: -// -// 1) a slice of services that exist only locally in the Syncer and are -// missing from the Consul Agent (consulServices) and therefore need to be -// registered. -// -// 2) a slice of services that exist in both the local consul.Syncer's -// tracked list and Consul Agent (consulServices) *AND* are identical. -// -// 3) a slice of services that exist in both the local consul.Syncer's -// tracked list and Consul Agent (consulServices) but have diverged state. -// -// 4) a slice of services that exist only in the Consul Agent -// (consulServices) and should be removed because the Consul Agent has -// drifted from the Syncer. -func (c *Syncer) calcServicesDiff(consulServices map[consulServiceID]*consul.AgentService) (missingServices []*consul.AgentServiceRegistration, equalServices []*consul.AgentServiceRegistration, changedServices []*consul.AgentServiceRegistration, staleServices []*consul.AgentServiceRegistration) { - type mergedService struct { - service *consul.AgentServiceRegistration - // 'l' == Nomad local only - // 'e' == equal - // 'c' == changed - // 'a' == Consul agent only - state byte - } - var ( - localServicesCount = 0 - equalServicesCount = 0 - changedServicesCount = 0 - agentServices = 0 - ) - flattenedServices := c.flattenedServices() - localServices := make(map[string]*mergedService, len(flattenedServices)+len(consulServices)) - for _, localService := range flattenedServices { - localServicesCount++ - localServices[localService.ID] = &mergedService{localService, 'l'} - } - for _, consulService := range consulServices { - if localService, found := localServices[consulService.ID]; found { - localServicesCount-- - if compareConsulService(localService.service, consulService) { - equalServicesCount++ - localServices[consulService.ID].state = 'e' - } else { - changedServicesCount++ - localServices[consulService.ID].state = 'c' - } - } else { - agentServices++ - agentServiceReg := &consul.AgentServiceRegistration{ - ID: consulService.ID, - Name: consulService.Service, - Tags: consulService.Tags, - Port: consulService.Port, - Address: consulService.Address, - } - localServices[consulService.ID] = &mergedService{agentServiceReg, 'a'} - } - } - - missingServices = make([]*consul.AgentServiceRegistration, 0, localServicesCount) - equalServices = make([]*consul.AgentServiceRegistration, 0, equalServicesCount) - changedServices = make([]*consul.AgentServiceRegistration, 0, changedServicesCount) - staleServices = make([]*consul.AgentServiceRegistration, 0, agentServices) - for _, service := range localServices { - switch service.state { - case 'l': - missingServices = append(missingServices, service.service) - case 'e': - equalServices = append(equalServices, service.service) - case 'c': - changedServices = append(changedServices, service.service) - case 'a': - staleServices = append(staleServices, service.service) - } - } - - return missingServices, equalServices, changedServices, staleServices -} - -// syncServices synchronizes this Syncer's Consul Services with the Consul -// Agent. -func (c *Syncer) syncServices() error { - consulServices, err := c.queryAgentServices() - if err != nil { - return err - } - - // Synchronize services with Consul - var mErr multierror.Error - missingServices, _, changedServices, removedServices := c.calcServicesDiff(consulServices) - for _, service := range missingServices { - if err := c.client.Agent().ServiceRegister(service); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - for _, service := range changedServices { - // Re-register the local service - if err := c.client.Agent().ServiceRegister(service); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - for _, service := range removedServices { - if err := c.deregisterService(service.ID); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - return mErr.ErrorOrNil() -} - -// registerCheck registers a check definition with Consul -func (c *Syncer) registerCheck(chkReg *consul.AgentCheckRegistration) error { - c.registryLock.RLock() - if cr, ok := c.checkRunners[consulCheckID(chkReg.ID)]; ok { - cr.Start() - } - c.registryLock.RUnlock() - return c.client.Agent().CheckRegister(chkReg) -} - -// ensureCheckRunning starts the check runner for a check if it's not already running -func (c *Syncer) ensureCheckRunning(chk *consul.AgentCheckRegistration) { - c.registryLock.RLock() - defer c.registryLock.RUnlock() - if cr, ok := c.checkRunners[consulCheckID(chk.ID)]; ok && !cr.Started() { - c.logger.Printf("[DEBUG] consul.syncer: starting runner for existing check. %v", chk.ID) - cr.Start() - } -} - -// createCheckReg creates a Check that can be registered with Nomad. It also -// creates a Nomad check for the check types that it can handle. -func (c *Syncer) createCheckReg(check *structs.ServiceCheck, serviceReg *consul.AgentServiceRegistration) (*consul.AgentCheckRegistration, error) { - chkReg := consul.AgentCheckRegistration{ - ID: check.Hash(serviceReg.ID), - Name: check.Name, - ServiceID: serviceReg.ID, - } - chkReg.Timeout = check.Timeout.String() - chkReg.Interval = check.Interval.String() - host, port := serviceReg.Address, serviceReg.Port - if check.PortLabel != "" { - host, port = c.addrFinder(check.PortLabel) - } - switch check.Type { - case structs.ServiceCheckHTTP: - if check.Protocol == "" { - check.Protocol = "http" - } - base := url.URL{ - Scheme: check.Protocol, - Host: net.JoinHostPort(host, strconv.Itoa(port)), - } - relative, err := url.Parse(check.Path) - if err != nil { - return nil, err - } - url := base.ResolveReference(relative) - chkReg.HTTP = url.String() - case structs.ServiceCheckTCP: - chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port)) - case structs.ServiceCheckScript: - chkReg.TTL = (check.Interval + ttlCheckBuffer).String() - default: - return nil, fmt.Errorf("check type %+q not valid", check.Type) - } - chkReg.Status = check.InitialStatus - return &chkReg, nil -} - -// generateConsulServiceID takes the domain and service key and returns a Consul -// ServiceID -func generateConsulServiceID(domain ServiceDomain, key ServiceKey) consulServiceID { - return consulServiceID(fmt.Sprintf("%s-%s-%s", nomadServicePrefix, domain, key)) -} - -// createService creates a Consul AgentService from a Nomad ConsulService. -func (c *Syncer) createService(service *structs.Service, domain ServiceDomain, key ServiceKey) (*consul.AgentServiceRegistration, error) { - c.registryLock.RLock() - defer c.registryLock.RUnlock() - - srv := consul.AgentServiceRegistration{ - ID: string(generateConsulServiceID(domain, key)), - Name: service.Name, - Tags: service.Tags, - } - host, port := c.addrFinder(service.PortLabel) - if host != "" { - srv.Address = host - } - - if port != 0 { - srv.Port = port - } - - return &srv, nil -} - -// deregisterService de-registers a service with the given ID from consul -func (c *Syncer) deregisterService(serviceID string) error { - return c.client.Agent().ServiceDeregister(serviceID) -} - -// deregisterCheck de-registers a check from Consul -func (c *Syncer) deregisterCheck(id consulCheckID) error { - c.registryLock.Lock() - defer c.registryLock.Unlock() - - // Deleting from Consul Agent - if err := c.client.Agent().CheckDeregister(string(id)); err != nil { - // CheckDeregister() will be reattempted again in a future - // sync. - return err - } - - // Remove the check from the local registry - if cr, ok := c.checkRunners[id]; ok { - cr.Stop() - delete(c.checkRunners, id) - } - - return nil -} - -// Run triggers periodic syncing of services and checks with Consul. This is -// a long lived go-routine which is stopped during shutdown. -func (c *Syncer) Run() { - sync := time.NewTimer(0) - for { - select { - case <-sync.C: - d := c.syncInterval - c.syncJitter - sync.Reset(d) - - if err := c.SyncServices(); err != nil { - if c.consulAvailable { - c.logger.Printf("[DEBUG] consul.syncer: error in syncing: %v", err) - } - c.consulAvailable = false - } else { - if !c.consulAvailable { - c.logger.Printf("[DEBUG] consul.syncer: syncs succesful") - } - c.consulAvailable = true - } - case <-c.notifySyncCh: - sync.Reset(0) - case <-c.shutdownCh: - c.Shutdown() - case <-c.notifyShutdownCh: - sync.Stop() - c.logger.Printf("[INFO] consul.syncer: shutting down syncer ") - return - } - } -} - -// RunHandlers executes each handler (randomly) -func (c *Syncer) RunHandlers() error { - c.periodicLock.RLock() - handlers := make(map[string]types.PeriodicCallback, len(c.periodicCallbacks)) - for name, fn := range c.periodicCallbacks { - handlers[name] = fn - } - c.periodicLock.RUnlock() - - var mErr multierror.Error - for _, fn := range handlers { - if err := fn(); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - return mErr.ErrorOrNil() -} - -// SyncServices sync the services with the Consul Agent -func (c *Syncer) SyncServices() error { - var mErr multierror.Error - if err := c.syncServices(); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - if err := c.syncChecks(); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - if err := c.RunHandlers(); err != nil { - return err - } - return mErr.ErrorOrNil() -} - -// filterConsulServices prunes out all the service who were not registered with -// the syncer -func (c *Syncer) filterConsulServices(consulServices map[string]*consul.AgentService) map[consulServiceID]*consul.AgentService { - localServices := make(map[consulServiceID]*consul.AgentService, len(consulServices)) - c.groupsLock.RLock() - defer c.groupsLock.RUnlock() - for serviceID, service := range consulServices { - for domain := range c.servicesGroups { - if strings.HasPrefix(service.ID, fmt.Sprintf("%s-%s", nomadServicePrefix, domain)) { - localServices[consulServiceID(serviceID)] = service - break - } - } - } - return localServices -} - -// filterConsulChecks prunes out all the consul checks which do not have -// services with Syncer's idPrefix. -func (c *Syncer) filterConsulChecks(consulChecks map[string]*consul.AgentCheck) map[consulCheckID]*consul.AgentCheck { - localChecks := make(map[consulCheckID]*consul.AgentCheck, len(consulChecks)) - c.groupsLock.RLock() - defer c.groupsLock.RUnlock() - for checkID, check := range consulChecks { - for domain := range c.checkGroups { - if strings.HasPrefix(check.ServiceID, fmt.Sprintf("%s-%s", nomadServicePrefix, domain)) { - localChecks[consulCheckID(checkID)] = check - break - } - } - } - return localChecks -} - -// consulPresent indicates whether the Consul Agent is responding -func (c *Syncer) consulPresent() bool { - _, err := c.client.Agent().Self() - return err == nil -} - -// runCheck runs a check and updates the corresponding ttl check in consul -func (c *Syncer) runCheck(check Check) { - res := check.Run() - if res.Duration >= check.Timeout() { - c.logger.Printf("[DEBUG] consul.syncer: check took time: %v, timeout: %v", res.Duration, check.Timeout()) - } - state := consul.HealthCritical - output := res.Output - switch res.ExitCode { - case 0: - state = consul.HealthPassing - case 1: - state = consul.HealthWarning - default: - state = consul.HealthCritical - } - if res.Err != nil { - state = consul.HealthCritical - output = res.Err.Error() - } - if err := c.client.Agent().UpdateTTL(check.ID(), output, state); err != nil { - if c.consulAvailable { - c.logger.Printf("[DEBUG] consul.syncer: check %+q failed, disabling Consul checks until until next successful sync: %v", check.ID(), err) - c.consulAvailable = false - } else { - c.consulAvailable = true - } - } -} - -// ReapUnmatched prunes all services that do not exist in the passed domains -func (c *Syncer) ReapUnmatched(domains []ServiceDomain) error { - servicesInConsul, err := c.ConsulClient().Agent().Services() - if err != nil { - return err - } - - var mErr multierror.Error - for serviceID := range servicesInConsul { - // Skip any service that was not registered by Nomad - if !strings.HasPrefix(serviceID, nomadServicePrefix) { - continue - } - - // Filter services that do not exist in the desired domains - match := false - for _, domain := range domains { - // Include the hyphen so it is explicit to that domain otherwise it - // maybe a subset match - desired := fmt.Sprintf("%s-%s-", nomadServicePrefix, domain) - if strings.HasPrefix(serviceID, desired) { - match = true - break - } - } - - if !match { - if err := c.deregisterService(serviceID); err != nil { - mErr.Errors = append(mErr.Errors, err) - } - } - } - - return mErr.ErrorOrNil() -} - -// AddPeriodicHandler adds a uniquely named callback. Returns true if -// successful, false if a handler with the same name already exists. -func (c *Syncer) AddPeriodicHandler(name string, fn types.PeriodicCallback) bool { - c.periodicLock.Lock() - defer c.periodicLock.Unlock() - if _, found := c.periodicCallbacks[name]; found { - c.logger.Printf("[ERROR] consul.syncer: failed adding handler %+q", name) - return false - } - c.periodicCallbacks[name] = fn - return true -} - -// NumHandlers returns the number of callbacks registered with the syncer -func (c *Syncer) NumHandlers() int { - c.periodicLock.RLock() - defer c.periodicLock.RUnlock() - return len(c.periodicCallbacks) -} - -// RemovePeriodicHandler removes a handler with a given name. -func (c *Syncer) RemovePeriodicHandler(name string) { - c.periodicLock.Lock() - defer c.periodicLock.Unlock() - delete(c.periodicCallbacks, name) -} - -// ConsulClient returns the Consul client used by the Syncer. -func (c *Syncer) ConsulClient() *consul.Client { - return c.client -} diff --git a/command/agent/consul/syncer_test.go b/command/agent/consul/syncer_test.go deleted file mode 100644 index 42879ca7783..00000000000 --- a/command/agent/consul/syncer_test.go +++ /dev/null @@ -1,358 +0,0 @@ -package consul - -import ( - "io/ioutil" - "log" - "net" - "os" - "reflect" - "testing" - "time" - - "github.com/hashicorp/consul/api" - "github.com/hashicorp/consul/testutil" - "github.com/hashicorp/nomad/nomad/structs" - "github.com/hashicorp/nomad/nomad/structs/config" -) - -const ( - allocID = "12" - serviceRegPrefix = "test" - serviceGroupName = "executor" -) - -var logger = log.New(os.Stdout, "", log.LstdFlags) - -func TestSyncNow(t *testing.T) { - cs, testconsul := testConsul(t) - defer cs.Shutdown() - defer testconsul.Stop() - - cs.SetAddrFinder(func(h string) (string, int) { - a, pstr, _ := net.SplitHostPort(h) - p, _ := net.LookupPort("tcp", pstr) - return a, p - }) - cs.syncInterval = 9000 * time.Hour - - service := &structs.Service{Name: "foo1", Tags: []string{"a", "b"}} - services := map[ServiceKey]*structs.Service{ - GenerateServiceKey(service): service, - } - - // Run syncs once on startup and then blocks forever - go cs.Run() - - if err := cs.SetServices(serviceGroupName, services); err != nil { - t.Fatalf("error setting services: %v", err) - } - - synced := false - for i := 0; !synced && i < 10; i++ { - time.Sleep(250 * time.Millisecond) - agentServices, err := cs.queryAgentServices() - if err != nil { - t.Fatalf("error querying consul services: %v", err) - } - synced = len(agentServices) == 1 - } - if !synced { - t.Fatalf("initial sync never occurred") - } - - // SetServices again should cause another sync - service1 := &structs.Service{Name: "foo1", Tags: []string{"Y", "Z"}} - service2 := &structs.Service{Name: "bar"} - services = map[ServiceKey]*structs.Service{ - GenerateServiceKey(service1): service1, - GenerateServiceKey(service2): service2, - } - - if err := cs.SetServices(serviceGroupName, services); err != nil { - t.Fatalf("error setting services: %v", err) - } - - synced = false - for i := 0; !synced && i < 10; i++ { - time.Sleep(250 * time.Millisecond) - agentServices, err := cs.queryAgentServices() - if err != nil { - t.Fatalf("error querying consul services: %v", err) - } - synced = len(agentServices) == 2 - } - if !synced { - t.Fatalf("SetServices didn't sync immediately") - } -} - -func TestCheckRegistration(t *testing.T) { - cs, err := NewSyncer(config.DefaultConsulConfig(), make(chan struct{}), logger) - if err != nil { - t.Fatalf("Err: %v", err) - } - - check1 := structs.ServiceCheck{ - Name: "check-foo-1", - Type: structs.ServiceCheckTCP, - Interval: 30 * time.Second, - Timeout: 5 * time.Second, - InitialStatus: api.HealthPassing, - } - check2 := structs.ServiceCheck{ - Name: "check1", - Type: "tcp", - PortLabel: "port2", - Interval: 3 * time.Second, - Timeout: 1 * time.Second, - } - check3 := structs.ServiceCheck{ - Name: "check3", - Type: "http", - PortLabel: "port3", - Path: "/health?p1=1&p2=2", - Interval: 3 * time.Second, - Timeout: 1 * time.Second, - } - service1 := structs.Service{ - Name: "foo-1", - Tags: []string{"tag1", "tag2"}, - PortLabel: "port1", - Checks: []*structs.ServiceCheck{ - &check1, &check2, - }, - } - task := structs.Task{ - Name: "foo", - Services: []*structs.Service{&service1}, - Resources: &structs.Resources{ - Networks: []*structs.NetworkResource{ - &structs.NetworkResource{ - IP: "10.10.11.5", - DynamicPorts: []structs.Port{ - structs.Port{ - Label: "port1", - Value: 20002, - }, - structs.Port{ - Label: "port2", - Value: 20003, - }, - structs.Port{ - Label: "port3", - Value: 20004, - }, - }, - }, - }, - }, - } - cs.SetAddrFinder(task.FindHostAndPortFor) - srvReg, _ := cs.createService(&service1, "domain", "key") - check1Reg, _ := cs.createCheckReg(&check1, srvReg) - check2Reg, _ := cs.createCheckReg(&check2, srvReg) - check3Reg, _ := cs.createCheckReg(&check3, srvReg) - - expected := "10.10.11.5:20002" - if check1Reg.TCP != expected { - t.Fatalf("expected: %v, actual: %v", expected, check1Reg.TCP) - } - - expected = "10.10.11.5:20003" - if check2Reg.TCP != expected { - t.Fatalf("expected: %v, actual: %v", expected, check2Reg.TCP) - } - - expected = "http://10.10.11.5:20004/health?p1=1&p2=2" - if check3Reg.HTTP != expected { - t.Fatalf("expected: %v, actual: %v", expected, check3Reg.HTTP) - } - - expected = api.HealthPassing - if check1Reg.Status != expected { - t.Fatalf("expected: %v, actual: %v", expected, check1Reg.Status) - } -} - -// testConsul returns a Syncer configured with an embedded Consul server. -// -// Callers must defer Syncer.Shutdown() and TestServer.Stop() -// -func testConsul(t *testing.T) (*Syncer, *testutil.TestServer) { - // Create an embedded Consul server - testconsul := testutil.NewTestServerConfig(t, func(c *testutil.TestServerConfig) { - // If -v wasn't specified squelch consul logging - if !testing.Verbose() { - c.Stdout = ioutil.Discard - c.Stderr = ioutil.Discard - } - }) - - // Configure Syncer to talk to the test server - cconf := config.DefaultConsulConfig() - cconf.Addr = testconsul.HTTPAddr - - cs, err := NewSyncer(cconf, nil, logger) - if err != nil { - t.Fatalf("Error creating Syncer: %v", err) - } - return cs, testconsul -} - -func TestConsulServiceRegisterServices(t *testing.T) { - cs, testconsul := testConsul(t) - defer cs.Shutdown() - defer testconsul.Stop() - - service1 := &structs.Service{Name: "foo", Tags: []string{"a", "b"}} - service2 := &structs.Service{Name: "foo"} - services := map[ServiceKey]*structs.Service{ - GenerateServiceKey(service1): service1, - GenerateServiceKey(service2): service2, - } - - // Call SetServices to update services in consul - if err := cs.SetServices(serviceGroupName, services); err != nil { - t.Fatalf("error setting services: %v", err) - } - - // Manually call SyncServers to cause a synchronous consul update - if err := cs.SyncServices(); err != nil { - t.Fatalf("error syncing services: %v", err) - } - - numservices := len(cs.flattenedServices()) - if numservices != 2 { - t.Fatalf("expected 2 services but found %d", numservices) - } - - numchecks := len(cs.flattenedChecks()) - if numchecks != 0 { - t.Fatalf("expected 0 checks but found %d", numchecks) - } - - // Assert services are in consul - agentServices, err := cs.client.Agent().Services() - if err != nil { - t.Fatalf("error querying consul services: %v", err) - } - found := 0 - for id, as := range agentServices { - if id == "consul" { - found++ - continue - } - if _, ok := services[ServiceKey(as.Service)]; ok { - found++ - continue - } - t.Errorf("unexpected service in consul: %s", id) - } - if found != 3 { - t.Fatalf("expected 3 services in consul but found %d:\nconsul: %#v", len(agentServices), agentServices) - } - - agentChecks, err := cs.queryChecks() - if err != nil { - t.Fatalf("error querying consul checks: %v", err) - } - if len(agentChecks) != numchecks { - t.Fatalf("expected %d checks in consul but found %d:\n%#v", numservices, len(agentChecks), agentChecks) - } -} - -func TestConsulServiceUpdateService(t *testing.T) { - cs, testconsul := testConsul(t) - defer cs.Shutdown() - defer testconsul.Stop() - - cs.SetAddrFinder(func(h string) (string, int) { - a, pstr, _ := net.SplitHostPort(h) - p, _ := net.LookupPort("tcp", pstr) - return a, p - }) - - service1 := &structs.Service{Name: "foo1", Tags: []string{"a", "b"}} - service2 := &structs.Service{Name: "foo2"} - services := map[ServiceKey]*structs.Service{ - GenerateServiceKey(service1): service1, - GenerateServiceKey(service2): service2, - } - if err := cs.SetServices(serviceGroupName, services); err != nil { - t.Fatalf("error setting services: %v", err) - } - if err := cs.SyncServices(); err != nil { - t.Fatalf("error syncing services: %v", err) - } - - // Now update both services - service1 = &structs.Service{Name: "foo1", Tags: []string{"a", "z"}} - service2 = &structs.Service{Name: "foo2", PortLabel: ":8899"} - service3 := &structs.Service{Name: "foo3"} - services = map[ServiceKey]*structs.Service{ - GenerateServiceKey(service1): service1, - GenerateServiceKey(service2): service2, - GenerateServiceKey(service3): service3, - } - if err := cs.SetServices(serviceGroupName, services); err != nil { - t.Fatalf("error setting services: %v", err) - } - if err := cs.SyncServices(); err != nil { - t.Fatalf("error syncing services: %v", err) - } - - agentServices, err := cs.queryAgentServices() - if err != nil { - t.Fatalf("error querying consul services: %v", err) - } - if len(agentServices) != 3 { - t.Fatalf("expected 3 services in consul but found %d:\n%#v", len(agentServices), agentServices) - } - consulServices := make(map[string]*api.AgentService, 3) - for _, as := range agentServices { - consulServices[as.ID] = as - } - - found := 0 - for _, s := range cs.flattenedServices() { - // Assert sure changes were applied to internal state - switch s.Name { - case "foo1": - found++ - if !reflect.DeepEqual(service1.Tags, s.Tags) { - t.Errorf("incorrect tags on foo1:\n expected: %v\n found: %v", service1.Tags, s.Tags) - } - case "foo2": - found++ - if s.Address != "" { - t.Errorf("expected empty host on foo2 but found %q", s.Address) - } - if s.Port != 8899 { - t.Errorf("expected port 8899 on foo2 but found %d", s.Port) - } - case "foo3": - found++ - default: - t.Errorf("unexpected service: %s", s.Name) - } - - // Assert internal state equals consul's state - cs, ok := consulServices[s.ID] - if !ok { - t.Errorf("service not in consul: %s id: %s", s.Name, s.ID) - continue - } - if !reflect.DeepEqual(s.Tags, cs.Tags) { - t.Errorf("mismatched tags in syncer state and consul for %s:\nsyncer: %v\nconsul: %v", s.Name, s.Tags, cs.Tags) - } - if cs.Port != s.Port { - t.Errorf("mismatched port in syncer state and consul for %s\nsyncer: %v\nconsul: %v", s.Name, s.Port, cs.Port) - } - if cs.Address != s.Address { - t.Errorf("mismatched address in syncer state and consul for %s\nsyncer: %v\nconsul: %v", s.Name, s.Address, cs.Address) - } - } - if found != 3 { - t.Fatalf("expected 3 services locally but found %d", found) - } -} diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go new file mode 100644 index 00000000000..59ea83d7b12 --- /dev/null +++ b/command/agent/consul/unit_test.go @@ -0,0 +1,497 @@ +package consul + +import ( + "context" + "fmt" + "io/ioutil" + "log" + "os" + "reflect" + "sync" + "testing" + "time" + + "github.com/hashicorp/consul/api" + "github.com/hashicorp/nomad/nomad/structs" +) + +func testLogger() *log.Logger { + if testing.Verbose() { + return log.New(os.Stderr, "", log.LstdFlags) + } + return log.New(ioutil.Discard, "", 0) +} + +func testTask() *structs.Task { + return &structs.Task{ + Name: "taskname", + Resources: &structs.Resources{ + Networks: []*structs.NetworkResource{ + { + DynamicPorts: []structs.Port{{Label: "x", Value: 1234}}, + }, + }, + }, + Services: []*structs.Service{ + { + Name: "taskname-service", + PortLabel: "x", + Tags: []string{"tag1", "tag2"}, + }, + }, + } +} + +// testFakeCtx contains a fake Consul AgentAPI and implements the Exec +// interface to allow testing without running Consul. +type testFakeCtx struct { + ServiceClient *ServiceClient + FakeConsul *fakeConsul + Task *structs.Task + + ExecFunc func(ctx context.Context, cmd string, args []string) ([]byte, int, error) +} + +// Exec implements the ScriptExecutor interface and will use an alternate +// implementation t.ExecFunc if non-nil. +func (t *testFakeCtx) Exec(ctx context.Context, cmd string, args []string) ([]byte, int, error) { + if t.ExecFunc == nil { + // Default impl is just "ok" + return []byte("ok"), 0, nil + } + return t.ExecFunc(ctx, cmd, args) +} + +// setupFake creates a testFakeCtx with a ServiceClient backed by a fakeConsul. +// A test Task is also provided. +func setupFake() *testFakeCtx { + fc := &fakeConsul{ + services: make(map[string]*api.AgentServiceRegistration), + checks: make(map[string]*api.AgentCheckRegistration), + checkTTLs: make(map[string]int), + } + return &testFakeCtx{ + ServiceClient: NewServiceClient(fc, testLogger()), + FakeConsul: fc, + Task: testTask(), + } +} + +// fakeConsul is a fake in-memory Consul backend for ServiceClient. +type fakeConsul struct { + // maps of what services and checks have been registered + services map[string]*api.AgentServiceRegistration + checks map[string]*api.AgentCheckRegistration + mu sync.Mutex + + // when UpdateTTL is called the check ID will have its counter inc'd + checkTTLs map[string]int +} + +func (c *fakeConsul) CheckRegister(check *api.AgentCheckRegistration) error { + c.mu.Lock() + defer c.mu.Unlock() + c.checks[check.ID] = check + return nil +} + +func (c *fakeConsul) CheckDeregister(checkID string) error { + c.mu.Lock() + defer c.mu.Unlock() + delete(c.checks, checkID) + delete(c.checkTTLs, checkID) + return nil +} + +func (c *fakeConsul) ServiceRegister(service *api.AgentServiceRegistration) error { + c.mu.Lock() + defer c.mu.Unlock() + c.services[service.ID] = service + return nil +} + +func (c *fakeConsul) ServiceDeregister(serviceID string) error { + c.mu.Lock() + defer c.mu.Unlock() + delete(c.services, serviceID) + return nil +} + +func (c *fakeConsul) UpdateTTL(id string, output string, status string) error { + c.mu.Lock() + defer c.mu.Unlock() + check, ok := c.checks[id] + if !ok { + return fmt.Errorf("unknown check id: %q", id) + } + // Flip initial status to passing + check.Status = "passing" + c.checkTTLs[id]++ + return nil +} + +func TestConsul_ChangeTags(t *testing.T) { + ctx := setupFake() + + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, nil); err != nil { + t.Fatalf("unexpected error registering task: %v", err) + } + + // Manually call sync() since Run() isn't running + if err := ctx.ServiceClient.sync(); err != nil { + t.Fatalf("unexpected error syncing task: %v", err) + } + + if n := len(ctx.FakeConsul.services); n != 1 { + t.Fatalf("expected 1 service but found %d:\n%#v", n, ctx.FakeConsul.services) + } + + origKey := "" + for k, v := range ctx.FakeConsul.services { + origKey = k + if v.Name != ctx.Task.Services[0].Name { + t.Errorf("expected Name=%q != %q", ctx.Task.Services[0].Name, v.Name) + } + if !reflect.DeepEqual(v.Tags, ctx.Task.Services[0].Tags) { + t.Errorf("expected Tags=%v != %v", ctx.Task.Services[0].Tags, v.Tags) + } + } + + // Changing a tag removes old entry before adding new one + ctx.ServiceClient.RemoveTask("allocid", ctx.Task) + ctx.Task.Services[0].Tags[0] = "newtag" + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, nil); err != nil { + t.Fatalf("unexpected error registering task: %v", err) + } + if err := ctx.ServiceClient.sync(); err != nil { + t.Fatalf("unexpected error syncing task: %v", err) + } + + if n := len(ctx.FakeConsul.services); n != 1 { + t.Fatalf("expected 1 service but found %d:\n%#v", n, ctx.FakeConsul.services) + } + + for k, v := range ctx.FakeConsul.services { + if k == origKey { + t.Errorf("expected key to change but found %q", k) + } + if v.Name != ctx.Task.Services[0].Name { + t.Errorf("expected Name=%q != %q", ctx.Task.Services[0].Name, v.Name) + } + if !reflect.DeepEqual(v.Tags, ctx.Task.Services[0].Tags) { + t.Errorf("expected Tags=%v != %v", ctx.Task.Services[0].Tags, v.Tags) + } + } +} + +// TestConsul_RegServices tests basic service registration. +func TestConsul_RegServices(t *testing.T) { + ctx := setupFake() + port := ctx.Task.Resources.Networks[0].DynamicPorts[0].Value + + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, nil); err != nil { + t.Fatalf("unexpected error registering task: %v", err) + } + + // Manually call sync() since Run() isn't running + if err := ctx.ServiceClient.sync(); err != nil { + t.Fatalf("unexpected error syncing task: %v", err) + } + + if n := len(ctx.FakeConsul.services); n != 1 { + t.Fatalf("expected 1 service but found %d:\n%#v", n, ctx.FakeConsul.services) + } + for _, v := range ctx.FakeConsul.services { + if v.Name != ctx.Task.Services[0].Name { + t.Errorf("expected Name=%q != %q", ctx.Task.Services[0].Name, v.Name) + } + if !reflect.DeepEqual(v.Tags, ctx.Task.Services[0].Tags) { + t.Errorf("expected Tags=%v != %v", ctx.Task.Services[0].Tags, v.Tags) + } + if v.Port != port { + t.Errorf("expected Port=%d != %d", port, v.Port) + } + } + + // Make a change which will register a new service + ctx.Task.Services[0].Name = "taskname-service2" + ctx.Task.Services[0].Tags[0] = "tag3" + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, nil); err != nil { + t.Fatalf("unpexpected error registering task: %v", err) + } + + // Make sure changes don't take affect until sync() is called (since + // Run() isn't running) + if n := len(ctx.FakeConsul.services); n != 1 { + t.Fatalf("expected 1 service but found %d:\n%#v", n, ctx.FakeConsul.services) + } + for _, v := range ctx.FakeConsul.services { + if reflect.DeepEqual(v.Tags, ctx.Task.Services[0].Tags) { + t.Errorf("expected Tags to differ, changes applied before sync()") + } + } + + // Now sync() and re-check for the applied updates + if err := ctx.ServiceClient.sync(); err != nil { + t.Fatalf("unexpected error syncing task: %v", err) + } + if n := len(ctx.FakeConsul.services); n != 2 { + t.Fatalf("expected 2 services but found %d:\n%#v", n, ctx.FakeConsul.services) + } + found := false + for _, v := range ctx.FakeConsul.services { + if v.Name == ctx.Task.Services[0].Name { + if found { + t.Fatalf("found new service name %q twice", v.Name) + } + found = true + if !reflect.DeepEqual(v.Tags, ctx.Task.Services[0].Tags) { + t.Errorf("expected Tags=%v != %v", ctx.Task.Services[0].Tags, v.Tags) + } + } + } + if !found { + t.Fatalf("did not find new service %q", ctx.Task.Services[0].Name) + } + + // Remove the new task + ctx.ServiceClient.RemoveTask("allocid", ctx.Task) + if err := ctx.ServiceClient.sync(); err != nil { + t.Fatalf("unexpected error syncing task: %v", err) + } + if n := len(ctx.FakeConsul.services); n != 1 { + t.Fatalf("expected 1 service but found %d:\n%#v", n, ctx.FakeConsul.services) + } + for _, v := range ctx.FakeConsul.services { + if v.Name != "taskname-service" { + t.Errorf("expected original task to survive not %q", v.Name) + } + } +} + +// TestConsul_ShutdownOK tests the ok path for the shutdown logic in +// ServiceClient. +func TestConsul_ShutdownOK(t *testing.T) { + ctx := setupFake() + + // Add a script check to make sure its TTL gets updated + ctx.Task.Services[0].Checks = []*structs.ServiceCheck{ + { + Name: "scriptcheck", + Type: "script", + Command: "true", + // Make check block until shutdown + Interval: 9000 * time.Hour, + Timeout: 10 * time.Second, + InitialStatus: "warning", + }, + } + + hasShutdown := make(chan struct{}) + go func() { + ctx.ServiceClient.Run() + close(hasShutdown) + }() + + // Register a task and agent + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx); err != nil { + t.Fatalf("unexpected error registering task: %v", err) + } + + agentServices := []*structs.Service{ + { + Name: "http", + Tags: []string{"nomad"}, + PortLabel: "localhost:2345", + }, + } + if err := ctx.ServiceClient.RegisterAgent("client", agentServices); err != nil { + t.Fatalf("unexpected error registering agent: %v", err) + } + + // Shutdown should block until all enqueued operations finish. + if err := ctx.ServiceClient.Shutdown(); err != nil { + t.Errorf("unexpected error shutting down client: %v", err) + } + + // assert Run() exits in a timely fashion after Shutdown() exits + select { + case <-hasShutdown: + // ok! Run() exited as expected + case <-time.After(10 * time.Second): + t.Fatalf("expected Run() to exit, but it did not") + } + + // Nothing should be enqueued anymore + enqueued := (len(ctx.ServiceClient.regServices) + len(ctx.ServiceClient.deregServices) + + len(ctx.ServiceClient.regChecks) + len(ctx.ServiceClient.deregChecks)) + if enqueued > 0 { + t.Errorf("%d operations still enqueued", enqueued) + } + + // UpdateTTL should have been called once for the script check + if n := len(ctx.FakeConsul.checkTTLs); n != 1 { + t.Fatalf("expected 1 checkTTL entry but found: %d", n) + } + for _, v := range ctx.FakeConsul.checkTTLs { + if v != 1 { + t.Fatalf("expected script check to be updated once but found %d", v) + } + } + for _, v := range ctx.FakeConsul.checks { + if v.Status != "passing" { + t.Fatalf("expected check to be passing but found %q", v.Status) + } + } +} + +// TestConsul_ShutdownSlow tests the slow but ok path for the shutdown logic in +// ServiceClient. +func TestConsul_ShutdownSlow(t *testing.T) { + t.Parallel() // run the slow tests in parallel + ctx := setupFake() + + // Add a script check to make sure its TTL gets updated + ctx.Task.Services[0].Checks = []*structs.ServiceCheck{ + { + Name: "scriptcheck", + Type: "script", + Command: "true", + // Make check block until shutdown + Interval: 9000 * time.Hour, + Timeout: 5 * time.Second, + InitialStatus: "warning", + }, + } + + // Make Exec slow, but not too slow + ctx.ExecFunc = func(ctx context.Context, cmd string, args []string) ([]byte, int, error) { + time.Sleep(time.Second) + return []byte{}, 0, nil + } + + // Make shutdown wait time just a bit longer than ctx.Exec takes + ctx.ServiceClient.shutdownWait = 3 * time.Second + + hasShutdown := make(chan struct{}) + go func() { + ctx.ServiceClient.Run() + close(hasShutdown) + }() + + // Register a task and agent + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx); err != nil { + t.Fatalf("unexpected error registering task: %v", err) + } + + // Shutdown should block until all enqueued operations finish. + preShutdown := time.Now() + if err := ctx.ServiceClient.Shutdown(); err != nil { + t.Errorf("unexpected error shutting down client: %v", err) + } + + // assert Run() exits in a timely fashion after Shutdown() exits + select { + case <-hasShutdown: + // ok! Run() exited as expected + case <-time.After(10 * time.Second): + t.Fatalf("expected Run() to exit, but it did not") + } + + // Shutdown time should have taken: 1s <= shutdown <= 3s + shutdownTime := time.Now().Sub(preShutdown) + if shutdownTime < time.Second || shutdownTime > ctx.ServiceClient.shutdownWait { + t.Errorf("expected shutdown to take >1s and <%s but took: %s", ctx.ServiceClient.shutdownWait, shutdownTime) + } + + // UpdateTTL should have been called once for the script check + if n := len(ctx.FakeConsul.checkTTLs); n != 1 { + t.Fatalf("expected 1 checkTTL entry but found: %d", n) + } + for _, v := range ctx.FakeConsul.checkTTLs { + if v != 1 { + t.Fatalf("expected script check to be updated once but found %d", v) + } + } + for _, v := range ctx.FakeConsul.checks { + if v.Status != "passing" { + t.Fatalf("expected check to be passing but found %q", v.Status) + } + } +} + +// TestConsul_ShutdownBlocked tests the blocked past deadline path for the +// shutdown logic in ServiceClient. +func TestConsul_ShutdownBlocked(t *testing.T) { + t.Parallel() // run the slow tests in parallel + ctx := setupFake() + + // Add a script check to make sure its TTL gets updated + ctx.Task.Services[0].Checks = []*structs.ServiceCheck{ + { + Name: "scriptcheck", + Type: "script", + Command: "true", + // Make check block until shutdown + Interval: 9000 * time.Hour, + Timeout: 9000 * time.Hour, + InitialStatus: "warning", + }, + } + + block := make(chan struct{}) + defer close(block) // cleanup after test + + // Make Exec slow, but not too slow + ctx.ExecFunc = func(ctx context.Context, cmd string, args []string) ([]byte, int, error) { + <-block + return []byte{}, 0, nil + } + + // Use a short shutdown deadline since we're intentionally blocking forever + ctx.ServiceClient.shutdownWait = time.Second + + hasShutdown := make(chan struct{}) + go func() { + ctx.ServiceClient.Run() + close(hasShutdown) + }() + + // Register a task and agent + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx); err != nil { + t.Fatalf("unexpected error registering task: %v", err) + } + + // Shutdown should block until all enqueued operations finish. + preShutdown := time.Now() + err := ctx.ServiceClient.Shutdown() + if err == nil { + t.Errorf("expected a timed out error from shutdown") + } + + // assert Run() exits in a timely fashion after Shutdown() exits + maxWait := 10 * time.Second + select { + case <-hasShutdown: + // ok! Run() exited as expected + case <-time.After(maxWait): + t.Fatalf("expected Run() to exit, but it did not") + } + + // Shutdown time should have taken 1s; to avoid timing related errors + // simply test for 1s <= shutdown <= 10s + shutdownTime := time.Now().Sub(preShutdown) + if shutdownTime < ctx.ServiceClient.shutdownWait || shutdownTime > maxWait { + t.Errorf("expected shutdown to take >%s and <%s but took: %s", ctx.ServiceClient.shutdownWait, maxWait, shutdownTime) + } + + // UpdateTTL should not have been called for the script check + if n := len(ctx.FakeConsul.checkTTLs); n != 0 { + t.Fatalf("expected 0 checkTTL entry but found: %d", n) + } + for _, v := range ctx.FakeConsul.checks { + if expected := "warning"; v.Status != expected { + t.Fatalf("expected check to be %q but found %q", expected, v.Status) + } + } +} diff --git a/nomad/server.go b/nomad/server.go index efed68cebdf..dade6cc8635 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -63,6 +63,14 @@ const ( // raftRemoveGracePeriod is how long we wait to allow a RemovePeer // to replicate to gracefully leave the cluster. raftRemoveGracePeriod = 5 * time.Second + + // defaultConsulDiscoveryInterval is how often to poll Consul for new + // servers if there is no leader. + defaultConsulDiscoveryInterval time.Duration = 9 * time.Second + + // defaultConsulDiscoveryIntervalRetry is how often to poll Consul for + // new servers if there is no leader and the last Consul query failed. + defaultConsulDiscoveryIntervalRetry time.Duration = 3 * time.Second ) // Server is Nomad server which manages the job queues, @@ -136,8 +144,8 @@ type Server struct { heartbeatTimers map[string]*time.Timer heartbeatTimersLock sync.Mutex - // consulSyncer advertises this Nomad Agent with Consul - consulSyncer *consul.Syncer + // consulCatalog is used for discovering other Nomad Servers via Consul + consulCatalog consul.CatalogAPI // vault is the client for communicating with Vault. vault VaultClient @@ -167,7 +175,7 @@ type endpoints struct { // NewServer is used to construct a new Nomad server from the // configuration, potentially returning an error -func NewServer(config *Config, consulSyncer *consul.Syncer, logger *log.Logger) (*Server, error) { +func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logger) (*Server, error) { // Check the protocol version if err := config.CheckVersion(); err != nil { return nil, err @@ -212,20 +220,20 @@ func NewServer(config *Config, consulSyncer *consul.Syncer, logger *log.Logger) // Create the server s := &Server{ - config: config, - consulSyncer: consulSyncer, - connPool: NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, tlsWrap), - logger: logger, - rpcServer: rpc.NewServer(), - peers: make(map[string][]*serverParts), - localPeers: make(map[raft.ServerAddress]*serverParts), - reconcileCh: make(chan serf.Member, 32), - eventCh: make(chan serf.Event, 256), - evalBroker: evalBroker, - blockedEvals: blockedEvals, - planQueue: planQueue, - rpcTLS: incomingTLS, - shutdownCh: make(chan struct{}), + config: config, + consulCatalog: consulCatalog, + connPool: NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, tlsWrap), + logger: logger, + rpcServer: rpc.NewServer(), + peers: make(map[string][]*serverParts), + localPeers: make(map[raft.ServerAddress]*serverParts), + reconcileCh: make(chan serf.Member, 32), + eventCh: make(chan serf.Event, 256), + evalBroker: evalBroker, + blockedEvals: blockedEvals, + planQueue: planQueue, + rpcTLS: incomingTLS, + shutdownCh: make(chan struct{}), } // Create the periodic dispatcher for launching periodic jobs. @@ -542,8 +550,7 @@ func (s *Server) setupBootstrapHandler() error { s.logger.Printf("[DEBUG] server.nomad: lost contact with Nomad quorum, falling back to Consul for server list") - consulCatalog := s.consulSyncer.ConsulClient().Catalog() - dcs, err := consulCatalog.Datacenters() + dcs, err := s.consulCatalog.Datacenters() if err != nil { peersTimeout.Reset(peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor)) return fmt.Errorf("server.nomad: unable to query Consul datacenters: %v", err) @@ -570,7 +577,7 @@ func (s *Server) setupBootstrapHandler() error { Near: "_agent", WaitTime: consul.DefaultQueryWaitDuration, } - consulServices, _, err := consulCatalog.Service(nomadServerServiceName, consul.ServiceTagSerf, consulOpts) + consulServices, _, err := s.consulCatalog.Service(nomadServerServiceName, consul.ServiceTagSerf, consulOpts) if err != nil { err := fmt.Errorf("failed to query service %q in Consul datacenter %q: %v", nomadServerServiceName, dc, err) s.logger.Printf("[WARN] server.nomad: %v", err) @@ -618,7 +625,28 @@ func (s *Server) setupBootstrapHandler() error { return nil } - s.consulSyncer.AddPeriodicHandler("Nomad Server Fallback Server Handler", bootstrapFn) + // Hacky replacement for old ConsulSyncer Periodic Handler. + go func() { + lastOk := true + sync := time.NewTimer(0) + for { + select { + case <-sync.C: + d := defaultConsulDiscoveryInterval + if err := bootstrapFn(); err != nil { + // Only log if it worked last time + if lastOk { + lastOk = false + s.logger.Printf("[ERR] consul: error looking up Nomad servers: %v", err) + } + d = defaultConsulDiscoveryIntervalRetry + } + sync.Reset(d) + case <-s.shutdownCh: + return + } + } + }() return nil } diff --git a/nomad/server_test.go b/nomad/server_test.go index 902498a1d4a..719bfbf6259 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -76,15 +76,11 @@ func testServer(t *testing.T, cb func(*Config)) *Server { // Enable raft as leader if we have bootstrap on config.RaftConfig.StartAsLeader = !config.DevDisableBootstrap - shutdownCh := make(chan struct{}) logger := log.New(config.LogOutput, fmt.Sprintf("[%s] ", config.NodeName), log.LstdFlags) - consulSyncer, err := consul.NewSyncer(config.ConsulConfig, shutdownCh, logger) - if err != nil { - t.Fatalf("err: %v", err) - } + catalog := consul.NewMockCatalog(logger) // Create server - server, err := NewServer(config, consulSyncer, logger) + server, err := NewServer(config, catalog, logger) if err != nil { t.Fatalf("err: %v", err) } diff --git a/website/source/docs/job-specification/service.html.md b/website/source/docs/job-specification/service.html.md index ee6a1b6b449..2973142da84 100644 --- a/website/source/docs/job-specification/service.html.md +++ b/website/source/docs/job-specification/service.html.md @@ -106,7 +106,7 @@ does not automatically enable service discovery. ~> **Caveat:** The command must be the path to the command on disk, and no shell exists by default. That means operators like `||` or `&&` are not available. Additionally, all arguments must be supplied via the `args` - parameter. The achieve the behavior of shell operators, specify the command + parameter. To achieve the behavior of shell operators, specify the command as a shell, like `/bin/bash` and then use `args` to run the check. - `initial_status` `(string: )` - Specifies the originating status of the