From a1bfc0320ee50b8ea3814dc95b734241f4af61ca Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 2 Dec 2020 16:22:38 -0500 Subject: [PATCH] docker: kill signal API should include timeout context When the Docker driver kills as task, we send a request via the Docker API for dockerd to fire the signal. We send that signal and then block for the `kill_timeout` waiting for the container to exit. But if the Docker API blocks, we will block indefinitely because we haven't configured the API call with the same timeout. This changeset is a minimal intervention to add the timeout to the Docker API call _only_ when we have the `kill_timeout` set. Future work should examine whether we should be threading contexts through other `go-dockerclient` API calls. --- CHANGELOG.md | 1 + drivers/docker/driver.go | 5 ++++- drivers/docker/handle.go | 14 +++++++++----- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fcebeb16dac..91b2677fb86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -72,6 +72,7 @@ BUG FIXES: * csi: Fixed a bug where `nomad volume detach` would not accept prefixes for the node ID parameter. [[GH-9041](https://github.com/hashicorp/nomad/issues/9041)] * csi: Fixed a bug where `nomad alloc status -verbose` would display an error when querying volumes. [[GH-9354](https://github.com/hashicorp/nomad/issues/9354)] * csi: Fixed a bug where queries for CSI plugins could be interleaved, resulting in inconsistent counts of plugins. [[GH-9438](https://github.com/hashicorp/nomad/issues/9438)] + * driver/docker: Fixed a bug where the Docker daemon could block longer than the `kill_timeout`. [[GH-9502](https://github.com/hashicorp/nomad/issues/9502) * driver/docker: Fixed a bug where the default `image_delay` configuration was ignored if the `gc` configuration was not set. [[GH-9101](https://github.com/hashicorp/nomad/issues/9101)] * driver/raw_exec: Fixed a bug where raw_exec attempts to create a freezer cgroups for the tasks even when `no_cgroups` is set. [[GH-9328](https://github.com/hashicorp/nomad/issues/9328)] * ui: Fixed a bug in the volume status page where read allocations and write allocations were not displayed. [[GH-9377](https://github.com/hashicorp/nomad/issues/9377)] diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index b5677911d9d..d8222a087a3 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -1461,7 +1461,10 @@ func (d *Driver) SignalTask(taskID string, signal string) error { return fmt.Errorf("failed to parse signal: %v", err) } - return h.Signal(sig) + // TODO: review whether we can timeout in this and other Docker API + // calls without breaking the expected client behavior. + // see https://github.com/hashicorp/nomad/issues/9503 + return h.Signal(context.Background(), sig) } func (d *Driver) ExecTask(taskID string, cmd []string, timeout time.Duration) (*drivers.ExecTaskResult, error) { diff --git a/drivers/docker/handle.go b/drivers/docker/handle.go index 3e1854a7657..6a75603f79f 100644 --- a/drivers/docker/handle.go +++ b/drivers/docker/handle.go @@ -103,7 +103,7 @@ func (h *taskHandle) Exec(ctx context.Context, cmd string, args []string) (*driv return execResult, nil } -func (h *taskHandle) Signal(s os.Signal) error { +func (h *taskHandle) Signal(ctx context.Context, s os.Signal) error { // Convert types sysSig, ok := s.(syscall.Signal) if !ok { @@ -116,8 +116,9 @@ func (h *taskHandle) Signal(s os.Signal) error { dockerSignal := docker.Signal(sysSig) opts := docker.KillContainerOptions{ - ID: h.containerID, - Signal: dockerSignal, + ID: h.containerID, + Signal: dockerSignal, + Context: ctx, } return h.client.KillContainer(opts) @@ -127,7 +128,10 @@ func (h *taskHandle) Signal(s os.Signal) error { func (h *taskHandle) Kill(killTimeout time.Duration, signal os.Signal) error { // Only send signal if killTimeout is set, otherwise stop container if killTimeout > 0 { - if err := h.Signal(signal); err != nil { + ctx, cancel := context.WithTimeout(context.Background(), killTimeout) + defer cancel() + + if err := h.Signal(ctx, signal); err != nil { // Container has already been removed. if strings.Contains(err.Error(), NoSuchContainerError) { h.logger.Debug("attempted to signal nonexistent container") @@ -146,7 +150,7 @@ func (h *taskHandle) Kill(killTimeout time.Duration, signal os.Signal) error { select { case <-h.waitCh: return nil - case <-time.After(killTimeout): + case <-ctx.Done(): } }