From 9db46fde842464ca57461c183df845275d761e75 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Sun, 19 Apr 2020 15:34:45 -0400 Subject: [PATCH 1/2] driver/docker: protect against nil container Protect against a panic when we attempt to start a container with a name that conflicts with an existing one. If the existing one is being deleted while nomad first attempts to create the container, the createContainer will fail with `container already exists`, but we get nil container reference from the `containerByName` lookup, and cause a crash. I'm not certain how we get into the state, except for being very unlucky. I suspect that this case may be the result of a concurrent restart or the docker engine API not being fully consistent (e.g. an earlier call purged the container, but docker didn't free up resources yet to create a new container with the same name immediately yet). If that's the case, then re-attempting creation will hopefully succeed, or we'd at least fail enough times for the alloc to be rescheduled to another node. --- drivers/docker/driver.go | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index 5880d976d2e..1d8cc743690 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -439,16 +439,21 @@ CREATE: return container, nil } - // Delete matching containers - err = client.RemoveContainer(docker.RemoveContainerOptions{ - ID: container.ID, - Force: true, - }) - if err != nil { - d.logger.Error("failed to purge container", "container_id", container.ID) - return nil, recoverableErrTimeouts(fmt.Errorf("Failed to purge container %s: %s", container.ID, err)) - } else { - d.logger.Info("purged container", "container_id", container.ID) + // Purge conflicting container if found. + // If container is nil here, the conflicting container was + // deleted in our check here, so retry again. + if container != nil { + // Delete matching containers + err = client.RemoveContainer(docker.RemoveContainerOptions{ + ID: container.ID, + Force: true, + }) + if err != nil { + d.logger.Error("failed to purge container", "container_id", container.ID) + return nil, recoverableErrTimeouts(fmt.Errorf("Failed to purge container %s: %s", container.ID, err)) + } else { + d.logger.Info("purged container", "container_id", container.ID) + } } if attempted < 5 { From 7f29912a02cb7c4ab1f0ad6208dbef1345f87640 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Mon, 20 Apr 2020 10:31:09 -0400 Subject: [PATCH 2/2] add changelog [ci skip] --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 63eb9ee1e1c..52677ffed7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ BUG FIXES: * core: Fixed a bug that only ran a task `shutdown_delay` if the task had a registered service [[GH-7663](https://github.com/hashicorp/nomad/issues/7663)] + * driver/docker: Fixed a bug where retrying failed docker creation may in rare cases trigger a panic [[GH-7749](https://github.com/hashicorp/nomad/issues/7749)] * vault: Upgrade http2 library to fix Vault API calls that fail with `http2: no cached connection was available` [[GH-7673](https://github.com/hashicorp/nomad/issues/7673)] ## 0.11.0 (April 8, 2020)