Skip to content

Commit

Permalink
Merge pull request #6326 from hashicorp/b-docker-start-failure-handling
Browse files Browse the repository at this point in the history
Fix bugs around docker start retries
  • Loading branch information
Mahmood Ali authored Sep 18, 2019
2 parents 00c24fc + b5b445c commit 922663a
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 5 deletions.
41 changes: 36 additions & 5 deletions drivers/docker/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ var (
// running operations such as waiting on containers and collect stats
waitClient *docker.Client

dockerTransientErrs = []string{
"Client.Timeout exceeded while awaiting headers",
"EOF",
"API error (500)",
}

// recoverableErrTimeouts returns a recoverable error if the error was due
// to timeouts
recoverableErrTimeouts = func(err error) error {
Expand Down Expand Up @@ -449,13 +455,17 @@ CREATE:

if attempted < 5 {
attempted++
time.Sleep(1 * time.Second)
time.Sleep(nextBackoff(attempted))
goto CREATE
}
} else if strings.Contains(strings.ToLower(createErr.Error()), "no such image") {
// There is still a very small chance this is possible even with the
// coordinator so retry.
return nil, nstructs.NewRecoverableError(createErr, true)
} else if isDockerTransientError(createErr) && attempted < 5 {
attempted++
time.Sleep(nextBackoff(attempted))
goto CREATE
}

return nil, recoverableErrTimeouts(createErr)
Expand All @@ -468,17 +478,16 @@ func (d *Driver) startContainer(c *docker.Container) error {
attempted := 0
START:
startErr := client.StartContainer(c.ID, c.HostConfig)
if startErr == nil {
if startErr == nil || strings.Contains(startErr.Error(), "Container already running") {
return nil
}

d.logger.Debug("failed to start container", "container_id", c.ID, "attempt", attempted+1, "error", startErr)

// If it is a 500 error it is likely we can retry and be successful
if strings.Contains(startErr.Error(), "API error (500)") {
if isDockerTransientError(startErr) {
if attempted < 5 {
attempted++
time.Sleep(1 * time.Second)
time.Sleep(nextBackoff(attempted))
goto START
}
return nstructs.NewRecoverableError(startErr, true)
Expand All @@ -487,6 +496,13 @@ START:
return recoverableErrTimeouts(startErr)
}

// nextBackoff returns appropriate docker backoff durations after attempted attempts.
func nextBackoff(attempted int) time.Duration {
// attempts in 200ms, 800ms, 3.2s, 12.8s, 51.2s
// TODO: add randomization factor and extract to a helper
return 1 << (2 * uint64(attempted)) * 50 * time.Millisecond
}

// createImage creates a docker image either by pulling it from a registry or by
// loading it from the file system
func (d *Driver) createImage(task *drivers.TaskConfig, driverConfig *TaskConfig, client *docker.Client) (string, error) {
Expand Down Expand Up @@ -1458,3 +1474,18 @@ func sliceMergeUlimit(ulimitsRaw map[string]string) ([]docker.ULimit, error) {
func (d *Driver) Shutdown() {
d.signalShutdown()
}

func isDockerTransientError(err error) bool {
if err == nil {
return false
}

errMsg := err.Error()
for _, te := range dockerTransientErrs {
if strings.Contains(errMsg, te) {
return true
}
}

return false
}
71 changes: 71 additions & 0 deletions drivers/docker/driver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2376,3 +2376,74 @@ func waitForExist(t *testing.T, client *docker.Client, containerID string) {
require.NoError(t, err)
})
}

// TestDockerDriver_CreationIdempotent asserts that createContainer and
// and startContainers functions are idempotent, as we have some retry
// logic there without ensureing we delete/destroy containers
func TestDockerDriver_CreationIdempotent(t *testing.T) {
if !tu.IsCI() {
t.Parallel()
}
testutil.DockerCompatible(t)

task, cfg, _ := dockerTask(t)
require.NoError(t, task.EncodeConcreteDriverConfig(cfg))

client := newTestDockerClient(t)
driver := dockerDriverHarness(t, nil)
cleanup := driver.MkAllocDir(task, true)
defer cleanup()

copyImage(t, task.TaskDir(), "busybox.tar")

d, ok := driver.Impl().(*Driver)
require.True(t, ok)

_, err := d.createImage(task, cfg, client)
require.NoError(t, err)

containerCfg, err := d.createContainerConfig(task, cfg, cfg.Image)
require.NoError(t, err)

c, err := d.createContainer(client, containerCfg, cfg.Image)
require.NoError(t, err)
defer client.RemoveContainer(docker.RemoveContainerOptions{
ID: c.ID,
Force: true,
})

// calling createContainer again creates a new one and remove old one
c2, err := d.createContainer(client, containerCfg, cfg.Image)
require.NoError(t, err)
defer client.RemoveContainer(docker.RemoveContainerOptions{
ID: c2.ID,
Force: true,
})

require.NotEqual(t, c.ID, c2.ID)
// old container was destroyed
{
_, err := client.InspectContainer(c.ID)
require.Error(t, err)
require.Contains(t, err.Error(), NoSuchContainerError)
}

// now start container twice
require.NoError(t, d.startContainer(c2))
require.NoError(t, d.startContainer(c2))

tu.WaitForResult(func() (bool, error) {
c, err := client.InspectContainer(c2.ID)
if err != nil {
return false, fmt.Errorf("failed to get container status: %v", err)
}

if !c.State.Running {
return false, fmt.Errorf("container is not running but %v", c.State)
}

return true, nil
}, func(err error) {
require.NoError(t, err)
})
}

0 comments on commit 922663a

Please sign in to comment.