diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index ac83f376d86..3367caeba8d 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -41,6 +41,12 @@ var ( // running operations such as waiting on containers and collect stats waitClient *docker.Client + dockerTransientErrs = []string{ + "Client.Timeout exceeded while awaiting headers", + "EOF", + "API error (500)", + } + // recoverableErrTimeouts returns a recoverable error if the error was due // to timeouts recoverableErrTimeouts = func(err error) error { @@ -449,13 +455,17 @@ CREATE: if attempted < 5 { attempted++ - time.Sleep(1 * time.Second) + time.Sleep(nextBackoff(attempted)) goto CREATE } } else if strings.Contains(strings.ToLower(createErr.Error()), "no such image") { // There is still a very small chance this is possible even with the // coordinator so retry. return nil, nstructs.NewRecoverableError(createErr, true) + } else if isDockerTransientError(createErr) && attempted < 5 { + attempted++ + time.Sleep(nextBackoff(attempted)) + goto CREATE } return nil, recoverableErrTimeouts(createErr) @@ -468,17 +478,16 @@ func (d *Driver) startContainer(c *docker.Container) error { attempted := 0 START: startErr := client.StartContainer(c.ID, c.HostConfig) - if startErr == nil { + if startErr == nil || strings.Contains(startErr.Error(), "Container already running") { return nil } d.logger.Debug("failed to start container", "container_id", c.ID, "attempt", attempted+1, "error", startErr) - // If it is a 500 error it is likely we can retry and be successful - if strings.Contains(startErr.Error(), "API error (500)") { + if isDockerTransientError(startErr) { if attempted < 5 { attempted++ - time.Sleep(1 * time.Second) + time.Sleep(nextBackoff(attempted)) goto START } return nstructs.NewRecoverableError(startErr, true) @@ -487,6 +496,13 @@ START: return recoverableErrTimeouts(startErr) } +// nextBackoff returns appropriate docker backoff durations after attempted attempts. +func nextBackoff(attempted int) time.Duration { + // attempts in 200ms, 800ms, 3.2s, 12.8s, 51.2s + // TODO: add randomization factor and extract to a helper + return 1 << (2 * uint64(attempted)) * 50 * time.Millisecond +} + // createImage creates a docker image either by pulling it from a registry or by // loading it from the file system func (d *Driver) createImage(task *drivers.TaskConfig, driverConfig *TaskConfig, client *docker.Client) (string, error) { @@ -1458,3 +1474,18 @@ func sliceMergeUlimit(ulimitsRaw map[string]string) ([]docker.ULimit, error) { func (d *Driver) Shutdown() { d.signalShutdown() } + +func isDockerTransientError(err error) bool { + if err == nil { + return false + } + + errMsg := err.Error() + for _, te := range dockerTransientErrs { + if strings.Contains(errMsg, te) { + return true + } + } + + return false +} diff --git a/drivers/docker/driver_test.go b/drivers/docker/driver_test.go index 997ce1d9c1b..a100aa7431e 100644 --- a/drivers/docker/driver_test.go +++ b/drivers/docker/driver_test.go @@ -2376,3 +2376,74 @@ func waitForExist(t *testing.T, client *docker.Client, containerID string) { require.NoError(t, err) }) } + +// TestDockerDriver_CreationIdempotent asserts that createContainer and +// and startContainers functions are idempotent, as we have some retry +// logic there without ensureing we delete/destroy containers +func TestDockerDriver_CreationIdempotent(t *testing.T) { + if !tu.IsCI() { + t.Parallel() + } + testutil.DockerCompatible(t) + + task, cfg, _ := dockerTask(t) + require.NoError(t, task.EncodeConcreteDriverConfig(cfg)) + + client := newTestDockerClient(t) + driver := dockerDriverHarness(t, nil) + cleanup := driver.MkAllocDir(task, true) + defer cleanup() + + copyImage(t, task.TaskDir(), "busybox.tar") + + d, ok := driver.Impl().(*Driver) + require.True(t, ok) + + _, err := d.createImage(task, cfg, client) + require.NoError(t, err) + + containerCfg, err := d.createContainerConfig(task, cfg, cfg.Image) + require.NoError(t, err) + + c, err := d.createContainer(client, containerCfg, cfg.Image) + require.NoError(t, err) + defer client.RemoveContainer(docker.RemoveContainerOptions{ + ID: c.ID, + Force: true, + }) + + // calling createContainer again creates a new one and remove old one + c2, err := d.createContainer(client, containerCfg, cfg.Image) + require.NoError(t, err) + defer client.RemoveContainer(docker.RemoveContainerOptions{ + ID: c2.ID, + Force: true, + }) + + require.NotEqual(t, c.ID, c2.ID) + // old container was destroyed + { + _, err := client.InspectContainer(c.ID) + require.Error(t, err) + require.Contains(t, err.Error(), NoSuchContainerError) + } + + // now start container twice + require.NoError(t, d.startContainer(c2)) + require.NoError(t, d.startContainer(c2)) + + tu.WaitForResult(func() (bool, error) { + c, err := client.InspectContainer(c2.ID) + if err != nil { + return false, fmt.Errorf("failed to get container status: %v", err) + } + + if !c.State.Running { + return false, fmt.Errorf("container is not running but %v", c.State) + } + + return true, nil + }, func(err error) { + require.NoError(t, err) + }) +}