diff --git a/CHANGELOG.md b/CHANGELOG.md
index 16d618ad212..ef34d1aa533 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -64,6 +64,7 @@ BUG FIXES:
  * client: Job validation now checks that the datacenter field does not contain empty strings [[GH-5665](https://github.com/hashicorp/nomad/pull/5665)]
  * client: Fix network port mapping related environment variables when running with Nomad 0.8 servers [[GH-5587](https://github.com/hashicorp/nomad/issues/5587)]
  * client: Fix issue with terminal state deployments being modified when allocation subsequently fails [[GH-5645](https://github.com/hashicorp/nomad/issues/5645)]
+ * driver/docker: Fix regression around image GC [[GH-5768](https://github.com/hashicorp/nomad/issues/5768)]
  * metrics: Fixed stale metrics [[GH-5540](https://github.com/hashicorp/nomad/issues/5540)]
  * vault: Fix renewal time to be 1/2 lease duration with jitter [[GH-5479](https://github.com/hashicorp/nomad/issues/5479)]
 
diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go
index 3754d032b7e..88c4031fc9e 100644
--- a/drivers/docker/driver.go
+++ b/drivers/docker/driver.go
@@ -1107,14 +1107,30 @@ func (d *Driver) DestroyTask(taskID string, force bool) error {
 
 	c, err := h.client.InspectContainer(h.containerID)
 	if err != nil {
-		return fmt.Errorf("failed to inspect container state: %v", err)
-	}
-	if c.State.Running && !force {
-		return fmt.Errorf("must call StopTask for the given task before Destroy or set force to true")
-	}
+		switch err.(type) {
+		case *docker.NoSuchContainer:
+			h.logger.Info("container was removed out of band, will proceed with DestroyTask",
+				"error", err)
+		default:
+			return fmt.Errorf("failed to inspect container state: %v", err)
+		}
+	} else {
+		if c.State.Running {
+			if !force {
+				return fmt.Errorf("must call StopTask for the given task before Destroy or set force to true")
+			}
+			if err := h.client.StopContainer(h.containerID, 0); err != nil {
+				h.logger.Warn("failed to stop container during destroy", "error", err)
+			}
+		}
 
-	if err := h.client.StopContainer(h.containerID, 0); err != nil {
-		h.logger.Warn("failed to stop container during destroy", "error", err)
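+		// container removal now happens here instead of in the handle's
+		// run() loop (see the block removed from handle.go below)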
+		if h.removeContainerOnExit {
+			if err := h.client.RemoveContainer(docker.RemoveContainerOptions{ID: h.containerID, RemoveVolumes: true, Force: true}); err != nil {
+				h.logger.Error("error removing container", "error", err)
+			}
+		} else {
+			h.logger.Debug("not removing container due to config")
+		}
 	}
 
 	if err := d.cleanupImage(h); err != nil {
diff --git a/drivers/docker/driver_test.go b/drivers/docker/driver_test.go
index 70cd9a5e4d6..445950d670a 100644
--- a/drivers/docker/driver_test.go
+++ b/drivers/docker/driver_test.go
@@ -77,9 +77,12 @@ func dockerTask(t *testing.T) (*drivers.TaskConfig, *TaskConfig, []int) {
 	cfg := newTaskConfig("", busyboxLongRunningCmd)
 
 	task := &drivers.TaskConfig{
-		ID:        uuid.Generate(),
-		Name:      "redis-demo",
-		AllocID:   uuid.Generate(),
+		ID:      uuid.Generate(),
+		Name:    "redis-demo",
+		AllocID: uuid.Generate(),
+		Env: map[string]string{
+			"test": t.Name(),
+		},
 		DeviceEnv: make(map[string]string),
 		Resources: &drivers.Resources{
 			NomadResources: &structs.AllocatedTaskResources{
@@ -140,6 +143,31 @@ func dockerSetup(t *testing.T, task *drivers.TaskConfig) (*docker.Client, *dtest
 	}
 }
 
+// cleanSlate removes the specified docker image, including potentially stopping/removing any
+// containers based on that image. This is used to decouple tests that would be coupled
+// by using the same container image.
+func cleanSlate(client *docker.Client, imageID string) {
+	if img, _ := client.InspectImage(imageID); img == nil {
+		return
+	}
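+	// the "ancestor" filter matches containers created from the given image,
+	// equivalent to `docker ps --filter ancestor=<image>`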
Terminating...") for _, task := range taskList { require.NoError(d.StopTask(task.ID, time.Second, "SIGINT")) @@ -845,6 +879,7 @@ func TestDockerDriver_StartNVersions(t *testing.T) { require.Fail("timeout waiting on task") } } + t.Log("Test complete!") } @@ -1178,6 +1213,7 @@ func TestDockerDriver_Capabilities(t *testing.T) { copyImage(t, task.TaskDir(), "busybox.tar") _, _, err := d.StartTask(task) + defer d.DestroyTask(task.ID, true) if err == nil && tc.StartError != "" { t.Fatalf("Expected error in start: %v", tc.StartError) } else if err != nil { @@ -1189,7 +1225,6 @@ func TestDockerDriver_Capabilities(t *testing.T) { return } - defer d.DestroyTask(task.ID, true) handle, ok := dockerDriver.tasks.Get(task.ID) require.True(t, ok) @@ -1386,12 +1421,16 @@ func TestDockerDriver_CleanupContainer(t *testing.T) { waitCh, err := d.WaitTask(context.Background(), task.ID) require.NoError(t, err) + select { case res := <-waitCh: if !res.Successful() { t.Fatalf("err: %v", res) } + err = d.DestroyTask(task.ID, false) + require.NoError(t, err) + time.Sleep(3 * time.Second) // Ensure that the container isn't present @@ -1405,6 +1444,197 @@ func TestDockerDriver_CleanupContainer(t *testing.T) { } } +func TestDockerDriver_EnableImageGC(t *testing.T) { + testutil.DockerCompatible(t) + + task, cfg, _ := dockerTask(t) + cfg.Command = "echo" + cfg.Args = []string{"hello"} + require.NoError(t, task.EncodeConcreteDriverConfig(cfg)) + + client := newTestDockerClient(t) + driver := dockerDriverHarness(t, map[string]interface{}{ + "gc": map[string]interface{}{ + "container": true, + "image": true, + "image_delay": "2s", + }, + }) + cleanup := driver.MkAllocDir(task, true) + defer cleanup() + + cleanSlate(client, cfg.Image) + + copyImage(t, task.TaskDir(), "busybox.tar") + _, _, err := driver.StartTask(task) + require.NoError(t, err) + + dockerDriver, ok := driver.Impl().(*Driver) + require.True(t, ok) + _, ok = dockerDriver.tasks.Get(task.ID) + require.True(t, ok) + + waitCh, err := dockerDriver.WaitTask(context.Background(), task.ID) + require.NoError(t, err) + select { + case res := <-waitCh: + if !res.Successful() { + t.Fatalf("err: %v", res) + } + + case <-time.After(time.Duration(tu.TestMultiplier()*5) * time.Second): + t.Fatalf("timeout") + } + + // we haven't called DestroyTask, image should be present + _, err = client.InspectImage(cfg.Image) + require.NoError(t, err) + + err = dockerDriver.DestroyTask(task.ID, false) + require.NoError(t, err) + + // image_delay is 3s, so image should still be around for a bit + _, err = client.InspectImage(cfg.Image) + require.NoError(t, err) + + // Ensure image was removed + tu.WaitForResult(func() (bool, error) { + if _, err := client.InspectImage(cfg.Image); err == nil { + return false, fmt.Errorf("image exists but should have been removed. 
+	dockerDriver.coordinator.imageLock.Lock()
+	_, ok = dockerDriver.coordinator.deleteFuture[handle.containerImage]
+	require.False(t, ok, "image should not be registered for deletion")
+	dockerDriver.coordinator.imageLock.Unlock()
+}
+
+func TestDockerDriver_MissingContainer_Cleanup(t *testing.T) {
+	testutil.DockerCompatible(t)
+
+	task, cfg, _ := dockerTask(t)
+	cfg.Command = "echo"
+	cfg.Args = []string{"hello"}
+	require.NoError(t, task.EncodeConcreteDriverConfig(cfg))
+
+	client := newTestDockerClient(t)
+	driver := dockerDriverHarness(t, map[string]interface{}{
+		"gc": map[string]interface{}{
+			"container":   true,
+			"image":       true,
+			"image_delay": "0s",
+		},
+	})
+	cleanup := driver.MkAllocDir(task, true)
+	defer cleanup()
+
+	cleanSlate(client, cfg.Image)
+
+	copyImage(t, task.TaskDir(), "busybox.tar")
+	_, _, err := driver.StartTask(task)
+	require.NoError(t, err)
+
+	dockerDriver, ok := driver.Impl().(*Driver)
+	require.True(t, ok)
+	h, ok := dockerDriver.tasks.Get(task.ID)
+	require.True(t, ok)
+
+	waitCh, err := dockerDriver.WaitTask(context.Background(), task.ID)
+	require.NoError(t, err)
+	select {
+	case res := <-waitCh:
+		if !res.Successful() {
+			t.Fatalf("err: %v", res)
+		}
+
+	case <-time.After(time.Duration(tu.TestMultiplier()*5) * time.Second):
+		t.Fatalf("timeout")
+	}
+
+	// remove the container out-of-band
+	require.NoError(t, client.RemoveContainer(docker.RemoveContainerOptions{
+		ID: h.containerID,
+	}))
+
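+	// DestroyTask should tolerate the already-removed container via the
+	// *docker.NoSuchContainer case added in driver.go above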
+	require.NoError(t, dockerDriver.DestroyTask(task.ID, false))
+
+	// Ensure image was removed
+	tu.WaitForResult(func() (bool, error) {
+		if _, err := client.InspectImage(cfg.Image); err == nil {
+			return false, fmt.Errorf("image exists but should have been removed. Does another %v container exist?", cfg.Image)
+		}
+
+		return true, nil
+	}, func(err error) {
+		require.NoError(t, err)
+	})
+
+	// Ensure that task handle was removed
+	_, ok = dockerDriver.tasks.Get(task.ID)
+	require.False(t, ok)
+}
+
 func TestDockerDriver_Stats(t *testing.T) {
 	if !tu.IsCI() {
 		t.Parallel()
 	}
@@ -1503,7 +1733,9 @@ func TestDockerDriver_VolumesDisabled(t *testing.T) {
 		task, driver, _, _, cleanup := setupDockerVolumes(t, cfg, tmpvol)
 		defer cleanup()
 
-		if _, _, err := driver.StartTask(task); err == nil {
+		_, _, err = driver.StartTask(task)
+		defer driver.DestroyTask(task.ID, true)
+		if err == nil {
 			require.Fail(t, "Started driver successfully when volumes should have been disabled.")
 		}
 	}
@@ -1541,7 +1773,9 @@ func TestDockerDriver_VolumesDisabled(t *testing.T) {
 		taskCfg.VolumeDriver = "flocker"
 		require.NoError(t, task.EncodeConcreteDriverConfig(taskCfg))
 
-		if _, _, err := driver.StartTask(task); err == nil {
+		_, _, err := driver.StartTask(task)
+		defer driver.DestroyTask(task.ID, true)
+		if err == nil {
 			require.Fail(t, "Started driver successfully when volume drivers should have been disabled.")
 		}
 	}
diff --git a/drivers/docker/handle.go b/drivers/docker/handle.go
index 1c52591334a..398b53d74b8 100644
--- a/drivers/docker/handle.go
+++ b/drivers/docker/handle.go
@@ -212,15 +212,6 @@ func (h *taskHandle) run() {
 		}
 	}
 
-	// Remove the container
-	if h.removeContainerOnExit == true {
-		if err := h.client.RemoveContainer(docker.RemoveContainerOptions{ID: h.containerID, RemoveVolumes: true, Force: true}); err != nil {
-			h.logger.Error("error removing container", "error", err)
-		}
-	} else {
-		h.logger.Debug("not removing container due to config")
-	}
-
 	// Set the result
 	h.exitResultLock.Lock()
 	h.exitResult = &drivers.ExitResult{
diff --git a/plugins/drivers/testutils/testing.go b/plugins/drivers/testutils/testing.go
index f28c3d4fa9e..5a941f97be4 100644
--- a/plugins/drivers/testutils/testing.go
+++ b/plugins/drivers/testutils/testing.go
@@ -75,7 +75,7 @@ func (h *DriverHarness) Kill() {
 	h.server.Stop()
 }
 
-// MkAllocDir creates a tempory directory and allocdir structure.
+// MkAllocDir creates a temporary directory and allocdir structure.
 // If enableLogs is set to true a logmon instance will be started to write logs
 // to the LogDir of the task
 // A cleanup func is returned and should be defered so as to not leak dirs