From edf2b30309d308ec3a45edcefdeab22908704008 Mon Sep 17 00:00:00 2001
From: Victor Vieux
Date: Fri, 11 Dec 2015 00:16:14 -0800
Subject: [PATCH] don't error on 1st rescheduling error

Signed-off-by: Victor Vieux
---
 cluster/watchdog.go                |  61 +++++++++------
 test/integration/rescheduling.bats | 117 ++++++++++++++++++++++++++++-
 2 files changed, 153 insertions(+), 25 deletions(-)

diff --git a/cluster/watchdog.go b/cluster/watchdog.go
index e5ff6b6f83..6900e647fc 100644
--- a/cluster/watchdog.go
+++ b/cluster/watchdog.go
@@ -7,52 +7,65 @@ import (
 )
 
 type Watchdog struct {
-	l       sync.Mutex
+	sync.Mutex
 	cluster Cluster
 }
 
 // Handle cluster callbacks
 func (w *Watchdog) Handle(e *Event) error {
-	// Skip non-swarm events.
-	if e.From != "swarm" {
-		return nil
-	}
-
 	switch e.Status {
 	case "engine_disconnect":
 		go w.rescheduleContainers(e.Engine)
+	default:
+		go w.reschedulePendingContainers()
 	}
 	return nil
 }
 
+func (w *Watchdog) reschedulePendingContainers() {
+	w.Lock()
+	defer w.Unlock()
+
+	log.Infof("rescheduling pending containers")
+
+	for _, c := range w.cluster.Containers() {
+		if !c.Engine.IsHealthy() {
+			w.rescheduleContainer(c)
+		}
+	}
+}
+
 func (w *Watchdog) rescheduleContainers(e *Engine) {
-	w.l.Lock()
-	defer w.l.Unlock()
+	w.Lock()
+	defer w.Unlock()
 
 	log.Infof("Node %s failed - rescheduling containers", e.ID)
 	for _, c := range e.Containers() {
-		// Skip containers which don't have an "on-node-failure" reschedule policy.
-		if c.Config.ReschedulePolicy() != "on-node-failure" {
-			log.Debugf("Skipping rescheduling of %s based on rescheduling policy", c.Id)
-			continue
-		}
+		w.rescheduleContainer(c)
+	}
+}
 
-		// Remove the container from the dead engine. If we don't, then both
-		// the old and new one will show up in docker ps.
-		// We have to do this before calling `CreateContainer`, otherwise it
-		// will abort because the name is already taken.
-		c.Engine.removeContainer(c)
+
+func (w *Watchdog) rescheduleContainer(c *Container) {
+	// Skip containers which don't have an "on-node-failure" reschedule policy.
+	if c.Config.ReschedulePolicy() != "on-node-failure" {
+		log.Debugf("Skipping rescheduling of %s based on rescheduling policy", c.Id)
+		return
+	}
 
-		newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name)
+	// Remove the container from the dead engine. If we don't, then both
+	// the old and new one will show up in docker ps.
+	// We have to do this before calling `CreateContainer`, otherwise it
+	// will abort because the name is already taken.
+	c.Engine.removeContainer(c)
 
-		if err != nil {
-			log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
-			continue
-		}
+	newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
+	if err != nil {
+		log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
+		c.Engine.AddContainer(c)
+	} else {
 		log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
-		if c.Info.State.Running {
 			if err := newContainer.Start(); err != nil {
 				log.Errorf("Failed to start rescheduled container %s", newContainer.Id)
diff --git a/test/integration/rescheduling.bats b/test/integration/rescheduling.bats
index 20c8de085e..cf893a7878 100644
--- a/test/integration/rescheduling.bats
+++ b/test/integration/rescheduling.bats
@@ -12,7 +12,7 @@ function teardown() {
 	swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-refresh-retry=1 ${HOSTS[0]},${HOSTS[1]}
 
 	# c1 on node-0 with reschedule=on-node-failure
-	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 -e reschedule:on-node-failure busybox sh
+	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
 	[ "$status" -eq 0 ]
 	# c2 on node-0 with reschedule=off
 	run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=off busybox sh
@@ -55,3 +55,118 @@ function teardown() {
 	[ "$status" -eq 0 ]
 	[[ "${output}" == *'"Name": "node-1"'* ]]
 }
+
+@test "rescheduling with constraints" {
+	start_docker_with_busybox 2
+	swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-refresh-retry=1 ${HOSTS[0]},${HOSTS[1]}
+
+	# c1 on node-0 with reschedule=on-node-failure and a soft node constraint
+	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+	# c2 on node-0 with reschedule=on-node-failure and a hard node constraint
+	run docker_swarm run -dit --name c2 -e constraint:node==node-0 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+	# c3 on node-1
+	run docker_swarm run -dit --name c3 -e constraint:node==node-1 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq 3 ]
+
+	# Make sure containers are running where they should.
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Stop node-0
+	docker_host stop ${DOCKER_CONTAINERS[0]}
+
+	# Wait for Swarm to detect the node failure.
+	retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
+
+	# c1 should have been rescheduled from node-0 to node-1
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# c2 should still be tied to node-0 because of its hard node constraint; node-0 is down, so inspect fails.
+	run docker_swarm inspect c2
+	[ "$status" -eq 1 ]
+
+	# c3 should still be on node-1 since it wasn't affected
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+}
+
+@test "reschedule conflict" {
+	start_docker_with_busybox 2
+	swarm_manage
+
+	run docker_swarm run -dit -e reschedule:on-node-failure --label com.docker.swarm.reschedule-policy=off busybox sh
+	[ "$status" -eq 1 ]
+	echo ${output}
+	[[ "${output}" == *"conflicting reschedule policies"* ]]
+}
+
+@test "rescheduling pending" {
+	start_docker_with_busybox 2
+	swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-refresh-retry=1 ${HOSTS[0]},${HOSTS[1]}
+
+	# c1 on node-0 with reschedule=on-node-failure and an anti-affinity against other c* containers
+	run docker_swarm run -dit --name c1 -e affinity:container!=c* -e constraint:node==~node-0 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+	# c2 on node-1 with reschedule=on-node-failure and the same anti-affinity
+	run docker_swarm run -dit --name c2 -e affinity:container!=c* -e constraint:node==node-1 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq 2 ]
+
+	# Make sure containers are running where they should.
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Stop node-0
+	docker_host stop ${DOCKER_CONTAINERS[0]}
+
+	# Wait for Swarm to detect the node failure.
+	retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
+
+	# c1 cannot be rescheduled right now because of the anti-affinity with c2
+	run docker_swarm inspect c1
+	[ "$status" -eq 1 ]
+
+	# c2 should still be on node-1 since it wasn't affected
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	run docker_swarm ps --filter "name=c1"
+	[[ "${lines[1]}" == *"Host Down"* ]]
+
+	docker_swarm rm -f c2
+
+	# c2 was removed
+	run docker_swarm inspect c2
+	[ "$status" -eq 1 ]
+
+	# c1 should have been rescheduled from node-0 to node-1
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	run docker_swarm ps --filter "name=c1"
+	[[ "${lines[1]}" != *"Host Down"* ]]
+
+}
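
Below is a sketch of what cluster/watchdog.go looks like with this patch applied, reconstructed from the hunks above so the control flow is easier to follow than in the interleaved diff. It is not the verbatim file: the import block (sync plus a logrus-style log package) is an assumption, the doc comments and the comments in the error branch are explanatory additions, and the tail of rescheduleContainer (starting the new container if the old one was running) is elided because the hunk does not show it in full.

package cluster

import (
	"sync"

	log "github.com/Sirupsen/logrus" // assumed: Swarm's logging alias at the time
)

// Watchdog listens for cluster events and reschedules containers that sit on
// unhealthy engines.
type Watchdog struct {
	sync.Mutex
	cluster Cluster
}

// Handle cluster callbacks.
func (w *Watchdog) Handle(e *Event) error {
	switch e.Status {
	case "engine_disconnect":
		// A node just dropped off: try to move its containers right away.
		go w.rescheduleContainers(e.Engine)
	default:
		// Any other event is an opportunity to retry containers that could
		// not be rescheduled earlier (for example because of an affinity).
		go w.reschedulePendingContainers()
	}
	return nil
}

func (w *Watchdog) reschedulePendingContainers() {
	w.Lock()
	defer w.Unlock()

	log.Infof("rescheduling pending containers")

	for _, c := range w.cluster.Containers() {
		if !c.Engine.IsHealthy() {
			w.rescheduleContainer(c)
		}
	}
}

func (w *Watchdog) rescheduleContainers(e *Engine) {
	w.Lock()
	defer w.Unlock()

	log.Infof("Node %s failed - rescheduling containers", e.ID)
	for _, c := range e.Containers() {
		w.rescheduleContainer(c)
	}
}

func (w *Watchdog) rescheduleContainer(c *Container) {
	// Skip containers which don't have an "on-node-failure" reschedule policy.
	if c.Config.ReschedulePolicy() != "on-node-failure" {
		log.Debugf("Skipping rescheduling of %s based on rescheduling policy", c.Id)
		return
	}

	// Remove the container from the dead engine. If we don't, then both
	// the old and new one will show up in docker ps.
	// We have to do this before calling `CreateContainer`, otherwise it
	// will abort because the name is already taken.
	c.Engine.removeContainer(c)

	newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
	if err != nil {
		log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
		// The behavioral change of this patch: instead of giving up, put the
		// container back on its (dead) engine so it stays visible ("Host Down"
		// in docker ps) and is retried by reschedulePendingContainers on a
		// later cluster event.
		c.Engine.AddContainer(c)
	} else {
		log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
		// The existing start-if-running handling of the new container follows
		// here, unchanged by this patch (elided).
	}
}

The error branch is what the new "rescheduling pending" test exercises: c1 cannot be rescheduled while c2 holds node-1 because of the anti-affinity, so it stays attached to the dead node and shows up as "Host Down"; once c2 is removed, the next event triggers reschedulePendingContainers and c1 lands on node-1.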