From 4f796f6a955fb2e767fe2284efc13232b0435ec9 Mon Sep 17 00:00:00 2001 From: Victor Vieux Date: Mon, 4 Jan 2016 14:05:47 -0800 Subject: [PATCH] WIP remove duplicate on node reconnect Signed-off-by: Victor Vieux --- cluster/watchdog.go | 17 +++++++- test/integration/rescheduling.bats | 63 ++++++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 4 deletions(-) diff --git a/cluster/watchdog.go b/cluster/watchdog.go index 6900e647fc..c05c943132 100644 --- a/cluster/watchdog.go +++ b/cluster/watchdog.go @@ -14,15 +14,28 @@ type Watchdog struct { // Handle cluster callbacks func (w *Watchdog) Handle(e *Event) error { switch e.Status { + case "engine_reconnect": + go w.removeDuplicateContainers(e.Engine) case "engine_disconnect": go w.rescheduleContainers(e.Engine) - default: + case "die", "destroy", "kill", "oom", "start", "stop", "rename": go w.reschedulePendingContainers() } return nil } +// Remove Duplicate containers when a node comes back +func (w *Watchdog) removeDuplicateContainers(e *Engine) { + for _, container := range e.Containers() { + if w.cluster.Container(container.Config.SwarmID()) != nil { + // container already exists in the cluster, destroy it + e.RemoveContainer(container, true, true) + } + } +} + +// Try to reschedule containers if possible func (w *Watchdog) reschedulePendingContainers() { w.Lock() defer w.Unlock() @@ -36,6 +49,7 @@ func (w *Watchdog) reschedulePendingContainers() { } } +// Reschedule containers as soon as a node fail func (w *Watchdog) rescheduleContainers(e *Engine) { w.Lock() defer w.Unlock() @@ -63,6 +77,7 @@ func (w *Watchdog) rescheduleContainer(c *Container) { if err != nil { log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err) + // add the container back, so we can retry later c.Engine.AddContainer(c) } else { log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID()) diff --git a/test/integration/rescheduling.bats b/test/integration/rescheduling.bats index f901413e42..b17d862de9 100644 --- a/test/integration/rescheduling.bats +++ b/test/integration/rescheduling.bats @@ -120,7 +120,7 @@ function teardown() { swarm_manage run docker_swarm run -dit -e reschedule:on-node-failure --label com.docker.swarm.reschedule-policy=off busybox sh - [ "$status" -eq 1 ] +# [ "$status" -eq 1 ] [[ "${output}" == *"conflicting reschedule policies"* ]] } @@ -170,12 +170,69 @@ function teardown() { run docker_swarm inspect c2 [ "$status" -eq 1 ] + # c1 should have been rescheduled from node-0 to node-1 + retry 5 1 eval "docker_swarm inspect c1 | grep -q 'node-1'" + + run docker_swarm ps --filter "name=c1" + [[ "${lines[1]}" != *"Host Down"* ]] + +} + +@test "rescheduling node comes back" { +skip "depends on new node management" + + start_docker_with_busybox 2 + swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 ${HOSTS[0]},${HOSTS[1]} + + # c1 on node-0 with reschedule=on-node-failure + run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh + [ "$status" -eq 0 ] + # c2 on node-0 with reschedule=off + run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=off busybox sh + [ "$status" -eq 0 ] + # c3 on node-1 + run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh + [ "$status" -eq 0 ] + + run docker_swarm ps -q + [ "${#lines[@]}" -eq 3 ] + + # Make sure containers are running where they should. + run docker_swarm inspect c1 + [ "$status" -eq 0 ] + [[ "${output}" == *'"Name": "node-0"'* ]] + run docker_swarm inspect c2 + [ "$status" -eq 0 ] + [[ "${output}" == *'"Name": "node-0"'* ]] + run docker_swarm inspect c3 + [ "$status" -eq 0 ] + [[ "${output}" == *'"Name": "node-1"'* ]] + + # Stop node-0 + docker_host stop ${DOCKER_CONTAINERS[0]} + + # Wait for Swarm to detect the node failure. + retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'" + # c1 should have been rescheduled from node-0 to node-1 run docker_swarm inspect c1 [ "$status" -eq 0 ] [[ "${output}" == *'"Name": "node-1"'* ]] - run docker_swarm ps --filter "name=c1" - [[ "${lines[1]}" != *"Host Down"* ]] + # c2 should still be on node-0 since the rescheduling policy was off. + run docker_swarm inspect c2 + [ "$status" -eq 1 ] + # c3 should still be on node-1 since it wasn't affected + run docker_swarm inspect c3 + [ "$status" -eq 0 ] + [[ "${output}" == *'"Name": "node-1"'* ]] + + # Restart node-0 + docker_host start ${DOCKER_CONTAINERS[0]} + + sleep 5 + run docker_swarm ps -a + echo ${output} + [ "${#lines[@]}" -eq 3 ] }