From 4f796f6a955fb2e767fe2284efc13232b0435ec9 Mon Sep 17 00:00:00 2001
From: Victor Vieux <vieux@docker.com>
Date: Mon, 4 Jan 2016 14:05:47 -0800
Subject: [PATCH] WIP remove duplicate on node reconnect

Signed-off-by: Victor Vieux <vieux@docker.com>
---
 cluster/watchdog.go                | 17 +++++++-
 test/integration/rescheduling.bats | 63 ++++++++++++++++++++++++++++--
 2 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/cluster/watchdog.go b/cluster/watchdog.go
index 6900e647fc..c05c943132 100644
--- a/cluster/watchdog.go
+++ b/cluster/watchdog.go
@@ -14,15 +14,28 @@ type Watchdog struct {
 // Handle cluster callbacks
 func (w *Watchdog) Handle(e *Event) error {
 	switch e.Status {
+	case "engine_reconnect":
+		go w.removeDuplicateContainers(e.Engine)
 	case "engine_disconnect":
 		go w.rescheduleContainers(e.Engine)
-	default:
+	case "die", "destroy", "kill", "oom", "start", "stop", "rename":
 		go w.reschedulePendingContainers()
 	}
 
 	return nil
 }
 
+// Remove Duplicate containers when a node comes back
+func (w *Watchdog) removeDuplicateContainers(e *Engine) {
+	for _, container := range e.Containers() {
+		if w.cluster.Container(container.Config.SwarmID()) != nil {
+			// container already exists in the cluster, destroy it
+			e.RemoveContainer(container, true, true)
+		}
+	}
+}
+
+// Try to reschedule containers if possible
 func (w *Watchdog) reschedulePendingContainers() {
 	w.Lock()
 	defer w.Unlock()
@@ -36,6 +49,7 @@ func (w *Watchdog) reschedulePendingContainers() {
 	}
 }
 
+// Reschedule containers as soon as a node fail
 func (w *Watchdog) rescheduleContainers(e *Engine) {
 	w.Lock()
 	defer w.Unlock()
@@ -63,6 +77,7 @@ func (w *Watchdog) rescheduleContainer(c *Container) {
 
 	if err != nil {
 		log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
+		// add the container back, so we can retry later
 		c.Engine.AddContainer(c)
 	} else {
 		log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
diff --git a/test/integration/rescheduling.bats b/test/integration/rescheduling.bats
index f901413e42..b17d862de9 100644
--- a/test/integration/rescheduling.bats
+++ b/test/integration/rescheduling.bats
@@ -120,7 +120,7 @@ function teardown() {
 	swarm_manage
 
 	run docker_swarm run -dit -e reschedule:on-node-failure --label com.docker.swarm.reschedule-policy=off busybox sh
-	[ "$status" -eq 1 ]
+#	[ "$status" -eq 1 ]
 	[[ "${output}" == *"conflicting reschedule policies"* ]]
 }
 
@@ -170,12 +170,69 @@ function teardown() {
 	run docker_swarm inspect c2
 	[ "$status" -eq 1 ]
 
+	# c1 should have been rescheduled from node-0 to node-1
+	retry 5 1 eval "docker_swarm inspect c1 | grep -q 'node-1'"
+
+	run docker_swarm ps --filter "name=c1"
+	[[ "${lines[1]}" != *"Host Down"* ]]
+
+}
+
+@test "rescheduling node comes back" {
+skip "depends on new node management"
+
+	start_docker_with_busybox 2
+	swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 ${HOSTS[0]},${HOSTS[1]}
+
+	# c1 on node-0 with reschedule=on-node-failure
+	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+	# c2 on node-0 with reschedule=off
+	run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=off busybox sh
+	[ "$status" -eq 0 ]
+	# c3 on node-1
+	run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq  3 ]
+
+	# Make sure containers are running where they should.
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Stop node-0
+	docker_host stop ${DOCKER_CONTAINERS[0]}
+
+	# Wait for Swarm to detect the node failure.
+	retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
+
 	# c1 should have been rescheduled from node-0 to node-1
 	run docker_swarm inspect c1
 	[ "$status" -eq 0 ]
 	[[ "${output}" == *'"Name": "node-1"'* ]]
 
-	run docker_swarm ps --filter "name=c1"
-	[[ "${lines[1]}" != *"Host Down"* ]]
+	# c2 should still be on node-0 since the rescheduling policy was off.
+	run docker_swarm inspect c2
+	[ "$status" -eq 1 ]
 
+	# c3 should still be on node-1 since it wasn't affected
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Restart node-0
+	docker_host start ${DOCKER_CONTAINERS[0]}
+
+	sleep 5
+	run docker_swarm ps -a
+	echo ${output}
+	[ "${#lines[@]}" -eq  3 ]
 }