WIP remove duplicate on node reconnect
Signed-off-by: Victor Vieux <[email protected]>
vieux committed Jan 4, 2016
1 parent 58f3171 commit 4f796f6
Showing 2 changed files with 76 additions and 4 deletions.
cluster/watchdog.go: 16 additions & 1 deletion
@@ -14,15 +14,28 @@ type Watchdog struct {
// Handle cluster callbacks
func (w *Watchdog) Handle(e *Event) error {
	switch e.Status {
	case "engine_reconnect":
		go w.removeDuplicateContainers(e.Engine)
	case "engine_disconnect":
		go w.rescheduleContainers(e.Engine)
	case "die", "destroy", "kill", "oom", "start", "stop", "rename":
		go w.reschedulePendingContainers()
	}

	return nil
}

// Remove Duplicate containers when a node comes back
func (w *Watchdog) removeDuplicateContainers(e *Engine) {
	for _, container := range e.Containers() {
		if w.cluster.Container(container.Config.SwarmID()) != nil {
			// container already exists in the cluster, destroy it
			e.RemoveContainer(container, true, true)
		}
	}
}
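
The check hinges on the container's Swarm ID: if the cluster already tracks that ID somewhere else, the copy on the reconnecting engine is a leftover from before the outage and is destroyed. Below is a minimal, self-contained sketch of the same idea, assuming simplified hypothetical types (miniContainer, miniEngine, miniCluster) rather than Swarm's real ones; the cluster lookup here deliberately skips the reconnecting engine to keep the toy example self-consistent, which is an assumption, not something this diff shows.

// Hypothetical sketch of the duplicate-removal pass (not Swarm's real types).
package main

import "fmt"

type miniContainer struct {
	ID      string
	SwarmID string // Swarm-level identity that survives rescheduling
}

type miniEngine struct {
	Name       string
	containers map[string]*miniContainer
}

type miniCluster struct {
	engines []*miniEngine
}

// container looks up a Swarm ID on every engine except the one that is
// reconnecting, standing in for the w.cluster.Container(swarmID) lookup above.
func (c *miniCluster) container(swarmID string, skip *miniEngine) *miniContainer {
	for _, e := range c.engines {
		if e == skip {
			continue
		}
		for _, ct := range e.containers {
			if ct.SwarmID == swarmID {
				return ct
			}
		}
	}
	return nil
}

// removeDuplicates drops any container on the reconnecting engine whose
// Swarm ID is already present elsewhere (it was rescheduled during the outage).
func (c *miniCluster) removeDuplicates(reconnected *miniEngine) {
	for id, ct := range reconnected.containers {
		if c.container(ct.SwarmID, reconnected) != nil {
			fmt.Printf("removing duplicate %s (Swarm ID %s) from %s\n", id, ct.SwarmID, reconnected.Name)
			delete(reconnected.containers, id)
		}
	}
}

func main() {
	node0 := &miniEngine{Name: "node-0", containers: map[string]*miniContainer{
		"c1-old": {ID: "c1-old", SwarmID: "swarm-c1"}, // stale copy from before the outage
	}}
	node1 := &miniEngine{Name: "node-1", containers: map[string]*miniContainer{
		"c1-new": {ID: "c1-new", SwarmID: "swarm-c1"}, // c1 was rescheduled here
	}}
	cluster := &miniCluster{engines: []*miniEngine{node0, node1}}

	cluster.removeDuplicates(node0) // node-0 comes back with its stale c1
	fmt.Println("containers left on node-0:", len(node0.containers))
}

Running it drops the stale c1-old record from node-0 and leaves the rescheduled copy on node-1 alone.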

// Try to reschedule containers if possible
func (w *Watchdog) reschedulePendingContainers() {
	w.Lock()
	defer w.Unlock()
@@ -36,6 +49,7 @@ func (w *Watchdog) reschedulePendingContainers() {
	}
}

// Reschedule containers as soon as a node fails
func (w *Watchdog) rescheduleContainers(e *Engine) {
	w.Lock()
	defer w.Unlock()
@@ -63,6 +77,7 @@ func (w *Watchdog) rescheduleContainer(c *Container) {

	if err != nil {
		log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
		// add the container back, so we can retry later
		c.Engine.AddContainer(c)
	} else {
		log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
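
The rescheduleContainer error path above re-registers the container with its original engine ("add the container back, so we can retry later"), so a later cluster event can trigger another attempt instead of the container being lost. Here is a rough, self-contained sketch of that retry pattern, again with hypothetical types and a made-up create step standing in for the real scheduling call:

// Hypothetical sketch of the re-add-on-failure retry pattern (not Swarm's real types).
package main

import (
	"errors"
	"fmt"
)

type container struct {
	ID      string
	SwarmID string
}

type engine struct {
	Name       string
	Healthy    bool
	Containers []*container
}

func (e *engine) add(c *container) {
	e.Containers = append(e.Containers, c)
}

func (e *engine) remove(c *container) {
	for i, x := range e.Containers {
		if x == c {
			e.Containers = append(e.Containers[:i], e.Containers[i+1:]...)
			return
		}
	}
}

// create stands in for starting the container on a candidate engine; it fails
// when the engine is unreachable, like the scheduling error handled above.
func (e *engine) create(c *container) error {
	if !e.Healthy {
		return errors.New("engine unreachable")
	}
	e.add(c)
	return nil
}

// reschedule moves c off its failed engine. On failure the record is put back,
// the analogue of c.Engine.AddContainer(c), so a later event can retry.
func reschedule(c *container, from, target *engine) {
	from.remove(c)
	if err := target.create(c); err != nil {
		fmt.Printf("failed to reschedule %s: %v, will retry later\n", c.ID, err)
		from.add(c)
		return
	}
	fmt.Printf("rescheduled %s from %s to %s\n", c.ID, from.Name, target.Name)
}

func main() {
	failed := &engine{Name: "node-0"}
	unreachable := &engine{Name: "node-1", Healthy: false}
	healthy := &engine{Name: "node-2", Healthy: true}

	c := &container{ID: "c1", SwarmID: "swarm-c1"}
	failed.add(c)

	reschedule(c, failed, unreachable) // fails, c stays registered on node-0
	reschedule(c, failed, healthy)     // succeeds on the next attempt
}
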
test/integration/rescheduling.bats: 60 additions & 3 deletions
@@ -120,7 +120,7 @@ function teardown() {
	swarm_manage

	run docker_swarm run -dit -e reschedule:on-node-failure --label com.docker.swarm.reschedule-policy=off busybox sh
	# [ "$status" -eq 1 ]
	[[ "${output}" == *"conflicting reschedule policies"* ]]
}

@@ -170,12 +170,69 @@ function teardown() {
	run docker_swarm inspect c2
	[ "$status" -eq 1 ]

	# c1 should have been rescheduled from node-0 to node-1
	retry 5 1 eval "docker_swarm inspect c1 | grep -q 'node-1'"

	run docker_swarm ps --filter "name=c1"
	[[ "${lines[1]}" != *"Host Down"* ]]

}

@test "rescheduling node comes back" {
skip "depends on new node management"

start_docker_with_busybox 2
swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 ${HOSTS[0]},${HOSTS[1]}

# c1 on node-0 with reschedule=on-node-failure
run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
[ "$status" -eq 0 ]
# c2 on node-0 with reschedule=off
run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=off busybox sh
[ "$status" -eq 0 ]
# c3 on node-1
run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
[ "$status" -eq 0 ]

run docker_swarm ps -q
[ "${#lines[@]}" -eq 3 ]

# Make sure containers are running where they should.
run docker_swarm inspect c1
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-0"'* ]]
run docker_swarm inspect c2
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-0"'* ]]
run docker_swarm inspect c3
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]

# Stop node-0
docker_host stop ${DOCKER_CONTAINERS[0]}

# Wait for Swarm to detect the node failure.
retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"

# c1 should have been rescheduled from node-0 to node-1
run docker_swarm inspect c1
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]

run docker_swarm ps --filter "name=c1"
[[ "${lines[1]}" != *"Host Down"* ]]
# c2 should still be on node-0 since the rescheduling policy was off.
run docker_swarm inspect c2
[ "$status" -eq 1 ]

# c3 should still be on node-1 since it wasn't affected
run docker_swarm inspect c3
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]

# Restart node-0
docker_host start ${DOCKER_CONTAINERS[0]}

sleep 5
run docker_swarm ps -a
echo ${output}
[ "${#lines[@]}" -eq 3 ]
}
