WIP remove duplicate on node reconnect
Signed-off-by: Victor Vieux <[email protected]>
vieux committed Jan 4, 2016
1 parent 58f3171 commit 4f796f6
Showing 2 changed files with 76 additions and 4 deletions.
cluster/watchdog.go: 16 additions & 1 deletion
@@ -14,15 +14,28 @@ type Watchdog struct {
// Handle cluster callbacks
func (w *Watchdog) Handle(e *Event) error {
	switch e.Status {
	case "engine_reconnect":
		go w.removeDuplicateContainers(e.Engine)
	case "engine_disconnect":
		go w.rescheduleContainers(e.Engine)
	case "die", "destroy", "kill", "oom", "start", "stop", "rename":
		go w.reschedulePendingContainers()
	}

	return nil
}

// Remove Duplicate containers when a node comes back
func (w *Watchdog) removeDuplicateContainers(e *Engine) {
	for _, container := range e.Containers() {
		if w.cluster.Container(container.Config.SwarmID()) != nil {
			// container already exists in the cluster, destroy it
			e.RemoveContainer(container, true, true)
		}
	}
}
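
The check hinges on the container's Swarm ID: if the cluster already tracks that ID somewhere else, the copy on the reconnecting engine is a leftover from before the outage and is destroyed. Below is a minimal, self-contained sketch of the same idea, assuming simplified hypothetical types (miniContainer, miniEngine, miniCluster) rather than Swarm's real ones; the cluster lookup here deliberately skips the reconnecting engine to keep the toy example self-consistent, which is an assumption, not something this diff shows.

// Hypothetical sketch of the duplicate-removal pass (not Swarm's real types).
package main

import "fmt"

type miniContainer struct {
	ID      string
	SwarmID string // Swarm-level identity that survives rescheduling
}

type miniEngine struct {
	Name       string
	containers map[string]*miniContainer
}

type miniCluster struct {
	engines []*miniEngine
}

// container looks up a Swarm ID on every engine except the one that is
// reconnecting, standing in for the w.cluster.Container(swarmID) lookup above.
func (c *miniCluster) container(swarmID string, skip *miniEngine) *miniContainer {
	for _, e := range c.engines {
		if e == skip {
			continue
		}
		for _, ct := range e.containers {
			if ct.SwarmID == swarmID {
				return ct
			}
		}
	}
	return nil
}

// removeDuplicates drops any container on the reconnecting engine whose
// Swarm ID is already present elsewhere (it was rescheduled during the outage).
func (c *miniCluster) removeDuplicates(reconnected *miniEngine) {
	for id, ct := range reconnected.containers {
		if c.container(ct.SwarmID, reconnected) != nil {
			fmt.Printf("removing duplicate %s (Swarm ID %s) from %s\n", id, ct.SwarmID, reconnected.Name)
			delete(reconnected.containers, id)
		}
	}
}

func main() {
	node0 := &miniEngine{Name: "node-0", containers: map[string]*miniContainer{
		"c1-old": {ID: "c1-old", SwarmID: "swarm-c1"}, // stale copy from before the outage
	}}
	node1 := &miniEngine{Name: "node-1", containers: map[string]*miniContainer{
		"c1-new": {ID: "c1-new", SwarmID: "swarm-c1"}, // c1 was rescheduled here
	}}
	cluster := &miniCluster{engines: []*miniEngine{node0, node1}}

	cluster.removeDuplicates(node0) // node-0 comes back with its stale c1
	fmt.Println("containers left on node-0:", len(node0.containers))
}

Running it drops the stale c1-old record from node-0 and leaves the rescheduled copy on node-1 alone.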

// Try to reschedule containers if possible
func (w *Watchdog) reschedulePendingContainers() {
	w.Lock()
	defer w.Unlock()
@@ -36,6 +49,7 @@ func (w *Watchdog) reschedulePendingContainers() {
	}
}

// Reschedule containers as soon as a node fails
func (w *Watchdog) rescheduleContainers(e *Engine) {
	w.Lock()
	defer w.Unlock()
@@ -63,6 +77,7 @@ func (w *Watchdog) rescheduleContainer(c *Container) {

	if err != nil {
		log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
		// add the container back, so we can retry later
		c.Engine.AddContainer(c)
	} else {
		log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
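
The rescheduleContainer error path above re-registers the container with its original engine ("add the container back, so we can retry later"), so a later cluster event can trigger another attempt instead of the container being lost. Here is a rough, self-contained sketch of that retry pattern, again with hypothetical types and a made-up create step standing in for the real scheduling call:

// Hypothetical sketch of the re-add-on-failure retry pattern (not Swarm's real types).
package main

import (
	"errors"
	"fmt"
)

type container struct {
	ID      string
	SwarmID string
}

type engine struct {
	Name       string
	Healthy    bool
	Containers []*container
}

func (e *engine) add(c *container) {
	e.Containers = append(e.Containers, c)
}

func (e *engine) remove(c *container) {
	for i, x := range e.Containers {
		if x == c {
			e.Containers = append(e.Containers[:i], e.Containers[i+1:]...)
			return
		}
	}
}

// create stands in for starting the container on a candidate engine; it fails
// when the engine is unreachable, like the scheduling error handled above.
func (e *engine) create(c *container) error {
	if !e.Healthy {
		return errors.New("engine unreachable")
	}
	e.add(c)
	return nil
}

// reschedule moves c off its failed engine. On failure the record is put back,
// the analogue of c.Engine.AddContainer(c), so a later event can retry.
func reschedule(c *container, from, target *engine) {
	from.remove(c)
	if err := target.create(c); err != nil {
		fmt.Printf("failed to reschedule %s: %v, will retry later\n", c.ID, err)
		from.add(c)
		return
	}
	fmt.Printf("rescheduled %s from %s to %s\n", c.ID, from.Name, target.Name)
}

func main() {
	failed := &engine{Name: "node-0"}
	unreachable := &engine{Name: "node-1", Healthy: false}
	healthy := &engine{Name: "node-2", Healthy: true}

	c := &container{ID: "c1", SwarmID: "swarm-c1"}
	failed.add(c)

	reschedule(c, failed, unreachable) // fails, c stays registered on node-0
	reschedule(c, failed, healthy)     // succeeds on the next attempt
}
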
test/integration/rescheduling.bats: 60 additions & 3 deletions
@@ -120,7 +120,7 @@ function teardown() {
	swarm_manage

	run docker_swarm run -dit -e reschedule:on-node-failure --label com.docker.swarm.reschedule-policy=off busybox sh
	# [ "$status" -eq 1 ]
	[[ "${output}" == *"conflicting reschedule policies"* ]]
}

@@ -170,12 +170,69 @@ function teardown() {
	run docker_swarm inspect c2
	[ "$status" -eq 1 ]

	# c1 should have been rescheduled from node-0 to node-1
	retry 5 1 eval "docker_swarm inspect c1 | grep -q 'node-1'"

	run docker_swarm ps --filter "name=c1"
	[[ "${lines[1]}" != *"Host Down"* ]]

}

@test "rescheduling node comes back" {
skip "depends on new node management"

start_docker_with_busybox 2
swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 ${HOSTS[0]},${HOSTS[1]}

# c1 on node-0 with reschedule=on-node-failure
run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
[ "$status" -eq 0 ]
# c2 on node-0 with reschedule=off
run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=off busybox sh
[ "$status" -eq 0 ]
# c3 on node-1
run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
[ "$status" -eq 0 ]

run docker_swarm ps -q
[ "${#lines[@]}" -eq 3 ]

# Make sure containers are running where they should.
run docker_swarm inspect c1
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-0"'* ]]
run docker_swarm inspect c2
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-0"'* ]]
run docker_swarm inspect c3
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]

# Stop node-0
docker_host stop ${DOCKER_CONTAINERS[0]}

# Wait for Swarm to detect the node failure.
retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"

# c1 should have been rescheduled from node-0 to node-1
run docker_swarm inspect c1
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]

run docker_swarm ps --filter "name=c1"
[[ "${lines[1]}" != *"Host Down"* ]]
# c2 should still be on node-0 since the rescheduling policy was off.
run docker_swarm inspect c2
[ "$status" -eq 1 ]

# c3 should still be on node-1 since it wasn't affected
run docker_swarm inspect c3
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]

# Restart node-0
docker_host start ${DOCKER_CONTAINERS[0]}

sleep 5
run docker_swarm ps -a
echo ${output}
[ "${#lines[@]}" -eq 3 ]
}
