This repository has been archived by the owner on Feb 1, 2021. It is now read-only.

Commit

don't error on 1st rescheduling error
Signed-off-by: Victor Vieux <[email protected]>
vieux committed Jan 4, 2016
1 parent 0931466 · commit edf2b30
Showing 2 changed files with 153 additions and 25 deletions.
cluster/watchdog.go (61 changes: 37 additions & 24 deletions)

@@ -7,52 +7,65 @@ import (
 )
 
 type Watchdog struct {
-	l       sync.Mutex
+	sync.Mutex
 	cluster Cluster
 }
 
 // Handle cluster callbacks
 func (w *Watchdog) Handle(e *Event) error {
 	// Skip non-swarm events.
 	if e.From != "swarm" {
 		return nil
 	}
 
 	switch e.Status {
 	case "engine_disconnect":
 		go w.rescheduleContainers(e.Engine)
+	default:
+		go w.reschedulePendingContainers()
 	}
 
 	return nil
 }
 
+func (w *Watchdog) reschedulePendingContainers() {
+	w.Lock()
+	defer w.Unlock()
+
+	log.Infof("rescheduling pending containers")
+
+	for _, c := range w.cluster.Containers() {
+		if !c.Engine.IsHealthy() {
+			w.rescheduleContainer(c)
+		}
+	}
+}
+
 func (w *Watchdog) rescheduleContainers(e *Engine) {
-	w.l.Lock()
-	defer w.l.Unlock()
+	w.Lock()
+	defer w.Unlock()
 
 	log.Infof("Node %s failed - rescheduling containers", e.ID)
 	for _, c := range e.Containers() {
-		// Skip containers which don't have an "on-node-failure" reschedule policy.
-		if c.Config.ReschedulePolicy() != "on-node-failure" {
-			log.Debugf("Skipping rescheduling of %s based on rescheduling policy", c.Id)
-			continue
-		}
+		w.rescheduleContainer(c)
+	}
+}
 
-		// Remove the container from the dead engine. If we don't, then both
-		// the old and new one will show up in docker ps.
-		// We have to do this before calling `CreateContainer`, otherwise it
-		// will abort because the name is already taken.
-		c.Engine.removeContainer(c)
+func (w *Watchdog) rescheduleContainer(c *Container) {
+	// Skip containers which don't have an "on-node-failure" reschedule policy.
+	if c.Config.ReschedulePolicy() != "on-node-failure" {
+		log.Debugf("Skipping rescheduling of %s based on rescheduling policy", c.Id)
+		return
+	}
 
-		newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name)
+	// Remove the container from the dead engine. If we don't, then both
+	// the old and new one will show up in docker ps.
+	// We have to do this before calling `CreateContainer`, otherwise it
+	// will abort because the name is already taken.
+	c.Engine.removeContainer(c)
 
-		if err != nil {
-			log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
-			continue
-		}
+	newContainer, err := w.cluster.CreateContainer(c.Config, c.Info.Name, nil)
 
+	if err != nil {
+		log.Errorf("Failed to reschedule container %s (Swarm ID: %s): %v", c.Id, c.Config.SwarmID(), err)
+		c.Engine.AddContainer(c)
+	} else {
 		log.Infof("Rescheduled container %s from %s to %s as %s (Swarm ID: %s)", c.Id, c.Engine.ID, newContainer.Engine.ID, newContainer.Id, c.Config.SwarmID())
 
 		if c.Info.State.Running {
 			if err := newContainer.Start(); err != nil {
 				log.Errorf("Failed to start rescheduled container %s", newContainer.Id)
[remaining lines of this hunk collapsed in the original view]
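
What the commit title means in the code above: before this change, a failed `CreateContainer` during rescheduling only logged an error and moved on, and since the container had already been removed from its dead engine, that first error was final. Now the container is put back with `c.Engine.AddContainer(c)`, and any later swarm event that is not an `engine_disconnect` falls into the new `default:` branch of `Handle`, which runs `reschedulePendingContainers` and retries every container still attached to an unhealthy engine. A minimal sketch of that retry flow, written as if it sat in the same `cluster` package as watchdog.go (the function name `exampleRetryFlow` and the variable `failedEngine` are illustrative; the types and methods are the ones in the diff):

// Illustrative only: how a container that failed its first rescheduling
// attempt gets a second chance after this commit.
func exampleRetryFlow(w *Watchdog, failedEngine *Engine) {
	// engine_disconnect: each container on the dead engine gets one attempt.
	// If CreateContainer fails (for example because of an unsatisfiable
	// affinity), the container is re-attached to its unhealthy engine
	// instead of being lost.
	w.rescheduleContainers(failedEngine)

	// Any later swarm event takes the new default branch in Handle, which
	// sweeps all containers and retries the ones still parked on an
	// unhealthy engine.
	w.reschedulePendingContainers()
}

The "rescheduling pending" test added below exercises exactly this sequence: the anti-affinity makes the first attempt fail, and removing c2 lets the retry succeed.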
test/integration/rescheduling.bats (117 changes: 116 additions & 1 deletion)

@@ -12,7 +12,7 @@ function teardown() {
 	swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-refresh-retry=1 ${HOSTS[0]},${HOSTS[1]}
 
 	# c1 on node-0 with reschedule=on-node-failure
-	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 -e reschedule:on-node-failure busybox sh
+	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=on-node-failure busybox sh
 	[ "$status" -eq 0 ]
 	# c2 on node-0 with reschedule=off
 	run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label com.docker.swarm.reschedule-policy=off busybox sh
@@ -55,3 +55,118 @@ function teardown() {
 	[ "$status" -eq 0 ]
 	[[ "${output}" == *'"Name": "node-1"'* ]]
 }
+
+@test "rescheduling with constraints" {
+	start_docker_with_busybox 2
+	swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-refresh-retry=1 ${HOSTS[0]},${HOSTS[1]}
+
+	# c1 on node-0 with reschedule=on-node-failure
+	run docker_swarm run -dit --name c1 -e constraint:node==~node-0 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+	# c2 on node-0 with reschedule=off
+	run docker_swarm run -dit --name c2 -e constraint:node==node-0 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+	# c3 on node-1
+	run docker_swarm run -dit --name c3 -e constraint:node==node-1 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq 3 ]
+
+	# Make sure containers are running where they should.
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Stop node-0
+	docker_host stop ${DOCKER_CONTAINERS[0]}
+
+	# Wait for Swarm to detect the node failure.
+	retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
+
+	# c1 should have been rescheduled from node-0 to node-1
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# c2 should still be on node-0 since a node constraint was applied.
+	run docker_swarm inspect c2
+	[ "$status" -eq 1 ]
+
+	# c3 should still be on node-1 since it wasn't affected
+	run docker_swarm inspect c3
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+}
+
+@test "reschedule conflict" {
+	start_docker_with_busybox 2
+	swarm_manage
+
+	run docker_swarm run -dit -e reschedule:on-node-failure --label com.docker.swarm.reschedule-policy=off busybox sh
+	[ "$status" -eq 1 ]
+	echo ${output}
+	[[ "${output}" == *"conflicting reschedule policies"* ]]
+}
+
+@test "rescheduling pending" {
+	start_docker_with_busybox 2
+	swarm_manage --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-refresh-retry=1 ${HOSTS[0]},${HOSTS[1]}
+
+	# c1 on node-0 with reschedule=on-node-failure
+	run docker_swarm run -dit --name c1 -e affinity:container!=c* -e constraint:node==~node-0 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+	# c2 on node-0 with reschedule=off
+	run docker_swarm run -dit --name c2 -e affinity:container!=c* -e constraint:node==node-1 -e reschedule:on-node-failure busybox sh
+	[ "$status" -eq 0 ]
+
+	run docker_swarm ps -q
+	[ "${#lines[@]}" -eq 2 ]
+
+	# Make sure containers are running where they should.
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-0"'* ]]
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	# Stop node-0
+	docker_host stop ${DOCKER_CONTAINERS[0]}
+
+	# Wait for Swarm to detect the node failure.
+	retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
+
+	# c1 cannot be reschedule right now because of the anti affinity
+	run docker_swarm inspect c1
+	[ "$status" -eq 1 ]
+
+	# c2 should still be on node-1 since it wasn't affected
+	run docker_swarm inspect c2
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	run docker_swarm ps --filter "name=c1"
+	[[ "${lines[1]}" == *"Host Down"* ]]
+
+	docker_swarm rm -f c2
+
+	# c2 was removed
+	run docker_swarm inspect c2
+	[ "$status" -eq 1 ]
+
+	# c1 should have been rescheduled from node-0 to node-1
+	run docker_swarm inspect c1
+	[ "$status" -eq 0 ]
+	[[ "${output}" == *'"Name": "node-1"'* ]]
+
+	run docker_swarm ps --filter "name=c1"
+	[[ "${lines[1]}" != *"Host Down"* ]]
+
+}
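
The first hunk above swaps the env form (-e reschedule:on-node-failure) for the label form (--label com.docker.swarm.reschedule-policy=on-node-failure), and the new "reschedule conflict" test expects Swarm to refuse a container that sets the two forms to different values. A rough, self-contained sketch of such a resolution rule (this is not Swarm's actual implementation: the function name, the env-prefix parsing and the "off" default are assumptions; the label key and the error message come from the tests above):

package main

import (
	"errors"
	"fmt"
	"strings"
)

// reschedulePolicy is a hypothetical stand-in for Swarm's policy lookup: it
// accepts either the label or a "reschedule:" env entry and rejects
// conflicting values, which is what the bats test asserts on.
func reschedulePolicy(labels map[string]string, env []string) (string, error) {
	fromLabel := labels["com.docker.swarm.reschedule-policy"]
	fromEnv := ""
	for _, e := range env {
		if strings.HasPrefix(e, "reschedule:") {
			fromEnv = strings.TrimPrefix(e, "reschedule:")
		}
	}
	switch {
	case fromLabel != "" && fromEnv != "" && fromLabel != fromEnv:
		return "", errors.New("conflicting reschedule policies")
	case fromLabel != "":
		return fromLabel, nil
	case fromEnv != "":
		return fromEnv, nil
	default:
		return "off", nil // assumed default when no policy is set
	}
}

func main() {
	// Mirrors the "reschedule conflict" test: the label says off, the env
	// entry says on-node-failure, so the container must be rejected.
	_, err := reschedulePolicy(
		map[string]string{"com.docker.swarm.reschedule-policy": "off"},
		[]string{"reschedule:on-node-failure"},
	)
	fmt.Println(err) // conflicting reschedule policies
}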
