Skip to content

Commit

Permalink
Merge pull request #7944 from hashicorp/b-health-checks-after-task-he…
Browse files Browse the repository at this point in the history
…alth

Allocs are healthy if service checks get healthy before task health
  • Loading branch information
Mahmood Ali authored May 13, 2020
2 parents cf47153 + 22b65f2 commit 3cb5551
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 3 deletions.
13 changes: 10 additions & 3 deletions client/allochealth/tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) {
}

// setCheckHealth is used to mark the checks as either healthy or unhealthy.
func (t *Tracker) setCheckHealth(healthy bool) {
// returns true if health is propagated and no more health monitoring is needed
func (t *Tracker) setCheckHealth(healthy bool) bool {
t.l.Lock()
defer t.l.Unlock()

Expand All @@ -220,7 +221,7 @@ func (t *Tracker) setCheckHealth(healthy bool) {

// Only signal if we are healthy and so is the tasks
if !t.checksHealthy {
return
return false
}

select {
Expand All @@ -230,6 +231,7 @@ func (t *Tracker) setCheckHealth(healthy bool) {

// Shutdown the tracker
t.cancelFn()
return true
}

// markAllocStopped is used to mark the allocation as having stopped.
Expand Down Expand Up @@ -379,7 +381,12 @@ OUTER:
allocReg = newAllocReg
}
case <-healthyTimer.C:
t.setCheckHealth(true)
if t.setCheckHealth(true) {
// final health set and propagated
return
}
// tasks are unhealthy, reset and wait until all is healthy
primed = false
}

if allocReg == nil {
Expand Down
109 changes: 109 additions & 0 deletions client/allochealth/tracker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,3 +227,112 @@ func TestTracker_Healthy_IfBothTasksAndConsulChecksAreHealthy(t *testing.T) {
require.Fail(t, "expected a health status")
}
}

// TestTracker_Checks_Healthy_Before_TaskHealth asserts that we mark an alloc
// healthy, if the checks pass before task health pass
func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) {
t.Parallel()

alloc := mock.Alloc()
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
task := alloc.Job.TaskGroups[0].Tasks[0]

// new task starting unhealthy, without services
task2 := task.Copy()
task2.Name = task2.Name + "2"
task2.Services = nil
alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)

// Synthesize running alloc and tasks
alloc.ClientStatus = structs.AllocClientStatusRunning
alloc.TaskStates = map[string]*structs.TaskState{
task.Name: {
State: structs.TaskStateRunning,
StartedAt: time.Now(),
},
task2.Name: {
State: structs.TaskStatePending,
},
}

// Make Consul response
check := &consulapi.AgentCheck{
Name: task.Services[0].Checks[0].Name,
Status: consulapi.HealthPassing,
}
taskRegs := map[string]*agentconsul.ServiceRegistrations{
task.Name: {
Services: map[string]*agentconsul.ServiceRegistration{
task.Services[0].Name: {
Service: &consulapi.AgentService{
ID: "foo",
Service: task.Services[0].Name,
},
Checks: []*consulapi.AgentCheck{check},
},
},
},
}

logger := testlog.HCLogger(t)
b := cstructs.NewAllocBroadcaster(logger)
defer b.Close()

// Don't reply on the first call
var called uint64
consul := consul.NewMockConsulServiceClient(t, logger)
consul.AllocRegistrationsFn = func(string) (*agentconsul.AllocRegistration, error) {
if atomic.AddUint64(&called, 1) == 1 {
return nil, nil
}

reg := &agentconsul.AllocRegistration{
Tasks: taskRegs,
}

return reg, nil
}

ctx, cancelFn := context.WithCancel(context.Background())
defer cancelFn()

checkInterval := 10 * time.Millisecond
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
time.Millisecond, true)
tracker.checkLookupInterval = checkInterval
tracker.Start()

// assert that we don't get marked healthy
select {
case <-time.After(4 * checkInterval):
// still unhealthy, good
case h := <-tracker.HealthyCh():
require.Fail(t, "unexpected health event", h)
}
require.False(t, tracker.tasksHealthy)
require.False(t, tracker.checksHealthy)

// now set task to healthy
runningAlloc := alloc.Copy()
runningAlloc.TaskStates = map[string]*structs.TaskState{
task.Name: {
State: structs.TaskStateRunning,
StartedAt: time.Now(),
},
task2.Name: {
State: structs.TaskStateRunning,
StartedAt: time.Now(),
},
}
err := b.Send(runningAlloc)
require.NoError(t, err)

// eventually, it is marked as healthy
select {
case <-time.After(4 * checkInterval):
require.Fail(t, "timed out while waiting for health")
case h := <-tracker.HealthyCh():
require.True(t, h)
}

}

0 comments on commit 3cb5551

Please sign in to comment.