Skip to content

Commit

Permalink
periodic: always reset periodic children status
Browse files Browse the repository at this point in the history
Fixes a bug where Nomad reports negative or incorrect running children
counts for periodic jobs.

The periodic dispatcher derives a child job without reseting the status.
If the periodic job has a `running` status, the derived job will start
as `running` status and transition to `pending`.  Since this is
unexpected transition, the counting in StateStore.setJobSummary gets out of sync and
result in negative/incorrect values.

Note that this only affects periodic jobs after a leader transition.
During the first job registration, the job is added with `pending` or
`""` status. However, after a leader transition, the new leader
repopulates the dispatcher heap with `"running"` status and triggers the
bug.
  • Loading branch information
Mahmood Ali committed Mar 25, 2021
1 parent 7c75696 commit 15fd4f3
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 1 deletion.
4 changes: 3 additions & 1 deletion nomad/periodic.go
Original file line number Diff line number Diff line change
Expand Up @@ -444,12 +444,14 @@ func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) (
}()

// Create a copy of the periodic job, give it a derived ID/Name and make it
// non-periodic.
// non-periodic in initial status
derived = periodicJob.Copy()
derived.ParentID = periodicJob.ID
derived.ID = p.derivedJobID(periodicJob, time)
derived.Name = derived.ID
derived.Periodic = nil
derived.Status = ""
derived.StatusDescription = ""
return
}

Expand Down
33 changes: 33 additions & 0 deletions nomad/periodic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,20 @@ func (m *MockJobEvalDispatcher) LaunchTimes(p *PeriodicDispatch, namespace, pare
return launches, nil
}

func (m *MockJobEvalDispatcher) dispatchedJobs(parent *structs.Job) []*structs.Job {
m.lock.Lock()
defer m.lock.Unlock()

jobs := []*structs.Job{}
for _, job := range m.Jobs {
if job.ParentID == parent.ID && job.Namespace == parent.Namespace {
jobs = append(jobs, job)
}
}

return jobs
}

type times []time.Time

func (t times) Len() int { return len(t) }
Expand Down Expand Up @@ -754,3 +768,22 @@ func TestPeriodicDispatch_RunningChildren_ActiveAllocs(t *testing.T) {
t.Fatalf("RunningChildren should return true")
}
}

// TestPeriodicDispatch_JobEmptyStatus asserts that dispatched
// job will always has an empty status
func TestPeriodicDispatch_JobEmptyStatus(t *testing.T) {
t.Parallel()
p, m := testPeriodicDispatcher(t)

job := testPeriodicJob(time.Now().Add(1 * time.Second))
job.Status = structs.JobStatusRunning

err := p.Add(job)
require.NoError(t, err)

time.Sleep(2 * time.Second)

dispatched := m.dispatchedJobs(job)
require.NotEmpty(t, dispatched)
require.Empty(t, dispatched[0].Status)
}

0 comments on commit 15fd4f3

Please sign in to comment.