Skip to content

Commit

Permalink
jobs: clear job claim after execution
Browse files Browse the repository at this point in the history
Since cockroachdb#89014 the job system reset a job's claim when transitioning it
from pause-requested to paused and from cancel-requested to
reverting. The job system signals these transitions to the running
Resumer by cancelling the job's context and does not wait for the
resumer to exit. Once the claim is clear, another node can adopt the
job and start running it's OnFailOrCancel callback. As a result,
clearing the context makes it more likely that OnFailOrCancel
executions will overlap with Resume executions.

In general, Jobs need to assume that Resume may still be running while
OnFailOrCancel is called. But, making it more likely isn't in our
interest.

Here, we only clear the lease when we exit the job state machine.
This makes it much more likely that OnFailOrCancel doesn't start until
Resume has returned.

Release note: None
  • Loading branch information
stevendanna committed Nov 9, 2022
1 parent 8dc57ed commit 5993dc6
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 5 deletions.
44 changes: 40 additions & 4 deletions pkg/jobs/adopt.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ func (r *Registry) runJob(
log.Errorf(ctx, "job %d: adoption completed with error %v", job.ID(), err)
}

r.maybeClearLease(job, err)
r.maybeDumpTrace(ctx, resumer, int64(job.ID()), int64(span.TraceID()), err)
r.maybeRecordExecutionFailure(ctx, err, job)
if r.knobs.AfterJobStateMachine != nil {
Expand All @@ -423,18 +424,53 @@ func (r *Registry) runJob(
return err
}

const clearClaimQuery = `
UPDATE system.jobs
SET claim_session_id = NULL, claim_instance_id = NULL
WHERE id = $1
AND claim_session_id = $2
AND claim_instance_id = $3
AND status NOT IN ('` + string(StatusPauseRequested) + `', '` + string(StatusCancelRequested) + `')`

// maybeClearLease clears the claim on the given job, provided that
// the current lease matches our liveness Session.
func (r *Registry) maybeClearLease(job *Job, jobErr error) {
if jobErr == nil {
return
}

// We use the serverCtx here rather than the context from the
// caller since the caller's context may have been canceled.
r.withSession(r.serverCtx, func(ctx context.Context, s sqlliveness.Session) {
err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
if err := txn.SetUserPriority(roachpb.MinUserPriority); err != nil {
return errors.WithAssertionFailure(err)
}
n, err := r.ex.ExecEx(ctx, "clear-job-claim", txn,
sessiondata.InternalExecutorOverride{User: username.NodeUserName()},
clearClaimQuery, job.ID(), s.ID().UnsafeBytes(), r.ID())
if err != nil {
return err
}
log.VEventf(ctx, 2, "cleared leases for %d jobs", n)
return nil
})
if err != nil {
log.Warningf(ctx, "could not clear job claim: %s", err.Error())
}
})
}

const pauseAndCancelUpdate = `
UPDATE system.jobs
SET status =
SET status =
CASE
WHEN status = '` + string(StatusPauseRequested) + `' THEN '` + string(StatusPaused) + `'
WHEN status = '` + string(StatusCancelRequested) + `' THEN '` + string(StatusReverting) + `'
ELSE status
END,
num_runs = 0,
last_run = NULL,
claim_session_id = NULL,
claim_instance_id = NULL
last_run = NULL
WHERE (status IN ('` + string(StatusPauseRequested) + `', '` + string(StatusCancelRequested) + `'))
AND ((claim_session_id = $1) AND (claim_instance_id = $2))
RETURNING id, status
Expand Down
9 changes: 8 additions & 1 deletion pkg/jobs/jobs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3160,9 +3160,16 @@ func TestPauseReason(t *testing.T) {
return n
}
mustNotHaveClaim := func() {
require.Equal(t, 0, countRowsWithClaimInfo())
t.Helper()
testutils.SucceedsSoon(t, func() error {
if countRowsWithClaimInfo() == 0 {
return nil
}
return errors.New("still waiting for claim to clear")
})
}
mustHaveClaim := func() {
t.Helper()
testutils.SucceedsSoon(t, func() error {
if countRowsWithClaimInfo() == 1 {
return nil
Expand Down

0 comments on commit 5993dc6

Please sign in to comment.