Skip to content

Commit

Permalink
core: improve job deregister error logging
Browse files Browse the repository at this point in the history
Noticed this error in some production logs, and they were far from
helpful. Changes:

1. Include job ID in logs
2. Wrap errors and log once instead of double log lines
3. Test fsm error handling behavior
  • Loading branch information
schmichael committed Aug 27, 2020
1 parent 7fdd643 commit c132879
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 11 deletions.
20 changes: 9 additions & 11 deletions nomad/fsm.go
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,8 @@ func (n *nomadFSM) applyDeregisterJob(buf []byte, index uint64) interface{} {
err := n.handleJobDeregister(index, req.JobID, req.Namespace, req.Purge, tx)

if err != nil {
n.logger.Error("deregistering job failed", "error", err)
n.logger.Error("deregistering job failed",
"error", err, "job", req.JobID, "namespace", req.Namespace)
return err
}

Expand Down Expand Up @@ -615,7 +616,7 @@ func (n *nomadFSM) applyBatchDeregisterJob(buf []byte, index uint64) interface{}
err := n.state.WithWriteTransaction(func(tx state.Txn) error {
for jobNS, options := range req.Jobs {
if err := n.handleJobDeregister(index, jobNS.ID, jobNS.Namespace, options.Purge, tx); err != nil {
n.logger.Error("deregistering job failed", "job", jobNS, "error", err)
n.logger.Error("deregistering job failed", "job", jobNS.ID, "error", err)
return err
}
}
Expand All @@ -637,18 +638,17 @@ func (n *nomadFSM) applyBatchDeregisterJob(buf []byte, index uint64) interface{}
return nil
}

// handleJobDeregister is used to deregister a job.
// handleJobDeregister is used to deregister a job. Leaves error logging up to
// caller.
func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, purge bool, tx state.Txn) error {
// If it is periodic remove it from the dispatcher
if err := n.periodicDispatcher.Remove(namespace, jobID); err != nil {
n.logger.Error("periodicDispatcher.Remove failed", "error", err)
return err
return fmt.Errorf("periodicDispatcher.Remove failed: %w", err)
}

if purge {
if err := n.state.DeleteJobTxn(index, namespace, jobID, tx); err != nil {
n.logger.Error("DeleteJob failed", "error", err)
return err
return fmt.Errorf("DeleteJob failed: %w", err)
}

// We always delete from the periodic launch table because it is possible that
Expand All @@ -660,8 +660,7 @@ func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, pu
ws := memdb.NewWatchSet()
current, err := n.state.JobByIDTxn(ws, namespace, jobID, tx)
if err != nil {
n.logger.Error("JobByID lookup failed", "error", err)
return err
return fmt.Errorf("JobByID lookup failed: %w", err)
}

if current == nil {
Expand All @@ -672,8 +671,7 @@ func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, pu
stopped.Stop = true

if err := n.state.UpsertJobTxn(index, stopped, tx); err != nil {
n.logger.Error("UpsertJob failed", "error", err)
return err
return fmt.Errorf("UpsertJob failed: %w", err)
}
}

Expand Down
23 changes: 23 additions & 0 deletions nomad/fsm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,29 @@ func TestFSM_RegisterJob_BadNamespace(t *testing.T) {
}
}

func TestFSM_DeregisterJob_Error(t *testing.T) {
t.Parallel()
fsm := testFSM(t)

job := mock.Job()

deregReq := structs.JobDeregisterRequest{
JobID: job.ID,
Purge: true,
WriteRequest: structs.WriteRequest{
Namespace: job.Namespace,
},
}
buf, err := structs.Encode(structs.JobDeregisterRequestType, deregReq)
require.NoError(t, err)

resp := fsm.Apply(makeLog(buf))
require.NotNil(t, resp)
respErr, ok := resp.(error)
require.Truef(t, ok, "expected response to be an error but found: %T", resp)
require.Error(t, respErr)
}

func TestFSM_DeregisterJob_Purge(t *testing.T) {
t.Parallel()
fsm := testFSM(t)
Expand Down

0 comments on commit c132879

Please sign in to comment.