From 313c53c06674ad8da5bbae7520517f1ae3552158 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 25 Aug 2020 16:25:29 -0700 Subject: [PATCH 1/2] core: improve job deregister error logging Noticed this error in some production logs, and they were far from helpful. Changes: 1. Include job ID in logs 2. Wrap errors and log once instead of double log lines 3. Test fsm error handling behavior --- nomad/fsm.go | 20 +++++++++----------- nomad/fsm_test.go | 23 +++++++++++++++++++++++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/nomad/fsm.go b/nomad/fsm.go index 22663fe17c8..dda83f49b4d 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -578,7 +578,8 @@ func (n *nomadFSM) applyDeregisterJob(buf []byte, index uint64) interface{} { err := n.handleJobDeregister(index, req.JobID, req.Namespace, req.Purge, tx) if err != nil { - n.logger.Error("deregistering job failed", "error", err) + n.logger.Error("deregistering job failed", + "error", err, "job", req.JobID, "namespace", req.Namespace) return err } @@ -615,7 +616,7 @@ func (n *nomadFSM) applyBatchDeregisterJob(buf []byte, index uint64) interface{} err := n.state.WithWriteTransaction(index, func(tx state.Txn) error { for jobNS, options := range req.Jobs { if err := n.handleJobDeregister(index, jobNS.ID, jobNS.Namespace, options.Purge, tx); err != nil { - n.logger.Error("deregistering job failed", "job", jobNS, "error", err) + n.logger.Error("deregistering job failed", "job", jobNS.ID, "error", err) return err } } @@ -637,18 +638,17 @@ func (n *nomadFSM) applyBatchDeregisterJob(buf []byte, index uint64) interface{} return nil } -// handleJobDeregister is used to deregister a job. +// handleJobDeregister is used to deregister a job. Leaves error logging up to +// caller. func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, purge bool, tx state.Txn) error { // If it is periodic remove it from the dispatcher if err := n.periodicDispatcher.Remove(namespace, jobID); err != nil { - n.logger.Error("periodicDispatcher.Remove failed", "error", err) - return err + return fmt.Errorf("periodicDispatcher.Remove failed: %w", err) } if purge { if err := n.state.DeleteJobTxn(index, namespace, jobID, tx); err != nil { - n.logger.Error("DeleteJob failed", "error", err) - return err + return fmt.Errorf("DeleteJob failed: %w", err) } // We always delete from the periodic launch table because it is possible that @@ -660,8 +660,7 @@ func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, pu ws := memdb.NewWatchSet() current, err := n.state.JobByIDTxn(ws, namespace, jobID, tx) if err != nil { - n.logger.Error("JobByID lookup failed", "error", err) - return err + return fmt.Errorf("JobByID lookup failed: %w", err) } if current == nil { @@ -672,8 +671,7 @@ func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, pu stopped.Stop = true if err := n.state.UpsertJobTxn(index, stopped, tx); err != nil { - n.logger.Error("UpsertJob failed", "error", err) - return err + return fmt.Errorf("UpsertJob failed: %w", err) } } diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index 7144716543e..1a0857178e8 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -712,6 +712,29 @@ func TestFSM_RegisterJob_BadNamespace(t *testing.T) { } } +func TestFSM_DeregisterJob_Error(t *testing.T) { + t.Parallel() + fsm := testFSM(t) + + job := mock.Job() + + deregReq := structs.JobDeregisterRequest{ + JobID: job.ID, + Purge: true, + WriteRequest: structs.WriteRequest{ + Namespace: job.Namespace, + }, + } + buf, err := structs.Encode(structs.JobDeregisterRequestType, deregReq) + require.NoError(t, err) + + resp := fsm.Apply(makeLog(buf)) + require.NotNil(t, resp) + respErr, ok := resp.(error) + require.Truef(t, ok, "expected response to be an error but found: %T", resp) + require.Error(t, respErr) +} + func TestFSM_DeregisterJob_Purge(t *testing.T) { t.Parallel() fsm := testFSM(t) From 599719ba2dc65d843b6795e0f82d5c60e6a15383 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 27 Aug 2020 15:03:37 -0700 Subject: [PATCH 2/2] docs: add #8745 to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 820ee604fef..8fda7c213ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.13.0 (Unreleased) IMPROVEMENTS: + * core: Improved job deregistration error logging. [[GH-8745](https://github.com/hashicorp/nomad/issues/8745)] * api: Added support for cancellation contexts to HTTP API. [[GH-8836](https://github.com/hashicorp/nomad/issues/8836)] BUG FIXES: