From a80643e46d154c620c7c64709d1a7ba4cb7c288f Mon Sep 17 00:00:00 2001
From: Mahmood Ali <mahmood@hashicorp.com>
Date: Sun, 25 Aug 2019 11:03:49 -0400
Subject: [PATCH 1/2] Don't persist allocs of destroyed alloc runners

This fixes a bug where allocs that have been GCed get re-run again after client
is restarted.  A heavily-used client may launch thousands of allocs on startup
and get killed.

The bug is that an alloc runner that gets destroyed due to GC remains in
client alloc runner set.  Periodically, they get persisted until alloc is
gced by server.  During that  time, the client db will contain the alloc
but not its individual tasks status nor completed state.  On client restart,
client assumes that alloc is pending state and re-runs it.

Here, we fix it by ensuring that destroyed alloc runners don't persist any alloc
to the state DB.

This is a short-term fix, as we should consider revamping client state
management.  Storing alloc and task information in non-transaction non-atomic
concurrently while alloc runner is running and potentially changing state is a
recipe for bugs.

Fixes https://github.com/hashicorp/nomad/issues/5984
Related to https://github.com/hashicorp/nomad/pull/5890
---
 client/allocrunner/alloc_runner.go      | 18 ++++++++
 client/allocrunner/alloc_runner_test.go | 58 +++++++++++++++++++++++++
 client/client.go                        |  3 +-
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go
index 39fbe01e694..10bf7290641 100644
--- a/client/allocrunner/alloc_runner.go
+++ b/client/allocrunner/alloc_runner.go
@@ -784,6 +784,24 @@ func (ar *allocRunner) destroyImpl() {
 	ar.destroyedLock.Unlock()
 }
 
+func (ar *allocRunner) PersistState() error {
+	// note that a race exists where a goroutine attempts to persist state
+	// while another kicks off destruction process.
+	// Here, we attempt to reconcile by always deleting alloc bucket after alloc destruction
+	if ar.IsDestroyed() {
+		err := ar.stateDB.DeleteAllocationBucket(ar.id)
+		if err != nil {
+			ar.logger.Warn("failed to delete allocation bucket", "error", err)
+		}
+		return nil
+	}
+
+	// TODO: consider persisting deployment state along with task status.
+	// While we study why only the alloc is persisted, I opted to maintain current
+	// behavior and not risk adding yet more IO calls unnecessarily.
+	return ar.stateDB.PutAllocation(ar.Alloc())
+}
+
 // Destroy the alloc runner by stopping it if it is still running and cleaning
 // up all of its resources.
 //
diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go
index 5a758ce0a50..548602f1ebb 100644
--- a/client/allocrunner/alloc_runner_test.go
+++ b/client/allocrunner/alloc_runner_test.go
@@ -1001,3 +1001,61 @@ func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
 		require.Fail(t, "err: %v", err)
 	})
 }
+
+// TestAllocRunner_PersistState_Destroyed asserts that destroyed allocs don't persist anymore
+func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
+	t.Parallel()
+
+	alloc := mock.BatchAlloc()
+	taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
+
+	conf, cleanup := testAllocRunnerConfig(t, alloc)
+	conf.StateDB = state.NewMemDB(conf.Logger)
+
+	defer cleanup()
+	ar, err := NewAllocRunner(conf)
+	require.NoError(t, err)
+	defer destroy(ar)
+
+	go ar.Run()
+
+	select {
+	case <-ar.WaitCh():
+	case <-time.After(10 * time.Second):
+		require.Fail(t, "timed out waiting for alloc to complete")
+	}
+
+	// test final persisted state upon completion
+	require.NoError(t, ar.PersistState())
+	allocs, _, err := conf.StateDB.GetAllAllocations()
+	require.NoError(t, err)
+	require.Len(t, allocs, 1)
+	require.Equal(t, alloc.ID, allocs[0].ID)
+	_, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
+	require.NoError(t, err)
+	require.Equal(t, structs.TaskStateDead, ts.State)
+
+	// check that DB alloc is empty after destroying AR
+	ar.Destroy()
+	select {
+	case <-ar.DestroyCh():
+	case <-time.After(10 * time.Second):
+		require.Fail(t, "timedout waiting for destruction")
+	}
+
+	allocs, _, err = conf.StateDB.GetAllAllocations()
+	require.NoError(t, err)
+	require.Empty(t, allocs)
+	_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
+	require.NoError(t, err)
+	require.Nil(t, ts)
+
+	// check that DB alloc is empty after persisting state of destroyed AR
+	ar.PersistState()
+	allocs, _, err = conf.StateDB.GetAllAllocations()
+	require.NoError(t, err)
+	require.Empty(t, allocs)
+	_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
+	require.NoError(t, err)
+	require.Nil(t, ts)
+}
diff --git a/client/client.go b/client/client.go
index 622c1514d8c..ce6c1f2afcd 100644
--- a/client/client.go
+++ b/client/client.go
@@ -139,6 +139,7 @@ type AllocRunner interface {
 	ShutdownCh() <-chan struct{}
 	Signal(taskName, signal string) error
 	GetTaskEventHandler(taskName string) drivermanager.EventHandler
+	PersistState() error
 
 	RestartTask(taskName string, taskEvent *structs.TaskEvent) error
 	RestartAll(taskEvent *structs.TaskEvent) error
@@ -1084,7 +1085,7 @@ func (c *Client) saveState() error {
 
 	for id, ar := range runners {
 		go func(id string, ar AllocRunner) {
-			err := c.stateDB.PutAllocation(ar.Alloc())
+			err := ar.PersistState()
 			if err != nil {
 				c.logger.Error("error saving alloc state", "error", err, "alloc_id", id)
 				l.Lock()

From ff3dedd5346fa5b7737a33a19d838d8e928430cf Mon Sep 17 00:00:00 2001
From: Mahmood Ali <mahmood@hashicorp.com>
Date: Mon, 26 Aug 2019 13:45:58 -0400
Subject: [PATCH 2/2] Write to client store while holding lock

Protect against a race where destroying and persist state goroutines
race.

The downside is that the database io operation will run while holding
the lock and may run indefinitely.  The risk of lock being long held is
slow destruction, but slow io has bigger problems.
---
 client/allocrunner/alloc_runner.go | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go
index 10bf7290641..f628ef5017a 100644
--- a/client/allocrunner/alloc_runner.go
+++ b/client/allocrunner/alloc_runner.go
@@ -765,14 +765,15 @@ func (ar *allocRunner) destroyImpl() {
 	// state if Run() ran at all.
 	<-ar.taskStateUpdateHandlerCh
 
-	// Cleanup state db
+	// Mark alloc as destroyed
+	ar.destroyedLock.Lock()
+
+	// Cleanup state db; while holding the lock to avoid
+	// a race periodic PersistState that may resurrect the alloc
 	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
 		ar.logger.Warn("failed to delete allocation state", "error", err)
 	}
 
-	// Mark alloc as destroyed
-	ar.destroyedLock.Lock()
-
 	if !ar.shutdown {
 		ar.shutdown = true
 		close(ar.shutdownCh)
@@ -785,10 +786,10 @@ func (ar *allocRunner) destroyImpl() {
 }
 
 func (ar *allocRunner) PersistState() error {
-	// note that a race exists where a goroutine attempts to persist state
-	// while another kicks off destruction process.
-	// Here, we attempt to reconcile by always deleting alloc bucket after alloc destruction
-	if ar.IsDestroyed() {
+	ar.destroyedLock.Lock()
+	defer ar.destroyedLock.Unlock()
+
+	if ar.destroyed {
 		err := ar.stateDB.DeleteAllocationBucket(ar.id)
 		if err != nil {
 			ar.logger.Warn("failed to delete allocation bucket", "error", err)