From 7a68d7616064d873eee0dbedf33763de43ab3bd9 Mon Sep 17 00:00:00 2001
From: Mahmood Ali <mahmood@hashicorp.com>
Date: Fri, 19 Apr 2019 09:00:24 -0400
Subject: [PATCH 1/4] client: wait for batched driver updated

Here we retain 0.8.7 behavior of waiting for driver fingerprints before
registering a node, with some timeout.  This is needed for system jobs,
as system job scheduling for node occur at node registration, and the
race might mean that a system job may not get placed on the node because
of missing drivers.

The timeout isn't strictly necessary, but raising it to 1 minute as it's
closer to indefinitely blocked than 1 second.  We need to keep the value
high enough to capture as much drivers/devices, but low enough that
doesn't risk blocking too long due to misbehaving plugin.

Fixes https://github.com/hashicorp/nomad/issues/5579
---
 client/client.go       | 12 ++++++++++++
 client/node_updater.go |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/client/client.go b/client/client.go
index d2c12b05daf..ca2a81a5aab 100644
--- a/client/client.go
+++ b/client/client.go
@@ -93,6 +93,11 @@ const (
 	allocSyncRetryIntv = 5 * time.Second
 )
 
+var (
+	// grace period to allow for batch fingerprint processing
+	batchFirstFingerprintsProcessingGrace = batchFirstFingerprintsTimeout + 5*time.Second
+)
+
 // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad
 // Client
 type ClientStatsReporter interface {
@@ -419,6 +424,13 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic
 		return nil, fmt.Errorf("failed to setup vault client: %v", err)
 	}
 
+	// wait until drivers are healthy before restoring or registering with servers
+	select {
+	case <-c.Ready():
+	case <-time.After(batchFirstFingerprintsProcessingGrace):
+		logger.Warn("batched fingerprint, registering node with registered so far")
+	}
+
 	// Restore the state
 	if err := c.restoreState(); err != nil {
 		logger.Error("failed to restore state", "error", err)
diff --git a/client/node_updater.go b/client/node_updater.go
index 4367eeb4f7e..702cfe8c298 100644
--- a/client/node_updater.go
+++ b/client/node_updater.go
@@ -14,7 +14,7 @@ import (
 var (
 	// batchFirstFingerprintsTimeout is the maximum amount of time to wait for
 	// initial fingerprinting to complete before sending a batched Node update
-	batchFirstFingerprintsTimeout = 5 * time.Second
+	batchFirstFingerprintsTimeout = 50 * time.Second
 )
 
 // batchFirstFingerprints waits for the first fingerprint response from all

From 9dcebcd8a3103f43636a4e13d4e6a02feebbc30b Mon Sep 17 00:00:00 2001
From: Mahmood Ali <mahmood@hashicorp.com>
Date: Fri, 19 Apr 2019 09:12:50 -0400
Subject: [PATCH 2/4] client: avoid registering node twice right away

I noticed that `watchNodeUpdates()` almost immediately after
`registerAndHeartbeat()` calls `retryRegisterNode()`, well after 5
seconds.

This call is unnecessary and made debugging a bit harder.  So here, we
ensure that we only re-register node for new node events, not for
initial registration.
---
 client/client.go | 11 +++--------
 client/util.go   | 11 +++++++++++
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/client/client.go b/client/client.go
index ca2a81a5aab..4cdbd3da6df 100644
--- a/client/client.go
+++ b/client/client.go
@@ -1468,13 +1468,7 @@ func (c *Client) watchNodeEvents() {
 	// batchEvents stores events that have yet to be published
 	var batchEvents []*structs.NodeEvent
 
-	// Create and drain the timer
-	timer := time.NewTimer(0)
-	timer.Stop()
-	select {
-	case <-timer.C:
-	default:
-	}
+	timer := stoppedTimer()
 	defer timer.Stop()
 
 	for {
@@ -1930,7 +1924,8 @@ func (c *Client) updateNodeLocked() {
 // it will update the client node copy and re-register the node.
 func (c *Client) watchNodeUpdates() {
 	var hasChanged bool
-	timer := time.NewTimer(c.retryIntv(nodeUpdateRetryIntv))
+
+	timer := stoppedTimer()
 	defer timer.Stop()
 
 	for {
diff --git a/client/util.go b/client/util.go
index e3662a303eb..af3bd75400b 100644
--- a/client/util.go
+++ b/client/util.go
@@ -3,6 +3,7 @@ package client
 import (
 	"fmt"
 	"math/rand"
+	"time"
 
 	"github.com/hashicorp/nomad/nomad/structs"
 )
@@ -63,3 +64,13 @@ func shuffleStrings(list []string) {
 		list[i], list[j] = list[j], list[i]
 	}
 }
+
+// stoppedTimer returns a timer that's stopped and wouldn't fire until
+// it's reset
+func stoppedTimer() *time.Timer {
+	timer := time.NewTimer(0)
+	if !timer.Stop() {
+		<-timer.C
+	}
+	return timer
+}

From 9a2f46f3324c66ea5a8b4f7d4fbb086ff2b96e30 Mon Sep 17 00:00:00 2001
From: Mahmood Ali <mahmood@hashicorp.com>
Date: Fri, 19 Apr 2019 09:15:25 -0400
Subject: [PATCH 3/4] client: log detected driver health state

Noticed that `detected drivers` log line was misleading - when a driver
doesn't fingerprint before timeout, their health status is empty string
`""` which we would mark as detected.

Now, we log all drivers along with their state to ease driver
fingerprint debugging.
---
 client/pluginmanager/drivermanager/manager.go | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/client/pluginmanager/drivermanager/manager.go b/client/pluginmanager/drivermanager/manager.go
index 6592a2a80b0..2c0474eea91 100644
--- a/client/pluginmanager/drivermanager/manager.go
+++ b/client/pluginmanager/drivermanager/manager.go
@@ -243,9 +243,18 @@ func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.Ca
 	}
 
 	var mu sync.Mutex
-	var availDrivers []string
+	driversByStatus := map[drivers.HealthState][]string{}
+
 	var wg sync.WaitGroup
 
+	recordDriver := func(name string, lastHeath drivers.HealthState) {
+		mu.Lock()
+		defer mu.Unlock()
+
+		updated := append(driversByStatus[lastHeath], name)
+		driversByStatus[lastHeath] = updated
+	}
+
 	// loop through instances and wait for each to finish initial fingerprint
 	m.instancesMu.RLock()
 	for n, i := range m.instances {
@@ -253,16 +262,13 @@ func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.Ca
 		go func(name string, instance *instanceManager) {
 			defer wg.Done()
 			instance.WaitForFirstFingerprint(ctx)
-			if instance.getLastHealth() != drivers.HealthStateUndetected {
-				mu.Lock()
-				availDrivers = append(availDrivers, name)
-				mu.Unlock()
-			}
+			recordDriver(name, instance.getLastHealth())
 		}(n, i)
 	}
 	m.instancesMu.RUnlock()
 	wg.Wait()
-	m.logger.Debug("detected drivers", "drivers", availDrivers)
+
+	m.logger.Debug("detected drivers", "drivers", driversByStatus)
 }
 
 func (m *manager) loadReattachConfigs() error {

From 8041b0cbe2e17f82fe27fb4f471db0e894c75fcb Mon Sep 17 00:00:00 2001
From: Mahmood Ali <mahmood@hashicorp.com>
Date: Fri, 19 Apr 2019 09:31:43 -0400
Subject: [PATCH 4/4] clarify cryptic log line

---
 client/client.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/client/client.go b/client/client.go
index 4cdbd3da6df..6b66bd5a31b 100644
--- a/client/client.go
+++ b/client/client.go
@@ -428,7 +428,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic
 	select {
 	case <-c.Ready():
 	case <-time.After(batchFirstFingerprintsProcessingGrace):
-		logger.Warn("batched fingerprint, registering node with registered so far")
+		logger.Warn("batch fingerprint operation timed out; proceeding to register with fingerprinted plugins so far")
 	}
 
 	// Restore the state