From 7a68d7616064d873eee0dbedf33763de43ab3bd9 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 19 Apr 2019 09:00:24 -0400 Subject: [PATCH 1/4] client: wait for batched driver updated Here we retain 0.8.7 behavior of waiting for driver fingerprints before registering a node, with some timeout. This is needed for system jobs, as system job scheduling for node occur at node registration, and the race might mean that a system job may not get placed on the node because of missing drivers. The timeout isn't strictly necessary, but raising it to 1 minute as it's closer to indefinitely blocked than 1 second. We need to keep the value high enough to capture as much drivers/devices, but low enough that doesn't risk blocking too long due to misbehaving plugin. Fixes https://github.com/hashicorp/nomad/issues/5579 --- client/client.go | 12 ++++++++++++ client/node_updater.go | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/client/client.go b/client/client.go index d2c12b05daf..ca2a81a5aab 100644 --- a/client/client.go +++ b/client/client.go @@ -93,6 +93,11 @@ const ( allocSyncRetryIntv = 5 * time.Second ) +var ( + // grace period to allow for batch fingerprint processing + batchFirstFingerprintsProcessingGrace = batchFirstFingerprintsTimeout + 5*time.Second +) + // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad // Client type ClientStatsReporter interface { @@ -419,6 +424,13 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic return nil, fmt.Errorf("failed to setup vault client: %v", err) } + // wait until drivers are healthy before restoring or registering with servers + select { + case <-c.Ready(): + case <-time.After(batchFirstFingerprintsProcessingGrace): + logger.Warn("batched fingerprint, registering node with registered so far") + } + // Restore the state if err := c.restoreState(); err != nil { logger.Error("failed to restore state", "error", err) diff --git a/client/node_updater.go b/client/node_updater.go index 4367eeb4f7e..702cfe8c298 100644 --- a/client/node_updater.go +++ b/client/node_updater.go @@ -14,7 +14,7 @@ import ( var ( // batchFirstFingerprintsTimeout is the maximum amount of time to wait for // initial fingerprinting to complete before sending a batched Node update - batchFirstFingerprintsTimeout = 5 * time.Second + batchFirstFingerprintsTimeout = 50 * time.Second ) // batchFirstFingerprints waits for the first fingerprint response from all From 9dcebcd8a3103f43636a4e13d4e6a02feebbc30b Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 19 Apr 2019 09:12:50 -0400 Subject: [PATCH 2/4] client: avoid registering node twice right away I noticed that `watchNodeUpdates()` almost immediately after `registerAndHeartbeat()` calls `retryRegisterNode()`, well after 5 seconds. This call is unnecessary and made debugging a bit harder. So here, we ensure that we only re-register node for new node events, not for initial registration. --- client/client.go | 11 +++-------- client/util.go | 11 +++++++++++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/client/client.go b/client/client.go index ca2a81a5aab..4cdbd3da6df 100644 --- a/client/client.go +++ b/client/client.go @@ -1468,13 +1468,7 @@ func (c *Client) watchNodeEvents() { // batchEvents stores events that have yet to be published var batchEvents []*structs.NodeEvent - // Create and drain the timer - timer := time.NewTimer(0) - timer.Stop() - select { - case <-timer.C: - default: - } + timer := stoppedTimer() defer timer.Stop() for { @@ -1930,7 +1924,8 @@ func (c *Client) updateNodeLocked() { // it will update the client node copy and re-register the node. func (c *Client) watchNodeUpdates() { var hasChanged bool - timer := time.NewTimer(c.retryIntv(nodeUpdateRetryIntv)) + + timer := stoppedTimer() defer timer.Stop() for { diff --git a/client/util.go b/client/util.go index e3662a303eb..af3bd75400b 100644 --- a/client/util.go +++ b/client/util.go @@ -3,6 +3,7 @@ package client import ( "fmt" "math/rand" + "time" "github.com/hashicorp/nomad/nomad/structs" ) @@ -63,3 +64,13 @@ func shuffleStrings(list []string) { list[i], list[j] = list[j], list[i] } } + +// stoppedTimer returns a timer that's stopped and wouldn't fire until +// it's reset +func stoppedTimer() *time.Timer { + timer := time.NewTimer(0) + if !timer.Stop() { + <-timer.C + } + return timer +} From 9a2f46f3324c66ea5a8b4f7d4fbb086ff2b96e30 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 19 Apr 2019 09:15:25 -0400 Subject: [PATCH 3/4] client: log detected driver health state Noticed that `detected drivers` log line was misleading - when a driver doesn't fingerprint before timeout, their health status is empty string `""` which we would mark as detected. Now, we log all drivers along with their state to ease driver fingerprint debugging. --- client/pluginmanager/drivermanager/manager.go | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/client/pluginmanager/drivermanager/manager.go b/client/pluginmanager/drivermanager/manager.go index 6592a2a80b0..2c0474eea91 100644 --- a/client/pluginmanager/drivermanager/manager.go +++ b/client/pluginmanager/drivermanager/manager.go @@ -243,9 +243,18 @@ func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.Ca } var mu sync.Mutex - var availDrivers []string + driversByStatus := map[drivers.HealthState][]string{} + var wg sync.WaitGroup + recordDriver := func(name string, lastHeath drivers.HealthState) { + mu.Lock() + defer mu.Unlock() + + updated := append(driversByStatus[lastHeath], name) + driversByStatus[lastHeath] = updated + } + // loop through instances and wait for each to finish initial fingerprint m.instancesMu.RLock() for n, i := range m.instances { @@ -253,16 +262,13 @@ func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.Ca go func(name string, instance *instanceManager) { defer wg.Done() instance.WaitForFirstFingerprint(ctx) - if instance.getLastHealth() != drivers.HealthStateUndetected { - mu.Lock() - availDrivers = append(availDrivers, name) - mu.Unlock() - } + recordDriver(name, instance.getLastHealth()) }(n, i) } m.instancesMu.RUnlock() wg.Wait() - m.logger.Debug("detected drivers", "drivers", availDrivers) + + m.logger.Debug("detected drivers", "drivers", driversByStatus) } func (m *manager) loadReattachConfigs() error { From 8041b0cbe2e17f82fe27fb4f471db0e894c75fcb Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 19 Apr 2019 09:31:43 -0400 Subject: [PATCH 4/4] clarify cryptic log line --- client/client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/client.go b/client/client.go index 4cdbd3da6df..6b66bd5a31b 100644 --- a/client/client.go +++ b/client/client.go @@ -428,7 +428,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic select { case <-c.Ready(): case <-time.After(batchFirstFingerprintsProcessingGrace): - logger.Warn("batched fingerprint, registering node with registered so far") + logger.Warn("batch fingerprint operation timed out; proceeding to register with fingerprinted plugins so far") } // Restore the state