From f23ac5f083f92afcd95e0040e385ac675b0e7df3 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Mon, 11 Sep 2017 21:42:10 -0700 Subject: [PATCH 1/2] Non-locked accessors to common Node fields This PR removes locking around commonly accessed node attributes that do not need to be locked. The locking could cause nodes to TTL as the heartbeat code path was acquiring a lock that could be held for an excessively long time. An example of this is when Vault is inaccessible, since the fingerprint is run with a lock held but the Vault fingerprinter makes the API calls with a large timeout. Fixes https://github.com/hashicorp/nomad/issues/2689 --- client/client.go | 38 ++++++++++++++++++++++---------------- client/stats/host.go | 5 +++-- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/client/client.go b/client/client.go index cec4709e1a1..796e72674ac 100644 --- a/client/client.go +++ b/client/client.go @@ -296,7 +296,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic // Start collecting stats go c.emitStats() - c.logger.Printf("[INFO] client: Node ID %q", c.Node().ID) + c.logger.Printf("[INFO] client: Node ID %q", c.NodeID()) return c, nil } @@ -369,9 +369,7 @@ func (c *Client) Leave() error { // Datacenter returns the datacenter for the given client func (c *Client) Datacenter() string { - c.configLock.RLock() - dc := c.configCopy.Node.Datacenter - c.configLock.RUnlock() + dc := c.config.Node.Datacenter return dc } @@ -380,6 +378,16 @@ func (c *Client) Region() string { return c.config.Region } +// NodeID returns the node ID for the given client +func (c *Client) NodeID() string { + return c.config.Node.ID +} + +// secretNodeID returns the secret node ID for the given client +func (c *Client) secretNodeID() string { + return c.config.Node.SecretID +} + // RPCMajorVersion returns the structs.ApiMajorVersion supported by the // client. func (c *Client) RPCMajorVersion() int { @@ -467,7 +475,7 @@ func (c *Client) Stats() map[string]map[string]string { defer c.heartbeatLock.Unlock() stats := map[string]map[string]string{ "client": map[string]string{ - "node_id": c.Node().ID, + "node_id": c.NodeID(), "known_servers": c.servers.all().String(), "num_allocations": strconv.Itoa(c.NumAllocs()), "last_heartbeat": fmt.Sprintf("%v", time.Since(c.lastHeartbeat)), @@ -1159,9 +1167,8 @@ func (c *Client) updateNodeStatus() error { c.heartbeatLock.Lock() defer c.heartbeatLock.Unlock() - node := c.Node() req := structs.NodeUpdateStatusRequest{ - NodeID: node.ID, + NodeID: c.NodeID(), Status: structs.NodeStatusReady, WriteRequest: structs.WriteRequest{Region: c.Region()}, } @@ -1226,7 +1233,7 @@ func (c *Client) updateAllocStatus(alloc *structs.Allocation) { // send the fields that are updatable by the client. stripped := new(structs.Allocation) stripped.ID = alloc.ID - stripped.NodeID = c.Node().ID + stripped.NodeID = c.NodeID() stripped.TaskStates = alloc.TaskStates stripped.ClientStatus = alloc.ClientStatus stripped.ClientDescription = alloc.ClientDescription @@ -1303,10 +1310,9 @@ func (c *Client) watchAllocations(updates chan *allocUpdates) { // The request and response for getting the map of allocations that should // be running on the Node to their AllocModifyIndex which is incremented // when the allocation is updated by the servers. - n := c.Node() req := structs.NodeSpecificRequest{ - NodeID: n.ID, - SecretID: n.SecretID, + NodeID: c.NodeID(), + SecretID: c.secretNodeID(), QueryOptions: structs.QueryOptions{ Region: c.Region(), AllowStale: true, @@ -1664,8 +1670,8 @@ func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vcli // DeriveVaultToken of nomad server can take in a set of tasks and // creates tokens for all the tasks. req := &structs.DeriveVaultTokenRequest{ - NodeID: c.Node().ID, - SecretID: c.Node().SecretID, + NodeID: c.NodeID(), + SecretID: c.secretNodeID(), AllocID: alloc.ID, Tasks: verifiedTasks, QueryOptions: structs.QueryOptions{ @@ -1849,7 +1855,7 @@ DISCOLOOP: func (c *Client) emitStats() { // Assign labels directly before emitting stats so the information expected // is ready - c.baseLabels = []metrics.Label{{Name: "node_id", Value: c.Node().ID}, {Name: "datacenter", Value: c.Node().Datacenter}} + c.baseLabels = []metrics.Label{{Name: "node_id", Value: c.NodeID()}, {Name: "datacenter", Value: c.Datacenter()}} // Start collecting host stats right away and then keep collecting every // collection interval @@ -2027,7 +2033,7 @@ func (c *Client) setGaugeForUptime(hStats *stats.HostStats) { // emitHostStats pushes host resource usage stats to remote metrics collection sinks func (c *Client) emitHostStats() { - nodeID := c.Node().ID + nodeID := c.NodeID() hStats := c.hostStatsCollector.Stats() c.setGaugeForMemoryStats(nodeID, hStats) @@ -2041,7 +2047,7 @@ func (c *Client) emitHostStats() { // emitClientMetrics emits lower volume client metrics func (c *Client) emitClientMetrics() { - nodeID := c.Node().ID + nodeID := c.NodeID() // Emit allocation metrics blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0 diff --git a/client/stats/host.go b/client/stats/host.go index 98fdc7c1982..d528ca784df 100644 --- a/client/stats/host.go +++ b/client/stats/host.go @@ -92,6 +92,9 @@ func NewHostStatsCollector(logger *log.Logger, allocDir string) *HostStatsCollec // Collect collects stats related to resource usage of a host func (h *HostStatsCollector) Collect() error { + h.hostStatsLock.Lock() + defer h.hostStatsLock.Unlock() + hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()} // Determine up-time @@ -131,9 +134,7 @@ func (h *HostStatsCollector) Collect() error { hs.AllocDirStats = h.toDiskStats(usage, nil) // Update the collected status object. - h.hostStatsLock.Lock() h.hostStats = hs - h.hostStatsLock.Unlock() return nil } From 98c47c72d020b6321387ed833a05ca5dbab4b765 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 14 Sep 2017 14:08:17 -0700 Subject: [PATCH 2/2] changelog and feedback --- CHANGELOG.md | 2 ++ client/client.go | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eade1e4a873..21e6947ee56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ BUG FIXES: * core: *Fix restoration of stopped periodic jobs [GH-3201] * api: Fix search handling of jobs with more than four hyphens and case were length could cause lookup error [GH-3203] + * client: Fix lock contention that could cause a node to miss a heartbeat and + be marked as down [GH-3195] ## 0.6.3 (September 11, 2017) diff --git a/client/client.go b/client/client.go index 796e72674ac..b77bf6107c6 100644 --- a/client/client.go +++ b/client/client.go @@ -369,8 +369,7 @@ func (c *Client) Leave() error { // Datacenter returns the datacenter for the given client func (c *Client) Datacenter() string { - dc := c.config.Node.Datacenter - return dc + return c.config.Node.Datacenter } // Region returns the region for the given client