From dade7081cca2bd2c05e8c4aea2e691fc57ddd212 Mon Sep 17 00:00:00 2001
From: Wim Fournier
Date: Thu, 12 Oct 2017 12:25:36 +0200
Subject: [PATCH] Apply the fix from PR #3195 to 0.6.3

---
Review notes (text between the "---" marker and the diffstat is dropped by
git am and does not become part of the commit message):

* Client.NodeID() and Client.secretNodeID() are new accessors that return
  c.config.Node.ID and c.config.Node.SecretID directly; call sites that used
  c.Node().ID and c.Node().SecretID are switched over to them.
* Client.Datacenter() no longer takes configLock. It reads c.config, as
  Region(), NodeID() and secretNodeID() do, rather than c.configCopy, which
  the pre-image of this function read only while holding configLock.
  NOTE(review): this assumes the ID, SecretID and Datacenter fields of
  c.config.Node are not written after client setup; confirm against the
  rest of client/client.go.
* HostStatsCollector.Collect() now holds hostStatsLock for the whole
  collection instead of only around the final h.hostStats assignment.
* NOTE(review): the post-image blob id on the client/client.go index line
  was not regenerated after the Datacenter() adjustment.

 CHANGELOG.md         |  6 ++++++
 client/client.go     | 34 ++++++++++++++++++++--------------
 client/stats/host.go |  4 ++--
 3 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0b37475b040..06bd6e03202 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.6.4 (October 12, 2017)
+
+BUG FIXES:
+ * client: Fix lock contention that could cause a node to miss a heartbeat and
+   be marked as down [GH-3195]
+
 ## 0.6.3 (September 11, 2017)
 
 BUG FIXES:
diff --git a/client/client.go b/client/client.go
index b72e54a1a9e..aab1196ed5b 100644
--- a/client/client.go
+++ b/client/client.go
@@ -284,7 +284,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic
 	// Start collecting stats
 	go c.emitStats()
 
-	c.logger.Printf("[INFO] client: Node ID %q", c.Node().ID)
+	c.logger.Printf("[INFO] client: Node ID %q", c.NodeID())
 	return c, nil
 }
 
@@ -357,10 +357,7 @@ func (c *Client) Leave() error {
 
 // Datacenter returns the datacenter for the given client
 func (c *Client) Datacenter() string {
-	c.configLock.RLock()
-	dc := c.configCopy.Node.Datacenter
-	c.configLock.RUnlock()
-	return dc
+	return c.config.Node.Datacenter
 }
 
 // Region returns the region for the given client
@@ -368,6 +365,16 @@ func (c *Client) Region() string {
 	return c.config.Region
 }
 
+// NodeID returns the node ID for the given client
+func (c *Client) NodeID() string {
+	return c.config.Node.ID
+}
+
+// secretNodeID returns the secret node ID for the given client
+func (c *Client) secretNodeID() string {
+	return c.config.Node.SecretID
+}
+
 // RPCMajorVersion returns the structs.ApiMajorVersion supported by the
 // client.
 func (c *Client) RPCMajorVersion() int {
@@ -455,7 +462,7 @@ func (c *Client) Stats() map[string]map[string]string {
 	defer c.heartbeatLock.Unlock()
 	stats := map[string]map[string]string{
 		"client": map[string]string{
-			"node_id":         c.Node().ID,
+			"node_id":         c.NodeID(),
 			"known_servers":   c.servers.all().String(),
 			"num_allocations": strconv.Itoa(c.NumAllocs()),
 			"last_heartbeat":  fmt.Sprintf("%v", time.Since(c.lastHeartbeat)),
@@ -1214,7 +1221,7 @@ func (c *Client) updateAllocStatus(alloc *structs.Allocation) {
 	// send the fields that are updatable by the client.
 	stripped := new(structs.Allocation)
 	stripped.ID = alloc.ID
-	stripped.NodeID = c.Node().ID
+	stripped.NodeID = c.NodeID()
 	stripped.TaskStates = alloc.TaskStates
 	stripped.ClientStatus = alloc.ClientStatus
 	stripped.ClientDescription = alloc.ClientDescription
@@ -1291,10 +1298,9 @@ func (c *Client) watchAllocations(updates chan *allocUpdates) {
 	// The request and response for getting the map of allocations that should
 	// be running on the Node to their AllocModifyIndex which is incremented
 	// when the allocation is updated by the servers.
-	n := c.Node()
 	req := structs.NodeSpecificRequest{
-		NodeID:   n.ID,
-		SecretID: n.SecretID,
+		NodeID:   c.NodeID(),
+		SecretID: c.secretNodeID(),
 		QueryOptions: structs.QueryOptions{
 			Region:     c.Region(),
 			AllowStale: true,
@@ -1652,8 +1658,8 @@ func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vcli
 	// DeriveVaultToken of nomad server can take in a set of tasks and
 	// creates tokens for all the tasks.
 	req := &structs.DeriveVaultTokenRequest{
-		NodeID:   c.Node().ID,
-		SecretID: c.Node().SecretID,
+		NodeID:   c.NodeID(),
+		SecretID: c.secretNodeID(),
 		AllocID:  alloc.ID,
 		Tasks:    verifiedTasks,
 		QueryOptions: structs.QueryOptions{
@@ -1863,7 +1869,7 @@ func (c *Client) emitStats() {
 
 // emitHostStats pushes host resource usage stats to remote metrics collection sinks
 func (c *Client) emitHostStats(hStats *stats.HostStats) {
-	nodeID := c.Node().ID
+	nodeID := c.NodeID()
 	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total))
 	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available))
 	metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used))
@@ -1930,7 +1936,7 @@ func (c *Client) emitHostStats(hStats *stats.HostStats) {
 
 // emitClientMetrics emits lower volume client metrics
 func (c *Client) emitClientMetrics() {
-	nodeID := c.Node().ID
+	nodeID := c.NodeID()
 
 	// Emit allocation metrics
 	blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0
diff --git a/client/stats/host.go b/client/stats/host.go
index 98fdc7c1982..41ceb31edf2 100644
--- a/client/stats/host.go
+++ b/client/stats/host.go
@@ -92,6 +92,8 @@ func NewHostStatsCollector(logger *log.Logger, allocDir string) *HostStatsCollec
 
 // Collect collects stats related to resource usage of a host
 func (h *HostStatsCollector) Collect() error {
+	h.hostStatsLock.Lock()
+	defer h.hostStatsLock.Unlock()
 	hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
 
 	// Determine up-time
@@ -131,9 +133,7 @@ func (h *HostStatsCollector) Collect() error {
 	hs.AllocDirStats = h.toDiskStats(usage, nil)
 
 	// Update the collected status object.
-	h.hostStatsLock.Lock()
 	h.hostStats = hs
-	h.hostStatsLock.Unlock()
 
 	return nil
 }