Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes races accessing node and updating it during fingerprinting #4162

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 27 additions & 24 deletions client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,13 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic
return nil, fmt.Errorf("node setup failed: %v", err)
}

fingerprintManager := NewFingerprintManager(c.GetConfig, c.config.Node,
// Store the config copy before restoring state but after it has been
// initialized.
c.configLock.Lock()
c.configCopy = c.config.Copy()
c.configLock.Unlock()

fingerprintManager := NewFingerprintManager(c.GetConfig, c.configCopy.Node,
c.shutdownCh, c.updateNodeFromFingerprint, c.updateNodeFromDriver,
c.logger)

Expand All @@ -271,12 +277,6 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic
// Setup the reserved resources
c.reservePorts()

// Store the config copy before restoring state but after it has been
// initialized.
c.configLock.Lock()
c.configCopy = c.config.Copy()
c.configLock.Unlock()

// Set the preconfigured list of static servers
c.configLock.RLock()
if len(c.configCopy.Servers) > 0 {
Expand Down Expand Up @@ -437,7 +437,7 @@ func (c *Client) Leave() error {
func (c *Client) GetConfig() *config.Config {
c.configLock.Lock()
defer c.configLock.Unlock()
return c.config
return c.configCopy
}

// Datacenter returns the datacenter for the given client
Expand Down Expand Up @@ -726,7 +726,7 @@ func (c *Client) restoreState() error {
watcher := noopPrevAlloc{}

c.configLock.RLock()
ar := NewAllocRunner(c.logger, c.configCopy, c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, watcher)
ar := NewAllocRunner(c.logger, c.configCopy.Copy(), c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, watcher)
c.configLock.RUnlock()

c.allocLock.Lock()
Expand Down Expand Up @@ -963,6 +963,9 @@ func (c *Client) reservePorts() {
for _, net := range reservedIndex {
node.Reserved.Networks = append(node.Reserved.Networks, net)
}

// Make the changes available to the config copy.
c.configCopy = c.config.Copy()
}

// updateNodeFromFingerprint updates the node with the result of
Expand Down Expand Up @@ -1009,10 +1012,10 @@ func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintRespons
}

if nodeHasChanged {
c.updateNode()
c.updateNodeLocked()
}

return c.config.Node
return c.configCopy.Node
}

// updateNodeFromDriver receives either a fingerprint of the driver or its
Expand Down Expand Up @@ -1104,10 +1107,10 @@ func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs.

if hasChanged {
c.config.Node.Drivers[name].UpdateTime = time.Now()
c.updateNode()
c.updateNodeLocked()
}

return c.config.Node
return c.configCopy.Node
}

// resourcesAreEqual is a temporary function to compare whether resources are
Expand Down Expand Up @@ -1752,9 +1755,14 @@ OUTER:
}
}

// updateNode triggers a client to update its node copy if it isn't doing
// updateNode updates the Node copy and triggers the client to send the updated
// Node to the server. This should be done while holding the configLock lock.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe specify that the caller needs to hold the configLock

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already states that.

// so already
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this last comment was meant to be included

func (c *Client) updateNode() {
func (c *Client) updateNodeLocked() {
// Update the config copy.
node := c.config.Node.Copy()
c.configCopy.Node = node

select {
case c.triggerNodeUpdate <- struct{}{}:
// Node update goroutine was released to execute
Expand All @@ -1774,15 +1782,7 @@ func (c *Client) watchNodeUpdates() {
select {
case <-timer.C:
c.logger.Printf("[DEBUG] client: state changed, updating node and re-registering.")

// Update the config copy.
c.configLock.Lock()
node := c.config.Node.Copy()
c.configCopy.Node = node
c.configLock.Unlock()

c.retryRegisterNode()

hasChanged = false
case <-c.triggerNodeUpdate:
if hasChanged {
Expand Down Expand Up @@ -1899,7 +1899,10 @@ func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error
c.configLock.RLock()
prevAlloc := newAllocWatcher(alloc, prevAR, c, c.configCopy, c.logger, migrateToken)

ar := NewAllocRunner(c.logger, c.configCopy, c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, prevAlloc)
// Copy the config since the node can be swapped out as it is being updated.
// The long term fix is to pass in the config and node separately and then
// we don't have to do a copy.
ar := NewAllocRunner(c.logger, c.configCopy.Copy(), c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, prevAlloc)
c.configLock.RUnlock()

// Store the alloc runner.
Expand Down
10 changes: 8 additions & 2 deletions client/fingerprint_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,16 @@ func NewFingerprintManager(getConfig func() *config.Config,
func (fm *FingerprintManager) setNode(node *structs.Node) {
fm.nodeLock.Lock()
defer fm.nodeLock.Unlock()

fm.node = node
}

// getNode returns the current client node
func (fm *FingerprintManager) getNode() *structs.Node {
fm.nodeLock.Lock()
defer fm.nodeLock.Unlock()
return fm.node
}

// Run starts the process of fingerprinting the node. It does an initial pass,
// identifying whitelisted and blacklisted fingerprints/drivers. Then, for
// those which require periotic checking, it starts a periodic process for
Expand Down Expand Up @@ -167,7 +173,7 @@ func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error {
// supported
func (fm *FingerprintManager) setupDrivers(drivers []string) error {
var availDrivers []string
driverCtx := driver.NewDriverContext("", "", fm.getConfig(), fm.node, fm.logger, nil)
driverCtx := driver.NewDriverContext("", "", fm.getConfig(), fm.getNode(), fm.logger, nil)
for _, name := range drivers {

d, err := driver.NewDriver(name, driverCtx)
Expand Down
13 changes: 13 additions & 0 deletions nomad/structs/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package structs

import (
"time"

"github.com/hashicorp/nomad/helper"
)

// DriverInfo is the current state of a single driver. This is updated
Expand All @@ -14,6 +16,17 @@ type DriverInfo struct {
UpdateTime time.Time
}

func (di *DriverInfo) Copy() *DriverInfo {
if di == nil {
return nil
}

cdi := new(DriverInfo)
*cdi = *di
cdi.Attributes = helper.CopyMapStringString(di.Attributes)
return cdi
}

// MergeHealthCheck merges information from a health check for a drier into a
// node's driver info
func (di *DriverInfo) MergeHealthCheck(other *DriverInfo) {
Expand Down
15 changes: 15 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -1461,6 +1461,7 @@ func (n *Node) Copy() *Node {
nn.Meta = helper.CopyMapStringString(nn.Meta)
nn.Events = copyNodeEvents(n.Events)
nn.DrainStrategy = nn.DrainStrategy.Copy()
nn.Drivers = copyNodeDrivers(n.Drivers)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there is a test for Copy for a node, add this to the test assertion.

return nn
}

Expand All @@ -1478,6 +1479,20 @@ func copyNodeEvents(events []*NodeEvent) []*NodeEvent {
return c
}

// copyNodeDrivers is a helper to copy a map of DriverInfo
func copyNodeDrivers(drivers map[string]*DriverInfo) map[string]*DriverInfo {
l := len(drivers)
if l == 0 {
return nil
}

c := make(map[string]*DriverInfo, l)
for driver, info := range drivers {
c[driver] = info.Copy()
}
return c
}

// TerminalStatus returns if the current status is terminal and
// will no longer transition.
func (n *Node) TerminalStatus() bool {
Expand Down