From 3ee8d9385a652c4b273d7cc1024b5e52abcfff64 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Thu, 11 Feb 2021 14:00:14 +0100 Subject: [PATCH 01/11] p2p/dnsdisc: fix hot-spin when all trees are empty In the random sync algorithm used by the DNS node iterator, we first pick a random tree and then perform one sync action on that tree. This happens in a loop until any node is found. If no trees contain any nodes, the iterator would just keep starting over in a hot loop, spinning at 100% CPU. The fix is a bit complicated. The iterator now checks if any sync action can be performed on the tree before selecting it. If no action can be performed on any tree, it waits for the closest root record recheck time to arrive and then tries again. --- p2p/dnsdisc/client.go | 65 +++++++++++++++++++++++++++++++------- p2p/dnsdisc/client_test.go | 51 ++++++++++++++++++++++++++++++ p2p/dnsdisc/sync.go | 28 ++++++++++++---- 3 files changed, 127 insertions(+), 17 deletions(-) diff --git a/p2p/dnsdisc/client.go b/p2p/dnsdisc/client.go index b8727848281a..b61aedb2eb71 100644 --- a/p2p/dnsdisc/client.go +++ b/p2p/dnsdisc/client.go @@ -216,9 +216,10 @@ type randomIterator struct { cancelFn context.CancelFunc c *Client - mu sync.Mutex - trees map[string]*clientTree // all trees - lc linkCache // tracks tree dependencies + mu sync.Mutex + trees map[string]*clientTree // all trees + rootWait map[string]*clientTree // trees waiting for root change + lc linkCache // tracks tree dependencies } func (c *Client) newRandomIterator() *randomIterator { @@ -264,7 +265,7 @@ func (it *randomIterator) addTree(url string) error { // nextNode syncs random tree entries until it finds a node. func (it *randomIterator) nextNode() *enode.Node { for { - ct := it.nextTree() + ct := it.pickTree() if ct == nil { return nil } @@ -282,8 +283,8 @@ func (it *randomIterator) nextNode() *enode.Node { } } -// nextTree returns a random tree. -func (it *randomIterator) nextTree() *clientTree { +// pickTree returns a random tree to sync from. +func (it *randomIterator) pickTree() *clientTree { it.mu.Lock() defer it.mu.Unlock() @@ -294,14 +295,56 @@ func (it *randomIterator) nextTree() *clientTree { if len(it.trees) == 0 { return nil } - limit := rand.Intn(len(it.trees)) + + for { + // Find trees that might still have pending items to sync. + // If there are any, pick a random syncable tree. + syncable, disabled := it.syncableTrees() + if len(syncable) > 0 { + return syncable[rand.Intn(len(syncable))] + } + // The client tried all trees, and no sync action can be performed on any of them. + // The only meaningful thing to do now is waiting for any root record to get + // updated. + if !it.waitForRootUpdates(disabled) { + return nil // Iterator was closed. + } + } +} + +// syncableTrees finds trees on which any meaningful sync action can be performed. +func (it *randomIterator) syncableTrees() (syncable, disabled []*clientTree) { for _, ct := range it.trees { - if limit == 0 { - return ct + if ct.canSyncRandom() { + syncable = append(syncable, ct) + } else { + disabled = append(disabled, ct) } - limit-- } - return nil + return syncable, disabled +} + +// waitForRootUpdates waits for the closest scheduled root check time on the given trees. +func (it *randomIterator) waitForRootUpdates(trees []*clientTree) bool { + var nextCheck mclock.AbsTime + now := it.c.clock.Now() + for _, ct := range trees { + check := ct.nextScheduledRootCheck() + if nextCheck == 0 || check < nextCheck { + nextCheck = check + } + } + + sleep := nextCheck.Sub(now) + it.c.cfg.Logger.Debug("DNS iterator waiting for root updates", "sleep", sleep) + timeout := it.c.clock.NewTimer(sleep) + defer timeout.Stop() + select { + case <-timeout.C(): + return true + case <-it.ctx.Done(): + return false // Iterator was closed. + } } // rebuildTrees rebuilds the 'trees' map. diff --git a/p2p/dnsdisc/client_test.go b/p2p/dnsdisc/client_test.go index 6a6705abf208..f40abba0202c 100644 --- a/p2p/dnsdisc/client_test.go +++ b/p2p/dnsdisc/client_test.go @@ -22,6 +22,7 @@ import ( "errors" "math/rand" "reflect" + "runtime" "testing" "time" @@ -231,6 +232,56 @@ func TestIteratorRootRecheckOnFail(t *testing.T) { checkIterator(t, it, nodes) } +// This test checks that the iterator works correctly when the tree is initially empty. +func TestIteratorEmptyTree(t *testing.T) { + var ( + clock = new(mclock.Simulated) + nodes = testNodes(nodesSeed1, 1) + resolver = newMapResolver() + c = NewClient(Config{ + Resolver: resolver, + Logger: testlog.Logger(t, log.LvlTrace), + RecheckInterval: 20 * time.Minute, + RateLimit: 500, + }) + ) + c.clock = clock + tree1, url := makeTestTree("n", nil, nil) + tree2, url := makeTestTree("n", nodes, nil) + resolver.add(tree1.ToTXT("n")) + + // Start the iterator. + node := make(chan *enode.Node) + it, err := c.NewIterator(url) + if err != nil { + t.Fatal(err) + } + go func() { + it.Next() + node <- it.Node() + }() + + // Wait for it to get stuck in slowdownRollover, then modify the root. + clock.WaitForTimers(1) + resolver.add(tree2.ToTXT("n")) + + timeout := time.After(5 * time.Second) + for { + clock.Run(1 * time.Second) + select { + case n := <-node: + if n.ID() != nodes[0].ID() { + t.Fatalf("wrong node returned") + } + return + case <-timeout: + t.Fatal("it.Next() did not unblock within 5s of real time") + default: + runtime.Gosched() + } + } +} + // updateSomeNodes applies ENR updates to some of the given nodes. func updateSomeNodes(keySeed int64, nodes []*enode.Node) { keys := testKeys(nodesSeed1, len(nodes)) diff --git a/p2p/dnsdisc/sync.go b/p2p/dnsdisc/sync.go index 36f02acba67a..073547c90d03 100644 --- a/p2p/dnsdisc/sync.go +++ b/p2p/dnsdisc/sync.go @@ -25,9 +25,9 @@ import ( "github.com/ethereum/go-ethereum/p2p/enode" ) -const ( - rootRecheckFailCount = 5 // update root if this many leaf requests fail -) +// This is the number of consecutive leaf requests that may fail before +// we consider re-resolving the tree root. +const rootRecheckFailCount = 5 // clientTree is a full tree being synced. type clientTree struct { @@ -89,13 +89,22 @@ func (ct *clientTree) syncRandom(ctx context.Context) (n *enode.Node, err error) ct.gcLinks() // Sync next random entry in ENR tree. Once every node has been visited, we simply - // start over. This is fine because entries are cached. + // start over. This is fine because entries are cached internally by the client LRU + // also by DNS resolvers. if ct.enrs.done() { ct.enrs = newSubtreeSync(ct.c, ct.loc, ct.root.eroot, false) } return ct.syncNextRandomENR(ctx) } +// canSyncRandom checks if any meaningful action can be performed by syncRandom. +func (ct *clientTree) canSyncRandom() bool { + // Note: the check for non-zero leaf count is very important here. + // If we're done syncing all nodes, and no leaves were found, the tree + // is empty and we can't use it for sync. + return ct.rootUpdateDue() || !ct.links.done() || !ct.enrs.done() || ct.enrs.leaves != 0 +} + // gcLinks removes outdated links from the global link cache. GC runs once // when the link sync finishes. func (ct *clientTree) gcLinks() { @@ -184,10 +193,14 @@ func (ct *clientTree) updateRoot(ctx context.Context) error { // rootUpdateDue returns true when a root update is needed. func (ct *clientTree) rootUpdateDue() bool { tooManyFailures := ct.leafFailCount > rootRecheckFailCount - scheduledCheck := ct.c.clock.Now().Sub(ct.lastRootCheck) > ct.c.cfg.RecheckInterval + scheduledCheck := ct.c.clock.Now() >= ct.nextScheduledRootCheck() return ct.root == nil || tooManyFailures || scheduledCheck } +func (ct *clientTree) nextScheduledRootCheck() mclock.AbsTime { + return ct.lastRootCheck.Add(ct.c.cfg.RecheckInterval) +} + // slowdownRootUpdate applies a delay to root resolution if is tried // too frequently. This avoids busy polling when the client is offline. // Returns true if the timeout passed, false if sync was canceled. @@ -218,10 +231,11 @@ type subtreeSync struct { root string missing []string // missing tree node hashes link bool // true if this sync is for the link tree + leaves int // counter of synced leaves } func newSubtreeSync(c *Client, loc *linkEntry, root string, link bool) *subtreeSync { - return &subtreeSync{c, loc, root, []string{root}, link} + return &subtreeSync{c, loc, root, []string{root}, link, 0} } func (ts *subtreeSync) done() bool { @@ -253,10 +267,12 @@ func (ts *subtreeSync) resolveNext(ctx context.Context, hash string) (entry, err if ts.link { return nil, errENRInLinkTree } + ts.leaves++ case *linkEntry: if !ts.link { return nil, errLinkInENRTree } + ts.leaves++ case *branchEntry: ts.missing = append(ts.missing, e.children...) } From 1a30f38859325a4c7d85abbb0cb9069b24403c22 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Thu, 11 Feb 2021 14:10:39 +0100 Subject: [PATCH 02/11] p2p/dnsdisc: improve test --- p2p/dnsdisc/client_test.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/p2p/dnsdisc/client_test.go b/p2p/dnsdisc/client_test.go index f40abba0202c..b125e590bc89 100644 --- a/p2p/dnsdisc/client_test.go +++ b/p2p/dnsdisc/client_test.go @@ -261,13 +261,16 @@ func TestIteratorEmptyTree(t *testing.T) { node <- it.Node() }() - // Wait for it to get stuck in slowdownRollover, then modify the root. + // Wait for the client to get stuck in waitForRootUpdates. clock.WaitForTimers(1) + + // Now update the root. resolver.add(tree2.ToTXT("n")) + // Wait for it to pick up the root change. timeout := time.After(5 * time.Second) for { - clock.Run(1 * time.Second) + clock.Run(20 * time.Second) select { case n := <-node: if n.ID() != nodes[0].ID() { From 21ce379b50df37697b0624131688a6ba404b61e0 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Thu, 11 Feb 2021 14:14:54 +0100 Subject: [PATCH 03/11] p2p/dnsdisc: improve test --- p2p/dnsdisc/client_test.go | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/p2p/dnsdisc/client_test.go b/p2p/dnsdisc/client_test.go index b125e590bc89..7b41776e6e03 100644 --- a/p2p/dnsdisc/client_test.go +++ b/p2p/dnsdisc/client_test.go @@ -22,7 +22,6 @@ import ( "errors" "math/rand" "reflect" - "runtime" "testing" "time" @@ -268,20 +267,14 @@ func TestIteratorEmptyTree(t *testing.T) { resolver.add(tree2.ToTXT("n")) // Wait for it to pick up the root change. - timeout := time.After(5 * time.Second) - for { - clock.Run(20 * time.Second) - select { - case n := <-node: - if n.ID() != nodes[0].ID() { - t.Fatalf("wrong node returned") - } - return - case <-timeout: - t.Fatal("it.Next() did not unblock within 5s of real time") - default: - runtime.Gosched() + clock.Run(c.cfg.RecheckInterval) + select { + case n := <-node: + if n.ID() != nodes[0].ID() { + t.Fatalf("wrong node returned") } + case <-time.After(5 * time.Second): + t.Fatal("it.Next() did not unblock within 5s of real time") } } From e46d6194adbd7e26cd755e5344317773eeb7b4eb Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Thu, 11 Feb 2021 14:15:53 +0100 Subject: [PATCH 04/11] p2p/dnsdisc: fix lint issue --- p2p/dnsdisc/client_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/p2p/dnsdisc/client_test.go b/p2p/dnsdisc/client_test.go index 7b41776e6e03..741bee4230be 100644 --- a/p2p/dnsdisc/client_test.go +++ b/p2p/dnsdisc/client_test.go @@ -246,7 +246,7 @@ func TestIteratorEmptyTree(t *testing.T) { ) c.clock = clock tree1, url := makeTestTree("n", nil, nil) - tree2, url := makeTestTree("n", nodes, nil) + tree2, _ := makeTestTree("n", nodes, nil) resolver.add(tree1.ToTXT("n")) // Start the iterator. From 66cc15147314643a81faa8506216cbdd8ccfaaf6 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Thu, 11 Feb 2021 14:20:48 +0100 Subject: [PATCH 05/11] p2p/dnsdisc: remove rootWait --- p2p/dnsdisc/client.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/p2p/dnsdisc/client.go b/p2p/dnsdisc/client.go index b61aedb2eb71..6b926b00a570 100644 --- a/p2p/dnsdisc/client.go +++ b/p2p/dnsdisc/client.go @@ -216,10 +216,9 @@ type randomIterator struct { cancelFn context.CancelFunc c *Client - mu sync.Mutex - trees map[string]*clientTree // all trees - rootWait map[string]*clientTree // trees waiting for root change - lc linkCache // tracks tree dependencies + mu sync.Mutex + trees map[string]*clientTree // all trees + lc linkCache // tracks tree dependencies } func (c *Client) newRandomIterator() *randomIterator { From 6e211bc54df6da3612fb764f3ed1d1c2a5313640 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Thu, 11 Feb 2021 14:47:31 +0100 Subject: [PATCH 06/11] p2p/dnsdisc: fix shutdown issue --- p2p/dnsdisc/client.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/p2p/dnsdisc/client.go b/p2p/dnsdisc/client.go index 6b926b00a570..55be1b3290b6 100644 --- a/p2p/dnsdisc/client.go +++ b/p2p/dnsdisc/client.go @@ -238,10 +238,10 @@ func (it *randomIterator) Node() *enode.Node { // Close closes the iterator. func (it *randomIterator) Close() { + it.cancelFn() + it.mu.Lock() defer it.mu.Unlock() - - it.cancelFn() it.trees = nil } From e0d0029b14faa6250915950d34aaa86ecc8a1656 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Thu, 11 Feb 2021 14:47:50 +0100 Subject: [PATCH 07/11] p2p/dnsdisc: log tree URL that iterator is waiting for --- p2p/dnsdisc/client.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/p2p/dnsdisc/client.go b/p2p/dnsdisc/client.go index 55be1b3290b6..ff75a161cb48 100644 --- a/p2p/dnsdisc/client.go +++ b/p2p/dnsdisc/client.go @@ -325,17 +325,18 @@ func (it *randomIterator) syncableTrees() (syncable, disabled []*clientTree) { // waitForRootUpdates waits for the closest scheduled root check time on the given trees. func (it *randomIterator) waitForRootUpdates(trees []*clientTree) bool { - var nextCheck mclock.AbsTime - now := it.c.clock.Now() + var nextCheck = mclock.AbsTime(-1) + var checkTree *clientTree for _, ct := range trees { check := ct.nextScheduledRootCheck() - if nextCheck == 0 || check < nextCheck { + if nextCheck == -1 || check < nextCheck { nextCheck = check + checkTree = ct } } - sleep := nextCheck.Sub(now) - it.c.cfg.Logger.Debug("DNS iterator waiting for root updates", "sleep", sleep) + sleep := nextCheck.Sub(it.c.clock.Now()) + it.c.cfg.Logger.Debug("DNS iterator waiting for root updates", "sleep", sleep, "tree", checkTree) timeout := it.c.clock.NewTimer(sleep) defer timeout.Stop() select { From e4b3b5b198b69cbcdf4c5a5b6b9ff122c2042d15 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Thu, 11 Feb 2021 15:13:17 +0100 Subject: [PATCH 08/11] p2p/dnsdisc: tweak log message again --- p2p/dnsdisc/client.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/p2p/dnsdisc/client.go b/p2p/dnsdisc/client.go index ff75a161cb48..44f65b36c99b 100644 --- a/p2p/dnsdisc/client.go +++ b/p2p/dnsdisc/client.go @@ -325,13 +325,13 @@ func (it *randomIterator) syncableTrees() (syncable, disabled []*clientTree) { // waitForRootUpdates waits for the closest scheduled root check time on the given trees. func (it *randomIterator) waitForRootUpdates(trees []*clientTree) bool { + var checkTree string var nextCheck = mclock.AbsTime(-1) - var checkTree *clientTree for _, ct := range trees { check := ct.nextScheduledRootCheck() if nextCheck == -1 || check < nextCheck { nextCheck = check - checkTree = ct + checkTree = ct.loc.domain } } From b2a9c66609b87f3a57e5318dd3eab773cc31a9d1 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Thu, 11 Feb 2021 15:37:07 +0100 Subject: [PATCH 09/11] p2p/dnsdisc: reword comments --- p2p/dnsdisc/client.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/p2p/dnsdisc/client.go b/p2p/dnsdisc/client.go index 44f65b36c99b..75795f2a7998 100644 --- a/p2p/dnsdisc/client.go +++ b/p2p/dnsdisc/client.go @@ -291,9 +291,6 @@ func (it *randomIterator) pickTree() *clientTree { it.rebuildTrees() it.lc.changed = false } - if len(it.trees) == 0 { - return nil - } for { // Find trees that might still have pending items to sync. @@ -302,9 +299,11 @@ func (it *randomIterator) pickTree() *clientTree { if len(syncable) > 0 { return syncable[rand.Intn(len(syncable))] } - // The client tried all trees, and no sync action can be performed on any of them. - // The only meaningful thing to do now is waiting for any root record to get - // updated. + if len(disabled) == 0 { + return nil // Iterator was closed. + } + // No sync action can be performed on any tree right now. The only meaningful + // thing to do is waiting for any root record to get updated. if !it.waitForRootUpdates(disabled) { return nil // Iterator was closed. } From 1278699efa9ea407f0f4afe13aad6d80b9046fd1 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Tue, 16 Feb 2021 14:07:14 +0100 Subject: [PATCH 10/11] p2p/dnsdisc: re-use tree lists in syncableTrees --- p2p/dnsdisc/client.go | 50 +++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/p2p/dnsdisc/client.go b/p2p/dnsdisc/client.go index 75795f2a7998..ce5f1d9926f6 100644 --- a/p2p/dnsdisc/client.go +++ b/p2p/dnsdisc/client.go @@ -217,8 +217,11 @@ type randomIterator struct { c *Client mu sync.Mutex - trees map[string]*clientTree // all trees lc linkCache // tracks tree dependencies + trees map[string]*clientTree // all trees + // buffers for syncableTrees + syncableList []*clientTree + disabledList []*clientTree } func (c *Client) newRandomIterator() *randomIterator { @@ -287,39 +290,50 @@ func (it *randomIterator) pickTree() *clientTree { it.mu.Lock() defer it.mu.Unlock() + // Rebuild the trees map if any links have changed. if it.lc.changed { it.rebuildTrees() it.lc.changed = false } for { - // Find trees that might still have pending items to sync. - // If there are any, pick a random syncable tree. - syncable, disabled := it.syncableTrees() - if len(syncable) > 0 { - return syncable[rand.Intn(len(syncable))] - } - if len(disabled) == 0 { - return nil // Iterator was closed. - } - // No sync action can be performed on any tree right now. The only meaningful - // thing to do is waiting for any root record to get updated. - if !it.waitForRootUpdates(disabled) { - return nil // Iterator was closed. + canSync, trees := it.syncableTrees() + switch { + case canSync: + // Pick a random tree. + return trees[rand.Intn(len(trees))] + case len(trees) > 0: + // No sync action can be performed on any tree right now. The only meaningful + // thing to do is waiting for any root record to get updated. + if !it.waitForRootUpdates(trees) { + // Iterator was closed while waiting. + return nil + } + default: + // There are no trees left, the iterator was closed. + return nil } } } // syncableTrees finds trees on which any meaningful sync action can be performed. -func (it *randomIterator) syncableTrees() (syncable, disabled []*clientTree) { +func (it *randomIterator) syncableTrees() (canSync bool, trees []*clientTree) { + // Resize tree lists. + it.syncableList = it.syncableList[:0] + it.disabledList = it.disabledList[:0] + + // Partition them into the two lists. for _, ct := range it.trees { if ct.canSyncRandom() { - syncable = append(syncable, ct) + it.syncableList = append(it.syncableList, ct) } else { - disabled = append(disabled, ct) + it.disabledList = append(it.disabledList, ct) } } - return syncable, disabled + if len(it.syncableList) > 0 { + return true, it.syncableList + } + return false, it.disabledList } // waitForRootUpdates waits for the closest scheduled root check time on the given trees. From aaf03bb6e7b757da1474c45b6aa0517b3d139f94 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Tue, 16 Feb 2021 14:09:05 +0100 Subject: [PATCH 11/11] p2p/dnsdisc: improve safety in waitForRootUpdates --- p2p/dnsdisc/client.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/p2p/dnsdisc/client.go b/p2p/dnsdisc/client.go index ce5f1d9926f6..d3e8111ab53b 100644 --- a/p2p/dnsdisc/client.go +++ b/p2p/dnsdisc/client.go @@ -338,18 +338,18 @@ func (it *randomIterator) syncableTrees() (canSync bool, trees []*clientTree) { // waitForRootUpdates waits for the closest scheduled root check time on the given trees. func (it *randomIterator) waitForRootUpdates(trees []*clientTree) bool { - var checkTree string - var nextCheck = mclock.AbsTime(-1) + var minTree *clientTree + var nextCheck mclock.AbsTime for _, ct := range trees { check := ct.nextScheduledRootCheck() - if nextCheck == -1 || check < nextCheck { + if minTree == nil || check < nextCheck { + minTree = ct nextCheck = check - checkTree = ct.loc.domain } } sleep := nextCheck.Sub(it.c.clock.Now()) - it.c.cfg.Logger.Debug("DNS iterator waiting for root updates", "sleep", sleep, "tree", checkTree) + it.c.cfg.Logger.Debug("DNS iterator waiting for root updates", "sleep", sleep, "tree", minTree.loc.domain) timeout := it.c.clock.NewTimer(sleep) defer timeout.Stop() select {