From f2efc36c4c0c7336a94b80c8d5b7e34200c8f22b Mon Sep 17 00:00:00 2001 From: DarshanBPatel Date: Tue, 24 Sep 2024 17:48:08 +0530 Subject: [PATCH] Revert "replace stale peer management with timestemp approch" This reverts commit 731fe6cdf037a03754e309109205ff1c4e567556. --- waku/node/peer_manager/peer_manager.nim | 48 +++++++++++----------- waku/node/peer_manager/waku_peer_store.nim | 6 +-- waku/waku_core/peers.nim | 5 +-- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/waku/node/peer_manager/peer_manager.nim b/waku/node/peer_manager/peer_manager.nim index 5aa6c44366..87506e7a4c 100644 --- a/waku/node/peer_manager/peer_manager.nim +++ b/waku/node/peer_manager/peer_manager.nim @@ -47,6 +47,9 @@ const # TODO: Make configurable DefaultDialTimeout* = chronos.seconds(10) + # Max attempts before removing the peer + MaxFailedAttempts = 5 + # Time to wait before attempting to dial again is calculated as: # initialBackoffInSec*(backoffFactor^(failedAttempts-1)) # 120s, 480s, 1920, 7680s @@ -68,14 +71,13 @@ const # Max peers that we allow from the same IP DefaultColocationLimit* = 5 - Threshold = chronos.hours(2) - type PeerManager* = ref object of RootObj switch*: Switch peerStore*: PeerStore wakuMetadata*: WakuMetadata initialBackoffInSec*: int backoffFactor*: int + maxFailedAttempts*: int storage*: PeerStorage serviceSlots*: Table[string, RemotePeerInfo] maxRelayPeers*: int @@ -182,8 +184,9 @@ proc connectRelay*( if not pm.peerStore.hasPeer(peerId, WakuRelayCodec): pm.addPeer(peer) + let failedAttempts = pm.peerStore[NumberFailedConnBook][peerId] trace "Connecting to relay peer", - wireAddr = peer.addrs, peerId = peerId + wireAddr = peer.addrs, peerId = peerId, failedAttempts = failedAttempts var deadline = sleepAsync(dialTimeout) let workfut = pm.switch.connect(peerId, peer.addrs) @@ -205,21 +208,20 @@ proc connectRelay*( waku_peers_dials.inc(labelValues = ["successful"]) waku_node_conns_initiated.inc(labelValues = [source]) - if pm.peerStore[FirstFailedConnBook].contains(peerId): - discard pm.peerStore[FirstFailedConnBook].del(peerId) - + pm.peerStore[NumberFailedConnBook][peerId] = 0 + return true # Dial failed + pm.peerStore[NumberFailedConnBook][peerId] = + pm.peerStore[NumberFailedConnBook][peerId] + 1 pm.peerStore[LastFailedConnBook][peerId] = Moment.init(getTime().toUnix, Second) pm.peerStore[ConnectionBook][peerId] = CannotConnect - if not pm.peerStore[FirstFailedConnBook].contains(peerId): - pm.peerStore[FirstFailedConnBook][peerId] = Moment.init(getTime().toUnix, Second) trace "Connecting relay peer failed", peerId = peerId, - reason = reasonFailed - + reason = reasonFailed, + failedAttempts = pm.peerStore[NumberFailedConnBook][peerId] waku_peers_dials.inc(labelValues = [reasonFailed]) return false @@ -313,17 +315,14 @@ proc canBeConnected*(pm: PeerManager, peerId: PeerId): bool = # Returns if we can try to connect to this peer, based on past failed attempts # It uses an exponential backoff. Each connection attempt makes us # wait more before trying again. + let failedAttempts = pm.peerStore[NumberFailedConnBook][peerId] # if it never errored, we can try to connect - if not pm.peerStore[FirstFailedConnBook].contains(peerId): + if failedAttempts == 0: return true - - # if it's break threshold then do not reconnect - let - disconnectTime = pm.peerStore[FirstFailedConnBook][peerId] - currentTime = Moment.init(getTime().toUnix, Second) - - if (currentTime - disconnectTime) > Threshold: + + # if there are too many failed attempts, do not reconnect + if failedAttempts >= pm.maxFailedAttempts: return false # If it errored we wait an exponential backoff from last connection @@ -331,7 +330,7 @@ proc canBeConnected*(pm: PeerManager, peerId: PeerId): bool = let now = Moment.init(getTime().toUnix, Second) let lastFailed = pm.peerStore[LastFailedConnBook][peerId] let backoff = - calculateBackoff(pm.initialBackoffInSec, pm.backoffFactor, 5) + calculateBackoff(pm.initialBackoffInSec, pm.backoffFactor, failedAttempts) return now >= (lastFailed + backoff) @@ -462,6 +461,7 @@ proc new*( storage: PeerStorage = nil, initialBackoffInSec = InitialBackoffInSec, backoffFactor = BackoffFactor, + maxFailedAttempts = MaxFailedAttempts, colocationLimit = DefaultColocationLimit, shardedPeerManagement = false, ): PeerManager {.gcsafe.} = @@ -493,7 +493,7 @@ proc new*( maxRelayPeersValue = maxConnections - (maxConnections div 5) # attempt to calculate max backoff to prevent potential overflows or unreasonably high values - let backoff = calculateBackoff(initialBackoffInSec, backoffFactor, 5) + let backoff = calculateBackoff(initialBackoffInSec, backoffFactor, maxFailedAttempts) if backoff.weeks() > 1: error "Max backoff time can't be over 1 week", maxBackoff = backoff raise newException(Defect, "Max backoff time can't be over 1 week") @@ -510,6 +510,7 @@ proc new*( outRelayPeersTarget: outRelayPeersTarget, inRelayPeersTarget: maxRelayPeersValue - outRelayPeersTarget, maxRelayPeers: maxRelayPeersValue, + maxFailedAttempts: maxFailedAttempts, colocationLimit: colocationLimit, shardedPeerManagement: shardedPeerManagement, ) @@ -836,11 +837,8 @@ proc prunePeerStore*(pm: PeerManager) = var peersToPrune: HashSet[PeerId] # prune failed connections - for peerId in pm.peerStore[FirstFailedConnBook].book.keys: - let - disconnectTime = pm.peerStore[FirstFailedConnBook][peerId] - currentTime = Moment.init(getTime().toUnix, Second) - if (currentTime - disconnectTime) < Threshold: + for peerId, count in pm.peerStore[NumberFailedConnBook].book.pairs: + if count < pm.maxFailedAttempts: continue if peersToPrune.len >= pruningCount: diff --git a/waku/node/peer_manager/waku_peer_store.nim b/waku/node/peer_manager/waku_peer_store.nim index 2a8d244c40..09d6ebc658 100644 --- a/waku/node/peer_manager/waku_peer_store.nim +++ b/waku/node/peer_manager/waku_peer_store.nim @@ -23,8 +23,8 @@ type # Last failed connection attemp timestamp LastFailedConnBook* = ref object of PeerBook[Moment] - # First failed connection attemp timestamp - FirstFailedConnBook* = ref object of PeerBook[Moment] + # Failed connection attempts + NumberFailedConnBook* = ref object of PeerBook[int] # Keeps track of when peers were disconnected in Unix timestamps DisconnectBook* = ref object of PeerBook[int64] @@ -67,7 +67,7 @@ proc get*(peerStore: PeerStore, peerId: PeerID): RemotePeerInfo = origin: peerStore[SourceBook][peerId], direction: peerStore[DirectionBook][peerId], lastFailedConn: peerStore[LastFailedConnBook][peerId], - firstFailedConn: peerStore[FirstFailedConnBook][peerId], + numberFailedConn: peerStore[NumberFailedConnBook][peerId], ) proc getWakuProtos*(peerStore: PeerStore): seq[string] = diff --git a/waku/waku_core/peers.nim b/waku/waku_core/peers.nim index dadde1f891..07ad3bc4c7 100644 --- a/waku/waku_core/peers.nim +++ b/waku/waku_core/peers.nim @@ -53,10 +53,9 @@ type RemotePeerInfo* = ref object disconnectTime*: int64 origin*: PeerOrigin direction*: PeerDirection - firstFailedConn*: Moment lastFailedConn*: Moment - - + numberFailedConn*: int + func `$`*(remotePeerInfo: RemotePeerInfo): string = $remotePeerInfo.peerId