From 731fe6cdf037a03754e309109205ff1c4e567556 Mon Sep 17 00:00:00 2001 From: DarshanBPatel Date: Mon, 23 Sep 2024 22:02:05 +0530 Subject: [PATCH] replace stale peer management with timestamp approach --- waku/node/peer_manager/peer_manager.nim | 48 +++++++++++----------- waku/node/peer_manager/waku_peer_store.nim | 6 +-- waku/waku_core/peers.nim | 5 ++- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/waku/node/peer_manager/peer_manager.nim b/waku/node/peer_manager/peer_manager.nim index 12c6a084f7..079eb8a771 100644 --- a/waku/node/peer_manager/peer_manager.nim +++ b/waku/node/peer_manager/peer_manager.nim @@ -47,9 +47,6 @@ const # TODO: Make configurable DefaultDialTimeout* = chronos.seconds(10) - # Max attempts before removing the peer - MaxFailedAttempts = 5 - # Time to wait before attempting to dial again is calculated as: # initialBackoffInSec*(backoffFactor^(failedAttempts-1)) # 120s, 480s, 1920, 7680s @@ -71,13 +68,14 @@ const # Max peers that we allow from the same IP DefaultColocationLimit* = 5 + Threshold = chronos.hours(2) + type PeerManager* = ref object of RootObj switch*: Switch peerStore*: PeerStore wakuMetadata*: WakuMetadata initialBackoffInSec*: int backoffFactor*: int - maxFailedAttempts*: int storage*: PeerStorage serviceSlots*: Table[string, RemotePeerInfo] maxRelayPeers*: int @@ -184,9 +182,8 @@ proc connectRelay*( if not pm.peerStore.hasPeer(peerId, WakuRelayCodec): pm.addPeer(peer) - let failedAttempts = pm.peerStore[NumberFailedConnBook][peerId] trace "Connecting to relay peer", - wireAddr = peer.addrs, peerId = peerId, failedAttempts = failedAttempts + wireAddr = peer.addrs, peerId = peerId var deadline = sleepAsync(dialTimeout) let workfut = pm.switch.connect(peerId, peer.addrs) @@ -208,20 +205,21 @@ proc connectRelay*( waku_peers_dials.inc(labelValues = ["successful"]) waku_node_conns_initiated.inc(labelValues = [source]) - pm.peerStore[NumberFailedConnBook][peerId] = 0 - + if 
pm.peerStore[FirstFailedConnBook].contains(peerId): + discard pm.peerStore[FirstFailedConnBook].del(peerId) + return true # Dial failed - pm.peerStore[NumberFailedConnBook][peerId] = - pm.peerStore[NumberFailedConnBook][peerId] + 1 pm.peerStore[LastFailedConnBook][peerId] = Moment.init(getTime().toUnix, Second) pm.peerStore[ConnectionBook][peerId] = CannotConnect + if not pm.peerStore[FirstFailedConnBook].contains(peerId): + pm.peerStore[FirstFailedConnBook][peerId] = Moment.init(getTime().toUnix, Second) trace "Connecting relay peer failed", peerId = peerId, - reason = reasonFailed, - failedAttempts = pm.peerStore[NumberFailedConnBook][peerId] + reason = reasonFailed + waku_peers_dials.inc(labelValues = [reasonFailed]) return false @@ -311,14 +309,17 @@ proc canBeConnected*(pm: PeerManager, peerId: PeerId): bool = # Returns if we can try to connect to this peer, based on past failed attempts # It uses an exponential backoff. Each connection attempt makes us # wait more before trying again. 
- let failedAttempts = pm.peerStore[NumberFailedConnBook][peerId] # if it never errored, we can try to connect - if failedAttempts == 0: + if not pm.peerStore[FirstFailedConnBook].contains(peerId): return true - - # if there are too many failed attempts, do not reconnect - if failedAttempts >= pm.maxFailedAttempts: + + # if the first failure is older than Threshold, do not reconnect + let + disconnectTime = pm.peerStore[FirstFailedConnBook][peerId] + currentTime = Moment.init(getTime().toUnix, Second) + + if (currentTime - disconnectTime) > Threshold: return false # If it errored we wait an exponential backoff from last connection @@ -326,7 +327,7 @@ proc canBeConnected*(pm: PeerManager, peerId: PeerId): bool = let now = Moment.init(getTime().toUnix, Second) let lastFailed = pm.peerStore[LastFailedConnBook][peerId] let backoff = - calculateBackoff(pm.initialBackoffInSec, pm.backoffFactor, failedAttempts) + calculateBackoff(pm.initialBackoffInSec, pm.backoffFactor, 5) return now >= (lastFailed + backoff) @@ -457,7 +458,6 @@ proc new*( storage: PeerStorage = nil, initialBackoffInSec = InitialBackoffInSec, backoffFactor = BackoffFactor, - maxFailedAttempts = MaxFailedAttempts, colocationLimit = DefaultColocationLimit, shardedPeerManagement = false, ): PeerManager {.gcsafe.} = @@ -489,7 +489,7 @@ proc new*( maxRelayPeersValue = maxConnections - (maxConnections div 5) # attempt to calculate max backoff to prevent potential overflows or unreasonably high values - let backoff = calculateBackoff(initialBackoffInSec, backoffFactor, maxFailedAttempts) + let backoff = calculateBackoff(initialBackoffInSec, backoffFactor, 5) if backoff.weeks() > 1: error "Max backoff time can't be over 1 week", maxBackoff = backoff raise newException(Defect, "Max backoff time can't be over 1 week") @@ -506,7 +506,6 @@ proc new*( outRelayPeersTarget: outRelayPeersTarget, inRelayPeersTarget: maxRelayPeersValue - outRelayPeersTarget, maxRelayPeers: maxRelayPeersValue, - maxFailedAttempts: maxFailedAttempts, 
colocationLimit: colocationLimit, shardedPeerManagement: shardedPeerManagement, ) @@ -849,8 +848,11 @@ proc prunePeerStore*(pm: PeerManager) = var peersToPrune: HashSet[PeerId] # prune failed connections - for peerId, count in pm.peerStore[NumberFailedConnBook].book.pairs: - if count < pm.maxFailedAttempts: + for peerId in pm.peerStore[FirstFailedConnBook].book.keys: + let + disconnectTime = pm.peerStore[FirstFailedConnBook][peerId] + currentTime = Moment.init(getTime().toUnix, Second) + if (currentTime - disconnectTime) < Threshold: continue if peersToPrune.len >= pruningCount: diff --git a/waku/node/peer_manager/waku_peer_store.nim b/waku/node/peer_manager/waku_peer_store.nim index 09d6ebc658..2a8d244c40 100644 --- a/waku/node/peer_manager/waku_peer_store.nim +++ b/waku/node/peer_manager/waku_peer_store.nim @@ -23,8 +23,8 @@ type # Last failed connection attemp timestamp LastFailedConnBook* = ref object of PeerBook[Moment] - # Failed connection attempts - NumberFailedConnBook* = ref object of PeerBook[int] + # First failed connection attempt timestamp + FirstFailedConnBook* = ref object of PeerBook[Moment] # Keeps track of when peers were disconnected in Unix timestamps DisconnectBook* = ref object of PeerBook[int64] @@ -67,7 +67,7 @@ proc get*(peerStore: PeerStore, peerId: PeerID): RemotePeerInfo = origin: peerStore[SourceBook][peerId], direction: peerStore[DirectionBook][peerId], lastFailedConn: peerStore[LastFailedConnBook][peerId], - numberFailedConn: peerStore[NumberFailedConnBook][peerId], + firstFailedConn: peerStore[FirstFailedConnBook][peerId], ) proc getWakuProtos*(peerStore: PeerStore): seq[string] = diff --git a/waku/waku_core/peers.nim b/waku/waku_core/peers.nim index 07ad3bc4c7..dadde1f891 100644 --- a/waku/waku_core/peers.nim +++ b/waku/waku_core/peers.nim @@ -53,9 +53,10 @@ type RemotePeerInfo* = ref object disconnectTime*: int64 origin*: PeerOrigin direction*: PeerDirection + firstFailedConn*: Moment lastFailedConn*: Moment - numberFailedConn*: 
int - + + func `$`*(remotePeerInfo: RemotePeerInfo): string = $remotePeerInfo.peerId