From b9b8a09a69d205851b169a48a45024d9292c83d3 Mon Sep 17 00:00:00 2001
From: Steven Allen
Date: Wed, 24 Apr 2019 09:02:51 -0700
Subject: [PATCH] dial: limit error size

Instead of storing _every_ error, store at most 32 errors (plus a "too many errors" error).

Helps address https://github.com/libp2p/go-libp2p-swarm/issues/119
---
 swarm_dial.go | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/swarm_dial.go b/swarm_dial.go
index d4b31552..e7fc5e9b 100644
--- a/swarm_dial.go
+++ b/swarm_dial.go
@@ -43,6 +43,9 @@ var (
 	// ErrNoTransport is returned when we don't know a transport for the
 	// given multiaddr.
 	ErrNoTransport = errors.New("no transport for protocol")
+
+	// ErrTooManyErrors is returned as the final error when we encounter too many errors when dialing a peer.
+	ErrTooManyErrors = errors.New("too many errors")
 )
 
 // DialAttempts governs how many times a goroutine will try to dial a given peer.
@@ -58,6 +61,9 @@ const ConcurrentFdDials = 160
 // per peer
 const DefaultPerPeerRateLimit = 8
 
+// maxDialErrors is the maximum number of dial errors we record
+const maxDialErrors = 32
+
 // dialbackoff is a struct used to avoid over-dialing the same, dead peers.
 // Whenever we totally time out on a peer (all three attempts), we add them
 // to dialbackoff. Then, whenevers goroutines would _wait_ (dialsync), they
@@ -362,6 +368,22 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.
 	respch := make(chan dialResult)
 	var dialErrors *multierror.Error
 
+	// aggregateErr aggregates returned errors into a single multi-error but
+	// limits the number of errors we record.
+	aggregateErr := func(err error) {
+		if dialErrors == nil || dialErrors.Len() < maxDialErrors {
+			// keep the error
+		} else if dialErrors.Len() == maxDialErrors {
+			// Make the last error "too many errors".
+			err = ErrTooManyErrors
+		} else {
+			// Already have too many errors.
+			return
+		}
+
+		dialErrors = multierror.Append(dialErrors, err)
+	}
+
 	defer s.limiter.clearAllPeerDials(p)
 
 	var active int
@@ -379,7 +401,7 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.
 			if resp.Err != nil {
 				// Errors are normal, lots of dials will fail
 				log.Infof("got error on dial: %s", resp.Err)
-				dialErrors = multierror.Append(dialErrors, resp.Err)
+				aggregateErr(resp.Err)
 			} else if resp.Conn != nil {
 				return resp.Conn, nil
 			}
@@ -410,7 +432,7 @@ func (s *Swarm) dialAddrs(ctx context.Context, p peer.ID, remoteAddrs <-chan ma.
 			if resp.Err != nil {
 				// Errors are normal, lots of dials will fail
 				log.Infof("got error on dial: %s", resp.Err)
-				dialErrors = multierror.Append(dialErrors, resp.Err)
+				aggregateErr(resp.Err)
 			} else if resp.Conn != nil {
 				return resp.Conn, nil
 			}