From 589b1fd2d15127c21caa40ead0179f78d9675d13 Mon Sep 17 00:00:00 2001
From: Sam Stokes <35908605+bitwiseguy@users.noreply.github.com>
Date: Tue, 23 Apr 2024 10:39:48 -0400
Subject: [PATCH] op-node: prevent spamming of reqs for blocks triggered by
 `checkForGapInUnsafeQueue` (#10063)

* Add rangeReqId so we can cancel all associated block req via single flag
* Use crypto/rand instead of math/rand for randomReqId
* Use atomic counter instead of random num for rangeReqId
* Remove redundant peerRequest.complete field. Use inFlight instead
* Add mutex to activeRangeRequests map
* Do not penalize peer for block not found error
* Fix inFlight request clean up logic
* Add test checks for cancelled range request
* Add mutex protection to inFlight map
* Use constants for ResultCode
* Simplify inFlight.get method logic
* Use same struct for activeRangeRequests and inFlight
---
 op-node/p2p/node.go      |   3 +-
 op-node/p2p/sync.go      | 188 ++++++++++++++++++++++++---------------
 op-node/p2p/sync_test.go |  26 ++++--
 3 files changed, 139 insertions(+), 78 deletions(-)

diff --git a/op-node/p2p/node.go b/op-node/p2p/node.go
index b80801f7dbb5..8a0aee3afcf1 100644
--- a/op-node/p2p/node.go
+++ b/op-node/p2p/node.go
@@ -176,7 +176,8 @@ func (n *NodeP2P) RequestL2Range(ctx context.Context, start, end eth.L2BlockRef)
 	if !n.AltSyncEnabled() {
 		return fmt.Errorf("cannot request range %s - %s, req-resp sync is not enabled", start, end)
 	}
-	return n.syncCl.RequestL2Range(ctx, start, end)
+	_, err := n.syncCl.RequestL2Range(ctx, start, end)
+	return err
 }
 
 func (n *NodeP2P) Host() host.Host {
diff --git a/op-node/p2p/sync.go b/op-node/p2p/sync.go
index 6bf0fc242c0f..e006aa4f4160 100644
--- a/op-node/p2p/sync.go
+++ b/op-node/p2p/sync.go
@@ -61,6 +61,13 @@ const (
 	clientErrRateCost = peerServerBlocksBurst
 )
 
+const (
+	ResultCodeSuccess     byte = 0
+	ResultCodeNotFoundErr byte = 1
+	ResultCodeInvalidErr  byte = 2
+	ResultCodeUnknownErr  byte = 3
+)
+
 func PayloadByNumberProtocolID(l2ChainID *big.Int) protocol.ID {
 	return protocol.ID(fmt.Sprintf("/opstack/req/payload_by_number/%d/0", l2ChainID))
 }
@@ -87,6 +94,7 @@ type receivePayloadFn func(ctx context.Context, from peer.ID, payload *eth.Execu
 type rangeRequest struct {
 	start uint64
 	end   eth.L2BlockRef
+	id    uint64
 }
 
 type syncResult struct {
@@ -95,17 +103,44 @@ type syncResult struct {
 }
 
 type peerRequest struct {
-	num uint64
-
-	complete *atomic.Bool
+	num        uint64
+	rangeReqId uint64
 }
 
 type inFlightCheck struct {
-	num uint64
-
+	num    uint64
 	result chan bool
 }
 
+type requestIdMap struct {
+	requests map[uint64]bool
+	mu       sync.Mutex
+}
+
+func newRequestIdMap() *requestIdMap {
+	return &requestIdMap{
+		requests: make(map[uint64]bool),
+	}
+}
+
+func (r *requestIdMap) set(key uint64, value bool) {
+	r.mu.Lock()
+	r.requests[key] = value
+	r.mu.Unlock()
+}
+
+func (r *requestIdMap) get(key uint64) bool {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	return r.requests[key]
+}
+
+func (r *requestIdMap) delete(key uint64) {
+	r.mu.Lock()
+	delete(r.requests, key)
+	r.mu.Unlock()
+}
+
 type SyncClientMetrics interface {
 	ClientPayloadByNumberEvent(num uint64, resultCode byte, duration time.Duration)
 	PayloadsQuarantineSize(n int)
@@ -209,12 +244,14 @@ type SyncClient struct {
 	quarantineByNum map[uint64]common.Hash
 
 	// inFlight requests are not repeated
-	inFlight map[uint64]*atomic.Bool
-
-	requests     chan rangeRequest
-	peerRequests chan peerRequest
+	inFlight *requestIdMap
 
 	inFlightChecks chan inFlightCheck
 
+	rangeRequests       chan rangeRequest
+	activeRangeRequests *requestIdMap
+	rangeReqId          uint64
+
+	peerRequests chan peerRequest
+
 	results chan syncResult
 
 	receivePayload receivePayloadFn
@@ -238,24 +275,26 @@ func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rc
 	ctx, cancel := context.WithCancel(context.Background())
 
 	c := &SyncClient{
-		log:             log,
-		cfg:             cfg,
-		metrics:         metrics,
-		appScorer:       appScorer,
-		newStreamFn:     newStream,
-		payloadByNumber: PayloadByNumberProtocolID(cfg.L2ChainID),
-		peers:           make(map[peer.ID]context.CancelFunc),
-		quarantineByNum: make(map[uint64]common.Hash),
-		inFlight:        make(map[uint64]*atomic.Bool),
-		requests:        make(chan rangeRequest), // blocking
-		peerRequests:    make(chan peerRequest, 128),
-		results:         make(chan syncResult, 128),
-		inFlightChecks:  make(chan inFlightCheck, 128),
-		globalRL:        rate.NewLimiter(globalServerBlocksRateLimit, globalServerBlocksBurst),
-		resCtx:          ctx,
-		resCancel:       cancel,
-		receivePayload:  rcv,
+		log:                 log,
+		cfg:                 cfg,
+		metrics:             metrics,
+		appScorer:           appScorer,
+		newStreamFn:         newStream,
+		payloadByNumber:     PayloadByNumberProtocolID(cfg.L2ChainID),
+		peers:               make(map[peer.ID]context.CancelFunc),
+		quarantineByNum:     make(map[uint64]common.Hash),
+		rangeRequests:       make(chan rangeRequest), // blocking
+		activeRangeRequests: newRequestIdMap(),
+		peerRequests:        make(chan peerRequest, 128),
+		results:             make(chan syncResult, 128),
+		inFlight:            newRequestIdMap(),
+		inFlightChecks:      make(chan inFlightCheck, 128),
+		globalRL:            rate.NewLimiter(globalServerBlocksRateLimit, globalServerBlocksBurst),
+		resCtx:              ctx,
+		resCancel:           cancel,
+		receivePayload:      rcv,
 	}
+
 	// never errors with positive LRU cache size
 	// TODO(CLI-3733): if we had an LRU based on on total payloads size, instead of payload count,
 	// we can safely buffer more data in the happy case.
@@ -313,17 +352,23 @@ func (s *SyncClient) Close() error {
 	return nil
 }
 
-func (s *SyncClient) RequestL2Range(ctx context.Context, start, end eth.L2BlockRef) error {
+func (s *SyncClient) RequestL2Range(ctx context.Context, start, end eth.L2BlockRef) (uint64, error) {
 	if end == (eth.L2BlockRef{}) {
 		s.log.Debug("P2P sync client received range signal, but cannot sync open-ended chain: need sync target to verify blocks through parent-hashes", "start", start)
-		return nil
+		return 0, nil
 	}
+	// Create shared rangeReqId so associated peerRequests can all be cancelled by setting a single flag
+	rangeReqId := atomic.AddUint64(&s.rangeReqId, 1)
+	// need to flag request as active before adding request to s.rangeRequests to avoid race
+	s.activeRangeRequests.set(rangeReqId, true)
+
 	// synchronize requests with the main loop for state access
 	select {
-	case s.requests <- rangeRequest{start: start.Number, end: end}:
-		return nil
+	case s.rangeRequests <- rangeRequest{start: start.Number, end: end, id: rangeReqId}:
+		return rangeReqId, nil
 	case <-ctx.Done():
-		return fmt.Errorf("too busy with P2P results/requests: %w", ctx.Err())
+		s.activeRangeRequests.delete(rangeReqId)
+		return rangeReqId, fmt.Errorf("too busy with P2P results/requests: %w", ctx.Err())
 	}
 }
 
@@ -336,7 +381,7 @@ func (s *SyncClient) mainLoop() {
 	defer s.wg.Done()
 	for {
 		select {
-		case req := <-s.requests:
+		case req := <-s.rangeRequests:
 			ctx, cancel := context.WithTimeout(s.resCtx, maxRequestScheduling)
 			s.onRangeRequest(ctx, req)
 			cancel()
@@ -346,12 +391,7 @@ func (s *SyncClient) mainLoop() {
 			cancel()
 		case check := <-s.inFlightChecks:
 			s.log.Info("Checking in flight", "num", check.num)
-			complete, ok := s.inFlight[check.num]
-			if !ok {
-				check.result <- false
-			} else {
-				check.result <- !complete.Load()
-			}
+			check.result <- s.inFlight.get(check.num)
 		case <-s.resCtx.Done():
 			s.log.Info("stopped P2P req-resp L2 block sync client")
 			return
@@ -377,19 +417,13 @@ func (s *SyncClient) isInFlight(ctx context.Context, num uint64) (bool, error) {
 // onRangeRequest is exclusively called by the main loop, and has thus direct access to the request bookkeeping state.
 // This function transforms requested block ranges into work for each peer.
 func (s *SyncClient) onRangeRequest(ctx context.Context, req rangeRequest) {
+	log := s.log.New("target", req.start, "end", req.end)
+	log.Info("processing L2 range request", "rangeReqId", req.id)
+
 	// add req head to trusted set of blocks
 	s.trusted.Add(req.end.Hash, struct{}{})
 	s.trusted.Add(req.end.ParentHash, struct{}{})
 
-	log := s.log.New("target", req.start, "end", req.end)
-
-	// clean up the completed in-flight requests
-	for k, v := range s.inFlight {
-		if v.Load() {
-			delete(s.inFlight, k)
-		}
-	}
-
 	// Now try to fetch lower numbers than current end, to traverse back towards the updated start.
 	for i := uint64(0); ; i++ {
 		num := req.end.Number - 1 - i
@@ -406,17 +440,17 @@ func (s *SyncClient) onRangeRequest(ctx context.Context, req rangeRequest) {
 			continue
 		}
 
-		if _, ok := s.inFlight[num]; ok {
+		if s.inFlight.get(num) {
 			log.Debug("request still in-flight, not rescheduling sync request", "num", num)
 			continue // request still in flight
 		}
-		pr := peerRequest{num: num, complete: new(atomic.Bool)}
+		pr := peerRequest{num: num, rangeReqId: req.id}
 
-		log.Debug("Scheduling P2P block request", "num", num)
+		log.Debug("Scheduling P2P block request", "num", num, "rangeReqId", req.id)
 		// schedule number
 		select {
 		case s.peerRequests <- pr:
-			s.inFlight[num] = pr.complete
+			s.inFlight.set(num, true)
 		case <-ctx.Done():
 			log.Info("did not schedule full P2P sync range", "current", num, "err", ctx.Err())
 			return
@@ -487,7 +521,7 @@ func (s *SyncClient) onResult(ctx context.Context, res syncResult) {
 	payload := res.payload.ExecutionPayload
 	s.log.Debug("processing p2p sync result", "payload", payload.ID(), "peer", res.peer)
 	// Clean up the in-flight request, we have a result now.
-	delete(s.inFlight, uint64(payload.BlockNumber))
+	s.inFlight.delete(uint64(payload.BlockNumber))
 	// Always put it in quarantine first. If promotion fails because the receiver is too busy, this functions as cache.
 	s.quarantine.Add(payload.BlockHash, res)
 	s.quarantineByNum[uint64(payload.BlockNumber)] = payload.BlockHash
@@ -528,17 +562,39 @@ func (s *SyncClient) peerLoop(ctx context.Context, id peer.ID) {
 		// once the peer is available, wait for a sync request.
 		select {
 		case pr := <-s.peerRequests:
+			if !s.activeRangeRequests.get(pr.rangeReqId) {
+				log.Debug("dropping cancelled p2p sync request", "num", pr.num)
+				s.inFlight.delete(pr.num)
+				continue
+			}
+
 			// We already established the peer is available w.r.t. rate-limiting,
 			// and this is the only loop over this peer, so we can request now.
 			start := time.Now()
+
+			resultCode := ResultCodeSuccess
 			err := s.doRequest(ctx, id, pr.num)
 			if err != nil {
-				// mark as complete if there's an error: we are not sending any result and can complete immediately.
-				pr.complete.Store(true)
+				s.inFlight.delete(pr.num)
 				log.Warn("failed p2p sync request", "num", pr.num, "err", err)
-				s.appScorer.onResponseError(id)
+				resultCode = ResultCodeNotFoundErr
+				sendResponseError := true
+
+				if re, ok := err.(requestResultErr); ok {
+					resultCode = re.ResultCode()
+					if resultCode == ResultCodeNotFoundErr {
+						log.Warn("cancelling p2p sync range request", "rangeReqId", pr.rangeReqId)
+						s.activeRangeRequests.delete(pr.rangeReqId)
+						sendResponseError = false // don't penalize peer for this error
+					}
+				}
+
+				if sendResponseError {
+					s.appScorer.onResponseError(id)
+				}
+
 				// If we hit an error, then count it as many requests.
-				// We'd like to avoid making more requests for a while, to back off.
+				// We'd like to avoid making more requests for a while, so back off.
 				if err := rl.WaitN(ctx, clientErrRateCost); err != nil {
 					return
 				}
@@ -546,16 +602,8 @@ func (s *SyncClient) peerLoop(ctx context.Context, id peer.ID) {
 				log.Debug("completed p2p sync request", "num", pr.num)
 				s.appScorer.onValidResponse(id)
 			}
-			took := time.Since(start)
 
-			resultCode := byte(0)
-			if err != nil {
-				if re, ok := err.(requestResultErr); ok {
-					resultCode = re.ResultCode()
-				} else {
-					resultCode = 1
-				}
-			}
+			took := time.Since(start)
 			s.metrics.ClientPayloadByNumberEvent(pr.num, resultCode, took)
 		case <-ctx.Done():
 			return
@@ -740,22 +788,22 @@ func (srv *ReqRespServer) HandleSyncRequest(ctx context.Context, log log.Logger,
 	req, err := srv.handleSyncRequest(ctx, stream)
 	cancel()
 
-	resultCode := byte(0)
+	resultCode := ResultCodeSuccess
 	if err != nil {
 		log.Warn("failed to serve p2p sync request", "req", req, "err", err)
 		if errors.Is(err, ethereum.NotFound) {
-			resultCode = 1
+			resultCode = ResultCodeNotFoundErr
 		} else if errors.Is(err, invalidRequestErr) {
-			resultCode = 2
+			resultCode = ResultCodeInvalidErr
 		} else {
-			resultCode = 3
+			resultCode = ResultCodeUnknownErr
 		}
 		// try to write error code, so the other peer can understand the reason for failure.
 		_, _ = stream.Write([]byte{resultCode})
 	} else {
 		log.Debug("successfully served sync response", "req", req)
 	}
-	srv.metrics.ServerPayloadByNumberEvent(req, 0, time.Since(start))
+	srv.metrics.ServerPayloadByNumberEvent(req, resultCode, time.Since(start))
 }
 
 var invalidRequestErr = errors.New("invalid request")
diff --git a/op-node/p2p/sync_test.go b/op-node/p2p/sync_test.go
index 0c43778ce8f0..cd44fb521473 100644
--- a/op-node/p2p/sync_test.go
+++ b/op-node/p2p/sync_test.go
@@ -169,7 +169,8 @@ func TestSinglePeerSync(t *testing.T) {
 	defer cl.Close()
 
 	// request to start syncing between 10 and 20
-	require.NoError(t, cl.RequestL2Range(ctx, payloads.getBlockRef(10), payloads.getBlockRef(20)))
+	_, err = cl.RequestL2Range(ctx, payloads.getBlockRef(10), payloads.getBlockRef(20))
+	require.NoError(t, err)
 
 	// and wait for the sync results to come in (in reverse order)
 	for i := uint64(19); i > 10; i-- {
@@ -255,7 +256,8 @@ func TestMultiPeerSync(t *testing.T) {
 	defer clC.Close()
 
 	// request to start syncing between 10 and 90
-	require.NoError(t, clA.RequestL2Range(ctx, payloads.getBlockRef(10), payloads.getBlockRef(90)))
+	_, err = clA.RequestL2Range(ctx, payloads.getBlockRef(10), payloads.getBlockRef(90))
+	require.NoError(t, err)
 
 	// With such large range to request we are going to hit the rate-limits of B and C,
 	// but that means we'll balance the work between the peers.
@@ -270,13 +272,18 @@ func TestMultiPeerSync(t *testing.T) {
 	// now see if B can sync a range, and fill the gap with a re-request
 	bl25, _ := payloads.getPayload(25) // temporarily remove it from the available payloads. This will create a gap
 	payloads.deletePayload(25)
-	require.NoError(t, clB.RequestL2Range(ctx, payloads.getBlockRef(20), payloads.getBlockRef(30)))
+	rangeReqId, err := clB.RequestL2Range(ctx, payloads.getBlockRef(20), payloads.getBlockRef(30))
+
+	require.NoError(t, err)
+	require.True(t, clB.activeRangeRequests.get(rangeReqId), "expecting range request to be active")
+
 	for i := uint64(29); i > 25; i-- {
 		p := <-recvB
 		exp, ok := payloads.getPayload(uint64(p.ExecutionPayload.BlockNumber))
 		require.True(t, ok, "expecting known payload")
 		require.Equal(t, exp.ExecutionPayload.BlockHash, p.ExecutionPayload.BlockHash, "expecting the correct payload")
 	}
+
 	// Wait for the request for block 25 to be made
 	ctx, cancelFunc := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cancelFunc()
@@ -291,12 +298,12 @@ func TestMultiPeerSync(t *testing.T) {
 			t.Fatal("Did not request block 25 in a reasonable time")
 		}
 	}
+
 	// the request for 25 should fail. See:
 	// server: WARN peer requested unknown block by number num=25
 	// client: WARN failed p2p sync request num=25 err="peer failed to serve request with code 1"
 	require.Zero(t, len(recvB), "there is a gap, should not see other payloads yet")
-	// Add back the block
-	payloads.addPayload(bl25)
+
 	// race-condition fix: the request for 25 is expected to error, but is marked as complete in the peer-loop.
 	// But the re-request checks the status in the main loop, and it may thus look like it's still in-flight,
 	// and thus not run the new request.
@@ -306,13 +313,18 @@ func TestMultiPeerSync(t *testing.T) {
 	for {
 		isInFlight, err := clB.isInFlight(ctx, 25)
 		require.NoError(t, err)
+		time.Sleep(time.Second)
 		if !isInFlight {
 			break
 		}
-		time.Sleep(time.Second)
 	}
+	require.False(t, clB.activeRangeRequests.get(rangeReqId), "expecting range request to be cancelled")
+
+	// Add back the block
+	payloads.addPayload(bl25)
 	// And request a range again, 25 is there now, and 21-24 should follow quickly (some may already have been fetched and wait in quarantine)
-	require.NoError(t, clB.RequestL2Range(ctx, payloads.getBlockRef(20), payloads.getBlockRef(26)))
+	_, err = clB.RequestL2Range(ctx, payloads.getBlockRef(20), payloads.getBlockRef(26))
+	require.NoError(t, err)
 	for i := uint64(25); i > 20; i-- {
 		p := <-recvB
 		exp, ok := payloads.getPayload(uint64(p.ExecutionPayload.BlockNumber))