fix: determine F3 participants relative to current network name (#12597)
* Investigate intermittent F3 itest failures on CI

Repeat F3 itests on CI to investigate intermittent failures.

* Fix participation lease removal for wrong network

When the manifest changes, depending on timing, newly generated valid leases
can get removed if the sign-message loop attempts to sign messages that
result from progress on the previous network.

Here is an example sequence of events that was causing itests to fail:
 * participants get a lease for network A up to instance 5
 * network A progresses to instance 6
 * manifest changes the network name to B
 * participants get a new lease for network B up to instance 5
 * sign loop receives a message from network A, instance 6
 * `getParticipantsByInstance` lazily removes leases since it only
   checks the instance.
 * the node ends up with no participants and gets stuck.

To fix this (a short sketch of the resulting check follows this list):
 1) check whether the participants being asked for belong to the current
    network, and if not refuse to participate.
 2) check the network name, as well as the instance, when lazily removing
    expired leases.
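
As a rough illustration, here is a minimal, self-contained Go sketch of the
fixed lookup. It condenses the actual change in
chain/lf3/participation_lease.go (shown in the diff below); NetworkName,
lease and leaser are simplified stand-ins for the real lotus/go-f3 types,
and the locking and manifest plumbing are elided.

package main

import (
	"fmt"
	"log"
)

// Simplified stand-ins for the real lotus/go-f3 types.
type NetworkName string

type lease struct {
	network    NetworkName
	toInstance uint64
}

type leaser struct {
	current NetworkName      // network name of the current manifest
	leases  map[uint64]lease // miner ID -> participation lease
}

func (l *leaser) getParticipantsByInstance(network NetworkName, instance uint64) []uint64 {
	// Fix 1: refuse to participate in anything but the current network.
	if l.current != network {
		return nil
	}
	var participants []uint64
	for id, ls := range l.leases {
		switch {
		case ls.network != l.current:
			// Fix 2: lazily drop leases acquired under a prior manifest.
			delete(l.leases, id)
			log.Printf("dropping lease for miner %d: network mismatch", id)
		case instance > ls.toInstance:
			// Lazily drop expired leases, as before.
			delete(l.leases, id)
			log.Printf("dropping lease for miner %d: expired", id)
		default:
			participants = append(participants, id)
		}
	}
	return participants
}

func main() {
	// Replay the failing scenario: the manifest has moved to network B, and a
	// participant holds a fresh lease for B up to instance 5.
	l := &leaser{
		current: "B",
		leases:  map[uint64]lease{123: {network: "B", toInstance: 5}},
	}
	// A late message from old network A at instance 6 no longer wipes the lease.
	fmt.Println(l.getParticipantsByInstance("A", 6)) // []
	fmt.Println(l.getParticipantsByInstance("B", 1)) // [123]
}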

* Add debug capability to F3 itests to print current progress

To aid in debugging failing tests, add an option, disabled by default, to
print the progress of all nodes at every eventual assertion.
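
A hypothetical usage sketch within the itests package (the test name and
wait condition are illustrative; setup, testEnv and waitFor are from the
itests/f3_test.go diff below):

// Hypothetical itest showing the new debug flag; it is disabled by default.
func TestF3ProgressLogging(t *testing.T) {
	e := setup(t, 100*time.Millisecond)
	e.debug = true // log every node's F3 progress at each eventual assertion
	e.waitFor(func(n *kit.TestFullNode) bool {
		_, err := n.F3GetProgress(e.testCtx)
		return err == nil // illustrative condition only
	}, 30*time.Second)
}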

* Shorten GPBFT settings for a more responsive timing

The defaults are based on a 30s epoch and real-world round-trip times.
Shorten Delta and the rebroadcast timings accordingly.
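
For reference, the shortened values as set in newTestManifest (excerpted
from the itests/f3_test.go diff below):

Gpbft: manifest.GpbftConfig{
	// Use smaller time intervals for more responsive test progress/assertion.
	Delta:                      250 * time.Millisecond,
	DeltaBackOffExponent:       1.3,
	MaxLookaheadRounds:         5,
	RebroadcastBackoffBase:     500 * time.Millisecond,
	RebroadcastBackoffSpread:   0.1,
	RebroadcastBackoffExponent: 1.3,
	RebroadcastBackoffMax:      1 * time.Second,
},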

* Remove F3 itest repetitions on CI now that saul goodman

See proof of the pudding:
 * https://github.com/filecoin-project/lotus/actions/runs/11369403828/job/31626763159?pr=12597

* Update the changelog

* Address review comments

* Remove the sanity check that all nodes use the same initial manifest
masih authored Oct 17, 2024
1 parent dafc56c commit fda61d3
Showing 5 changed files with 75 additions and 14 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@
 - Fix a bug in the `lotus-shed indexes backfill-events` command that may result in either duplicate events being backfilled where there are existing events (such an operation *should* be idempotent) or events erroneously having duplicate `logIndex` values when queried via ETH APIs. ([filecoin-project/lotus#12567](https://github.com/filecoin-project/lotus/pull/12567))
 - Event APIs (Eth events and actor events) should only return reverted events if client queries by specific block hash / tipset. Eth and actor event subscription APIs should always return reverted events to enable accurate observation of real-time changes. ([filecoin-project/lotus#12585](https://github.com/filecoin-project/lotus/pull/12585))
 - Add logic to check if the miner's owner address is delegated (f4 address). If it is delegated, the `lotus-shed sectors termination-estimate` command now sends the termination state call using the worker ID. This fix resolves the issue where termination-estimate did not function correctly for miners with delegated owner addresses. ([filecoin-project/lotus#12569](https://github.com/filecoin-project/lotus/pull/12569))
+- Fix a bug in F3 participation API where valid leases may get removed due to dynamic manifest update. ([filecoin-project/lotus#12597](https://github.com/filecoin-project/lotus/pull/12597))
 
 ## Deps
 
2 changes: 1 addition & 1 deletion chain/lf3/f3.go
@@ -155,7 +155,7 @@ func (fff *F3) runSigningLoop(ctx context.Context) {
 			clear(alreadyParticipated)
 		}
 
-		participants := fff.leaser.getParticipantsByInstance(mb.Payload.Instance)
+		participants := fff.leaser.getParticipantsByInstance(mb.NetworkName, mb.Payload.Instance)
 		for _, id := range participants {
 			if _, ok := alreadyParticipated[id]; ok {
 				continue
16 changes: 13 additions & 3 deletions chain/lf3/participation_lease.go
@@ -112,15 +112,25 @@ func (l *leaser) participate(ticket api.F3ParticipationTicket) (api.F3Participat
 	return newLease, nil
 }
 
-func (l *leaser) getParticipantsByInstance(instance uint64) []uint64 {
+func (l *leaser) getParticipantsByInstance(network gpbft.NetworkName, instance uint64) []uint64 {
 	l.mutex.Lock()
 	defer l.mutex.Unlock()
+	currentManifest, _ := l.status()
+	currentNetwork := currentManifest.NetworkName
+	if currentNetwork != network {
+		return nil
+	}
 	var participants []uint64
 	for id, lease := range l.leases {
-		if instance > lease.ToInstance() {
+		if currentNetwork != lease.Network {
+			// Lazily delete any lease that does not belong to network, likely acquired from
+			// prior manifests.
+			delete(l.leases, id)
+			log.Warnf("lost F3 participation lease for miner %d at instance %d due to network mismatch: %s != %s", id, instance, currentNetwork, lease.Network)
+		} else if instance > lease.ToInstance() {
 			// Lazily delete the expired leases.
 			delete(l.leases, id)
-			log.Warnf("lost F3 participation lease for miner %d", id)
+			log.Warnf("lost F3 participation lease for miner %d due to instance (%d) > lease to instance (%d)", id, instance, lease.ToInstance())
 		} else {
 			participants = append(participants, id)
 		}
6 changes: 3 additions & 3 deletions chain/lf3/participation_lease_test.go
@@ -42,18 +42,18 @@ func TestLeaser(t *testing.T) {
 		require.NoError(t, err)
 
 		// Both participants should still be valid.
-		participants := subject.getParticipantsByInstance(11)
+		participants := subject.getParticipantsByInstance(testManifest.NetworkName, 11)
 		require.Len(t, participants, 2)
 		require.Contains(t, participants, uint64(123))
 		require.Contains(t, participants, uint64(456))
 
 		// After instance 16, only participant 456 should be valid.
-		participants = subject.getParticipantsByInstance(16)
+		participants = subject.getParticipantsByInstance(testManifest.NetworkName, 16)
 		require.Len(t, participants, 1)
 		require.Contains(t, participants, uint64(456))
 
 		// After instance 17, no participant must have a lease.
-		participants = subject.getParticipantsByInstance(17)
+		participants = subject.getParticipantsByInstance(testManifest.NetworkName, 17)
 		require.Empty(t, participants)
 	})
 	t.Run("expired ticket", func(t *testing.T) {
64 changes: 57 additions & 7 deletions itests/f3_test.go
@@ -2,6 +2,7 @@ package itests
 
 import (
 	"context"
+	"sync"
 	"testing"
 	"time"
 
@@ -36,6 +37,7 @@ type testEnv struct {
 	m       *manifest.Manifest
 	t       *testing.T
 	testCtx context.Context
+	debug   bool
 }
 
 // Test that checks that F3 is enabled successfully,
@@ -194,6 +196,24 @@ func (e *testEnv) waitFor(f func(n *kit.TestFullNode) bool, timeout time.Duratio
 	e.t.Helper()
 	require.Eventually(e.t, func() bool {
 		e.t.Helper()
+		defer func() {
+			if e.debug {
+				var wg sync.WaitGroup
+				printProgress := func(index int, n *kit.TestFullNode) {
+					defer wg.Done()
+					if progress, err := n.F3GetProgress(e.testCtx); err != nil {
+						e.t.Logf("Node #%d progress: err: %v", index, err)
+					} else {
+						e.t.Logf("Node #%d progress: %v", index, progress)
+					}
+				}
+				for i, n := range e.minerFullNodes {
+					wg.Add(1)
+					go printProgress(i, n)
+				}
+				wg.Wait()
+			}
+		}()
 		for _, n := range e.minerFullNodes {
 			if !f(n) {
 				return false
@@ -209,8 +229,42 @@ func (e *testEnv) waitFor(f func(n *kit.TestFullNode) bool, timeout time.Duratio
 // and the second full-node is an observer that is not directly connected to
 // a miner. The last return value is the manifest sender for the network.
 func setup(t *testing.T, blocktime time.Duration) *testEnv {
-	manif := lf3.NewManifest(BaseNetworkName+"/1", DefaultFinality, DefaultBootstrapEpoch, blocktime, cid.Undef)
-	return setupWithStaticManifest(t, manif, false)
+	return setupWithStaticManifest(t, newTestManifest(blocktime), false)
 }
 
+func newTestManifest(blocktime time.Duration) *manifest.Manifest {
+	return &manifest.Manifest{
+		ProtocolVersion:   manifest.VersionCapability,
+		BootstrapEpoch:    DefaultBootstrapEpoch,
+		NetworkName:       BaseNetworkName + "/1",
+		InitialPowerTable: cid.Undef,
+		CommitteeLookback: manifest.DefaultCommitteeLookback,
+		CatchUpAlignment:  blocktime / 2,
+		Gpbft: manifest.GpbftConfig{
+			// Use smaller time intervals for more responsive test progress/assertion.
+			Delta:                      250 * time.Millisecond,
+			DeltaBackOffExponent:       1.3,
+			MaxLookaheadRounds:         5,
+			RebroadcastBackoffBase:     500 * time.Millisecond,
+			RebroadcastBackoffSpread:   0.1,
+			RebroadcastBackoffExponent: 1.3,
+			RebroadcastBackoffMax:      1 * time.Second,
+		},
+		EC: manifest.EcConfig{
+			Period:                   blocktime,
+			Finality:                 DefaultFinality,
+			DelayMultiplier:          manifest.DefaultEcConfig.DelayMultiplier,
+			BaseDecisionBackoffTable: manifest.DefaultEcConfig.BaseDecisionBackoffTable,
+			HeadLookback:             0,
+			Finalize:                 true,
+		},
+		CertificateExchange: manifest.CxConfig{
+			ClientRequestTimeout: manifest.DefaultCxConfig.ClientRequestTimeout,
+			ServerRequestTimeout: manifest.DefaultCxConfig.ServerRequestTimeout,
+			MinimumPollInterval:  blocktime,
+			MaximumPollInterval:  4 * blocktime,
+		},
+	}
+}
+
 func setupWithStaticManifest(t *testing.T, manif *manifest.Manifest, testBootstrap bool) *testEnv {
@@ -262,10 +316,7 @@ func setupWithStaticManifest(t *testing.T, manif *manifest.Manifest, testBootstr
 		cancel()
 	}
 
-	m, err := n1.F3GetManifest(ctx)
-	require.NoError(t, err)
-
-	e := &testEnv{m: m, t: t, testCtx: ctx}
+	e := &testEnv{m: manif, t: t, testCtx: ctx}
 	// in case we want to use more full-nodes in the future
 	e.minerFullNodes = []*kit.TestFullNode{&n1, &n2, &n3}
 
@@ -275,7 +326,6 @@ func setupWithStaticManifest(t *testing.T, manif *manifest.Manifest, testBootstr
 		err = n.NetConnect(ctx, e.ms.PeerInfo())
 		require.NoError(t, err)
 	}
-
 	errgrp.Go(func() error {
 		defer func() {
 			require.NoError(t, manifestServerHost.Close())
