From 8a9a7c0de74661ac8efe34962e86127b9b15aed5 Mon Sep 17 00:00:00 2001
From: Steven Allen
Date: Thu, 24 Oct 2024 03:41:07 +0000
Subject: [PATCH] fix(f3): try again when we fail to fetch the manifest (#12634)

Continuing with the function here will lead to a panic. This can happen, e.g., if the lotus node stops and/or returns an error while the participant is waiting for its lease to expire.
---
 chain/lf3/participation.go      |  1 +
 chain/lf3/participation_test.go | 75 +++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 chain/lf3/participation_test.go

diff --git a/chain/lf3/participation.go b/chain/lf3/participation.go
index fe13a784b98..7d1b10bd808 100644
--- a/chain/lf3/participation.go
+++ b/chain/lf3/participation.go
@@ -203,6 +203,7 @@ func (p *Participant) awaitLeaseExpiry(ctx context.Context, lease api.F3Particip
 			}
 			log.Errorw("Failed to check F3 progress while awaiting lease expiry. Retrying after backoff.", "attempts", p.backoff.Attempt(), "backoff", p.backoff.Duration(), "err", err)
 			p.backOff(ctx)
+			continue
 		case manifest == nil || manifest.NetworkName != lease.Network:
 			// If we got an unexpected manifest, or no manifest, go back to the
 			// beginning and try to get another ticket.
Switching from having a manifest
diff --git a/chain/lf3/participation_test.go b/chain/lf3/participation_test.go
new file mode 100644
index 00000000000..79564d467eb
--- /dev/null
+++ b/chain/lf3/participation_test.go
@@ -0,0 +1,75 @@
+package lf3_test
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/jpillora/backoff"
+	"github.com/stretchr/testify/require"
+
+	"github.com/filecoin-project/go-address"
+	"github.com/filecoin-project/go-f3/gpbft"
+	"github.com/filecoin-project/go-f3/manifest"
+
+	"github.com/filecoin-project/lotus/api"
+	"github.com/filecoin-project/lotus/chain/lf3"
+	"github.com/filecoin-project/lotus/node/modules/dtypes"
+)
+
+type manifestFailAPI struct { // test double whose F3GetManifest always fails; manifestRequested gets one non-blocking signal per call
+	manifestRequested chan struct{}
+}
+
+func (m *manifestFailAPI) F3GetManifest(ctx context.Context) (*manifest.Manifest, error) { // signals manifestRequested, then always returns (nil, "test error")
+	select {
+	case m.manifestRequested <- struct{}{}:
+	default: // the channel cannot take the signal right now: drop it instead of blocking the caller
+	}
+	return nil, errors.New("test error")
+}
+
+func (m *manifestFailAPI) F3GetOrRenewParticipationTicket(ctx context.Context, minerID address.Address, previous api.F3ParticipationTicket, instances uint64) (api.F3ParticipationTicket, error) { // picks the next ticket from previous alone; minerID and instances are ignored
+	switch string(previous) {
+	case "good ticket": // renewing the first ticket yields "bad ticket"
+		return api.F3ParticipationTicket("bad ticket"), nil
+	case "": // no previous ticket: hand out "good ticket"
+		return api.F3ParticipationTicket("good ticket"), nil
+	default: // any other previous ticket, "bad ticket" included, aborts the test with a panic
+		panic("unexpected ticket")
+	}
+}
+
+func (m *manifestFailAPI) F3GetProgress(ctx context.Context) (gpbft.Instant, error) { // always reports the zero gpbft.Instant and a nil error
+	return gpbft.Instant{}, nil
+}
+
+func (m *manifestFailAPI) F3Participate(ctx context.Context, ticket api.F3ParticipationTicket) (api.F3ParticipationLease, error) { // ignores the ticket; always returns the same fixed lease and a nil error
+	return api.F3ParticipationLease{
+		Network:      "test",
+		Issuer:       "foobar",
+		MinerID:      0,
+		FromInstance: 0,
+		ValidityTerm: 10,
+	}, nil
+}
+
+// Test that we correctly handle failed requests for the manifest and keep trying to get it.
+func TestParticipantManifestFailure(t *testing.T) {
+	api := &manifestFailAPI{manifestRequested: make(chan struct{}, 5)} // NOTE(review): this local shadows the imported api package for the rest of the function
+	addr, err := address.NewIDAddress(1000)
+	require.NoError(t, err)
+
+	p := lf3.NewParticipant(context.Background(), api, dtypes.MinerAddress(addr),
+		&backoff.Backoff{ // retry backoff handed to the participant; presumably 1s initial, x1.5 growth, 1m cap, going by the field names (confirm against jpillora/backoff)
+			Min:    1 * time.Second,
+			Max:    1 * time.Minute,
+			Factor: 1.5,
+		}, 13, 5) // NOTE(review): the meaning of 13 and 5 is defined by lf3.NewParticipant, which is not shown here
+	require.NoError(t, p.Start(context.Background()))
+	<-api.manifestRequested // blocks until F3GetManifest has signalled; three receives mean three manifest requests were observed
+	<-api.manifestRequested // NOTE(review): no timeout on these receives; if the requests stop, the test hangs until the test binary's deadline
+	<-api.manifestRequested
+	require.NoError(t, p.Stop(context.Background()))
+}