From e65b3ca0a6a02c393ceb6250f6d505ac7d71933d Mon Sep 17 00:00:00 2001
From: Arul Ajmani
Date: Wed, 17 Aug 2022 14:02:45 -0400
Subject: [PATCH] multiregionccl: improve and deflake TestMultiRegionDataDriven

This patch switches to printing the entire trace, instead of an opaque
failure, if the trace analysis does not conform to the expected output.

We also deflake this test. The framework allows tests to express an
expected leaseholder and to wait for that change to apply. Given the
test cluster interface, the only claim that can be made is that the
outgoing leaseholder has applied the lease transfer; other replicas may
still be operating under a stale view of the lease. The "served via
follower read" test output line is contingent on the serving replica's
view of the lease. I think this was leading to some (but maybe not all)
of the flakes we've seen sporadically.

To get around this, we now wait for all replicas to have the same view
of the leaseholder when waiting for zone config changes to apply.

Release justification: non-production code change

Release note: None
---
 pkg/ccl/multiregionccl/datadriven_test.go | 51 +++++++++++++++++++++--
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/pkg/ccl/multiregionccl/datadriven_test.go b/pkg/ccl/multiregionccl/datadriven_test.go
index 77f1593e586d..7c9ee648f2ae 100644
--- a/pkg/ccl/multiregionccl/datadriven_test.go
+++ b/pkg/ccl/multiregionccl/datadriven_test.go
@@ -210,6 +210,7 @@ SET CLUSTER SETTING kv.closed_timestamp.propagation_slack = '0.5s'
 
 			case "trace-sql":
 				mustHaveArgOrFatal(t, d, serverIdx)
+				var rec tracingpb.Recording
 				queryFunc := func() (localRead bool, followerRead bool, err error) {
 					var idx int
 					d.ScanArgs(t, serverIdx, &idx)
@@ -232,7 +233,7 @@ SET CLUSTER SETTING kv.closed_timestamp.propagation_slack = '0.5s'
 					if err != nil {
 						return false, false, err
 					}
-					rec := <-recCh
+					rec = <-recCh
 					localRead, followerRead, err = checkReadServedLocallyInSimpleRecording(rec)
 					if err != nil {
 						return false, false, err
@@ -251,6 +252,9 @@ SET CLUSTER SETTING kv.closed_timestamp.propagation_slack = '0.5s'
 					output.WriteString(
 						fmt.Sprintf("served via follower read: %s\n", strconv.FormatBool(followerRead)))
 				}
+				if d.Expected != output.String() {
+					return errors.AssertionFailedf("not a match, trace:\n%s\n", rec).Error()
+				}
 				return output.String()
 
 			case "refresh-range-descriptor-cache":
@@ -333,8 +337,8 @@ SET CLUSTER SETTING kv.closed_timestamp.propagation_slack = '0.5s'
 					}
 				}
 
-				// If the user specified a leaseholder, transfer range lease to the
-				// leaseholder.
+				// If the user specified a leaseholder, transfer the range's lease to
+				// that node.
 				if expectedPlacement.hasLeaseholderInfo() {
 					expectedLeaseIdx := expectedPlacement.getLeaseholder()
 					actualLeaseIdx := actualPlacement.getLeaseholder()
@@ -365,7 +369,17 @@ SET CLUSTER SETTING kv.closed_timestamp.propagation_slack = '0.5s'
 						)
 					}
 				}
-
+				// Now that this range has gone through a bunch of changes, we look up
+				// the range and its leaseholder again to ensure we're comparing the
+				// most up-to-date range state with the supplied expectation.
+				desc, err = ds.tc.LookupRange(lookupKey)
+				if err != nil {
+					return err
+				}
+				actualPlacement, err = parseReplicasFromRange(t, ds.tc, desc)
+				if err != nil {
+					return err
+				}
 				err = actualPlacement.satisfiesExpectedPlacement(expectedPlacement)
 				if err != nil {
 					return err
@@ -574,6 +588,7 @@ func parseReplicasFromInput(
 }
 
 // parseReplicasFromInput constructs a replicaPlacement from a range descriptor.
+// It also ensures all replicas have the same view of who the leaseholder is.
 func parseReplicasFromRange(
 	t *testing.T, tc serverutils.TestClusterInterface, desc roachpb.RangeDescriptor,
 ) (*replicaPlacement, error) {
@@ -585,6 +600,28 @@ func parseReplicasFromRange(
 	if err != nil {
 		return nil, errors.Wrap(err, "could not get leaseholder")
 	}
+	// This test performs lease transfers at various points and expects the
+	// range's leaseholder to conform to what was specified in the test. However,
+	// whenever the lease is transferred using methods on TestCluster, only the
+	// outgoing leaseholder is guaranteed to have applied the lease. Given the
+	// tracing assumptions these tests make, it's worth ensuring all replicas have
+	// applied the lease, as it makes reasoning about these tests much easier.
+	// To that effect, we loop through all replicas on the supplied descriptor and
+	// ensure that is indeed the case.
+	for _, repl := range desc.Replicas().VoterAndNonVoterDescriptors() {
+		t := tc.Target(nodeIdToIdx(t, tc, repl.NodeID))
+		lh, err := tc.FindRangeLeaseHolder(desc, &t)
+		if err != nil {
+			return nil, err
+		}
+		if lh.NodeID != leaseHolder.NodeID && lh.StoreID != leaseHolder.StoreID {
+			return nil, errors.Newf(
+				"all replicas do not have the same view of the lease; found %s and %s",
+				lh, leaseHolder,
+			)
+		}
+	}
+
 	leaseHolderIdx := nodeIdToIdx(t, tc, leaseHolder.NodeID)
 	replicaMap[leaseHolderIdx] = replicaTypeLeaseholder
 	ret.leaseholder = leaseHolderIdx
@@ -656,6 +693,12 @@ func (r *replicaPlacement) satisfiesExpectedPlacement(expected *replicaPlacement
 		}
 	}
 
+	if expected.hasLeaseholderInfo() && expected.getLeaseholder() != r.getLeaseholder() {
+		return errors.Newf(
+			"expected %s to be the leaseholder, but %s was instead",
+			expected.getLeaseholder(), r.getLeaseholder(),
+		)
+	}
 	return nil
 }
 
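
For reference, the lease-agreement requirement described in the commit message
can also be packaged as a standalone retry helper. The sketch below is
illustrative rather than part of the patch: the helper name
waitForLeaseAgreement and the package clause are hypothetical, while the
serverutils/testutils calls it uses (FindRangeLeaseHolder, Target, NumServers,
SucceedsSoon) are the same APIs the test already relies on.

package multiregionccl_test

import (
	"testing"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/errors"
)

// waitForLeaseAgreement retries until every voting and non-voting replica of
// the supplied range reports the same leaseholder. It mirrors the check the
// patch adds to parseReplicasFromRange, but wraps it in a retry loop instead
// of surfacing the first mismatch as an error. (Illustrative helper; not part
// of the patch.)
func waitForLeaseAgreement(
	t *testing.T, tc serverutils.TestClusterInterface, desc roachpb.RangeDescriptor,
) {
	t.Helper()
	testutils.SucceedsSoon(t, func() error {
		// Establish a baseline view of the lease; a nil hint lets the test
		// cluster pick a server that holds a replica of the range.
		baseline, err := tc.FindRangeLeaseHolder(desc, nil)
		if err != nil {
			return err
		}
		for _, repl := range desc.Replicas().VoterAndNonVoterDescriptors() {
			// Map the replica's NodeID back to the test cluster's server index
			// so the lease lookup can be directed at that specific server.
			for i := 0; i < tc.NumServers(); i++ {
				if tc.Server(i).NodeID() != repl.NodeID {
					continue
				}
				hint := tc.Target(i)
				lh, err := tc.FindRangeLeaseHolder(desc, &hint)
				if err != nil {
					return err
				}
				if lh.NodeID != baseline.NodeID || lh.StoreID != baseline.StoreID {
					return errors.Newf("divergent lease views: %s vs %s", lh, baseline)
				}
			}
		}
		return nil
	})
}

Unlike parseReplicasFromRange, which returns an error and leans on its
caller's retry loop, this variant retries internally until all replicas'
views of the lease converge.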