Merge #86339
86339: multiregionccl: improve and deflake TestMultiRegionDataDriven r=ajwerner a=arulajmani

fixes #77908
fixes #80837

This patch switches to printing the entire trace instead of an opaque
failure if the trace analysis does not conform to the expected output.

We also deflake this test. The framework allows tests to express an
expected leaseholder and wait for that change to apply. Given the
test cluster interface, the only claim that can be made here is that
the outgoing leaseholder has applied the lease transfer. Other replicas
may still be operating under a stale view of the lease. The
"served via follower read" test output line is contingent on the serving
replica's view of the lease. I think this was leading to some (but maybe
not all) flakes we've seen sporadically. To get around this, we now wait
for all replicas to have the same view of the leaseholder when waiting
for zone config changes to apply.
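
Purely as an illustration of that waiting pattern (this sketch is not part
of the patch, and the waitForLeaseAgreement / leaseViewOf names below are
hypothetical stand-ins rather than CockroachDB APIs), the idea is to poll
every replica's view of the lease and only declare success once they all
agree:

```go
package main

import (
	"fmt"
	"time"
)

// waitForLeaseAgreement polls until every replica reports the same
// leaseholder store, mirroring the idea behind this change: a lease
// transfer only counts as "applied" for the test once no replica still
// holds a stale view of the lease.
func waitForLeaseAgreement(
	replicas []int,
	leaseViewOf func(replica int) (store int, err error), // hypothetical per-replica lookup
	timeout time.Duration,
) (int, error) {
	deadline := time.Now().Add(timeout)
	for {
		store, err := agreedLeaseholder(replicas, leaseViewOf)
		if err == nil {
			return store, nil
		}
		if time.Now().After(deadline) {
			return 0, fmt.Errorf("replicas did not converge on a leaseholder: %w", err)
		}
		time.Sleep(50 * time.Millisecond)
	}
}

// agreedLeaseholder returns the leaseholder store if all replicas agree,
// or an error describing the disagreement.
func agreedLeaseholder(replicas []int, leaseViewOf func(int) (int, error)) (int, error) {
	first := -1
	for _, r := range replicas {
		store, err := leaseViewOf(r)
		if err != nil {
			return 0, err
		}
		if first == -1 {
			first = store
		} else if store != first {
			return 0, fmt.Errorf("replica %d sees leaseholder s%d, another replica sees s%d", r, store, first)
		}
	}
	return first, nil
}

func main() {
	// Toy usage: three replicas that already agree the lease is on store 2.
	views := map[int]int{1: 2, 2: 2, 3: 2}
	lh, err := waitForLeaseAgreement(
		[]int{1, 2, 3},
		func(r int) (int, error) { return views[r], nil },
		time.Second,
	)
	fmt.Println(lh, err) // 2 <nil>
}
```

In the actual test the per-replica lookup goes through the test cluster's
FindRangeLeaseHolder (see the diff below), and the retrying is presumably
handled by the surrounding test harness rather than an explicit loop like
the one above.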

Release justification: non production code change
Release note: None

Co-authored-by: Arul Ajmani <[email protected]>
craig[bot] and arulajmani committed Aug 18, 2022
2 parents 1be194d + e65b3ca commit 1e751ae
Showing 1 changed file with 47 additions and 4 deletions.
51 changes: 47 additions & 4 deletions pkg/ccl/multiregionccl/datadriven_test.go
@@ -210,6 +210,7 @@ SET CLUSTER SETTING kv.closed_timestamp.propagation_slack = '0.5s'

case "trace-sql":
mustHaveArgOrFatal(t, d, serverIdx)
var rec tracingpb.Recording
queryFunc := func() (localRead bool, followerRead bool, err error) {
var idx int
d.ScanArgs(t, serverIdx, &idx)
@@ -232,7 +233,7 @@ SET CLUSTER SETTING kv.closed_timestamp.propagation_slack = '0.5s'
if err != nil {
return false, false, err
}
rec := <-recCh
rec = <-recCh
localRead, followerRead, err = checkReadServedLocallyInSimpleRecording(rec)
if err != nil {
return false, false, err
@@ -251,6 +252,9 @@ SET CLUSTER SETTING kv.closed_timestamp.propagation_slack = '0.5s'
output.WriteString(
fmt.Sprintf("served via follower read: %s\n", strconv.FormatBool(followerRead)))
}
if d.Expected != output.String() {
return errors.AssertionFailedf("not a match, trace:\n%s\n", rec).Error()
}
return output.String()

case "refresh-range-descriptor-cache":
@@ -333,8 +337,8 @@ SET CLUSTER SETTING kv.closed_timestamp.propagation_slack = '0.5s'
}
}

// If the user specified a leaseholder, transfer range lease to the
// leaseholder.
// If the user specified a leaseholder, transfer the range's lease to
// that node.
if expectedPlacement.hasLeaseholderInfo() {
expectedLeaseIdx := expectedPlacement.getLeaseholder()
actualLeaseIdx := actualPlacement.getLeaseholder()
Expand Down Expand Up @@ -365,7 +369,17 @@ SET CLUSTER SETTING kv.closed_timestamp.propagation_slack = '0.5s'
)
}
}

// Now that this range has gone through a bunch of changes, we look up
// the range and its leaseholder again to ensure we're comparing the
// most up-to-date range state with the supplied expectation.
desc, err = ds.tc.LookupRange(lookupKey)
if err != nil {
return err
}
actualPlacement, err = parseReplicasFromRange(t, ds.tc, desc)
if err != nil {
return err
}
err = actualPlacement.satisfiesExpectedPlacement(expectedPlacement)
if err != nil {
return err
Expand Down Expand Up @@ -574,6 +588,7 @@ func parseReplicasFromInput(
}

// parseReplicasFromRange constructs a replicaPlacement from a range descriptor.
// It also ensures all replicas have the same view of who the leaseholder is.
func parseReplicasFromRange(
t *testing.T, tc serverutils.TestClusterInterface, desc roachpb.RangeDescriptor,
) (*replicaPlacement, error) {
@@ -585,6 +600,28 @@
if err != nil {
return nil, errors.Wrap(err, "could not get leaseholder")
}
// This test performs lease transfers at various points and expects the
// range's leaseholder to conform to what was specified in the test. However,
// whenever the lease is transferred using methods on TestCluster, only the
// outgoing leaseholder is guaranteed to have applied the lease. Given the
// tracing assumptions these tests make, it's worth ensuring all replicas have
// applied the lease, as it makes reasoning about these tests much easier.
// To that end, we loop through all replicas on the supplied descriptor and
// ensure that is indeed the case.
for _, repl := range desc.Replicas().VoterAndNonVoterDescriptors() {
t := tc.Target(nodeIdToIdx(t, tc, repl.NodeID))
lh, err := tc.FindRangeLeaseHolder(desc, &t)
if err != nil {
return nil, err
}
if lh.NodeID != leaseHolder.NodeID || lh.StoreID != leaseHolder.StoreID {
return nil, errors.Newf(
"not all replicas have the same view of the lease; found %s and %s",
lh, leaseHolder,
)
}
}

leaseHolderIdx := nodeIdToIdx(t, tc, leaseHolder.NodeID)
replicaMap[leaseHolderIdx] = replicaTypeLeaseholder
ret.leaseholder = leaseHolderIdx
Expand Down Expand Up @@ -656,6 +693,12 @@ func (r *replicaPlacement) satisfiesExpectedPlacement(expected *replicaPlacement
}
}

if expected.hasLeaseholderInfo() && expected.getLeaseholder() != r.getLeaseholder() {
return errors.Newf(
"expected %s to be the leaseholder, but %s was instead",
expected.getLeaseholder(), r.getLeaseholder(),
)
}
return nil
}
