storage: deflake TestNodeLivenessStatusMap
Prior to this patch, this test would fail under `stressrace` after a
few dozen iterations. The root cause was an invalid call to
`t.Parallel()`, which this patch removes.
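
As an aside, here is a minimal, hypothetical sketch (not code from this
repository) of the usual problem with `t.Parallel()` in table-driven
subtests: a parallel subtest is paused until the parent test function
returns, so any cleanup the parent runs via `defer` (for example,
stopping a shared test cluster) has already executed by the time the
subtest body starts.

package example

import "testing"

// Hypothetical illustration: parallel subtests do not run until the
// parent test function has returned, so the parent's deferred cleanup
// has already executed when they start.
func TestParallelSubtestOrdering(t *testing.T) {
	defer t.Log("parent: deferred cleanup runs BEFORE the parallel subtests")

	for _, name := range []string{"live", "dead"} {
		name := name // capture the loop variable for the closure
		t.Run(name, func(t *testing.T) {
			t.Parallel() // pause until the parent's body (and its defers) finish
			t.Logf("subtest %q: runs only after the parent's deferred cleanup", name)
		})
	}
}

Running this with `go test -v` logs the parent's cleanup line before
either subtest line, which is why running subtests in parallel against
state that the parent tears down via `defer` is unsafe.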

Additionally, this patch adapts TimeUntilStoreDead for each test case
to avoid flakes, and removes a previous hack obviated by this
simplification.
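
To make that concrete, here is a rough, self-contained sketch of the
pattern, using stand-in names (`timeUntilStoreDead`,
`testTimeUntilStoreDead`, `succeedsSoon`) for the real
`storage.TimeUntilStoreDead`, `storage.TestTimeUntilStoreDead` and
`testutils.SucceedsSoon`: only cases that expect a dead node get the
short timeout, and the override is re-applied on every retry attempt
because, in the real test, gossip can overwrite the cluster setting
between attempts.

package example

import (
	"errors"
	"fmt"
	"time"
)

// Stand-ins for storage.TimeUntilStoreDead and storage.TestTimeUntilStoreDead:
// the real ones are a cluster setting and a test-only constant.
var timeUntilStoreDead = 5 * time.Minute

const testTimeUntilStoreDead = 5 * time.Millisecond

// succeedsSoon is a stand-in for testutils.SucceedsSoon: retry fn until it
// succeeds or a deadline expires.
func succeedsSoon(fn func() error) error {
	deadline := time.Now().Add(2 * time.Second)
	for {
		err := fn()
		if err == nil || time.Now().After(deadline) {
			return err
		}
		time.Sleep(10 * time.Millisecond)
	}
}

func main() {
	cases := []struct {
		name    string
		running bool // true if the node is expected to stay live
	}{{"live", true}, {"dead", false}}

	for _, tc := range cases {
		attempts := 0
		err := succeedsSoon(func() error {
			// Only cases expecting a dead node get the short timeout, so a
			// heavily stressed live node is not declared dead prematurely.
			dur := 5 * time.Minute
			if !tc.running {
				dur = testTimeUntilStoreDead
			}
			// Re-apply on every attempt: a concurrent writer (gossip, in the
			// real test) may have overwritten the value since the last try.
			timeUntilStoreDead = dur

			attempts++
			if attempts < 3 {
				return errors.New("status not converged yet")
			}
			return nil
		})
		fmt.Printf("%s: TimeUntilStoreDead=%s, attempts=%d, err=%v\n",
			tc.name, timeUntilStoreDead, attempts, err)
	}
}

The per-case `running` flag and the re-apply-on-every-attempt comment
both appear in the diff below; this sketch only isolates that logic.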

Release note: None

Co-authored-by: Tobias Schottdorf <[email protected]>
knz and tbg committed Apr 24, 2019
1 parent d50ebea commit e01b5bb
Showing 1 changed file with 48 additions and 34 deletions: pkg/storage/node_liveness_test.go
@@ -855,48 +855,62 @@ func TestNodeLivenessStatusMap(t *testing.T) {
 	// See what comes up in the status.
 	callerNodeLiveness := firstServer.GetNodeLiveness()
 
-	type expectedStatus struct {
+	type testCase struct {
 		nodeID         roachpb.NodeID
 		expectedStatus storagepb.NodeLivenessStatus
-	}
-	testData := []expectedStatus{
-		{liveNodeID, storagepb.NodeLivenessStatus_LIVE},
-		{deadNodeID, storagepb.NodeLivenessStatus_DEAD},
-		{decommissioningNodeID, storagepb.NodeLivenessStatus_DECOMMISSIONING},
-		{removedNodeID, storagepb.NodeLivenessStatus_DECOMMISSIONED},
+
+		// This is a bit of a hack: we want to run with a low TimeUntilStoreDead
+		// if we know that the node is dead to speed up the test. However, doing
+		// so for all tests gives us false test failures in the opposite case in
+		// which the node remains live because when stressing the test
+		// sufficiently hard nodes can fail to become live over extended periods
+		// of time. So we run with a short duration only if running.
+		//
+		// NB: the test still takes >5s because it has to wait for liveness
+		// record expiration (~5s) before it can possibly declare a node as
+		// dead. We could try to lower the liveness duration but this isn't
+		// trivial and might lead to new test flakes, though.
+		running bool
+	}
+
+	// Below we're going to check that all statuses converge and stabilize
+	// to a known situation.
+	testData := []testCase{
+		{liveNodeID, storagepb.NodeLivenessStatus_LIVE, true},
+		{deadNodeID, storagepb.NodeLivenessStatus_DEAD, false},
+		{decommissioningNodeID, storagepb.NodeLivenessStatus_DECOMMISSIONING, true},
+		{removedNodeID, storagepb.NodeLivenessStatus_DECOMMISSIONED, false},
 	}
 
 	for _, test := range testData {
-		t.Run(test.expectedStatus.String(), func(t *testing.T) {
-			nodeID, expectedStatus := test.nodeID, test.expectedStatus
-			t.Parallel()
-
+		t.Run(fmt.Sprintf("n%d->%s", test.nodeID, test.expectedStatus), func(t *testing.T) {
 			testutils.SucceedsSoon(t, func() error {
-				// Ensure that dead nodes are quickly recognized as dead by
-				// gossip. Overriding cluster settings is generally a really bad
-				// idea as they are also populated via Gossip and so our update
-				// is possibly going to be wiped out. But going through SQL
-				// doesn't allow durations below 1m15s, which is much too long
-				// for a test.
-				// We do this in every SucceedsSoon attempt, so we'll be good.
-				storage.TimeUntilStoreDead.Override(&firstServer.ClusterSettings().SV,
-					storage.TestTimeUntilStoreDead)
-
-				log.Infof(ctx, "checking expected status for node %d", nodeID)
+				dur := 5 * time.Minute
+				if !test.running {
+					// Ensure that dead nodes are quickly recognized as dead by
+					// gossip. Overriding cluster settings is generally a really bad
+					// idea as they are also populated via Gossip and so our update
+					// is possibly going to be wiped out. But going through SQL
+					// doesn't allow durations below 1m15s, which is much too long
+					// for a test.
+					// We do this in every SucceedsSoon attempt, so we'll be good.
+					dur = storage.TestTimeUntilStoreDead
+				}
+				storage.TimeUntilStoreDead.Override(&firstServer.ClusterSettings().SV, dur)
+
+				nodeID, expectedStatus := test.nodeID, test.expectedStatus
+
+				log.Infof(ctx, "checking expected status (%s) for node %d", expectedStatus, nodeID)
 				nodeStatuses := callerNodeLiveness.GetLivenessStatusMap()
-				if st, ok := nodeStatuses[nodeID]; !ok {
-					return fmt.Errorf("%s node not in statuses", expectedStatus)
-				} else {
-					if st != expectedStatus {
-						if expectedStatus == storagepb.NodeLivenessStatus_DECOMMISSIONING && st == storagepb.NodeLivenessStatus_DECOMMISSIONED {
-							// Server somehow shut down super-fast. Tolerating the mismatch.
-							return nil
-						}
-						return fmt.Errorf("unexpected status: got %s, expected %s",
-							st, expectedStatus)
-					}
-				}
+				st, ok := nodeStatuses[nodeID]
+				if !ok {
+					return errors.Errorf("node %d: not in statuses\n", nodeID)
+				}
+				if st != expectedStatus {
+					return errors.Errorf("node %d: unexpected status: got %s, expected %s\n",
+						nodeID, st, expectedStatus,
+					)
+				}
+				log.Infof(ctx, "node %d status ok", nodeID)
 				return nil
 			})
 		})