Skip to content

Commit

Permalink
storage: fatal on replica corruption
Browse files Browse the repository at this point in the history
This path is seldom taken, but when it does and the node keeps running,
it won't be performing as expected. Besides, individual replicas can
become "corrupt" due to problems that affect a larger scope, as seen
in cockroachdb#33375 (which is caused by a txn anomaly, cockroachdb#32784).

Release note: None
  • Loading branch information
tbg committed Jan 3, 2019
1 parent f3cf71d commit 7404038
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 2 deletions.
10 changes: 10 additions & 0 deletions pkg/storage/client_raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1375,6 +1375,12 @@ func TestStoreRangeUpReplicate(t *testing.T) {
func TestStoreRangeCorruptionChangeReplicas(t *testing.T) {
defer leaktest.AfterTest(t)()

var exitStatus int
log.SetExitFunc(true /* hideStack */, func(i int) {
exitStatus = i
})
defer log.ResetExitFunc()

const numReplicas = 5
const extraStores = 3

Expand Down Expand Up @@ -1489,6 +1495,10 @@ func TestStoreRangeCorruptionChangeReplicas(t *testing.T) {
return nil
})
}

if exitStatus != 255 {
t.Fatalf("unexpected exit status %d", exitStatus)
}
}

// getRangeMetadata retrieves the current range descriptor for the target
Expand Down
2 changes: 2 additions & 0 deletions pkg/storage/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -6719,6 +6719,8 @@ func (r *Replica) maybeSetCorrupt(ctx context.Context, pErr *roachpb.Error) *roa
cErr.Processed = false
return roachpb.NewError(cErr)
}

log.Fatalf(ctx, "replica is corrupted: %s", pErr)
}
return pErr
}
Expand Down
36 changes: 34 additions & 2 deletions pkg/storage/replica_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3663,6 +3663,13 @@ func TestReplicaNoTimestampIncrementWithinTxn(t *testing.T) {
// not decodable.
func TestReplicaAbortSpanReadError(t *testing.T) {
defer leaktest.AfterTest(t)()

var exitStatus int
log.SetExitFunc(true /* hideStack */, func(i int) {
exitStatus = i
})
defer log.ResetExitFunc()

tc := testContext{}
stopper := stop.NewStopper()
defer stopper.Stop(context.TODO())
Expand Down Expand Up @@ -3693,6 +3700,9 @@ func TestReplicaAbortSpanReadError(t *testing.T) {
if !testutils.IsPError(pErr, "replica corruption") {
t.Fatal(pErr)
}
if exitStatus != 255 {
t.Fatalf("did not fatal (exit status %d)", exitStatus)
}
}

// TestReplicaAbortSpanStoredTxnRetryError verifies that if a cached
Expand Down Expand Up @@ -4122,6 +4132,13 @@ func Test1PCTransactionWriteTimestamp(t *testing.T) {
// EndTransaction call with a malformed commit trigger fails.
func TestEndTransactionWithMalformedSplitTrigger(t *testing.T) {
defer leaktest.AfterTest(t)()

var exitStatus int
log.SetExitFunc(true /* hideStack */, func(i int) {
exitStatus = i
})
defer log.ResetExitFunc()

tc := testContext{}
stopper := stop.NewStopper()
defer stopper.Stop(context.TODO())
Expand Down Expand Up @@ -4156,8 +4173,13 @@ func TestEndTransactionWithMalformedSplitTrigger(t *testing.T) {
}

assignSeqNumsForReqs(txn, &args)
if _, pErr := tc.SendWrappedWith(h, &args); !testutils.IsPError(pErr, "range does not match splits") {
t.Errorf("expected range does not match splits error; got %s", pErr)
expErr := regexp.QuoteMeta("replica corruption (processed=true): range does not match splits")
if _, pErr := tc.SendWrappedWith(h, &args); !testutils.IsPError(pErr, expErr) {
t.Errorf("unexpected error: %s", pErr)
}

if exitStatus != 255 {
t.Fatalf("unexpected exit status %d", exitStatus)
}
}

Expand Down Expand Up @@ -6518,6 +6540,12 @@ func TestAppliedIndex(t *testing.T) {
func TestReplicaCorruption(t *testing.T) {
defer leaktest.AfterTest(t)()

var exitStatus int
log.SetExitFunc(true /* hideStack */, func(i int) {
exitStatus = i
})
defer log.ResetExitFunc()

tsc := TestStoreConfig(nil)
tsc.TestingKnobs.EvalKnobs.TestingEvalFilter =
func(filterArgs storagebase.FilterArgs) *roachpb.Error {
Expand Down Expand Up @@ -6568,6 +6596,10 @@ func TestReplicaCorruption(t *testing.T) {
t.Fatalf("expected r.mu.destroyed == pErr.GetDetail(), instead %q != %q", r.mu.destroyStatus.err, pErr.GetDetail())
}

if exitStatus != 255 {
t.Fatalf("unexpected exit status %d", exitStatus)
}

// TODO(bdarnell): when maybeSetCorrupt is finished verify that future commands fail too.
}

Expand Down

0 comments on commit 7404038

Please sign in to comment.