storage: fatal on replica corruption

This path is seldom taken, but when it does and the node keeps running, it won't be performing as expected. Besides, individual replicas can become "corrupt" due to problems that affect a larger scope, as seen in cockroachdb#33375 (which is caused by a txn anomaly, cockroachdb#32784). Release note: None
tbg · Jan 3, 2019 · 7404038 · 7404038
1 parent f3cf71d
commit 7404038
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 2 deletions.
diff --git a/pkg/storage/client_raft_test.go b/pkg/storage/client_raft_test.go
@@ -1375,6 +1375,12 @@ func TestStoreRangeUpReplicate(t *testing.T) {
 func TestStoreRangeCorruptionChangeReplicas(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 
+	var exitStatus int
+	log.SetExitFunc(true /* hideStack */, func(i int) {
+		exitStatus = i
+	})
+	defer log.ResetExitFunc()
+
 	const numReplicas = 5
 	const extraStores = 3
 
@@ -1489,6 +1495,10 @@ func TestStoreRangeCorruptionChangeReplicas(t *testing.T) {
 			return nil
 		})
 	}
+
+	if exitStatus != 255 {
+		t.Fatalf("unexpected exit status %d", exitStatus)
+	}
 }
 
 // getRangeMetadata retrieves the current range descriptor for the target

diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go
@@ -6719,6 +6719,8 @@ func (r *Replica) maybeSetCorrupt(ctx context.Context, pErr *roachpb.Error) *roa
 			cErr.Processed = false
 			return roachpb.NewError(cErr)
 		}
+
+		log.Fatalf(ctx, "replica is corrupted: %s", pErr)
 	}
 	return pErr
 }

diff --git a/pkg/storage/replica_test.go b/pkg/storage/replica_test.go
@@ -3663,6 +3663,13 @@ func TestReplicaNoTimestampIncrementWithinTxn(t *testing.T) {
 // not decodable.
 func TestReplicaAbortSpanReadError(t *testing.T) {
 	defer leaktest.AfterTest(t)()
+
+	var exitStatus int
+	log.SetExitFunc(true /* hideStack */, func(i int) {
+		exitStatus = i
+	})
+	defer log.ResetExitFunc()
+
 	tc := testContext{}
 	stopper := stop.NewStopper()
 	defer stopper.Stop(context.TODO())
@@ -3693,6 +3700,9 @@ func TestReplicaAbortSpanReadError(t *testing.T) {
 	if !testutils.IsPError(pErr, "replica corruption") {
 		t.Fatal(pErr)
 	}
+	if exitStatus != 255 {
+		t.Fatalf("did not fatal (exit status %d)", exitStatus)
+	}
 }
 
 // TestReplicaAbortSpanStoredTxnRetryError verifies that if a cached
@@ -4122,6 +4132,13 @@ func Test1PCTransactionWriteTimestamp(t *testing.T) {
 // EndTransaction call with a malformed commit trigger fails.
 func TestEndTransactionWithMalformedSplitTrigger(t *testing.T) {
 	defer leaktest.AfterTest(t)()
+
+	var exitStatus int
+	log.SetExitFunc(true /* hideStack */, func(i int) {
+		exitStatus = i
+	})
+	defer log.ResetExitFunc()
+
 	tc := testContext{}
 	stopper := stop.NewStopper()
 	defer stopper.Stop(context.TODO())
@@ -4156,8 +4173,13 @@ func TestEndTransactionWithMalformedSplitTrigger(t *testing.T) {
 	}
 
 	assignSeqNumsForReqs(txn, &args)
-	if _, pErr := tc.SendWrappedWith(h, &args); !testutils.IsPError(pErr, "range does not match splits") {
-		t.Errorf("expected range does not match splits error; got %s", pErr)
+	expErr := regexp.QuoteMeta("replica corruption (processed=true): range does not match splits")
+	if _, pErr := tc.SendWrappedWith(h, &args); !testutils.IsPError(pErr, expErr) {
+		t.Errorf("unexpected error: %s", pErr)
+	}
+
+	if exitStatus != 255 {
+		t.Fatalf("unexpected exit status %d", exitStatus)
 	}
 }
 
@@ -6518,6 +6540,12 @@ func TestAppliedIndex(t *testing.T) {
 func TestReplicaCorruption(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 
+	var exitStatus int
+	log.SetExitFunc(true /* hideStack */, func(i int) {
+		exitStatus = i
+	})
+	defer log.ResetExitFunc()
+
 	tsc := TestStoreConfig(nil)
 	tsc.TestingKnobs.EvalKnobs.TestingEvalFilter =
 		func(filterArgs storagebase.FilterArgs) *roachpb.Error {
@@ -6568,6 +6596,10 @@ func TestReplicaCorruption(t *testing.T) {
 		t.Fatalf("expected r.mu.destroyed == pErr.GetDetail(), instead %q != %q", r.mu.destroyStatus.err, pErr.GetDetail())
 	}
 
+	if exitStatus != 255 {
+		t.Fatalf("unexpected exit status %d", exitStatus)
+	}
+
 	// TODO(bdarnell): when maybeSetCorrupt is finished verify that future commands fail too.
 }