From 3a68186ba76d394c3b60ea2cfeaffa371478a1a0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 29 Jun 2023 14:07:05 +0000 Subject: [PATCH] kvserver: propagate `ReplicaUnavailableError` from intent resolution If intent resolution attempted to access txn info on a different range, and hit a poisoned latch because that range had lost quorum, the returned error would incorrectly claim that the range with the intent was unavailable rather than the range that had lost quorum. This patch correctly propagates the `ReplicaUnavailableError` from the other range. Epic: none Release note: None --- .../client_replica_circuit_breaker_test.go | 50 +++++++++++++++++++ pkg/kv/kvserver/replica_send.go | 32 +++++++----- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/pkg/kv/kvserver/client_replica_circuit_breaker_test.go b/pkg/kv/kvserver/client_replica_circuit_breaker_test.go index 2e4894419eab..44e6667e419a 100644 --- a/pkg/kv/kvserver/client_replica_circuit_breaker_test.go +++ b/pkg/kv/kvserver/client_replica_circuit_breaker_test.go @@ -395,6 +395,56 @@ func TestReplicaCircuitBreaker_Liveness_QuorumLoss(t *testing.T) { require.NoError(t, tc.Write(n1)) } +// In this test, a txn anchored on the range losing quorum also has an intent on +// a healthy range. Quorum is lost before committing, poisoning latches for the +// txn info. When resolving the intent on the healthy range, it will hit a +// poisoned latch. This should result in a ReplicaUnavailableError from the +// original range that lost quorum, not from the range with the intent. +func TestReplicaCircuitBreaker_ResolveIntent_QuorumLoss(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + ctx := context.Background() + tc := setupCircuitBreakerTest(t) + defer tc.Stopper().Stop(ctx) + + // Get lease on n1. + require.NoError(t, tc.Write(n1)) + + // Split off a healthy range, which will inherit a lease on n1. Remove the + // replica on n2, so that it remains healthy when we take down n2. + failKey := tc.ScratchRange(t) + okKey := failKey.Next() + failDesc, _ := tc.SplitRangeOrFatal(t, okKey) + okDesc := tc.RemoveVotersOrFatal(t, okKey, tc.Target(n2)) + t.Logf("failDesc=%s", failDesc) + t.Logf("okDesc=%s", okDesc) + + // Start a transaction, anchoring it on the faulty range. + db := tc.Server(n1).DB() + txn := db.NewTxn(ctx, "test") + require.NoError(t, txn.Put(ctx, failKey, "fail")) + require.NoError(t, txn.Put(ctx, okKey, "ok")) + + // Lose quorum. + tc.StopServer(n2) + tc.HeartbeatNodeLiveness(t, n1) + + // Attempt to commit. It should fail, but will poison latches on + // the faulty range. + tc.SetSlowThreshold(time.Second) + err := txn.Commit(ctx) + tc.RequireIsBreakerOpen(t, err) + + // Read the key from the healthy range. It should fail due to a poisoned latch + // on the txn's anchored range. This error should appear to come from the + // failed range, not from the healthy range. + _, err = db.Get(ctx, okKey) + tc.RequireIsBreakerOpen(t, err) + ruErr := &kvpb.ReplicaUnavailableError{} + require.True(t, errors.As(err, &ruErr)) + require.Equal(t, failDesc.RangeID, ruErr.Desc.RangeID) +} + type dummyStream struct { name string ctx context.Context diff --git a/pkg/kv/kvserver/replica_send.go b/pkg/kv/kvserver/replica_send.go index fdc15eddba5c..c3ec7959b266 100644 --- a/pkg/kv/kvserver/replica_send.go +++ b/pkg/kv/kvserver/replica_send.go @@ -477,19 +477,25 @@ func (r *Replica) executeBatchWithConcurrencyRetries( }, requestEvalKind) if pErr != nil { if poisonErr := (*poison.PoisonedError)(nil); errors.As(pErr.GoError(), &poisonErr) { - // NB: we make the breaker error (which may be nil at this point, but - // usually is not) a secondary error, meaning it is not in the error - // chain. That is fine; the important bits to investigate - // programmatically are the ReplicaUnavailableError (which contains the - // descriptor) and the *PoisonedError (which contains the concrete - // subspan that caused this request to fail). We mark - // circuit.ErrBreakerOpen into the chain as well so that we have the - // invariant that all replica circuit breaker errors contain both - // ErrBreakerOpen and ReplicaUnavailableError. - pErr = kvpb.NewError(r.replicaUnavailableError(errors.CombineErrors( - errors.Mark(poisonErr, circuit.ErrBreakerOpen), - r.breaker.Signal().Err(), - ))) + // It's possible that intent resolution accessed txn info anchored on a + // different range and hit a poisoned latch there, in which case we want + // to propagate its ReplicaUnavailableError instead of creating one for + // this range (which likely isn't tripped). + if !errors.HasType(pErr.GoError(), (*kvpb.ReplicaUnavailableError)(nil)) { + // NB: we make the breaker error (which may be nil at this point, but + // usually is not) a secondary error, meaning it is not in the error + // chain. That is fine; the important bits to investigate + // programmatically are the ReplicaUnavailableError (which contains the + // descriptor) and the *PoisonedError (which contains the concrete + // subspan that caused this request to fail). We mark + // circuit.ErrBreakerOpen into the chain as well so that we have the + // invariant that all replica circuit breaker errors contain both + // ErrBreakerOpen and ReplicaUnavailableError. + pErr = kvpb.NewError(r.replicaUnavailableError(errors.CombineErrors( + errors.Mark(poisonErr, circuit.ErrBreakerOpen), + r.breaker.Signal().Err(), + ))) + } } return nil, nil, pErr } else if resp != nil {