Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kvserver: propagate ReplicaUnavailableError from intent resolution #105816

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions pkg/kv/kvserver/client_replica_circuit_breaker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,56 @@ func TestReplicaCircuitBreaker_Liveness_QuorumLoss(t *testing.T) {
require.NoError(t, tc.Write(n1))
}

// In this test, a txn anchored on the range losing quorum also has an intent on
// a healthy range. Quorum is lost before committing, poisoning latches for the
// txn info. When resolving the intent on the healthy range, it will hit a
// poisoned latch. This should result in a ReplicaUnavailableError from the
// original range that lost quorum, not from the range with the intent.
func TestReplicaCircuitBreaker_ResolveIntent_QuorumLoss(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()
tc := setupCircuitBreakerTest(t)
defer tc.Stopper().Stop(ctx)

// Get lease on n1.
require.NoError(t, tc.Write(n1))

// Split off a healthy range, which will inherit a lease on n1. Remove the
// replica on n2, so that it remains healthy when we take down n2.
failKey := tc.ScratchRange(t)
okKey := failKey.Next()
failDesc, _ := tc.SplitRangeOrFatal(t, okKey)
okDesc := tc.RemoveVotersOrFatal(t, okKey, tc.Target(n2))
t.Logf("failDesc=%s", failDesc)
t.Logf("okDesc=%s", okDesc)

// Start a transaction, anchoring it on the faulty range.
db := tc.Server(n1).DB()
txn := db.NewTxn(ctx, "test")
require.NoError(t, txn.Put(ctx, failKey, "fail"))
require.NoError(t, txn.Put(ctx, okKey, "ok"))

// Lose quorum.
tc.StopServer(n2)
tc.HeartbeatNodeLiveness(t, n1)

// Attempt to commit. It should fail, but will poison latches on
// the faulty range.
tc.SetSlowThreshold(time.Second)
err := txn.Commit(ctx)
tc.RequireIsBreakerOpen(t, err)

// Read the key from the healthy range. It should fail due to a poisoned latch
// on the txn's anchored range. This error should appear to come from the
// failed range, not from the healthy range.
_, err = db.Get(ctx, okKey)
tc.RequireIsBreakerOpen(t, err)
ruErr := &kvpb.ReplicaUnavailableError{}
require.True(t, errors.As(err, &ruErr))
require.Equal(t, failDesc.RangeID, ruErr.Desc.RangeID)
}

type dummyStream struct {
name string
ctx context.Context
Expand Down
32 changes: 19 additions & 13 deletions pkg/kv/kvserver/replica_send.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,19 +477,25 @@ func (r *Replica) executeBatchWithConcurrencyRetries(
}, requestEvalKind)
if pErr != nil {
if poisonErr := (*poison.PoisonedError)(nil); errors.As(pErr.GoError(), &poisonErr) {
// NB: we make the breaker error (which may be nil at this point, but
// usually is not) a secondary error, meaning it is not in the error
// chain. That is fine; the important bits to investigate
// programmatically are the ReplicaUnavailableError (which contains the
// descriptor) and the *PoisonedError (which contains the concrete
// subspan that caused this request to fail). We mark
// circuit.ErrBreakerOpen into the chain as well so that we have the
// invariant that all replica circuit breaker errors contain both
// ErrBreakerOpen and ReplicaUnavailableError.
pErr = kvpb.NewError(r.replicaUnavailableError(errors.CombineErrors(
errors.Mark(poisonErr, circuit.ErrBreakerOpen),
r.breaker.Signal().Err(),
)))
// It's possible that intent resolution accessed txn info anchored on a
// different range and hit a poisoned latch there, in which case we want
// to propagate its ReplicaUnavailableError instead of creating one for
// this range (which likely isn't tripped).
if !errors.HasType(pErr.GoError(), (*kvpb.ReplicaUnavailableError)(nil)) {
// NB: we make the breaker error (which may be nil at this point, but
// usually is not) a secondary error, meaning it is not in the error
// chain. That is fine; the important bits to investigate
// programmatically are the ReplicaUnavailableError (which contains the
// descriptor) and the *PoisonedError (which contains the concrete
// subspan that caused this request to fail). We mark
// circuit.ErrBreakerOpen into the chain as well so that we have the
// invariant that all replica circuit breaker errors contain both
// ErrBreakerOpen and ReplicaUnavailableError.
pErr = kvpb.NewError(r.replicaUnavailableError(errors.CombineErrors(
errors.Mark(poisonErr, circuit.ErrBreakerOpen),
r.breaker.Signal().Err(),
)))
}
}
return nil, nil, pErr
} else if resp != nil {
Expand Down