From ecadcb20e2f6125fb497e50baba4fd4f4ca8dbf0 Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Fri, 9 Sep 2022 19:25:45 +0000 Subject: [PATCH] kvcoord: (partially) de-flake tpcc/multiregion Touches #85711 fixing one of the failure modes. In #85101 we deleted code in the span refresher interceptor that terminated WriteTooOld flags. We did so assuming these flags were only set in 19.2 servers, but that's not the case -- TestWTOBitTerminatedOnErrorResponses demonstrates that it's possible for the server to return error responses with the bit set if a response is combined with an error from another request in the same batch request. Since we were no longer terminating the flag, it was possible to update the TxnCoordSender's embedded txn with this bit, an then use it when issuing subsequent batch requests -- something we were asserting against. Release note: None Release justification: Bug fix --- .../kvclient/kvcoord/txn_coord_sender_test.go | 55 +++++++++++++++++++ .../kvcoord/txn_interceptor_span_refresher.go | 7 +++ 2 files changed, 62 insertions(+) diff --git a/pkg/kv/kvclient/kvcoord/txn_coord_sender_test.go b/pkg/kv/kvclient/kvcoord/txn_coord_sender_test.go index 9962a1f90a6d..e41d50aa8e00 100644 --- a/pkg/kv/kvclient/kvcoord/txn_coord_sender_test.go +++ b/pkg/kv/kvclient/kvcoord/txn_coord_sender_test.go @@ -931,6 +931,61 @@ func TestTxnCoordSenderTxnUpdatedOnError(t *testing.T) { } } +// TestWTOBitTerminatedOnErrorResponses is a regression test for #85711. It +// ensures that when batch request errors have the WTO bit set, subsequent +// request don't carry that bit (something that's asserted on in client-side +// interceptors). +func TestWTOBitTerminatedOnErrorResponses(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + ctx := context.Background() + + keyA := roachpb.Key("a") + keyB := roachpb.Key("b") + + s, _, db := serverutils.StartServer(t, base.TestServerArgs{}) + defer s.Stopper().Stop(ctx) + + // Split to ensure batch requests get split through the distsender. + require.NoError(t, db.AdminSplit(ctx, keyB /* splitKey */, hlc.MaxTimestamp /* expirationTimestamp */)) + + // Write a key that the txn-al CPut below doesn't expect, failing the batch + // request. + require.NoError(t, db.Put(ctx, keyB, []byte("b-unexpected"))) + + txn := db.NewTxn(ctx, "root") + + // Read a key to pin the read timestamp. We'll use it to trigger to WTO + // error below. + b0 := txn.NewBatch() + b0.Get(keyB) + require.NoError(t, txn.Run(ctx, b0)) + + // Write to keyA out-of-band to induce a WTO condition in the txn-al Put + // below. + require.NoError(t, db.Put(ctx, keyA, "a-unexpected")) + + // Send a batch (as part of a leaf txn) that will be split into two + // sub-batches: [Put(a), CPut(b, nil)]. Put(a) should observe the WTO bit + // set in the batch response, whereas the CPut induces an error response. + // Since these are two separate requests, the error response is combined + // with the batch request such that the error itself has the WTO bit set. In + // #85711 we observed that the bit was not terminated on the client side, + // and subsequent requests were issued with the WTO bit set, which tripped + // up assertions. + b1 := txn.NewBatch() + b1.Put(keyA, "a") + b1.CPut(keyB, "b", nil /* expValue */) + require.True(t, testutils.IsError(txn.Run(ctx, b1), "unexpected value")) + require.False(t, txn.TestingCloneTxn().WriteTooOld) // WTO bit is terminated + + b2 := txn.NewBatch() + b2.Put(keyB, "b") + require.NoError(t, txn.Run(ctx, b2)) + require.False(t, txn.TestingCloneTxn().WriteTooOld) // WTO bit is terminated + require.NoError(t, txn.Commit(ctx)) +} + // TestTxnMultipleCoord checks that multiple txn coordinators can be // used for reads by a single transaction, and their state can be combined. func TestTxnMultipleCoord(t *testing.T) { diff --git a/pkg/kv/kvclient/kvcoord/txn_interceptor_span_refresher.go b/pkg/kv/kvclient/kvcoord/txn_interceptor_span_refresher.go index 3a265d2db019..2bbd81c6c89a 100644 --- a/pkg/kv/kvclient/kvcoord/txn_interceptor_span_refresher.go +++ b/pkg/kv/kvclient/kvcoord/txn_interceptor_span_refresher.go @@ -224,6 +224,13 @@ func (sr *txnSpanRefresher) sendLockedWithRefreshAttempts( } br, pErr := sr.wrapped.SendLocked(ctx, ba) + // We might receive errors with the WriteTooOld flag set. This interceptor + // wants to always terminate that flag. In the case of an error, we can just + // ignore it. + if pErr != nil && pErr.GetTxn() != nil { + pErr.GetTxn().WriteTooOld = false + } + if pErr == nil && br.Txn.WriteTooOld { // If we got a response with the WriteTooOld flag set, then we pretend that // we got a WriteTooOldError, which will cause us to attempt to refresh and