Skip to content

Commit

Permalink
VTGate Buffering: Use a more accurate heuristic for determining if we…
Browse files Browse the repository at this point in the history
…'re doing a reshard (vitessio#13856)

Signed-off-by: Matt Lord <[email protected]>
  • Loading branch information
mattlord authored and timvaillancourt committed May 28, 2024
1 parent 53605bf commit b3262d7
Show file tree
Hide file tree
Showing 3 changed files with 244 additions and 14 deletions.
1 change: 1 addition & 0 deletions examples/local/scripts/vtgate-up.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ vtgate \
--tablet_types_to_wait PRIMARY,REPLICA \
--service_map 'grpc-vtgateservice' \
--pid_file $VTDATAROOT/tmp/vtgate.pid \
--enable_buffer \
--mysql_auth_server_impl none \
> $VTDATAROOT/tmp/vtgate.out 2>&1 &

Expand Down
36 changes: 24 additions & 12 deletions go/vt/discovery/keyspace_events.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ import (

"google.golang.org/protobuf/proto"

"vitess.io/vitess/go/vt/key"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/srvtopo"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/topoproto"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)

// KeyspaceEventWatcher is an auxiliary watcher that watches all availability incidents
Expand Down Expand Up @@ -65,7 +67,7 @@ type KeyspaceEvent struct {

type ShardEvent struct {
Tablet *topodatapb.TabletAlias
Target *query.Target
Target *querypb.Target
Serving bool
}

Expand Down Expand Up @@ -124,17 +126,27 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool {
kss.mu.Lock()
defer kss.mu.Unlock()

// if the keyspace is gone, or if it has no known availability events, the keyspace
// cannot be in the middle of a resharding operation
// If the keyspace is gone, has no known availability events, or is in the middle of a
// MoveTables then the keyspace cannot be in the middle of a resharding operation.
if kss.deleted || kss.consistent {
return false
}

// for all the known shards, try to find a primary shard besides the one we're trying to access
// and which is currently healthy. if there are other healthy primaries in the keyspace, it means
// we're in the middle of a resharding operation
// If there are unequal and overlapping shards in the keyspace and any of them are
// currently serving then we assume that we are in the middle of a Reshard.
_, ckr, err := topo.ValidateShardName(currentShard)
if err != nil || ckr == nil { // Assume not and avoid potential panic
return false
}
for shard, sstate := range kss.shards {
if shard != currentShard && sstate.serving {
if !sstate.serving || shard == currentShard {
continue
}
_, skr, err := topo.ValidateShardName(shard)
if err != nil || skr == nil { // Assume not and avoid potential panic
return false
}
if key.KeyRangesIntersect(ckr, skr) {
return true
}
}
Expand All @@ -143,7 +155,7 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool {
}

type shardState struct {
target *query.Target
target *querypb.Target
serving bool
externallyReparented int64
currentPrimary *topodatapb.TabletAlias
Expand Down Expand Up @@ -426,7 +438,7 @@ func (kew *KeyspaceEventWatcher) getKeyspaceStatus(keyspace string) *keyspaceSta
// This is not a fully accurate heuristic, but it's good enough that we'd want to buffer the
// request for the given target under the assumption that the reason why it cannot be completed
// right now is transitory.
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bool {
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *querypb.Target) bool {
if target.TabletType != topodatapb.TabletType_PRIMARY {
return false
}
Expand All @@ -446,7 +458,7 @@ func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bo
// The shard state keeps track of the current primary and the last externally reparented time, which we can use
// to determine that there was a serving primary which now became non serving. This is only possible in a DemotePrimary
// RPC which are only called from ERS and PRS. So buffering will stop when these operations succeed.
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *query.Target) bool {
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *querypb.Target) bool {
if target.TabletType != topodatapb.TabletType_PRIMARY {
return false
}
Expand Down
221 changes: 219 additions & 2 deletions go/vt/discovery/keyspace_events_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ import (

"github.com/stretchr/testify/require"

topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vschemapb "vitess.io/vitess/go/vt/proto/vschema"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/faketopo"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vschemapb "vitess.io/vitess/go/vt/proto/vschema"
)

func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) {
Expand All @@ -38,6 +40,7 @@ func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) {
ts := faketopo.NewFakeTopoServer(factory)
ts2 := &fakeTopoServer{}
hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, cell, "")
defer hc.Close()
kew := NewKeyspaceEventWatcher(context.Background(), ts2, hc, cell)
kss := &keyspaceState{
kew: kew,
Expand All @@ -55,6 +58,220 @@ func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) {
require.True(t, kss.onSrvKeyspace(nil, nil))
}

// TestKeyspaceEventTypes confirms that the keyspace event watcher determines
// that the unavailability event is caused by the correct scenario. We should
// consider it to be caused by a resharding operation when the following
// conditions are present:
// 1. The keyspace is inconsistent (in the middle of an availability event)
// 2. The target tablet is a primary
// 3. The keyspace has overlapping shards
// 4. The overlapping shard's tablet is serving
// And we should consider the cause to be a primary not serving when the
// following conditions exist:
// 1. The keyspace is inconsistent (in the middle of an availability event)
// 2. The target tablet is a primary
// 3. The target tablet is not serving
// 4. The shard's externallyReparented time is not 0
// 5. The shard's currentPrimary state is not nil
// We should never consider both as a possible cause given the same
// keyspace state.
func TestKeyspaceEventTypes(t *testing.T) {
cell := "cell"
keyspace := "testks"
factory := faketopo.NewFakeTopoFactory()
factory.AddCell(cell)
ts := faketopo.NewFakeTopoServer(factory)
ts2 := &fakeTopoServer{}
hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, cell, "")
defer hc.Close()
kew := NewKeyspaceEventWatcher(context.Background(), ts2, hc, cell)

type testCase struct {
name string
kss *keyspaceState
shardToCheck string
expectResharding bool
expectPrimaryNotServing bool
}

testCases := []testCase{
{
name: "one to two resharding in progress",
kss: &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: map[string]*shardState{
"-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
"-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"80-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
},
consistent: false,
},
shardToCheck: "-",
expectResharding: true,
expectPrimaryNotServing: false,
},
{
name: "two to four resharding in progress",
kss: &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: map[string]*shardState{
"-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
"80-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"-40": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-40",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"40-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "40-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"80-c0": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-c0",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
"c0-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "c0-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
},
consistent: false,
},
shardToCheck: "-80",
expectResharding: true,
expectPrimaryNotServing: false,
},
{
name: "unsharded primary not serving",
kss: &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: map[string]*shardState{
"-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
externallyReparented: time.Now().UnixNano(),
currentPrimary: &topodatapb.TabletAlias{
Cell: cell,
Uid: 100,
},
},
},
consistent: false,
},
shardToCheck: "-",
expectResharding: false,
expectPrimaryNotServing: true,
},
{
name: "sharded primary not serving",
kss: &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: map[string]*shardState{
"-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
externallyReparented: time.Now().UnixNano(),
currentPrimary: &topodatapb.TabletAlias{
Cell: cell,
Uid: 100,
},
},
"80-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
},
consistent: false,
},
shardToCheck: "-80",
expectResharding: false,
expectPrimaryNotServing: true,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
kew.mu.Lock()
kew.keyspaces[keyspace] = tc.kss
kew.mu.Unlock()

require.NotNil(t, tc.kss.shards[tc.shardToCheck], "the specified shardToCheck of %q does not exist in the shardState", tc.shardToCheck)

resharding := kew.TargetIsBeingResharded(tc.kss.shards[tc.shardToCheck].target)
require.Equal(t, resharding, tc.expectResharding, "TargetIsBeingResharded should return %t", tc.expectResharding)

primaryDown := kew.PrimaryIsNotServing(tc.kss.shards[tc.shardToCheck].target)
require.Equal(t, primaryDown, tc.expectPrimaryNotServing, "PrimaryIsNotServing should return %t", tc.expectPrimaryNotServing)
})
}
}

type fakeTopoServer struct {
}

Expand Down

0 comments on commit b3262d7

Please sign in to comment.