Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VTGate Buffering: Use a more accurate heuristic for determining if we're doing a reshard #13856

Merged
merged 12 commits into from
Aug 28, 2023
1 change: 1 addition & 0 deletions examples/common/scripts/vtgate-up.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ vtgate \
--tablet_types_to_wait PRIMARY,REPLICA \
--service_map 'grpc-vtgateservice' \
--pid_file $VTDATAROOT/tmp/vtgate.pid \
--enable_buffer \
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This merely makes it easier for users to test buffering related behavior in the local examples moving forward.

--mysql_auth_server_impl none \
> $VTDATAROOT/tmp/vtgate.out 2>&1 &

Expand Down
41 changes: 26 additions & 15 deletions go/vt/discovery/keyspace_events.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,17 @@ import (

"google.golang.org/protobuf/proto"

"vitess.io/vitess/go/vt/key"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vschemapb "vitess.io/vitess/go/vt/proto/vschema"
"vitess.io/vitess/go/vt/sidecardb"
"vitess.io/vitess/go/vt/srvtopo"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/topoproto"
"vitess.io/vitess/go/vt/topotools"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vschemapb "vitess.io/vitess/go/vt/proto/vschema"
)

// KeyspaceEventWatcher is an auxiliary watcher that watches all availability incidents
Expand Down Expand Up @@ -71,7 +73,7 @@ type KeyspaceEvent struct {

type ShardEvent struct {
Tablet *topodatapb.TabletAlias
Target *query.Target
Target *querypb.Target
Serving bool
}

Expand Down Expand Up @@ -142,18 +144,27 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool {
kss.mu.Lock()
defer kss.mu.Unlock()

// if the keyspace is gone, or if it has no known availability events, the keyspace
// cannot be in the middle of a resharding operation
if kss.deleted || kss.consistent {
// If the keyspace is gone, has no known availability events, or is in the middle of a
// MoveTables then the keyspace cannot be in the middle of a resharding operation.
if kss.deleted || kss.consistent || (kss.moveTablesState != nil && kss.moveTablesState.Typ != MoveTablesType(MoveTablesNone)) {
return false
}

// for all the known shards, try to find a primary shard besides the one we're trying to access
// and which is currently healthy. if there are other healthy primaries in the keyspace, it means
// we're in the middle of a resharding operation
// FIXME: probably doesn't work for anything other than 1->2 resharding
// If there are unequal and overlapping shards in the keyspace and any of them are
// currently serving then we assume that we are in the middle of a Reshard.
_, ckr, err := topo.ValidateShardName(currentShard)
if err != nil || ckr == nil { // Assume not and avoid potential panic
return false
}
for shard, sstate := range kss.shards {
if shard != currentShard && sstate.serving {
if !sstate.serving || shard == currentShard {
continue
}
_, skr, err := topo.ValidateShardName(shard)
if err != nil || skr == nil { // Assume not and avoid potential panic
return false
}
if key.KeyRangeIntersect(ckr, skr) {
return true
}
}
Expand All @@ -162,7 +173,7 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool {
}

type shardState struct {
target *query.Target
target *querypb.Target
serving bool
externallyReparented int64
currentPrimary *topodatapb.TabletAlias
Expand Down Expand Up @@ -585,7 +596,7 @@ func (kew *KeyspaceEventWatcher) getKeyspaceStatus(keyspace string) *keyspaceSta
// This is not a fully accurate heuristic, but it's good enough that we'd want to buffer the
// request for the given target under the assumption that the reason why it cannot be completed
// right now is transitory.
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bool {
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *querypb.Target) bool {
if target.TabletType != topodatapb.TabletType_PRIMARY {
return false
}
Expand All @@ -606,7 +617,7 @@ func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bo
// to determine that there was a serving primary which now became non serving. This is only possible in a DemotePrimary
// RPC which are only called from ERS and PRS. So buffering will stop when these operations succeed.
// We return the tablet alias of the primary if it is serving.
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *query.Target) (*topodatapb.TabletAlias, bool) {
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *querypb.Target) (*topodatapb.TabletAlias, bool) {
if target.TabletType != topodatapb.TabletType_PRIMARY {
return nil, false
}
Expand Down
221 changes: 219 additions & 2 deletions go/vt/discovery/keyspace_events_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ import (

"github.com/stretchr/testify/require"

topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vschemapb "vitess.io/vitess/go/vt/proto/vschema"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/faketopo"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vschemapb "vitess.io/vitess/go/vt/proto/vschema"
)

func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) {
Expand All @@ -38,6 +40,7 @@ func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) {
ts := faketopo.NewFakeTopoServer(factory)
ts2 := &fakeTopoServer{}
hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, cell, "")
defer hc.Close()
kew := NewKeyspaceEventWatcher(context.Background(), ts2, hc, cell)
kss := &keyspaceState{
kew: kew,
Expand All @@ -55,6 +58,220 @@ func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) {
require.True(t, kss.onSrvKeyspace(nil, nil))
}

// TestKeyspaceEventTypes confirms that the keyspace event watcher determines
// that the unavailability event is caused by the correct scenario. We should
// consider it to be caused by a resharding operation when the following
// conditions are present:
// 1. The keyspace is inconsistent (in the middle of an availability event)
// 2. The target tablet is a primary
// 3. The keyspace has overlapping shards
// 4. The overlapping shard's tablet is serving
// And we should consider the cause to be a primary not serving when the
// following conditions exist:
// 1. The keyspace is inconsistent (in the middle of an availability event)
// 2. The target tablet is a primary
// 3. The target tablet is not serving
// 4. The shard's externallyReparented time is not 0
// 5. The shard's currentPrimary state is not nil
// We should never consider both as a possible cause given the same
// keyspace state.
func TestKeyspaceEventTypes(t *testing.T) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❤️

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice test description 💯

cell := "cell"
keyspace := "testks"
factory := faketopo.NewFakeTopoFactory()
factory.AddCell(cell)
ts := faketopo.NewFakeTopoServer(factory)
ts2 := &fakeTopoServer{}
hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, cell, "")
defer hc.Close()
kew := NewKeyspaceEventWatcher(context.Background(), ts2, hc, cell)

type testCase struct {
name string
kss *keyspaceState
shardToCheck string
expectResharding bool
expectPrimaryNotServing bool
}

testCases := []testCase{
{
name: "one to two resharding in progress",
kss: &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: map[string]*shardState{
"-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
"-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"80-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
},
consistent: false,
},
shardToCheck: "-",
expectResharding: true,
expectPrimaryNotServing: false,
},
{
name: "two to four resharding in progress",
kss: &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: map[string]*shardState{
"-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
"80-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"-40": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-40",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"40-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "40-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"80-c0": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-c0",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
"c0-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "c0-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
},
consistent: false,
},
shardToCheck: "-80",
expectResharding: true,
expectPrimaryNotServing: false,
},
{
name: "unsharded primary not serving",
kss: &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: map[string]*shardState{
"-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
externallyReparented: time.Now().UnixNano(),
currentPrimary: &topodatapb.TabletAlias{
Cell: cell,
Uid: 100,
},
},
},
consistent: false,
},
shardToCheck: "-",
expectResharding: false,
expectPrimaryNotServing: true,
},
{
name: "sharded primary not serving",
kss: &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: map[string]*shardState{
"-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
externallyReparented: time.Now().UnixNano(),
currentPrimary: &topodatapb.TabletAlias{
Cell: cell,
Uid: 100,
},
},
"80-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
},
consistent: false,
},
shardToCheck: "-80",
expectResharding: false,
expectPrimaryNotServing: true,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
kew.mu.Lock()
kew.keyspaces[keyspace] = tc.kss
kew.mu.Unlock()

require.NotNil(t, tc.kss.shards[tc.shardToCheck], "the specified shardToCheck of %q does not exist in the shardState", tc.shardToCheck)

resharding := kew.TargetIsBeingResharded(tc.kss.shards[tc.shardToCheck].target)
require.Equal(t, resharding, tc.expectResharding, "TargetIsBeingResharded should return %t", tc.expectResharding)

_, primaryDown := kew.PrimaryIsNotServing(tc.kss.shards[tc.shardToCheck].target)
require.Equal(t, primaryDown, tc.expectPrimaryNotServing, "PrimaryIsNotServing should return %t", tc.expectPrimaryNotServing)
})
}
}

type fakeTopoServer struct {
}

Expand Down