Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VTGate Buffering: Use a more accurate heuristic for determining if we're doing a reshard #13856

Merged
merged 12 commits into from
Aug 28, 2023
1 change: 1 addition & 0 deletions examples/common/scripts/vtgate-up.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ vtgate \
--tablet_types_to_wait PRIMARY,REPLICA \
--service_map 'grpc-vtgateservice' \
--pid_file $VTDATAROOT/tmp/vtgate.pid \
--enable_buffer \
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This merely makes it easier for users to test buffering related behavior in the local examples moving forward.

--mysql_auth_server_impl none \
> $VTDATAROOT/tmp/vtgate.out 2>&1 &

Expand Down
43 changes: 28 additions & 15 deletions go/vt/discovery/keyspace_events.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,17 @@ import (

"google.golang.org/protobuf/proto"

"vitess.io/vitess/go/vt/key"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vschemapb "vitess.io/vitess/go/vt/proto/vschema"
"vitess.io/vitess/go/vt/sidecardb"
"vitess.io/vitess/go/vt/srvtopo"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/topoproto"
"vitess.io/vitess/go/vt/topotools"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vschemapb "vitess.io/vitess/go/vt/proto/vschema"
)

// KeyspaceEventWatcher is an auxiliary watcher that watches all availability incidents
Expand Down Expand Up @@ -71,7 +73,7 @@ type KeyspaceEvent struct {

type ShardEvent struct {
Tablet *topodatapb.TabletAlias
Target *query.Target
Target *querypb.Target
Serving bool
}

Expand Down Expand Up @@ -142,18 +144,29 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool {
kss.mu.Lock()
defer kss.mu.Unlock()

// if the keyspace is gone, or if it has no known availability events, the keyspace
// cannot be in the middle of a resharding operation
if kss.deleted || kss.consistent {
// If the keyspace is gone, if it has no known availability events, or the keyspace
// is in the middle of a MoveTables then keyspace cannot be in the middle of a
// resharding operation.
if kss.deleted || kss.consistent || (kss.moveTablesState != nil && kss.moveTablesState.Typ != MoveTablesType(MoveTablesNone)) {
return false
}

// for all the known shards, try to find a primary shard besides the one we're trying to access
// and which is currently healthy. if there are other healthy primaries in the keyspace, it means
// we're in the middle of a resharding operation
// FIXME: probably doesn't work for anything other than 1->2 resharding
// If there are unequal and overlapping shards in the keyspace and any
// of them are currently serving then we assume that we are in the middle
// of a Reshard operation.
_, ckr, err := topo.ValidateShardName(currentShard)
if err != nil || ckr == nil { // Assume not and avoid potential panic
return false
}
for shard, sstate := range kss.shards {
if shard != currentShard && sstate.serving {
if !sstate.serving || shard == currentShard {
continue
}
_, skr, err := topo.ValidateShardName(shard)
if err != nil || skr == nil { // Assume not and avoid potential panic
return false
}
if key.KeyRangeIntersect(ckr, skr) {
return true
}
}
Expand All @@ -162,7 +175,7 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool {
}

type shardState struct {
target *query.Target
target *querypb.Target
serving bool
externallyReparented int64
currentPrimary *topodatapb.TabletAlias
Expand Down Expand Up @@ -585,7 +598,7 @@ func (kew *KeyspaceEventWatcher) getKeyspaceStatus(keyspace string) *keyspaceSta
// This is not a fully accurate heuristic, but it's good enough that we'd want to buffer the
// request for the given target under the assumption that the reason why it cannot be completed
// right now is transitory.
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bool {
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *querypb.Target) bool {
if target.TabletType != topodatapb.TabletType_PRIMARY {
return false
}
Expand All @@ -606,7 +619,7 @@ func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bo
// to determine that there was a serving primary which now became non serving. This is only possible in a DemotePrimary
// RPC which are only called from ERS and PRS. So buffering will stop when these operations succeed.
// We return the tablet alias of the primary if it is serving.
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *query.Target) (*topodatapb.TabletAlias, bool) {
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *querypb.Target) (*topodatapb.TabletAlias, bool) {
if target.TabletType != topodatapb.TabletType_PRIMARY {
return nil, false
}
Expand Down
149 changes: 147 additions & 2 deletions go/vt/discovery/keyspace_events_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ import (

"github.com/stretchr/testify/require"

topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vschemapb "vitess.io/vitess/go/vt/proto/vschema"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/faketopo"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
vschemapb "vitess.io/vitess/go/vt/proto/vschema"
)

func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) {
Expand Down Expand Up @@ -55,6 +57,149 @@ func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) {
require.True(t, kss.onSrvKeyspace(nil, nil))
}

// TestTargetIsBeingResharded confirms that the keyspace event watcher thinks that a
// resharding operation is underway when the expected conditions are present:
// 1. The keyspace is inconsistent
// 2. The target tablet is primary
// 3. The keyspace has overlapping shards
// 4. The overlapping shard's tablet is serving
func TestTargetIsBeingResharded(t *testing.T) {
cell := "cell"
keyspace := "testks"
factory := faketopo.NewFakeTopoFactory()
factory.AddCell(cell)
ts := faketopo.NewFakeTopoServer(factory)
ts2 := &fakeTopoServer{}
hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, cell, "")
kew := NewKeyspaceEventWatcher(context.Background(), ts2, hc, cell)
kss := &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: map[string]*shardState{
"-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
"80-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
},
"-40": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-40",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"40-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "40-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"80-c0": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-c0",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
"c0-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "c0-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
},
}
kew.mu.Lock()
kew.keyspaces[keyspace] = &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: kss.shards,
consistent: false,
}
kew.mu.Unlock()

resharding := kew.TargetIsBeingResharded(kss.shards["-80"].target)
require.True(t, resharding, "TargetIsBeingResharded should return true")
}

// TestPrimaryIsNotServing confirms that the keyspace event watcher thinks that there
// is NOT a resharding operation underway and that it reports the expected primary not
// serving state when one shard's primary is not serving, meaning that the following
// conditions are met:
// 1. The keyspace is inconsistent
// 2. The target tablet is a primary
// 3. The target tablet is not serving
// 4. The shard's externallyReparented state is not 0
// 5. The shard's currentPrimary state is not nil
func TestPrimaryIsNotServing(t *testing.T) {
cell := "cell"
keyspace := "testks"
factory := faketopo.NewFakeTopoFactory()
factory.AddCell(cell)
ts := faketopo.NewFakeTopoServer(factory)
ts2 := &fakeTopoServer{}
hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, cell, "")
kew := NewKeyspaceEventWatcher(context.Background(), ts2, hc, cell)
kss := &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: map[string]*shardState{
"-80": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "-80",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: false,
externallyReparented: time.Now().UnixNano(),
currentPrimary: &topodatapb.TabletAlias{
Cell: cell,
Uid: 100,
},
},
"80-": {
target: &querypb.Target{
Keyspace: keyspace,
Shard: "80-",
TabletType: topodatapb.TabletType_PRIMARY,
},
serving: true,
},
},
}
kew.mu.Lock()
kew.keyspaces[keyspace] = &keyspaceState{
kew: kew,
keyspace: keyspace,
shards: kss.shards,
consistent: false,
}
kew.mu.Unlock()

resharding := kew.TargetIsBeingResharded(kss.shards["-80"].target)
require.False(t, resharding, "TargetIsBeingResharded should return false")

_, primaryDown := kew.PrimaryIsNotServing(kss.shards["-80"].target)
require.True(t, primaryDown, "PrimaryIsNotServing should return true")
}

type fakeTopoServer struct {
}

Expand Down