diff --git a/examples/common/scripts/vtgate-up.sh b/examples/common/scripts/vtgate-up.sh index 03b85869e5d..e827cb613fc 100755 --- a/examples/common/scripts/vtgate-up.sh +++ b/examples/common/scripts/vtgate-up.sh @@ -39,6 +39,7 @@ vtgate \ --tablet_types_to_wait PRIMARY,REPLICA \ --service_map 'grpc-vtgateservice' \ --pid_file $VTDATAROOT/tmp/vtgate.pid \ + --enable_buffer \ --mysql_auth_server_impl none \ > $VTDATAROOT/tmp/vtgate.out 2>&1 & diff --git a/go/vt/discovery/keyspace_events.go b/go/vt/discovery/keyspace_events.go index bf5cfcdf1df..ec9988b500c 100644 --- a/go/vt/discovery/keyspace_events.go +++ b/go/vt/discovery/keyspace_events.go @@ -23,15 +23,17 @@ import ( "google.golang.org/protobuf/proto" + "vitess.io/vitess/go/vt/key" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/proto/query" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - vschemapb "vitess.io/vitess/go/vt/proto/vschema" "vitess.io/vitess/go/vt/sidecardb" "vitess.io/vitess/go/vt/srvtopo" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/topotools" + + querypb "vitess.io/vitess/go/vt/proto/query" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + vschemapb "vitess.io/vitess/go/vt/proto/vschema" ) // KeyspaceEventWatcher is an auxiliary watcher that watches all availability incidents @@ -71,7 +73,7 @@ type KeyspaceEvent struct { type ShardEvent struct { Tablet *topodatapb.TabletAlias - Target *query.Target + Target *querypb.Target Serving bool } @@ -142,18 +144,27 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool { kss.mu.Lock() defer kss.mu.Unlock() - // if the keyspace is gone, or if it has no known availability events, the keyspace - // cannot be in the middle of a resharding operation - if kss.deleted || kss.consistent { + // If the keyspace is gone, has no known availability events, or is in the middle of a + // MoveTables then the keyspace cannot be in the middle of a resharding operation. + if kss.deleted || kss.consistent || (kss.moveTablesState != nil && kss.moveTablesState.Typ != MoveTablesType(MoveTablesNone)) { return false } - // for all the known shards, try to find a primary shard besides the one we're trying to access - // and which is currently healthy. if there are other healthy primaries in the keyspace, it means - // we're in the middle of a resharding operation - // FIXME: probably doesn't work for anything other than 1->2 resharding + // If there are unequal and overlapping shards in the keyspace and any of them are + // currently serving then we assume that we are in the middle of a Reshard. + _, ckr, err := topo.ValidateShardName(currentShard) + if err != nil || ckr == nil { // Assume not and avoid potential panic + return false + } for shard, sstate := range kss.shards { - if shard != currentShard && sstate.serving { + if !sstate.serving || shard == currentShard { + continue + } + _, skr, err := topo.ValidateShardName(shard) + if err != nil || skr == nil { // Assume not and avoid potential panic + return false + } + if key.KeyRangeIntersect(ckr, skr) { return true } } @@ -162,7 +173,7 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool { } type shardState struct { - target *query.Target + target *querypb.Target serving bool externallyReparented int64 currentPrimary *topodatapb.TabletAlias @@ -585,7 +596,7 @@ func (kew *KeyspaceEventWatcher) getKeyspaceStatus(keyspace string) *keyspaceSta // This is not a fully accurate heuristic, but it's good enough that we'd want to buffer the // request for the given target under the assumption that the reason why it cannot be completed // right now is transitory. -func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bool { +func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *querypb.Target) bool { if target.TabletType != topodatapb.TabletType_PRIMARY { return false } @@ -606,7 +617,7 @@ func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bo // to determine that there was a serving primary which now became non serving. This is only possible in a DemotePrimary // RPC which are only called from ERS and PRS. So buffering will stop when these operations succeed. // We return the tablet alias of the primary if it is serving. -func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *query.Target) (*topodatapb.TabletAlias, bool) { +func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *querypb.Target) (*topodatapb.TabletAlias, bool) { if target.TabletType != topodatapb.TabletType_PRIMARY { return nil, false } diff --git a/go/vt/discovery/keyspace_events_test.go b/go/vt/discovery/keyspace_events_test.go index 456d8566e87..eb2babdd44c 100644 --- a/go/vt/discovery/keyspace_events_test.go +++ b/go/vt/discovery/keyspace_events_test.go @@ -24,10 +24,12 @@ import ( "github.com/stretchr/testify/require" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - vschemapb "vitess.io/vitess/go/vt/proto/vschema" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/faketopo" + + querypb "vitess.io/vitess/go/vt/proto/query" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + vschemapb "vitess.io/vitess/go/vt/proto/vschema" ) func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) { @@ -38,6 +40,7 @@ func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) { ts := faketopo.NewFakeTopoServer(factory) ts2 := &fakeTopoServer{} hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, cell, "") + defer hc.Close() kew := NewKeyspaceEventWatcher(context.Background(), ts2, hc, cell) kss := &keyspaceState{ kew: kew, @@ -55,6 +58,220 @@ func TestSrvKeyspaceWithNilNewKeyspace(t *testing.T) { require.True(t, kss.onSrvKeyspace(nil, nil)) } +// TestKeyspaceEventTypes confirms that the keyspace event watcher determines +// that the unavailability event is caused by the correct scenario. We should +// consider it to be caused by a resharding operation when the following +// conditions are present: +// 1. The keyspace is inconsistent (in the middle of an availability event) +// 2. The target tablet is a primary +// 3. The keyspace has overlapping shards +// 4. The overlapping shard's tablet is serving +// And we should consider the cause to be a primary not serving when the +// following conditions exist: +// 1. The keyspace is inconsistent (in the middle of an availability event) +// 2. The target tablet is a primary +// 3. The target tablet is not serving +// 4. The shard's externallyReparented time is not 0 +// 5. The shard's currentPrimary state is not nil +// We should never consider both as a possible cause given the same +// keyspace state. +func TestKeyspaceEventTypes(t *testing.T) { + cell := "cell" + keyspace := "testks" + factory := faketopo.NewFakeTopoFactory() + factory.AddCell(cell) + ts := faketopo.NewFakeTopoServer(factory) + ts2 := &fakeTopoServer{} + hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, cell, "") + defer hc.Close() + kew := NewKeyspaceEventWatcher(context.Background(), ts2, hc, cell) + + type testCase struct { + name string + kss *keyspaceState + shardToCheck string + expectResharding bool + expectPrimaryNotServing bool + } + + testCases := []testCase{ + { + name: "one to two resharding in progress", + kss: &keyspaceState{ + kew: kew, + keyspace: keyspace, + shards: map[string]*shardState{ + "-": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "-", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: false, + }, + "-80": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "-80", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: true, + }, + "80-": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "80-", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: false, + }, + }, + consistent: false, + }, + shardToCheck: "-", + expectResharding: true, + expectPrimaryNotServing: false, + }, + { + name: "two to four resharding in progress", + kss: &keyspaceState{ + kew: kew, + keyspace: keyspace, + shards: map[string]*shardState{ + "-80": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "-80", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: false, + }, + "80-": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "80-", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: true, + }, + "-40": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "-40", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: true, + }, + "40-80": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "40-80", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: true, + }, + "80-c0": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "80-c0", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: false, + }, + "c0-": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "c0-", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: false, + }, + }, + consistent: false, + }, + shardToCheck: "-80", + expectResharding: true, + expectPrimaryNotServing: false, + }, + { + name: "unsharded primary not serving", + kss: &keyspaceState{ + kew: kew, + keyspace: keyspace, + shards: map[string]*shardState{ + "-": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "-", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: false, + externallyReparented: time.Now().UnixNano(), + currentPrimary: &topodatapb.TabletAlias{ + Cell: cell, + Uid: 100, + }, + }, + }, + consistent: false, + }, + shardToCheck: "-", + expectResharding: false, + expectPrimaryNotServing: true, + }, + { + name: "sharded primary not serving", + kss: &keyspaceState{ + kew: kew, + keyspace: keyspace, + shards: map[string]*shardState{ + "-80": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "-80", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: false, + externallyReparented: time.Now().UnixNano(), + currentPrimary: &topodatapb.TabletAlias{ + Cell: cell, + Uid: 100, + }, + }, + "80-": { + target: &querypb.Target{ + Keyspace: keyspace, + Shard: "80-", + TabletType: topodatapb.TabletType_PRIMARY, + }, + serving: true, + }, + }, + consistent: false, + }, + shardToCheck: "-80", + expectResharding: false, + expectPrimaryNotServing: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + kew.mu.Lock() + kew.keyspaces[keyspace] = tc.kss + kew.mu.Unlock() + + require.NotNil(t, tc.kss.shards[tc.shardToCheck], "the specified shardToCheck of %q does not exist in the shardState", tc.shardToCheck) + + resharding := kew.TargetIsBeingResharded(tc.kss.shards[tc.shardToCheck].target) + require.Equal(t, resharding, tc.expectResharding, "TargetIsBeingResharded should return %t", tc.expectResharding) + + _, primaryDown := kew.PrimaryIsNotServing(tc.kss.shards[tc.shardToCheck].target) + require.Equal(t, primaryDown, tc.expectPrimaryNotServing, "PrimaryIsNotServing should return %t", tc.expectPrimaryNotServing) + }) + } +} + type fakeTopoServer struct { }