Skip to content

Commit

Permalink
Backport 13856 - resharding error fix (#432)
Browse files Browse the repository at this point in the history
* VTGate Buffering: Use a more accurate heuristic for determining if we're doing a reshard (vitessio#13856)

Signed-off-by: Matt Lord <[email protected]>

* fix missing piece

---------

Signed-off-by: Matt Lord <[email protected]>
Co-authored-by: Matt Lord <[email protected]>
  • Loading branch information
tanjinx and mattlord authored Jul 1, 2024
1 parent bbfad9e commit 404332b
Show file tree
Hide file tree
Showing 4 changed files with 346 additions and 12 deletions.
1 change: 1 addition & 0 deletions examples/local/scripts/vtgate-up.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ vtgate \
--tablet_types_to_wait PRIMARY,REPLICA \
--service_map 'grpc-vtgateservice' \
--pid_file $VTDATAROOT/tmp/vtgate.pid \
--enable_buffer \
--mysql_auth_server_impl none \
> $VTDATAROOT/tmp/vtgate.out 2>&1 &

Expand Down
37 changes: 25 additions & 12 deletions go/vt/discovery/keyspace_events.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ import (

"google.golang.org/protobuf/proto"

"vitess.io/vitess/go/vt/key"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/srvtopo"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/topoproto"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)

// KeyspaceEventWatcher is an auxiliary watcher that watches all availability incidents
Expand Down Expand Up @@ -65,7 +67,7 @@ type KeyspaceEvent struct {

type ShardEvent struct {
Tablet *topodatapb.TabletAlias
Target *query.Target
Target *querypb.Target
Serving bool
}

Expand Down Expand Up @@ -124,17 +126,27 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool {
kss.mu.Lock()
defer kss.mu.Unlock()

// if the keyspace is gone, or if it has no known availability events, the keyspace
// cannot be in the middle of a resharding operation
// If the keyspace is gone, has no known availability events, or is in the middle of a
// MoveTables then the keyspace cannot be in the middle of a resharding operation.
if kss.deleted || kss.consistent {
return false
}

// for all the known shards, try to find a primary shard besides the one we're trying to access
// and which is currently healthy. if there are other healthy primaries in the keyspace, it means
// we're in the middle of a resharding operation
// If there are unequal and overlapping shards in the keyspace and any of them are
// currently serving then we assume that we are in the middle of a Reshard.
_, ckr, err := topo.ValidateShardName(currentShard)
if err != nil || ckr == nil { // Assume not and avoid potential panic
return false
}
for shard, sstate := range kss.shards {
if shard != currentShard && sstate.serving {
if !sstate.serving || shard == currentShard {
continue
}
_, skr, err := topo.ValidateShardName(shard)
if err != nil || skr == nil { // Assume not and avoid potential panic
return false
}
if key.KeyRangesIntersect(ckr, skr) {
return true
}
}
Expand All @@ -143,7 +155,7 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool {
}

type shardState struct {
target *query.Target
target *querypb.Target
serving bool
externallyReparented int64
currentPrimary *topodatapb.TabletAlias
Expand Down Expand Up @@ -426,7 +438,7 @@ func (kew *KeyspaceEventWatcher) getKeyspaceStatus(keyspace string) *keyspaceSta
// This is not a fully accurate heuristic, but it's good enough that we'd want to buffer the
// request for the given target under the assumption that the reason why it cannot be completed
// right now is transitory.
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bool {
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *querypb.Target) bool {
if target.TabletType != topodatapb.TabletType_PRIMARY {
return false
}
Expand All @@ -446,7 +458,8 @@ func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bo
// The shard state keeps track of the current primary and the last externally reparented time, which we can use
// to determine that there was a serving primary which now became non serving. This is only possible in a DemotePrimary
// RPC which are only called from ERS and PRS. So buffering will stop when these operations succeed.
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *query.Target) bool {
// We return the tablet alias of the primary if it is serving.
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *querypb.Target) bool {
if target.TabletType != topodatapb.TabletType_PRIMARY {
return false
}
Expand Down
Loading

0 comments on commit 404332b

Please sign in to comment.