Skip to content

Commit

Permalink
Prevent race condition betweeen tablet delete and update
Browse files Browse the repository at this point in the history
There was a race condition between deleting a tablet's healthcheck
record from the authoritative map (hc.healthByAlias) and updating
the same tablet's health data (hc.healthData) record. This could
cause us to effectively re-add a an updated copy of the tablet
healthcheck record after it's been deleted. This then leads to
"zombie" tablet records in the SHOW VITESS_TABLETS output as it
is based on what is in the hc.healthData map:
https://github.com/vitessio/vitess/blob/693c5dbdeacdd7a705b46ebce6776a5256c8cfef/go/vt/discovery/healthcheck.go#L537-L557

And purge all potential healthcheck records for a tablet alias by
type on delete.

Signed-off-by: Matt Lord <[email protected]>
  • Loading branch information
mattlord committed Nov 15, 2021
1 parent 693c5db commit 17737ab
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 14 deletions.
2 changes: 1 addition & 1 deletion go/test/endtoend/tabletgateway/vtgate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ func TestReplicaTransactions(t *testing.T) {
require.Nil(t, err)
serving := replicaTablet.VttabletProcess.WaitForStatus("SERVING", time.Duration(60*time.Second))
assert.Equal(t, serving, true, "Tablet did not become ready within a reasonable time")
exec(t, readConn, fetchAllCustomers, "is either down or nonexistent")
exec(t, readConn, fetchAllCustomers, "not found")

// create a new connection, should be able to query again
readConn, err = mysql.Connect(ctx, &vtParams)
Expand Down
37 changes: 29 additions & 8 deletions go/vt/discovery/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,6 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) {
hc.mu.Lock()
defer hc.mu.Unlock()

key := hc.keyFromTablet(tablet)
tabletAlias := tabletAliasString(topoproto.TabletAliasString(tablet.Alias))
// delete from authoritative map
th, ok := hc.healthByAlias[tabletAlias]
Expand All @@ -393,14 +392,18 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) {
// which will call finalizeConn, which will close the connection.
th.cancelFunc()
delete(hc.healthByAlias, tabletAlias)
// delete from map by keyspace.shard.tabletType
ths, ok := hc.healthData[key]
if !ok {
log.Warningf("We have no health data for target: %v", key)
return

// the tablet has been deleted from the authoritative healthByAlias map so let's ensure it's deleted
// from the healthData map as well, which means we need to delete any existing combinations of
// keyspace.shard.tabletType we may have had for the tablet
for _, key := range hc.keysFromTablet(tablet) {
if ths, ok := hc.healthData[key]; ok {
delete(ths, tabletAlias)
}
}
delete(ths, tabletAlias)
// delete from healthy list

// We only need to recompute the healthy record for the current state of the tablet
key := hc.keyFromTablet(tablet)
healthy, ok := hc.healthy[key]
if ok && len(healthy) > 0 {
hc.recomputeHealthy(key)
Expand All @@ -413,6 +416,15 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ
defer hc.mu.Unlock()

tabletAlias := tabletAliasString(topoproto.TabletAliasString(th.Tablet.Alias))
// let's be sure that this tablet hasn't been deleted from the authortative map
// so that we're not racing to update it and in effect re-adding a copy of the
// tablet record that was deleted
_, ok := hc.healthByAlias[tabletAlias]
if !ok {
log.Infof("Tablet %s has been deleted, skipping health update", tabletAlias)
return
}

targetKey := hc.keyFromTarget(th.Target)
targetChanged := prevTarget.TabletType != th.Target.TabletType || prevTarget.Keyspace != th.Target.Keyspace || prevTarget.Shard != th.Target.Shard
if targetChanged {
Expand Down Expand Up @@ -723,6 +735,15 @@ func (hc *HealthCheckImpl) keyFromTablet(tablet *topodata.Tablet) keyspaceShardT
return keyspaceShardTabletType(fmt.Sprintf("%s.%s.%s", tablet.Keyspace, tablet.Shard, topoproto.TabletTypeLString(tablet.Type)))
}

// keysFromTablet returns a slice of potential type based keys for a tablet
func (hc *HealthCheckImpl) keysFromTablet(tablet *topodata.Tablet) []keyspaceShardTabletType {
tabletTypeKeys := make([]keyspaceShardTabletType, len(topoproto.AllTabletTypes))
for i, tabletType := range topoproto.AllTabletTypes {
tabletTypeKeys[i] = keyspaceShardTabletType(fmt.Sprintf("%s.%s.%s", tablet.Keyspace, tablet.Shard, topoproto.TabletTypeLString(tabletType)))
}
return tabletTypeKeys
}

// getAliasByCell should only be called while holding hc.mu
func (hc *HealthCheckImpl) getAliasByCell(cell string) string {
if alias, ok := hc.cellAliases[cell]; ok {
Expand Down
8 changes: 3 additions & 5 deletions go/vt/discovery/tablet_health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,11 +290,9 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) {
if err != nil {
hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1)
// We have reason to suspect the tablet healthcheck record is corrupted or invalid so let's remove the tablet's record
// from the healthcheck cache and it will get re-added again if the tablet is reachable
if strings.Contains(err.Error(), "health stats mismatch") ||
strings.HasSuffix(err.Error(), context.Canceled.Error()) ||
strings.Contains(err.Error(), `"error reading from server: EOF", received prior goaway`) {
log.Warningf("tablet %s had a suspect healthcheck error: %s -- clearing cache record", thc.Tablet.Alias, err.Error())
// from the healthcheck cache and the corrected tablet record will be fetched from the topology and added to the
// healthcheck cache again via the topology watcher.
if strings.Contains(err.Error(), "health stats mismatch") {
hc.deleteTablet(thc.Tablet)
return
}
Expand Down

0 comments on commit 17737ab

Please sign in to comment.