diff --git a/go/test/endtoend/tabletgateway/vtgate_test.go b/go/test/endtoend/tabletgateway/vtgate_test.go index 4e3ac83187e..18e66b81f1c 100644 --- a/go/test/endtoend/tabletgateway/vtgate_test.go +++ b/go/test/endtoend/tabletgateway/vtgate_test.go @@ -204,7 +204,7 @@ func TestReplicaTransactions(t *testing.T) { require.Nil(t, err) serving := replicaTablet.VttabletProcess.WaitForStatus("SERVING", time.Duration(60*time.Second)) assert.Equal(t, serving, true, "Tablet did not become ready within a reasonable time") - exec(t, readConn, fetchAllCustomers, "is either down or nonexistent") + exec(t, readConn, fetchAllCustomers, "not found") // create a new connection, should be able to query again readConn, err = mysql.Connect(ctx, &vtParams) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 0044477c5be..6649b96626d 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -381,7 +381,6 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) { hc.mu.Lock() defer hc.mu.Unlock() - key := hc.keyFromTablet(tablet) tabletAlias := tabletAliasString(topoproto.TabletAliasString(tablet.Alias)) // delete from authoritative map th, ok := hc.healthByAlias[tabletAlias] @@ -393,14 +392,18 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) { // which will call finalizeConn, which will close the connection. th.cancelFunc() delete(hc.healthByAlias, tabletAlias) - // delete from map by keyspace.shard.tabletType - ths, ok := hc.healthData[key] - if !ok { - log.Warningf("We have no health data for target: %v", key) - return + + // the tablet has been deleted from the authoritative healthByAlias map so let's ensure it's deleted + // from the healthData map as well, which means we need to delete any existing combinations of + // keyspace.shard.tabletType we may have had for the tablet + for _, key := range hc.keysFromTablet(tablet) { + if ths, ok := hc.healthData[key]; ok { + delete(ths, tabletAlias) + } } - delete(ths, tabletAlias) - // delete from healthy list + + // We only need to recompute the healthy record for the current state of the tablet + key := hc.keyFromTablet(tablet) healthy, ok := hc.healthy[key] if ok && len(healthy) > 0 { hc.recomputeHealthy(key) @@ -413,6 +416,15 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ defer hc.mu.Unlock() tabletAlias := tabletAliasString(topoproto.TabletAliasString(th.Tablet.Alias)) + // let's be sure that this tablet hasn't been deleted from the authortative map + // so that we're not racing to update it and in effect re-adding a copy of the + // tablet record that was deleted + _, ok := hc.healthByAlias[tabletAlias] + if !ok { + log.Infof("Tablet %s has been deleted, skipping health update", tabletAlias) + return + } + targetKey := hc.keyFromTarget(th.Target) targetChanged := prevTarget.TabletType != th.Target.TabletType || prevTarget.Keyspace != th.Target.Keyspace || prevTarget.Shard != th.Target.Shard if targetChanged { @@ -723,6 +735,15 @@ func (hc *HealthCheckImpl) keyFromTablet(tablet *topodata.Tablet) keyspaceShardT return keyspaceShardTabletType(fmt.Sprintf("%s.%s.%s", tablet.Keyspace, tablet.Shard, topoproto.TabletTypeLString(tablet.Type))) } +// keysFromTablet returns a slice of potential type based keys for a tablet +func (hc *HealthCheckImpl) keysFromTablet(tablet *topodata.Tablet) []keyspaceShardTabletType { + tabletTypeKeys := make([]keyspaceShardTabletType, len(topoproto.AllTabletTypes)) + for i, tabletType := range topoproto.AllTabletTypes { + tabletTypeKeys[i] = keyspaceShardTabletType(fmt.Sprintf("%s.%s.%s", tablet.Keyspace, tablet.Shard, topoproto.TabletTypeLString(tabletType))) + } + return tabletTypeKeys +} + // getAliasByCell should only be called while holding hc.mu func (hc *HealthCheckImpl) getAliasByCell(cell string) string { if alias, ok := hc.cellAliases[cell]; ok { diff --git a/go/vt/discovery/tablet_health_check.go b/go/vt/discovery/tablet_health_check.go index f2d36f9886c..f6f89715c9b 100644 --- a/go/vt/discovery/tablet_health_check.go +++ b/go/vt/discovery/tablet_health_check.go @@ -290,11 +290,9 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { if err != nil { hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1) // We have reason to suspect the tablet healthcheck record is corrupted or invalid so let's remove the tablet's record - // from the healthcheck cache and it will get re-added again if the tablet is reachable - if strings.Contains(err.Error(), "health stats mismatch") || - strings.HasSuffix(err.Error(), context.Canceled.Error()) || - strings.Contains(err.Error(), `"error reading from server: EOF", received prior goaway`) { - log.Warningf("tablet %s had a suspect healthcheck error: %s -- clearing cache record", thc.Tablet.Alias, err.Error()) + // from the healthcheck cache and the corrected tablet record will be fetched from the topology and added to the + // healthcheck cache again via the topology watcher. + if strings.Contains(err.Error(), "health stats mismatch") { hc.deleteTablet(thc.Tablet) return }