From a1b7a1f3530cdf22d72322c2e108543c081e2d95 Mon Sep 17 00:00:00 2001 From: deepthi Date: Tue, 9 Mar 2021 17:37:53 -0800 Subject: [PATCH 1/3] healthcheck: update healthy tablets correctly when a stream returns an error or times out Signed-off-by: deepthi --- go/vt/discovery/healthcheck.go | 29 +++++++++++++------------- go/vt/discovery/healthcheck_test.go | 11 ++++++++++ go/vt/discovery/tablet_health_check.go | 15 +++++++++---- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 02eb035ea21..c672e88ac05 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -404,17 +404,17 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) { } } -func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, shr *query.StreamHealthResponse, currentTarget *query.Target, trivialNonMasterUpdate bool, isMasterUpdate bool, isMasterChange bool) { +func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, currentTarget *query.Target, trivialNonMasterUpdate bool, isMasterUpdate bool, isMasterChange bool) { // hc.healthByAlias is authoritative, it should be updated hc.mu.Lock() defer hc.mu.Unlock() - tabletAlias := tabletAliasString(topoproto.TabletAliasString(shr.TabletAlias)) - - hcErrorCounters.Add([]string{shr.Target.Keyspace, shr.Target.Shard, topoproto.TabletTypeLString(shr.Target.TabletType)}, 0) - targetKey := hc.keyFromTarget(shr.Target) - targetChanged := currentTarget.TabletType != shr.Target.TabletType || currentTarget.Keyspace != shr.Target.Keyspace || currentTarget.Shard != shr.Target.Shard + tabletAlias := tabletAliasString(topoproto.TabletAliasString(th.Tablet.Alias)) + targetKey := hc.keyFromTarget(th.Target) + targetChanged := currentTarget.TabletType != th.Target.TabletType || currentTarget.Keyspace != th.Target.Keyspace || currentTarget.Shard != th.Target.Shard if targetChanged { + // Error counter has to be set here in case we get a new tablet type for the first time in a stream response + hcErrorCounters.Add([]string{th.Target.Keyspace, th.Target.Shard, topoproto.TabletTypeLString(th.Target.TabletType)}, 0) // keyspace and shard are not expected to change, but just in case ... // move this tabletHealthCheck to the correct map oldTargetKey := hc.keyFromTarget(currentTarget) @@ -433,12 +433,12 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, shr *query.StreamHealt } else { // We already have one up server, see if we // need to replace it. - if shr.TabletExternallyReparentedTimestamp < hc.healthy[targetKey][0].MasterTermStartTime { + if th.MasterTermStartTime < hc.healthy[targetKey][0].MasterTermStartTime { log.Warningf("not marking healthy master %s as Up for %s because its MasterTermStartTime is smaller than the highest known timestamp from previous MASTERs %s: %d < %d ", - topoproto.TabletAliasString(shr.TabletAlias), - topoproto.KeyspaceShardString(shr.Target.Keyspace, shr.Target.Shard), + topoproto.TabletAliasString(th.Tablet.Alias), + topoproto.KeyspaceShardString(th.Target.Keyspace, th.Target.Shard), topoproto.TabletAliasString(hc.healthy[targetKey][0].Tablet.Alias), - shr.TabletExternallyReparentedTimestamp, + th.MasterTermStartTime, hc.healthy[targetKey][0].MasterTermStartTime) } else { // Just replace it. @@ -450,21 +450,20 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, shr *query.StreamHealt // We re-sort the healthy tablet list whenever we get a health update for tablets we can route to. // Tablets from other cells for non-master targets should not trigger a re-sort; // they should also be excluded from healthy list. - if shr.Target.TabletType != topodata.TabletType_MASTER && hc.isIncluded(shr.Target.TabletType, shr.TabletAlias) { + if th.Target.TabletType != topodata.TabletType_MASTER && hc.isIncluded(th.Target.TabletType, th.Tablet.Alias) { hc.recomputeHealthy(targetKey) } - if targetChanged && currentTarget.TabletType != topodata.TabletType_MASTER && hc.isIncluded(shr.Target.TabletType, shr.TabletAlias) { // also recompute old target's healthy list + if targetChanged && currentTarget.TabletType != topodata.TabletType_MASTER && hc.isIncluded(th.Target.TabletType, th.Tablet.Alias) { // also recompute old target's healthy list oldTargetKey := hc.keyFromTarget(currentTarget) hc.recomputeHealthy(oldTargetKey) } } if isMasterChange { - log.Errorf("Adding 1 to MasterPromoted counter for tablet: %v, shr.Tablet: %v, shr.TabletType: %v", currentTarget, topoproto.TabletAliasString(shr.TabletAlias), shr.Target.TabletType) - hcMasterPromotedCounters.Add([]string{shr.Target.Keyspace, shr.Target.Shard}, 1) + log.Errorf("Adding 1 to MasterPromoted counter for target: %v, tablet: %v, tabletType: %v", currentTarget, topoproto.TabletAliasString(th.Tablet.Alias), th.Target.TabletType) + hcMasterPromotedCounters.Add([]string{th.Target.Keyspace, th.Target.Shard}, 1) } // broadcast to subscribers hc.broadcast(th) - } func (hc *HealthCheckImpl) recomputeHealthy(key keyspaceShardTabletType) { diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index e2b4d125f29..3b4046b58b5 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -65,6 +65,8 @@ func init() { } func TestHealthCheck(t *testing.T) { + // reset error counters + hcErrorCounters.ResetAll() ts := memorytopo.NewServer("cell") hc := createTestHc(ts) // close healthcheck @@ -257,6 +259,9 @@ func TestHealthCheckStreamError(t *testing.T) { result = <-resultChan //TODO: figure out how to compare objects that contain errors using utils.MustMatch assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result) + // tablet should be removed from healthy list + a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) + assert.Empty(t, a, "wrong result, expected empty list") } func TestHealthCheckVerifiesTabletAlias(t *testing.T) { @@ -363,6 +368,8 @@ func TestHealthCheckCloseWaitsForGoRoutines(t *testing.T) { } func TestHealthCheckTimeout(t *testing.T) { + // reset counters + hcErrorCounters.ResetAll() ts := memorytopo.NewServer("cell") hc := createTestHc(ts) hc.healthCheckTimeout = 500 * time.Millisecond @@ -410,6 +417,10 @@ func TestHealthCheckTimeout(t *testing.T) { assert.Nil(t, checkErrorCounter("k", "s", topodatapb.TabletType_REPLICA, 1)) assert.True(t, fc.isCanceled(), "StreamHealth should be canceled after timeout, but is not") + // tablet should be removed from healthy list + a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) + assert.Empty(t, a, "wrong result, expected empty list") + // repeat the wait. It will timeout one more time trying to get the connection. fc.resetCanceledFlag() time.Sleep(hc.healthCheckTimeout) diff --git a/go/vt/discovery/tablet_health_check.go b/go/vt/discovery/tablet_health_check.go index 4882ac39341..2f36f78e765 100644 --- a/go/vt/discovery/tablet_health_check.go +++ b/go/vt/discovery/tablet_health_check.go @@ -202,7 +202,7 @@ func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.St thc.setServingState(serving, reason) // notify downstream for master change - hc.updateHealth(thc.SimpleCopy(), shr, currentTarget, trivialNonMasterUpdate, isMasterUpdate, isMasterChange) + hc.updateHealth(thc.SimpleCopy(), currentTarget, trivialNonMasterUpdate, isMasterUpdate, isMasterChange) return nil } @@ -241,6 +241,9 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { hc.connsWG.Done() }() + // Initialize error counter + hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 0) + retryDelay := hc.retryDelay for { streamCtx, streamCancel := context.WithCancel(thc.ctx) @@ -287,12 +290,14 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { streamCancel() if err != nil { + hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1) if strings.Contains(err.Error(), "health stats mismatch") { hc.deleteTablet(thc.Tablet) return } - res := thc.SimpleCopy() - hc.broadcast(res) + // trivialNonMasterUpdate = false because this is an error + // It is safe to pass isMasterChange = false because in case of an error there is no information that can be used to update the master + hc.updateHealth(thc.SimpleCopy(), thc.Target, false, thc.Target.TabletType == topodata.TabletType_MASTER, false) } // If there was a timeout send an error. We do this after stream has returned. // This will ensure that this update prevails over any previous message that @@ -301,7 +306,9 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { thc.LastError = fmt.Errorf("healthcheck timed out (latest %v)", thc.lastResponseTimestamp) thc.setServingState(false, thc.LastError.Error()) hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1) - hc.broadcast(thc.SimpleCopy()) + // trivialNonMasterUpdate = false because this is an error + // It is safe to pass isMasterChange = false because in case of timeout there is no information that can be used to update the master + hc.updateHealth(thc.SimpleCopy(), thc.Target, false, thc.Target.TabletType == topodata.TabletType_MASTER, false) } // Streaming RPC failed e.g. because vttablet was restarted or took too long. From 7eb7228bd2666438729db73ccf013c3d7b42e507 Mon Sep 17 00:00:00 2001 From: deepthi Date: Wed, 10 Mar 2021 17:34:38 -0800 Subject: [PATCH 2/3] healthcheck: remove PRIMARY from healthy list when there's an error or timeout from healthcheck stream Signed-off-by: deepthi --- go/vt/discovery/healthcheck.go | 41 ++++++++++-------- go/vt/discovery/healthcheck_test.go | 60 ++++++++++++++++++++++++++ go/vt/discovery/tablet_health_check.go | 18 ++++---- 3 files changed, 91 insertions(+), 28 deletions(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index c672e88ac05..f091a13a441 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -404,7 +404,7 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) { } } -func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, currentTarget *query.Target, trivialNonMasterUpdate bool, isMasterUpdate bool, isMasterChange bool) { +func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, currentTarget *query.Target, trivialUpdate bool, isPrimaryUp bool) { // hc.healthByAlias is authoritative, it should be updated hc.mu.Lock() defer hc.mu.Unlock() @@ -427,26 +427,31 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, currentTarget *query.T // add it to the map by target hc.healthData[targetKey][tabletAlias] = th - if isMasterUpdate { - if len(hc.healthy[targetKey]) == 0 { - hc.healthy[targetKey] = append(hc.healthy[targetKey], th) - } else { - // We already have one up server, see if we - // need to replace it. - if th.MasterTermStartTime < hc.healthy[targetKey][0].MasterTermStartTime { - log.Warningf("not marking healthy master %s as Up for %s because its MasterTermStartTime is smaller than the highest known timestamp from previous MASTERs %s: %d < %d ", - topoproto.TabletAliasString(th.Tablet.Alias), - topoproto.KeyspaceShardString(th.Target.Keyspace, th.Target.Shard), - topoproto.TabletAliasString(hc.healthy[targetKey][0].Tablet.Alias), - th.MasterTermStartTime, - hc.healthy[targetKey][0].MasterTermStartTime) + if th.Target.TabletType == topodata.TabletType_MASTER { + if isPrimaryUp { + if len(hc.healthy[targetKey]) == 0 { + hc.healthy[targetKey] = append(hc.healthy[targetKey], th) } else { - // Just replace it. - hc.healthy[targetKey][0] = th + // We already have one up server, see if we + // need to replace it. + if th.MasterTermStartTime < hc.healthy[targetKey][0].MasterTermStartTime { + log.Warningf("not marking healthy master %s as Up for %s because its MasterTermStartTime is smaller than the highest known timestamp from previous MASTERs %s: %d < %d ", + topoproto.TabletAliasString(th.Tablet.Alias), + topoproto.KeyspaceShardString(th.Target.Keyspace, th.Target.Shard), + topoproto.TabletAliasString(hc.healthy[targetKey][0].Tablet.Alias), + th.MasterTermStartTime, + hc.healthy[targetKey][0].MasterTermStartTime) + } else { + // Just replace it. + hc.healthy[targetKey][0] = th + } } + } else { + // No healthy master tablet + hc.healthy[targetKey] = []*TabletHealth{} } } - if !trivialNonMasterUpdate { + if !trivialUpdate { // We re-sort the healthy tablet list whenever we get a health update for tablets we can route to. // Tablets from other cells for non-master targets should not trigger a re-sort; // they should also be excluded from healthy list. @@ -458,7 +463,7 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, currentTarget *query.T hc.recomputeHealthy(oldTargetKey) } } - if isMasterChange { + if currentTarget.TabletType != topodata.TabletType_MASTER && th.Target.TabletType == topodata.TabletType_MASTER { log.Errorf("Adding 1 to MasterPromoted counter for target: %v, tablet: %v, tabletType: %v", currentTarget, topoproto.TabletAliasString(th.Tablet.Alias), th.Target.TabletType) hcMasterPromotedCounters.Add([]string{th.Target.Keyspace, th.Target.Shard}, 1) } diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index 3b4046b58b5..2d554ab2cf2 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -264,6 +264,66 @@ func TestHealthCheckStreamError(t *testing.T) { assert.Empty(t, a, "wrong result, expected empty list") } +// TestHealthCheckErrorOnPrimary is the same as TestHealthCheckStreamError except for tablet type +func TestHealthCheckErrorOnPrimary(t *testing.T) { + ts := memorytopo.NewServer("cell") + hc := createTestHc(ts) + defer hc.Close() + + tablet := createTestTablet(0, "cell", "a") + input := make(chan *querypb.StreamHealthResponse) + resultChan := hc.Subscribe() + fc := createFakeConn(tablet, input) + fc.errCh = make(chan error) + hc.AddTablet(tablet) + + // Immediately after AddTablet() there will be the first notification. + want := &TabletHealth{ + Tablet: tablet, + Target: &querypb.Target{Keyspace: "k", Shard: "s"}, + Serving: false, + MasterTermStartTime: 0, + } + result := <-resultChan + mustMatch(t, want, result, "Wrong TabletHealth data") + + // one tablet after receiving a StreamHealthResponse + shr := &querypb.StreamHealthResponse{ + TabletAlias: tablet.Alias, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_MASTER}, + Serving: true, + TabletExternallyReparentedTimestamp: 10, + RealtimeStats: &querypb.RealtimeStats{SecondsBehindMaster: 1, CpuUsage: 0.2}, + } + want = &TabletHealth{ + Tablet: tablet, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_MASTER}, + Serving: true, + Stats: &querypb.RealtimeStats{SecondsBehindMaster: 1, CpuUsage: 0.2}, + MasterTermStartTime: 10, + } + input <- shr + result = <-resultChan + mustMatch(t, want, result, "Wrong TabletHealth data") + + // Stream error + fc.errCh <- fmt.Errorf("some stream error") + want = &TabletHealth{ + Tablet: tablet, + Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_MASTER}, + Serving: false, + Stats: &querypb.RealtimeStats{SecondsBehindMaster: 1, CpuUsage: 0.2}, + MasterTermStartTime: 10, + LastError: fmt.Errorf("some stream error"), + } + result = <-resultChan + //TODO: figure out how to compare objects that contain errors using utils.MustMatch + assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result) + // tablet should be removed from healthy list + a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_MASTER}) + assert.Empty(t, a, "wrong result, expected empty list") +} + func TestHealthCheckVerifiesTabletAlias(t *testing.T) { ts := memorytopo.NewServer("cell") hc := createTestHc(ts) diff --git a/go/vt/discovery/tablet_health_check.go b/go/vt/discovery/tablet_health_check.go index 2f36f78e765..c008d2f162f 100644 --- a/go/vt/discovery/tablet_health_check.go +++ b/go/vt/discovery/tablet_health_check.go @@ -186,10 +186,8 @@ func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.St currentTarget := thc.Target // check whether this is a trivial update so as to update healthy map - trivialNonMasterUpdate := thc.LastError == nil && thc.Serving && shr.RealtimeStats.HealthError == "" && shr.Serving && + trivialUpdate := thc.LastError == nil && thc.Serving && shr.RealtimeStats.HealthError == "" && shr.Serving && currentTarget.TabletType != topodata.TabletType_MASTER && currentTarget.TabletType == shr.Target.TabletType && thc.isTrivialReplagChange(shr.RealtimeStats) - isMasterUpdate := shr.Target.TabletType == topodata.TabletType_MASTER - isMasterChange := thc.Target.TabletType != topodata.TabletType_MASTER && shr.Target.TabletType == topodata.TabletType_MASTER thc.lastResponseTimestamp = time.Now() thc.Target = shr.Target thc.MasterTermStartTime = shr.TabletExternallyReparentedTimestamp @@ -202,7 +200,7 @@ func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.St thc.setServingState(serving, reason) // notify downstream for master change - hc.updateHealth(thc.SimpleCopy(), currentTarget, trivialNonMasterUpdate, isMasterUpdate, isMasterChange) + hc.updateHealth(thc.SimpleCopy(), currentTarget, trivialUpdate, true) return nil } @@ -295,9 +293,9 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { hc.deleteTablet(thc.Tablet) return } - // trivialNonMasterUpdate = false because this is an error - // It is safe to pass isMasterChange = false because in case of an error there is no information that can be used to update the master - hc.updateHealth(thc.SimpleCopy(), thc.Target, false, thc.Target.TabletType == topodata.TabletType_MASTER, false) + // trivialUpdate = false because this is an error + // isPrimaryUp = false because we did not get a healthy response + hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false) } // If there was a timeout send an error. We do this after stream has returned. // This will ensure that this update prevails over any previous message that @@ -306,9 +304,9 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { thc.LastError = fmt.Errorf("healthcheck timed out (latest %v)", thc.lastResponseTimestamp) thc.setServingState(false, thc.LastError.Error()) hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1) - // trivialNonMasterUpdate = false because this is an error - // It is safe to pass isMasterChange = false because in case of timeout there is no information that can be used to update the master - hc.updateHealth(thc.SimpleCopy(), thc.Target, false, thc.Target.TabletType == topodata.TabletType_MASTER, false) + // trivialUpdate = false because this is an error + // isPrimaryUp = false because we did not get a healthy response within the timeout + hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false) } // Streaming RPC failed e.g. because vttablet was restarted or took too long. From f3a432e857c910ff2b6c93d8805ba71223380b29 Mon Sep 17 00:00:00 2001 From: deepthi Date: Thu, 11 Mar 2021 17:35:43 -0800 Subject: [PATCH 3/3] healthcheck: reorganize code to be more readable Signed-off-by: deepthi --- go/vt/discovery/healthcheck.go | 58 ++++++++++++++------------ go/vt/discovery/healthcheck_test.go | 2 +- go/vt/discovery/tablet_health_check.go | 6 +-- 3 files changed, 35 insertions(+), 31 deletions(-) diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index f091a13a441..4af9dfd7202 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -404,20 +404,20 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) { } } -func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, currentTarget *query.Target, trivialUpdate bool, isPrimaryUp bool) { +func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Target, trivialUpdate bool, isPrimaryUp bool) { // hc.healthByAlias is authoritative, it should be updated hc.mu.Lock() defer hc.mu.Unlock() tabletAlias := tabletAliasString(topoproto.TabletAliasString(th.Tablet.Alias)) targetKey := hc.keyFromTarget(th.Target) - targetChanged := currentTarget.TabletType != th.Target.TabletType || currentTarget.Keyspace != th.Target.Keyspace || currentTarget.Shard != th.Target.Shard + targetChanged := prevTarget.TabletType != th.Target.TabletType || prevTarget.Keyspace != th.Target.Keyspace || prevTarget.Shard != th.Target.Shard if targetChanged { // Error counter has to be set here in case we get a new tablet type for the first time in a stream response hcErrorCounters.Add([]string{th.Target.Keyspace, th.Target.Shard, topoproto.TabletTypeLString(th.Target.TabletType)}, 0) // keyspace and shard are not expected to change, but just in case ... // move this tabletHealthCheck to the correct map - oldTargetKey := hc.keyFromTarget(currentTarget) + oldTargetKey := hc.keyFromTarget(prevTarget) delete(hc.healthData[oldTargetKey], tabletAlias) _, ok := hc.healthData[targetKey] if !ok { @@ -427,30 +427,31 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, currentTarget *query.T // add it to the map by target hc.healthData[targetKey][tabletAlias] = th - if th.Target.TabletType == topodata.TabletType_MASTER { - if isPrimaryUp { - if len(hc.healthy[targetKey]) == 0 { - hc.healthy[targetKey] = append(hc.healthy[targetKey], th) + isPrimary := th.Target.TabletType == topodata.TabletType_MASTER + switch { + case isPrimary && isPrimaryUp: + if len(hc.healthy[targetKey]) == 0 { + hc.healthy[targetKey] = append(hc.healthy[targetKey], th) + } else { + // We already have one up server, see if we + // need to replace it. + if th.MasterTermStartTime < hc.healthy[targetKey][0].MasterTermStartTime { + log.Warningf("not marking healthy master %s as Up for %s because its MasterTermStartTime is smaller than the highest known timestamp from previous MASTERs %s: %d < %d ", + topoproto.TabletAliasString(th.Tablet.Alias), + topoproto.KeyspaceShardString(th.Target.Keyspace, th.Target.Shard), + topoproto.TabletAliasString(hc.healthy[targetKey][0].Tablet.Alias), + th.MasterTermStartTime, + hc.healthy[targetKey][0].MasterTermStartTime) } else { - // We already have one up server, see if we - // need to replace it. - if th.MasterTermStartTime < hc.healthy[targetKey][0].MasterTermStartTime { - log.Warningf("not marking healthy master %s as Up for %s because its MasterTermStartTime is smaller than the highest known timestamp from previous MASTERs %s: %d < %d ", - topoproto.TabletAliasString(th.Tablet.Alias), - topoproto.KeyspaceShardString(th.Target.Keyspace, th.Target.Shard), - topoproto.TabletAliasString(hc.healthy[targetKey][0].Tablet.Alias), - th.MasterTermStartTime, - hc.healthy[targetKey][0].MasterTermStartTime) - } else { - // Just replace it. - hc.healthy[targetKey][0] = th - } + // Just replace it. + hc.healthy[targetKey][0] = th } - } else { - // No healthy master tablet - hc.healthy[targetKey] = []*TabletHealth{} } + case isPrimary && !isPrimaryUp: + // No healthy master tablet + hc.healthy[targetKey] = []*TabletHealth{} } + if !trivialUpdate { // We re-sort the healthy tablet list whenever we get a health update for tablets we can route to. // Tablets from other cells for non-master targets should not trigger a re-sort; @@ -458,15 +459,18 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, currentTarget *query.T if th.Target.TabletType != topodata.TabletType_MASTER && hc.isIncluded(th.Target.TabletType, th.Tablet.Alias) { hc.recomputeHealthy(targetKey) } - if targetChanged && currentTarget.TabletType != topodata.TabletType_MASTER && hc.isIncluded(th.Target.TabletType, th.Tablet.Alias) { // also recompute old target's healthy list - oldTargetKey := hc.keyFromTarget(currentTarget) + if targetChanged && prevTarget.TabletType != topodata.TabletType_MASTER && hc.isIncluded(th.Target.TabletType, th.Tablet.Alias) { // also recompute old target's healthy list + oldTargetKey := hc.keyFromTarget(prevTarget) hc.recomputeHealthy(oldTargetKey) } } - if currentTarget.TabletType != topodata.TabletType_MASTER && th.Target.TabletType == topodata.TabletType_MASTER { - log.Errorf("Adding 1 to MasterPromoted counter for target: %v, tablet: %v, tabletType: %v", currentTarget, topoproto.TabletAliasString(th.Tablet.Alias), th.Target.TabletType) + + isNewPrimary := isPrimary && prevTarget.TabletType != topodata.TabletType_MASTER + if isNewPrimary { + log.Errorf("Adding 1 to MasterPromoted counter for target: %v, tablet: %v, tabletType: %v", prevTarget, topoproto.TabletAliasString(th.Tablet.Alias), th.Target.TabletType) hcMasterPromotedCounters.Add([]string{th.Target.Keyspace, th.Target.Shard}, 1) } + // broadcast to subscribers hc.broadcast(th) } diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go index 2d554ab2cf2..73ba30a665f 100644 --- a/go/vt/discovery/healthcheck_test.go +++ b/go/vt/discovery/healthcheck_test.go @@ -869,7 +869,7 @@ func TestMasterInOtherCell(t *testing.T) { // check that MASTER tablet from other cell IS in healthy tablet list a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_MASTER}) - assert.Len(t, a, 1, "") + require.Len(t, a, 1, "") mustMatch(t, want, a[0], "Expecting healthy master") } diff --git a/go/vt/discovery/tablet_health_check.go b/go/vt/discovery/tablet_health_check.go index c008d2f162f..664e1924870 100644 --- a/go/vt/discovery/tablet_health_check.go +++ b/go/vt/discovery/tablet_health_check.go @@ -184,10 +184,10 @@ func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.St return vterrors.New(vtrpc.Code_FAILED_PRECONDITION, fmt.Sprintf("health stats mismatch, tablet %+v alias does not match response alias %v", thc.Tablet, shr.TabletAlias)) } - currentTarget := thc.Target + prevTarget := thc.Target // check whether this is a trivial update so as to update healthy map trivialUpdate := thc.LastError == nil && thc.Serving && shr.RealtimeStats.HealthError == "" && shr.Serving && - currentTarget.TabletType != topodata.TabletType_MASTER && currentTarget.TabletType == shr.Target.TabletType && thc.isTrivialReplagChange(shr.RealtimeStats) + prevTarget.TabletType != topodata.TabletType_MASTER && prevTarget.TabletType == shr.Target.TabletType && thc.isTrivialReplagChange(shr.RealtimeStats) thc.lastResponseTimestamp = time.Now() thc.Target = shr.Target thc.MasterTermStartTime = shr.TabletExternallyReparentedTimestamp @@ -200,7 +200,7 @@ func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.St thc.setServingState(serving, reason) // notify downstream for master change - hc.updateHealth(thc.SimpleCopy(), currentTarget, trivialUpdate, true) + hc.updateHealth(thc.SimpleCopy(), prevTarget, trivialUpdate, true) return nil }