vitessio · rafael · Oct 24, 2019 · Oct 22, 2019
diff --git a/go/vt/discovery/replicationlag.go b/go/vt/discovery/replicationlag.go
@@ -25,9 +25,10 @@ import (
 
 var (
 	// lowReplicationLag defines the duration that replication lag is low enough that the VTTablet is considered healthy.
-	lowReplicationLag            = flag.Duration("discovery_low_replication_lag", 30*time.Second, "the replication lag that is considered low enough to be healthy")
-	highReplicationLagMinServing = flag.Duration("discovery_high_replication_lag_minimum_serving", 2*time.Hour, "the replication lag that is considered too high when selecting the minimum num vttablets for serving")
-	minNumTablets                = flag.Int("min_number_serving_vttablets", 2, "the minimum number of vttablets that will be continue to be used even with low replication lag")
+	lowReplicationLag             = flag.Duration("discovery_low_replication_lag", 30*time.Second, "the replication lag that is considered low enough to be healthy")
+	highReplicationLagMinServing  = flag.Duration("discovery_high_replication_lag_minimum_serving", 2*time.Hour, "the replication lag that is considered too high when selecting the minimum num vttablets for serving")
+	minNumTablets                 = flag.Int("min_number_serving_vttablets", 2, "the minimum number of vttablets that will be continue to be used even with low replication lag")
+	legacyReplicationLagAlgorithm = flag.Bool("legacy_replication_lag_algorithm", true, "use the legacy algorithm when selecting the vttablets for serving")
 )
 
 // IsReplicationLagHigh verifies that the given TabletStats refers to a tablet with high
@@ -43,7 +44,17 @@ func IsReplicationLagVeryHigh(tabletStats *TabletStats) bool {
 }
 
 // FilterByReplicationLag filters the list of TabletStats by TabletStats.Stats.SecondsBehindMaster.
-// The algorithm (TabletStats that is non-serving or has error is ignored):
+// Note that TabletStats that is non-serving or has error is ignored.
+//
+// The simplified logic:
+// - Return tablets that have lag <= lowReplicationLag.
+// - Make sure we return at least minNumTablets tablets, if there are enough one with lag <= highReplicationLagMinServing.
+// For example, with the default of 30s / 2h / 2, this means:
+// - lags of (5s, 10s, 15s, 120s) return the first three
+// - lags of (30m, 35m, 40m, 45m) return the first two
+// - lags of (2h, 3h, 4h, 5h) return the first one
+//
+// The legacy algorithm (default for now):
 // - Return the list if there is 0 or 1 tablet.
 // - Return the list if all tablets have <=30s lag.
 // - Filter by replication lag: for each tablet, if the mean value without it is more than 0.7 of the mean value across all tablets, it is valid.
@@ -58,16 +69,46 @@ func IsReplicationLagVeryHigh(tabletStats *TabletStats) bool {
 // * degraded_threshold: this is only used by vttablet for display. It should match
 //   discovery_low_replication_lag here, so the vttablet status display matches what vtgate will do of it.
 func FilterByReplicationLag(tabletStatsList []*TabletStats) []*TabletStats {
-	res := filterByLag(tabletStatsList)
+	if !*legacyReplicationLagAlgorithm {
+		return filterByLag(tabletStatsList)
+	}
+
+	res := filterByLagWithLegacyAlgorithm(tabletStatsList)
 	// run the filter again if exactly one tablet is removed,
 	// and we have spare tablets.
 	if len(res) > *minNumTablets && len(res) == len(tabletStatsList)-1 {
-		res = filterByLag(res)
+		res = filterByLagWithLegacyAlgorithm(res)
 	}
 	return res
 }
 
 func filterByLag(tabletStatsList []*TabletStats) []*TabletStats {
+	list := make([]tabletLagSnapshot, 0, len(tabletStatsList))
+	// filter non-serving tablets and those with very high replication lag
+	for _, ts := range tabletStatsList {
+		if !ts.Serving || ts.LastError != nil || ts.Stats == nil || IsReplicationLagVeryHigh(ts) {
+			continue
+		}
+		// Pull the current replication lag for a stable sort later.
+		list = append(list, tabletLagSnapshot{
+			ts:     ts,
+			replag: ts.Stats.SecondsBehindMaster})
+	}
+
+	// Sort by replication lag.
+	sort.Sort(byReplag(list))
+
+	// Pick those with low replication lag, but at least minNumTablets tablets regardless.
+	res := make([]*TabletStats, 0, len(list))
+	for i := 0; i < len(list); i++ {
+		if !IsReplicationLagHigh(list[i].ts) || i < *minNumTablets {
+			res = append(res, list[i].ts)
+		}
+	}
+	return res
+}
+
+func filterByLagWithLegacyAlgorithm(tabletStatsList []*TabletStats) []*TabletStats {
 	list := make([]*TabletStats, 0, len(tabletStatsList))
 	// filter non-serving tablets
 	for _, ts := range tabletStatsList {

diff --git a/go/vt/discovery/replicationlag_test.go b/go/vt/discovery/replicationlag_test.go
@@ -29,6 +29,11 @@ func testSetMinNumTablets(newMin int) {
 	*minNumTablets = newMin
 }
 
+// testSetLegacyReplicationLagAlgorithm is a test helper function, if this is used by a production code path, something is wrong.
+func testSetLegacyReplicationLagAlgorithm(newLegacy bool) {
+	*legacyReplicationLagAlgorithm = newLegacy
+}
+
 func TestFilterByReplicationLagUnhealthy(t *testing.T) {
 	// 1 healthy serving tablet, 1 not healhty
 	ts1 := &TabletStats{
@@ -51,6 +56,84 @@ func TestFilterByReplicationLagUnhealthy(t *testing.T) {
 }
 
 func TestFilterByReplicationLag(t *testing.T) {
+	// Use simplified logic
+	testSetLegacyReplicationLagAlgorithm(false)
+
+	cases := []struct {
+		description string
+		input       []uint32
+		output      []uint32
+	}{
+		{
+			"0 tablet",
+			[]uint32{},
+			[]uint32{},
+		},
+		{
+			"lags of (1s) - return all items with low lag.",
+			[]uint32{1},
+			[]uint32{1},
+		},
+		{
+			"lags of (1s, 1s, 1s, 30s) - return all items with low lag.",
+			[]uint32{1, 1, 1, 30},
+			[]uint32{1, 1, 1, 30},
+		},
+		{
+			"lags of (1s, 1s, 1s, 40m, 40m, 40m) - return all items with low lag.",
+			[]uint32{1, 1, 1, 40 * 60, 40 * 60, 40 * 60},
+			[]uint32{1, 1, 1},
+		},
+		{
+			"lags of (1s, 40m, 40m, 40m) - return at least 2 items if they don't have very high lag.",
+			[]uint32{1, 40 * 60, 40 * 60, 40 * 60},
+			[]uint32{1, 40 * 60},
+		},
+		{
+			"lags of (30m, 35m, 40m, 45m) - return at least 2 items if they don't have very high lag.",
+			[]uint32{30 * 60, 35 * 60, 40 * 60, 45 * 60},
+			[]uint32{30 * 60, 35 * 60},
+		},
+		{
+			"lags of (2h, 3h, 4h, 5h) - return <2 items if the others have very high lag.",
+			[]uint32{2 * 60 * 60, 3 * 60 * 60, 4 * 60 * 60, 5 * 60 * 60},
+			[]uint32{2 * 60 * 60},
+		},
+		{
+			"lags of (3h, 30h) - return nothing if all have very high lag.",
+			[]uint32{3 * 60 * 60, 30 * 60 * 60},
+			[]uint32{},
+		},
+	}
+
+	for _, tc := range cases {
+		lts := make([]*TabletStats, len(tc.input))
+		for i, lag := range tc.input {
+			lts[i] = &TabletStats{
+				Tablet:  topo.NewTablet(uint32(i+1), "cell", fmt.Sprintf("host-%vs-behind", lag)),
+				Serving: true,
+				Stats:   &querypb.RealtimeStats{SecondsBehindMaster: lag},
+			}
+		}
+		got := FilterByReplicationLag(lts)
+		if len(got) != len(tc.output) {
+			t.Errorf("FilterByReplicationLag(%v) failed: got output:\n%v\nExpected: %v", tc.description, got, tc.output)
+			continue
+		}
+		for i, elag := range tc.output {
+			if got[i].Stats.SecondsBehindMaster != elag {
+				t.Errorf("FilterByReplicationLag(%v) failed: got output:\n%v\nExpected value index %v to be %v", tc.description, got, i, elag)
+			}
+		}
+	}
+
+	// Reset to the default
+	testSetLegacyReplicationLagAlgorithm(true)
+}
+
+func TestFilterByReplicationLagWithLegacyAlgorithm(t *testing.T) {
+	// Use legacy algorithm by default for now
+
 	cases := []struct {
 		description string
 		input       []uint32
@@ -106,6 +189,23 @@ func TestFilterByReplicationLag(t *testing.T) {
 			[]uint32{3 * 60 * 60, 4 * 60 * 60},
 			[]uint32{3 * 60 * 60, 4 * 60 * 60},
 		},
+		{
+			"lags of (3h, 3h, 4h) - return 3 as they're all delayed too much, but still in a good group.",
+			[]uint32{3 * 60 * 60, 3 * 60 * 60, 4 * 60 * 60},
+			[]uint32{3 * 60 * 60, 3 * 60 * 60, 4 * 60 * 60},
+		},
+		{
+			"lags of (3h, 15h, 18h) - return 3 as they're all delayed too much, but still in a good group." +
+				"(different test case than above to show how absurb the good group logic is)",
+			[]uint32{3 * 60 * 60, 15 * 60 * 60, 18 * 60 * 60},
+			[]uint32{3 * 60 * 60, 15 * 60 * 60, 18 * 60 * 60},
+		},
+		{
+			"lags of (3h, 12h, 18h) - return 2 as they're all delayed too much, but 18h is now considered an outlier." +
+				"(different test case than above to show how absurb the good group logic is)",
+			[]uint32{3 * 60 * 60, 12 * 60 * 60, 18 * 60 * 60},
+			[]uint32{3 * 60 * 60, 12 * 60 * 60},
+		},
 		{
 			"lags of (3h, 30h) - return 2 as they're all delayed too much." +
 				"(different test case that before, as both tablet stats are" +