From eeb51fb46f64d77c0bf190652e2942b871048e83 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Mon, 15 Nov 2021 17:58:16 +0530 Subject: [PATCH 01/16] moved all topo imports to utils package Signed-off-by: Manan Gupta --- go/test/endtoend/vtorc/general/vtorc_test.go | 5 ----- go/test/endtoend/vtorc/primaryFailure/main_test.go | 5 ----- go/test/endtoend/vtorc/utils/utils.go | 5 +++++ 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index e9f71f88f3c..cedbc70eb80 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -23,11 +23,6 @@ import ( "vitess.io/vitess/go/test/endtoend/vtorc/utils" - _ "vitess.io/vitess/go/vt/topo/consultopo" - _ "vitess.io/vitess/go/vt/topo/etcd2topo" - _ "vitess.io/vitess/go/vt/topo/k8stopo" - _ "vitess.io/vitess/go/vt/topo/zk2topo" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" diff --git a/go/test/endtoend/vtorc/primaryFailure/main_test.go b/go/test/endtoend/vtorc/primaryFailure/main_test.go index e2b5c4caa5a..1439ba68085 100644 --- a/go/test/endtoend/vtorc/primaryFailure/main_test.go +++ b/go/test/endtoend/vtorc/primaryFailure/main_test.go @@ -23,11 +23,6 @@ import ( "vitess.io/vitess/go/test/endtoend/vtorc/utils" - _ "vitess.io/vitess/go/vt/topo/consultopo" - _ "vitess.io/vitess/go/vt/topo/etcd2topo" - _ "vitess.io/vitess/go/vt/topo/k8stopo" - _ "vitess.io/vitess/go/vt/topo/zk2topo" - "vitess.io/vitess/go/test/endtoend/cluster" ) diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 71aa0075dee..30953c98fa3 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -29,6 +29,11 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + _ "vitess.io/vitess/go/vt/topo/consultopo" + _ "vitess.io/vitess/go/vt/topo/etcd2topo" + _ "vitess.io/vitess/go/vt/topo/k8stopo" + _ "vitess.io/vitess/go/vt/topo/zk2topo" + "vitess.io/vitess/go/json2" "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/sqltypes" From 824ee4aa482dd0106f413da84bc22ee00dcf2174 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Mon, 15 Nov 2021 19:04:56 +0530 Subject: [PATCH 02/16] test: ported over graceful-master-takeover from vtorc Signed-off-by: Manan Gupta --- .../graceful_takeover_test.go | 64 +++++++++++++++++ .../vtorc/gracefulTakeover/main_test.go | 71 +++++++++++++++++++ go/test/endtoend/vtorc/utils/utils.go | 1 + 3 files changed, 136 insertions(+) create mode 100644 go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go create mode 100644 go/test/endtoend/vtorc/gracefulTakeover/main_test.go diff --git a/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go b/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go new file mode 100644 index 00000000000..9c81b0e86e9 --- /dev/null +++ b/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go @@ -0,0 +1,64 @@ +/* +Copyright 2021 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gracefulTakeover + +import ( + "fmt" + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/test/endtoend/vtorc/utils" +) + +// make an api call to graceful primary takeover and let vtorc fix it +// covers the test case graceful-master-takeover from orchestrator +func TestGracefulTakeover(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, "test_config.json") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + + // find the replica tablet + var replica *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two tablets, so the one not the primary must be the replica + if tablet.Alias != curPrimary.Alias { + replica = tablet + } + } + assert.NotNil(t, replica, "could not find replica tablet") + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 10*time.Second) + + response, err := http.Get(fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/localhost/%d", curPrimary.MySQLPort, replica.MySQLPort)) + require.NoError(t, err) + assert.Equal(t, 200, response.StatusCode) + + // check that the replica gets promoted + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{curPrimary}, 10*time.Second) +} diff --git a/go/test/endtoend/vtorc/gracefulTakeover/main_test.go b/go/test/endtoend/vtorc/gracefulTakeover/main_test.go new file mode 100644 index 00000000000..0c81b0e24cd --- /dev/null +++ b/go/test/endtoend/vtorc/gracefulTakeover/main_test.go @@ -0,0 +1,71 @@ +/* +Copyright 2021 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gracefulTakeover + +import ( + "fmt" + "os" + "testing" + + "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/test/endtoend/vtorc/utils" +) + +var clusterInfo *utils.VtOrcClusterInfo + +func TestMain(m *testing.M) { + // setup cellInfos before creating the cluster + var cellInfos []*utils.CellInfo + cellInfos = append(cellInfos, &utils.CellInfo{ + CellName: utils.Cell1, + NumReplicas: 6, + NumRdonly: 2, + UIDBase: 100, + }) + + exitcode, err := func() (int, error) { + var err error + clusterInfo, err = utils.CreateClusterAndStartTopo(cellInfos) + if err != nil { + return 1, err + } + + return m.Run(), nil + }() + + cluster.PanicHandler(nil) + + // stop vtorc first otherwise its logs get polluted + // with instances being unreachable triggering unnecessary operations + if clusterInfo.ClusterInstance.VtorcProcess != nil { + _ = clusterInfo.ClusterInstance.VtorcProcess.TearDown() + } + + for _, cellInfo := range clusterInfo.CellInfos { + utils.KillTablets(cellInfo.ReplicaTablets) + utils.KillTablets(cellInfo.RdonlyTablets) + } + clusterInfo.ClusterInstance.Keyspaces[0].Shards[0].Vttablets = nil + clusterInfo.ClusterInstance.Teardown() + + if err != nil { + fmt.Printf("%v\n", err) + os.Exit(1) + } else { + os.Exit(exitcode) + } +} diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 30953c98fa3..c31acf156d8 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -29,6 +29,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + // This imports toposervers to register their implementations of TopoServer. _ "vitess.io/vitess/go/vt/topo/consultopo" _ "vitess.io/vitess/go/vt/topo/etcd2topo" _ "vitess.io/vitess/go/vt/topo/k8stopo" From f40b2636ec0ce258d537750771cd4727b0816813 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Mon, 15 Nov 2021 21:50:36 +0530 Subject: [PATCH 03/16] test: ported over graceful-master-takeover-fail-no-target from orchestrator Signed-off-by: Manan Gupta --- .../graceful_takeover_test.go | 43 ++++++++++++++++--- go/test/endtoend/vtorc/utils/utils.go | 13 ++++++ 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go b/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go index 9c81b0e86e9..fb9bfafb826 100644 --- a/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go +++ b/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go @@ -18,12 +18,10 @@ package gracefulTakeover import ( "fmt" - "net/http" "testing" "time" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/vtorc/utils" @@ -31,7 +29,7 @@ import ( // make an api call to graceful primary takeover and let vtorc fix it // covers the test case graceful-master-takeover from orchestrator -func TestGracefulTakeover(t *testing.T) { +func TestGracefulPrimaryTakeover(t *testing.T) { defer cluster.PanicHandler(t) utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, "test_config.json") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] @@ -54,11 +52,44 @@ func TestGracefulTakeover(t *testing.T) { // check that the replication is setup correctly before we failover utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 10*time.Second) - response, err := http.Get(fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/localhost/%d", curPrimary.MySQLPort, replica.MySQLPort)) - require.NoError(t, err) - assert.Equal(t, 200, response.StatusCode) + status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/localhost/%d", curPrimary.MySQLPort, replica.MySQLPort)) + assert.Equal(t, 200, status) // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{curPrimary}, 10*time.Second) } + +// make an api call to graceful primary takeover without specifying the primary tablet to promote +// covers the test case graceful-master-takeover-fail-no-target from orchestrator +func TestGracefulPrimaryTakeoverFailNoTarget(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 3, 0, nil, "test_config.json") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + + // find the replica tablet + var replicas []*cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two tablets, so the one not the primary must be the replica + if tablet.Alias != curPrimary.Alias { + replicas = append(replicas, tablet) + } + } + assert.Equal(t, 2, len(replicas), "could not find replica tablets") + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, curPrimary, replicas, 10*time.Second) + + status, response := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/", curPrimary.MySQLPort)) + assert.Equal(t, 500, status) + assert.Contains(t, response, "GracefulPrimaryTakeover: target instance not indicated") + + // check that the replica doesn't get promoted and the previous primary is still the primary + utils.CheckPrimaryTablet(t, clusterInfo, curPrimary, true) + utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, replicas, 10*time.Second) +} diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index c31acf156d8..cf123d2dc1b 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -19,6 +19,8 @@ package utils import ( "context" "fmt" + "io/ioutil" + "net/http" "os" "os/exec" "path" @@ -710,3 +712,14 @@ func CheckSourcePort(t *testing.T, replica *cluster.Vttablet, source *cluster.Vt time.Sleep(300 * time.Millisecond) } } + +// MakeAPICall is used make an API call given the url. It returns the status and the body of the response received +func MakeAPICall(t *testing.T, url string) (status int, response string) { + t.Helper() + res, err := http.Get(url) + require.NoError(t, err) + bodyBytes, err := ioutil.ReadAll(res.Body) + require.NoError(t, err) + body := string(bodyBytes) + return res.StatusCode, body +} From 7ed19b3a62b4ad3e9847aebbde564d9e9feb2a95 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 16 Nov 2021 11:27:09 +0530 Subject: [PATCH 04/16] test: ported over graceful-master-takeover-auto from orchestrator Signed-off-by: Manan Gupta --- .../graceful_takeover_test.go | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go b/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go index fb9bfafb826..c57504e7046 100644 --- a/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go +++ b/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go @@ -93,3 +93,47 @@ func TestGracefulPrimaryTakeoverFailNoTarget(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, curPrimary, true) utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, replicas, 10*time.Second) } + +// make an api call to graceful primary takeover auto and let vtorc fix it +// covers the test case graceful-master-takeover-auto from orchestrator +func TestGracefulPrimaryTakeoverAuto(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 1, nil, "test_config.json") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, primary, "should have elected a primary") + + // find the replica tablet and the rdonly tablet + var replica, rdonly *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two replcia tablets, so the one not the primary must be the other replica + if tablet.Alias != primary.Alias && tablet.Type == "replica" { + replica = tablet + } + if tablet.Type == "rdonly" { + rdonly = tablet + } + } + assert.NotNil(t, replica, "could not find replica tablet") + assert.NotNil(t, rdonly, "could not find rdonly tablet") + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second) + + status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover-auto/localhost/%d/localhost/%d", primary.MySQLPort, replica.MySQLPort)) + assert.Equal(t, 200, status) + + // check that the replica gets promoted + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{primary, rdonly}, 10*time.Second) + + status, _ = utils.MakeAPICall(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover-auto/localhost/%d/", replica.MySQLPort)) + assert.Equal(t, 200, status) + + // check that the primary gets promoted back + utils.CheckPrimaryTablet(t, clusterInfo, primary, true) + utils.VerifyWritesSucceed(t, clusterInfo, primary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second) +} From 6267f859294c5b2105c07672908dabb9f97663d0 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 16 Nov 2021 12:59:11 +0530 Subject: [PATCH 05/16] refactor: renamed package names to not include camel casing Signed-off-by: Manan Gupta --- .../graceful_takeover_test.go | 2 +- .../vtorc/{gracefulTakeover => gracefultakeover}/main_test.go | 2 +- .../vtorc/{primaryFailure => primaryfailure}/main_test.go | 2 +- .../{primaryFailure => primaryfailure}/primary_failure_test.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename go/test/endtoend/vtorc/{gracefulTakeover => gracefultakeover}/graceful_takeover_test.go (99%) rename go/test/endtoend/vtorc/{gracefulTakeover => gracefultakeover}/main_test.go (98%) rename go/test/endtoend/vtorc/{primaryFailure => primaryfailure}/main_test.go (98%) rename go/test/endtoend/vtorc/{primaryFailure => primaryfailure}/primary_failure_test.go (99%) diff --git a/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go similarity index 99% rename from go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go rename to go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go index c57504e7046..2ab4508812d 100644 --- a/go/test/endtoend/vtorc/gracefulTakeover/graceful_takeover_test.go +++ b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package gracefulTakeover +package gracefultakeover import ( "fmt" diff --git a/go/test/endtoend/vtorc/gracefulTakeover/main_test.go b/go/test/endtoend/vtorc/gracefultakeover/main_test.go similarity index 98% rename from go/test/endtoend/vtorc/gracefulTakeover/main_test.go rename to go/test/endtoend/vtorc/gracefultakeover/main_test.go index 0c81b0e24cd..d7c11bdae65 100644 --- a/go/test/endtoend/vtorc/gracefulTakeover/main_test.go +++ b/go/test/endtoend/vtorc/gracefultakeover/main_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package gracefulTakeover +package gracefultakeover import ( "fmt" diff --git a/go/test/endtoend/vtorc/primaryFailure/main_test.go b/go/test/endtoend/vtorc/primaryfailure/main_test.go similarity index 98% rename from go/test/endtoend/vtorc/primaryFailure/main_test.go rename to go/test/endtoend/vtorc/primaryfailure/main_test.go index 1439ba68085..ae8be76b392 100644 --- a/go/test/endtoend/vtorc/primaryFailure/main_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/main_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package primaryFailure +package primaryfailure import ( "fmt" diff --git a/go/test/endtoend/vtorc/primaryFailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go similarity index 99% rename from go/test/endtoend/vtorc/primaryFailure/primary_failure_test.go rename to go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index d5f875c1298..85ea91eed24 100644 --- a/go/test/endtoend/vtorc/primaryFailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package primaryFailure +package primaryfailure import ( "testing" From 74fd384969d7b0162a5975b8878f82770bd7003a Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 16 Nov 2021 13:21:25 +0530 Subject: [PATCH 06/16] test: ported over graceful-master-takeover-fail-cross-region from orchestrator Signed-off-by: Manan Gupta --- .../graceful_takeover_test.go | 34 +++++++++++++++++++ .../vtorc/gracefultakeover/main_test.go | 6 ++++ 2 files changed, 40 insertions(+) diff --git a/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go index 2ab4508812d..8cbab872629 100644 --- a/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go +++ b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go @@ -137,3 +137,37 @@ func TestGracefulPrimaryTakeoverAuto(t *testing.T) { utils.CheckPrimaryTablet(t, clusterInfo, primary, true) utils.VerifyWritesSucceed(t, clusterInfo, primary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second) } + +// make an api call to graceful primary takeover with a cross-cell replica and check that it errors out +// covers the test case graceful-master-takeover-fail-cross-region from orchestrator +func TestGracefulPrimaryTakeoverFailCrossCell(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVtorc(t, clusterInfo, 1, 1, nil, "test_config.json") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find primary from topo + primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, primary, "should have elected a primary") + + // find the rdonly tablet + var rdonly *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + if tablet.Type == "rdonly" { + rdonly = tablet + } + } + assert.NotNil(t, rdonly, "could not find rdonly tablet") + + crossCellReplica1 := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) + // newly started tablet does not replicate from anyone yet, we will allow orchestrator to fix this too + utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{crossCellReplica1, rdonly}, 25*time.Second) + + status, response := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/localhost/%d", primary.MySQLPort, crossCellReplica1.MySQLPort)) + assert.Equal(t, 500, status) + assert.Contains(t, response, "GracefulPrimaryTakeover: constraint failure") + + // check that the cross-cell replica doesn't get promoted and the previous primary is still the primary + utils.CheckPrimaryTablet(t, clusterInfo, primary, true) + utils.VerifyWritesSucceed(t, clusterInfo, primary, []*cluster.Vttablet{crossCellReplica1, rdonly}, 10*time.Second) +} diff --git a/go/test/endtoend/vtorc/gracefultakeover/main_test.go b/go/test/endtoend/vtorc/gracefultakeover/main_test.go index d7c11bdae65..1132489ab8f 100644 --- a/go/test/endtoend/vtorc/gracefultakeover/main_test.go +++ b/go/test/endtoend/vtorc/gracefultakeover/main_test.go @@ -36,6 +36,12 @@ func TestMain(m *testing.M) { NumRdonly: 2, UIDBase: 100, }) + cellInfos = append(cellInfos, &utils.CellInfo{ + CellName: utils.Cell2, + NumReplicas: 2, + NumRdonly: 0, + UIDBase: 200, + }) exitcode, err := func() (int, error) { var err error From bba92f17a5cd517de4b5f09682270ca3b8b111b8 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 16 Nov 2021 23:05:20 +0530 Subject: [PATCH 07/16] feat: port vtorc to use PRS for graceful primary takeover Signed-off-by: Manan Gupta --- go/vt/orchestrator/inst/analysis.go | 1 + go/vt/orchestrator/logic/topology_recovery.go | 183 +++++++++--------- 2 files changed, 90 insertions(+), 94 deletions(-) diff --git a/go/vt/orchestrator/inst/analysis.go b/go/vt/orchestrator/inst/analysis.go index b6c27b96d58..c952e7cee94 100644 --- a/go/vt/orchestrator/inst/analysis.go +++ b/go/vt/orchestrator/inst/analysis.go @@ -70,6 +70,7 @@ const ( AllIntermediatePrimaryReplicasNotReplicating AnalysisCode = "AllIntermediatePrimaryReplicasNotReplicating" FirstTierReplicaFailingToConnectToPrimary AnalysisCode = "FirstTierReplicaFailingToConnectToPrimary" BinlogServerFailingToConnectToPrimary AnalysisCode = "BinlogServerFailingToConnectToPrimary" + PlannedReparentShard AnalysisCode = "PlannedReparentShard" ) const ( diff --git a/go/vt/orchestrator/logic/topology_recovery.go b/go/vt/orchestrator/logic/topology_recovery.go index 765695bd74d..ee5c5a43b29 100644 --- a/go/vt/orchestrator/logic/topology_recovery.go +++ b/go/vt/orchestrator/logic/topology_recovery.go @@ -1678,6 +1678,7 @@ func getGracefulPrimaryTakeoverDesignatedInstance(clusterPrimaryKey *inst.Instan // This function is graceful in that it will first lock down the primary, then wait // for the designated replica to catch up with last position. // It will point old primary at the newly promoted primary at the correct coordinates. +// All of this is accomplished via PlannedReparentShard operation. It is an idempotent operation, look at its documentation for more detail func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey, auto bool) (topologyRecovery *TopologyRecovery, promotedPrimaryCoordinates *inst.BinlogCoordinates, err error) { clusterPrimaries, err := inst.ReadClusterPrimary(clusterName) if err != nil { @@ -1688,118 +1689,112 @@ func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey } clusterPrimary := clusterPrimaries[0] - clusterPrimaryDirectReplicas, err := inst.ReadReplicaInstances(&clusterPrimary.Key) + analysisEntry, err := forceAnalysisEntry(clusterName, inst.PlannedReparentShard, inst.GracefulPrimaryTakeoverCommandHint, &clusterPrimary.Key) if err != nil { - return nil, nil, log.Errore(err) + return nil, nil, err } - - if len(clusterPrimaryDirectReplicas) == 0 { - return nil, nil, fmt.Errorf("Primary %+v doesn't seem to have replicas", clusterPrimary.Key) + topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false /*failIfFailedInstanceInActiveRecovery*/, false /*failIfClusterInActiveRecovery*/) + if topologyRecovery == nil || err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not register recovery on %+v. Unable to issue PlannedReparentShard.", analysisEntry.AnalyzedInstanceKey)) + return topologyRecovery, nil, err + } + if err := executeProcesses(config.Config.PreGracefulTakeoverProcesses, "PreGracefulTakeoverProcesses", topologyRecovery, true); err != nil { + return topologyRecovery, nil, fmt.Errorf("Failed running PreGracefulTakeoverProcesses: %+v", err) } + primaryTablet, err := inst.ReadTablet(clusterPrimary.Key) + if err != nil { + return topologyRecovery, nil, err + } if designatedKey != nil && !designatedKey.IsValid() { // An empty or invalid key is as good as no key designatedKey = nil } - designatedInstance, err := getGracefulPrimaryTakeoverDesignatedInstance(&clusterPrimary.Key, designatedKey, clusterPrimaryDirectReplicas, auto) - if err != nil { - return nil, nil, log.Errore(err) - } - - if inst.IsBannedFromBeingCandidateReplica(designatedInstance) { - return nil, nil, fmt.Errorf("GracefulPrimaryTakeover: designated instance %+v cannot be promoted due to promotion rule or it is explicitly ignored in PromotionIgnoreHostnameFilters configuration", designatedInstance.Key) + var designatedTabletAlias *topodatapb.TabletAlias + if designatedKey != nil { + designatedTablet, err := inst.ReadTablet(*designatedKey) + if err != nil { + return topologyRecovery, nil, err + } + designatedTabletAlias = designatedTablet.Alias } - primaryOfDesignatedInstance, err := inst.GetInstancePrimary(designatedInstance) - if err != nil { - return nil, nil, err - } - if !primaryOfDesignatedInstance.Key.Equals(&clusterPrimary.Key) { - return nil, nil, fmt.Errorf("Sanity check failure. It seems like the designated instance %+v does not replicate from the primary %+v (designated instance's primary key is %+v). This error is strange. Panicking", designatedInstance.Key, clusterPrimary.Key, designatedInstance.SourceKey) - } - if !designatedInstance.HasReasonableMaintenanceReplicationLag() { - return nil, nil, fmt.Errorf("Desginated instance %+v seems to be lagging to much for thie operation. Aborting.", designatedInstance.Key) - } - - if len(clusterPrimaryDirectReplicas) > 1 { - log.Infof("GracefulPrimaryTakeover: Will let %+v take over its siblings", designatedInstance.Key) - relocatedReplicas, _, err, _ := inst.RelocateReplicas(&clusterPrimary.Key, &designatedInstance.Key, "") - if len(relocatedReplicas) != len(clusterPrimaryDirectReplicas)-1 { - // We are unable to make designated instance primary of all its siblings - relocatedReplicasKeyMap := inst.NewInstanceKeyMap() - relocatedReplicasKeyMap.AddInstances(relocatedReplicas) - // Let's see which replicas have not been relocated - for _, directReplica := range clusterPrimaryDirectReplicas { - if relocatedReplicasKeyMap.HasKey(directReplica.Key) { - // relocated, good - continue - } - if directReplica.Key.Equals(&designatedInstance.Key) { - // obviously we skip this one - continue - } - if directReplica.IsDowntimed { - // obviously we skip this one - log.Warningf("GracefulPrimaryTakeover: unable to relocate %+v below designated %+v, but since it is downtimed (downtime reason: %s) I will proceed", directReplica.Key, designatedInstance.Key, directReplica.DowntimeReason) - continue - } - return nil, nil, fmt.Errorf("Desginated instance %+v cannot take over all of its siblings. Error: %+v", designatedInstance.Key, err) - } + ev, err := reparentutil.NewPlannedReparenter(ts, tmclient.NewTabletManagerClient(), logutil.NewCallbackLogger(func(event *logutilpb.Event) { + level := event.GetLevel() + value := event.GetValue() + // we only log the warnings and errors explicitly, everything gets logged as an information message anyways in auditing topology recovery + switch level { + case logutilpb.Level_WARNING: + log.Warningf("ERS - %s", value) + case logutilpb.Level_ERROR: + log.Errorf("ERS - %s", value) } - } - log.Infof("GracefulPrimaryTakeover: Will demote %+v and promote %+v instead", clusterPrimary.Key, designatedInstance.Key) + AuditTopologyRecovery(topologyRecovery, value) + })).ReparentShard(context.Background(), + primaryTablet.Keyspace, + primaryTablet.Shard, + reparentutil.PlannedReparentOptions{ + NewPrimaryAlias: designatedTabletAlias, + WaitReplicasTimeout: time.Duration(config.Config.WaitReplicasTimeoutSeconds) * time.Second, + }, + ) - analysisEntry, err := forceAnalysisEntry(clusterName, inst.DeadPrimary, inst.GracefulPrimaryTakeoverCommandHint, &clusterPrimary.Key) - if err != nil { - return nil, nil, err - } - preGracefulTakeoverTopologyRecovery := &TopologyRecovery{ - SuccessorKey: &designatedInstance.Key, - AnalysisEntry: analysisEntry, - } - if err := executeProcesses(config.Config.PreGracefulTakeoverProcesses, "PreGracefulTakeoverProcesses", preGracefulTakeoverTopologyRecovery, true); err != nil { - return nil, nil, fmt.Errorf("Failed running PreGracefulTakeoverProcesses: %+v", err) - } - demotedPrimarySelfBinlogCoordinates := &clusterPrimary.SelfBinlogCoordinates - log.Infof("GracefulPrimaryTakeover: Will wait for %+v to reach primary coordinates %+v", designatedInstance.Key, *demotedPrimarySelfBinlogCoordinates) - if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedPrimarySelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil { - return nil, nil, err + // here we need to forcefully refresh all the tablets otherwise old information is used and failover scenarios are spawned off which are not required + // For example, if we do not refresh the tablets forcefully and the new primary is found in the cache then its source key is not updated and this spawns off + // PrimaryHasPrimary analysis which runs ERS + RefreshTablets(true /* forceRefresh */) + var promotedReplica *inst.Instance + if ev.NewPrimary != nil { + promotedReplica, _, _ = inst.ReadInstance(&inst.InstanceKey{ + Hostname: ev.NewPrimary.MysqlHostname, + Port: int(ev.NewPrimary.MysqlPort), + }) } - promotedPrimaryCoordinates = &designatedInstance.SelfBinlogCoordinates + postPrsCompletion(topologyRecovery, analysisEntry, promotedReplica) + return topologyRecovery, promotedPrimaryCoordinates, err +} - log.Infof("GracefulPrimaryTakeover: attempting recovery") - recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false) - if err != nil { - log.Errorf("GracefulPrimaryTakeover: noting an error, and for now proceeding: %+v", err) - } - if !recoveryAttempted { - return nil, nil, fmt.Errorf("GracefulPrimaryTakeover: unexpected error: recovery not attempted. This should not happen") - } - if topologyRecovery == nil { - return nil, nil, fmt.Errorf("GracefulPrimaryTakeover: recovery attempted but with no results. This should not happen") - } - var gtidHint inst.OperationGTIDHint = inst.GTIDHintNeutral - if topologyRecovery.RecoveryType == PrimaryRecoveryGTID { - gtidHint = inst.GTIDHintForce - } - clusterPrimary, err = inst.ChangePrimaryTo(&clusterPrimary.Key, &designatedInstance.Key, promotedPrimaryCoordinates, false, gtidHint) - if !clusterPrimary.SelfBinlogCoordinates.Equals(demotedPrimarySelfBinlogCoordinates) { - log.Errorf("GracefulPrimaryTakeover: sanity problem. Demoted primary's coordinates changed from %+v to %+v while supposed to have been frozen", *demotedPrimarySelfBinlogCoordinates, clusterPrimary.SelfBinlogCoordinates) - } - _, startReplicationErr := inst.StartReplication(&clusterPrimary.Key) - if err == nil { - err = startReplicationErr +func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, promotedReplica *inst.Instance) { + if promotedReplica != nil { + message := fmt.Sprintf("promoted replica: %+v", promotedReplica.Key) + AuditTopologyRecovery(topologyRecovery, message) + inst.AuditOperation("graceful-primary-takeover", &analysisEntry.AnalyzedInstanceKey, message) } + // And this is the end; whether successful or not, we're done. + resolveRecovery(topologyRecovery, promotedReplica) + // Now, see whether we are successful or not. From this point there's no going back. + if promotedReplica != nil { + // Success! + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("GracefulPrimaryTakeover: successfully promoted %+v", promotedReplica.Key)) - if designatedInstance.AllowTLS { - _, enableSSLErr := inst.EnablePrimarySSL(&clusterPrimary.Key) - if err == nil { - err = enableSSLErr + kvPairs := inst.GetClusterPrimaryKVPairs(analysisEntry.ClusterDetails.ClusterAlias, &promotedReplica.Key) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Writing KV %+v", kvPairs)) + for _, kvPair := range kvPairs { + err := kv.PutKVPair(kvPair) + log.Errore(err) } - } - executeProcesses(config.Config.PostGracefulTakeoverProcesses, "PostGracefulTakeoverProcesses", topologyRecovery, false) + { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Distributing KV %+v", kvPairs)) + err := kv.DistributePairs(kvPairs) + log.Errore(err) + } + func() error { + before := analysisEntry.AnalyzedInstanceKey.StringCode() + after := promotedReplica.Key.StringCode() + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- GracefulPrimaryTakeover: updating cluster_alias: %v -> %v", before, after)) + //~~~inst.ReplaceClusterName(before, after) + if alias := analysisEntry.ClusterDetails.ClusterAlias; alias != "" { + inst.SetClusterAlias(promotedReplica.Key.StringCode(), alias) + } else { + inst.ReplaceAliasClusterName(before, after) + } + return nil + }() - return topologyRecovery, promotedPrimaryCoordinates, err + attributes.SetGeneralAttribute(analysisEntry.ClusterDetails.ClusterDomain, promotedReplica.Key.StringCode()) + + executeProcesses(config.Config.PostGracefulTakeoverProcesses, "PostGracefulTakeoverProcesses", topologyRecovery, false) + } } // electNewPrimary elects a new primary while none were present before. From 7b8aaf3e5061958847dcc2dee64cab8534164686 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 16 Nov 2021 23:06:14 +0530 Subject: [PATCH 08/16] refactor: fix the log message Signed-off-by: Manan Gupta --- go/vt/orchestrator/logic/topology_recovery.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/vt/orchestrator/logic/topology_recovery.go b/go/vt/orchestrator/logic/topology_recovery.go index ee5c5a43b29..f0630ab8bf1 100644 --- a/go/vt/orchestrator/logic/topology_recovery.go +++ b/go/vt/orchestrator/logic/topology_recovery.go @@ -1725,9 +1725,9 @@ func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey // we only log the warnings and errors explicitly, everything gets logged as an information message anyways in auditing topology recovery switch level { case logutilpb.Level_WARNING: - log.Warningf("ERS - %s", value) + log.Warningf("PRS - %s", value) case logutilpb.Level_ERROR: - log.Errorf("ERS - %s", value) + log.Errorf("PRS - %s", value) } AuditTopologyRecovery(topologyRecovery, value) })).ReparentShard(context.Background(), From d13170dc9ae1aec8ab9d575092fb511b8e149a90 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 17 Nov 2021 11:18:58 +0530 Subject: [PATCH 09/16] feat: remove pre and post failover hooks from graceful-primary-takeover Signed-off-by: Manan Gupta --- docker/mini/orchestrator-vitess-mini.conf.json | 2 -- go/vt/orchestrator/config/config.go | 4 ---- go/vt/orchestrator/logic/topology_recovery.go | 5 ----- 3 files changed, 11 deletions(-) diff --git a/docker/mini/orchestrator-vitess-mini.conf.json b/docker/mini/orchestrator-vitess-mini.conf.json index 352b22e3121..604801603c2 100644 --- a/docker/mini/orchestrator-vitess-mini.conf.json +++ b/docker/mini/orchestrator-vitess-mini.conf.json @@ -48,13 +48,11 @@ "RecoverMasterClusterFilters": [], "RecoverIntermediateMasterClusterFilters": [], "OnFailureDetectionProcesses": [], - "PreGracefulTakeoverProcesses": [], "PreFailoverProcesses": [], "PostFailoverProcesses": [], "PostUnsuccessfulFailoverProcesses": [], "PostMasterFailoverProcesses": [], "PostIntermediateMasterFailoverProcesses": [], - "PostGracefulTakeoverProcesses": [], "CoMasterRecoveryMustPromoteOtherCoMaster": true, "DetachLostReplicasAfterMasterFailover": true, "ApplyMySQLPromotionAfterMasterFailover": true, diff --git a/go/vt/orchestrator/config/config.go b/go/vt/orchestrator/config/config.go index ee2320a3058..ccbca4e8573 100644 --- a/go/vt/orchestrator/config/config.go +++ b/go/vt/orchestrator/config/config.go @@ -202,13 +202,11 @@ type Configuration struct { RecoverIntermediatePrimaryClusterFilters []string // Only do IM recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything) ProcessesShellCommand string // Shell that executes command scripts OnFailureDetectionProcesses []string // Processes to execute when detecting a failover scenario (before making a decision whether to failover or not). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {autoPrimaryRecovery}, {autoIntermediatePrimaryRecovery} - PreGracefulTakeoverProcesses []string // Processes to execute before doing a failover (aborting operation should any once of them exits with non-zero code; order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {countReplicas}, {replicaHosts}, {isDowntimed} PreFailoverProcesses []string // Processes to execute before doing a failover (aborting operation should any once of them exits with non-zero code; order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {countReplicas}, {replicaHosts}, {isDowntimed} PostFailoverProcesses []string // Processes to execute after doing a failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}, {countLostReplicas} PostUnsuccessfulFailoverProcesses []string // Processes to execute after a not-completely-successful failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}, {countLostReplicas} PostPrimaryFailoverProcesses []string // Processes to execute after doing a primary failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses PostIntermediatePrimaryFailoverProcesses []string // Processes to execute after doing a primary failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses - PostGracefulTakeoverProcesses []string // Processes to execute after running a graceful primary takeover. Uses same placeholders as PostFailoverProcesses PostTakePrimaryProcesses []string // Processes to execute after a successful Take-Primary event has taken place CoPrimaryRecoveryMustPromoteOtherCoPrimary bool // When 'false', anything can get promoted (and candidates are prefered over others). When 'true', orchestrator will promote the other co-primary or else fail DetachLostReplicasAfterPrimaryFailover bool // Should replicas that are not to be lost in primary recovery (i.e. were more up-to-date than promoted replica) be forcibly detached @@ -365,13 +363,11 @@ func newConfiguration() *Configuration { RecoverIntermediatePrimaryClusterFilters: []string{}, ProcessesShellCommand: "bash", OnFailureDetectionProcesses: []string{}, - PreGracefulTakeoverProcesses: []string{}, PreFailoverProcesses: []string{}, PostPrimaryFailoverProcesses: []string{}, PostIntermediatePrimaryFailoverProcesses: []string{}, PostFailoverProcesses: []string{}, PostUnsuccessfulFailoverProcesses: []string{}, - PostGracefulTakeoverProcesses: []string{}, PostTakePrimaryProcesses: []string{}, CoPrimaryRecoveryMustPromoteOtherCoPrimary: true, DetachLostReplicasAfterPrimaryFailover: true, diff --git a/go/vt/orchestrator/logic/topology_recovery.go b/go/vt/orchestrator/logic/topology_recovery.go index f0630ab8bf1..fd8c213e074 100644 --- a/go/vt/orchestrator/logic/topology_recovery.go +++ b/go/vt/orchestrator/logic/topology_recovery.go @@ -1698,9 +1698,6 @@ func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not register recovery on %+v. Unable to issue PlannedReparentShard.", analysisEntry.AnalyzedInstanceKey)) return topologyRecovery, nil, err } - if err := executeProcesses(config.Config.PreGracefulTakeoverProcesses, "PreGracefulTakeoverProcesses", topologyRecovery, true); err != nil { - return topologyRecovery, nil, fmt.Errorf("Failed running PreGracefulTakeoverProcesses: %+v", err) - } primaryTablet, err := inst.ReadTablet(clusterPrimary.Key) if err != nil { @@ -1792,8 +1789,6 @@ func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.Re }() attributes.SetGeneralAttribute(analysisEntry.ClusterDetails.ClusterDomain, promotedReplica.Key.StringCode()) - - executeProcesses(config.Config.PostGracefulTakeoverProcesses, "PostGracefulTakeoverProcesses", topologyRecovery, false) } } From b82fb0704345901029d173367d676c477acfc96e Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 17 Nov 2021 18:36:09 +0530 Subject: [PATCH 10/16] refactor: deleted dead code Signed-off-by: Manan Gupta --- go/vt/orchestrator/logic/topology_recovery.go | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/go/vt/orchestrator/logic/topology_recovery.go b/go/vt/orchestrator/logic/topology_recovery.go index fd8c213e074..41af3e91ecb 100644 --- a/go/vt/orchestrator/logic/topology_recovery.go +++ b/go/vt/orchestrator/logic/topology_recovery.go @@ -1635,43 +1635,6 @@ func ForcePrimaryTakeover(clusterName string, destination *inst.Instance) (topol return topologyRecovery, nil } -func getGracefulPrimaryTakeoverDesignatedInstance(clusterPrimaryKey *inst.InstanceKey, designatedKey *inst.InstanceKey, clusterPrimaryDirectReplicas [](*inst.Instance), auto bool) (designatedInstance *inst.Instance, err error) { - if designatedKey == nil { - // User did not specify a replica to promote - if len(clusterPrimaryDirectReplicas) == 1 { - // Single replica. That's the one we'll promote - return clusterPrimaryDirectReplicas[0], nil - } - // More than one replica. - if !auto { - return nil, fmt.Errorf("GracefulPrimaryTakeover: target instance not indicated, auto=false, and primary %+v has %+v replicas. orchestrator cannot choose where to failover to. Aborting", *clusterPrimaryKey, len(clusterPrimaryDirectReplicas)) - } - log.Debugf("GracefulPrimaryTakeover: request takeover for primary %+v, no designated replica indicated. orchestrator will attempt to auto deduce replica.", *clusterPrimaryKey) - designatedInstance, _, _, _, _, err = inst.GetCandidateReplica(clusterPrimaryKey, false) - if err != nil || designatedInstance == nil { - return nil, fmt.Errorf("GracefulPrimaryTakeover: no target instance indicated, failed to auto-detect candidate replica for primary %+v. Aborting", *clusterPrimaryKey) - } - log.Debugf("GracefulPrimaryTakeover: candidateReplica=%+v", designatedInstance.Key) - if _, err := inst.StartReplication(&designatedInstance.Key); err != nil { - return nil, fmt.Errorf("GracefulPrimaryTakeover:cannot start replication on designated replica %+v. Aborting", designatedKey) - } - log.Infof("GracefulPrimaryTakeover: designated primary deduced to be %+v", designatedInstance.Key) - return designatedInstance, nil - } - - // Verify designated instance is a direct replica of primary - for _, directReplica := range clusterPrimaryDirectReplicas { - if directReplica.Key.Equals(designatedKey) { - designatedInstance = directReplica - } - } - if designatedInstance == nil { - return nil, fmt.Errorf("GracefulPrimaryTakeover: indicated designated instance %+v must be directly replicating from the primary %+v", *designatedKey, *clusterPrimaryKey) - } - log.Infof("GracefulPrimaryTakeover: designated primary instructed to be %+v", designatedInstance.Key) - return designatedInstance, nil -} - // GracefulPrimaryTakeover will demote primary of existing topology and promote its // direct replica instead. // It expects that replica to have no siblings. From 8902387b4b7401cc315439f111cde4278c85094f Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 17 Nov 2021 19:07:10 +0530 Subject: [PATCH 11/16] test: reduced test flakiness Signed-off-by: Manan Gupta --- .../graceful_takeover_test.go | 2 +- go/test/endtoend/vtorc/utils/utils.go | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go index 8cbab872629..1ae6aef78fc 100644 --- a/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go +++ b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go @@ -52,7 +52,7 @@ func TestGracefulPrimaryTakeover(t *testing.T) { // check that the replication is setup correctly before we failover utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 10*time.Second) - status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/localhost/%d", curPrimary.MySQLPort, replica.MySQLPort)) + status, _ := utils.MakeAPICallUntilRegistered(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/localhost/%d", curPrimary.MySQLPort, replica.MySQLPort)) assert.Equal(t, 200, status) // check that the replica gets promoted diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index cf123d2dc1b..d7f2a8d7b4c 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -723,3 +723,22 @@ func MakeAPICall(t *testing.T, url string) (status int, response string) { body := string(bodyBytes) return res.StatusCode, body } + +// MakeAPICallUntilRegistered is used to make an API call and retry if we see a 500 - no successor promoted output. This happens when some other recovery had previously run +// and the API recovery was unable to be registered due to active timeout period. +func MakeAPICallUntilRegistered(t *testing.T, url string) (status int, response string) { + timeout := time.After(10 * time.Second) + for { + select { + case <-timeout: + t.Fatal("timedout waiting for api to register correctly") + return + default: + status, response = MakeAPICall(t, url) + if status == 500 && strings.Contains(response, "no successor promoted") { + break + } + return status, response + } + } +} From e74a635c13a02b27acbe68004df999fa6625a166 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 17 Nov 2021 19:08:11 +0530 Subject: [PATCH 12/16] refactor: add a log statement at the start of PRS Signed-off-by: Manan Gupta --- go/vt/orchestrator/logic/topology_recovery.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/go/vt/orchestrator/logic/topology_recovery.go b/go/vt/orchestrator/logic/topology_recovery.go index 41af3e91ecb..cf72ae3ce26 100644 --- a/go/vt/orchestrator/logic/topology_recovery.go +++ b/go/vt/orchestrator/logic/topology_recovery.go @@ -28,6 +28,8 @@ import ( "sync/atomic" "time" + "vitess.io/vitess/go/vt/topo/topoproto" + "github.com/patrickmn/go-cache" "github.com/rcrowley/go-metrics" @@ -1677,6 +1679,9 @@ func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey return topologyRecovery, nil, err } designatedTabletAlias = designatedTablet.Alias + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("started PlannedReparentShard, new primary will be %s.", topoproto.TabletAliasString(designatedTabletAlias))) + } else { + AuditTopologyRecovery(topologyRecovery, "started PlannedReparentShard with automatic primary selection.") } ev, err := reparentutil.NewPlannedReparenter(ts, tmclient.NewTabletManagerClient(), logutil.NewCallbackLogger(func(event *logutilpb.Event) { From 43a1920007e67c9c064fc326a6af4bbf633887b8 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 17 Nov 2021 19:26:18 +0530 Subject: [PATCH 13/16] refactor: remove unused parameters Signed-off-by: Manan Gupta --- go/vt/orchestrator/app/cli.go | 10 ++++------ go/vt/orchestrator/http/api.go | 8 ++++---- go/vt/orchestrator/logic/topology_recovery.go | 16 ++++++++-------- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/go/vt/orchestrator/app/cli.go b/go/vt/orchestrator/app/cli.go index 0dc179af848..fd438416bd3 100644 --- a/go/vt/orchestrator/app/cli.go +++ b/go/vt/orchestrator/app/cli.go @@ -1200,13 +1200,12 @@ func Cli(command string, strict bool, instance string, destination string, owner if destinationKey != nil { validateInstanceIsFound(destinationKey) } - topologyRecovery, promotedPrimaryCoordinates, err := logic.GracefulPrimaryTakeover(clusterName, destinationKey, false) + topologyRecovery, err := logic.GracefulPrimaryTakeover(clusterName, destinationKey) if err != nil { log.Fatale(err) } fmt.Println(topologyRecovery.SuccessorKey.DisplayString()) - fmt.Println(*promotedPrimaryCoordinates) - log.Debugf("Promoted %+v as new primary. Binlog coordinates at time of promotion: %+v", topologyRecovery.SuccessorKey, *promotedPrimaryCoordinates) + log.Debugf("Promoted %+v as new primary.", topologyRecovery.SuccessorKey) } case registerCliCommand("graceful-primary-takeover-auto", "Recovery", `Gracefully promote a new primary. orchestrator will attempt to pick the promoted replica automatically`): { @@ -1216,13 +1215,12 @@ func Cli(command string, strict bool, instance string, destination string, owner if destinationKey != nil { validateInstanceIsFound(destinationKey) } - topologyRecovery, promotedPrimaryCoordinates, err := logic.GracefulPrimaryTakeover(clusterName, destinationKey, true) + topologyRecovery, err := logic.GracefulPrimaryTakeover(clusterName, destinationKey) if err != nil { log.Fatale(err) } fmt.Println(topologyRecovery.SuccessorKey.DisplayString()) - fmt.Println(*promotedPrimaryCoordinates) - log.Debugf("Promoted %+v as new primary. Binlog coordinates at time of promotion: %+v", topologyRecovery.SuccessorKey, *promotedPrimaryCoordinates) + log.Debugf("Promoted %+v as new primary.", topologyRecovery.SuccessorKey) } case registerCliCommand("replication-analysis", "Recovery", `Request an analysis of potential crash incidents in all known topologies`): { diff --git a/go/vt/orchestrator/http/api.go b/go/vt/orchestrator/http/api.go index 15278508bfc..789b92887af 100644 --- a/go/vt/orchestrator/http/api.go +++ b/go/vt/orchestrator/http/api.go @@ -2283,7 +2283,7 @@ func (this *HttpAPI) Recover(params martini.Params, r render.Render, req *http.R } // GracefulPrimaryTakeover gracefully fails over a primary onto its single replica. -func (this *HttpAPI) gracefulPrimaryTakeover(params martini.Params, r render.Render, req *http.Request, user auth.User, auto bool) { +func (this *HttpAPI) gracefulPrimaryTakeover(params martini.Params, r render.Render, req *http.Request, user auth.User) { if !isAuthorizedForAction(req, user) { Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) return @@ -2295,7 +2295,7 @@ func (this *HttpAPI) gracefulPrimaryTakeover(params martini.Params, r render.Ren } designatedKey, _ := this.getInstanceKey(params["designatedHost"], params["designatedPort"]) // designatedKey may be empty/invalid - topologyRecovery, _, err := logic.GracefulPrimaryTakeover(clusterName, &designatedKey, auto) + topologyRecovery, err := logic.GracefulPrimaryTakeover(clusterName, &designatedKey) if err != nil { Respond(r, &APIResponse{Code: ERROR, Message: err.Error(), Details: topologyRecovery}) return @@ -2311,12 +2311,12 @@ func (this *HttpAPI) gracefulPrimaryTakeover(params martini.Params, r render.Ren // - onto its single replica, or // - onto a replica indicated by the user func (this *HttpAPI) GracefulPrimaryTakeover(params martini.Params, r render.Render, req *http.Request, user auth.User) { - this.gracefulPrimaryTakeover(params, r, req, user, false) + this.gracefulPrimaryTakeover(params, r, req, user) } // GracefulPrimaryTakeoverAuto gracefully fails over a primary onto a replica of orchestrator's choosing func (this *HttpAPI) GracefulPrimaryTakeoverAuto(params martini.Params, r render.Render, req *http.Request, user auth.User) { - this.gracefulPrimaryTakeover(params, r, req, user, true) + this.gracefulPrimaryTakeover(params, r, req, user) } // ForcePrimaryFailover fails over a primary (even if there's no particular problem with the primary) diff --git a/go/vt/orchestrator/logic/topology_recovery.go b/go/vt/orchestrator/logic/topology_recovery.go index cf72ae3ce26..9561ad86f21 100644 --- a/go/vt/orchestrator/logic/topology_recovery.go +++ b/go/vt/orchestrator/logic/topology_recovery.go @@ -1644,29 +1644,29 @@ func ForcePrimaryTakeover(clusterName string, destination *inst.Instance) (topol // for the designated replica to catch up with last position. // It will point old primary at the newly promoted primary at the correct coordinates. // All of this is accomplished via PlannedReparentShard operation. It is an idempotent operation, look at its documentation for more detail -func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey, auto bool) (topologyRecovery *TopologyRecovery, promotedPrimaryCoordinates *inst.BinlogCoordinates, err error) { +func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey) (topologyRecovery *TopologyRecovery, err error) { clusterPrimaries, err := inst.ReadClusterPrimary(clusterName) if err != nil { - return nil, nil, fmt.Errorf("Cannot deduce cluster primary for %+v; error: %+v", clusterName, err) + return nil, fmt.Errorf("Cannot deduce cluster primary for %+v; error: %+v", clusterName, err) } if len(clusterPrimaries) != 1 { - return nil, nil, fmt.Errorf("Cannot deduce cluster primary for %+v. Found %+v potential primarys", clusterName, len(clusterPrimaries)) + return nil, fmt.Errorf("Cannot deduce cluster primary for %+v. Found %+v potential primarys", clusterName, len(clusterPrimaries)) } clusterPrimary := clusterPrimaries[0] analysisEntry, err := forceAnalysisEntry(clusterName, inst.PlannedReparentShard, inst.GracefulPrimaryTakeoverCommandHint, &clusterPrimary.Key) if err != nil { - return nil, nil, err + return nil, err } topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false /*failIfFailedInstanceInActiveRecovery*/, false /*failIfClusterInActiveRecovery*/) if topologyRecovery == nil || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not register recovery on %+v. Unable to issue PlannedReparentShard.", analysisEntry.AnalyzedInstanceKey)) - return topologyRecovery, nil, err + return topologyRecovery, err } primaryTablet, err := inst.ReadTablet(clusterPrimary.Key) if err != nil { - return topologyRecovery, nil, err + return topologyRecovery, err } if designatedKey != nil && !designatedKey.IsValid() { // An empty or invalid key is as good as no key @@ -1676,7 +1676,7 @@ func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey if designatedKey != nil { designatedTablet, err := inst.ReadTablet(*designatedKey) if err != nil { - return topologyRecovery, nil, err + return topologyRecovery, err } designatedTabletAlias = designatedTablet.Alias AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("started PlannedReparentShard, new primary will be %s.", topoproto.TabletAliasString(designatedTabletAlias))) @@ -1716,7 +1716,7 @@ func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey }) } postPrsCompletion(topologyRecovery, analysisEntry, promotedReplica) - return topologyRecovery, promotedPrimaryCoordinates, err + return topologyRecovery, err } func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.ReplicationAnalysis, promotedReplica *inst.Instance) { From c8a3e36d6fb671a91365f45832b59ee04f2bc68e Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 17 Nov 2021 19:34:47 +0530 Subject: [PATCH 14/16] test: changed expectations of the test where no primary target is specified Signed-off-by: Manan Gupta --- .../graceful_takeover_test.go | 24 +++++++++---------- go/test/endtoend/vtorc/utils/utils.go | 1 + 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go index 1ae6aef78fc..d2c7e4e1971 100644 --- a/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go +++ b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go @@ -62,9 +62,10 @@ func TestGracefulPrimaryTakeover(t *testing.T) { // make an api call to graceful primary takeover without specifying the primary tablet to promote // covers the test case graceful-master-takeover-fail-no-target from orchestrator -func TestGracefulPrimaryTakeoverFailNoTarget(t *testing.T) { +// orchestrator used to fail in this case, but for VtOrc, specifying no target makes it choose one on its own +func TestGracefulPrimaryTakeoverNoTarget(t *testing.T) { defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVtorc(t, clusterInfo, 3, 0, nil, "test_config.json") + utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, "test_config.json") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -73,25 +74,24 @@ func TestGracefulPrimaryTakeoverFailNoTarget(t *testing.T) { assert.NotNil(t, curPrimary, "should have elected a primary") // find the replica tablet - var replicas []*cluster.Vttablet + var replica *cluster.Vttablet for _, tablet := range shard0.Vttablets { // we know we have only two tablets, so the one not the primary must be the replica if tablet.Alias != curPrimary.Alias { - replicas = append(replicas, tablet) + replica = tablet } } - assert.Equal(t, 2, len(replicas), "could not find replica tablets") + assert.NotNil(t, replica, "could not find the replica tablet") // check that the replication is setup correctly before we failover - utils.CheckReplication(t, clusterInfo, curPrimary, replicas, 10*time.Second) + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 10*time.Second) - status, response := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/", curPrimary.MySQLPort)) - assert.Equal(t, 500, status) - assert.Contains(t, response, "GracefulPrimaryTakeover: target instance not indicated") + status, _ := utils.MakeAPICallUntilRegistered(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/", curPrimary.MySQLPort)) + assert.Equal(t, 200, status) - // check that the replica doesn't get promoted and the previous primary is still the primary - utils.CheckPrimaryTablet(t, clusterInfo, curPrimary, true) - utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, replicas, 10*time.Second) + // check that the replica gets promoted + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{curPrimary}, 10*time.Second) } // make an api call to graceful primary takeover auto and let vtorc fix it diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index d7f2a8d7b4c..a5a2a175126 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -736,6 +736,7 @@ func MakeAPICallUntilRegistered(t *testing.T, url string) (status int, response default: status, response = MakeAPICall(t, url) if status == 500 && strings.Contains(response, "no successor promoted") { + time.Sleep(1 * time.Second) break } return status, response From c1f38b58d72568502c54b7068b5aff60293e6d5b Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 17 Nov 2021 19:40:13 +0530 Subject: [PATCH 15/16] test: use the function which reduces flakiness in tests Signed-off-by: Manan Gupta --- .../vtorc/gracefultakeover/graceful_takeover_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go index d2c7e4e1971..41b3a21bdf2 100644 --- a/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go +++ b/go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go @@ -123,14 +123,14 @@ func TestGracefulPrimaryTakeoverAuto(t *testing.T) { // check that the replication is setup correctly before we failover utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second) - status, _ := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover-auto/localhost/%d/localhost/%d", primary.MySQLPort, replica.MySQLPort)) + status, _ := utils.MakeAPICallUntilRegistered(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover-auto/localhost/%d/localhost/%d", primary.MySQLPort, replica.MySQLPort)) assert.Equal(t, 200, status) // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{primary, rdonly}, 10*time.Second) - status, _ = utils.MakeAPICall(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover-auto/localhost/%d/", replica.MySQLPort)) + status, _ = utils.MakeAPICallUntilRegistered(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover-auto/localhost/%d/", replica.MySQLPort)) assert.Equal(t, 200, status) // check that the primary gets promoted back @@ -163,7 +163,7 @@ func TestGracefulPrimaryTakeoverFailCrossCell(t *testing.T) { // newly started tablet does not replicate from anyone yet, we will allow orchestrator to fix this too utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{crossCellReplica1, rdonly}, 25*time.Second) - status, response := utils.MakeAPICall(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/localhost/%d", primary.MySQLPort, crossCellReplica1.MySQLPort)) + status, response := utils.MakeAPICallUntilRegistered(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/localhost/%d", primary.MySQLPort, crossCellReplica1.MySQLPort)) assert.Equal(t, 500, status) assert.Contains(t, response, "GracefulPrimaryTakeover: constraint failure") From 574cfeee2d02553035debae8e5498529bfdb8a7d Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Thu, 18 Nov 2021 11:26:54 +0530 Subject: [PATCH 16/16] feat: honour the constraint rules before calling planned reparent shard Signed-off-by: Manan Gupta --- go/vt/orchestrator/logic/topology_recovery.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/go/vt/orchestrator/logic/topology_recovery.go b/go/vt/orchestrator/logic/topology_recovery.go index 9561ad86f21..97171fa69b0 100644 --- a/go/vt/orchestrator/logic/topology_recovery.go +++ b/go/vt/orchestrator/logic/topology_recovery.go @@ -1684,6 +1684,13 @@ func GracefulPrimaryTakeover(clusterName string, designatedKey *inst.InstanceKey AuditTopologyRecovery(topologyRecovery, "started PlannedReparentShard with automatic primary selection.") } + // check for the constraint failure for cross cell promotion + if designatedTabletAlias != nil && designatedTabletAlias.Cell != primaryTablet.Alias.Cell && config.Config.PreventCrossDataCenterPrimaryFailover { + errorMessage := fmt.Sprintf("GracefulPrimaryTakeover: constraint failure - %s and %s are in different cells", topoproto.TabletAliasString(designatedTabletAlias), topoproto.TabletAliasString(primaryTablet.Alias)) + AuditTopologyRecovery(topologyRecovery, errorMessage) + return topologyRecovery, fmt.Errorf(errorMessage) + } + ev, err := reparentutil.NewPlannedReparenter(ts, tmclient.NewTabletManagerClient(), logutil.NewCallbackLogger(func(event *logutilpb.Event) { level := event.GetLevel() value := event.GetValue()