From 3080571fa94e03ed2d4b0a389097fbb798342a4d Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Mon, 7 Jun 2021 16:53:41 +0530 Subject: [PATCH 01/25] altered tests to reuse topo, vtctl, vtctld and vtctlclient Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 231 ++++++++++++++++++--------- 1 file changed, 156 insertions(+), 75 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index f6e8690e8d2..da2bc611c97 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -40,48 +40,73 @@ import ( topodatapb "vitess.io/vitess/go/vt/proto/topodata" ) -func createCluster(t *testing.T, numReplicas int, numRdonly int, orcExtraArgs []string) *cluster.LocalProcessCluster { - keyspaceName := "ks" - shardName := "0" - keyspace := &cluster.Keyspace{Name: keyspaceName} - shard0 := &cluster.Shard{Name: shardName} - //dbName = "vt_" + keyspaceName - //username = "vt_dba" - hostname := "localhost" - cell1 := "zone1" - cell2 := "zone2" - tablets := []*cluster.Vttablet{} - clusterInstance := cluster.NewCluster(cell1, hostname) +var ( + clusterInstance *cluster.LocalProcessCluster + uidBase = 100 +) + +const ( + keyspaceName = "ks" + shardName = "0" + hostname = "localhost" + cell1 = "zone1" +) + +// createClusterAndStartTopo starts the cluster and topology service +func createClusterAndStartTopo() error { + clusterInstance = cluster.NewCluster(cell1, hostname) // Start topo server err := clusterInstance.StartTopo() - require.NoError(t, err) + if err != nil { + return err + } + + return nil +} - // Adding another cell in the same cluster - err = clusterInstance.TopoProcess.ManageTopoDir("mkdir", "/vitess/"+cell2) +// startVtorc is used to start the orchestrator with the given extra arguments +func startVtorc(t *testing.T, orcExtraArgs []string) { + // Start vtorc + clusterInstance.VtorcProcess = clusterInstance.NewOrcProcess(path.Join(os.Getenv("PWD"), "test_config.json")) + clusterInstance.VtorcProcess.ExtraArgs = orcExtraArgs + err := clusterInstance.VtorcProcess.Setup() require.NoError(t, err) - err = clusterInstance.VtctlProcess.AddCellInfo(cell2) +} + +// stopVtorc is used to stop the orchestrator +func stopVtorc(t *testing.T) { + // Stop vtorc + err := clusterInstance.VtorcProcess.TearDown() require.NoError(t, err) +} + +// setupVttabletsAndVtorc is used to create the vttablets and start the orchestrator +func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, orcExtraArgs []string) { + keyspace := &cluster.Keyspace{Name: keyspaceName} + shard0 := &cluster.Shard{Name: shardName} // creating tablets by hand instead of using StartKeyspace because we don't want to call InitShardMaster - uidBase := 100 - for i := 0; i < numReplicas; i++ { - tablets = append(tablets, clusterInstance.NewVttabletInstance("replica", uidBase+i, cell1)) + var tablets []*cluster.Vttablet + for i := 0; i < numReplicasReq; i++ { + vttabletInstance := clusterInstance.NewVttabletInstance("replica", uidBase, cell1) + uidBase++ + tablets = append(tablets, vttabletInstance) } - for i := 0; i < numRdonly; i++ { - tablets = append(tablets, clusterInstance.NewVttabletInstance("rdonly", uidBase+numReplicas+i, cell1)) + for i := 0; i < numRdonlyReq; i++ { + vttabletInstance := clusterInstance.NewVttabletInstance("rdonly", uidBase, cell1) + uidBase++ + tablets = append(tablets, vttabletInstance) } - clusterInstance.VtTabletExtraArgs = []string{ "-lock_tables_timeout", "5s", "-disable_active_reparents", } - // Initialize Cluster shard0.Vttablets = tablets - err = clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard0}) + clusterInstance.ReusingVTDATAROOT = true + err := clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard0}) require.NoError(t, err) - //Start MySql var mysqlCtlProcessList []*exec.Cmd for _, tablet := range shard0.Vttablets { @@ -90,34 +115,45 @@ func createCluster(t *testing.T, numReplicas int, numRdonly int, orcExtraArgs [] require.NoError(t, err) mysqlCtlProcessList = append(mysqlCtlProcessList, proc) } - // Wait for mysql processes to start for _, proc := range mysqlCtlProcessList { err := proc.Wait() require.NoError(t, err) } - for _, tablet := range shard0.Vttablets { // Reset status, don't wait for the tablet status. We will check it later tablet.VttabletProcess.ServingStatus = "" - // Start the tablet err := tablet.VttabletProcess.Setup() require.NoError(t, err) } - for _, tablet := range shard0.Vttablets { err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) require.NoError(t, err) } - // Start vtorc - clusterInstance.VtorcProcess = clusterInstance.NewOrcProcess(path.Join(os.Getenv("PWD"), "test_config.json")) - clusterInstance.VtorcProcess.ExtraArgs = orcExtraArgs - err = clusterInstance.VtorcProcess.Setup() - require.NoError(t, err) + // start vtorc + startVtorc(t, orcExtraArgs) +} + +func TestMain(m *testing.M) { + exitcode, err := func() (int, error) { + err := createClusterAndStartTopo() + if err != nil { + return 1, err + } + return m.Run(), nil + }() - return clusterInstance + cluster.PanicHandler(nil) + clusterInstance.Teardown() + + if err != nil { + fmt.Printf("%v\n", err) + os.Exit(1) + } else { + os.Exit(exitcode) + } } // Cases to test: @@ -126,15 +162,20 @@ func createCluster(t *testing.T, numReplicas int, numRdonly int, orcExtraArgs [] // verify replication is setup func TestMasterElection(t *testing.T) { defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 1, 1, nil) + setupVttabletsAndVtorc(t, 1, 1, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] defer func() { - clusterInstance.Teardown() - killTablets(t, shard0) + stopVtorc(t) + // remove all the tablets from the topology + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) + require.NoError(t, err) + } + killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) + clusterInstance.Keyspaces = nil }() - //log.Exitf("error") checkMasterTablet(t, clusterInstance, shard0.Vttablets[0]) checkReplication(t, clusterInstance, shard0.Vttablets[0], shard0.Vttablets[1:]) } @@ -145,15 +186,20 @@ func TestMasterElection(t *testing.T) { // verify replication is setup func TestSingleKeyspace(t *testing.T) { defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 1, 1, []string{"-clusters_to_watch", "ks"}) + setupVttabletsAndVtorc(t, 1, 1, []string{"-clusters_to_watch", "ks"}) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] defer func() { - clusterInstance.Teardown() - killTablets(t, shard0) + stopVtorc(t) + // remove all the tablets from the topology + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) + require.NoError(t, err) + } + killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) + clusterInstance.Keyspaces = nil }() - //log.Exitf("error") checkMasterTablet(t, clusterInstance, shard0.Vttablets[0]) checkReplication(t, clusterInstance, shard0.Vttablets[0], shard0.Vttablets[1:]) } @@ -164,15 +210,20 @@ func TestSingleKeyspace(t *testing.T) { // verify replication is setup func TestKeyspaceShard(t *testing.T) { defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 1, 1, []string{"-clusters_to_watch", "ks/0"}) + setupVttabletsAndVtorc(t, 1, 1, []string{"-clusters_to_watch", "ks/0"}) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] defer func() { - clusterInstance.Teardown() - killTablets(t, shard0) + stopVtorc(t) + // remove all the tablets from the topology + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) + require.NoError(t, err) + } + killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) + clusterInstance.Keyspaces = nil }() - //log.Exitf("error") checkMasterTablet(t, clusterInstance, shard0.Vttablets[0]) checkReplication(t, clusterInstance, shard0.Vttablets[0], shard0.Vttablets[1:]) } @@ -180,12 +231,18 @@ func TestKeyspaceShard(t *testing.T) { // 2. bring down master, let orc promote replica func TestDownMaster(t *testing.T) { defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 2, 0, nil) + setupVttabletsAndVtorc(t, 2, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] defer func() { - clusterInstance.Teardown() - killTablets(t, shard0) + stopVtorc(t) + // remove all the tablets from the topology + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) + require.NoError(t, err) + } + killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) + clusterInstance.Keyspaces = nil }() // find master from topo curMaster := shardMasterTablet(t, clusterInstance, keyspace, shard0) @@ -225,13 +282,18 @@ func waitForReadOnlyValue(t *testing.T, curMaster *cluster.Vttablet, expectValue // 3. make master readonly, let orc repair func TestMasterReadOnly(t *testing.T) { defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 2, 0, nil) + setupVttabletsAndVtorc(t, 2, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] defer func() { - clusterInstance.Teardown() - // Kill tablets - killTablets(t, shard0) + stopVtorc(t) + // remove all the tablets from the topology + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) + require.NoError(t, err) + } + killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) + clusterInstance.Keyspaces = nil }() // find master from topo @@ -249,13 +311,18 @@ func TestMasterReadOnly(t *testing.T) { // 4. make replica ReadWrite, let orc repair func TestReplicaReadWrite(t *testing.T) { defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 2, 0, nil) + setupVttabletsAndVtorc(t, 2, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] defer func() { - clusterInstance.Teardown() - // Kill tablets - killTablets(t, shard0) + stopVtorc(t) + // remove all the tablets from the topology + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) + require.NoError(t, err) + } + killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) + clusterInstance.Keyspaces = nil }() // find master from topo @@ -281,13 +348,18 @@ func TestReplicaReadWrite(t *testing.T) { // 5. stop replication, let orc repair func TestStopReplication(t *testing.T) { defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 2, 0, nil) + setupVttabletsAndVtorc(t, 2, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] defer func() { - clusterInstance.Teardown() - // Kill tablets - killTablets(t, shard0) + stopVtorc(t) + // remove all the tablets from the topology + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) + require.NoError(t, err) + } + killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) + clusterInstance.Keyspaces = nil }() // find master from topo @@ -320,13 +392,18 @@ func TestStopReplication(t *testing.T) { // 6. setup replication from non-master, let orc repair func TestReplicationFromOtherReplica(t *testing.T) { defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 3, 0, nil) + setupVttabletsAndVtorc(t, 3, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] defer func() { - clusterInstance.Teardown() - // Kill tablets - killTablets(t, shard0) + stopVtorc(t) + // remove all the tablets from the topology + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) + require.NoError(t, err) + } + killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) + clusterInstance.Keyspaces = nil }() // find master from topo @@ -371,13 +448,18 @@ func TestRepairAfterTER(t *testing.T) { // test fails intermittently on CI, skip until it can be fixed. t.SkipNow() defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 2, 0, nil) + setupVttabletsAndVtorc(t, 2, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] defer func() { - clusterInstance.Teardown() - // Kill tablets - killTablets(t, shard0) + stopVtorc(t) + // remove all the tablets from the topology + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) + require.NoError(t, err) + } + killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) + clusterInstance.Keyspaces = nil }() // find master from topo @@ -455,7 +537,7 @@ func checkMasterTablet(t *testing.T, cluster *cluster.LocalProcessCluster, table continue } else { // allow time for tablet state to be updated after topo is updated - time.Sleep(time.Second) + time.Sleep(2 * time.Second) // make sure the health stream is updated result, err = cluster.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", tablet.Alias) require.NoError(t, err) @@ -526,11 +608,10 @@ func validateTopology(t *testing.T, cluster *cluster.LocalProcessCluster, pingTa } } -func killTablets(t *testing.T, shard *cluster.Shard) { - for _, tablet := range shard.Vttablets { +func killTablets(vttablets []*cluster.Vttablet) { + for _, tablet := range vttablets { log.Infof("Calling TearDown on tablet %v", tablet.Alias) - err := tablet.VttabletProcess.TearDown() - require.NoError(t, err) + _ = tablet.VttabletProcess.TearDown() } } From 8133c1f4d07c098fe1bd6d2ed52a8ee6fb789920 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Tue, 15 Jun 2021 10:38:29 +0530 Subject: [PATCH 02/25] reuse vttablets too in the tests Signed-off-by: GuptaManan100 --- .../endtoend/cluster/vtctlclient_process.go | 2 - go/test/endtoend/vtorc/vtorc_test.go | 246 ++++++++++-------- 2 files changed, 131 insertions(+), 117 deletions(-) diff --git a/go/test/endtoend/cluster/vtctlclient_process.go b/go/test/endtoend/cluster/vtctlclient_process.go index beaa704d5ef..4bd73b6fe5f 100644 --- a/go/test/endtoend/cluster/vtctlclient_process.go +++ b/go/test/endtoend/cluster/vtctlclient_process.go @@ -156,8 +156,6 @@ func (vtctlclient *VtctlClientProcess) ExecuteCommand(args ...string) (err error if output != "" { if err != nil { log.Errorf("Output:\n%v", output) - } else { - log.Infof("Output:\n%v", output) } } return err diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index da2bc611c97..f78331457c4 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -42,6 +42,8 @@ import ( var ( clusterInstance *cluster.LocalProcessCluster + replicaTablets []*cluster.Vttablet + rdonlyTablets []*cluster.Vttablet uidBase = 100 ) @@ -50,6 +52,8 @@ const ( shardName = "0" hostname = "localhost" cell1 = "zone1" + numReplicas = 3 + numRdonly = 1 ) // createClusterAndStartTopo starts the cluster and topology service @@ -62,41 +66,33 @@ func createClusterAndStartTopo() error { return err } - return nil -} - -// startVtorc is used to start the orchestrator with the given extra arguments -func startVtorc(t *testing.T, orcExtraArgs []string) { - // Start vtorc - clusterInstance.VtorcProcess = clusterInstance.NewOrcProcess(path.Join(os.Getenv("PWD"), "test_config.json")) - clusterInstance.VtorcProcess.ExtraArgs = orcExtraArgs - err := clusterInstance.VtorcProcess.Setup() - require.NoError(t, err) -} + // create the vttablets + err = createVttablets() + if err != nil { + return err + } -// stopVtorc is used to stop the orchestrator -func stopVtorc(t *testing.T) { - // Stop vtorc - err := clusterInstance.VtorcProcess.TearDown() - require.NoError(t, err) + return err } -// setupVttabletsAndVtorc is used to create the vttablets and start the orchestrator -func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, orcExtraArgs []string) { +// createVttablets is used to create the vttablets for all the tests +func createVttablets() error { keyspace := &cluster.Keyspace{Name: keyspaceName} shard0 := &cluster.Shard{Name: shardName} // creating tablets by hand instead of using StartKeyspace because we don't want to call InitShardMaster var tablets []*cluster.Vttablet - for i := 0; i < numReplicasReq; i++ { + for i := 0; i < numReplicas; i++ { vttabletInstance := clusterInstance.NewVttabletInstance("replica", uidBase, cell1) uidBase++ tablets = append(tablets, vttabletInstance) + replicaTablets = append(replicaTablets, vttabletInstance) } - for i := 0; i < numRdonlyReq; i++ { + for i := 0; i < numRdonly; i++ { vttabletInstance := clusterInstance.NewVttabletInstance("rdonly", uidBase, cell1) uidBase++ tablets = append(tablets, vttabletInstance) + rdonlyTablets = append(rdonlyTablets, vttabletInstance) } clusterInstance.VtTabletExtraArgs = []string{ "-lock_tables_timeout", "5s", @@ -104,31 +100,112 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, } // Initialize Cluster shard0.Vttablets = tablets - clusterInstance.ReusingVTDATAROOT = true err := clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard0}) - require.NoError(t, err) + if err != nil { + return err + } //Start MySql var mysqlCtlProcessList []*exec.Cmd for _, tablet := range shard0.Vttablets { log.Infof("Starting MySql for tablet %v", tablet.Alias) proc, err := tablet.MysqlctlProcess.StartProcess() - require.NoError(t, err) + if err != nil { + return err + } mysqlCtlProcessList = append(mysqlCtlProcessList, proc) } // Wait for mysql processes to start for _, proc := range mysqlCtlProcessList { err := proc.Wait() - require.NoError(t, err) + if err != nil { + return err + } } for _, tablet := range shard0.Vttablets { // Reset status, don't wait for the tablet status. We will check it later tablet.VttabletProcess.ServingStatus = "" // Start the tablet err := tablet.VttabletProcess.Setup() - require.NoError(t, err) + if err != nil { + return err + } } for _, tablet := range shard0.Vttablets { err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) + if err != nil { + return err + } + } + return nil +} + +// removeVttabletsFromTopology removes all the vttablets from the topology +func removeVttabletsFromTopology() error { + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + out, _ := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", vttablet.Alias) + log.Error("removeVttabletsFromTopology: ", out) + + err := clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", "-allow_master", vttablet.Alias) + if err != nil { + return err + } + } + clusterInstance.Keyspaces[0].Shards[0].Vttablets = nil + return nil +} + +// startVtorc is used to start the orchestrator with the given extra arguments +func startVtorc(t *testing.T, orcExtraArgs []string) { + // Start vtorc + clusterInstance.VtorcProcess = clusterInstance.NewOrcProcess(path.Join(os.Getenv("PWD"), "test_config.json")) + clusterInstance.VtorcProcess.ExtraArgs = orcExtraArgs + err := clusterInstance.VtorcProcess.Setup() + require.NoError(t, err) +} + +// stopVtorc is used to stop the orchestrator +func stopVtorc(t *testing.T) { + // Stop vtorc + if clusterInstance.VtorcProcess != nil { + err := clusterInstance.VtorcProcess.TearDown() + require.NoError(t, err) + } + clusterInstance.VtorcProcess = nil +} + +// setupVttabletsAndVtorc is used to setup the vttablets and start the orchestrator +func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, orcExtraArgs []string) { + // stop vtorc if it is running + stopVtorc(t) + + // remove all the vttablets so that each test can add the amount that they require + err := removeVttabletsFromTopology() + require.NoError(t, err) + + for _, tablet := range replicaTablets { + if numReplicasReq == 0 { + break + } + err = cleanAndAddVttablet(t, tablet) + require.NoError(t, err) + numReplicasReq-- + } + + for _, tablet := range rdonlyTablets { + if numRdonlyReq == 0 { + break + } + err = cleanAndAddVttablet(t, tablet) + require.NoError(t, err) + numRdonlyReq-- + } + + if numRdonlyReq > 0 || numReplicasReq > 0 { + t.Fatalf("more than available tablets requested. Please increase the constants numReplicas or numRdonly") + } + + for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + err = tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) require.NoError(t, err) } @@ -136,6 +213,33 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, startVtorc(t, orcExtraArgs) } +func cleanAndAddVttablet(t *testing.T, vttablet *cluster.Vttablet) error { + // remove the database if it exists + runSQL(t, "DROP DATABASE IF EXISTS vt_ks", vttablet, "") + // reset the binlog + runSQL(t, "RESET MASTER", vttablet, "") + + // add the vttablet to the topology + err := clusterInstance.VtctlclientProcess.ExecuteCommand("InitTablet", + "-port", fmt.Sprintf("%d", vttablet.VttabletProcess.Port), + "-grpc_port", fmt.Sprintf("%d", vttablet.VttabletProcess.GrpcPort), + "-hostname", vttablet.VttabletProcess.TabletHostname, + "-mysql_port", fmt.Sprintf("%d", vttablet.MysqlctlProcess.MySQLPort), + "-keyspace", vttablet.VttabletProcess.Keyspace, + "-shard", vttablet.VttabletProcess.Shard, + vttablet.Alias, + vttablet.VttabletProcess.TabletType) + if err != nil { + return err + } + + out, _ := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", vttablet.Alias) + log.Error("cleanAndAddVttablet: ", out) + + clusterInstance.Keyspaces[0].Shards[0].Vttablets = append(clusterInstance.Keyspaces[0].Shards[0].Vttablets, vttablet) + return nil +} + func TestMain(m *testing.M) { exitcode, err := func() (int, error) { err := createClusterAndStartTopo() @@ -147,6 +251,8 @@ func TestMain(m *testing.M) { cluster.PanicHandler(nil) clusterInstance.Teardown() + killTablets(replicaTablets) + killTablets(rdonlyTablets) if err != nil { fmt.Printf("%v\n", err) @@ -165,16 +271,6 @@ func TestMasterElection(t *testing.T) { setupVttabletsAndVtorc(t, 1, 1, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] - defer func() { - stopVtorc(t) - // remove all the tablets from the topology - for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) - require.NoError(t, err) - } - killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) - clusterInstance.Keyspaces = nil - }() checkMasterTablet(t, clusterInstance, shard0.Vttablets[0]) checkReplication(t, clusterInstance, shard0.Vttablets[0], shard0.Vttablets[1:]) @@ -189,16 +285,6 @@ func TestSingleKeyspace(t *testing.T) { setupVttabletsAndVtorc(t, 1, 1, []string{"-clusters_to_watch", "ks"}) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] - defer func() { - stopVtorc(t) - // remove all the tablets from the topology - for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) - require.NoError(t, err) - } - killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) - clusterInstance.Keyspaces = nil - }() checkMasterTablet(t, clusterInstance, shard0.Vttablets[0]) checkReplication(t, clusterInstance, shard0.Vttablets[0], shard0.Vttablets[1:]) @@ -213,16 +299,6 @@ func TestKeyspaceShard(t *testing.T) { setupVttabletsAndVtorc(t, 1, 1, []string{"-clusters_to_watch", "ks/0"}) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] - defer func() { - stopVtorc(t) - // remove all the tablets from the topology - for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) - require.NoError(t, err) - } - killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) - clusterInstance.Keyspaces = nil - }() checkMasterTablet(t, clusterInstance, shard0.Vttablets[0]) checkReplication(t, clusterInstance, shard0.Vttablets[0], shard0.Vttablets[1:]) @@ -234,16 +310,6 @@ func TestDownMaster(t *testing.T) { setupVttabletsAndVtorc(t, 2, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] - defer func() { - stopVtorc(t) - // remove all the tablets from the topology - for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) - require.NoError(t, err) - } - killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) - clusterInstance.Keyspaces = nil - }() // find master from topo curMaster := shardMasterTablet(t, clusterInstance, keyspace, shard0) assert.NotNil(t, curMaster, "should have elected a master") @@ -285,16 +351,6 @@ func TestMasterReadOnly(t *testing.T) { setupVttabletsAndVtorc(t, 2, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] - defer func() { - stopVtorc(t) - // remove all the tablets from the topology - for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) - require.NoError(t, err) - } - killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) - clusterInstance.Keyspaces = nil - }() // find master from topo curMaster := shardMasterTablet(t, clusterInstance, keyspace, shard0) @@ -314,16 +370,6 @@ func TestReplicaReadWrite(t *testing.T) { setupVttabletsAndVtorc(t, 2, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] - defer func() { - stopVtorc(t) - // remove all the tablets from the topology - for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) - require.NoError(t, err) - } - killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) - clusterInstance.Keyspaces = nil - }() // find master from topo curMaster := shardMasterTablet(t, clusterInstance, keyspace, shard0) @@ -351,16 +397,6 @@ func TestStopReplication(t *testing.T) { setupVttabletsAndVtorc(t, 2, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] - defer func() { - stopVtorc(t) - // remove all the tablets from the topology - for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) - require.NoError(t, err) - } - killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) - clusterInstance.Keyspaces = nil - }() // find master from topo curMaster := shardMasterTablet(t, clusterInstance, keyspace, shard0) @@ -395,16 +431,6 @@ func TestReplicationFromOtherReplica(t *testing.T) { setupVttabletsAndVtorc(t, 3, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] - defer func() { - stopVtorc(t) - // remove all the tablets from the topology - for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) - require.NoError(t, err) - } - killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) - clusterInstance.Keyspaces = nil - }() // find master from topo curMaster := shardMasterTablet(t, clusterInstance, keyspace, shard0) @@ -451,16 +477,6 @@ func TestRepairAfterTER(t *testing.T) { setupVttabletsAndVtorc(t, 2, 0, nil) keyspace := &clusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] - defer func() { - stopVtorc(t) - // remove all the tablets from the topology - for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - _, err := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("DeleteTablet", "-allow_master", vttablet.Alias) - require.NoError(t, err) - } - killTablets(clusterInstance.Keyspaces[0].Shards[0].Vttablets) - clusterInstance.Keyspaces = nil - }() // find master from topo curMaster := shardMasterTablet(t, clusterInstance, keyspace, shard0) From 3332b4986a811dbea55316cd7be6b25e67722ada Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Tue, 15 Jun 2021 12:46:13 +0530 Subject: [PATCH 03/25] added wait for tablet type in test setup Signed-off-by: GuptaManan100 --- .../transform/backup_transform_utils.go | 2 +- .../backup/vtctlbackup/backup_utils.go | 8 +-- go/test/endtoend/cluster/vttablet_process.go | 52 +++++++++++++++---- .../recovery/pitr/shardedpitr_test.go | 2 +- .../recovery/pitrtls/shardedpitr_tls_test.go | 2 +- go/test/endtoend/recovery/recovery_util.go | 2 +- .../shardedrecovery/sharded_recovery_test.go | 4 +- go/test/endtoend/reparent/utils_test.go | 2 +- .../sharding/initialsharding/sharding_util.go | 14 ++--- .../mergesharding/mergesharding_base.go | 10 ++-- .../sharding/resharding/resharding_base.go | 18 +++---- .../verticalsplit/vertical_split_test.go | 4 +- .../master/tablet_master_test.go | 4 +- .../tabletmanager/tablet_health_test.go | 4 +- go/test/endtoend/tabletmanager/tablet_test.go | 2 +- go/test/endtoend/vtorc/vtorc_test.go | 14 ++++- go/test/endtoend/worker/worker_test.go | 2 +- 17 files changed, 95 insertions(+), 51 deletions(-) diff --git a/go/test/endtoend/backup/transform/backup_transform_utils.go b/go/test/endtoend/backup/transform/backup_transform_utils.go index a07a1c84fdd..cfd20d88e8a 100644 --- a/go/test/endtoend/backup/transform/backup_transform_utils.go +++ b/go/test/endtoend/backup/transform/backup_transform_utils.go @@ -250,7 +250,7 @@ func TestBackupTransformImpl(t *testing.T) { replica2.VttabletProcess.ServingStatus = "" err = replica2.VttabletProcess.Setup() require.Nil(t, err) - err = replica2.VttabletProcess.WaitForTabletTypesForTimeout([]string{"SERVING"}, 25*time.Second) + err = replica2.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, 25*time.Second) require.Nil(t, err) defer replica2.VttabletProcess.TearDown() diff --git a/go/test/endtoend/backup/vtctlbackup/backup_utils.go b/go/test/endtoend/backup/vtctlbackup/backup_utils.go index 896e6e4038d..49ca0efe6df 100644 --- a/go/test/endtoend/backup/vtctlbackup/backup_utils.go +++ b/go/test/endtoend/backup/vtctlbackup/backup_utils.go @@ -295,7 +295,7 @@ func masterBackup(t *testing.T) { require.Nil(t, err) restoreWaitForBackup(t, "replica") - err = replica2.VttabletProcess.WaitForTabletTypesForTimeout([]string{"SERVING"}, 25*time.Second) + err = replica2.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, 25*time.Second) require.Nil(t, err) cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 2) @@ -327,7 +327,7 @@ func masterReplicaSameBackup(t *testing.T) { // now bring up the other replica, letting it restore from backup. restoreWaitForBackup(t, "replica") - err = replica2.VttabletProcess.WaitForTabletTypesForTimeout([]string{"SERVING"}, 25*time.Second) + err = replica2.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, 25*time.Second) require.Nil(t, err) // check the new replica has the data @@ -563,7 +563,7 @@ func vtctlBackup(t *testing.T, tabletType string) { _, err = master.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test2')", keyspaceName, true) require.Nil(t, err) - err = replica2.VttabletProcess.WaitForTabletTypesForTimeout([]string{"SERVING"}, 25*time.Second) + err = replica2.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, 25*time.Second) require.Nil(t, err) cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 2) @@ -626,7 +626,7 @@ func verifyRestoreTablet(t *testing.T, tablet *cluster.Vttablet, status string) err := tablet.VttabletProcess.Setup() require.Nil(t, err) if status != "" { - err = tablet.VttabletProcess.WaitForTabletTypesForTimeout([]string{status}, 25*time.Second) + err = tablet.VttabletProcess.WaitForTabletStatusesForTimeout([]string{status}, 25*time.Second) require.Nil(t, err) } diff --git a/go/test/endtoend/cluster/vttablet_process.go b/go/test/endtoend/cluster/vttablet_process.go index 42e14d4a989..8965725bf6b 100644 --- a/go/test/endtoend/cluster/vttablet_process.go +++ b/go/test/endtoend/cluster/vttablet_process.go @@ -140,7 +140,7 @@ func (vttablet *VttabletProcess) Setup() (err error) { }() if vttablet.ServingStatus != "" { - if err = vttablet.WaitForTabletType(vttablet.ServingStatus); err != nil { + if err = vttablet.WaitForTabletStatus(vttablet.ServingStatus); err != nil { return fmt.Errorf("process '%s' timed out after 10s (err: %s)", vttablet.Name, err) } } @@ -204,23 +204,37 @@ func (vttablet *VttabletProcess) GetTabletStatus() string { return "" } -// WaitForTabletType waits for 10 second till expected type reached -func (vttablet *VttabletProcess) WaitForTabletType(expectedType string) error { - return vttablet.WaitForTabletTypesForTimeout([]string{expectedType}, 10*time.Second) +// GetTabletType returns the tablet type as seen in /debug/vars TabletType +func (vttablet *VttabletProcess) GetTabletType() string { + resultMap := vttablet.GetVars() + if resultMap != nil { + return reflect.ValueOf(resultMap["TabletType"]).String() + } + return "" +} + +// WaitForTabletStatus waits for 10 second till expected status is reached +func (vttablet *VttabletProcess) WaitForTabletStatus(expectedStatus string) error { + return vttablet.WaitForTabletStatusesForTimeout([]string{expectedStatus}, 10*time.Second) } -// WaitForTabletTypes waits for 10 second till expected type reached +// WaitForTabletStatuses waits for 10 second till one of expected statuses is reached +func (vttablet *VttabletProcess) WaitForTabletStatuses(expectedStatuses []string) error { + return vttablet.WaitForTabletStatusesForTimeout(expectedStatuses, 10*time.Second) +} + +// WaitForTabletTypes waits for 10 second till one of expected statuses is reached func (vttablet *VttabletProcess) WaitForTabletTypes(expectedTypes []string) error { return vttablet.WaitForTabletTypesForTimeout(expectedTypes, 10*time.Second) } -// WaitForTabletTypesForTimeout waits till the tablet reaches to any of the provided status -func (vttablet *VttabletProcess) WaitForTabletTypesForTimeout(expectedTypes []string, timeout time.Duration) error { +// WaitForTabletStatusesForTimeout waits till the tablet reaches to any of the provided statuses +func (vttablet *VttabletProcess) WaitForTabletStatusesForTimeout(expectedStatuses []string, timeout time.Duration) error { timeToWait := time.Now().Add(timeout) var status string for time.Now().Before(timeToWait) { status = vttablet.GetTabletStatus() - if contains(expectedTypes, status) { + if contains(expectedStatuses, status) { return nil } select { @@ -231,7 +245,27 @@ func (vttablet *VttabletProcess) WaitForTabletTypesForTimeout(expectedTypes []st } } return fmt.Errorf("Vttablet %s, current status = %s, expected status [%s] not reached, details: %v", - vttablet.TabletPath, status, strings.Join(expectedTypes, ","), vttablet.GetStatusDetails()) + vttablet.TabletPath, status, strings.Join(expectedStatuses, ","), vttablet.GetStatusDetails()) +} + +// WaitForTabletTypesForTimeout waits till the tablet reaches to any of the provided types +func (vttablet *VttabletProcess) WaitForTabletTypesForTimeout(expectedTypes []string, timeout time.Duration) error { + timeToWait := time.Now().Add(timeout) + var tabletType string + for time.Now().Before(timeToWait) { + tabletType = vttablet.GetTabletType() + if contains(expectedTypes, tabletType) { + return nil + } + select { + case err := <-vttablet.exit: + return fmt.Errorf("process '%s' exited prematurely (err: %s)", vttablet.Name, err) + default: + time.Sleep(300 * time.Millisecond) + } + } + return fmt.Errorf("Vttablet %s, current type = %s, expected type [%s] not reached, status details: %v", + vttablet.TabletPath, tabletType, strings.Join(expectedTypes, ","), vttablet.GetStatusDetails()) } func contains(arr []string, str string) bool { diff --git a/go/test/endtoend/recovery/pitr/shardedpitr_test.go b/go/test/endtoend/recovery/pitr/shardedpitr_test.go index c7c89630651..4e9cceaa3a8 100644 --- a/go/test/endtoend/recovery/pitr/shardedpitr_test.go +++ b/go/test/endtoend/recovery/pitr/shardedpitr_test.go @@ -538,5 +538,5 @@ func launchRecoveryTablet(t *testing.T, tablet *cluster.Vttablet, binlogServer * err = tablet.VttabletProcess.Setup() require.NoError(t, err) - tablet.VttabletProcess.WaitForTabletTypesForTimeout([]string{"SERVING"}, 20*time.Second) + tablet.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, 20*time.Second) } diff --git a/go/test/endtoend/recovery/pitrtls/shardedpitr_tls_test.go b/go/test/endtoend/recovery/pitrtls/shardedpitr_tls_test.go index 259db0d1835..64a7bce652f 100644 --- a/go/test/endtoend/recovery/pitrtls/shardedpitr_tls_test.go +++ b/go/test/endtoend/recovery/pitrtls/shardedpitr_tls_test.go @@ -522,7 +522,7 @@ func tlsLaunchRecoveryTablet(t *testing.T, tablet *cluster.Vttablet, tabletForBi err = tablet.VttabletProcess.Setup() require.NoError(t, err) - tablet.VttabletProcess.WaitForTabletTypesForTimeout([]string{"SERVING"}, 20*time.Second) + tablet.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, 20*time.Second) } func getCNFromCertPEM(filename string) string { diff --git a/go/test/endtoend/recovery/recovery_util.go b/go/test/endtoend/recovery/recovery_util.go index c48320d487f..cde3af4dac0 100644 --- a/go/test/endtoend/recovery/recovery_util.go +++ b/go/test/endtoend/recovery/recovery_util.go @@ -83,7 +83,7 @@ func RestoreTablet(t *testing.T, localCluster *cluster.LocalProcessCluster, tabl err = tablet.VttabletProcess.Setup() require.Nil(t, err) - err = tablet.VttabletProcess.WaitForTabletTypesForTimeout([]string{"SERVING"}, 20*time.Second) + err = tablet.VttabletProcess.WaitForTabletStatusesForTimeout([]string{"SERVING"}, 20*time.Second) require.Nil(t, err) } diff --git a/go/test/endtoend/recovery/shardedrecovery/sharded_recovery_test.go b/go/test/endtoend/recovery/shardedrecovery/sharded_recovery_test.go index 612428219b1..83a8098b92f 100644 --- a/go/test/endtoend/recovery/shardedrecovery/sharded_recovery_test.go +++ b/go/test/endtoend/recovery/shardedrecovery/sharded_recovery_test.go @@ -149,7 +149,7 @@ func TestUnShardedRecoveryAfterSharding(t *testing.T) { shardedTablets := []*cluster.Vttablet{shard0Master, shard0Replica, shard0RdOnly, shard1Master, shard1Replica, shard1RdOnly} for _, tablet := range shardedTablets { - _ = tablet.VttabletProcess.WaitForTabletType("SERVING") + _ = tablet.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) } @@ -288,7 +288,7 @@ func TestShardedRecovery(t *testing.T) { require.NoError(t, err) for _, tablet := range shardedTablets { - _ = tablet.VttabletProcess.WaitForTabletType("SERVING") + _ = tablet.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) } diff --git a/go/test/endtoend/reparent/utils_test.go b/go/test/endtoend/reparent/utils_test.go index fa97ebef4f8..0ecb350826b 100644 --- a/go/test/endtoend/reparent/utils_test.go +++ b/go/test/endtoend/reparent/utils_test.go @@ -162,7 +162,7 @@ func setupShard(ctx context.Context, t *testing.T, shardName string, tablets []* } for _, tablet := range tablets { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) + err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"}) require.NoError(t, err) } diff --git a/go/test/endtoend/sharding/initialsharding/sharding_util.go b/go/test/endtoend/sharding/initialsharding/sharding_util.go index d7cf5f1414c..07fefbade85 100644 --- a/go/test/endtoend/sharding/initialsharding/sharding_util.go +++ b/go/test/endtoend/sharding/initialsharding/sharding_util.go @@ -271,15 +271,15 @@ func TestInitialSharding(t *testing.T, keyspace *cluster.Keyspace, keyType query err = ClusterInstance.VtctlclientProcess.InitShardMaster(keyspace.Name, shard1.Name, cell, shard1MasterTablet.TabletUID) require.NoError(t, err) } else { - err = shard1.Replica().VttabletProcess.WaitForTabletType("SERVING") + err = shard1.Replica().VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) _, err = ClusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("TabletExternallyReparented", shard1MasterTablet.Alias) require.NoError(t, err) } - err = shard1.Replica().VttabletProcess.WaitForTabletType("SERVING") + err = shard1.Replica().VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) - err = shard1.Rdonly().VttabletProcess.WaitForTabletType("SERVING") + err = shard1.Rdonly().VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) for _, vttablet := range shard1.Vttablets { assert.Equal(t, vttablet.VttabletProcess.GetTabletStatus(), "SERVING") @@ -352,8 +352,8 @@ func TestInitialSharding(t *testing.T, keyspace *cluster.Keyspace, keyType query _ = ClusterInstance.VtctlclientProcess.ApplyVSchema(keyspaceName, fmt.Sprintf(vSchema, tableName, "id")) for _, shard := range []cluster.Shard{shard21, shard22} { - _ = shard.Replica().VttabletProcess.WaitForTabletType("SERVING") - _ = shard.Rdonly().VttabletProcess.WaitForTabletType("SERVING") + _ = shard.Replica().VttabletProcess.WaitForTabletStatus("SERVING") + _ = shard.Rdonly().VttabletProcess.WaitForTabletStatus("SERVING") } for _, shard := range []cluster.Shard{shard21, shard22} { @@ -526,8 +526,8 @@ func TestInitialSharding(t *testing.T, keyspace *cluster.Keyspace, keyType query expectedPartitions[topodata.TabletType_RDONLY] = []string{shard21.Name, shard22.Name} checkSrvKeyspaceForSharding(t, keyspaceName, expectedPartitions) - _ = shard21.Rdonly().VttabletProcess.WaitForTabletType("SERVING") - _ = shard22.Rdonly().VttabletProcess.WaitForTabletType("SERVING") + _ = shard21.Rdonly().VttabletProcess.WaitForTabletStatus("SERVING") + _ = shard22.Rdonly().VttabletProcess.WaitForTabletStatus("SERVING") _ = vtgateInstance.WaitForStatusOfTabletInShard(fmt.Sprintf("%s.%s.rdonly", keyspaceName, shard21.Name), 1) _ = vtgateInstance.WaitForStatusOfTabletInShard(fmt.Sprintf("%s.%s.rdonly", keyspaceName, shard22.Name), 1) diff --git a/go/test/endtoend/sharding/mergesharding/mergesharding_base.go b/go/test/endtoend/sharding/mergesharding/mergesharding_base.go index 15245ebedac..9a0de98edfa 100644 --- a/go/test/endtoend/sharding/mergesharding/mergesharding_base.go +++ b/go/test/endtoend/sharding/mergesharding/mergesharding_base.go @@ -217,13 +217,13 @@ func TestMergesharding(t *testing.T, useVarbinaryShardingKeyType bool) { require.NoError(t, err) // Wait for tablets to come in Service state - err = shard0Master.VttabletProcess.WaitForTabletType("SERVING") + err = shard0Master.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) - err = shard1Master.VttabletProcess.WaitForTabletType("SERVING") + err = shard1Master.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) - err = shard2Master.VttabletProcess.WaitForTabletType("SERVING") + err = shard2Master.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) - err = shard3Master.VttabletProcess.WaitForTabletType("SERVING") + err = shard3Master.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) // keyspace/shard name fields @@ -351,7 +351,7 @@ func TestMergesharding(t *testing.T, useVarbinaryShardingKeyType bool) { // The tested behavior is a safeguard to prevent that somebody can // accidentally modify data on the destination masters while they are not // migrated yet and the source shards are still the source of truth. - err = shard3Master.VttabletProcess.WaitForTabletType("NOT_SERVING") + err = shard3Master.VttabletProcess.WaitForTabletStatus("NOT_SERVING") require.NoError(t, err) // check that binlog server exported the stats vars diff --git a/go/test/endtoend/sharding/resharding/resharding_base.go b/go/test/endtoend/sharding/resharding/resharding_base.go index fabc1a0734b..9b04ca89c7e 100644 --- a/go/test/endtoend/sharding/resharding/resharding_base.go +++ b/go/test/endtoend/sharding/resharding/resharding_base.go @@ -305,13 +305,13 @@ func TestResharding(t *testing.T, useVarbinaryShardingKeyType bool) { require.Nil(t, err) // Wait for tablets to come in Service state - err = shard0Master.VttabletProcess.WaitForTabletType("SERVING") + err = shard0Master.VttabletProcess.WaitForTabletStatus("SERVING") require.Nil(t, err) - err = shard1Master.VttabletProcess.WaitForTabletType("SERVING") + err = shard1Master.VttabletProcess.WaitForTabletStatus("SERVING") require.Nil(t, err) - err = shard2Master.VttabletProcess.WaitForTabletType("SERVING") + err = shard2Master.VttabletProcess.WaitForTabletStatus("SERVING") require.Nil(t, err) - err = shard3Master.VttabletProcess.WaitForTabletType("SERVING") + err = shard3Master.VttabletProcess.WaitForTabletStatus("SERVING") require.Nil(t, err) // keyspace/shard name fields @@ -375,7 +375,7 @@ func TestResharding(t *testing.T, useVarbinaryShardingKeyType bool) { err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", shard1Replica2.Alias, "spare") require.Nil(t, err) - err = shard1Replica2.VttabletProcess.WaitForTabletType("NOT_SERVING") + err = shard1Replica2.VttabletProcess.WaitForTabletStatus("NOT_SERVING") require.Nil(t, err) // we need to create the schema, and the worker will do data copying @@ -522,9 +522,9 @@ func TestResharding(t *testing.T, useVarbinaryShardingKeyType bool) { // The tested behavior is a safeguard to prevent that somebody can // accidentally modify data on the destination masters while they are not // migrated yet and the source shards are still the source of truth. - err = shard2Master.VttabletProcess.WaitForTabletType("NOT_SERVING") + err = shard2Master.VttabletProcess.WaitForTabletStatus("NOT_SERVING") require.Nil(t, err) - err = shard3Master.VttabletProcess.WaitForTabletType("NOT_SERVING") + err = shard3Master.VttabletProcess.WaitForTabletStatus("NOT_SERVING") require.Nil(t, err) // check that binlog server exported the stats vars @@ -596,9 +596,9 @@ func TestResharding(t *testing.T, useVarbinaryShardingKeyType bool) { err = clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", shard1Replica1.Alias, "spare") require.Nil(t, err) - err = shard1Replica2.VttabletProcess.WaitForTabletType("SERVING") + err = shard1Replica2.VttabletProcess.WaitForTabletStatus("SERVING") require.Nil(t, err) - err = shard1Replica1.VttabletProcess.WaitForTabletType("NOT_SERVING") + err = shard1Replica1.VttabletProcess.WaitForTabletStatus("NOT_SERVING") require.Nil(t, err) err = clusterInstance.VtctlclientProcess.ExecuteCommand("RunHealthCheck", shard1Replica2.Alias) diff --git a/go/test/endtoend/sharding/verticalsplit/vertical_split_test.go b/go/test/endtoend/sharding/verticalsplit/vertical_split_test.go index ad1c73ccaa1..2de8bfe7338 100644 --- a/go/test/endtoend/sharding/verticalsplit/vertical_split_test.go +++ b/go/test/endtoend/sharding/verticalsplit/vertical_split_test.go @@ -141,11 +141,11 @@ func TestVerticalSplit(t *testing.T) { destinationMasterTablet.Type = "master" for _, tablet := range []cluster.Vttablet{sourceReplicaTablet, destinationReplicaTablet} { - _ = tablet.VttabletProcess.WaitForTabletType("SERVING") + _ = tablet.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) } for _, tablet := range []cluster.Vttablet{sourceRdOnlyTablet1, sourceRdOnlyTablet2, destinationRdOnlyTablet1, destinationRdOnlyTablet2} { - _ = tablet.VttabletProcess.WaitForTabletType("SERVING") + _ = tablet.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) } diff --git a/go/test/endtoend/tabletmanager/master/tablet_master_test.go b/go/test/endtoend/tabletmanager/master/tablet_master_test.go index 07dcdd4b899..31cd0631e78 100644 --- a/go/test/endtoend/tabletmanager/master/tablet_master_test.go +++ b/go/test/endtoend/tabletmanager/master/tablet_master_test.go @@ -169,7 +169,7 @@ func TestMasterRestartSetsTERTimestamp(t *testing.T) { err := clusterInstance.VtctlclientProcess.InitShardMaster(keyspaceName, shardName, cell, replicaTablet.TabletUID) require.Nil(t, err) - err = replicaTablet.VttabletProcess.WaitForTabletType("SERVING") + err = replicaTablet.VttabletProcess.WaitForTabletStatus("SERVING") require.Nil(t, err) // Capture the current TER. @@ -224,7 +224,7 @@ func TestMasterRestartSetsTERTimestamp(t *testing.T) { // Reset master err = clusterInstance.VtctlclientProcess.InitShardMaster(keyspaceName, shardName, cell, masterTablet.TabletUID) require.Nil(t, err) - err = masterTablet.VttabletProcess.WaitForTabletType("SERVING") + err = masterTablet.VttabletProcess.WaitForTabletStatus("SERVING") require.Nil(t, err) } diff --git a/go/test/endtoend/tabletmanager/tablet_health_test.go b/go/test/endtoend/tabletmanager/tablet_health_test.go index a31b9b82d63..0d7906ab259 100644 --- a/go/test/endtoend/tabletmanager/tablet_health_test.go +++ b/go/test/endtoend/tabletmanager/tablet_health_test.go @@ -205,7 +205,7 @@ func TestHealthCheckDrainedStateDoesNotShutdownQueryService(t *testing.T) { //Wait if tablet is not in service state defer cluster.PanicHandler(t) - err := rdonlyTablet.VttabletProcess.WaitForTabletType("SERVING") + err := rdonlyTablet.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) // Check tablet health @@ -231,7 +231,7 @@ func TestHealthCheckDrainedStateDoesNotShutdownQueryService(t *testing.T) { checkTabletType(t, rdonlyTablet.Alias, "DRAINED") // Query service is still running. - err = rdonlyTablet.VttabletProcess.WaitForTabletType("SERVING") + err = rdonlyTablet.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) // Restart replication. Tablet will become healthy again. diff --git a/go/test/endtoend/tabletmanager/tablet_test.go b/go/test/endtoend/tabletmanager/tablet_test.go index 0ce8cd94276..778583102d0 100644 --- a/go/test/endtoend/tabletmanager/tablet_test.go +++ b/go/test/endtoend/tabletmanager/tablet_test.go @@ -53,7 +53,7 @@ func TestEnsureDB(t *testing.T) { // Switch to read-write and verify that that we go serving. _ = clusterInstance.VtctlclientProcess.ExecuteCommand("SetReadWrite", tablet.Alias) - err = tablet.VttabletProcess.WaitForTabletType("SERVING") + err = tablet.VttabletProcess.WaitForTabletStatus("SERVING") require.NoError(t, err) killTablets(t, tablet) } diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index f78331457c4..13175c3ceb6 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -131,11 +131,21 @@ func createVttablets() error { } } for _, tablet := range shard0.Vttablets { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) + err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"}) if err != nil { return err } } + + // we also need to wait for the tablet type to change from restore to replica, before we delete a tablet from the topology + // otherwise it will notice that their is no record for the tablet in the topology when it tries to update its state and shutdown itself! + for _, tablet := range shard0.Vttablets { + err := tablet.VttabletProcess.WaitForTabletTypes([]string{"replica", "rdonly"}) + if err != nil { + return err + } + } + return nil } @@ -205,7 +215,7 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, } for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - err = tablet.VttabletProcess.WaitForTabletTypes([]string{"SERVING", "NOT_SERVING"}) + err = tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"}) require.NoError(t, err) } diff --git a/go/test/endtoend/worker/worker_test.go b/go/test/endtoend/worker/worker_test.go index 4fdb163ec3e..5e834f2314d 100644 --- a/go/test/endtoend/worker/worker_test.go +++ b/go/test/endtoend/worker/worker_test.go @@ -509,7 +509,7 @@ func runShardTablets(t *testing.T, shardName string, tabletArr []*cluster.Vttabl // set a replica or rdonly tablet back to NOT_SERVING. for _, tablet := range tabletArr { - err = tablet.VttabletProcess.WaitForTabletType("SERVING") + err = tablet.VttabletProcess.WaitForTabletStatus("SERVING") require.Nil(t, err) } From 52b7d02d25c52d75a60d3692708d555f01d884fb Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Tue, 15 Jun 2021 14:13:27 +0530 Subject: [PATCH 04/25] pass mysql-hostname too for orchestrator to discover the tablet Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 13175c3ceb6..3e612571f70 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -152,9 +152,6 @@ func createVttablets() error { // removeVttabletsFromTopology removes all the vttablets from the topology func removeVttabletsFromTopology() error { for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - out, _ := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", vttablet.Alias) - log.Error("removeVttabletsFromTopology: ", out) - err := clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", "-allow_master", vttablet.Alias) if err != nil { return err @@ -234,6 +231,7 @@ func cleanAndAddVttablet(t *testing.T, vttablet *cluster.Vttablet) error { "-port", fmt.Sprintf("%d", vttablet.VttabletProcess.Port), "-grpc_port", fmt.Sprintf("%d", vttablet.VttabletProcess.GrpcPort), "-hostname", vttablet.VttabletProcess.TabletHostname, + "-mysql_host", vttablet.VttabletProcess.TabletHostname, "-mysql_port", fmt.Sprintf("%d", vttablet.MysqlctlProcess.MySQLPort), "-keyspace", vttablet.VttabletProcess.Keyspace, "-shard", vttablet.VttabletProcess.Shard, @@ -243,9 +241,6 @@ func cleanAndAddVttablet(t *testing.T, vttablet *cluster.Vttablet) error { return err } - out, _ := clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", vttablet.Alias) - log.Error("cleanAndAddVttablet: ", out) - clusterInstance.Keyspaces[0].Shards[0].Vttablets = append(clusterInstance.Keyspaces[0].Shards[0].Vttablets, vttablet) return nil } From 84fe9c1343ed0b08695c26f9052e8e753831486f Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Wed, 16 Jun 2021 11:02:02 +0530 Subject: [PATCH 05/25] change tablet type from master before deleting it Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 3e612571f70..7ba13f1ca1a 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -152,7 +152,20 @@ func createVttablets() error { // removeVttabletsFromTopology removes all the vttablets from the topology func removeVttabletsFromTopology() error { for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - err := clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", "-allow_master", vttablet.Alias) + tabletType := vttablet.VttabletProcess.GetTabletType() + if tabletType == "master" { + err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", vttablet.Alias, vttablet.Type) + if err != nil { + return err + } + + err = vttablet.VttabletProcess.WaitForTabletTypes([]string{vttablet.Type}) + if err != nil { + return err + } + } + + err := clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", vttablet.Alias) if err != nil { return err } @@ -211,11 +224,6 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, t.Fatalf("more than available tablets requested. Please increase the constants numReplicas or numRdonly") } - for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - err = tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"}) - require.NoError(t, err) - } - // start vtorc startVtorc(t, orcExtraArgs) } From 1f732438ef4229242c0815e96ab8962cd2b40986 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Wed, 16 Jun 2021 12:16:04 +0530 Subject: [PATCH 06/25] allow force update of tablets Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 2 +- go/vt/vtctl/vtctl.go | 9 +++++---- go/vt/worker/topo_utils.go | 2 +- go/vt/wrangler/tablet.go | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 7ba13f1ca1a..4fe9c7dff4d 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -154,7 +154,7 @@ func removeVttabletsFromTopology() error { for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { tabletType := vttablet.VttabletProcess.GetTabletType() if tabletType == "master" { - err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", vttablet.Alias, vttablet.Type) + err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", "-force", vttablet.Alias, vttablet.Type) if err != nil { return err } diff --git a/go/vt/vtctl/vtctl.go b/go/vt/vtctl/vtctl.go index 1168c87d94c..044627a74f2 100644 --- a/go/vt/vtctl/vtctl.go +++ b/go/vt/vtctl/vtctl.go @@ -182,8 +182,8 @@ var commands = []commandGroup{ "", "Stops replication on the specified tablet."}, {"ChangeTabletType", commandChangeTabletType, - "[-dry-run] ", - "Changes the db type for the specified tablet, if possible. This command is used primarily to arrange replicas, and it will not convert a master.\n" + + "[-dry-run] [-force] ", + "Changes the db type for the specified tablet, if possible. This command is used primarily to arrange replicas, and it should be used to convert a master cautiously.\n" + "NOTE: This command automatically updates the serving graph.\n"}, {"Ping", commandPing, "", @@ -876,6 +876,7 @@ func commandStopReplication(ctx context.Context, wr *wrangler.Wrangler, subFlags func commandChangeTabletType(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { dryRun := subFlags.Bool("dry-run", false, "Lists the proposed change without actually executing it") + force := subFlags.Bool("force", false, "Do not check if the change requested is safe or not. Should be used to convert master cautiously.") if err := subFlags.Parse(args); err != nil { return err @@ -901,7 +902,7 @@ func commandChangeTabletType(ctx context.Context, wr *wrangler.Wrangler, subFlag if err != nil { return fmt.Errorf("failed reading tablet %v: %v", tabletAlias, err) } - if !topo.IsTrivialTypeChange(ti.Type, newType) { + if !(*force) && !topo.IsTrivialTypeChange(ti.Type, newType) { return fmt.Errorf("invalid type transition %v: %v -> %v", tabletAlias, ti.Type, newType) } wr.Logger().Printf("- %v\n", fmtTabletAwkable(ti)) @@ -909,7 +910,7 @@ func commandChangeTabletType(ctx context.Context, wr *wrangler.Wrangler, subFlag wr.Logger().Printf("+ %v\n", fmtTabletAwkable(ti)) return nil } - return wr.ChangeTabletType(ctx, tabletAlias, newType) + return wr.ChangeTabletType(ctx, tabletAlias, newType, *force) } func commandPing(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { diff --git a/go/vt/worker/topo_utils.go b/go/vt/worker/topo_utils.go index b707cf3955c..a2eb1e3eee0 100644 --- a/go/vt/worker/topo_utils.go +++ b/go/vt/worker/topo_utils.go @@ -124,7 +124,7 @@ func FindWorkerTablet(ctx context.Context, wr *wrangler.Wrangler, cleaner *wrang wr.Logger().Infof("Changing tablet %v to '%v'", topoproto.TabletAliasString(tabletAlias), topodatapb.TabletType_DRAINED) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) defer cancel() - if err := wr.ChangeTabletType(shortCtx, tabletAlias, topodatapb.TabletType_DRAINED); err != nil { + if err := wr.ChangeTabletType(shortCtx, tabletAlias, topodatapb.TabletType_DRAINED, false); err != nil { return nil, err } // Record a clean-up action to take the tablet back to tabletAlias. diff --git a/go/vt/wrangler/tablet.go b/go/vt/wrangler/tablet.go index 077db77b51b..e84265b3c98 100644 --- a/go/vt/wrangler/tablet.go +++ b/go/vt/wrangler/tablet.go @@ -158,14 +158,14 @@ func (wr *Wrangler) DeleteTablet(ctx context.Context, tabletAlias *topodatapb.Ta // // Note we don't update the master record in the Shard here, as we // can't ChangeType from and out of master anyway. -func (wr *Wrangler) ChangeTabletType(ctx context.Context, tabletAlias *topodatapb.TabletAlias, tabletType topodatapb.TabletType) error { +func (wr *Wrangler) ChangeTabletType(ctx context.Context, tabletAlias *topodatapb.TabletAlias, tabletType topodatapb.TabletType, force bool) error { // Load tablet to find endpoint, and keyspace and shard assignment. ti, err := wr.ts.GetTablet(ctx, tabletAlias) if err != nil { return err } - if !topo.IsTrivialTypeChange(ti.Type, tabletType) { + if !force && !topo.IsTrivialTypeChange(ti.Type, tabletType) { return fmt.Errorf("tablet %v type change %v -> %v is not an allowed transition for ChangeTabletType", tabletAlias, ti.Type, tabletType) } From 09492defc1d3686400986e7deb3c1d588954fa02 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Wed, 16 Jun 2021 12:31:11 +0530 Subject: [PATCH 07/25] delete the tablet for whom mysqlctl process is stopped Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 4fe9c7dff4d..6927bf37323 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -52,7 +52,7 @@ const ( shardName = "0" hostname = "localhost" cell1 = "zone1" - numReplicas = 3 + numReplicas = 4 numRdonly = 1 ) @@ -330,6 +330,17 @@ func TestDownMaster(t *testing.T) { // Make the current master database unavailable. err := curMaster.MysqlctlProcess.Stop() require.NoError(t, err) + defer func() { + // we remove the tablet from our global list since its mysqlctl process has stopped and cannot be reused for other tests + for i, tablet := range replicaTablets { + if tablet == curMaster { + // remove this tablet since its mysql has stopped + replicaTablets = append(replicaTablets[:i], replicaTablets[i+1:]...) + killTablets([]*cluster.Vttablet{curMaster}) + return + } + } + }() for _, tablet := range shard0.Vttablets { // we know we have only two tablets, so the "other" one must be the new master From 68c9436d02b3eb20f3820e1d155bf42d5b50bd30 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Wed, 16 Jun 2021 13:04:25 +0530 Subject: [PATCH 08/25] Revert "allow force update of tablets" This reverts commit 1f732438ef4229242c0815e96ab8962cd2b40986. Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 2 +- go/vt/vtctl/vtctl.go | 9 ++++----- go/vt/worker/topo_utils.go | 2 +- go/vt/wrangler/tablet.go | 4 ++-- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 6927bf37323..6122578fe55 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -154,7 +154,7 @@ func removeVttabletsFromTopology() error { for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { tabletType := vttablet.VttabletProcess.GetTabletType() if tabletType == "master" { - err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", "-force", vttablet.Alias, vttablet.Type) + err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", vttablet.Alias, vttablet.Type) if err != nil { return err } diff --git a/go/vt/vtctl/vtctl.go b/go/vt/vtctl/vtctl.go index c819fb3b225..41bf6f36d79 100644 --- a/go/vt/vtctl/vtctl.go +++ b/go/vt/vtctl/vtctl.go @@ -182,8 +182,8 @@ var commands = []commandGroup{ "", "Stops replication on the specified tablet."}, {"ChangeTabletType", commandChangeTabletType, - "[-dry-run] [-force] ", - "Changes the db type for the specified tablet, if possible. This command is used primarily to arrange replicas, and it should be used to convert a master cautiously.\n" + + "[-dry-run] ", + "Changes the db type for the specified tablet, if possible. This command is used primarily to arrange replicas, and it will not convert a master.\n" + "NOTE: This command automatically updates the serving graph.\n"}, {"Ping", commandPing, "", @@ -876,7 +876,6 @@ func commandStopReplication(ctx context.Context, wr *wrangler.Wrangler, subFlags func commandChangeTabletType(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { dryRun := subFlags.Bool("dry-run", false, "Lists the proposed change without actually executing it") - force := subFlags.Bool("force", false, "Do not check if the change requested is safe or not. Should be used to convert master cautiously.") if err := subFlags.Parse(args); err != nil { return err @@ -902,7 +901,7 @@ func commandChangeTabletType(ctx context.Context, wr *wrangler.Wrangler, subFlag if err != nil { return fmt.Errorf("failed reading tablet %v: %v", tabletAlias, err) } - if !(*force) && !topo.IsTrivialTypeChange(ti.Type, newType) { + if !topo.IsTrivialTypeChange(ti.Type, newType) { return fmt.Errorf("invalid type transition %v: %v -> %v", tabletAlias, ti.Type, newType) } wr.Logger().Printf("- %v\n", fmtTabletAwkable(ti)) @@ -910,7 +909,7 @@ func commandChangeTabletType(ctx context.Context, wr *wrangler.Wrangler, subFlag wr.Logger().Printf("+ %v\n", fmtTabletAwkable(ti)) return nil } - return wr.ChangeTabletType(ctx, tabletAlias, newType, *force) + return wr.ChangeTabletType(ctx, tabletAlias, newType) } func commandPing(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { diff --git a/go/vt/worker/topo_utils.go b/go/vt/worker/topo_utils.go index a2eb1e3eee0..b707cf3955c 100644 --- a/go/vt/worker/topo_utils.go +++ b/go/vt/worker/topo_utils.go @@ -124,7 +124,7 @@ func FindWorkerTablet(ctx context.Context, wr *wrangler.Wrangler, cleaner *wrang wr.Logger().Infof("Changing tablet %v to '%v'", topoproto.TabletAliasString(tabletAlias), topodatapb.TabletType_DRAINED) shortCtx, cancel := context.WithTimeout(ctx, *remoteActionsTimeout) defer cancel() - if err := wr.ChangeTabletType(shortCtx, tabletAlias, topodatapb.TabletType_DRAINED, false); err != nil { + if err := wr.ChangeTabletType(shortCtx, tabletAlias, topodatapb.TabletType_DRAINED); err != nil { return nil, err } // Record a clean-up action to take the tablet back to tabletAlias. diff --git a/go/vt/wrangler/tablet.go b/go/vt/wrangler/tablet.go index e84265b3c98..077db77b51b 100644 --- a/go/vt/wrangler/tablet.go +++ b/go/vt/wrangler/tablet.go @@ -158,14 +158,14 @@ func (wr *Wrangler) DeleteTablet(ctx context.Context, tabletAlias *topodatapb.Ta // // Note we don't update the master record in the Shard here, as we // can't ChangeType from and out of master anyway. -func (wr *Wrangler) ChangeTabletType(ctx context.Context, tabletAlias *topodatapb.TabletAlias, tabletType topodatapb.TabletType, force bool) error { +func (wr *Wrangler) ChangeTabletType(ctx context.Context, tabletAlias *topodatapb.TabletAlias, tabletType topodatapb.TabletType) error { // Load tablet to find endpoint, and keyspace and shard assignment. ti, err := wr.ts.GetTablet(ctx, tabletAlias) if err != nil { return err } - if !force && !topo.IsTrivialTypeChange(ti.Type, tabletType) { + if !topo.IsTrivialTypeChange(ti.Type, tabletType) { return fmt.Errorf("tablet %v type change %v -> %v is not an allowed transition for ChangeTabletType", tabletAlias, ti.Type, tabletType) } From 265019a4ee029ffa4b365513f619313ec1a38259 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Wed, 16 Jun 2021 16:43:36 +0530 Subject: [PATCH 09/25] added additional command to demote master tablet to circumvent the side effect of DeleteTablet shutting down the master tablet Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 23 ++++++++++------ go/vt/vtctl/vtctl.go | 23 ++++++++++++++++ go/vt/wrangler/tablet.go | 40 ++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 8 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 6122578fe55..c8385f548d4 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -154,14 +154,9 @@ func removeVttabletsFromTopology() error { for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { tabletType := vttablet.VttabletProcess.GetTabletType() if tabletType == "master" { - err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", vttablet.Alias, vttablet.Type) - if err != nil { - return err - } - - err = vttablet.VttabletProcess.WaitForTabletTypes([]string{vttablet.Type}) - if err != nil { - return err + err2 := demoteMasterTablet(vttablet) + if err2 != nil { + return err2 } } @@ -174,6 +169,16 @@ func removeVttabletsFromTopology() error { return nil } +func demoteMasterTablet(vttablet *cluster.Vttablet) error { + err := clusterInstance.VtctlclientProcess.ExecuteCommand("DemoteMasterTablet", vttablet.Alias) + if err != nil { + return err + } + + err = vttablet.VttabletProcess.WaitForTabletTypes([]string{vttablet.Type}) + return err +} + // startVtorc is used to start the orchestrator with the given extra arguments func startVtorc(t *testing.T, orcExtraArgs []string) { // Start vtorc @@ -231,6 +236,8 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, func cleanAndAddVttablet(t *testing.T, vttablet *cluster.Vttablet) error { // remove the database if it exists runSQL(t, "DROP DATABASE IF EXISTS vt_ks", vttablet, "") + // stop the replication + runSQL(t, "STOP SLAVE", vttablet, "") // reset the binlog runSQL(t, "RESET MASTER", vttablet, "") diff --git a/go/vt/vtctl/vtctl.go b/go/vt/vtctl/vtctl.go index 41bf6f36d79..c251caadf80 100644 --- a/go/vt/vtctl/vtctl.go +++ b/go/vt/vtctl/vtctl.go @@ -169,6 +169,9 @@ var commands = []commandGroup{ {"DeleteTablet", commandDeleteTablet, "[-allow_master] ...", "Deletes tablet(s) from the topology."}, + {"DemoteMasterTablet", commandDemoteMasterTablet, + " ...", + "Demotes master tablet(s) to primary Use with caution. Can leave the shard with no master."}, {"SetReadOnly", commandSetReadOnly, "", "Sets the tablet as read-only."}, @@ -798,6 +801,26 @@ func commandDeleteTablet(ctx context.Context, wr *wrangler.Wrangler, subFlags *f return nil } +func commandDemoteMasterTablet(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { + if err := subFlags.Parse(args); err != nil { + return err + } + if subFlags.NArg() == 0 { + return fmt.Errorf("the argument must be used to specify at least one tablet when calling the DemoteMasterTablet command") + } + + tabletAliases, err := tabletParamsToTabletAliases(subFlags.Args()) + if err != nil { + return err + } + for _, tabletAlias := range tabletAliases { + if err := wr.DemoteMasterTablet(ctx, tabletAlias); err != nil { + return err + } + } + return nil +} + func commandSetReadOnly(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { if err := subFlags.Parse(args); err != nil { return err diff --git a/go/vt/wrangler/tablet.go b/go/vt/wrangler/tablet.go index 077db77b51b..c09e2719f03 100644 --- a/go/vt/wrangler/tablet.go +++ b/go/vt/wrangler/tablet.go @@ -153,6 +153,46 @@ func (wr *Wrangler) DeleteTablet(ctx context.Context, tabletAlias *topodatapb.Ta return nil } +// DemoteMasterTablet demotes a master tablet. +func (wr *Wrangler) DemoteMasterTablet(ctx context.Context, tabletAlias *topodatapb.TabletAlias) (err error) { + // load the tablet, see if we'll need to rebuild + ti, err := wr.ts.GetTablet(ctx, tabletAlias) + if err != nil { + return err + } + + wasMaster, err := wr.isMasterTablet(ctx, ti) + if err != nil { + return err + } + + if !wasMaster { + return fmt.Errorf("tablet %v is not a master tablet", topoproto.TabletAliasString(tabletAlias)) + } + + // update the Shard object if the master was scrapped. + // We lock the shard to not conflict with reparent operations. + ctx, unlock, lockErr := wr.ts.LockShard(ctx, ti.Keyspace, ti.Shard, fmt.Sprintf("DemoteMasterTablet(%v)", topoproto.TabletAliasString(tabletAlias))) + if lockErr != nil { + return lockErr + } + defer unlock(&err) + + // update the shard record's master + if _, err := wr.ts.UpdateShardFields(ctx, ti.Keyspace, ti.Shard, func(si *topo.ShardInfo) error { + if !topoproto.TabletAliasEqual(si.MasterAlias, tabletAlias) { + wr.Logger().Warningf("Deleting master %v from shard %v/%v but master in Shard object was %v", topoproto.TabletAliasString(tabletAlias), ti.Keyspace, ti.Shard, topoproto.TabletAliasString(si.MasterAlias)) + return topo.NewError(topo.NoUpdateNeeded, si.Keyspace()+"/"+si.ShardName()) + } + si.MasterAlias = nil + return nil + }); err != nil { + return err + } + + return nil +} + // ChangeTabletType changes the type of tablet and recomputes all // necessary derived paths in the serving graph, if necessary. // From 0414ac9967e85b2e4c146856710a490a39a189db Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Thu, 17 Jun 2021 14:02:42 +0530 Subject: [PATCH 10/25] wait for releasing the locks in vtorcon sigterm Signed-off-by: GuptaManan100 --- go/test/endtoend/cluster/vtorc_process.go | 4 ++-- go/test/endtoend/vtorc/vtorc_test.go | 6 +++--- go/vt/orchestrator/logic/orchestrator.go | 16 ++++++++++++++++ go/vt/orchestrator/logic/tablet_discovery.go | 13 ++++++++++--- go/vt/wrangler/tablet.go | 4 ++-- 5 files changed, 33 insertions(+), 10 deletions(-) diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index 6bc00e1dfad..801cc08aa75 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -60,7 +60,7 @@ func (orc *VtorcProcess) Setup() (err error) { orc.proc.Args = append(orc.proc.Args, orc.ExtraArgs...) orc.proc.Args = append(orc.proc.Args, "-alsologtostderr", "http") - errFile, _ := os.Create(path.Join(orc.LogDir, "orc-stderr.txt")) + errFile, _ := os.Create(path.Join(orc.LogDir, "orc-stderr.txt"+time.Now().String())) orc.proc.Stderr = errFile orc.proc.Env = append(orc.proc.Env, os.Environ()...) @@ -95,7 +95,7 @@ func (orc *VtorcProcess) TearDown() error { orc.proc = nil return nil - case <-time.After(10 * time.Second): + case <-time.After(30 * time.Second): _ = orc.proc.Process.Kill() orc.proc = nil return <-orc.exit diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index c8385f548d4..1483724b3d2 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -154,9 +154,9 @@ func removeVttabletsFromTopology() error { for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { tabletType := vttablet.VttabletProcess.GetTabletType() if tabletType == "master" { - err2 := demoteMasterTablet(vttablet) - if err2 != nil { - return err2 + err := demoteMasterTablet(vttablet) + if err != nil { + return err } } diff --git a/go/vt/orchestrator/logic/orchestrator.go b/go/vt/orchestrator/logic/orchestrator.go index f8d05577a1c..59dedea516c 100644 --- a/go/vt/orchestrator/logic/orchestrator.go +++ b/go/vt/orchestrator/logic/orchestrator.go @@ -124,6 +124,22 @@ func acceptSignals() { discoveryMetrics.StopAutoExpiration() // probably should poke other go routines to stop cleanly here ... inst.AuditOperation("shutdown", nil, "Triggered via SIGTERM") + timeout := time.After(*shutdownWaitTime) + func() { + for { + count := atomic.LoadInt32(&shardsLockCounter) + if count == 0 { + return + } + select { + case <-timeout: + log.Infof("wait for lock release timed out. Some locks might not have been released.") + return + default: + time.Sleep(100 * time.Millisecond) + } + } + }() os.Exit(0) } } diff --git a/go/vt/orchestrator/logic/tablet_discovery.go b/go/vt/orchestrator/logic/tablet_discovery.go index 0bf55069753..5154f787b8e 100644 --- a/go/vt/orchestrator/logic/tablet_discovery.go +++ b/go/vt/orchestrator/logic/tablet_discovery.go @@ -23,6 +23,7 @@ import ( "fmt" "strings" "sync" + "sync/atomic" "time" "google.golang.org/protobuf/encoding/prototext" @@ -41,8 +42,10 @@ import ( ) var ( - ts *topo.Server - clustersToWatch = flag.String("clusters_to_watch", "", "Comma-separated list of keyspaces or keyspace/shards that this instance will monitor and repair. Defaults to all clusters in the topology. Example: \"ks1,ks2/-80\"") + ts *topo.Server + clustersToWatch = flag.String("clusters_to_watch", "", "Comma-separated list of keyspaces or keyspace/shards that this instance will monitor and repair. Defaults to all clusters in the topology. Example: \"ks1,ks2/-80\"") + shutdownWaitTime = flag.Duration("shutdown_wait_time", 30*time.Second, "maximum time to wait for vtorc to release all the locks that it is holding before shutting down on SIGTERM") + shardsLockCounter int32 ) // OpenTabletDiscovery opens the vitess topo if enables and returns a ticker @@ -247,8 +250,12 @@ func LockShard(instanceKey inst.InstanceKey) (func(*error), error) { } ctx, cancel := context.WithTimeout(context.Background(), time.Duration(config.Config.LockShardTimeoutSeconds)*time.Second) defer cancel() + atomic.AddInt32(&shardsLockCounter, 1) _, unlock, err := ts.LockShard(ctx, tablet.Keyspace, tablet.Shard, "Orc Recovery") - return unlock, err + return func(e *error) { + defer atomic.AddInt32(&shardsLockCounter, -1) + unlock(e) + }, err } // TabletRefresh refreshes the tablet info. diff --git a/go/vt/wrangler/tablet.go b/go/vt/wrangler/tablet.go index c09e2719f03..a96a8fd7c86 100644 --- a/go/vt/wrangler/tablet.go +++ b/go/vt/wrangler/tablet.go @@ -150,7 +150,7 @@ func (wr *Wrangler) DeleteTablet(ctx context.Context, tabletAlias *topodatapb.Ta return err } - return nil + return } // DemoteMasterTablet demotes a master tablet. @@ -190,7 +190,7 @@ func (wr *Wrangler) DemoteMasterTablet(ctx context.Context, tabletAlias *topodat return err } - return nil + return } // ChangeTabletType changes the type of tablet and recomputes all From 5250bff3752e6a3da2e7693ff7952171574322c1 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Thu, 17 Jun 2021 18:41:09 +0530 Subject: [PATCH 11/25] check if database exists before querying it Signed-off-by: GuptaManan100 --- go/test/endtoend/cluster/cluster_process.go | 2 +- go/test/endtoend/vtorc/vtorc_test.go | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go index f0e1a9a8380..dec0c86dd5c 100644 --- a/go/test/endtoend/cluster/cluster_process.go +++ b/go/test/endtoend/cluster/cluster_process.go @@ -42,7 +42,7 @@ const ( ) var ( - keepData = flag.Bool("keep-data", false, "don't delete the per-test VTDATAROOT subfolders") + keepData = flag.Bool("keep-data", true, "don't delete the per-test VTDATAROOT subfolders") topoFlavor = flag.String("topo-flavor", "etcd2", "choose a topo server from etcd2, zk2 or consul") isCoverage = flag.Bool("is-coverage", false, "whether coverage is required") forceVTDATAROOT = flag.String("force-vtdataroot", "", "force path for VTDATAROOT, which may already be populated") diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 1483724b3d2..968212c4d97 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -635,10 +635,16 @@ func checkInsertedValues(t *testing.T, tablet *cluster.Vttablet, index int) erro // wait until it gets the data timeout := time.Now().Add(10 * time.Second) for time.Now().Before(timeout) { - selectSQL := fmt.Sprintf("select msg from vt_insert_test where id=%d", index) - qr := runSQL(t, selectSQL, tablet, "vt_ks") - if len(qr.Rows) == 1 { - return nil + qr := runSQL(t, "show databases", tablet, "") + // only run the select query if the database is created, otherwise wait + // the six expected databases are - [information_schema, _vt, mysql, performance_schema, sys, vt_ks] + // alternately we can check explicitly for the value vt_ks in the output + if len(qr.Rows) == 6 { + selectSQL := fmt.Sprintf("select msg from vt_insert_test where id=%d", index) + qr = runSQL(t, selectSQL, tablet, "vt_ks") + if len(qr.Rows) == 1 { + return nil + } } time.Sleep(300 * time.Millisecond) } From 74a07b76c8a9899f38052129a9bf733fd90c52be Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Thu, 17 Jun 2021 19:43:35 +0530 Subject: [PATCH 12/25] refactor to check all errors in select query Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 50 +++++++++++++++------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 968212c4d97..ecb1d6c7fd0 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -235,14 +235,17 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, func cleanAndAddVttablet(t *testing.T, vttablet *cluster.Vttablet) error { // remove the database if it exists - runSQL(t, "DROP DATABASE IF EXISTS vt_ks", vttablet, "") + _, err := runSQL(t, "DROP DATABASE IF EXISTS vt_ks", vttablet, "") + require.NoError(t, err) // stop the replication - runSQL(t, "STOP SLAVE", vttablet, "") + _, err = runSQL(t, "STOP SLAVE", vttablet, "") + require.NoError(t, err) // reset the binlog - runSQL(t, "RESET MASTER", vttablet, "") + _, err = runSQL(t, "RESET MASTER", vttablet, "") + require.NoError(t, err) // add the vttablet to the topology - err := clusterInstance.VtctlclientProcess.ExecuteCommand("InitTablet", + err = clusterInstance.VtctlclientProcess.ExecuteCommand("InitTablet", "-port", fmt.Sprintf("%d", vttablet.VttabletProcess.Port), "-grpc_port", fmt.Sprintf("%d", vttablet.VttabletProcess.GrpcPort), "-hostname", vttablet.VttabletProcess.TabletHostname, @@ -362,7 +365,8 @@ func waitForReadOnlyValue(t *testing.T, curMaster *cluster.Vttablet, expectValue timeout := 15 * time.Second startTime := time.Now() for time.Since(startTime) < timeout { - qr := runSQL(t, "select @@global.read_only as read_only", curMaster, "") + qr, err := runSQL(t, "select @@global.read_only as read_only", curMaster, "") + require.NoError(t, err) require.NotNil(t, qr) row := qr.Named().Row() require.NotNil(t, row) @@ -388,7 +392,8 @@ func TestMasterReadOnly(t *testing.T) { assert.NotNil(t, curMaster, "should have elected a master") // Make the current master database read-only. - runSQL(t, "set global read_only=ON", curMaster, "") + _, err := runSQL(t, "set global read_only=ON", curMaster, "") + require.NoError(t, err) // wait for repair match := waitForReadOnlyValue(t, curMaster, 0) @@ -415,7 +420,8 @@ func TestReplicaReadWrite(t *testing.T) { } } // Make the replica database read-write. - runSQL(t, "set global read_only=OFF", replica, "") + _, err := runSQL(t, "set global read_only=OFF", replica, "") + require.NoError(t, err) // wait for repair match := waitForReadOnlyValue(t, replica, 1) @@ -614,7 +620,8 @@ func checkReplication(t *testing.T, clusterInstance *cluster.LocalProcessCluster primary key (id) ) Engine=InnoDB ` - runSQL(t, sqlSchema, master, "vt_ks") + _, err := runSQL(t, sqlSchema, master, "vt_ks") + require.NoError(t, err) confirmReplication(t, master, replicas) } @@ -623,7 +630,8 @@ func confirmReplication(t *testing.T, master *cluster.Vttablet, replicas []*clus n := 2 // random value ... // insert data into the new master, check the connected replica work insertSQL := fmt.Sprintf("insert into vt_insert_test(id, msg) values (%d, 'test %d')", n, n) - runSQL(t, insertSQL, master, "vt_ks") + _, err := runSQL(t, insertSQL, master, "vt_ks") + require.NoError(t, err) time.Sleep(100 * time.Millisecond) for _, tab := range replicas { err := checkInsertedValues(t, tab, n) @@ -635,16 +643,12 @@ func checkInsertedValues(t *testing.T, tablet *cluster.Vttablet, index int) erro // wait until it gets the data timeout := time.Now().Add(10 * time.Second) for time.Now().Before(timeout) { - qr := runSQL(t, "show databases", tablet, "") - // only run the select query if the database is created, otherwise wait - // the six expected databases are - [information_schema, _vt, mysql, performance_schema, sys, vt_ks] - // alternately we can check explicitly for the value vt_ks in the output - if len(qr.Rows) == 6 { - selectSQL := fmt.Sprintf("select msg from vt_insert_test where id=%d", index) - qr = runSQL(t, selectSQL, tablet, "vt_ks") - if len(qr.Rows) == 1 { - return nil - } + selectSQL := fmt.Sprintf("select msg from vt_insert_test where id=%d", index) + qr, err := runSQL(t, selectSQL, tablet, "vt_ks") + // The error may be not nil, if the replication has not caught upto the point where the table exists. + // We can safely skip this error and retry reading after wait + if err == nil && len(qr.Rows) == 1 { + return nil } time.Sleep(300 * time.Millisecond) } @@ -679,7 +683,7 @@ func getMysqlConnParam(tablet *cluster.Vttablet, db string) mysql.ConnParams { return connParams } -func runSQL(t *testing.T, sql string, tablet *cluster.Vttablet, db string) *sqltypes.Result { +func runSQL(t *testing.T, sql string, tablet *cluster.Vttablet, db string) (*sqltypes.Result, error) { // Get Connection tabletParams := getMysqlConnParam(tablet, db) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) @@ -692,9 +696,7 @@ func runSQL(t *testing.T, sql string, tablet *cluster.Vttablet, db string) *sqlt return execute(t, conn, sql) } -func execute(t *testing.T, conn *mysql.Conn, query string) *sqltypes.Result { +func execute(t *testing.T, conn *mysql.Conn, query string) (*sqltypes.Result, error) { t.Helper() - qr, err := conn.ExecuteFetch(query, 1000, true) - require.Nil(t, err) - return qr + return conn.ExecuteFetch(query, 1000, true) } From 89e9decb701ba5bc5c4f72732f404acb395ca320 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Thu, 17 Jun 2021 19:44:30 +0530 Subject: [PATCH 13/25] prevent vtorc from acquiring locks after SIGTERM has been received Signed-off-by: GuptaManan100 --- go/test/endtoend/cluster/cluster_process.go | 2 +- go/vt/orchestrator/logic/orchestrator.go | 2 ++ go/vt/orchestrator/logic/tablet_discovery.go | 4 ++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go index dec0c86dd5c..f0e1a9a8380 100644 --- a/go/test/endtoend/cluster/cluster_process.go +++ b/go/test/endtoend/cluster/cluster_process.go @@ -42,7 +42,7 @@ const ( ) var ( - keepData = flag.Bool("keep-data", true, "don't delete the per-test VTDATAROOT subfolders") + keepData = flag.Bool("keep-data", false, "don't delete the per-test VTDATAROOT subfolders") topoFlavor = flag.String("topo-flavor", "etcd2", "choose a topo server from etcd2, zk2 or consul") isCoverage = flag.Bool("is-coverage", false, "whether coverage is required") forceVTDATAROOT = flag.String("force-vtdataroot", "", "force path for VTDATAROOT, which may already be populated") diff --git a/go/vt/orchestrator/logic/orchestrator.go b/go/vt/orchestrator/logic/orchestrator.go index 59dedea516c..bd05f79c6b0 100644 --- a/go/vt/orchestrator/logic/orchestrator.go +++ b/go/vt/orchestrator/logic/orchestrator.go @@ -49,6 +49,7 @@ const ( var discoveryQueue *discovery.Queue var snapshotDiscoveryKeys chan inst.InstanceKey var snapshotDiscoveryKeysMutex sync.Mutex +var hasReceivedSIGTERM int32 var discoveriesCounter = metrics.NewCounter() var failedDiscoveriesCounter = metrics.NewCounter() @@ -121,6 +122,7 @@ func acceptSignals() { discoveryMetrics.SetExpirePeriod(time.Duration(config.Config.DiscoveryCollectionRetentionSeconds) * time.Second) case syscall.SIGTERM: log.Infof("Received SIGTERM. Shutting down orchestrator") + atomic.StoreInt32(&hasReceivedSIGTERM, 1) discoveryMetrics.StopAutoExpiration() // probably should poke other go routines to stop cleanly here ... inst.AuditOperation("shutdown", nil, "Triggered via SIGTERM") diff --git a/go/vt/orchestrator/logic/tablet_discovery.go b/go/vt/orchestrator/logic/tablet_discovery.go index 5154f787b8e..22040b854e6 100644 --- a/go/vt/orchestrator/logic/tablet_discovery.go +++ b/go/vt/orchestrator/logic/tablet_discovery.go @@ -243,6 +243,10 @@ func LockShard(instanceKey inst.InstanceKey) (func(*error), error) { if instanceKey.Hostname == "" { return nil, errors.New("Can't lock shard: instance is unspecified") } + val := atomic.LoadInt32(&hasReceivedSIGTERM) + if val > 0 { + return nil, errors.New("Can't lock shard: SIGTERM received") + } tablet, err := inst.ReadTablet(instanceKey) if err != nil { From bbc1dca0b4a252918c39f9bd32bcac828f6b366b Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Thu, 17 Jun 2021 22:34:06 +0530 Subject: [PATCH 14/25] fixing bug in vtorc - decrement shard counter in case of errors too Signed-off-by: GuptaManan100 --- go/vt/orchestrator/logic/orchestrator.go | 3 ++- go/vt/orchestrator/logic/tablet_discovery.go | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/go/vt/orchestrator/logic/orchestrator.go b/go/vt/orchestrator/logic/orchestrator.go index bd05f79c6b0..28c4d2062d2 100644 --- a/go/vt/orchestrator/logic/orchestrator.go +++ b/go/vt/orchestrator/logic/orchestrator.go @@ -121,7 +121,7 @@ func acceptSignals() { config.Reload() discoveryMetrics.SetExpirePeriod(time.Duration(config.Config.DiscoveryCollectionRetentionSeconds) * time.Second) case syscall.SIGTERM: - log.Infof("Received SIGTERM. Shutting down orchestrator") + log.Infof("Received SIGTERM. Starting shutdown") atomic.StoreInt32(&hasReceivedSIGTERM, 1) discoveryMetrics.StopAutoExpiration() // probably should poke other go routines to stop cleanly here ... @@ -142,6 +142,7 @@ func acceptSignals() { } } }() + log.Infof("Shutting down orchestrator") os.Exit(0) } } diff --git a/go/vt/orchestrator/logic/tablet_discovery.go b/go/vt/orchestrator/logic/tablet_discovery.go index 22040b854e6..d62c51ef566 100644 --- a/go/vt/orchestrator/logic/tablet_discovery.go +++ b/go/vt/orchestrator/logic/tablet_discovery.go @@ -256,10 +256,14 @@ func LockShard(instanceKey inst.InstanceKey) (func(*error), error) { defer cancel() atomic.AddInt32(&shardsLockCounter, 1) _, unlock, err := ts.LockShard(ctx, tablet.Keyspace, tablet.Shard, "Orc Recovery") + if err != nil { + atomic.AddInt32(&shardsLockCounter, -1) + return nil, err + } return func(e *error) { defer atomic.AddInt32(&shardsLockCounter, -1) unlock(e) - }, err + }, nil } // TabletRefresh refreshes the tablet info. From 870effeb4b2af054cde451b2f6f0cced3f9f055b Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Mon, 21 Jun 2021 15:00:05 +0530 Subject: [PATCH 15/25] added a test for circular replication Signed-off-by: GuptaManan100 --- go/test/endtoend/cluster/vtorc_process.go | 1 + go/test/endtoend/vtorc/vtorc_test.go | 33 +++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index 801cc08aa75..39c453c7247 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -52,6 +52,7 @@ func (orc *VtorcProcess) Setup() (err error) { "-topo_global_server_address", orc.TopoGlobalAddress, "-topo_global_root", orc.TopoGlobalRoot, "-config", orc.Config, + "-orc_web_dir", path.Join(os.Getenv("VTROOT"), "web", "orchestrator"), ) if *isCoverage { orc.proc.Args = append(orc.proc.Args, "-test.coverprofile="+getCoveragePath("orc.out")) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index ecb1d6c7fd0..eddbf49e0cf 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -543,6 +543,39 @@ func TestRepairAfterTER(t *testing.T) { checkReplication(t, clusterInstance, newMaster, []*cluster.Vttablet{curMaster}) } +// 7. make instance A replicates from B and B from A, wait for repair +func TestCircularReplication(t *testing.T) { + defer cluster.PanicHandler(t) + setupVttabletsAndVtorc(t, 2, 0, nil) + keyspace := &clusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + + // find master from topo + curMaster := shardMasterTablet(t, clusterInstance, keyspace, shard0) + assert.NotNil(t, curMaster, "should have elected a master") + + var replica *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two tablets, so the "other" one must be the new master + if tablet.Alias != curMaster.Alias { + replica = tablet + break + } + } + + changeMasterCommands := fmt.Sprintf("RESET SLAVE;"+ + "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ + "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) + + _, err := runSQL(t, changeMasterCommands, curMaster, "") + require.NoError(t, err) + + // wait for repair + time.Sleep(15 * time.Second) + // check replication is setup correctly + checkReplication(t, clusterInstance, curMaster, []*cluster.Vttablet{replica}) +} + func shardMasterTablet(t *testing.T, cluster *cluster.LocalProcessCluster, keyspace *cluster.Keyspace, shard *cluster.Shard) *cluster.Vttablet { start := time.Now() for { From e9831e1a253e5f82f5519cb4562d38c03355b7b2 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Mon, 21 Jun 2021 17:00:53 +0530 Subject: [PATCH 16/25] removed absolute wait from the test Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 32 ++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index eddbf49e0cf..923a231478a 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -551,13 +551,13 @@ func TestCircularReplication(t *testing.T) { shard0 := &keyspace.Shards[0] // find master from topo - curMaster := shardMasterTablet(t, clusterInstance, keyspace, shard0) - assert.NotNil(t, curMaster, "should have elected a master") + master := shardMasterTablet(t, clusterInstance, keyspace, shard0) + assert.NotNil(t, master, "should have elected a master") var replica *cluster.Vttablet for _, tablet := range shard0.Vttablets { // we know we have only two tablets, so the "other" one must be the new master - if tablet.Alias != curMaster.Alias { + if tablet.Alias != master.Alias { replica = tablet break } @@ -567,13 +567,14 @@ func TestCircularReplication(t *testing.T) { "CHANGE MASTER TO MASTER_HOST='%s', MASTER_PORT=%d, MASTER_USER='vt_repl', MASTER_AUTO_POSITION = 1;"+ "START SLAVE;", replica.VttabletProcess.TabletHostname, replica.MySQLPort) - _, err := runSQL(t, changeMasterCommands, curMaster, "") + _, err := runSQL(t, changeMasterCommands, master, "") require.NoError(t, err) // wait for repair - time.Sleep(15 * time.Second) + err = waitForReplicationToStop(t, master) + require.NoError(t, err) // check replication is setup correctly - checkReplication(t, clusterInstance, curMaster, []*cluster.Vttablet{replica}) + checkReplication(t, clusterInstance, master, []*cluster.Vttablet{replica}) } func shardMasterTablet(t *testing.T, cluster *cluster.LocalProcessCluster, keyspace *cluster.Keyspace, shard *cluster.Shard) *cluster.Vttablet { @@ -688,6 +689,25 @@ func checkInsertedValues(t *testing.T, tablet *cluster.Vttablet, index int) erro return fmt.Errorf("data is not yet replicated") } +func waitForReplicationToStop(t *testing.T, vttablet *cluster.Vttablet) error { + timeout := time.After(15 * time.Second) + for { + select { + case <-timeout: + return fmt.Errorf("timedout: waiting for master to stop replication") + default: + res, err := runSQL(t, "SHOW SLAVE STATUS", vttablet, "") + if err != nil { + return err + } + if len(res.Rows) == 0 { + return nil + } + time.Sleep(1 * time.Second) + } + } +} + func validateTopology(t *testing.T, cluster *cluster.LocalProcessCluster, pingTablets bool) { if pingTablets { out, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("Validate", "-ping-tablets=true") From 368ce046fff63b0a5ba501d2cbcdb14b9b30b3ab Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Fri, 25 Jun 2021 14:01:19 +0530 Subject: [PATCH 17/25] cleanly shutdown all the mysql instances Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 923a231478a..e029d4621db 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -273,9 +273,10 @@ func TestMain(m *testing.M) { }() cluster.PanicHandler(nil) - clusterInstance.Teardown() killTablets(replicaTablets) killTablets(rdonlyTablets) + clusterInstance.Keyspaces[0].Shards[0].Vttablets = nil + clusterInstance.Teardown() if err != nil { fmt.Printf("%v\n", err) @@ -720,6 +721,8 @@ func validateTopology(t *testing.T, cluster *cluster.LocalProcessCluster, pingTa func killTablets(vttablets []*cluster.Vttablet) { for _, tablet := range vttablets { + log.Infof("Shutting down MySQL for %v", tablet.Alias) + _ = tablet.MysqlctlProcess.Stop() log.Infof("Calling TearDown on tablet %v", tablet.Alias) _ = tablet.VttabletProcess.TearDown() } From b720b4af7e15207c32db9814cc14fb0d0e49df49 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Fri, 25 Jun 2021 14:11:20 +0530 Subject: [PATCH 18/25] rename error file and a variable Signed-off-by: GuptaManan100 --- go/test/endtoend/cluster/vtorc_process.go | 3 ++- go/test/endtoend/cluster/vttablet_process.go | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index 39c453c7247..895ff94219f 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -18,6 +18,7 @@ limitations under the License. package cluster import ( + "fmt" "os" "os/exec" "path" @@ -61,7 +62,7 @@ func (orc *VtorcProcess) Setup() (err error) { orc.proc.Args = append(orc.proc.Args, orc.ExtraArgs...) orc.proc.Args = append(orc.proc.Args, "-alsologtostderr", "http") - errFile, _ := os.Create(path.Join(orc.LogDir, "orc-stderr.txt"+time.Now().String())) + errFile, _ := os.Create(path.Join(orc.LogDir, fmt.Sprintf("orc-stderr-%d.txt", time.Now().UnixNano()))) orc.proc.Stderr = errFile orc.proc.Env = append(orc.proc.Env, os.Environ()...) diff --git a/go/test/endtoend/cluster/vttablet_process.go b/go/test/endtoend/cluster/vttablet_process.go index 8965725bf6b..ceb6368d685 100644 --- a/go/test/endtoend/cluster/vttablet_process.go +++ b/go/test/endtoend/cluster/vttablet_process.go @@ -230,9 +230,9 @@ func (vttablet *VttabletProcess) WaitForTabletTypes(expectedTypes []string) erro // WaitForTabletStatusesForTimeout waits till the tablet reaches to any of the provided statuses func (vttablet *VttabletProcess) WaitForTabletStatusesForTimeout(expectedStatuses []string, timeout time.Duration) error { - timeToWait := time.Now().Add(timeout) + waitUntil := time.Now().Add(timeout) var status string - for time.Now().Before(timeToWait) { + for time.Now().Before(waitUntil) { status = vttablet.GetTabletStatus() if contains(expectedStatuses, status) { return nil @@ -250,9 +250,9 @@ func (vttablet *VttabletProcess) WaitForTabletStatusesForTimeout(expectedStatuse // WaitForTabletTypesForTimeout waits till the tablet reaches to any of the provided types func (vttablet *VttabletProcess) WaitForTabletTypesForTimeout(expectedTypes []string, timeout time.Duration) error { - timeToWait := time.Now().Add(timeout) + waitUntil := time.Now().Add(timeout) var tabletType string - for time.Now().Before(timeToWait) { + for time.Now().Before(waitUntil) { tabletType = vttablet.GetTabletType() if contains(expectedTypes, tabletType) { return nil From 0fd39ae267d71c07093c35a82bd72892feb2fe5b Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Wed, 30 Jun 2021 11:44:03 +0530 Subject: [PATCH 19/25] shutdown and restart vttablets instead of removing them from topology Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 35 +++++++++++----------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index e029d4621db..a26ae7280c9 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -149,8 +149,8 @@ func createVttablets() error { return nil } -// removeVttabletsFromTopology removes all the vttablets from the topology -func removeVttabletsFromTopology() error { +// shutdownVttablets shuts down all the vttablets and removes them from the topology +func shutdownVttablets() error { for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { tabletType := vttablet.VttabletProcess.GetTabletType() if tabletType == "master" { @@ -160,7 +160,11 @@ func removeVttabletsFromTopology() error { } } - err := clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", vttablet.Alias) + err := vttablet.VttabletProcess.TearDown() + if err != nil { + return err + } + err = clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", vttablet.Alias) if err != nil { return err } @@ -204,14 +208,14 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, stopVtorc(t) // remove all the vttablets so that each test can add the amount that they require - err := removeVttabletsFromTopology() + err := shutdownVttablets() require.NoError(t, err) for _, tablet := range replicaTablets { if numReplicasReq == 0 { break } - err = cleanAndAddVttablet(t, tablet) + err = cleanAndStartVttablet(t, tablet) require.NoError(t, err) numReplicasReq-- } @@ -220,7 +224,7 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, if numRdonlyReq == 0 { break } - err = cleanAndAddVttablet(t, tablet) + err = cleanAndStartVttablet(t, tablet) require.NoError(t, err) numRdonlyReq-- } @@ -233,7 +237,7 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, startVtorc(t, orcExtraArgs) } -func cleanAndAddVttablet(t *testing.T, vttablet *cluster.Vttablet) error { +func cleanAndStartVttablet(t *testing.T, vttablet *cluster.Vttablet) error { // remove the database if it exists _, err := runSQL(t, "DROP DATABASE IF EXISTS vt_ks", vttablet, "") require.NoError(t, err) @@ -244,20 +248,9 @@ func cleanAndAddVttablet(t *testing.T, vttablet *cluster.Vttablet) error { _, err = runSQL(t, "RESET MASTER", vttablet, "") require.NoError(t, err) - // add the vttablet to the topology - err = clusterInstance.VtctlclientProcess.ExecuteCommand("InitTablet", - "-port", fmt.Sprintf("%d", vttablet.VttabletProcess.Port), - "-grpc_port", fmt.Sprintf("%d", vttablet.VttabletProcess.GrpcPort), - "-hostname", vttablet.VttabletProcess.TabletHostname, - "-mysql_host", vttablet.VttabletProcess.TabletHostname, - "-mysql_port", fmt.Sprintf("%d", vttablet.MysqlctlProcess.MySQLPort), - "-keyspace", vttablet.VttabletProcess.Keyspace, - "-shard", vttablet.VttabletProcess.Shard, - vttablet.Alias, - vttablet.VttabletProcess.TabletType) - if err != nil { - return err - } + // start the vttablet + err = vttablet.VttabletProcess.Setup() + require.NoError(t, err) clusterInstance.Keyspaces[0].Shards[0].Vttablets = append(clusterInstance.Keyspaces[0].Shards[0].Vttablets, vttablet) return nil From dd2c8415fb02a7140bfd3a283c781e72b4107c4c Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Wed, 30 Jun 2021 13:15:00 +0530 Subject: [PATCH 20/25] move demoting replica tablet to a test only use function Signed-off-by: GuptaManan100 --- go/test/endtoend/cluster/vttablet_process.go | 5 ++ go/test/endtoend/vtorc/vtorc_test.go | 54 +++++++++++++++----- go/vt/vtctl/vtctl.go | 23 --------- go/vt/wrangler/tablet.go | 40 --------------- 4 files changed, 46 insertions(+), 76 deletions(-) diff --git a/go/test/endtoend/cluster/vttablet_process.go b/go/test/endtoend/cluster/vttablet_process.go index ceb6368d685..283ae9a3713 100644 --- a/go/test/endtoend/cluster/vttablet_process.go +++ b/go/test/endtoend/cluster/vttablet_process.go @@ -499,6 +499,11 @@ func (vttablet *VttabletProcess) BulkLoad(t testing.TB, db, table string, bulkIn bufFinish.Sub(bufStart), end.Sub(bufFinish), end.Sub(bufStart)) } +// IsShutdown returns whether a vttablet is shutdown or not +func (vttablet *VttabletProcess) IsShutdown() bool { + return vttablet.proc == nil +} + // VttabletProcessInstance returns a VttabletProcess handle for vttablet process // configured with the given Config. // The process must be manually started by calling setup() diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index a26ae7280c9..a27c2faf6c7 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -25,6 +25,12 @@ import ( "testing" "time" + "vitess.io/vitess/go/vt/topo" + _ "vitess.io/vitess/go/vt/topo/consultopo" + _ "vitess.io/vitess/go/vt/topo/etcd2topo" + _ "vitess.io/vitess/go/vt/topo/k8stopo" + _ "vitess.io/vitess/go/vt/topo/zk2topo" + "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/sqltypes" @@ -42,6 +48,7 @@ import ( var ( clusterInstance *cluster.LocalProcessCluster + ts *topo.Server replicaTablets []*cluster.Vttablet rdonlyTablets []*cluster.Vttablet uidBase = 100 @@ -72,6 +79,8 @@ func createClusterAndStartTopo() error { return err } + // create topo server connection + ts, err = topo.OpenServer(*clusterInstance.TopoFlavorString(), clusterInstance.VtctlProcess.TopoGlobalAddress, clusterInstance.VtctlProcess.TopoGlobalRoot) return err } @@ -151,19 +160,28 @@ func createVttablets() error { // shutdownVttablets shuts down all the vttablets and removes them from the topology func shutdownVttablets() error { + // demote the primary tablet if there is + err := demotePrimaryTablet() + if err != nil { + return err + } + for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - tabletType := vttablet.VttabletProcess.GetTabletType() - if tabletType == "master" { - err := demoteMasterTablet(vttablet) - if err != nil { - return err - } + // if a tablet is already shutdown, we dont need to do anything + if vttablet.VttabletProcess.IsShutdown() { + continue } - + // wait for primary tablet to demote. For all others, it will not wait + err = vttablet.VttabletProcess.WaitForTabletTypes([]string{vttablet.Type}) + if err != nil { + return err + } + // Stop the vttablets err := vttablet.VttabletProcess.TearDown() if err != nil { return err } + // Remove the tablet record for this tablet err = clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", vttablet.Alias) if err != nil { return err @@ -173,14 +191,24 @@ func shutdownVttablets() error { return nil } -func demoteMasterTablet(vttablet *cluster.Vttablet) error { - err := clusterInstance.VtctlclientProcess.ExecuteCommand("DemoteMasterTablet", vttablet.Alias) - if err != nil { +// demotePrimaryTablet demotes the primary tablet for our shard +func demotePrimaryTablet() (err error) { + // lock the shard + ctx, unlock, lockErr := ts.LockShard(context.Background(), keyspaceName, shardName, "demotePrimaryTablet-vtorc-endtoend-test") + if lockErr != nil { + return lockErr + } + defer unlock(&err) + + // update the shard record's master + if _, err = ts.UpdateShardFields(ctx, keyspaceName, shardName, func(si *topo.ShardInfo) error { + si.MasterAlias = nil + si.SetMasterTermStartTime(time.Now()) + return nil + }); err != nil { return err } - - err = vttablet.VttabletProcess.WaitForTabletTypes([]string{vttablet.Type}) - return err + return } // startVtorc is used to start the orchestrator with the given extra arguments diff --git a/go/vt/vtctl/vtctl.go b/go/vt/vtctl/vtctl.go index c251caadf80..41bf6f36d79 100644 --- a/go/vt/vtctl/vtctl.go +++ b/go/vt/vtctl/vtctl.go @@ -169,9 +169,6 @@ var commands = []commandGroup{ {"DeleteTablet", commandDeleteTablet, "[-allow_master] ...", "Deletes tablet(s) from the topology."}, - {"DemoteMasterTablet", commandDemoteMasterTablet, - " ...", - "Demotes master tablet(s) to primary Use with caution. Can leave the shard with no master."}, {"SetReadOnly", commandSetReadOnly, "", "Sets the tablet as read-only."}, @@ -801,26 +798,6 @@ func commandDeleteTablet(ctx context.Context, wr *wrangler.Wrangler, subFlags *f return nil } -func commandDemoteMasterTablet(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { - if err := subFlags.Parse(args); err != nil { - return err - } - if subFlags.NArg() == 0 { - return fmt.Errorf("the argument must be used to specify at least one tablet when calling the DemoteMasterTablet command") - } - - tabletAliases, err := tabletParamsToTabletAliases(subFlags.Args()) - if err != nil { - return err - } - for _, tabletAlias := range tabletAliases { - if err := wr.DemoteMasterTablet(ctx, tabletAlias); err != nil { - return err - } - } - return nil -} - func commandSetReadOnly(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error { if err := subFlags.Parse(args); err != nil { return err diff --git a/go/vt/wrangler/tablet.go b/go/vt/wrangler/tablet.go index a96a8fd7c86..86133199ab4 100644 --- a/go/vt/wrangler/tablet.go +++ b/go/vt/wrangler/tablet.go @@ -153,46 +153,6 @@ func (wr *Wrangler) DeleteTablet(ctx context.Context, tabletAlias *topodatapb.Ta return } -// DemoteMasterTablet demotes a master tablet. -func (wr *Wrangler) DemoteMasterTablet(ctx context.Context, tabletAlias *topodatapb.TabletAlias) (err error) { - // load the tablet, see if we'll need to rebuild - ti, err := wr.ts.GetTablet(ctx, tabletAlias) - if err != nil { - return err - } - - wasMaster, err := wr.isMasterTablet(ctx, ti) - if err != nil { - return err - } - - if !wasMaster { - return fmt.Errorf("tablet %v is not a master tablet", topoproto.TabletAliasString(tabletAlias)) - } - - // update the Shard object if the master was scrapped. - // We lock the shard to not conflict with reparent operations. - ctx, unlock, lockErr := wr.ts.LockShard(ctx, ti.Keyspace, ti.Shard, fmt.Sprintf("DemoteMasterTablet(%v)", topoproto.TabletAliasString(tabletAlias))) - if lockErr != nil { - return lockErr - } - defer unlock(&err) - - // update the shard record's master - if _, err := wr.ts.UpdateShardFields(ctx, ti.Keyspace, ti.Shard, func(si *topo.ShardInfo) error { - if !topoproto.TabletAliasEqual(si.MasterAlias, tabletAlias) { - wr.Logger().Warningf("Deleting master %v from shard %v/%v but master in Shard object was %v", topoproto.TabletAliasString(tabletAlias), ti.Keyspace, ti.Shard, topoproto.TabletAliasString(si.MasterAlias)) - return topo.NewError(topo.NoUpdateNeeded, si.Keyspace()+"/"+si.ShardName()) - } - si.MasterAlias = nil - return nil - }); err != nil { - return err - } - - return -} - // ChangeTabletType changes the type of tablet and recomputes all // necessary derived paths in the serving graph, if necessary. // From 253e4735a9011f1467935285cb035b9259132055 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Wed, 30 Jun 2021 14:43:25 +0530 Subject: [PATCH 21/25] wait for the tablets to come up properly when restarted Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index a27c2faf6c7..95fe267c6ae 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -261,6 +261,19 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, t.Fatalf("more than available tablets requested. Please increase the constants numReplicas or numRdonly") } + // wait for the tablets to come up properly + for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + // Reset status, don't wait for the tablet status. We will check it later + tablet.VttabletProcess.ServingStatus = "" + // Start the tablet + err := tablet.VttabletProcess.Setup() + require.NoError(t, err) + } + for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { + err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"}) + require.NoError(t, err) + } + // start vtorc startVtorc(t, orcExtraArgs) } From 1191dfbdfd250df4d39391c35effed9b1d312af6 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Wed, 30 Jun 2021 16:49:15 +0530 Subject: [PATCH 22/25] bug fix Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 95fe267c6ae..befb4f46b24 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -263,14 +263,11 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, // wait for the tablets to come up properly for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - // Reset status, don't wait for the tablet status. We will check it later - tablet.VttabletProcess.ServingStatus = "" - // Start the tablet - err := tablet.VttabletProcess.Setup() + err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"}) require.NoError(t, err) } for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - err := tablet.VttabletProcess.WaitForTabletStatuses([]string{"SERVING", "NOT_SERVING"}) + err := tablet.VttabletProcess.WaitForTabletTypes([]string{"replica", "rdonly"}) require.NoError(t, err) } From fb5728a6e68e26bab6df129448e918c6f6271703 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Tue, 6 Jul 2021 12:09:19 +0530 Subject: [PATCH 23/25] also delete tablet record after testdownmaster Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 50 +++++++++++++++++----------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index befb4f46b24..497a9b357db 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -167,21 +167,20 @@ func shutdownVttablets() error { } for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - // if a tablet is already shutdown, we dont need to do anything - if vttablet.VttabletProcess.IsShutdown() { - continue - } - // wait for primary tablet to demote. For all others, it will not wait - err = vttablet.VttabletProcess.WaitForTabletTypes([]string{vttablet.Type}) - if err != nil { - return err - } - // Stop the vttablets - err := vttablet.VttabletProcess.TearDown() - if err != nil { - return err + // we need to stop a vttablet only if it is not shutdown + if !vttablet.VttabletProcess.IsShutdown() { + // wait for primary tablet to demote. For all others, it will not wait + err = vttablet.VttabletProcess.WaitForTabletTypes([]string{vttablet.Type}) + if err != nil { + return err + } + // Stop the vttablets + err := vttablet.VttabletProcess.TearDown() + if err != nil { + return err + } + // Remove the tablet record for this tablet } - // Remove the tablet record for this tablet err = clusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", vttablet.Alias) if err != nil { return err @@ -741,12 +740,23 @@ func waitForReplicationToStop(t *testing.T, vttablet *cluster.Vttablet) error { } func validateTopology(t *testing.T, cluster *cluster.LocalProcessCluster, pingTablets bool) { - if pingTablets { - out, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("Validate", "-ping-tablets=true") - require.NoError(t, err, out) - } else { - err := cluster.VtctlclientProcess.ExecuteCommand("Validate") - require.NoError(t, err) + ch := make(chan interface{}) + go func() { + if pingTablets { + out, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("Validate", "-ping-tablets=true") + require.NoError(t, err, out) + } else { + err := cluster.VtctlclientProcess.ExecuteCommand("Validate") + require.NoError(t, err) + } + ch <- true + }() + + select { + case <-ch: + return + case <-time.After(120 * time.Second): + t.Fatal("time out waiting for validation to finish") } } From 18f01b16cb233a92ef2fa18668c2a97b84c85d98 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Wed, 7 Jul 2021 16:24:35 +0530 Subject: [PATCH 24/25] drop _vt database too Signed-off-by: GuptaManan100 --- go/test/endtoend/vtorc/vtorc_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/go/test/endtoend/vtorc/vtorc_test.go b/go/test/endtoend/vtorc/vtorc_test.go index 497a9b357db..9f005d6311a 100644 --- a/go/test/endtoend/vtorc/vtorc_test.go +++ b/go/test/endtoend/vtorc/vtorc_test.go @@ -275,9 +275,11 @@ func setupVttabletsAndVtorc(t *testing.T, numReplicasReq int, numRdonlyReq int, } func cleanAndStartVttablet(t *testing.T, vttablet *cluster.Vttablet) error { - // remove the database if it exists + // remove the databases if they exist _, err := runSQL(t, "DROP DATABASE IF EXISTS vt_ks", vttablet, "") require.NoError(t, err) + _, err = runSQL(t, "DROP DATABASE IF EXISTS _vt", vttablet, "") + require.NoError(t, err) // stop the replication _, err = runSQL(t, "STOP SLAVE", vttablet, "") require.NoError(t, err) From 382d13fe97b29e5394356965ffee9277acee7d55 Mon Sep 17 00:00:00 2001 From: GuptaManan100 Date: Thu, 8 Jul 2021 08:31:50 +0530 Subject: [PATCH 25/25] revert unnecessary change Signed-off-by: GuptaManan100 --- go/vt/wrangler/tablet.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/wrangler/tablet.go b/go/vt/wrangler/tablet.go index 30408ea8930..9f7c49b679f 100644 --- a/go/vt/wrangler/tablet.go +++ b/go/vt/wrangler/tablet.go @@ -151,7 +151,7 @@ func (wr *Wrangler) DeleteTablet(ctx context.Context, tabletAlias *topodatapb.Ta return err } - return + return nil } // ChangeTabletType changes the type of tablet and recomputes all