From 227e66edf59552850cf6567c205837dfbe8ca3cf Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Wed, 19 Jul 2023 13:50:31 +0530 Subject: [PATCH] [release-17.0] Deflake `TestPlannedReparentShardPromoteReplicaFail` (#13548) (#13550) Signed-off-by: Manan Gupta Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com> --- .../testlib/planned_reparent_shard_test.go | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/go/vt/wrangler/testlib/planned_reparent_shard_test.go b/go/vt/wrangler/testlib/planned_reparent_shard_test.go index 80a067f221a..fba7117fe68 100644 --- a/go/vt/wrangler/testlib/planned_reparent_shard_test.go +++ b/go/vt/wrangler/testlib/planned_reparent_shard_test.go @@ -799,6 +799,9 @@ func TestPlannedReparentShardPromoteReplicaFail(t *testing.T) { oldPrimary.FakeMysqlDaemon.CurrentPrimaryPosition = newPrimary.FakeMysqlDaemon.WaitPrimaryPositions[0] oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs = append(oldPrimary.FakeMysqlDaemon.SetReplicationSourceInputs, topoproto.MysqlAddr(newPrimary.Tablet)) oldPrimary.FakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "FAKE SET MASTER", + "START SLAVE", + // We call a SetReplicationSource explicitly "FAKE SET MASTER", "START SLAVE", // extra SetReplicationSource call due to retry @@ -857,6 +860,13 @@ func TestPlannedReparentShardPromoteReplicaFail(t *testing.T) { assert.True(t, newPrimary.FakeMysqlDaemon.ReadOnly, "newPrimary.FakeMysqlDaemon.ReadOnly") assert.True(t, oldPrimary.FakeMysqlDaemon.ReadOnly, "oldPrimary.FakeMysqlDaemon.ReadOnly") + // After the first call to PRS has failed, we don't know whether `SetReplicationSource` RPC has succeeded on the oldPrimary or not. + // This causes the test to become non-deterministic. To prevent this, we call `SetReplicationSource` on the oldPrimary again, and make sure it has succeeded. 
+ // We also wait until the oldPrimary has demoted itself to a replica type. + err = wr.TabletManagerClient().SetReplicationSource(context.Background(), oldPrimary.Tablet, newPrimary.Tablet.Alias, 0, "", false, false) + require.NoError(t, err) + waitForTabletType(t, wr, oldPrimary.Tablet.Alias, topodatapb.TabletType_REPLICA) + // retrying should work newPrimary.FakeMysqlDaemon.PromoteError = nil newPrimary.FakeMysqlDaemon.CurrentPrimaryPosition = newPrimary.FakeMysqlDaemon.WaitPrimaryPositions[0] @@ -870,6 +880,26 @@ func TestPlannedReparentShardPromoteReplicaFail(t *testing.T) { assert.True(t, oldPrimary.FakeMysqlDaemon.ReadOnly, "oldPrimary.FakeMysqlDaemon.ReadOnly") } +// waitForTabletType polls the topo server every 100ms until the tablet with the given alias has the given tablet type, failing the test if a lookup errors or the type is not reached within 15 seconds. +func waitForTabletType(t *testing.T, wr *wrangler.Wrangler, tabletAlias *topodatapb.TabletAlias, tabletType topodatapb.TabletType) { + timeout := time.After(15 * time.Second) + for { + tablet, err := wr.TopoServer().GetTablet(context.Background(), tabletAlias) + require.NoError(t, err) + if tablet.Type == tabletType { + return + } + + select { + case <-timeout: + t.Fatalf("%s didn't reach the tablet type %v", topoproto.TabletAliasString(tabletAlias), tabletType.String()) + return + default: + time.Sleep(100 * time.Millisecond) + } + } +} + // TestPlannedReparentShardSamePrimary tests PRS with oldPrimary works correctly // Simulate failure of previous PRS and oldPrimary is ReadOnly // Verify that primary correctly gets set to ReadWrite