From 6b20304975658393e74ede45e93c24f7f7c91b84 Mon Sep 17 00:00:00 2001
From: Manan Gupta
Date: Thu, 13 Jul 2023 12:43:37 +0530
Subject: [PATCH 1/4] feat: deflake TestDownPrimary by reducing the wait_replicas_timeout value

Signed-off-by: Manan Gupta
---
 .../endtoend/vtorc/primaryfailure/primary_failure_test.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
index 9b85895c66d..bcc5cce3565 100644
--- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
+++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
@@ -39,7 +39,10 @@ import (
 // Also tests that VTOrc can handle multiple failures, if the durability policies allow it
 func TestDownPrimary(t *testing.T) {
 	defer cluster.PanicHandler(t)
-	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{
+	// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
+	// If that replica is more advanced than the same-cell replica, then we try to promote the cross-cell replica as an intermediate source.
+	// If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead primary to respond, failing this test.
+	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
 	}, 1, "semi_sync")
 	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]

From 3bba79a82e8100cd658182e6df662214f04bb34c Mon Sep 17 00:00:00 2001
From: Manan Gupta
Date: Thu, 13 Jul 2023 13:13:05 +0530
Subject: [PATCH 2/4] feat: print the VTOrc logs when a test fails

Signed-off-by: Manan Gupta
---
 go/test/endtoend/vtorc/general/vtorc_test.go |  9 ++++++++
 .../primaryfailure/primary_failure_test.go   | 11 +++++++++
 .../vtorc/readtopologyinstance/main_test.go  |  1 +
 go/test/endtoend/vtorc/utils/utils.go        | 23 +++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go
index 91cfc1d91ac..daea2f37a1b 100644
--- a/go/test/endtoend/vtorc/general/vtorc_test.go
+++ b/go/test/endtoend/vtorc/general/vtorc_test.go
@@ -37,6 +37,7 @@ import (
 // verify replication is setup
 // verify that with multiple vtorc instances, we still only have 1 PlannedReparentShard call
 func TestPrimaryElection(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -64,6 +65,7 @@ func TestPrimaryElection(t *testing.T) {
 // verify rdonly is not elected, only replica
 // verify replication is setup
 func TestSingleKeyspace(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks"}, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -81,6 +83,7 @@ func TestSingleKeyspace(t *testing.T) {
 // verify rdonly is not elected, only replica
 // verify replication is setup
 func TestKeyspaceShard(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks/0"}, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -100,6 +103,7 @@ func TestKeyspaceShard(t *testing.T) {
 // 4. setup replication from non-primary, let vtorc repair
 // 5. make instance A replicate from B and B from A, wait for repair
 func TestVTOrcRepairs(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -216,6 +220,7 @@ func TestVTOrcRepairs(t *testing.T) {
 func TestRepairAfterTER(t *testing.T) {
 	// test fails intermittently on CI, skip until it can be fixed.
 	t.SkipNow()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -252,6 +257,7 @@ func TestSemiSync(t *testing.T) {
 	// stop any vtorc instance running due to a previous test.
 	utils.StopVTOrcs(t, clusterInfo)
 	newCluster := utils.SetupNewClusterSemiSync(t)
+	defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance)
 	utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
 	}, 1)
@@ -316,6 +322,7 @@ func TestSemiSync(t *testing.T) {
 
 // TestVTOrcWithPrs tests that VTOrc works fine even when PRS is called from vtctld
 func TestVTOrcWithPrs(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -364,6 +371,7 @@ func TestVTOrcWithPrs(t *testing.T) {
 
 // TestMultipleDurabilities tests that VTOrc works with 2 keyspaces having 2 different durability policies
 func TestMultipleDurabilities(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	// Setup a normal cluster and start vtorc
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{}, 1, "")
@@ -388,6 +396,7 @@ func TestDurabilityPolicySetLater(t *testing.T) {
 	// stop any vtorc instance running due to a previous test.
 	utils.StopVTOrcs(t, clusterInfo)
 	newCluster := utils.SetupNewClusterSemiSync(t)
+	defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance)
 	keyspace := &newCluster.ClusterInstance.Keyspaces[0]
 	shard0 := &keyspace.Shards[0]
 	// Before starting VTOrc we explicitly want to set the durability policy of the keyspace to an empty string
diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
index bcc5cce3565..e924846427c 100644
--- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
+++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
@@ -38,6 +38,7 @@ import (
 // covers the test case master-failover from orchestrator
 // Also tests that VTOrc can handle multiple failures, if the durability policies allow it
 func TestDownPrimary(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
 	// If that replica is more advanced than the same-cell replica, then we try to promote the cross-cell replica as an intermediate source.
@@ -102,6 +103,7 @@ func TestDownPrimary(t *testing.T) {
 
 // bring down primary before VTOrc has started, let vtorc repair.
 func TestDownPrimaryBeforeVTOrc(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 0, "none")
 	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
@@ -158,6 +160,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) {
 // Reason is, unlike other recoveries, in DeadPrimary we don't call DiscoverInstance since we know
 // that the primary is unreachable. This helps us save a few seconds depending on the value of the `RemoteOperationTimeout` flag.
 func TestDeadPrimaryRecoversImmediately(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
 	// If that replica is more advanced than the same-cell replica, then we try to promote the cross-cell replica as an intermediate source.
@@ -238,6 +241,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
 // Failover should not be cross data centers, according to the configuration file
 // covers part of the test case master-failover-lost-replicas from orchestrator
 func TestCrossDataCenterFailure(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -283,6 +287,7 @@ func TestCrossDataCenterFailure(t *testing.T) {
 // Failover should not be cross data centers, according to the configuration file
 // In case of no viable candidates, we should error out
 func TestCrossDataCenterFailureError(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -329,6 +334,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
 	// Earlier any replicas that were not able to replicate from the previous primary
 	// were detected by vtorc and could be configured to have their sources detached
 	t.Skip()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 2, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -410,6 +416,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
 // This test checks that the promotion of a tablet succeeds if it passes the promotion lag test
 // covers the test case master-failover-fail-promotion-lag-minutes-success from orchestrator
 func TestPromotionLagSuccess(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		ReplicationLagQuery: "select 59",
@@ -459,6 +466,7 @@ func TestPromotionLagFailure(t *testing.T) {
 	// Earlier vtorc used to check that the promotion lag between the new primary and the old one
 	// was smaller than the configured value, otherwise it would fail the promotion
 	t.Skip()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 1, nil, cluster.VTOrcConfiguration{
 		ReplicationLagQuery: "select 61",
@@ -511,6 +519,7 @@ func TestPromotionLagFailure(t *testing.T) {
 // We explicitly set one of the replicas to Prefer promotion rule.
 // That is the replica which should be promoted in case of primary failure
 func TestDownPrimaryPromotionRule(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		LockShardTimeoutSeconds: 5,
@@ -558,6 +567,7 @@ func TestDownPrimaryPromotionRule(t *testing.T) {
 // That is the replica which should be promoted in case of primary failure
 // It should also be caught up when it is promoted
 func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		LockShardTimeoutSeconds: 5,
@@ -637,6 +647,7 @@ func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
 // We let a replica in our own cell lag. That is the replica which should be promoted in case of primary failure
 // It should also be caught up when it is promoted
 func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		LockShardTimeoutSeconds: 5,
diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
index e9756ad5d2b..7fbe7283fd1 100644
--- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
+++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
@@ -41,6 +41,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
 	defer func() {
 		clusterInfo.ClusterInstance.Teardown()
 	}()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
 	shard0 := &keyspace.Shards[0]
 	oldArgs := os.Args
diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go
index 4d7352fd2a8..75863de9de2 100644
--- a/go/test/endtoend/vtorc/utils/utils.go
+++ b/go/test/endtoend/vtorc/utils/utils.go
@@ -988,3 +988,26 @@ func WaitForInstancePollSecondsExceededCount(t *testing.T, vtorcInstance *cluste
 	}
 	assert.Fail(t, "invalid response from api/aggregated-discovery-metrics")
 }
+
+// PrintVTOrcLogsOnFailure prints the VTOrc logs on failure of the test.
+// This function is supposed to be called as the first defer command from the vtorc tests.
+func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcessCluster) {
+	// If the test has not failed, then we don't need to print anything.
+	if !t.Failed() {
+		return
+	}
+
+	log.Errorf("Printing VTOrc logs")
+	for _, vtorc := range clusterInstance.VTOrcProcesses {
+		if vtorc == nil || vtorc.LogFileName == "" {
+			continue
+		}
+		filePath := path.Join(vtorc.LogDir, vtorc.LogFileName)
+		log.Errorf("Printing file - %s", filePath)
+		content, err := os.ReadFile(filePath)
+		if err != nil {
+			log.Errorf("Error while reading the file - %v", err)
+		}
+		log.Errorf("%s", string(content))
+	}
+}

From 31c011b279c34e32723d30362da0c46bb0d97b55 Mon Sep 17 00:00:00 2001
From: Manan Gupta
Date: Fri, 14 Jul 2023 21:07:15 +0530
Subject: [PATCH 3/4] feat: fix flakiness in TestReadTopologyInstanceBufferable

Signed-off-by: Manan Gupta
---
 go/test/endtoend/vtorc/readtopologyinstance/main_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
index 7fbe7283fd1..e3b55d64c6b 100644
--- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
+++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
@@ -159,7 +159,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
 	assert.Equal(t, replicaInstance.ReadBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile)
 	assert.Greater(t, replicaInstance.ReadBinlogCoordinates.LogPos, uint32(0))
 	assert.Equal(t, replicaInstance.ExecBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile)
-	assert.LessOrEqual(t, replicaInstance.ExecBinlogCoordinates.LogPos, replicaInstance.ReadBinlogCoordinates.LogPos)
+	assert.Greater(t, replicaInstance.ExecBinlogCoordinates.LogPos, uint32(0))
 	assert.Contains(t, replicaInstance.RelaylogCoordinates.LogFile, fmt.Sprintf("vt-0000000%d-relay", replica.TabletUID))
 	assert.Greater(t, replicaInstance.RelaylogCoordinates.LogPos, uint32(0))
 	assert.Empty(t, replicaInstance.LastIOError)

From 8f8c878f6e5e1aaf139d9d3472df31e81530ca67 Mon Sep 17 00:00:00 2001
From: Manan Gupta
Date: Tue, 18 Jul 2023 11:42:43 +0530
Subject: [PATCH 4/4] feat: fix data race flakiness.

Signed-off-by: Manan Gupta
---
 go/vt/vtorc/inst/instance_dao.go      |  3 +++
 go/vt/vtorc/inst/instance_dao_test.go | 20 ++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go
index ccb4e84fe86..020f3d103a4 100644
--- a/go/vt/vtorc/inst/instance_dao.go
+++ b/go/vt/vtorc/inst/instance_dao.go
@@ -25,6 +25,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/patrickmn/go-cache"
@@ -71,6 +72,7 @@ var backendWrites = collection.CreateOrReturnCollection("BACKEND_WRITES")
 var writeBufferLatency = stopwatch.NewNamedStopwatch()
 
 var emptyQuotesRegexp = regexp.MustCompile(`^""$`)
+var cacheInitializationCompleted atomic.Bool
 
 func init() {
 	_ = metrics.Register("instance.access_denied", accessDeniedCounter)
@@ -91,6 +93,7 @@ func init() {
 func initializeInstanceDao() {
 	config.WaitForConfigurationToBeLoaded()
 	forgetAliases = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second)
+	cacheInitializationCompleted.Store(true)
 }
 
 // ExecDBWriteFunc chooses how to execute a write onto the database: whether synchronously or not
diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go
index 10ea74616ba..63cef97601a 100644
--- a/go/vt/vtorc/inst/instance_dao_test.go
+++ b/go/vt/vtorc/inst/instance_dao_test.go
@@ -416,12 +416,18 @@ func TestReadOutdatedInstanceKeys(t *testing.T) {
 		},
 	}
 
+	// wait for the forgetAliases cache to be initialized to prevent a data race.
+	waitForCacheInitialization()
+
 	// We are setting InstancePollSeconds to 59 minutes, just for the test.
 	oldVal := config.Config.InstancePollSeconds
+	oldCache := forgetAliases
 	defer func() {
+		forgetAliases = oldCache
 		config.Config.InstancePollSeconds = oldVal
 	}()
 	config.Config.InstancePollSeconds = 60 * 59
+	forgetAliases = cache.New(time.Minute, time.Minute)
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -569,6 +575,9 @@ func TestForgetInstanceAndInstanceIsForgotten(t *testing.T) {
 		},
 	}
 
+	// wait for the forgetAliases cache to be initialized to prevent a data race.
+	waitForCacheInitialization()
+
 	oldCache := forgetAliases
 	// Clear the database after the test. The easiest way to do that is to run all the initialization commands again.
 	defer func() {
@@ -628,3 +637,14 @@ func TestSnapshotTopologies(t *testing.T) {
 
 	require.Equal(t, []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, tabletAliases)
 }
+
+// waitForCacheInitialization waits for the cache to be initialized to prevent a data race in tests
+// that alter the cache or depend on its behaviour.
+func waitForCacheInitialization() {
+	for {
+		if cacheInitializationCompleted.Load() {
+			return
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
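A note on the defer ordering used throughout PATCH 2: Go runs deferred calls in last-in-first-out order, so deferring utils.PrintVTOrcLogsOnFailure before cluster.PanicHandler makes the log dump run last, after the panic handler has already recovered any panic and failed the test, so t.Failed() reflects the final verdict. Below is a minimal sketch of how a new test in the general package would adopt the pattern; TestExample and its body are hypothetical, and only the two defer lines and the setup call mirror the series.

package general

import (
	"testing"

	"vitess.io/vitess/go/test/endtoend/cluster"
	"vitess.io/vitess/go/test/endtoend/vtorc/utils"
)

// TestExample is a hypothetical test showing the convention from this
// series: PrintVTOrcLogsOnFailure is deferred first, so it runs last,
// once t.Failed() reports the test's final state (including failures
// recorded by PanicHandler). clusterInfo is the package-level cluster
// variable these tests already share.
func TestExample(t *testing.T) {
	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
		PreventCrossDataCenterPrimaryFailover: true,
	}, 1, "semi_sync")
	// ... exercise the cluster here; any failure recorded by the time the
	// test returns triggers the VTOrc log dump ...
}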
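The race fix in PATCH 4 is an instance of a more general initialization-gate pattern: the initializer builds shared state and only then publishes completion through a sync/atomic flag, while anyone who wants to touch that state first spin-waits on the flag. A standalone sketch of the pattern under illustrative names (initCompleted, sharedCache, and both helpers are not from the VTOrc code):

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

var (
	// initCompleted plays the role of cacheInitializationCompleted:
	// stored exactly once by the initializer, polled by waiters.
	initCompleted atomic.Bool
	sharedCache   map[string]string
)

// initialize stands in for initializeInstanceDao: build the shared state
// first, then publish completion with an atomic store.
func initialize() {
	sharedCache = map[string]string{"state": "ready"}
	initCompleted.Store(true)
}

// waitForInitialization mirrors waitForCacheInitialization from the test
// file: poll the flag until the initializer has published completion.
func waitForInitialization() {
	for !initCompleted.Load() {
		time.Sleep(100 * time.Millisecond)
	}
}

func main() {
	go initialize()
	waitForInitialization()
	// Safe: the Load that observed true synchronizes with the Store, so
	// the write to sharedCache is visible here without further locking.
	fmt.Println(sharedCache["state"])
}

The ordering is the essential part: the Store(true) must come after the state is fully built, exactly as PATCH 4 places cacheInitializationCompleted.Store(true) after the forgetAliases assignment in initializeInstanceDao.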