From 6b20304975658393e74ede45e93c24f7f7c91b84 Mon Sep 17 00:00:00 2001
From: Manan Gupta
Date: Thu, 13 Jul 2023 12:43:37 +0530
Subject: [PATCH 1/4] feat: deflake TestDownPrimary by reducing the wait_replicas_timeout value

Signed-off-by: Manan Gupta
---
 .../endtoend/vtorc/primaryfailure/primary_failure_test.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
index 9b85895c66d..bcc5cce3565 100644
--- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
+++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
@@ -39,7 +39,10 @@ import (
 // Also tests that VTOrc can handle multiple failures, if the durability policies allow it
 func TestDownPrimary(t *testing.T) {
 	defer cluster.PanicHandler(t)
-	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{
+	// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
+	// If that replica is more advanced than the same-cell replica, then we try to promote the cross-cell replica as an intermediate source.
+	// If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead primary to respond, failing this test.
+	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
 	}, 1, "semi_sync")
 	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]

From 3bba79a82e8100cd658182e6df662214f04bb34c Mon Sep 17 00:00:00 2001
From: Manan Gupta
Date: Thu, 13 Jul 2023 13:13:05 +0530
Subject: [PATCH 2/4] feat: print the VTOrc logs when a test fails

Signed-off-by: Manan Gupta
---
 go/test/endtoend/vtorc/general/vtorc_test.go |  9 ++++++++
 .../primaryfailure/primary_failure_test.go   | 11 +++++++++
 .../vtorc/readtopologyinstance/main_test.go  |  1 +
 go/test/endtoend/vtorc/utils/utils.go        | 23 +++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go
index 91cfc1d91ac..daea2f37a1b 100644
--- a/go/test/endtoend/vtorc/general/vtorc_test.go
+++ b/go/test/endtoend/vtorc/general/vtorc_test.go
@@ -37,6 +37,7 @@ import (
 // verify replication is setup
 // verify that with multiple vtorc instances, we still only have 1 PlannedReparentShard call
 func TestPrimaryElection(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -64,6 +65,7 @@ func TestPrimaryElection(t *testing.T) {
 // verify rdonly is not elected, only replica
 // verify replication is setup
 func TestSingleKeyspace(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks"}, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -81,6 +83,7 @@ func TestSingleKeyspace(t *testing.T) {
 // verify rdonly is not elected, only replica
 // verify replication is setup
 func TestKeyspaceShard(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks/0"}, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -100,6 +103,7 @@ func TestKeyspaceShard(t *testing.T) {
 // 4. setup replication from non-primary, let vtorc repair
 // 5. make instance A replicate from B and B from A, wait for repair
 func TestVTOrcRepairs(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -216,6 +220,7 @@ func TestVTOrcRepairs(t *testing.T) {
 func TestRepairAfterTER(t *testing.T) {
 	// test fails intermittently on CI, skip until it can be fixed.
 	t.SkipNow()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -252,6 +257,7 @@ func TestSemiSync(t *testing.T) {
 	// stop any vtorc instance running due to a previous test.
 	utils.StopVTOrcs(t, clusterInfo)
 	newCluster := utils.SetupNewClusterSemiSync(t)
+	defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance)
 	utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
 	}, 1)
@@ -316,6 +322,7 @@ func TestSemiSync(t *testing.T) {
 
 // TestVTOrcWithPrs tests that VTOrc works fine even when PRS is called from vtctld
 func TestVTOrcWithPrs(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -364,6 +371,7 @@ func TestVTOrcWithPrs(t *testing.T) {
 
 // TestMultipleDurabilities tests that VTOrc works with 2 keyspaces having 2 different durability policies
 func TestMultipleDurabilities(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	// Setup a normal cluster and start vtorc
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{}, 1, "")
@@ -388,6 +396,7 @@ func TestDurabilityPolicySetLater(t *testing.T) {
 	// stop any vtorc instance running due to a previous test.
 	utils.StopVTOrcs(t, clusterInfo)
 	newCluster := utils.SetupNewClusterSemiSync(t)
+	defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance)
 	keyspace := &newCluster.ClusterInstance.Keyspaces[0]
 	shard0 := &keyspace.Shards[0]
 	// Before starting VTOrc we explicitly want to set the durability policy of the keyspace to an empty string
diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
index bcc5cce3565..e924846427c 100644
--- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
+++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
@@ -38,6 +38,7 @@ import (
 // covers the test case master-failover from orchestrator
 // Also tests that VTOrc can handle multiple failures, if the durability policies allow it
 func TestDownPrimary(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
 	// If that replica is more advanced than the same-cell replica, then we try to promote the cross-cell replica as an intermediate source.
@@ -102,6 +103,7 @@ func TestDownPrimary(t *testing.T) {
 
 // bring down primary before VTOrc has started, let vtorc repair.
 func TestDownPrimaryBeforeVTOrc(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 0, "none")
 	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
@@ -158,6 +160,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) {
 // Reason is, unlike other recoveries, in DeadPrimary we don't call DiscoverInstance since we know
 // that the primary is unreachable. This helps us save a few seconds depending on the value of the `RemoteOperationTimeout` flag.
 func TestDeadPrimaryRecoversImmediately(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
 	// If that replica is more advanced than the same-cell replica, then we try to promote the cross-cell replica as an intermediate source.
@@ -238,6 +241,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
 // Failover should not be cross data centers, according to the configuration file
 // covers part of the test case master-failover-lost-replicas from orchestrator
 func TestCrossDataCenterFailure(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -283,6 +287,7 @@ func TestCrossDataCenterFailure(t *testing.T) {
 // Failover should not be cross data centers, according to the configuration file
 // In case of no viable candidates, we should error out
 func TestCrossDataCenterFailureError(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -329,6 +334,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
 	// Earlier any replicas that were not able to replicate from the previous primary
 	// were detected by vtorc and could be configured to have their sources detached
 	t.Skip()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 2, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -410,6 +416,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
 // This test checks that the promotion of a tablet succeeds if it passes the promotion lag test
 // covers the test case master-failover-fail-promotion-lag-minutes-success from orchestrator
 func TestPromotionLagSuccess(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		ReplicationLagQuery: "select 59",
@@ -459,6 +466,7 @@ func TestPromotionLagFailure(t *testing.T) {
 	// Earlier vtorc used to check that the promotion lag between the new primary and the old one
 	// was smaller than the configured value, otherwise it would fail the promotion
 	t.Skip()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 1, nil, cluster.VTOrcConfiguration{
 		ReplicationLagQuery: "select 61",
@@ -511,6 +519,7 @@ func TestPromotionLagFailure(t *testing.T) {
 // We explicitly set one of the replicas to Prefer promotion rule.
 // That is the replica which should be promoted in case of primary failure
 func TestDownPrimaryPromotionRule(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		LockShardTimeoutSeconds: 5,
@@ -558,6 +567,7 @@ func TestDownPrimaryPromotionRule(t *testing.T) {
 // That is the replica which should be promoted in case of primary failure
 // It should also be caught up when it is promoted
 func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		LockShardTimeoutSeconds: 5,
@@ -637,6 +647,7 @@ func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
 // We let a replica in our own cell lag. That is the replica which should be promoted in case of primary failure
 // It should also be caught up when it is promoted
 func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		LockShardTimeoutSeconds: 5,
diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
index e9756ad5d2b..7fbe7283fd1 100644
--- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
+++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
@@ -41,6 +41,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
 	defer func() {
 		clusterInfo.ClusterInstance.Teardown()
 	}()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
 	shard0 := &keyspace.Shards[0]
 	oldArgs := os.Args
diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go
index 4d7352fd2a8..75863de9de2 100644
--- a/go/test/endtoend/vtorc/utils/utils.go
+++ b/go/test/endtoend/vtorc/utils/utils.go
@@ -988,3 +988,26 @@ func WaitForInstancePollSecondsExceededCount(t *testing.T, vtorcInstance *cluste
 	}
 	assert.Fail(t, "invalid response from api/aggregated-discovery-metrics")
 }
+
+// PrintVTOrcLogsOnFailure prints the VTOrc logs on failure of the test.
+// This function is supposed to be called as the first defer command from the vtorc tests.
+func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcessCluster) {
+	// If the test has not failed, then we don't need to print anything.
+	if !t.Failed() {
+		return
+	}
+
+	log.Errorf("Printing VTOrc logs")
+	for _, vtorc := range clusterInstance.VTOrcProcesses {
+		if vtorc == nil || vtorc.LogFileName == "" {
+			continue
+		}
+		filePath := path.Join(vtorc.LogDir, vtorc.LogFileName)
+		log.Errorf("Printing file - %s", filePath)
+		content, err := os.ReadFile(filePath)
+		if err != nil {
+			log.Errorf("Error while reading the file - %v", err)
+		}
+		log.Errorf("%s", string(content))
+	}
+}

From 31c011b279c34e32723d30362da0c46bb0d97b55 Mon Sep 17 00:00:00 2001
From: Manan Gupta
Date: Fri, 14 Jul 2023 21:07:15 +0530
Subject: [PATCH 3/4] feat: fix flakiness in TestReadTopologyInstanceBufferable

Signed-off-by: Manan Gupta
---
 go/test/endtoend/vtorc/readtopologyinstance/main_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
index 7fbe7283fd1..e3b55d64c6b 100644
--- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
+++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
@@ -159,7 +159,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
 	assert.Equal(t, replicaInstance.ReadBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile)
 	assert.Greater(t, replicaInstance.ReadBinlogCoordinates.LogPos, uint32(0))
 	assert.Equal(t, replicaInstance.ExecBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile)
-	assert.LessOrEqual(t, replicaInstance.ExecBinlogCoordinates.LogPos, replicaInstance.ReadBinlogCoordinates.LogPos)
+	assert.Greater(t, replicaInstance.ExecBinlogCoordinates.LogPos, uint32(0))
 	assert.Contains(t, replicaInstance.RelaylogCoordinates.LogFile, fmt.Sprintf("vt-0000000%d-relay", replica.TabletUID))
 	assert.Greater(t, replicaInstance.RelaylogCoordinates.LogPos, uint32(0))
 	assert.Empty(t, replicaInstance.LastIOError)

From 8f8c878f6e5e1aaf139d9d3472df31e81530ca67 Mon Sep 17 00:00:00 2001
From: Manan Gupta
Date: Tue, 18 Jul 2023 11:42:43 +0530
Subject: [PATCH 4/4] feat: fix data race flakiness.

Signed-off-by: Manan Gupta
---
 go/vt/vtorc/inst/instance_dao.go      |  3 +++
 go/vt/vtorc/inst/instance_dao_test.go | 20 ++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go
index ccb4e84fe86..020f3d103a4 100644
--- a/go/vt/vtorc/inst/instance_dao.go
+++ b/go/vt/vtorc/inst/instance_dao.go
@@ -25,6 +25,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/patrickmn/go-cache"
@@ -71,6 +72,7 @@ var backendWrites = collection.CreateOrReturnCollection("BACKEND_WRITES")
 var writeBufferLatency = stopwatch.NewNamedStopwatch()
 
 var emptyQuotesRegexp = regexp.MustCompile(`^""$`)
+var cacheInitializationCompleted atomic.Bool
 
 func init() {
 	_ = metrics.Register("instance.access_denied", accessDeniedCounter)
@@ -91,6 +93,7 @@ func init() {
 func initializeInstanceDao() {
 	config.WaitForConfigurationToBeLoaded()
 	forgetAliases = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second)
+	cacheInitializationCompleted.Store(true)
 }
 
 // ExecDBWriteFunc chooses how to execute a write onto the database: whether synchronously or not
diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go
index 10ea74616ba..63cef97601a 100644
--- a/go/vt/vtorc/inst/instance_dao_test.go
+++ b/go/vt/vtorc/inst/instance_dao_test.go
@@ -416,12 +416,18 @@ func TestReadOutdatedInstanceKeys(t *testing.T) {
 		},
 	}
 
+	// wait for the forgetAliases cache to be initialized to prevent a data race.
+	waitForCacheInitialization()
+
 	// We are setting InstancePollSeconds to 59 minutes, just for the test.
 	oldVal := config.Config.InstancePollSeconds
+	oldCache := forgetAliases
 	defer func() {
+		forgetAliases = oldCache
 		config.Config.InstancePollSeconds = oldVal
 	}()
 	config.Config.InstancePollSeconds = 60 * 59
+	forgetAliases = cache.New(time.Minute, time.Minute)
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -569,6 +575,9 @@ func TestForgetInstanceAndInstanceIsForgotten(t *testing.T) {
 		},
 	}
 
+	// wait for the forgetAliases cache to be initialized to prevent a data race.
+	waitForCacheInitialization()
+
 	oldCache := forgetAliases
 	// Clear the database after the test. The easiest way to do that is to run all the initialization commands again.
 	defer func() {
@@ -628,3 +637,14 @@ func TestSnapshotTopologies(t *testing.T) {
 
 	require.Equal(t, []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, tabletAliases)
 }
+
+// waitForCacheInitialization waits for the cache to be initialized to prevent a data race in tests
+// that alter the cache or depend on its behaviour.
+func waitForCacheInitialization() {
+	for {
+		if cacheInitializationCompleted.Load() {
+			return
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
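A note on the defer ordering used throughout PATCH 2: Go runs deferred calls in last-in-first-out order, so deferring utils.PrintVTOrcLogsOnFailure before cluster.PanicHandler makes the log dump run last, after the panic handler has already recovered any panic and failed the test, so t.Failed() reflects the final verdict. Below is a minimal sketch of how a new test in the general package would adopt the pattern; TestExample and its body are hypothetical, and only the two defer lines and the setup call mirror the series.

package general

import (
	"testing"

	"vitess.io/vitess/go/test/endtoend/cluster"
	"vitess.io/vitess/go/test/endtoend/vtorc/utils"
)

// TestExample is a hypothetical test showing the convention from this
// series: PrintVTOrcLogsOnFailure is deferred first, so it runs last,
// once t.Failed() reports the test's final state (including failures
// recorded by PanicHandler). clusterInfo is the package-level cluster
// variable these tests already share.
func TestExample(t *testing.T) {
	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
		PreventCrossDataCenterPrimaryFailover: true,
	}, 1, "semi_sync")
	// ... exercise the cluster here; any failure recorded by the time the
	// test returns triggers the VTOrc log dump ...
}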
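The race fix in PATCH 4 is an instance of a more general initialization-gate pattern: the initializer builds shared state and only then publishes completion through a sync/atomic flag, while anyone who wants to touch that state first spin-waits on the flag. A standalone sketch of the pattern under illustrative names (initCompleted, sharedCache, and both helpers are not from the VTOrc code):

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

var (
	// initCompleted plays the role of cacheInitializationCompleted:
	// stored exactly once by the initializer, polled by waiters.
	initCompleted atomic.Bool
	sharedCache   map[string]string
)

// initialize stands in for initializeInstanceDao: build the shared state
// first, then publish completion with an atomic store.
func initialize() {
	sharedCache = map[string]string{"state": "ready"}
	initCompleted.Store(true)
}

// waitForInitialization mirrors waitForCacheInitialization from the test
// file: poll the flag until the initializer has published completion.
func waitForInitialization() {
	for !initCompleted.Load() {
		time.Sleep(100 * time.Millisecond)
	}
}

func main() {
	go initialize()
	waitForInitialization()
	// Safe: the Load that observed true synchronizes with the Store, so
	// the write to sharedCache is visible here without further locking.
	fmt.Println(sharedCache["state"])
}

The ordering is the essential part: the Store(true) must come after the state is fully built, exactly as PATCH 4 places cacheInitializationCompleted.Store(true) after the forgetAliases assignment in initializeInstanceDao.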