From c987f7c5dfb7fcf015531e6dd7ba2f13a39ee484 Mon Sep 17 00:00:00 2001
From: Xin Hao Zhang <xzhang@cockroachlabs.com>
Date: Thu, 7 Sep 2023 15:33:12 -0400
Subject: [PATCH] scheduledlogging: fix `TestCaptureIndexUsageStats`

This commit reverts #109783 which added a `SucceedsSoon` in place
of testing each anticipated sheduling of  index usage stats
recording. Since this test checks the number of logs emitted
after each schedule, adding `SucceedsSoon` meant that entering a
retry loop could actually mean we fall into the next scheduled
index usage stats emission, thus surpassing the expected number of
logs for the schedule number being checked. This addition was meant
to fix the test for secondary tenants by waiting for all schedules
of tenants to run. This commit instead disables index usage stats for
the system tenant, allowing us to once again synchronize a single
tenant's schedule in order to verify the logs.

Fixes: #109924

Release note: None
---
 pkg/sql/scheduledlogging/BUILD.bazel          |   2 -
 .../captured_index_usage_stats.go             |   6 +
 .../captured_index_usage_stats_test.go        | 135 +++++++++++-------
 3 files changed, 92 insertions(+), 51 deletions(-)

diff --git a/pkg/sql/scheduledlogging/BUILD.bazel b/pkg/sql/scheduledlogging/BUILD.bazel
index 9d3d7b0bee9b..419ff0382a0d 100644
--- a/pkg/sql/scheduledlogging/BUILD.bazel
+++ b/pkg/sql/scheduledlogging/BUILD.bazel
@@ -36,8 +36,6 @@ go_test(
         "//pkg/security/securityassets",
         "//pkg/security/securitytest",
         "//pkg/server",
-        "//pkg/settings/cluster",
-        "//pkg/testutils",
         "//pkg/testutils/serverutils",
         "//pkg/testutils/sqlutils",
         "//pkg/testutils/testcluster",
diff --git a/pkg/sql/scheduledlogging/captured_index_usage_stats.go b/pkg/sql/scheduledlogging/captured_index_usage_stats.go
index dac23bbd727a..d03c7563aca4 100644
--- a/pkg/sql/scheduledlogging/captured_index_usage_stats.go
+++ b/pkg/sql/scheduledlogging/captured_index_usage_stats.go
@@ -71,6 +71,9 @@ type CaptureIndexUsageStatsTestingKnobs struct {
 	// scheduled interval in the case that the logging duration exceeds the
 	// default scheduled interval duration.
 	getOverlapDuration func() time.Duration
+	// onScheduleComplete allows tests to hook into when the current schedule
+	// is completed to check for the expected logs.
+	onScheduleComplete func()
 }
 
 // ModuleTestingKnobs implements base.ModuleTestingKnobs interface.
@@ -148,6 +151,9 @@ func (s *CaptureIndexUsageStatsLoggingScheduler) start(ctx context.Context, stop
 				if err != nil {
 					log.Warningf(ctx, "error capturing index usage stats: %+v", err)
 				}
+				if s.knobs != nil && s.knobs.onScheduleComplete != nil {
+					s.knobs.onScheduleComplete()
+				}
 				dur := s.durationUntilNextInterval()
 				if dur < time.Second {
 					// Avoid intervals that are too short, to prevent a hot
diff --git a/pkg/sql/scheduledlogging/captured_index_usage_stats_test.go b/pkg/sql/scheduledlogging/captured_index_usage_stats_test.go
index 5a2b66986119..27e6e6950392 100644
--- a/pkg/sql/scheduledlogging/captured_index_usage_stats_test.go
+++ b/pkg/sql/scheduledlogging/captured_index_usage_stats_test.go
@@ -21,8 +21,6 @@ import (
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/base"
-	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
-	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
@@ -83,34 +81,62 @@ func TestCaptureIndexUsageStats(t *testing.T) {
 	// timeBuffer is a short time buffer to account for non-determinism in the logging timings.
 	const timeBuffer = 4 * time.Second
 
-	settings := cluster.MakeTestingClusterSettings()
-	// Configure capture index usage statistics to be disabled. This is to test
-	// whether the disabled interval works correctly. We start in a disabled
-	// state, once the disabled interval expires, we check whether we have
-	// transitioned to an enabled state, if we have, we check that the expected
-	// logs have been emitted.
-	telemetryCaptureIndexUsageStatsEnabled.Override(context.Background(), &settings.SV, false)
-	// Configure the schedule interval at which we capture index usage
-	// statistics.
-	telemetryCaptureIndexUsageStatsInterval.Override(context.Background(), &settings.SV, stubScheduleInterval)
-	// Configure the schedule interval at which we check whether capture index
-	// usage statistics has been enabled.
-	telemetryCaptureIndexUsageStatsStatusCheckEnabledInterval.Override(context.Background(), &settings.SV, stubScheduleCheckEnabledInterval)
-	// Configure the delay between each emission of index usage stats logs.
-	telemetryCaptureIndexUsageStatsLoggingDelay.Override(context.Background(), &settings.SV, stubLoggingDelay)
+	schedulesFinishedChan := make(chan struct{})
+	logsVerifiedChan := make(chan struct{})
 
 	srv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{
-		Settings: settings,
 		Knobs: base.TestingKnobs{
 			CapturedIndexUsageStatsKnobs: &CaptureIndexUsageStatsTestingKnobs{
 				getLoggingDuration: sd.getLoggingDuration,
 				getOverlapDuration: sd.getOverlapDuration,
+				onScheduleComplete: func() {
+					// Notify that the scheudle has completed.
+					schedulesFinishedChan <- struct{}{}
+					// Wait for logs to be verified before proceeding.
+					<-logsVerifiedChan
+				},
 			},
 		},
 	})
-
 	defer srv.Stopper().Stop(context.Background())
-	s := srv.ApplicationLayer()
+	// Close must be called before the server stops, or we will block indefinitely in the callback above.
+	defer close(logsVerifiedChan)
+
+	numTenants := 1
+	if srv.TenantController().StartedDefaultTestTenant() {
+		numTenants = 2
+	}
+
+	waitForCompletedSchedules := func(count int) {
+		for i := 0; i < count; i++ {
+			<-schedulesFinishedChan
+		}
+	}
+
+	tenantSettings := srv.ApplicationLayer().ClusterSettings()
+
+	// Configure capture index usage statistics to be disabled after the first schedule runs.
+	// This is to test whether the disabled interval works correctly. We will disable the the setting for
+	// the system tenant (if multi-tenant). Because the index usage stats scheduler starts immediately
+	// on server startup, we wait for the first schedule of each tenant to execute before setting all
+	// necessary cluster settings.
+	waitForCompletedSchedules(numTenants)
+	telemetryCaptureIndexUsageStatsEnabled.Override(context.Background(), &tenantSettings.SV, false)
+	telemetryCaptureIndexUsageStatsEnabled.Override(context.Background(), &srv.SystemLayer().ClusterSettings().SV, false)
+	// Configure the schedule interval at which we capture index usage statistics.
+	telemetryCaptureIndexUsageStatsInterval.Override(context.Background(), &tenantSettings.SV, stubScheduleInterval)
+	// Configure the schedule interval at which we check whether capture index
+	// usage statistics has been enabled.
+	telemetryCaptureIndexUsageStatsStatusCheckEnabledInterval.Override(context.Background(), &tenantSettings.SV,
+		stubScheduleCheckEnabledInterval)
+	// Configure the delay between each emission of index usage stats logs.
+	telemetryCaptureIndexUsageStatsLoggingDelay.Override(context.Background(), &tenantSettings.SV, stubLoggingDelay)
+
+	// Allow all tenants to continue.
+	logsVerifiedChan <- struct{}{}
+	if srv.TenantController().StartedDefaultTestTenant() {
+		logsVerifiedChan <- struct{}{}
+	}
 
 	db := sqlutils.MakeSQLRunner(sqlDB)
 
@@ -146,22 +172,32 @@ func TestCaptureIndexUsageStats(t *testing.T) {
 		"index_pkey",
 	}
 
-	// Check that telemetry log file contains all the entries we're expecting, at the scheduled intervals.
-
 	// Enable capture of index usage stats.
-	telemetryCaptureIndexUsageStatsEnabled.Override(context.Background(), &s.ClusterSettings().SV, true)
+	telemetryCaptureIndexUsageStatsEnabled.Override(context.Background(), &tenantSettings.SV, true)
+	//t.Fatal(telemetryCaptureIndexUsageStatsEnabled.Get(&s.ClusterSettings().SV))
+
+	// Check that telemetry log file contains all the entries we're expecting, at the scheduled intervals.
 
 	expectedTotalNumEntriesInSingleInterval := 8
 	expectedNumberOfIndividualIndexEntriesInSingleInterval := 1
 
+	// Wait for channel value from end of 1st schedule.
+	waitForCompletedSchedules(1)
+
 	// Check the expected number of entries from the 1st schedule.
-	testutils.SucceedsSoon(t, func() error {
-		return checkNumTotalEntriesAndNumIndexEntries(t,
-			expectedIndexNames,
-			expectedTotalNumEntriesInSingleInterval,
-			expectedNumberOfIndividualIndexEntriesInSingleInterval,
-		)
-	})
+	require.NoError(t, checkNumTotalEntriesAndNumIndexEntries(t,
+		expectedIndexNames,
+		expectedTotalNumEntriesInSingleInterval,
+		expectedNumberOfIndividualIndexEntriesInSingleInterval,
+	), "error encountered checking the number of total entries and number of index entries")
+
+	// Set the logging duration for the next run to be longer than the schedule
+	// interval duration.
+	stubLoggingDuration := stubScheduleInterval * 2
+	sd.setLoggingDuration(stubLoggingDuration)
+
+	// Allow schedules to proceed.
+	logsVerifiedChan <- struct{}{}
 
 	// Expect number of total entries to hold 2 times the number of entries in a
 	// single interval.
@@ -169,19 +205,18 @@ func TestCaptureIndexUsageStats(t *testing.T) {
 	// Expect number of individual index entries to hold 2 times the number of
 	// entries in a single interval.
 	expectedNumberOfIndividualIndexEntriesAfterTwoIntervals := expectedNumberOfIndividualIndexEntriesInSingleInterval * 2
-	// Set the logging duration for the next run to be longer than the schedule
-	// interval duration.
-	stubLoggingDuration := stubScheduleInterval * 2
-	sd.setLoggingDuration(stubLoggingDuration)
+
+	// Wait for channel value from end of 2nd schedule.
+	waitForCompletedSchedules(1)
 
 	// Check the expected number of entries from the 2nd schedule.
-	testutils.SucceedsSoon(t, func() error {
-		return checkNumTotalEntriesAndNumIndexEntries(t,
-			expectedIndexNames,
-			expectedTotalNumEntriesAfterTwoIntervals,
-			expectedNumberOfIndividualIndexEntriesAfterTwoIntervals,
-		)
-	})
+	require.NoError(t, checkNumTotalEntriesAndNumIndexEntries(t,
+		expectedIndexNames,
+		expectedTotalNumEntriesAfterTwoIntervals,
+		expectedNumberOfIndividualIndexEntriesAfterTwoIntervals,
+	), "error encountered checking the number of total entries and number of index entries")
+
+	logsVerifiedChan <- struct{}{}
 
 	// Expect number of total entries to hold 3 times the number of entries in a
 	// single interval.
@@ -190,17 +225,18 @@ func TestCaptureIndexUsageStats(t *testing.T) {
 	// entries in a single interval.
 	expectedNumberOfIndividualIndexEntriesAfterThreeIntervals := expectedNumberOfIndividualIndexEntriesInSingleInterval * 3
 
+	// Wait for channel value from end of 3rd schedule.
+	waitForCompletedSchedules(1)
+
 	// Check the expected number of entries from the 3rd schedule.
-	testutils.SucceedsSoon(t, func() error {
-		return checkNumTotalEntriesAndNumIndexEntries(t,
-			expectedIndexNames,
-			expectedTotalNumEntriesAfterThreeIntervals,
-			expectedNumberOfIndividualIndexEntriesAfterThreeIntervals,
-		)
-	})
+	require.NoError(t, checkNumTotalEntriesAndNumIndexEntries(t,
+		expectedIndexNames,
+		expectedTotalNumEntriesAfterThreeIntervals,
+		expectedNumberOfIndividualIndexEntriesAfterThreeIntervals,
+	), "error encountered checking the number of total entries and number of index entries")
 
 	// Stop capturing index usage statistics.
-	telemetryCaptureIndexUsageStatsEnabled.Override(context.Background(), &settings.SV, false)
+	telemetryCaptureIndexUsageStatsEnabled.Override(context.Background(), &tenantSettings.SV, false)
 
 	// Iterate through entries, ensure that the timestamp difference between each
 	// schedule is as expected.
@@ -253,6 +289,7 @@ func TestCaptureIndexUsageStats(t *testing.T) {
 		require.Greater(t, actualDuration+timeBuffer, expectedDuration, "%v+%v <= %v", expectedDuration, actualDuration, timeBuffer)
 		previousTimestamp = currentTimestamp
 	}
+
 }
 
 // checkNumTotalEntriesAndNumIndexEntries is a helper function that verifies that