From c5c6ec477a605bec33e52af73b35576e0dc3caa6 Mon Sep 17 00:00:00 2001
From: Max Englander
Date: Fri, 18 Aug 2023 12:00:44 -0400
Subject: [PATCH] go/cmd/vtbackup: report replication status metrics during
 catch-up phase

Signed-off-by: Max Englander
---
 changelog/18.0/18.0.0/summary.md |  9 +++++++
 go/cmd/vtbackup/vtbackup.go      | 44 +++++++++++++++++++++++++++++---
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md
index 96f3cdf8541..61cf6be1cf8 100644
--- a/changelog/18.0/18.0.0/summary.md
+++ b/changelog/18.0/18.0.0/summary.md
@@ -18,6 +18,7 @@
   - [Deleted `vtgr`](#deleted-vtgr)
 - **[New stats](#new-stats)**
   - [VTGate Vindex unknown parameters](#vtgate-vindex-unknown-parameters)
+  - [VTBackup stat `PhaseStatus`](#vtbackup-stat-phase-status)
 - **[VTTablet](#vttablet)**
   - [VTTablet: New ResetSequences RPC](#vttablet-new-rpc-reset-sequences)
 - **[Docker](#docker)**
@@ -120,6 +121,14 @@ The `vtgr` has been deprecated in Vitess 17, also see https://github.com/vitessi
 
 The VTGate stat `VindexUnknownParameters` gauges unknown Vindex parameters found in the latest VSchema pulled from the topology.
 
+#### VTBackup `PhaseStatus` stat
+
+`PhaseStatus` reports a 1 (active) or a 0 (inactive) for each of the following phases and statuses:
+
+ * `CatchUpReplication` phase has statuses `Stalled` and `Stopped`.
+   * `Stalled` is set to `1` when replication stops advancing.
+   * `Stopped` is set to `1` when replication stops before `vtbackup` catches up with the primary.
+
 ### VTTablet
 
 #### New ResetSequences rpc
diff --git a/go/cmd/vtbackup/vtbackup.go b/go/cmd/vtbackup/vtbackup.go
index 31b7a4dc596..e6b6e3fbbb1 100644
--- a/go/cmd/vtbackup/vtbackup.go
+++ b/go/cmd/vtbackup/vtbackup.go
@@ -99,6 +99,10 @@ const (
 	// place a hard cap on the overall time for a backup, while also not waiting
 	// forever for things that should be quick.
 	operationTimeout = 1 * time.Minute
+
+	phaseNameCatchUpReplication          = "CatchUpReplication"
+	phaseStatusCatchUpReplicationStalled = "Stalled"
+	phaseStatusCatchUpReplicationStopped = "Stopped"
 )
 
 var (
@@ -128,6 +132,17 @@ var (
 		"How long it took vtbackup to perform each phase (in seconds).",
 		"phase",
 	)
+	phaseStatus = stats.NewGaugesWithMultiLabels(
+		"PhaseStatus",
+		"Internal state of vtbackup phase.",
+		[]string{"phase", "status"},
+	)
+	phaseStatuses = map[string][]string{
+		phaseNameCatchUpReplication: {
+			phaseStatusCatchUpReplicationStalled,
+			phaseStatusCatchUpReplicationStopped,
+		},
+	}
 )
 
 func registerFlags(fs *pflag.FlagSet) {
@@ -203,6 +218,13 @@ func main() {
 	topoServer := topo.Open()
 	defer topoServer.Close()
 
+	// Initialize stats.
+	for phaseName, statuses := range phaseStatuses {
+		for _, status := range statuses {
+			phaseStatus.Set([]string{phaseName, status}, int64(0))
+		}
+	}
+
 	// Try to take a backup, if it's been long enough since the last one.
 	// Skip pruning if backup wasn't fully successful. We don't want to be
 	// deleting things if the backup process is not healthy.
@@ -433,6 +455,9 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back
 	backupParams.BackupTime = time.Now()
 
 	// Wait for replication to catch up.
+	var lastStatus replication.ReplicationStatus
+	var status replication.ReplicationStatus
+	var statusErr error
 	waitStartTime := time.Now()
 	for {
 		select {
@@ -441,7 +466,8 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back
 		case <-time.After(time.Second):
 		}
 
-		status, statusErr := mysqld.ReplicationStatus()
+		lastStatus = status
+		status, statusErr = mysqld.ReplicationStatus()
 		if statusErr != nil {
 			log.Warningf("Error getting replication status: %v", statusErr)
 			continue
@@ -453,11 +479,21 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back
 			durationByPhase.Set("CatchUpReplication", int64(time.Since(waitStartTime).Seconds()))
 			break
 		}
+		if !lastStatus.Position.IsZero() {
+			if status.Position.Equal(lastStatus.Position) {
+				phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStalled}, 1)
+			} else {
+				phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStalled}, 0)
+			}
+		}
 		if !status.Healthy() {
 			log.Warning("Replication has stopped before backup could be taken. Trying to restart replication.")
+			phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStopped}, 1)
 			if err := startReplication(ctx, mysqld, topoServer); err != nil {
 				log.Warningf("Failed to restart replication: %v", err)
 			}
+		} else {
+			phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStopped}, 0)
 		}
 	}
 
@@ -467,14 +503,16 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back
 	}
 
 	// Did we make any progress?
-	status, err := mysqld.ReplicationStatus()
-	if err != nil {
-		return fmt.Errorf("can't get replication status: %v", err)
+	status, statusErr = mysqld.ReplicationStatus()
+	if statusErr != nil {
+		return fmt.Errorf("can't get replication status: %v", statusErr)
 	}
 	log.Infof("Replication caught up to %v", status.Position)
 	if !status.Position.AtLeast(primaryPos) && status.Position.Equal(restorePos) {
 		return fmt.Errorf("not taking backup: replication did not make any progress from restore point: %v", restorePos)
 	}
+	phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStalled}, 0)
+	phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStopped}, 0)
 
 	// Re-enable redo logging.
 	if disabledRedoLog {
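
Note (not part of the patch): the diff above relies on one pattern from the `vitess.io/vitess/go/stats` package: a `GaugesWithMultiLabels` keyed by `{phase, status}`, pre-initialized to 0 and flipped to 1 while a condition holds. Below is a minimal, self-contained sketch of that pattern for readers unfamiliar with the API. The standalone `main`, the hard-coded label values, and the `Counts()` read-back are illustrative assumptions and do not appear in the commit; only the `NewGaugesWithMultiLabels` and `Set` calls mirror the ones added to `vtbackup.go`.

```go
// Sketch of the PhaseStatus gauge pattern used by the patch (illustrative only).
package main

import (
	"fmt"

	"vitess.io/vitess/go/stats"
)

// A gauge vector labeled by phase and status, matching the declaration added
// to vtbackup.go.
var phaseStatus = stats.NewGaugesWithMultiLabels(
	"PhaseStatus",
	"Internal state of vtbackup phase.",
	[]string{"phase", "status"},
)

func main() {
	// Pre-initialize every known (phase, status) pair to 0 so the series
	// exist and read as "inactive" even before the phase runs (mirrors the
	// "Initialize stats." loop added to main()).
	for _, status := range []string{"Stalled", "Stopped"} {
		phaseStatus.Set([]string{"CatchUpReplication", status}, 0)
	}

	// Flip a pair to 1 while the condition holds, and back to 0 once it clears.
	phaseStatus.Set([]string{"CatchUpReplication", "Stalled"}, 1)
	phaseStatus.Set([]string{"CatchUpReplication", "Stalled"}, 0)

	// Counts() returns current values keyed by dot-joined label values,
	// e.g. "CatchUpReplication.Stalled". In vtbackup itself the same data is
	// exported through the process's published stats rather than printed.
	fmt.Println(phaseStatus.Counts())
}
```

Initializing every (phase, status) pair to 0 up front, as the patch does in `main()`, keeps monitoring from having to distinguish a missing series from an inactive one.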