From f47c60607286b7591e505b6227db263129a7cae8 Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Sun, 2 Jul 2023 02:07:47 +0530 Subject: [PATCH] Improve VTOrc logging statements, now that we have alias as a field (#13428) --- .../primaryfailure/primary_failure_test.go | 2 +- go/vt/vtorc/inst/analysis_dao.go | 8 ++---- go/vt/vtorc/inst/audit_dao.go | 2 +- go/vt/vtorc/inst/instance_dao.go | 4 +-- go/vt/vtorc/inst/instance_dao_test.go | 2 +- go/vt/vtorc/logic/tablet_discovery.go | 4 +-- go/vt/vtorc/logic/topology_recovery.go | 25 +++++++++---------- 7 files changed, 21 insertions(+), 26 deletions(-) diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index d7a54fb421f..9b85895c66d 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -209,7 +209,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { // Parse log file and find out how much time it took for DeadPrimary to recover. logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName) // log prefix printed at the end of analysis where we conclude we have DeadPrimary - t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery validation after acquiring shard lock") + t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery") // log prefix printed at the end of recovery t2 := extractTimeFromLog(t, logFile, "auditType:recover-dead-primary") curr := time.Now().Format("2006-01-02") diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index 3fded2f814d..62f51f903eb 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -387,8 +387,8 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna a.IsReadOnly = m.GetUint("read_only") == 1 if !a.LastCheckValid { - analysisMessage := fmt.Sprintf("analysis: Keyspace: %+v, Shard: %+v, IsPrimary: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, CountReplicasFailingToConnectToPrimary: %+v", - a.ClusterDetails.Keyspace, a.ClusterDetails.Shard, a.IsPrimary, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToPrimary, + analysisMessage := fmt.Sprintf("analysis: Alias: %+v, Keyspace: %+v, Shard: %+v, IsPrimary: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, CountReplicasFailingToConnectToPrimary: %+v", + a.AnalyzedInstanceAlias, a.ClusterDetails.Keyspace, a.ClusterDetails.Shard, a.IsPrimary, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToPrimary, ) if util.ClearToLog("analysis_dao", analysisMessage) { log.Infof(analysisMessage) @@ -597,10 +597,6 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna return nil }) - for _, analysis := range result { - log.Errorf("Analysis - Instance - %v, Code - %v, LastCheckValid - %v, ReplStopped - %v", analysis.AnalyzedInstanceAlias, analysis.Analysis, analysis.LastCheckValid, analysis.ReplicationStopped) - } - result = postProcessAnalyses(result, clusters) if err != nil { diff --git a/go/vt/vtorc/inst/audit_dao.go b/go/vt/vtorc/inst/audit_dao.go index b2eab9abc64..96db7f32ccf 100644 --- a/go/vt/vtorc/inst/audit_dao.go +++ b/go/vt/vtorc/inst/audit_dao.go @@ -93,7 +93,7 @@ func AuditOperation(auditType string, tabletAlias string, message string) error return err } } - logMessage := fmt.Sprintf("auditType:%s instance:%s keyspace:%s shard:%s message:%s", auditType, tabletAlias, keyspace, shard, message) + logMessage := fmt.Sprintf("auditType:%s alias:%s keyspace:%s shard:%s message:%s", auditType, tabletAlias, keyspace, shard, message) if syslogWriter != nil { auditWrittenToFile = true go func() { diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index b751018a9da..ccb4e84fe86 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -306,7 +306,7 @@ func ReadTopologyInstanceBufferable(tabletAlias string, latency *stopwatch.Named instance.SecondsBehindPrimary.Int64 = int64(fullStatus.ReplicationStatus.ReplicationLagSeconds) } if instance.SecondsBehindPrimary.Valid && instance.SecondsBehindPrimary.Int64 < 0 { - log.Warningf("Host: %+v, instance.ReplicationLagSeconds < 0 [%+v], correcting to 0", tabletAlias, instance.SecondsBehindPrimary.Int64) + log.Warningf("Alias: %+v, instance.SecondsBehindPrimary < 0 [%+v], correcting to 0", tabletAlias, instance.SecondsBehindPrimary.Int64) instance.SecondsBehindPrimary.Int64 = 0 } // And until told otherwise: @@ -1089,7 +1089,7 @@ func ForgetInstance(tabletAlias string) error { return err } if rows == 0 { - errMsg := fmt.Sprintf("ForgetInstance(): instance %+v not found", tabletAlias) + errMsg := fmt.Sprintf("ForgetInstance(): tablet %+v not found", tabletAlias) log.Errorf(errMsg) return fmt.Errorf(errMsg) } diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index 66393c50448..10ea74616ba 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -552,7 +552,7 @@ func TestForgetInstanceAndInstanceIsForgotten(t *testing.T) { { name: "Unknown tablet", tabletAlias: "unknown-tablet", - errExpected: "ForgetInstance(): instance unknown-tablet not found", + errExpected: "ForgetInstance(): tablet unknown-tablet not found", instanceForgotten: true, tabletsExpected: []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, }, { diff --git a/go/vt/vtorc/logic/tablet_discovery.go b/go/vt/vtorc/logic/tablet_discovery.go index e35afa7625f..f0523642bd6 100644 --- a/go/vt/vtorc/logic/tablet_discovery.go +++ b/go/vt/vtorc/logic/tablet_discovery.go @@ -345,12 +345,12 @@ func restartReplication(replicaAlias string) error { defer cancel() err = tmc.StopReplication(ctx, replicaTablet) if err != nil { - log.Info("Could not stop replication on %v", topoproto.TabletAliasString(replicaTablet.Alias)) + log.Info("Could not stop replication on %v", replicaAlias) return err } err = tmc.StartReplication(ctx, replicaTablet, reparentutil.IsReplicaSemiSync(durabilityPolicy, primaryTablet, replicaTablet)) if err != nil { - log.Info("Could not start replication on %v", topoproto.TabletAliasString(replicaTablet.Alias)) + log.Info("Could not start replication on %v", replicaAlias) return err } return nil diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 929ee10c6ad..4015ba1f5c2 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -187,7 +187,7 @@ func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry *inst.Replicati _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimaryHasPrimary.", analysisEntry.AnalyzedInstanceAlias)) return false, nil, err } - log.Infof("Analysis: %v, will fix incorrect primaryship %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) + log.Infof("Analysis: %v, will fix incorrect primaryship on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) // This has to be done in the end; whether successful or not, we should mark that the recovery is done. // So that after the active period passes, we are able to run other recoveries. defer func() { @@ -541,7 +541,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er // Unhandled problem type if analysisEntry.Analysis != inst.NoProblem { if util.ClearToLog("executeCheckAndRecoverFunction", analysisEntry.AnalyzedInstanceAlias) { - log.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: %+v; key: %+v", + log.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: %+v; tablet: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) } } @@ -571,9 +571,8 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er // Unexpected. Shouldn't get this log.Errorf("Unable to determine if recovery is disabled globally: %v", err) } else if recoveryDisabledGlobally { - log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+ - "skipProcesses: %v: NOT Recovering host (disabled globally)", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false) + log.Infof("CheckAndRecover: Analysis: %+v, Tablet: %+v: NOT Recovering host (disabled globally)", + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) return err } @@ -590,7 +589,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er // changes, we should be checking that this failure is indeed needed to be fixed. We do this after locking the shard to be sure // that the data that we use now is up-to-date. if isActionableRecovery { - log.Errorf("executeCheckAndRecoverFunction: Proceeding with %v recovery validation after acquiring shard lock.", analysisEntry.Analysis) + log.Errorf("executeCheckAndRecoverFunction: Proceeding with %v recovery on %v validation after acquiring shard lock.", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) // The first step we have to do is refresh the keyspace information // This is required to know if the durability policies have changed or not // If they have, then recoveries like ReplicaSemiSyncMustNotBeSet, etc won't be valid anymore @@ -620,8 +619,8 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er DiscoverInstance(analysisEntry.AnalyzedInstanceAlias, true) primaryTablet, err := shardPrimary(analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard) if err != nil { - log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while finding the shard primary: %v", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false, err) + log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, Tablet: %+v: error while finding the shard primary: %v", + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, err) return err } primaryTabletAlias := topoproto.TabletAliasString(primaryTablet.Alias) @@ -633,19 +632,19 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er } alreadyFixed, err := checkIfAlreadyFixed(analysisEntry) if err != nil { - log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while trying to find if the problem is already fixed: %v", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false, err) + log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, Tablet: %+v: error while trying to find if the problem is already fixed: %v", + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, err) return err } if alreadyFixed { - log.Infof("Analysis: %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis) + log.Infof("Analysis: %v on tablet %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) return nil } } // Actually attempt recovery: if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceAlias) { - log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery, false) + log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery) } recoveryAttempted, topologyRecovery, err := getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry) if !recoveryAttempted { @@ -843,7 +842,7 @@ func fixReplica(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (r err = setReadOnly(ctx, analyzedTablet) if err != nil { - log.Info("Could not set the tablet %v to readonly - %v", topoproto.TabletAliasString(analyzedTablet.Alias), err) + log.Info("Could not set the tablet %v to readonly - %v", analysisEntry.AnalyzedInstanceAlias, err) return true, topologyRecovery, err }