Skip to content

Commit

Permalink
Improve VTOrc logging statements, now that we have alias as a field (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
GuptaManan100 authored Jul 1, 2023
1 parent 5356ef5 commit f47c606
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
// Parse log file and find out how much time it took for DeadPrimary to recover.
logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName)
// log prefix printed at the end of analysis where we conclude we have DeadPrimary
t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery validation after acquiring shard lock")
t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery")
// log prefix printed at the end of recovery
t2 := extractTimeFromLog(t, logFile, "auditType:recover-dead-primary")
curr := time.Now().Format("2006-01-02")
Expand Down
8 changes: 2 additions & 6 deletions go/vt/vtorc/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -387,8 +387,8 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
a.IsReadOnly = m.GetUint("read_only") == 1

if !a.LastCheckValid {
analysisMessage := fmt.Sprintf("analysis: Keyspace: %+v, Shard: %+v, IsPrimary: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, CountReplicasFailingToConnectToPrimary: %+v",
a.ClusterDetails.Keyspace, a.ClusterDetails.Shard, a.IsPrimary, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToPrimary,
analysisMessage := fmt.Sprintf("analysis: Alias: %+v, Keyspace: %+v, Shard: %+v, IsPrimary: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, CountReplicasFailingToConnectToPrimary: %+v",
a.AnalyzedInstanceAlias, a.ClusterDetails.Keyspace, a.ClusterDetails.Shard, a.IsPrimary, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToPrimary,
)
if util.ClearToLog("analysis_dao", analysisMessage) {
log.Infof(analysisMessage)
Expand Down Expand Up @@ -597,10 +597,6 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
return nil
})

for _, analysis := range result {
log.Errorf("Analysis - Instance - %v, Code - %v, LastCheckValid - %v, ReplStopped - %v", analysis.AnalyzedInstanceAlias, analysis.Analysis, analysis.LastCheckValid, analysis.ReplicationStopped)
}

result = postProcessAnalyses(result, clusters)

if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vtorc/inst/audit_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ func AuditOperation(auditType string, tabletAlias string, message string) error
return err
}
}
logMessage := fmt.Sprintf("auditType:%s instance:%s keyspace:%s shard:%s message:%s", auditType, tabletAlias, keyspace, shard, message)
logMessage := fmt.Sprintf("auditType:%s alias:%s keyspace:%s shard:%s message:%s", auditType, tabletAlias, keyspace, shard, message)
if syslogWriter != nil {
auditWrittenToFile = true
go func() {
Expand Down
4 changes: 2 additions & 2 deletions go/vt/vtorc/inst/instance_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ func ReadTopologyInstanceBufferable(tabletAlias string, latency *stopwatch.Named
instance.SecondsBehindPrimary.Int64 = int64(fullStatus.ReplicationStatus.ReplicationLagSeconds)
}
if instance.SecondsBehindPrimary.Valid && instance.SecondsBehindPrimary.Int64 < 0 {
log.Warningf("Host: %+v, instance.ReplicationLagSeconds < 0 [%+v], correcting to 0", tabletAlias, instance.SecondsBehindPrimary.Int64)
log.Warningf("Alias: %+v, instance.SecondsBehindPrimary < 0 [%+v], correcting to 0", tabletAlias, instance.SecondsBehindPrimary.Int64)
instance.SecondsBehindPrimary.Int64 = 0
}
// And until told otherwise:
Expand Down Expand Up @@ -1089,7 +1089,7 @@ func ForgetInstance(tabletAlias string) error {
return err
}
if rows == 0 {
errMsg := fmt.Sprintf("ForgetInstance(): instance %+v not found", tabletAlias)
errMsg := fmt.Sprintf("ForgetInstance(): tablet %+v not found", tabletAlias)
log.Errorf(errMsg)
return fmt.Errorf(errMsg)
}
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vtorc/inst/instance_dao_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@ func TestForgetInstanceAndInstanceIsForgotten(t *testing.T) {
{
name: "Unknown tablet",
tabletAlias: "unknown-tablet",
errExpected: "ForgetInstance(): instance unknown-tablet not found",
errExpected: "ForgetInstance(): tablet unknown-tablet not found",
instanceForgotten: true,
tabletsExpected: []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"},
}, {
Expand Down
4 changes: 2 additions & 2 deletions go/vt/vtorc/logic/tablet_discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -345,12 +345,12 @@ func restartReplication(replicaAlias string) error {
defer cancel()
err = tmc.StopReplication(ctx, replicaTablet)
if err != nil {
log.Info("Could not stop replication on %v", topoproto.TabletAliasString(replicaTablet.Alias))
log.Info("Could not stop replication on %v", replicaAlias)
return err
}
err = tmc.StartReplication(ctx, replicaTablet, reparentutil.IsReplicaSemiSync(durabilityPolicy, primaryTablet, replicaTablet))
if err != nil {
log.Info("Could not start replication on %v", topoproto.TabletAliasString(replicaTablet.Alias))
log.Info("Could not start replication on %v", replicaAlias)
return err
}
return nil
Expand Down
25 changes: 12 additions & 13 deletions go/vt/vtorc/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry *inst.Replicati
_ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimaryHasPrimary.", analysisEntry.AnalyzedInstanceAlias))
return false, nil, err
}
log.Infof("Analysis: %v, will fix incorrect primaryship %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
log.Infof("Analysis: %v, will fix incorrect primaryship on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
// This has to be done in the end; whether successful or not, we should mark that the recovery is done.
// So that after the active period passes, we are able to run other recoveries.
defer func() {
Expand Down Expand Up @@ -541,7 +541,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
// Unhandled problem type
if analysisEntry.Analysis != inst.NoProblem {
if util.ClearToLog("executeCheckAndRecoverFunction", analysisEntry.AnalyzedInstanceAlias) {
log.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: %+v; key: %+v",
log.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: %+v; tablet: %+v",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
}
}
Expand Down Expand Up @@ -571,9 +571,8 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
// Unexpected. Shouldn't get this
log.Errorf("Unable to determine if recovery is disabled globally: %v", err)
} else if recoveryDisabledGlobally {
log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+
"skipProcesses: %v: NOT Recovering host (disabled globally)",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false)
log.Infof("CheckAndRecover: Analysis: %+v, Tablet: %+v: NOT Recovering host (disabled globally)",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)

return err
}
Expand All @@ -590,7 +589,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
// changes, we should be checking that this failure is indeed needed to be fixed. We do this after locking the shard to be sure
// that the data that we use now is up-to-date.
if isActionableRecovery {
log.Errorf("executeCheckAndRecoverFunction: Proceeding with %v recovery validation after acquiring shard lock.", analysisEntry.Analysis)
log.Errorf("executeCheckAndRecoverFunction: Proceeding with %v recovery on %v validation after acquiring shard lock.", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
// The first step we have to do is refresh the keyspace information
// This is required to know if the durability policies have changed or not
// If they have, then recoveries like ReplicaSemiSyncMustNotBeSet, etc won't be valid anymore
Expand Down Expand Up @@ -620,8 +619,8 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
DiscoverInstance(analysisEntry.AnalyzedInstanceAlias, true)
primaryTablet, err := shardPrimary(analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard)
if err != nil {
log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while finding the shard primary: %v",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false, err)
log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, Tablet: %+v: error while finding the shard primary: %v",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, err)
return err
}
primaryTabletAlias := topoproto.TabletAliasString(primaryTablet.Alias)
Expand All @@ -633,19 +632,19 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
}
alreadyFixed, err := checkIfAlreadyFixed(analysisEntry)
if err != nil {
log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while trying to find if the problem is already fixed: %v",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false, err)
log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, Tablet: %+v: error while trying to find if the problem is already fixed: %v",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, err)
return err
}
if alreadyFixed {
log.Infof("Analysis: %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis)
log.Infof("Analysis: %v on tablet %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
return nil
}
}

// Actually attempt recovery:
if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceAlias) {
log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery, false)
log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery)
}
recoveryAttempted, topologyRecovery, err := getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry)
if !recoveryAttempted {
Expand Down Expand Up @@ -843,7 +842,7 @@ func fixReplica(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (r

err = setReadOnly(ctx, analyzedTablet)
if err != nil {
log.Info("Could not set the tablet %v to readonly - %v", topoproto.TabletAliasString(analyzedTablet.Alias), err)
log.Info("Could not set the tablet %v to readonly - %v", analysisEntry.AnalyzedInstanceAlias, err)
return true, topologyRecovery, err
}

Expand Down

0 comments on commit f47c606

Please sign in to comment.