Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve VTOrc logging statements, now that we have alias as a field #13428

Merged
merged 3 commits into from
Jul 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
// Parse log file and find out how much time it took for DeadPrimary to recover.
logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName)
// log prefix printed at the end of analysis where we conclude we have DeadPrimary
t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery validation after acquiring shard lock")
t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery")
// log prefix printed at the end of recovery
t2 := extractTimeFromLog(t, logFile, "auditType:recover-dead-primary")
curr := time.Now().Format("2006-01-02")
Expand Down
8 changes: 2 additions & 6 deletions go/vt/vtorc/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -387,8 +387,8 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
a.IsReadOnly = m.GetUint("read_only") == 1

if !a.LastCheckValid {
analysisMessage := fmt.Sprintf("analysis: Keyspace: %+v, Shard: %+v, IsPrimary: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, CountReplicasFailingToConnectToPrimary: %+v",
a.ClusterDetails.Keyspace, a.ClusterDetails.Shard, a.IsPrimary, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToPrimary,
analysisMessage := fmt.Sprintf("analysis: Alias: %+v, Keyspace: %+v, Shard: %+v, IsPrimary: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, CountReplicasFailingToConnectToPrimary: %+v",
a.AnalyzedInstanceAlias, a.ClusterDetails.Keyspace, a.ClusterDetails.Shard, a.IsPrimary, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToPrimary,
)
if util.ClearToLog("analysis_dao", analysisMessage) {
log.Infof(analysisMessage)
Expand Down Expand Up @@ -597,10 +597,6 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
return nil
})

for _, analysis := range result {
log.Errorf("Analysis - Instance - %v, Code - %v, LastCheckValid - %v, ReplStopped - %v", analysis.AnalyzedInstanceAlias, analysis.Analysis, analysis.LastCheckValid, analysis.ReplicationStopped)
}

result = postProcessAnalyses(result, clusters)

if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vtorc/inst/audit_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func AuditOperation(auditType string, tabletAlias string, message string) error
return err
}
}
logMessage := fmt.Sprintf("auditType:%s instance:%s keyspace:%s shard:%s message:%s", auditType, tabletAlias, keyspace, shard, message)
logMessage := fmt.Sprintf("auditType:%s alias:%s keyspace:%s shard:%s message:%s", auditType, tabletAlias, keyspace, shard, message)
if syslogWriter != nil {
auditWrittenToFile = true
go func() {
Expand Down
4 changes: 2 additions & 2 deletions go/vt/vtorc/inst/instance_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ func ReadTopologyInstanceBufferable(tabletAlias string, latency *stopwatch.Named
instance.SecondsBehindPrimary.Int64 = int64(fullStatus.ReplicationStatus.ReplicationLagSeconds)
}
if instance.SecondsBehindPrimary.Valid && instance.SecondsBehindPrimary.Int64 < 0 {
log.Warningf("Host: %+v, instance.ReplicationLagSeconds < 0 [%+v], correcting to 0", tabletAlias, instance.SecondsBehindPrimary.Int64)
log.Warningf("Alias: %+v, instance.SecondsBehindPrimary < 0 [%+v], correcting to 0", tabletAlias, instance.SecondsBehindPrimary.Int64)
instance.SecondsBehindPrimary.Int64 = 0
}
// And until told otherwise:
Expand Down Expand Up @@ -1089,7 +1089,7 @@ func ForgetInstance(tabletAlias string) error {
return err
}
if rows == 0 {
errMsg := fmt.Sprintf("ForgetInstance(): instance %+v not found", tabletAlias)
errMsg := fmt.Sprintf("ForgetInstance(): tablet %+v not found", tabletAlias)
log.Errorf(errMsg)
return fmt.Errorf(errMsg)
}
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vtorc/inst/instance_dao_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@ func TestForgetInstanceAndInstanceIsForgotten(t *testing.T) {
{
name: "Unknown tablet",
tabletAlias: "unknown-tablet",
errExpected: "ForgetInstance(): instance unknown-tablet not found",
errExpected: "ForgetInstance(): tablet unknown-tablet not found",
instanceForgotten: true,
tabletsExpected: []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"},
}, {
Expand Down
4 changes: 2 additions & 2 deletions go/vt/vtorc/logic/tablet_discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -345,12 +345,12 @@ func restartReplication(replicaAlias string) error {
defer cancel()
err = tmc.StopReplication(ctx, replicaTablet)
if err != nil {
log.Info("Could not stop replication on %v", topoproto.TabletAliasString(replicaTablet.Alias))
log.Info("Could not stop replication on %v", replicaAlias)
return err
}
err = tmc.StartReplication(ctx, replicaTablet, reparentutil.IsReplicaSemiSync(durabilityPolicy, primaryTablet, replicaTablet))
if err != nil {
log.Info("Could not start replication on %v", topoproto.TabletAliasString(replicaTablet.Alias))
log.Info("Could not start replication on %v", replicaAlias)
return err
}
return nil
Expand Down
25 changes: 12 additions & 13 deletions go/vt/vtorc/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry *inst.Replicati
_ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixPrimaryHasPrimary.", analysisEntry.AnalyzedInstanceAlias))
return false, nil, err
}
log.Infof("Analysis: %v, will fix incorrect primaryship %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
log.Infof("Analysis: %v, will fix incorrect primaryship on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
// This has to be done in the end; whether successful or not, we should mark that the recovery is done.
// So that after the active period passes, we are able to run other recoveries.
defer func() {
Expand Down Expand Up @@ -555,7 +555,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
// Unhandled problem type
if analysisEntry.Analysis != inst.NoProblem {
if util.ClearToLog("executeCheckAndRecoverFunction", analysisEntry.AnalyzedInstanceAlias) {
log.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: %+v; key: %+v",
log.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: %+v; tablet: %+v",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
}
}
Expand Down Expand Up @@ -585,9 +585,8 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
// Unexpected. Shouldn't get this
log.Errorf("Unable to determine if recovery is disabled globally: %v", err)
} else if recoveryDisabledGlobally {
log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+
"skipProcesses: %v: NOT Recovering host (disabled globally)",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false)
log.Infof("CheckAndRecover: Analysis: %+v, Tablet: %+v: NOT Recovering host (disabled globally)",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)

return err
}
Expand All @@ -604,7 +603,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
// changes, we should be checking that this failure is indeed needed to be fixed. We do this after locking the shard to be sure
// that the data that we use now is up-to-date.
if isActionableRecovery {
log.Errorf("executeCheckAndRecoverFunction: Proceeding with %v recovery validation after acquiring shard lock.", analysisEntry.Analysis)
log.Errorf("executeCheckAndRecoverFunction: Proceeding with %v recovery on %v validation after acquiring shard lock.", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
// The first step we have to do is refresh the keyspace information
// This is required to know if the durability policies have changed or not
// If they have, then recoveries like ReplicaSemiSyncMustNotBeSet, etc won't be valid anymore
Expand Down Expand Up @@ -634,8 +633,8 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
DiscoverInstance(analysisEntry.AnalyzedInstanceAlias, true)
primaryTablet, err := shardPrimary(analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard)
if err != nil {
log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while finding the shard primary: %v",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false, err)
log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, Tablet: %+v: error while finding the shard primary: %v",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, err)
return err
}
primaryTabletAlias := topoproto.TabletAliasString(primaryTablet.Alias)
Expand All @@ -647,19 +646,19 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
}
alreadyFixed, err := checkIfAlreadyFixed(analysisEntry)
if err != nil {
log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while trying to find if the problem is already fixed: %v",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, nil, false, err)
log.Errorf("executeCheckAndRecoverFunction: Analysis: %+v, Tablet: %+v: error while trying to find if the problem is already fixed: %v",
analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, err)
return err
}
if alreadyFixed {
log.Infof("Analysis: %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis)
log.Infof("Analysis: %v on tablet %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
return nil
}
}

// Actually attempt recovery:
if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceAlias) {
log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery, false)
log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery)
}
recoveryAttempted, topologyRecovery, err := getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry)
if !recoveryAttempted {
Expand Down Expand Up @@ -863,7 +862,7 @@ func fixReplica(ctx context.Context, analysisEntry *inst.ReplicationAnalysis) (r

err = setReadOnly(ctx, analyzedTablet)
if err != nil {
log.Info("Could not set the tablet %v to readonly - %v", topoproto.TabletAliasString(analyzedTablet.Alias), err)
log.Info("Could not set the tablet %v to readonly - %v", analysisEntry.AnalyzedInstanceAlias, err)
return true, topologyRecovery, err
}

Expand Down