Skip to content

Commit

Permalink
repairReplication deadlock fix
Browse files Browse the repository at this point in the history
  • Loading branch information
vmogilev committed Jan 17, 2024
1 parent 0252167 commit 948589a
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 13 deletions.
8 changes: 3 additions & 5 deletions go/vt/vttablet/tabletmanager/replmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,30 +103,28 @@ func (rm *replManager) check() {

func (rm *replManager) checkActionLocked() {
status, err := rm.tm.MysqlDaemon.ReplicationStatus()
// log.Infof("vm-debug: %s", spew.Sdump(status))
if err != nil {
log.Infof("vm-debug: %v", err)
log.Infof("slack-debug: %v", err)
if err != mysql.ErrNotReplica {
return
}
} else {
// If only one of the threads is stopped, it's probably
// intentional. So, we don't repair replication.
if status.SQLHealthy() || status.IOHealthy() {
// log.Infof("vm-debug: status.SQLHealthy:%v status.IOHealthy:%v", status.SQLHealthy(), status.IOHealthy())

return
}
}

log.Infof("vm-debug: rm.failed=%v", rm.failed)
log.Infof("slack-debug: rm.failed=%v", rm.failed)
if !rm.failed {
log.Infof("Replication is stopped, reconnecting to primary.")
}
ctx, cancel := context.WithTimeout(rm.ctx, 5*time.Second)
defer cancel()
if err := rm.tm.repairReplication(ctx); err != nil {
log.Infof("vm-debug: repairReplication failed with=%v", err)
log.Infof("slack-debug: repairReplication failed with=%v", err)
if !rm.failed {
rm.failed = true
log.Infof("Failed to reconnect to primary: %v, will keep retrying.", err)
Expand Down
32 changes: 24 additions & 8 deletions go/vt/vttablet/tabletmanager/rpc_replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -685,7 +685,13 @@ func (tm *TabletManager) SetReplicationSource(ctx context.Context, parentAlias *
if err := tm.lock(ctx); err != nil {
return err
}
defer tm.unlock()

tm._isSetReplicationSourceRunning = true

defer func() {
tm._isSetReplicationSourceRunning = false
tm.unlock()
}()

// setReplicationSourceLocked also fixes the semi-sync. In case the tablet type is primary it assumes that it will become a replica if SetReplicationSource
// is called, so we always call fixSemiSync with a non-primary tablet type. This will always set the source side replication to false.
Expand All @@ -703,7 +709,7 @@ func (tm *TabletManager) setReplicationSourceRepairReplication(ctx context.Conte
return err
}

log.Infof("vm-debug: calling tm.TopoServer.LockShard ctx=%s", spew.Sdump(ctx))
log.Infof("slack-debug: calling tm.TopoServer.LockShard ctx=%s", spew.Sdump(ctx))
ctx, unlock, lockErr := tm.TopoServer.LockShard(ctx, parent.Tablet.GetKeyspace(), parent.Tablet.GetShard(), fmt.Sprintf("repairReplication to %v as parent)", topoproto.TabletAliasString(parentAlias)))
if lockErr != nil {
return lockErr
Expand Down Expand Up @@ -746,7 +752,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA
// unintentionally change the type of RDONLY tablets
tablet := tm.Tablet()
if tablet.Type == topodatapb.TabletType_PRIMARY {
log.Infof("vm-debug: calling tm.tmState.ChangeTabletType")
log.Infof("slack-debug: calling tm.tmState.ChangeTabletType")
if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_REPLICA, DBActionNone); err != nil {
return err
}
Expand All @@ -757,7 +763,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA
shouldbeReplicating := false
status, err := tm.MysqlDaemon.ReplicationStatus()
if err == mysql.ErrNotReplica {
log.Infof("vm-debug: err == mysql.ErrNotReplica")
log.Infof("slack-debug: err == mysql.ErrNotReplica")
// This is a special error that means we actually succeeded in reading
// the status, but the status is empty because replication is not
// configured. We assume this means we used to be a primary, so we always
Expand All @@ -784,7 +790,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA
if tabletType == topodatapb.TabletType_PRIMARY {
tabletType = topodatapb.TabletType_REPLICA
}
log.Infof("vm-debug: calling tm.fixSemiSync")
log.Infof("slack-debug: calling tm.fixSemiSync")
if err := tm.fixSemiSync(tabletType, semiSync); err != nil {
return err
}
Expand All @@ -801,7 +807,7 @@ func (tm *TabletManager) setReplicationSourceLocked(ctx context.Context, parentA
host := parent.Tablet.MysqlHostname
port := int(parent.Tablet.MysqlPort)
if status.SourceHost != host || status.SourcePort != port {
log.Infof("vm-debug: calling tm.MysqlDaemon.SetReplicationSource")
log.Infof("slack-debug: calling tm.MysqlDaemon.SetReplicationSource")
// This handles both changing the address and starting replication.
if err := tm.MysqlDaemon.SetReplicationSource(ctx, host, port, wasReplicating, shouldbeReplicating); err != nil {
if err := tm.handleRelayLogError(err); err != nil {
Expand Down Expand Up @@ -1149,7 +1155,17 @@ func (tm *TabletManager) handleRelayLogError(err error) error {
// repairReplication tries to connect this server to whoever is
// the current primary of the shard, and start replicating.
func (tm *TabletManager) repairReplication(ctx context.Context) error {
log.Infof("vm-debug: entering repairReplication")
log.Infof("slack-debug: entering repairReplication")

if tm._isSetReplicationSourceRunning {
// we are actively setting replication source,
// repairReplication will block due to higher
// authority holding a shard lock (PRS on vtctld)
log.Infof("slack-debug: we are actively setting replication source, exiting")

return nil
}

tablet := tm.Tablet()

si, err := tm.TopoServer.GetShard(ctx, tablet.Keyspace, tablet.Shard)
Expand All @@ -1170,7 +1186,7 @@ func (tm *TabletManager) repairReplication(ctx context.Context) error {

// If Orchestrator is configured and if Orchestrator is actively reparenting, we should not repairReplication
if tm.orc != nil {
log.Infof("vm-debug: tm.orc != nil")
log.Infof("slack-debug: tm.orc != nil")
re, err := tm.orc.InActiveShardRecovery(tablet)
if err != nil {
return err
Expand Down
2 changes: 2 additions & 0 deletions go/vt/vttablet/tabletmanager/tm_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ type TabletManager struct {
_lockTablesTimer *time.Timer
// _isBackupRunning tells us whether there is a backup that is currently running
_isBackupRunning bool
// _isSetReplicationSourceRunning indicates we are actively running SetReplicationSource
_isSetReplicationSourceRunning bool
}

// BuildTabletFromInput builds a tablet record from input parameters.
Expand Down

0 comments on commit 948589a

Please sign in to comment.