From 6e6f4634781e066318e67ebc90d96284f0eed96b Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Mon, 24 Aug 2020 19:27:08 -0700 Subject: [PATCH 1/2] tm: add more logging to checkMastership Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletmanager/tm_init.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index c06161f97b2..10126e6f34a 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -459,6 +459,7 @@ func (tm *TabletManager) checkMastership(ctx context.Context, si *topo.ShardInfo case topo.IsErrType(err, topo.NoNode): // There's no existing tablet record, so we can assume // no one has left us a message to step down. + log.Infof("Shard master alias matches, but there is no existing tablet record. Switching to master with 'Now' as time") tm.tmState.UpdateTablet(func(tablet *topodatapb.Tablet) { tablet.Type = topodatapb.TabletType_MASTER // Update the master term start time (current value is 0) because we @@ -468,6 +469,7 @@ func (tm *TabletManager) checkMastership(ctx context.Context, si *topo.ShardInfo }) case err == nil: if oldTablet.Type == topodatapb.TabletType_MASTER { + log.Infof("Shard master alias matches, and existing tablet agrees. Switching to master with tablet's master term start time: %v", oldTablet.MasterTermStartTime) // We're marked as master in the shard record, // and our existing tablet record agrees. tm.tmState.UpdateTablet(func(tablet *topodatapb.Tablet) { @@ -490,6 +492,7 @@ func (tm *TabletManager) checkMastership(ctx context.Context, si *topo.ShardInfo oldMasterTermStartTime := oldTablet.GetMasterTermStartTime() currentShardTime := si.GetMasterTermStartTime() if oldMasterTermStartTime.After(currentShardTime) { + log.Infof("Shard master alias does not match, but the tablet's master term start time is newer. Switching to master with tablet's master term start time: %v", oldTablet.MasterTermStartTime) tm.tmState.UpdateTablet(func(tablet *topodatapb.Tablet) { tablet.Type = topodatapb.TabletType_MASTER tablet.MasterTermStartTime = oldTablet.MasterTermStartTime From 5fc799294891ac88937f4aa36c78002c9dd0f84e Mon Sep 17 00:00:00 2001 From: Sugu Sougoumarane Date: Mon, 24 Aug 2020 20:13:51 -0700 Subject: [PATCH 2/2] tm: checkMastership: handle more corner cases Signed-off-by: Sugu Sougoumarane --- go/vt/vttablet/tabletmanager/tm_init.go | 8 ++++++ go/vt/vttablet/tabletmanager/tm_init_test.go | 29 ++++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go index 10126e6f34a..5bfae9aa2a7 100644 --- a/go/vt/vttablet/tabletmanager/tm_init.go +++ b/go/vt/vttablet/tabletmanager/tm_init.go @@ -476,6 +476,12 @@ func (tm *TabletManager) checkMastership(ctx context.Context, si *topo.ShardInfo tablet.Type = topodatapb.TabletType_MASTER tablet.MasterTermStartTime = oldTablet.MasterTermStartTime }) + } else { + log.Warningf("Shard master alias matches, but existing tablet is not master. Switching to master with the shard's master term start time: %v", oldTablet.MasterTermStartTime) + tm.tmState.UpdateTablet(func(tablet *topodatapb.Tablet) { + tablet.Type = topodatapb.TabletType_MASTER + tablet.MasterTermStartTime = si.MasterTermStartTime + }) } default: return vterrors.Wrap(err, "InitTablet failed to read existing tablet record") @@ -497,6 +503,8 @@ func (tm *TabletManager) checkMastership(ctx context.Context, si *topo.ShardInfo tablet.Type = topodatapb.TabletType_MASTER tablet.MasterTermStartTime = oldTablet.MasterTermStartTime }) + } else { + log.Infof("Existing tablet type is master, but the shard record has a different master with a newer timestamp. Remaining a replica") } } default: diff --git a/go/vt/vttablet/tabletmanager/tm_init_test.go b/go/vt/vttablet/tabletmanager/tm_init_test.go index 821c4a97505..81ef2d5cf74 100644 --- a/go/vt/vttablet/tabletmanager/tm_init_test.go +++ b/go/vt/vttablet/tabletmanager/tm_init_test.go @@ -218,9 +218,13 @@ func TestCheckMastership(t *testing.T) { // 2. Update shard's master to our alias, then try to init again. // (This simulates the case where the MasterAlias in the shard record says // that we are the master but the tablet record says otherwise. In that case, - // we assume we are not the MASTER.) + // we become master by inheriting the shard record's timestamp.) + now := time.Now() _, err = ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { si.MasterAlias = alias + si.MasterTermStartTime = logutil.TimeToProto(now) + // Reassign to now for easier comparison. + now = si.GetMasterTermStartTime() return nil }) require.NoError(t, err) @@ -228,9 +232,9 @@ func TestCheckMastership(t *testing.T) { require.NoError(t, err) ti, err = ts.GetTablet(ctx, alias) require.NoError(t, err) - assert.Equal(t, topodatapb.TabletType_REPLICA, ti.Type) + assert.Equal(t, topodatapb.TabletType_MASTER, ti.Type) ter0 := ti.GetMasterTermStartTime() - assert.True(t, ter0.IsZero()) + assert.Equal(t, now, ter0) tm.Stop() // 3. Delete the tablet record. The shard record still says that we are the @@ -291,6 +295,25 @@ func TestCheckMastership(t *testing.T) { ter4 := ti.GetMasterTermStartTime() assert.Equal(t, ter1, ter4) tm.Stop() + + // 7. If the shard record shows a different master with a newer + // timestamp, we remain replica. + _, err = ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { + si.MasterAlias = otherAlias + si.MasterTermStartTime = logutil.TimeToProto(ter4.Add(10 * time.Second)) + return nil + }) + require.NoError(t, err) + tablet.Type = topodatapb.TabletType_REPLICA + tablet.MasterTermStartTime = nil + err = tm.Start(tablet, 0) + require.NoError(t, err) + ti, err = ts.GetTablet(ctx, alias) + require.NoError(t, err) + assert.Equal(t, topodatapb.TabletType_REPLICA, ti.Type) + ter5 := ti.GetMasterTermStartTime() + assert.True(t, ter5.IsZero()) + tm.Stop() } func TestStartCheckMysql(t *testing.T) {