diff --git a/examples/backups/restart_tablets.sh b/examples/backups/restart_tablets.sh index bfafcf26d4f..de812a0ea8e 100755 --- a/examples/backups/restart_tablets.sh +++ b/examples/backups/restart_tablets.sh @@ -35,9 +35,9 @@ for i in 300 301 302; do done sleep 5 -# Wait for all the replica tablets to be in the serving state before initiating -# InitShardPrimary. This is essential, since we want the RESTORE phase to be -# complete before we start InitShardPrimary, otherwise we end up reading the +# Wait for all the tablets to be in the serving state before initiating +# PlannedReparentShard. This is essential, since we want the RESTORE phase to be +# complete before we start PlannedReparentShard, otherwise we end up reading the # tablet type to RESTORE and do not set semi-sync, which leads to the primary # hanging on writes. totalTime=600 @@ -50,6 +50,15 @@ for i in 101 201 301; do done done +for i in 102 202 302; do + while [ $totalTime -gt 0 ]; do + status=$(curl "http://$hostname:15$i/debug/status_details") + echo "$status" | grep "RDONLY: Serving" && break + totalTime=$((totalTime-1)) + sleep 0.1 + done +done + # Check that all the replica tablets have reached REPLICA: Serving state for i in 101 201 301; do status=$(curl "http://$hostname:15$i/debug/status_details") @@ -57,6 +66,13 @@ for i in 101 201 301; do echo "tablet-$i did not reach REPLICA: Serving state. Exiting due to failure." exit 1 done +# Check that all the rdonly tablets have reached RDONLY: Serving state +for i in 102 202 302; do + status=$(curl "http://$hostname:15$i/debug/status_details") + echo "$status" | grep "RDONLY: Serving" && continue + echo "tablet-$i did not reach RDONLY: Serving state. Exiting due to failure." + exit 1 +done vtctldclient PlannedReparentShard commerce/0 --new-primary "zone1-100" vtctldclient PlannedReparentShard customer/-80 --new-primary "zone1-200" diff --git a/go/test/endtoend/reparent/plannedreparent/reparent_test.go b/go/test/endtoend/reparent/plannedreparent/reparent_test.go index 59734bce57a..7a0b16f5890 100644 --- a/go/test/endtoend/reparent/plannedreparent/reparent_test.go +++ b/go/test/endtoend/reparent/plannedreparent/reparent_test.go @@ -123,9 +123,16 @@ func TestReparentReplicaOffline(t *testing.T) { // Perform a graceful reparent operation. out, err := utils.PrsWithTimeout(t, clusterInstance, tablets[1], false, "", "31s") require.Error(t, err) - assert.True(t, utils.SetReplicationSourceFailed(tablets[3], out)) - utils.CheckPrimaryTablet(t, clusterInstance, tablets[1]) + // Assert that PRS failed + if clusterInstance.VtctlMajorVersion <= 17 { + assert.True(t, utils.SetReplicationSourceFailed(tablets[3], out)) + utils.CheckPrimaryTablet(t, clusterInstance, tablets[1]) + } else { + assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc") + utils.CheckPrimaryTablet(t, clusterInstance, tablets[0]) + } + } func TestReparentAvoid(t *testing.T) { @@ -155,7 +162,11 @@ func TestReparentAvoid(t *testing.T) { utils.StopTablet(t, tablets[0], true) out, err := utils.PrsAvoid(t, clusterInstance, tablets[1]) require.Error(t, err) - assert.Contains(t, out, "cannot find a tablet to reparent to in the same cell as the current primary") + if clusterInstance.VtctlMajorVersion <= 17 { + assert.Contains(t, out, "cannot find a tablet to reparent to in the same cell as the current primary") + } else { + assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc = latest balancer error") + } utils.ValidateTopology(t, clusterInstance, false) utils.CheckPrimaryTablet(t, clusterInstance, tablets[1]) } @@ -275,17 +286,24 @@ func TestReparentWithDownReplica(t *testing.T) { // Perform a graceful reparent operation. It will fail as one tablet is down. out, err := utils.Prs(t, clusterInstance, tablets[1]) require.Error(t, err) - assert.True(t, utils.SetReplicationSourceFailed(tablets[2], out)) - - // insert data into the new primary, check the connected replica work - insertVal := utils.ConfirmReplication(t, tablets[1], []*cluster.Vttablet{tablets[0], tablets[3]}) + var insertVal int + // Assert that PRS failed + if clusterInstance.VtctlMajorVersion <= 17 { + assert.True(t, utils.SetReplicationSourceFailed(tablets[2], out)) + // insert data into the new primary, check the connected replica work + insertVal = utils.ConfirmReplication(t, tablets[1], []*cluster.Vttablet{tablets[0], tablets[3]}) + } else { + assert.Contains(t, out, fmt.Sprintf("TabletManager.PrimaryStatus on %s error", tablets[2].Alias)) + // insert data into the old primary, check the connected replica works. The primary tablet shouldn't have changed. + insertVal = utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[3]}) + } // restart mysql on the old replica, should still be connecting to the old primary tablets[2].MysqlctlProcess.InitMysql = false err = tablets[2].MysqlctlProcess.Start() require.NoError(t, err) - // Use the same PlannedReparentShard command to fix up the tablet. + // Use the same PlannedReparentShard command to promote the new primary. _, err = utils.Prs(t, clusterInstance, tablets[1]) require.NoError(t, err) diff --git a/go/vt/vtctl/grpcvtctldserver/server_slow_test.go b/go/vt/vtctl/grpcvtctldserver/server_slow_test.go index 858ac271a70..50ca8ea82be 100644 --- a/go/vt/vtctl/grpcvtctldserver/server_slow_test.go +++ b/go/vt/vtctl/grpcvtctldserver/server_slow_test.go @@ -402,6 +402,21 @@ func TestPlannedReparentShardSlow(t *testing.T) { Error: nil, }, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000101": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, PrimaryPositionResults: map[string]struct { Position string Error error @@ -505,6 +520,21 @@ func TestPlannedReparentShardSlow(t *testing.T) { Error: nil, }, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000101": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, PrimaryPositionResults: map[string]struct { Position string Error error diff --git a/go/vt/vtctl/grpcvtctldserver/server_test.go b/go/vt/vtctl/grpcvtctldserver/server_test.go index 48b53c53c05..5a25e074290 100644 --- a/go/vt/vtctl/grpcvtctldserver/server_test.go +++ b/go/vt/vtctl/grpcvtctldserver/server_test.go @@ -6249,6 +6249,21 @@ func TestPlannedReparentShard(t *testing.T) { Error: nil, }, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000101": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, PrimaryPositionResults: map[string]struct { Position string Error error diff --git a/go/vt/vtctl/grpcvtctldserver/testutil/test_tmclient.go b/go/vt/vtctl/grpcvtctldserver/testutil/test_tmclient.go index b824833a7dc..9426f1327b2 100644 --- a/go/vt/vtctl/grpcvtctldserver/testutil/test_tmclient.go +++ b/go/vt/vtctl/grpcvtctldserver/testutil/test_tmclient.go @@ -281,6 +281,11 @@ type TabletManagerClient struct { Position *replicationdatapb.Status Error error } + PrimaryStatusDelays map[string]time.Duration + PrimaryStatusResults map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + } RestoreFromBackupResults map[string]struct { Events []*logutilpb.Event EventInterval time.Duration @@ -870,6 +875,32 @@ func (fake *TabletManagerClient) ReplicationStatus(ctx context.Context, tablet * return nil, assert.AnError } +// PrimaryStatus is part of the tmclient.TabletManagerClient interface. +func (fake *TabletManagerClient) PrimaryStatus(ctx context.Context, tablet *topodatapb.Tablet) (*replicationdatapb.PrimaryStatus, error) { + if fake.PrimaryStatusResults == nil { + return nil, assert.AnError + } + + key := topoproto.TabletAliasString(tablet.Alias) + + if fake.PrimaryStatusDelays != nil { + if delay, ok := fake.PrimaryStatusDelays[key]; ok { + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(delay): + // proceed to results + } + } + } + + if result, ok := fake.PrimaryStatusResults[key]; ok { + return result.Status, result.Error + } + + return nil, assert.AnError +} + type backupRestoreStreamAdapter struct { *grpcshim.BidiStream ch chan *logutilpb.Event diff --git a/go/vt/vtctl/reparentutil/planned_reparenter.go b/go/vt/vtctl/reparentutil/planned_reparenter.go index 8633afa13d0..8a27fd8c4c2 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter.go @@ -22,6 +22,7 @@ import ( "sync" "time" + "golang.org/x/sync/errgroup" "google.golang.org/protobuf/proto" "vitess.io/vitess/go/event" @@ -518,6 +519,11 @@ func (pr *PlannedReparenter) reparentShardLocked( return err } + err = pr.verifyAllTabletsReachable(ctx, tabletMap) + if err != nil { + return err + } + // Check invariants that PlannedReparentShard depends on. if isNoop, err := pr.preflightChecks(ctx, ev, keyspace, shard, tabletMap, &opts); err != nil { return err @@ -572,12 +578,12 @@ func (pr *PlannedReparenter) reparentShardLocked( // inserted in the new primary's journal, so we can use it below to check // that all the replicas have attached to new primary successfully. switch { - case currentPrimary == nil && ev.ShardInfo.PrimaryAlias == nil: + case currentPrimary == nil && ev.ShardInfo.PrimaryTermStartTime == nil: // Case (1): no primary has been elected ever. Initialize // the primary-elect tablet reparentJournalPos, err = pr.performInitialPromotion(ctx, ev.NewPrimary, opts) needsRefresh = true - case currentPrimary == nil && ev.ShardInfo.PrimaryAlias != nil: + case currentPrimary == nil && ev.ShardInfo.PrimaryTermStartTime != nil: // Case (2): no clear current primary. Try to find a safe promotion // candidate, and promote to it. err = pr.performPotentialPromotion(ctx, keyspace, shard, ev.NewPrimary, tabletMap, opts) @@ -713,3 +719,20 @@ func (pr *PlannedReparenter) reparentTablets( return nil } + +// verifyAllTabletsReachable verifies that all the tablets are reachable when running PRS. +func (pr *PlannedReparenter) verifyAllTabletsReachable(ctx context.Context, tabletMap map[string]*topo.TabletInfo) error { + // Create a cancellable context for the entire set of RPCs to verify reachability. + verifyCtx, verifyCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) + defer verifyCancel() + + errorGroup, groupCtx := errgroup.WithContext(verifyCtx) + for _, info := range tabletMap { + tablet := info.Tablet + errorGroup.Go(func() error { + _, err := pr.tmc.PrimaryStatus(groupCtx, tablet) + return err + }) + } + return errorGroup.Wait() +} diff --git a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go index 20815db3dfc..c37fca9a37c 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go @@ -112,6 +112,18 @@ func TestPlannedReparenter_ReparentShard(t *testing.T) { SetReadWriteResults: map[string]error{ "zone1-0000000100": nil, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, }, tablets: []*topodatapb.Tablet{ { @@ -222,6 +234,18 @@ func TestPlannedReparenter_ReparentShard(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000200": nil, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, }, tablets: []*topodatapb.Tablet{ { @@ -300,6 +324,18 @@ func TestPlannedReparenter_ReparentShard(t *testing.T) { SetReadWriteResults: map[string]error{ "zone1-0000000100": nil, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, }, tablets: []*topodatapb.Tablet{ { @@ -360,7 +396,17 @@ func TestPlannedReparenter_ReparentShard(t *testing.T) { // thoroughly to cover all the cases. name: "reparent fails", ts: memorytopo.NewServer("zone1"), - tmc: nil, + tmc: &testutil.TabletManagerClient{ + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, tablets: []*topodatapb.Tablet{ { Alias: &topodatapb.TabletAlias{ @@ -2324,6 +2370,18 @@ func TestPlannedReparenter_reparentShardLocked(t *testing.T) { Error: nil, }, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, PopulateReparentJournalResults: map[string]error{ "zone1-0000000200": nil, // zone1-200 gets promoted }, @@ -2405,6 +2463,18 @@ func TestPlannedReparenter_reparentShardLocked(t *testing.T) { SetReadWriteResults: map[string]error{ "zone1-0000000100": nil, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, }, tablets: []*topodatapb.Tablet{ { @@ -2474,6 +2544,18 @@ func TestPlannedReparenter_reparentShardLocked(t *testing.T) { Error: nil, }, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, PrimaryPositionResults: map[string]struct { Position string Error error @@ -2576,6 +2658,18 @@ func TestPlannedReparenter_reparentShardLocked(t *testing.T) { SetReplicationSourceResults: map[string]error{ "zone1-0000000100": nil, // called during reparentTablets to make this tablet a replica of newPrimary }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, }, tablets: []*topodatapb.Tablet{ // Shard has no current primary in the beginning. @@ -2647,15 +2741,29 @@ func TestPlannedReparenter_reparentShardLocked(t *testing.T) { Error error }{ "zone1-0000000200": { - Error: mysql.ErrNotReplica, + Position: &replicationdatapb.Status{ + Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-2", + }, }, "zone1-0000000100": { - Error: fmt.Errorf("not providing replication status, so that 200 wins"), + Error: mysql.ErrNotReplica, }, }, SetReplicationSourceResults: map[string]error{ "zone1-0000000100": nil, // called during reparentTablets to make this tablet a replica of newPrimary }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, }, tablets: []*topodatapb.Tablet{ // Shard has no current primary in the beginning. @@ -2703,7 +2811,20 @@ func TestPlannedReparenter_reparentShardLocked(t *testing.T) { { name: "preflight checks determine PRS is no-op", ts: memorytopo.NewServer("zone1"), - tmc: nil, + tmc: &testutil.TabletManagerClient{ + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, tablets: []*topodatapb.Tablet{ { Alias: &topodatapb.TabletAlias{ @@ -2755,6 +2876,18 @@ func TestPlannedReparenter_reparentShardLocked(t *testing.T) { SetReadWriteResults: map[string]error{ "zone1-0000000100": assert.AnError, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, }, tablets: []*topodatapb.Tablet{ { @@ -2812,6 +2945,18 @@ func TestPlannedReparenter_reparentShardLocked(t *testing.T) { name: "lost topology lock", ts: memorytopo.NewServer("zone1"), tmc: &testutil.TabletManagerClient{ + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, PrimaryPositionResults: map[string]struct { Position string Error error @@ -2892,6 +3037,18 @@ func TestPlannedReparenter_reparentShardLocked(t *testing.T) { Error: nil, }, }, + // This is only needed to verify reachability, so empty results are fine. + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": assert.AnError, }, @@ -3638,3 +3795,185 @@ func AssertReparentEventsEqual(t *testing.T, expected *events.Reparent, actual * AssertReparentEventsEqualWithMessage(t, expected, actual, "") } + +// TestPlannedReparenter_verifyAllTabletsReachable tests the functionality of verifyAllTabletsReachable. +func TestPlannedReparenter_verifyAllTabletsReachable(t *testing.T) { + tests := []struct { + name string + ts *topo.Server + tmc tmclient.TabletManagerClient + tabletMap map[string]*topo.TabletInfo + remoteOpTime time.Duration + wantErr string + }{ + { + name: "Success", + tmc: &testutil.TabletManagerClient{ + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000201": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "zone1-0000000200": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 200, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + "zone1-0000000201": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 201, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + }, { + name: "Failure", + tmc: &testutil.TabletManagerClient{ + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Error: fmt.Errorf("primary status failed"), + }, + "zone1-0000000201": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "zone1-0000000200": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 200, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + "zone1-0000000201": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 201, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + wantErr: "primary status failed", + }, { + name: "Timeout", + tmc: &testutil.TabletManagerClient{ + PrimaryStatusDelays: map[string]time.Duration{ + "zone1-0000000100": 20 * time.Second, + }, + PrimaryStatusResults: map[string]struct { + Status *replicationdatapb.PrimaryStatus + Error error + }{ + "zone1-0000000200": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000201": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + "zone1-0000000100": { + Status: &replicationdatapb.PrimaryStatus{}, + }, + }, + }, + remoteOpTime: 100 * time.Millisecond, + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "zone1-0000000200": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 200, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + "zone1-0000000201": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 201, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + wantErr: "context deadline exceeded", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pr := &PlannedReparenter{ + ts: tt.ts, + tmc: tt.tmc, + } + if tt.remoteOpTime != 0 { + oldTime := topo.RemoteOperationTimeout + topo.RemoteOperationTimeout = tt.remoteOpTime + defer func() { + topo.RemoteOperationTimeout = oldTime + }() + } + err := pr.verifyAllTabletsReachable(context.Background(), tt.tabletMap) + if tt.wantErr == "" { + require.NoError(t, err) + return + } + require.ErrorContains(t, err, tt.wantErr) + }) + } +} diff --git a/go/vt/vtctl/reparentutil/util.go b/go/vt/vtctl/reparentutil/util.go index f4cebc3dd7d..ee1bb36236c 100644 --- a/go/vt/vtctl/reparentutil/util.go +++ b/go/vt/vtctl/reparentutil/util.go @@ -22,6 +22,8 @@ import ( "sync" "time" + "golang.org/x/sync/errgroup" + "vitess.io/vitess/go/mysql" "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/log" @@ -66,12 +68,12 @@ func ChooseNewPrimary( } var ( - wg sync.WaitGroup // mutex to secure the next two fields from concurrent access mu sync.Mutex // tablets that are possible candidates to be the new primary and their positions - validTablets []*topodatapb.Tablet - tabletPositions []mysql.Position + validTablets []*topodatapb.Tablet + tabletPositions []mysql.Position + errorGroup, groupCtx = errgroup.WithContext(ctx) ) for _, tablet := range tabletMap { @@ -84,22 +86,24 @@ func ChooseNewPrimary( continue } - wg.Add(1) - - go func(tablet *topodatapb.Tablet) { - defer wg.Done() + tb := tablet.Tablet + errorGroup.Go(func() error { // find and store the positions for the tablet - pos, err := findPositionForTablet(ctx, tablet, logger, tmc, waitReplicasTimeout) + pos, err := findPositionForTablet(groupCtx, tb, logger, tmc, waitReplicasTimeout) mu.Lock() defer mu.Unlock() if err == nil { - validTablets = append(validTablets, tablet) + validTablets = append(validTablets, tb) tabletPositions = append(tabletPositions, pos) } - }(tablet.Tablet) + return err + }) } - wg.Wait() + err := errorGroup.Wait() + if err != nil { + return nil, err + } // return nothing if there are no valid tablets available if len(validTablets) == 0 { @@ -107,7 +111,7 @@ func ChooseNewPrimary( } // sort the tablets for finding the best primary - err := sortTabletsForReparent(validTablets, tabletPositions, durability) + err = sortTabletsForReparent(validTablets, tabletPositions, durability) if err != nil { return nil, err }