-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve Tablet Refresh Behavior in VReplication Traffic Switch Handling #10058
Changes from 10 commits
8889e16
8ca8690
e10b2b9
79ca4e7
c6d0505
34c99a0
ead2082
1e1696a
abe6c2f
18f9573
96b96bb
0646033
90f656f
8295a72
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -354,7 +354,7 @@ func (wr *Wrangler) cancelHorizontalResharding(ctx context.Context, keyspace, sh | |
|
||
destinationShards[i] = updatedShard | ||
|
||
if _, err := topotools.RefreshTabletsByShard(ctx, wr.ts, wr.tmc, si, nil, wr.Logger()); err != nil { | ||
if _, _, err := topotools.RefreshTabletsByShard(ctx, wr.ts, wr.tmc, si, nil, wr.Logger()); err != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't make sense to return an error when canceling. |
||
return err | ||
} | ||
} | ||
|
@@ -442,7 +442,7 @@ func (wr *Wrangler) MigrateServedTypes(ctx context.Context, keyspace, shard stri | |
refreshShards = destinationShards | ||
} | ||
for _, si := range refreshShards { | ||
_, err := topotools.RefreshTabletsByShard(ctx, wr.ts, wr.tmc, si, cells, wr.Logger()) | ||
_, _, err := topotools.RefreshTabletsByShard(ctx, wr.ts, wr.tmc, si, cells, wr.Logger()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this is past the point of no return. |
||
rec.RecordError(err) | ||
} | ||
return rec.Error() | ||
|
@@ -792,7 +792,7 @@ func (wr *Wrangler) masterMigrateServedType(ctx context.Context, keyspace string | |
} | ||
|
||
for _, si := range destinationShards { | ||
if _, err := topotools.RefreshTabletsByShard(ctx, wr.ts, wr.tmc, si, nil, wr.Logger()); err != nil { | ||
if _, _, err := topotools.RefreshTabletsByShard(ctx, wr.ts, wr.tmc, si, nil, wr.Logger()); err != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this is past the point of no return. |
||
return err | ||
} | ||
} | ||
|
@@ -1226,7 +1226,7 @@ func (wr *Wrangler) replicaMigrateServedFrom(ctx context.Context, ki *topo.Keysp | |
|
||
// Now refresh the source servers so they reload the denylist | ||
event.DispatchUpdate(ev, "refreshing sources tablets state so they update their denied tables") | ||
_, err := topotools.RefreshTabletsByShard(ctx, wr.ts, wr.tmc, sourceShard, cells, wr.Logger()) | ||
_, _, err := topotools.RefreshTabletsByShard(ctx, wr.ts, wr.tmc, sourceShard, cells, wr.Logger()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this is past the point of no return. |
||
return err | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,6 +68,16 @@ const ( | |
lockTablesCycles = 2 | ||
// time to wait between LOCK TABLES cycles on the sources during SwitchWrites | ||
lockTablesCycleDelay = time.Duration(100 * time.Millisecond) | ||
|
||
// How long to wait when refreshing the state of each tablet in a shard. Note that these | ||
// are refreshed in parallel, non-topo errors are ignored (in the error handling) and we | ||
// may only do a partial refresh. Because in some cases it's unsafe to switch the traffic | ||
// if some tablets do not refresh, we may need to look for partial results and produce | ||
// an error (with the provided details of WHY) if we see them. | ||
// Side note: the default lock/lease TTL in etcd is 60s so the default tablet refresh | ||
// timeout of 60s can cause us to lose our keyspace lock before completing the | ||
// operation too. | ||
shardTabletRefreshTimeout = time.Duration(30 * time.Second) | ||
) | ||
|
||
// trafficSwitcher contains the metadata for switching read and write traffic | ||
|
@@ -1015,7 +1025,9 @@ func (ts *trafficSwitcher) changeTableSourceWrites(ctx context.Context, access a | |
}); err != nil { | ||
return err | ||
} | ||
_, err := topotools.RefreshTabletsByShard(ctx, ts.TopoServer(), ts.TabletManagerClient(), source.GetShard(), nil, ts.Logger()) | ||
rtbsCtx, cancel := context.WithTimeout(ctx, shardTabletRefreshTimeout) | ||
defer cancel() | ||
_, _, err := topotools.RefreshTabletsByShard(rtbsCtx, ts.TopoServer(), ts.TabletManagerClient(), source.GetShard(), nil, ts.Logger()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a safe/good place to return an error if we could not refresh all tablets as its early on in the operation. I'll work on this one. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sgtm There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done: 90f656f |
||
return err | ||
}) | ||
} | ||
|
@@ -1283,7 +1295,9 @@ func (ts *trafficSwitcher) allowTableTargetWrites(ctx context.Context) error { | |
}); err != nil { | ||
return err | ||
} | ||
_, err := topotools.RefreshTabletsByShard(ctx, ts.TopoServer(), ts.TabletManagerClient(), target.GetShard(), nil, ts.Logger()) | ||
rtbsCtx, cancel := context.WithTimeout(ctx, shardTabletRefreshTimeout) | ||
defer cancel() | ||
_, _, err := topotools.RefreshTabletsByShard(rtbsCtx, ts.TopoServer(), ts.TabletManagerClient(), target.GetShard(), nil, ts.Logger()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is after the point of no return. |
||
return err | ||
}) | ||
} | ||
|
@@ -1389,7 +1403,9 @@ func (ts *trafficSwitcher) dropSourceDeniedTables(ctx context.Context) error { | |
}); err != nil { | ||
return err | ||
} | ||
_, err := topotools.RefreshTabletsByShard(ctx, ts.TopoServer(), ts.TabletManagerClient(), source.GetShard(), nil, ts.Logger()) | ||
rtbsCtx, cancel := context.WithTimeout(ctx, shardTabletRefreshTimeout) | ||
defer cancel() | ||
_, _, err := topotools.RefreshTabletsByShard(rtbsCtx, ts.TopoServer(), ts.TabletManagerClient(), source.GetShard(), nil, ts.Logger()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is called when canceling. |
||
return err | ||
}) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is called from a number of places via
Wrangler.updateShardRecords()
. I don't think that we can change the partial handling here w/o other changes.