-
Notifications
You must be signed in to change notification settings - Fork 59
feat(split): replica server handle pause and cancel status #681
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,8 +68,9 @@ void replica_split_manager::parent_start_split( | |
return; | ||
} | ||
|
||
// TODO(heyuchen): if partition is primary, reset split related varieties | ||
|
||
if (status() == partition_status::PS_PRIMARY) { | ||
_replica->_primary_states.cleanup_split_states(); | ||
} | ||
_partition_version.store(_replica->_app_info.partition_count - 1); | ||
|
||
_split_status = split_status::SPLITTING; | ||
|
@@ -1144,6 +1145,17 @@ void replica_split_manager::trigger_primary_parent_split( | |
return; | ||
} | ||
|
||
if (meta_split_status == split_status::PAUSING || | ||
meta_split_status == split_status::CANCELING) { | ||
parent_stop_split(meta_split_status); | ||
return; | ||
} | ||
|
||
if (meta_split_status == split_status::PAUSED) { | ||
dwarn_replica("split has been paused, ignore it"); | ||
return; | ||
} | ||
|
||
// TODO(heyuchen): add other split_status check | ||
} | ||
|
||
|
@@ -1169,7 +1181,11 @@ void replica_split_manager::trigger_secondary_parent_split( | |
return; | ||
} | ||
|
||
// TODO(heyuchen): add other split_status check, response will be used in future | ||
if (request.meta_split_status == split_status::PAUSING || | ||
request.meta_split_status == split_status::CANCELING) { // secondary pause or cancel split | ||
parent_stop_split(request.meta_split_status); | ||
response.__set_is_split_stopped(true); | ||
} | ||
} | ||
|
||
// ThreadPool: THREAD_POOL_REPLICATION | ||
|
@@ -1257,5 +1273,81 @@ void replica_split_manager::on_copy_mutation_reply(error_code ec, | |
// TBD | ||
} | ||
|
||
// ThreadPool: THREAD_POOL_REPLICATION | ||
// Stops a split on this parent replica after the meta server reports a
// PAUSING or CANCELING split status.
//
// Preconditions (asserted): the replica is PRIMARY or SECONDARY, and its local
// split status is either SPLITTING or NOT_SPLIT.
//
// When the replica is still SPLITTING, the child replica is asked to handle a
// split error and the parent's split context is cleaned up. Regardless of the
// previous split status, the partition version is reset to
// `partition_count - 1`; a primary additionally clears
// `sync_send_write_request` and broadcasts a group check.
//
// meta_split_status: in this function it only selects the word ("pause" or
// "cancel") used in the final log line.
void replica_split_manager::parent_stop_split(
    split_status::type meta_split_status) // on parent partition
{
    dassert_replica(status() == partition_status::PS_PRIMARY ||
                        status() == partition_status::PS_SECONDARY,
                    "wrong partition_status({})",
                    enum_to_string(status()));
    dassert_replica(_split_status == split_status::SPLITTING ||
                        _split_status == split_status::NOT_SPLIT,
                    "wrong split_status({})",
                    enum_to_string(_split_status));

    // Remember the status we started with; it is only reported in the log below.
    const auto previous_split_status = _split_status;
    if (previous_split_status == split_status::SPLITTING) {
        // Let the child replica react to the stop, then drop the parent-side context.
        _stub->split_replica_error_handler(
            _child_gpid,
            std::bind(&replica_split_manager::child_handle_split_error,
                      std::placeholders::_1,
                      "stop partition split"));
        parent_cleanup_split_context();
    }
    _partition_version.store(_replica->_app_info.partition_count - 1);

    if (status() == partition_status::PS_PRIMARY) {
        _replica->_primary_states.sync_send_write_request = false;
        // NOTE(review): presumably this propagates the stop to the secondaries
        // through group check - confirm against broadcast_group_check().
        _replica->broadcast_group_check();
    }

    const char *action = "cancel";
    if (meta_split_status == split_status::PAUSING) {
        action = "pause";
    }
    const auto child_partition_index =
        get_gpid().get_partition_index() + _replica->_app_info.partition_count;
    ddebug_replica(
        "{} split succeed, status = {}, old split_status = {}, child partition_index = {}",
        action,
        enum_to_string(status()),
        enum_to_string(previous_split_status),
        child_partition_index);
}
|
||
// ThreadPool: THREAD_POOL_REPLICATION | ||
void replica_split_manager::primary_parent_handle_stop_split( | ||
const std::shared_ptr<group_check_request> &req, | ||
const std::shared_ptr<group_check_response> &resp) // on primary parent partition | ||
{ | ||
if (!req->__isset.meta_split_status || (req->meta_split_status != split_status::PAUSING && | ||
req->meta_split_status != split_status::CANCELING)) { | ||
// partition is not executing split or not stopping split | ||
return; | ||
} | ||
|
||
if (!resp->__isset.is_split_stopped || !resp->is_split_stopped) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
// secondary has not stopped split | ||
return; | ||
} | ||
|
||
_replica->_primary_states.split_stopped_secondary.insert(req->node); | ||
auto count = 0; | ||
for (auto &iter : _replica->_primary_states.statuses) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's safety to check it each time, because |
||
if (iter.second == partition_status::PS_SECONDARY && | ||
_replica->_primary_states.split_stopped_secondary.find(iter.first) != | ||
_replica->_primary_states.split_stopped_secondary.end()) { | ||
++count; | ||
} | ||
} | ||
// all secondaries have already stop split succeed | ||
if (count == _replica->_primary_states.membership.max_replica_count - 1) { | ||
_replica->_primary_states.cleanup_split_states(); | ||
parent_send_notify_stop_request(req->meta_split_status); | ||
} | ||
} | ||
|
||
// ThreadPool: THREAD_POOL_REPLICATION | ||
// Placeholder for the notification sent once the primary parent has stopped
// (paused or cancelled) a split. Not implemented yet: apart from the fail
// point hook below, the body does nothing.
//
// meta_split_status: the stop status (PAUSING or CANCELING) passed by the
// caller; currently unused.
void replica_split_manager::parent_send_notify_stop_request(
    split_status::type meta_split_status) // on primary parent
{
    // NOTE(review): presumably lets tests intercept this call by fail point
    // name - confirm against the FAIL_POINT_INJECT_F definition.
    FAIL_POINT_INJECT_F("replica_parent_send_notify_stop_request",
                        [](dsn::string_view /*unused*/) {});
    // TODO(hyc): TBD
}
|
||
} // namespace replication | ||
} // namespace dsn |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why allow stop under `NOT_SPLIT`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And if we send `cancel` or `pause` by mistake, will the dassert cause a crash? Is that what you expect?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When a learn happens, the parent partition may stop the split and set its `_split_status` to `NOT_SPLIT` without the meta server knowing. So it is possible that when a pause or cancel split request is synced to the parent partition, its split_status is already NOT_SPLIT.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pause and cancel split requests are sent to the meta server, and the meta server should check whether the table is splitting; you can refer to PR 679. Besides, the parent partition never sets its split_status to PAUSING or CANCELING: when it receives a pause or cancel request, it sets it to NOT_SPLIT. I'm not sure I have explained this clearly, so please comment if you have any questions.