-
Notifications
You must be signed in to change notification settings - Fork 171
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ensure interrupted FLX bootstraps get restarted after re-connecting #5466
Changes from all commits
1e972c6
d72deff
d23e73f
bc27e69
8d9b398
7c38de9
699d7b1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1010,7 +1010,7 @@ void Connection::read_or_write_error(std::error_code ec) | |
{ | ||
m_reconnect_info.m_reason = ConnectionTerminationReason::read_or_write_error; | ||
bool is_fatal = false; | ||
close_due_to_client_side_error(ec, is_fatal); // Throws | ||
close_due_to_client_side_error(ec, is_fatal); // Throws | ||
} | ||
|
||
|
||
|
@@ -1111,7 +1111,7 @@ void Connection::disconnect(const SessionErrorInfo& info) | |
m_sending = false; | ||
|
||
report_connection_state_change(ConnectionState::disconnected, info); // Throws | ||
initiate_reconnect_wait(); // Throws | ||
initiate_reconnect_wait(); // Throws | ||
} | ||
|
||
bool Connection::is_flx_sync_connection() const noexcept | ||
|
@@ -1637,8 +1637,8 @@ void Session::send_bind_message() | |
// Discard the token since it's ignored by the server. | ||
std::string empty_access_token{}; | ||
protocol.make_bind_message(protocol_version, out, session_ident, path, empty_access_token, need_client_file_ident, | ||
is_subserver); // Throws | ||
m_conn.initiate_write_message(out, this); // Throws | ||
is_subserver); // Throws | ||
m_conn.initiate_write_message(out, this); // Throws | ||
|
||
m_bind_message_sent = true; | ||
|
||
|
@@ -1683,7 +1683,7 @@ void Session::send_ident_message() | |
m_progress.latest_server_version.salt); // Throws | ||
protocol.make_pbs_ident_message(out, session_ident, m_client_file_ident, m_progress); // Throws | ||
} | ||
m_conn.initiate_write_message(out, this); // Throws | ||
m_conn.initiate_write_message(out, this); // Throws | ||
|
||
m_ident_message_sent = true; | ||
|
||
|
@@ -2063,6 +2063,8 @@ void Session::receive_download_message(const SyncProgress& progress, std::uint_f | |
} | ||
} | ||
|
||
receive_download_message_hook(); | ||
|
||
if (batch_state == DownloadBatchState::LastInBatch) { | ||
update_progress(progress); // Throws | ||
} | ||
|
@@ -2146,31 +2148,30 @@ std::error_code Session::receive_query_error_message(int error_code, std::string | |
|
||
// The caller (Connection) must discard the session if the session has become | ||
// deactivated upon return. | ||
std::error_code Session::receive_error_message(int error_code, const ProtocolErrorInfo& info) | ||
std::error_code Session::receive_error_message(int error_code_int, const ProtocolErrorInfo& info) | ||
{ | ||
logger.info("Received: ERROR \"%1\" (error_code=%2, try_again=%3, recovery_disabled=%4)", info.message, | ||
error_code, info.try_again, info.client_reset_recovery_is_disabled); // Throws | ||
error_code_int, info.try_again, info.client_reset_recovery_is_disabled); // Throws | ||
|
||
bool legal_at_this_time = (m_bind_message_sent && !m_error_message_received && !m_unbound_message_received); | ||
if (REALM_UNLIKELY(!legal_at_this_time)) { | ||
logger.error("Illegal message at this time"); | ||
return ClientError::bad_message_order; | ||
} | ||
|
||
bool known_error_code = bool(get_protocol_error_message(error_code)); | ||
bool known_error_code = bool(get_protocol_error_message(error_code_int)); | ||
if (REALM_UNLIKELY(!known_error_code)) { | ||
logger.error("Unknown error code"); // Throws | ||
return ClientError::bad_error_code; | ||
} | ||
ProtocolError error_code_2 = ProtocolError(error_code); | ||
if (REALM_UNLIKELY(!is_session_level_error(error_code_2))) { | ||
ProtocolError error_code = ProtocolError(error_code_int); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really don't like when we have variables that end in |
||
if (REALM_UNLIKELY(!is_session_level_error(error_code))) { | ||
logger.error("Not a session level error code"); // Throws | ||
return ClientError::bad_error_code; | ||
} | ||
|
||
REALM_ASSERT(!m_suspended); | ||
REALM_ASSERT(m_state == Active || m_state == Deactivating); | ||
|
||
logger.debug("Suspended"); // Throws | ||
|
||
m_error_message_received = true; | ||
|
@@ -2194,10 +2195,8 @@ std::error_code Session::receive_error_message(int error_code, const ProtocolErr | |
// Notify the application of the suspension of the session if the session is | ||
// still in the Active state | ||
if (m_state == Active) { | ||
m_conn.one_less_active_unsuspended_session(); // Throws | ||
std::error_code ec = make_error_code(error_code_2); | ||
SessionErrorInfo error_info(info, ec); | ||
on_suspended(error_info); // Throws | ||
m_conn.one_less_active_unsuspended_session(); // Throws | ||
on_suspended({info, make_error_code(error_code)}); // Throws | ||
} | ||
|
||
if (info.try_again) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -678,7 +678,11 @@ SubscriptionStore::get_next_pending_version(int64_t last_query_version, DB::vers | |
descriptor_ordering.append_sort(SortDescriptor{{{sub_sets->get_primary_key_column()}}, {true}}); | ||
auto res = sub_sets->where() | ||
.greater(sub_sets->get_primary_key_column(), last_query_version) | ||
.group() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the crux of the whole bug. When you're first firing up the sync client, the last_query_version will be zero which should include all the non-complete/non-error subscription sets, but didn't. This meant the sync client would never follow up the IDENT with a QUERY message and never make progress. |
||
.equal(m_sub_set_state, static_cast<int64_t>(SubscriptionSet::State::Pending)) | ||
.Or() | ||
.equal(m_sub_set_state, static_cast<int64_t>(SubscriptionSet::State::Bootstrapping)) | ||
.end_group() | ||
.greater_equal(m_sub_set_snapshot_version, static_cast<int64_t>(after_client_version)) | ||
.find_all(descriptor_ordering); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,9 @@ | |
#include "flx_sync_harness.hpp" | ||
#include "util/test_file.hpp" | ||
#include "realm/object-store/impl/object_accessor_impl.hpp" | ||
#include "realm/sync/config.hpp" | ||
#include "realm/sync/protocol.hpp" | ||
#include "realm/sync/noinst/client_history_impl.hpp" | ||
#include <realm/sync/noinst/server/access_token.hpp> | ||
|
||
namespace realm::app { | ||
|
@@ -35,7 +37,16 @@ const Schema g_minimal_schema{ | |
{"_id", PropertyType::ObjectId, Property::IsPrimary{true}}, | ||
}}, | ||
}; | ||
} | ||
|
||
const Schema g_large_array_schema{ | ||
ObjectSchema("TopLevel", | ||
{ | ||
{"_id", PropertyType::ObjectId, Property::IsPrimary{true}}, | ||
{"queryable_int_field", PropertyType::Int | PropertyType::Nullable}, | ||
{"list_of_strings", PropertyType::Array | PropertyType::String}, | ||
}), | ||
}; | ||
} // namespace | ||
|
||
TEST_CASE("flx: connect to FLX-enabled app", "[sync][flx][app]") { | ||
FLXSyncTestHarness harness("basic_flx_connect"); | ||
|
@@ -255,6 +266,97 @@ TEST_CASE("flx: query on non-queryable field results in query error message", "[ | |
}); | ||
} | ||
|
||
TEST_CASE("flx: interrupted bootstrap restarts/recovers on reconnect", "[sync][flx][app]") { | ||
FLXSyncTestHarness harness("flx_bootstrap_batching", {g_large_array_schema, {"queryable_int_field"}}); | ||
|
||
// First we need to seed the server with objects that are large and complex enough that they get broken | ||
// into multiple download messages. | ||
// | ||
// The server will break up changesets and download messages when they contain more than 1000 instructions | ||
// and are bigger than 1MB respectively. | ||
// | ||
// So this generates 5 objects each with 1000+ instructions that are each 1MB+ big. This should result in | ||
// 3 download messages total with one changeset each for the bootstrap download messages. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess it's 5 download messages There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So there are two download messages for subscription set zero (the schema instructions) and then 5 total for the entire sync session. The trace output for after the session resumes is below:
|
||
std::vector<ObjectId> obj_ids_at_end; | ||
harness.load_initial_data([&](SharedRealm realm) { | ||
CppContext c(realm); | ||
for (int i = 0; i < 5; ++i) { | ||
auto id = ObjectId::gen(); | ||
auto obj = Object::create(c, realm, "TopLevel", | ||
util::Any(AnyDict{{"_id", id}, | ||
{"list_of_strings", AnyVector{}}, | ||
{"queryable_int_field", static_cast<int64_t>(i * 5)}})); | ||
List str_list(obj, realm->schema().find("TopLevel")->property_for_name("list_of_strings")); | ||
for (int j = 0; j < 1024; ++j) { | ||
str_list.add(c, util::Any(std::string(1024, 'a' + (j % 26)))); | ||
} | ||
|
||
obj_ids_at_end.push_back(id); | ||
} | ||
}); | ||
SyncTestFile interrupted_realm_config(harness.app()->current_user(), harness.schema(), | ||
SyncConfig::FLXSyncEnabled{}); | ||
interrupted_realm_config.cache = false; | ||
|
||
{ | ||
SharedRealm realm; | ||
auto [interrupted_promise, interrupted] = util::make_promise_future<void>(); | ||
Realm::Config config = interrupted_realm_config; | ||
config.sync_config->on_download_message_received_hook = | ||
[download_msg_counter = int(0), | ||
promise = std::make_shared<util::Promise<void>>(std::move(interrupted_promise))]( | ||
std::weak_ptr<SyncSession> weak_session) mutable { | ||
auto session = weak_session.lock(); | ||
// We interrupt on the 5th download message, which should be 2/3rd of the way through the | ||
// bootstrap. The first two download messages are for exchanging schema instructions and then | ||
// two messages of actual data. | ||
if (!session || ++download_msg_counter != 5) { | ||
return; | ||
} | ||
|
||
session->close(); | ||
promise->emplace_value(); | ||
}; | ||
|
||
realm = Realm::get_shared_realm(config); | ||
{ | ||
auto mut_subs = realm->get_latest_subscription_set().make_mutable_copy(); | ||
auto table = realm->read_group().get_table("class_TopLevel"); | ||
mut_subs.insert_or_assign(Query(table)); | ||
std::move(mut_subs).commit(); | ||
} | ||
|
||
interrupted.get(); | ||
} | ||
|
||
{ | ||
auto realm = DB::create(sync::make_client_replication(), interrupted_realm_config.path); | ||
auto sub_store = sync::SubscriptionStore::create(realm, [](int64_t) {}); | ||
REQUIRE(sub_store->get_active_and_latest_versions() == std::pair<int64_t, int64_t>{0, 1}); | ||
auto latest_subs = sub_store->get_latest(); | ||
REQUIRE(latest_subs.state() == sync::SubscriptionSet::State::Bootstrapping); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This section is checking whether we've actually set this test up correctly and that there is a SubscriptionSet that has started bootstrapping but hasn't completed yet. |
||
REQUIRE(latest_subs.size() == 1); | ||
REQUIRE(latest_subs.at(0).object_class_name() == "TopLevel"); | ||
} | ||
|
||
auto realm = Realm::get_shared_realm(interrupted_realm_config); | ||
auto table = realm->read_group().get_table("class_TopLevel"); | ||
realm->get_latest_subscription_set().get_state_change_notification(sync::SubscriptionSet::State::Complete).get(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. without the changes in subscriptions.cpp, this test would hang forever here. |
||
wait_for_upload(*realm); | ||
wait_for_download(*realm); | ||
|
||
realm->refresh(); | ||
REQUIRE(table->size() == obj_ids_at_end.size()); | ||
for (auto& id : obj_ids_at_end) { | ||
REQUIRE(table->find_primary_key(Mixed{id})); | ||
} | ||
|
||
auto active_subs = realm->get_active_subscription_set(); | ||
auto latest_subs = realm->get_latest_subscription_set(); | ||
REQUIRE(active_subs.version() == latest_subs.version()); | ||
REQUIRE(active_subs.version() == int64_t(1)); | ||
} | ||
|
||
TEST_CASE("flx: dev mode uploads schema before query change", "[sync][flx][app]") { | ||
FLXSyncTestHarness::ServerSchema server_schema; | ||
auto default_schema = FLXSyncTestHarness::default_server_schema(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -400,7 +400,7 @@ TEST(Sync_SubscriptionStoreNotifications) | |
CHECK_EQUAL(std::move(fut).get(), SubscriptionSet::State::Complete); | ||
} | ||
|
||
TEST(Sync_RefreshSubscriptionSetInvalidSubscriptionStore) | ||
TEST(Sync_SubscriptionStoreRefreshSubscriptionSetInvalid) | ||
{ | ||
SHARED_GROUP_TEST_PATH(sub_store_path) | ||
SubscriptionStoreFixture fixture(sub_store_path); | ||
|
@@ -451,4 +451,42 @@ TEST(Sync_SubscriptionStoreInternalSchemaMigration) | |
CHECK(!versions.get_version_for(tr, "non_existent_table")); | ||
} | ||
|
||
TEST(Sync_SubscriptionStoreNextPendingVersion) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is just a unit test for the changes in subscriptions.cpp. |
||
{ | ||
SHARED_GROUP_TEST_PATH(sub_store_path) | ||
SubscriptionStoreFixture fixture(sub_store_path); | ||
auto store = SubscriptionStore::create(fixture.db, [](int64_t) {}); | ||
|
||
auto mut_sub_set = store->get_latest().make_mutable_copy(); | ||
auto sub_set = std::move(mut_sub_set).commit(); | ||
auto complete_set = sub_set.version(); | ||
|
||
mut_sub_set = sub_set.make_mutable_copy(); | ||
sub_set = std::move(mut_sub_set).commit(); | ||
auto bootstrapping_set = sub_set.version(); | ||
|
||
mut_sub_set = sub_set.make_mutable_copy(); | ||
sub_set = std::move(mut_sub_set).commit(); | ||
auto pending_set = sub_set.version(); | ||
|
||
mut_sub_set = store->get_mutable_by_version(complete_set); | ||
mut_sub_set.update_state(SubscriptionSet::State::Complete); | ||
std::move(mut_sub_set).commit(); | ||
|
||
mut_sub_set = store->get_mutable_by_version(bootstrapping_set); | ||
mut_sub_set.update_state(SubscriptionSet::State::Bootstrapping); | ||
std::move(mut_sub_set).commit(); | ||
|
||
auto pending_version = store->get_next_pending_version(0, DB::version_type{}); | ||
CHECK(pending_version); | ||
CHECK_EQUAL(pending_version->query_version, bootstrapping_set); | ||
|
||
pending_version = store->get_next_pending_version(bootstrapping_set, DB::version_type{}); | ||
CHECK(pending_set); | ||
CHECK_EQUAL(pending_version->query_version, pending_set); | ||
|
||
pending_version = store->get_next_pending_version(pending_set, DB::version_type{}); | ||
CHECK(!pending_version); | ||
} | ||
|
||
} // namespace realm::sync |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do you really need the SyncSession here? It seems to be used only in tests. Can you use
realm->sync_session()
instead?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So there's a chicken/egg problm here where you need to define this hook before you have a valid realm, so you have to capture the realm by reference, but the hook may get called after the realm is destroyed in some cases. This caused a problem with ASAN. So I had this pass the sync session as a weak_ptr here as the easiest memory-correct way to access the sync session.