Skip to content

Commit

Permalink
Merge pull request #3650 from marta-lokhova/out_of_sync_peer_recovery
Browse files Browse the repository at this point in the history
Out of sync peer recovery

Reviewed-by: MonsieurNicolas
  • Loading branch information
latobarita authored Feb 16, 2023
2 parents 871acce + 174cddc commit e6564bb
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 1 deletion.
2 changes: 2 additions & 0 deletions src/herder/Herder.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ class Herder
// We are learning about a new envelope.
virtual EnvelopeStatus recvSCPEnvelope(SCPEnvelope const& envelope) = 0;

virtual bool isTracking() const = 0;

#ifdef BUILD_TESTS
// We are learning about a new fully-fetched envelope.
virtual EnvelopeStatus recvSCPEnvelope(SCPEnvelope const& envelope,
Expand Down
2 changes: 1 addition & 1 deletion src/herder/HerderImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class HerderImpl : public Herder
}

bool
isTracking() const
isTracking() const override
{
return mState == State::HERDER_TRACKING_NETWORK_STATE;
}
Expand Down
68 changes: 68 additions & 0 deletions src/overlay/OverlayManagerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ using namespace std;

constexpr std::chrono::seconds PEER_IP_RESOLVE_DELAY(600);
constexpr std::chrono::seconds PEER_IP_RESOLVE_RETRY_DELAY(10);
constexpr std::chrono::seconds OUT_OF_SYNC_RECONNECT_DELAY(60);

// Regardless of the number of failed attempts &
// FLOOD_DEMAND_BACKOFF_DELAY_MS it doesn't make much sense to wait much
// longer than 2 seconds between re-issuing demands.
Expand Down Expand Up @@ -517,6 +519,62 @@ OverlayManagerImpl::connectTo(std::vector<PeerBareAddress> const& peers,
return count;
}

void
OverlayManagerImpl::updateTimerAndMaybeDropRandomPeer(bool shouldDrop)
{
// If we haven't heard from the network for a while, try randomly
// disconnecting a peer in hopes of picking a better one. (preferred peers
// aren't affected as we always want to stay connected)
auto now = mApp.getClock().now();
if (!mApp.getHerder().isTracking())
{
if (mLastOutOfSyncReconnect)
{
// We've been out of sync, check if it's time to drop a peer
if (now - *mLastOutOfSyncReconnect > OUT_OF_SYNC_RECONNECT_DELAY &&
shouldDrop)
{
auto allPeers = getOutboundAuthenticatedPeers();
std::vector<std::pair<NodeID, Peer::pointer>> nonPreferredPeers;
std::copy_if(std::begin(allPeers), std::end(allPeers),
std::back_inserter(nonPreferredPeers),
[&](auto const& peer) {
return !mApp.getOverlayManager().isPreferred(
peer.second.get());
});
if (!nonPreferredPeers.empty())
{
auto peerToDrop = rand_element(nonPreferredPeers);
peerToDrop.second->sendErrorAndDrop(
ERR_LOAD, "random disconnect due to out of sync",
Peer::DropMode::IGNORE_WRITE_QUEUE);
}
// Reset the timer to throttle dropping peers
mLastOutOfSyncReconnect =
std::make_optional<VirtualClock::time_point>(now);
}
else
{
// Still waiting for the timeout or outbound capacity
return;
}
}
else
{
// Start a timer after going out of sync. Note that we still want to
// wait for OUT_OF_SYNC_RECONNECT_DELAY for Herder recovery logic to
// trigger.
mLastOutOfSyncReconnect =
std::make_optional<VirtualClock::time_point>(now);
}
}
else
{
// Reset timer when in-sync
mLastOutOfSyncReconnect.reset();
}
}

// called every PEER_AUTHENTICATION_TIMEOUT + 1=3 seconds
void
OverlayManagerImpl::tick()
Expand Down Expand Up @@ -593,6 +651,16 @@ OverlayManagerImpl::tick()
availablePendingSlots -= pendingUsedByPreferred;
}

// Only trigger reconnecting if:
// * no outbound slots are available
// * we didn't establish any new preferred peers connections (those
// will evict regular peers anyway)
bool shouldDrop =
availableAuthenticatedSlots == 0 && availablePendingSlots > 0;
updateTimerAndMaybeDropRandomPeer(shouldDrop);

availableAuthenticatedSlots = availableOutboundAuthenticatedSlots();

// Second, if there is capacity for pending and authenticated outbound
// connections, connect to more peers. Note: connect even if
// PREFERRED_PEER_ONLY is set, to support key-based preferred peers mode
Expand Down
2 changes: 2 additions & 0 deletions src/overlay/OverlayManagerImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,10 @@ class OverlayManagerImpl : public OverlayManager
RandomEvictionCache<uint64_t, bool> mMessageCache;

void tick();
void updateTimerAndMaybeDropRandomPeer(bool shouldDrop);
VirtualTimer mTimer;
VirtualTimer mPeerIPTimer;
std::optional<VirtualClock::time_point> mLastOutOfSyncReconnect;

friend class OverlayManagerTests;

Expand Down
95 changes: 95 additions & 0 deletions src/overlay/test/OverlayTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1814,6 +1814,101 @@ TEST_CASE("peer is purged from database after few failures",
REQUIRE(!peerManager.load(localhost(cfg2.PEER_PORT)).second);
}

TEST_CASE("disconnected topology recovery")
{
auto cfgs = std::vector<Config>{};
auto peers = std::vector<std::string>{};

for (int i = 0; i < 8; ++i)
{
auto cfg = getTestConfig(i + 1);
cfgs.push_back(cfg);
peers.push_back("127.0.0.1:" + std::to_string(cfg.PEER_PORT));
}

auto doTest = [&](bool usePreferred) {
auto simulation = Topologies::separate(
7, 0.5, Simulation::OVER_LOOPBACK,
sha256(getTestConfig().NETWORK_PASSPHRASE), [&](int i) {
auto cfg = cfgs[i];
cfg.TARGET_PEER_CONNECTIONS = 1;
if (usePreferred)
{
cfg.PREFERRED_PEERS = peers;
}
else
{
cfg.KNOWN_PEERS = peers;
}
cfg.RUN_STANDALONE = false;
return cfg;
});
auto nodeIDs = simulation->getNodeIDs();

// Disconnected graph 0-1-2-3 and 4-5-6
simulation->addConnection(nodeIDs[0], nodeIDs[1]);
simulation->addConnection(nodeIDs[1], nodeIDs[2]);
simulation->addConnection(nodeIDs[2], nodeIDs[3]);
simulation->addConnection(nodeIDs[3], nodeIDs[0]);

simulation->addConnection(nodeIDs[6], nodeIDs[4]);
simulation->addConnection(nodeIDs[4], nodeIDs[5]);
simulation->addConnection(nodeIDs[5], nodeIDs[6]);

simulation->startAllNodes();

// Make sure connections are authenticated
simulation->crankForAtLeast(std::chrono::seconds(1), false);
auto nodes = simulation->getNodes();
for (auto const& node : nodes)
{
REQUIRE(node->getOverlayManager().getAuthenticatedPeersCount() ==
2);
}

simulation->crankForAtLeast(
std::chrono::seconds(
Herder::CONSENSUS_STUCK_TIMEOUT_SECONDS.count() + 1),
false);

// Herder is not tracking (did not hear externalize from the network)
REQUIRE(!nodes[4]->getHerder().isTracking());
REQUIRE(!nodes[5]->getHerder().isTracking());
REQUIRE(!nodes[6]->getHerder().isTracking());

// LM is "synced" from the LCL perspective
REQUIRE(nodes[4]->getLedgerManager().isSynced());
REQUIRE(nodes[5]->getLedgerManager().isSynced());
REQUIRE(nodes[6]->getLedgerManager().isSynced());

// Crank long enough for overlay recovery to kick in
simulation->crankForAtLeast(std::chrono::minutes(2), false);

// If regular peers: Herder is now tracking due to reconnect
// If preferred: Herder is still out of sync since no reconnects
// happened
REQUIRE(nodes[4]->getHerder().isTracking() == !usePreferred);
REQUIRE(nodes[5]->getHerder().isTracking() == !usePreferred);
REQUIRE(nodes[6]->getHerder().isTracking() == !usePreferred);

// If regular peers: because we received a newer ledger, LM is now
// "catching up" If preferred peers: no new ledgers heard, still
// "synced"
REQUIRE(nodes[4]->getLedgerManager().isSynced() == usePreferred);
REQUIRE(nodes[5]->getLedgerManager().isSynced() == usePreferred);
REQUIRE(nodes[6]->getLedgerManager().isSynced() == usePreferred);
};

SECTION("regular peers")
{
doTest(false);
}
SECTION("preferred peers")
{
doTest(true);
}
}

TEST_CASE("generalized tx sets are not sent to non-upgraded peers",
"[txset][overlay]")
{
Expand Down

0 comments on commit e6564bb

Please sign in to comment.