Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Out of sync peer recovery #3650

Merged
merged 2 commits into from
Feb 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/herder/Herder.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ class Herder
// We are learning about a new envelope.
virtual EnvelopeStatus recvSCPEnvelope(SCPEnvelope const& envelope) = 0;

virtual bool isTracking() const = 0;

#ifdef BUILD_TESTS
// We are learning about a new fully-fetched envelope.
virtual EnvelopeStatus recvSCPEnvelope(SCPEnvelope const& envelope,
Expand Down
2 changes: 1 addition & 1 deletion src/herder/HerderImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class HerderImpl : public Herder
}

bool
isTracking() const
isTracking() const override
{
return mState == State::HERDER_TRACKING_NETWORK_STATE;
}
Expand Down
68 changes: 68 additions & 0 deletions src/overlay/OverlayManagerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ using namespace std;

constexpr std::chrono::seconds PEER_IP_RESOLVE_DELAY(600);
constexpr std::chrono::seconds PEER_IP_RESOLVE_RETRY_DELAY(10);
constexpr std::chrono::seconds OUT_OF_SYNC_RECONNECT_DELAY(60);

// Regardless of the number of failed attempts &
// FLOOD_DEMAND_BACKOFF_DELAY_MS it doesn't make much sense to wait much
// longer than 2 seconds between re-issuing demands.
Expand Down Expand Up @@ -517,6 +519,62 @@ OverlayManagerImpl::connectTo(std::vector<PeerBareAddress> const& peers,
return count;
}

void
OverlayManagerImpl::updateTimerAndMaybeDropRandomPeer(bool shouldDrop)
{
// If we haven't heard from the network for a while, try randomly
// disconnecting a peer in hopes of picking a better one. (preferred peers
// aren't affected as we always want to stay connected)
auto now = mApp.getClock().now();
if (!mApp.getHerder().isTracking())
{
if (mLastOutOfSyncReconnect)
{
// We've been out of sync, check if it's time to drop a peer
if (now - *mLastOutOfSyncReconnect > OUT_OF_SYNC_RECONNECT_DELAY &&
shouldDrop)
{
auto allPeers = getOutboundAuthenticatedPeers();
std::vector<std::pair<NodeID, Peer::pointer>> nonPreferredPeers;
std::copy_if(std::begin(allPeers), std::end(allPeers),
std::back_inserter(nonPreferredPeers),
[&](auto const& peer) {
return !mApp.getOverlayManager().isPreferred(
peer.second.get());
});
if (!nonPreferredPeers.empty())
{
auto peerToDrop = rand_element(nonPreferredPeers);
peerToDrop.second->sendErrorAndDrop(
ERR_LOAD, "random disconnect due to out of sync",
Peer::DropMode::IGNORE_WRITE_QUEUE);
}
// Reset the timer to throttle dropping peers
mLastOutOfSyncReconnect =
std::make_optional<VirtualClock::time_point>(now);
}
else
{
// Still waiting for the timeout or outbound capacity
return;
}
}
else
{
// Start a timer after going out of sync. Note that we still want to
// wait for OUT_OF_SYNC_RECONNECT_DELAY for Herder recovery logic to
// trigger.
mLastOutOfSyncReconnect =
std::make_optional<VirtualClock::time_point>(now);
}
}
else
{
// Reset timer when in-sync
mLastOutOfSyncReconnect.reset();
}
}

// called every PEER_AUTHENTICATION_TIMEOUT + 1=3 seconds
void
OverlayManagerImpl::tick()
Expand Down Expand Up @@ -593,6 +651,16 @@ OverlayManagerImpl::tick()
availablePendingSlots -= pendingUsedByPreferred;
}

// Only trigger reconnecting if:
// * no outbound slots are available
// * we didn't establish any new preferred peers connections (those
// will evict regular peers anyway)
bool shouldDrop =
availableAuthenticatedSlots == 0 && availablePendingSlots > 0;
updateTimerAndMaybeDropRandomPeer(shouldDrop);

availableAuthenticatedSlots = availableOutboundAuthenticatedSlots();

// Second, if there is capacity for pending and authenticated outbound
// connections, connect to more peers. Note: connect even if
// PREFERRED_PEER_ONLY is set, to support key-based preferred peers mode
Expand Down
2 changes: 2 additions & 0 deletions src/overlay/OverlayManagerImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,10 @@ class OverlayManagerImpl : public OverlayManager
RandomEvictionCache<uint64_t, bool> mMessageCache;

void tick();
void updateTimerAndMaybeDropRandomPeer(bool shouldDrop);
VirtualTimer mTimer;
VirtualTimer mPeerIPTimer;
std::optional<VirtualClock::time_point> mLastOutOfSyncReconnect;

friend class OverlayManagerTests;

Expand Down
95 changes: 95 additions & 0 deletions src/overlay/test/OverlayTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1814,6 +1814,101 @@ TEST_CASE("peer is purged from database after few failures",
REQUIRE(!peerManager.load(localhost(cfg2.PEER_PORT)).second);
}

TEST_CASE("disconnected topology recovery")
{
auto cfgs = std::vector<Config>{};
auto peers = std::vector<std::string>{};

for (int i = 0; i < 8; ++i)
{
auto cfg = getTestConfig(i + 1);
cfgs.push_back(cfg);
peers.push_back("127.0.0.1:" + std::to_string(cfg.PEER_PORT));
}

auto doTest = [&](bool usePreferred) {
auto simulation = Topologies::separate(
7, 0.5, Simulation::OVER_LOOPBACK,
sha256(getTestConfig().NETWORK_PASSPHRASE), [&](int i) {
auto cfg = cfgs[i];
cfg.TARGET_PEER_CONNECTIONS = 1;
if (usePreferred)
{
cfg.PREFERRED_PEERS = peers;
}
else
{
cfg.KNOWN_PEERS = peers;
}
cfg.RUN_STANDALONE = false;
return cfg;
});
auto nodeIDs = simulation->getNodeIDs();

// Disconnected graph 0-1-2-3 and 4-5-6
simulation->addConnection(nodeIDs[0], nodeIDs[1]);
simulation->addConnection(nodeIDs[1], nodeIDs[2]);
simulation->addConnection(nodeIDs[2], nodeIDs[3]);
simulation->addConnection(nodeIDs[3], nodeIDs[0]);

simulation->addConnection(nodeIDs[6], nodeIDs[4]);
simulation->addConnection(nodeIDs[4], nodeIDs[5]);
simulation->addConnection(nodeIDs[5], nodeIDs[6]);

simulation->startAllNodes();

// Make sure connections are authenticated
simulation->crankForAtLeast(std::chrono::seconds(1), false);
auto nodes = simulation->getNodes();
for (auto const& node : nodes)
{
REQUIRE(node->getOverlayManager().getAuthenticatedPeersCount() ==
2);
}

simulation->crankForAtLeast(
std::chrono::seconds(
Herder::CONSENSUS_STUCK_TIMEOUT_SECONDS.count() + 1),
false);

// Herder is not tracking (did not hear externalize from the network)
REQUIRE(!nodes[4]->getHerder().isTracking());
REQUIRE(!nodes[5]->getHerder().isTracking());
REQUIRE(!nodes[6]->getHerder().isTracking());

// LM is "synced" from the LCL perspective
REQUIRE(nodes[4]->getLedgerManager().isSynced());
REQUIRE(nodes[5]->getLedgerManager().isSynced());
REQUIRE(nodes[6]->getLedgerManager().isSynced());

// Crank long enough for overlay recovery to kick in
simulation->crankForAtLeast(std::chrono::minutes(2), false);

// If regular peers: Herder is now tracking due to reconnect
// If preferred: Herder is still out of sync since no reconnects
// happened
REQUIRE(nodes[4]->getHerder().isTracking() == !usePreferred);
REQUIRE(nodes[5]->getHerder().isTracking() == !usePreferred);
REQUIRE(nodes[6]->getHerder().isTracking() == !usePreferred);

// If regular peers: because we received a newer ledger, LM is now
// "catching up" If preferred peers: no new ledgers heard, still
// "synced"
REQUIRE(nodes[4]->getLedgerManager().isSynced() == usePreferred);
REQUIRE(nodes[5]->getLedgerManager().isSynced() == usePreferred);
REQUIRE(nodes[6]->getLedgerManager().isSynced() == usePreferred);
};

SECTION("regular peers")
{
doTest(false);
}
SECTION("preferred peers")
{
doTest(true);
}
}

TEST_CASE("generalized tx sets are not sent to non-upgraded peers",
"[txset][overlay]")
{
Expand Down