Skip to content

Commit

Permalink
Track SCP latencies in milliseconds for overlay survey
Browse files Browse the repository at this point in the history
Fixes #4333 by recording SCP latencies in milliseconds rather than
nanoseconds, which could overflow when placed in a `uint32_t`.

I also checked the other survey data fields for potential overflow and
didn't find anything else that could overflow. However, in doing so I
noticed that I failed to bump
`TIME_SLICED_SURVEY_MIN_OVERLAY_PROTOCOL_VERSION` to `34` when I bumped
the overlay protocol version, so I fixed that as well.
  • Loading branch information
bboston7 committed May 31, 2024
1 parent d25ad44 commit 24f4745
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 17 deletions.
8 changes: 4 additions & 4 deletions src/herder/HerderSCPDriver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1038,8 +1038,8 @@ HerderSCPDriver::recordSCPExternalizeEvent(uint64_t slotIndex, NodeID const& id,
std::chrono::nanoseconds::zero(), slotIndex);
mApp.getOverlayManager().getSurveyManager().modifyNodeData(
[&](CollectingNodeData& nd) {
nd.mSCPFirstToSelfLatencyNsHistogram.Update(
std::chrono::duration_cast<std::chrono::nanoseconds>(
nd.mSCPFirstToSelfLatencyMsHistogram.Update(
std::chrono::duration_cast<std::chrono::milliseconds>(
now - *timing.mFirstExternalize)
.count());
});
Expand All @@ -1063,8 +1063,8 @@ HerderSCPDriver::recordSCPExternalizeEvent(uint64_t slotIndex, NodeID const& id,
std::chrono::nanoseconds::zero(), slotIndex);
mApp.getOverlayManager().getSurveyManager().modifyNodeData(
[&](CollectingNodeData& nd) {
nd.mSCPSelfToOtherLatencyNsHistogram.Update(
std::chrono::duration_cast<std::chrono::nanoseconds>(
nd.mSCPSelfToOtherLatencyMsHistogram.Update(
std::chrono::duration_cast<std::chrono::milliseconds>(
now - *timing.mFirstExternalize)
.count());
});
Expand Down
19 changes: 10 additions & 9 deletions src/overlay/SurveyDataManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "crypto/SecretKey.h"
#include "overlay/Peer.h"
#include "util/Logging.h"
#include "util/numeric.h"

#include <Tracy.hpp>
#include <chrono>
Expand Down Expand Up @@ -67,9 +68,9 @@ initializeCollectingPeerData(

CollectingNodeData::CollectingNodeData(uint64_t initialLostSyncCount,
Application::State initialState)
: mSCPFirstToSelfLatencyNsHistogram(
: mSCPFirstToSelfLatencyMsHistogram(
medida::SamplingInterface::SampleType::kSliding)
, mSCPSelfToOtherLatencyNsHistogram(
, mSCPSelfToOtherLatencyMsHistogram(
medida::SamplingInterface::SampleType::kSliding)
, mInitialLostSyncCount(initialLostSyncCount)
, mInitialState(initialState)
Expand Down Expand Up @@ -417,12 +418,12 @@ SurveyDataManager::finalizeNodeData(Config const& config)
static_cast<uint32_t>(mFinalInboundPeerData.size());
mFinalNodeData->totalOutboundPeerCount =
static_cast<uint32_t>(mFinalOutboundPeerData.size());
mFinalNodeData->p75SCPFirstToSelfLatencyMs =
mCollectingNodeData->mSCPFirstToSelfLatencyNsHistogram.GetSnapshot()
.get75thPercentile();
mFinalNodeData->p75SCPSelfToOtherLatencyMs =
mCollectingNodeData->mSCPSelfToOtherLatencyNsHistogram.GetSnapshot()
.get75thPercentile();
mFinalNodeData->p75SCPFirstToSelfLatencyMs = doubleToClampedUint32(
mCollectingNodeData->mSCPFirstToSelfLatencyMsHistogram.GetSnapshot()
.get75thPercentile());
mFinalNodeData->p75SCPSelfToOtherLatencyMs = doubleToClampedUint32(
mCollectingNodeData->mSCPSelfToOtherLatencyMsHistogram.GetSnapshot()
.get75thPercentile());
mFinalNodeData->lostSyncCount = static_cast<uint32_t>(
mLostSyncMeter.count() - mCollectingNodeData->mInitialLostSyncCount);
switch (mCollectingNodeData->mInitialState)
Expand Down Expand Up @@ -499,7 +500,7 @@ SurveyDataManager::finalizePeerData(
finalStats.duplicateFetchMessageRecv =
peerMetrics.mDuplicateFetchMessageRecv -
collectingData.mInitialDuplicateFetchMessageRecv;
finalData.averageLatencyMs = static_cast<uint32_t>(
finalData.averageLatencyMs = doubleToClampedUint32(
collectingData.mLatencyMsHistogram.GetSnapshot().getMedian());
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/overlay/SurveyDataManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ struct CollectingNodeData
uint32_t mAddedAuthenticatedPeers = 0;
uint32_t mDroppedAuthenticatedPeers = 0;

// SCP stats (in nanoseconds)
medida::Histogram mSCPFirstToSelfLatencyNsHistogram;
medida::Histogram mSCPSelfToOtherLatencyNsHistogram;
// SCP stats (in milliseconds)
medida::Histogram mSCPFirstToSelfLatencyMsHistogram;
medida::Histogram mSCPSelfToOtherLatencyMsHistogram;

// To compute how many times the node lost sync in the time slice
uint64_t const mInitialLostSyncCount;
Expand Down
2 changes: 1 addition & 1 deletion src/overlay/SurveyManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ namespace stellar

uint32_t const SurveyManager::SURVEY_THROTTLE_TIMEOUT_MULT(3);

uint32_t constexpr TIME_SLICED_SURVEY_MIN_OVERLAY_PROTOCOL_VERSION = 33;
uint32_t constexpr TIME_SLICED_SURVEY_MIN_OVERLAY_PROTOCOL_VERSION = 34;

namespace
{
Expand Down
21 changes: 21 additions & 0 deletions src/util/numeric.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "util/GlobalChecks.h"
#include "util/numeric128.h"

#include <algorithm>
#include <stdexcept>

namespace stellar
Expand Down Expand Up @@ -329,4 +330,24 @@ hugeDivide(int64_t& result, int32_t a, uint128_t const& B, uint128_t const& C,
}
return false;
}

uint32_t
doubleToClampedUint32(double d)
{
// IEEE-754 doubles have 52-bit fractions, and so can perfectly represent
// uint32_t values. Therefore, it should be safe to convert to the full
// range of uint32_t values. However, the C++ standard does not mandate
// that doubles adhere to the IEEE-754 specification. That being said, it
// would be shocking if we built for any platform that wasn't using IEEE-754
// doubles. Regardless, it's an easy compile-time check to be sure.
static_assert(std::numeric_limits<double>::is_iec559,
"double is not IEEE-754 compliant");

constexpr uint32_t maxUint32 = std::numeric_limits<uint32_t>::max();
if (std::isnan(d))
{
return maxUint32;
}
return static_cast<uint32_t>(std::clamp<double>(d, 0, maxUint32));
}
}
4 changes: 4 additions & 0 deletions src/util/numeric.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ isRepresentableAsInt64(double d)
(d < static_cast<double>(std::numeric_limits<int64_t>::max()));
}

// Convert a double to a uint32_t, clamping to the range of uint32_t. Converts
// NaN to the maximum uint32_t.
uint32_t doubleToClampedUint32(double d);

// calculates A*B/C when A*B overflows 64bits
int64_t bigDivideOrThrow(int64_t A, int64_t B, int64_t C, Rounding rounding);
// no throw version, returns true if result is valid
Expand Down

5 comments on commit 24f4745

@latobarita
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

saw approval from graydon
at bboston7@24f4745

@latobarita
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

merging bboston7/stellar-core/scp-stats-ms = 24f4745 into auto

@latobarita
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bboston7/stellar-core/scp-stats-ms = 24f4745 merged ok, testing candidate = 0f189aa

@latobarita
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@latobarita
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fast-forwarding master to auto = 0f189aa

Please sign in to comment.