Skip to content

Commit

Permalink
When retrying CASE during commissioning, extend the fail-safe. (#25595)
Browse files Browse the repository at this point in the history
If we don't do this, we can get into a situation where our retries take longer
than the fail-safe timer, so we are still retrying but the other side is not even
listening anymore.

The changes here are as follows:

1) Expose various bits on CASEClient and CASESession to allow us to compute how
   long it will take before we know whether the next CASE retry has succeeded or
   failed.
2) Add a way for OperationalSessionSetup to notify its consumers that it's going
   to retry, and how long it expects it to take before it knows whether the
   retry has succeeded.
3) Change DeviceCommissioner to extend the fail-safe when it's told that
   OperationalSessionSetup will retry.
  • Loading branch information
bzbarsky-apple authored and pull[bot] committed Oct 19, 2023
1 parent 6d4b4b7 commit 77a833e
Show file tree
Hide file tree
Showing 10 changed files with 175 additions and 15 deletions.
5 changes: 5 additions & 0 deletions src/app/CASEClient.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ void CASEClient::SetRemoteMRPIntervals(const ReliableMessageProtocolConfig & rem
mCASESession.SetRemoteMRPConfig(remoteMRPConfig);
}

// Accessor counterpart to SetRemoteMRPIntervals(): hands back the remote MRP
// configuration as currently reported by the underlying CASE session.
const ReliableMessageProtocolConfig & CASEClient::GetRemoteMRPIntervals()
{
    const ReliableMessageProtocolConfig & peerConfig = mCASESession.GetRemoteMRPConfig();
    return peerConfig;
}

CHIP_ERROR CASEClient::EstablishSession(const CASEClientInitParams & params, const ScopedNodeId & peer,
const Transport::PeerAddress & peerAddress,
const ReliableMessageProtocolConfig & remoteMRPConfig,
Expand Down
2 changes: 2 additions & 0 deletions src/app/CASEClient.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ class DLL_EXPORT CASEClient
public:
void SetRemoteMRPIntervals(const ReliableMessageProtocolConfig & remoteMRPConfig);

const ReliableMessageProtocolConfig & GetRemoteMRPIntervals();

CHIP_ERROR EstablishSession(const CASEClientInitParams & params, const ScopedNodeId & peer,
const Transport::PeerAddress & peerAddress, const ReliableMessageProtocolConfig & remoteMRPConfig,
SessionEstablishmentDelegate * delegate);
Expand Down
6 changes: 5 additions & 1 deletion src/app/CASESessionManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ void CASESessionManager::FindOrEstablishSession(const ScopedNodeId & peerId, Cal
Callback::Callback<OnDeviceConnectionFailure> * onFailure
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
,
uint8_t attemptCount
uint8_t attemptCount, Callback::Callback<OnDeviceConnectionRetry> * onRetry
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
)
{
Expand All @@ -60,6 +60,10 @@ void CASESessionManager::FindOrEstablishSession(const ScopedNodeId & peerId, Cal

#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
session->UpdateAttemptCount(attemptCount);
if (onRetry)
{
session->AddRetryHandler(onRetry);
}
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES

session->Connect(onConnection, onFailure);
Expand Down
2 changes: 1 addition & 1 deletion src/app/CASESessionManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ class CASESessionManager : public OperationalSessionReleaseDelegate, public Sess
Callback::Callback<OnDeviceConnectionFailure> * onFailure
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
,
uint8_t attemptCount = 1
uint8_t attemptCount = 1, Callback::Callback<OnDeviceConnectionRetry> * onRetry = nullptr
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
);

Expand Down
55 changes: 48 additions & 7 deletions src/app/OperationalSessionSetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,15 @@ void OperationalSessionSetup::DequeueConnectionCallbacks(CHIP_ERROR error)
mConnectionFailure.DequeueAll(failureReady);
mConnectionSuccess.DequeueAll(successReady);

#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
// Clear out mConnectionRetry, so that those cancelables are not holding
// pointers to us, since we're about to go away.
while (auto * cb = mConnectionRetry.First())
{
cb->Cancel();
}
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES

//
// If we encountered no error, go ahead and call all success callbacks. Otherwise,
// call the failure callbacks.
Expand Down Expand Up @@ -304,23 +313,35 @@ void OperationalSessionSetup::DequeueConnectionCallbacks(CHIP_ERROR error)

void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error)
{
VerifyOrReturn(mState != State::Uninitialized && mState != State::NeedsAddress,
ChipLogError(Discovery, "HandleCASEConnectionFailure was called while the device was not initialized"));
VerifyOrReturn(mState == State::Connecting,
ChipLogError(Discovery, "OnSessionEstablishmentError was called while we were not connecting"));

if (CHIP_ERROR_TIMEOUT == error)
{
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
// Make a copy of the ReliableMessageProtocolConfig, since our
// mCaseClient is about to go away.
ReliableMessageProtocolConfig remoteMprConfig = mCASEClient->GetRemoteMRPIntervals();
#endif

if (CHIP_NO_ERROR == Resolver::Instance().TryNextResult(mAddressLookupHandle))
{
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
// Our retry is going to be immediate, once the event loop spins.
NotifyRetryHandlers(error, remoteMprConfig, System::Clock::kZero);
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
MoveToState(State::ResolvingAddress);
return;
}

#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
if (mRemainingAttempts > 0)
{
CHIP_ERROR err = ScheduleSessionSetupReattempt();
System::Clock::Seconds16 reattemptDelay;
CHIP_ERROR err = ScheduleSessionSetupReattempt(reattemptDelay);
if (err == CHIP_NO_ERROR)
{
NotifyRetryHandlers(error, remoteMprConfig, reattemptDelay);
return;
}
}
Expand All @@ -333,8 +354,8 @@ void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error)

void OperationalSessionSetup::OnSessionEstablished(const SessionHandle & session)
{
VerifyOrReturn(mState != State::Uninitialized,
ChipLogError(Discovery, "HandleCASEConnected was called while the device was not initialized"));
VerifyOrReturn(mState == State::Connecting,
ChipLogError(Discovery, "OnSessionEstablished was called while we were not connecting"));

if (!mSecureSession.Grab(session))
return; // Got an invalid session, do not change any state
Expand Down Expand Up @@ -489,7 +510,7 @@ void OperationalSessionSetup::UpdateAttemptCount(uint8_t attemptCount)
}
}

CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt()
CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock::Seconds16 & timerDelay)
{
VerifyOrDie(mRemainingAttempts > 0);
// Try again, but not if things are in shutdown such that we can't get
Expand All @@ -500,7 +521,6 @@ CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt()
}

MoveToState(State::NeedsAddress);
System::Clock::Seconds16 timerDelay;
// Stop exponential backoff before our delays get too large.
//
// Note that mAttemptsDone is always > 0 here, because we have
Expand Down Expand Up @@ -545,6 +565,27 @@ void OperationalSessionSetup::TrySetupAgain(System::Layer * systemLayer, void *
self->DequeueConnectionCallbacks(err);
// Do not touch `self` instance anymore; it has been destroyed in DequeueConnectionCallbacks.
}

// Registers `onRetry` so it is invoked each time this session setup decides
// to retry. The callback must be non-null; callers are expected to check.
void OperationalSessionSetup::AddRetryHandler(Callback::Callback<OnDeviceConnectionRetry> * onRetry)
{
    // Cancel() is called first and its result is what gets queued
    // (presumably this detaches the callback from any queue it is
    // already on -- confirm against the Callback implementation).
    auto * retryEntry = onRetry->Cancel();
    mConnectionRetry.Enqueue(retryEntry);
}

// Tells every registered retry handler that session establishment failed with
// `error` and will be retried.
//
// @param error            The error that caused the retry.
// @param remoteMrpConfig  MRP configuration used to estimate how long the
//                         Sigma1 response wait of the next attempt can take.
// @param retryDelay       How long we will wait before starting the retry.
//
// Each handler receives retryDelay plus the estimated Sigma1 response timeout
// (rounded up to whole seconds).
void OperationalSessionSetup::NotifyRetryHandlers(CHIP_ERROR error, const ReliableMessageProtocolConfig & remoteMrpConfig,
                                                  System::Clock::Seconds16 retryDelay)
{
    // Estimate how long the retry needs before we can tell that it failed.
    const System::Clock::Timeout sigma1Timeout = CASESession::ComputeSigma1ResponseTimeout(remoteMrpConfig);

    // The cast truncates to whole seconds, so pad by one second to cover any
    // fractional part that was dropped.
    auto responseSecs = std::chrono::duration_cast<System::Clock::Seconds16>(sigma1Timeout);
    responseSecs += System::Clock::Seconds16(1);

    const System::Clock::Seconds16 totalTimeout = responseSecs + retryDelay;

    // Walk the retry-callback list until it ends or wraps back around to the
    // list head.
    auto * node = mConnectionRetry.First();
    while (node && node != &mConnectionRetry)
    {
        auto handler = Callback::Callback<OnDeviceConnectionRetry>::FromCancelable(node);
        handler->mCall(handler->mContext, mPeerId, error, totalTimeout);
        node = node->mNext;
    }
}
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES

} // namespace chip
31 changes: 29 additions & 2 deletions src/app/OperationalSessionSetup.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@
#include <messaging/ExchangeDelegate.h>
#include <messaging/ExchangeMgr.h>
#include <messaging/Flags.h>
#include <messaging/ReliableMessageProtocolConfig.h>
#include <platform/CHIPDeviceConfig.h>
#include <protocols/secure_channel/CASESession.h>
#include <system/SystemClock.h>
#include <system/SystemLayer.h>
#include <transport/SessionManager.h>
#include <transport/TransportMgr.h>
Expand Down Expand Up @@ -117,8 +119,20 @@ class OperationalDeviceProxy : public DeviceProxy
* implementations as an example.
*/
typedef void (*OnDeviceConnected)(void * context, Messaging::ExchangeManager & exchangeMgr, const SessionHandle & sessionHandle);

/**
* Callback prototype when secure session establishment fails.
*/
typedef void (*OnDeviceConnectionFailure)(void * context, const ScopedNodeId & peerId, CHIP_ERROR error);

/**
* Callback prototype when secure session establishment has failed and will be
* retried. retryTimeout indicates how much time will pass before we know
* whether the retry has timed out waiting for a response to our Sigma1 message.
*/
typedef void (*OnDeviceConnectionRetry)(void * context, const ScopedNodeId & peerId, CHIP_ERROR error,
System::Clock::Seconds16 retryTimeout);

/**
* Object used to either establish a connection to peer or performing address lookup to a peer.
*
Expand Down Expand Up @@ -227,6 +241,9 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate,
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
// Update our remaining attempt count to be at least the given value.
void UpdateAttemptCount(uint8_t attemptCount);

// Add a retry handler for this session setup.
void AddRetryHandler(Callback::Callback<OnDeviceConnectionRetry> * onRetry);
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES

private:
Expand Down Expand Up @@ -268,6 +285,8 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate,
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
uint8_t mRemainingAttempts = 0;
uint8_t mAttemptsDone = 0;

Callback::CallbackDeque mConnectionRetry;
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES

void MoveToState(State aTargetState);
Expand Down Expand Up @@ -313,14 +332,22 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate,

#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
/**
* Schedule a setup reattempt, if possible.
* Schedule a setup reattempt, if possible. The outparam indicates how long
* it will be before the reattempt happens.
*/
CHIP_ERROR ScheduleSessionSetupReattempt();
CHIP_ERROR ScheduleSessionSetupReattempt(System::Clock::Seconds16 & timerDelay);

/**
* Helper for our backoff retry timer.
*/
static void TrySetupAgain(System::Layer * systemLayer, void * state);

/**
* Helper to notify our retry callbacks that a setup error occurred and we
* will retry.
*/
void NotifyRetryHandlers(CHIP_ERROR error, const ReliableMessageProtocolConfig & remoteMrpConfig,
System::Clock::Seconds16 retryDelay);
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
};

Expand Down
59 changes: 58 additions & 1 deletion src/controller/CHIPDeviceController.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,9 @@ ControllerDeviceInitParams DeviceController::GetControllerDeviceInitParams()

DeviceCommissioner::DeviceCommissioner() :
mOnDeviceConnectedCallback(OnDeviceConnectedFn, this), mOnDeviceConnectionFailureCallback(OnDeviceConnectionFailureFn, this),
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
mOnDeviceConnectionRetryCallback(OnDeviceConnectionRetryFn, this),
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
mDeviceAttestationInformationVerificationCallback(OnDeviceAttestationInformationVerification, this),
mDeviceNOCChainCallback(OnDeviceNOCChainGeneration, this), mSetUpCodePairer(this)
{
Expand Down Expand Up @@ -1731,6 +1734,60 @@ void DeviceCommissioner::OnDeviceConnectionFailureFn(void * context, const Scope
}
}

#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
// No specific action to take on either success or failure here; we're just
// trying to bump the fail-safe, and if that fails it's not clear there's much
// we can do with that.
//
// Failure callback passed to ExtendArmFailSafe() for the CASE-retry fail-safe
// extension: logs the error and does nothing else. `context` is not used.
static void OnExtendFailsafeForCASERetryFailure(void * context, CHIP_ERROR error)
{
ChipLogError(Controller, "Failed to extend fail-safe for CASE retry: %" CHIP_ERROR_FORMAT, error.Format());
}
// Success callback passed to ExtendArmFailSafe() for the CASE-retry fail-safe
// extension. It only logs the errorCode carried in the ArmFailSafeResponse;
// no other action is taken and `context` is not used.
static void
OnExtendFailsafeForCASERetrySuccess(void * context,
const app::Clusters::GeneralCommissioning::Commands::ArmFailSafeResponse::DecodableType & data)
{
ChipLogProgress(Controller, "Status of extending fail-safe for CASE retry: %u", to_underlying(data.errorCode));
}

// Retry notification handler: called when operational (CASE) session setup
// has failed and is about to be retried. Extends the commissionee's fail-safe
// so it does not expire while the retry is in flight.
//
// @param context       The DeviceCommissioner that registered this callback.
// @param peerId        The node whose session establishment is being retried.
// @param error         The error that triggered the retry.
// @param retryTimeout  How long until we expect to know whether the retry got
//                      a Sigma1 response or timed out.
void DeviceCommissioner::OnDeviceConnectionRetryFn(void * context, const ScopedNodeId & peerId, CHIP_ERROR error,
                                                   System::Clock::Seconds16 retryTimeout)
{
    ChipLogError(Controller,
                 "Session establishment failed for " ChipLogFormatScopedNodeId ", error: %" CHIP_ERROR_FORMAT
                 ". Next retry expected to get a response to Sigma1 or fail within %d seconds",
                 ChipLogValueScopedNodeId(peerId), error.Format(), retryTimeout.count());

    auto * commissioner = static_cast<DeviceCommissioner *>(context);

    // The fail-safe has to be re-armed over the PASE session, so look up the
    // commissionee device for this node.
    auto * paseDevice = commissioner->FindCommissioneeDevice(peerId.GetNodeId());
    if (paseDevice == nullptr)
    {
        // Most likely commissioning was canceled; there is nothing useful we
        // can do with this notification.
        return;
    }

    // Ask for the default fail-safe timeout on top of the retry timeout, so
    // the fail-safe cannot expire before the next attempt (if any) happens.
    // Saturate at UINT16_MAX instead of wrapping around.
    //
    // TODO: Make it possible for our clients to control the exact timeout here?
    uint16_t failsafeTimeout = UINT16_MAX;
    if (UINT16_MAX - retryTimeout.count() >= kDefaultFailsafeTimeout)
    {
        failsafeTimeout = static_cast<uint16_t>(retryTimeout.count() + kDefaultFailsafeTimeout);
    }

    commissioner->ExtendArmFailSafe(paseDevice, CommissioningStage::kFindOperational, failsafeTimeout,
                                    MakeOptional(kMinimumCommissioningStepTimeout), OnExtendFailsafeForCASERetrySuccess,
                                    OnExtendFailsafeForCASERetryFailure);
}
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES

// ClusterStateCache::Callback impl
void DeviceCommissioner::OnDone(app::ReadClient *)
{
Expand Down Expand Up @@ -2444,7 +2501,7 @@ void DeviceCommissioner::PerformCommissioningStep(DeviceProxy * proxy, Commissio
&mOnDeviceConnectionFailureCallback
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
,
/* attemptCount = */ 3
/* attemptCount = */ 3, &mOnDeviceConnectionRetryCallback
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
);
}
Expand Down
8 changes: 8 additions & 0 deletions src/controller/CHIPDeviceController.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
#include <protocols/secure_channel/MessageCounterManager.h>
#include <protocols/secure_channel/RendezvousParameters.h>
#include <protocols/user_directed_commissioning/UserDirectedCommissioning.h>
#include <system/SystemClock.h>
#include <transport/SessionManager.h>
#include <transport/TransportMgr.h>
#include <transport/raw/UDP.h>
Expand Down Expand Up @@ -787,6 +788,10 @@ class DLL_EXPORT DeviceCommissioner : public DeviceController,

static void OnDeviceConnectedFn(void * context, Messaging::ExchangeManager & exchangeMgr, const SessionHandle & sessionHandle);
static void OnDeviceConnectionFailureFn(void * context, const ScopedNodeId & peerId, CHIP_ERROR error);
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
static void OnDeviceConnectionRetryFn(void * context, const ScopedNodeId & peerId, CHIP_ERROR error,
System::Clock::Seconds16 retryTimeout);
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES

static void OnDeviceAttestationInformationVerification(void * context,
const Credentials::DeviceAttestationVerifier::AttestationInfo & info,
Expand Down Expand Up @@ -897,6 +902,9 @@ class DLL_EXPORT DeviceCommissioner : public DeviceController,

chip::Callback::Callback<OnDeviceConnected> mOnDeviceConnectedCallback;
chip::Callback::Callback<OnDeviceConnectionFailure> mOnDeviceConnectionFailureCallback;
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
chip::Callback::Callback<OnDeviceConnectionRetry> mOnDeviceConnectionRetryCallback;
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES

chip::Callback::Callback<Credentials::DeviceAttestationVerifier::OnAttestationInformationVerification>
mDeviceAttestationInformationVerificationCallback;
Expand Down
16 changes: 13 additions & 3 deletions src/protocols/secure_channel/CASESession.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,9 @@ static_assert(sizeof(kTBEData2_Nonce) == sizeof(kTBEData3_Nonce), "TBEData2_Nonc
//
// The session establishment fails if the response is not received within the resulting timeout window,
// which accounts for both transport latency and the server-side latency.
static constexpr ExchangeContext::Timeout kExpectedLowProcessingTime = System::Clock::Seconds16(2);
static constexpr ExchangeContext::Timeout kExpectedHighProcessingTime = System::Clock::Seconds16(30);
static constexpr ExchangeContext::Timeout kExpectedLowProcessingTime = System::Clock::Seconds16(2);
static constexpr ExchangeContext::Timeout kExpectedSigma1ProcessingTime = kExpectedLowProcessingTime;
static constexpr ExchangeContext::Timeout kExpectedHighProcessingTime = System::Clock::Seconds16(30);

CASESession::~CASESession()
{
Expand Down Expand Up @@ -273,7 +274,7 @@ CHIP_ERROR CASESession::EstablishSession(SessionManager & sessionManager, Fabric
mSessionResumptionStorage = sessionResumptionStorage;
mLocalMRPConfig = mrpLocalConfig;

mExchangeCtxt->UseSuggestedResponseTimeout(kExpectedLowProcessingTime);
mExchangeCtxt->UseSuggestedResponseTimeout(kExpectedSigma1ProcessingTime);
mPeerNodeId = peerScopedNodeId.GetNodeId();
mLocalNodeId = fabricInfo->GetNodeId();

Expand Down Expand Up @@ -1974,4 +1975,13 @@ CHIP_ERROR CASESession::OnMessageReceived(ExchangeContext * ec, const PayloadHea
return err;
}

// Estimates how long it can take before we know whether a Sigma1 message got
// a response: the MRP retransmission timeout derived from the peer's
// active/idle intervals, plus the expected Sigma1 processing time.
System::Clock::Timeout CASESession::ComputeSigma1ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig)
{
    // Passing a zero last-activity time makes the computation treat the peer
    // as idle, which is also what we assume when sending our initial message.
    const auto retransmissionTimeout =
        GetRetransmissionTimeout(remoteMrpConfig.mActiveRetransTimeout, remoteMrpConfig.mIdleRetransTimeout,
                                 System::Clock::kZero, Transport::kMinActiveTime);

    return retransmissionTimeout + kExpectedSigma1ProcessingTime;
}

} // namespace chip
Loading

0 comments on commit 77a833e

Please sign in to comment.