From 107079558badfdb281a49c1058fa40ecaf9ad798 Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Thu, 2 Feb 2023 14:55:03 -0500 Subject: [PATCH] Add the ability for OperationalSessionSetup to retry a few times automatically. (#24808) The basic idea is that if CASE establishment fails we try again, after a brief backoff. --- src/app/CASESessionManager.cpp | 11 ++- src/app/CASESessionManager.h | 10 +- src/app/OperationalSessionSetup.cpp | 119 ++++++++++++++++++++++-- src/app/OperationalSessionSetup.h | 25 ++++- src/controller/CHIPDeviceController.cpp | 7 +- src/include/platform/CHIPDeviceConfig.h | 32 +++++++ 6 files changed, 193 insertions(+), 11 deletions(-) diff --git a/src/app/CASESessionManager.cpp b/src/app/CASESessionManager.cpp index 9d4f3814943e42..3fca62d5cc37ed 100644 --- a/src/app/CASESessionManager.cpp +++ b/src/app/CASESessionManager.cpp @@ -30,7 +30,12 @@ CHIP_ERROR CASESessionManager::Init(chip::System::Layer * systemLayer, const CAS } void CASESessionManager::FindOrEstablishSession(const ScopedNodeId & peerId, Callback::Callback * onConnection, - Callback::Callback * onFailure) + Callback::Callback * onFailure +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + , + uint8_t attemptCount +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES +) { ChipLogDetail(CASESessionManager, "FindOrEstablishSession: PeerId = [%d:" ChipLogFormatX64 "]", peerId.GetFabricIndex(), ChipLogValueX64(peerId.GetNodeId())); @@ -53,6 +58,10 @@ void CASESessionManager::FindOrEstablishSession(const ScopedNodeId & peerId, Cal } } +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + session->UpdateAttemptCount(attemptCount); +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + session->Connect(onConnection, onFailure); } diff --git a/src/app/CASESessionManager.h b/src/app/CASESessionManager.h index 1e901478aaf6b7..455ca0b3b36fe1 100644 --- a/src/app/CASESessionManager.h +++ b/src/app/CASESessionManager.h @@ -77,9 +77,17 @@ class CASESessionManager : public OperationalSessionReleaseDelegate, public Sess * * The `onFailure` callback may be called before the FindOrEstablishSession * call returns, for error cases that are detected synchronously. + * + * attemptCount can be used to automatically retry multiple times if session + * setup is not successful. */ void FindOrEstablishSession(const ScopedNodeId & peerId, Callback::Callback * onConnection, - Callback::Callback * onFailure); + Callback::Callback * onFailure +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + , + uint8_t attemptCount = 1 +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + ); void ReleaseSessionsForFabric(FabricIndex fabricIndex); diff --git a/src/app/OperationalSessionSetup.cpp b/src/app/OperationalSessionSetup.cpp index f4e76ed700a370..a87741ea3a246b 100644 --- a/src/app/OperationalSessionSetup.cpp +++ b/src/app/OperationalSessionSetup.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include using namespace chip::Callback; @@ -49,7 +50,7 @@ void OperationalSessionSetup::MoveToState(State aTargetState) { if (mState != aTargetState) { - ChipLogDetail(Controller, "OperationalSessionSetup[%u:" ChipLogFormatX64 "]: State change %d --> %d", + ChipLogDetail(Discovery, "OperationalSessionSetup[%u:" ChipLogFormatX64 "]: State change %d --> %d", mPeerId.GetFabricIndex(), ChipLogValueX64(mPeerId.GetNodeId()), to_underlying(mState), to_underlying(aTargetState)); mState = aTargetState; @@ -70,7 +71,7 @@ bool OperationalSessionSetup::AttachToExistingSecureSession() if (!sessionHandle.HasValue()) return false; - ChipLogProgress(Controller, "Found an existing secure session to [%u:" ChipLogFormatX64 "]!", mPeerId.GetFabricIndex(), + ChipLogProgress(Discovery, "Found an existing secure session to [%u:" ChipLogFormatX64 "]!", mPeerId.GetFabricIndex(), ChipLogValueX64(mPeerId.GetNodeId())); mDeviceAddress = sessionHandle.Value()->AsSecureSession()->GetPeerAddress(); @@ -214,7 +215,7 @@ void OperationalSessionSetup::UpdateDeviceData(const Transport::PeerAddress & ad return; } - ChipLogError(Controller, "Received UpdateDeviceData in incorrect state"); + ChipLogError(Discovery, "Received UpdateDeviceData in incorrect state"); DequeueConnectionCallbacks(CHIP_ERROR_INCORRECT_STATE); // Do not touch `this` instance anymore; it has been destroyed in DequeueConnectionCallbacks. } @@ -304,7 +305,7 @@ void OperationalSessionSetup::DequeueConnectionCallbacks(CHIP_ERROR error) void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error) { VerifyOrReturn(mState != State::Uninitialized && mState != State::NeedsAddress, - ChipLogError(Controller, "HandleCASEConnectionFailure was called while the device was not initialized")); + ChipLogError(Discovery, "HandleCASEConnectionFailure was called while the device was not initialized")); if (CHIP_ERROR_TIMEOUT == error) { @@ -313,6 +314,17 @@ void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error) MoveToState(State::ResolvingAddress); return; } + +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + if (mRemainingAttempts > 0) + { + CHIP_ERROR err = ScheduleSessionSetupReattempt(); + if (err == CHIP_NO_ERROR) + { + return; + } + } +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES } DequeueConnectionCallbacks(error); @@ -322,7 +334,7 @@ void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error) void OperationalSessionSetup::OnSessionEstablished(const SessionHandle & session) { VerifyOrReturn(mState != State::Uninitialized, - ChipLogError(Controller, "HandleCASEConnected was called while the device was not initialized")); + ChipLogError(Discovery, "HandleCASEConnected was called while the device was not initialized")); if (!mSecureSession.Grab(session)) return; // Got an invalid session, do not change any state @@ -377,6 +389,17 @@ OperationalSessionSetup::~OperationalSessionSetup() CHIP_ERROR OperationalSessionSetup::LookupPeerAddress() { +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + if (mRemainingAttempts > 0) + { + --mRemainingAttempts; + } + if (mAttemptsDone < UINT8_MAX) + { + ++mAttemptsDone; + } +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // NOTE: This is public API that can be used to update our stored peer // address even when we are in State::Connected, so we do not make any // MoveToState calls in this method. @@ -418,7 +441,7 @@ void OperationalSessionSetup::PerformAddressUpdate() CHIP_ERROR err = LookupPeerAddress(); if (err != CHIP_NO_ERROR) { - ChipLogError(Controller, "Failed to look up peer address: %" CHIP_ERROR_FORMAT, err.Format()); + ChipLogError(Discovery, "Failed to look up peer address: %" CHIP_ERROR_FORMAT, err.Format()); DequeueConnectionCallbacks(err); // Do not touch `this` instance anymore; it has been destroyed in DequeueConnectionCallbacks. return; @@ -435,9 +458,93 @@ void OperationalSessionSetup::OnNodeAddressResolutionFailed(const PeerId & peerI ChipLogError(Discovery, "OperationalSessionSetup[%u:" ChipLogFormatX64 "]: operational discovery failed: %" CHIP_ERROR_FORMAT, mPeerId.GetFabricIndex(), ChipLogValueX64(mPeerId.GetNodeId()), reason.Format()); + // Does it make sense to ScheduleSessionSetupReattempt() here? DNS-SD + // resolution has its own retry/backoff mechanisms, so if it's failed we + // have already done a lot of that. + // No need to modify any variables in `this` since call below releases `this`. DequeueConnectionCallbacks(reason); // Do not touch `this` instance anymore; it has been destroyed in DequeueConnectionCallbacks. } +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES +void OperationalSessionSetup::UpdateAttemptCount(uint8_t attemptCount) +{ + if (attemptCount == 0) + { + // Nothing to do. + return; + } + + if (mState != State::NeedsAddress) + { + // We're in the middle of an attempt already, so decrement attemptCount + // by 1 to account for that. + --attemptCount; + } + + if (attemptCount > mRemainingAttempts) + { + mRemainingAttempts = attemptCount; + } +} + +CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt() +{ + VerifyOrDie(mRemainingAttempts > 0); + // Try again, but not if things are in shutdown such that we can't get + // to a system layer, and not if we've run out of attempts. + if (!mInitParams.exchangeMgr->GetSessionManager() || !mInitParams.exchangeMgr->GetSessionManager()->SystemLayer()) + { + return CHIP_ERROR_INCORRECT_STATE; + } + + MoveToState(State::NeedsAddress); + System::Clock::Seconds16 timerDelay; + // Stop exponential backoff before our delays get too large. + // + // Note that mAttemptsDone is always > 0 here, because we have + // just finished one attempt. + VerifyOrDie(mAttemptsDone > 0); + static_assert(UINT16_MAX / CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_INITIAL_DELAY_SECONDS >= + (1 << CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_MAX_BACKOFF), + "Our backoff calculation will overflow."); + timerDelay = System::Clock::Seconds16( + static_cast(CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_INITIAL_DELAY_SECONDS + << min((mAttemptsDone - 1), CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_MAX_BACKOFF))); + CHIP_ERROR err = mInitParams.exchangeMgr->GetSessionManager()->SystemLayer()->StartTimer(timerDelay, TrySetupAgain, this); + // The cast on count() is needed because the type count() returns might not + // actually be uint16_t; on some platforms it's int. + ChipLogProgress(Discovery, + "OperationalSessionSetup:attempts done: %u, attempts left: %u, retry delay %us, status %" CHIP_ERROR_FORMAT, + mAttemptsDone, mRemainingAttempts, static_cast(timerDelay.count()), err.Format()); + return err; +} + +void OperationalSessionSetup::TrySetupAgain(System::Layer * systemLayer, void * state) +{ + auto * self = static_cast(state); + + CHIP_ERROR err = CHIP_NO_ERROR; + + if (self->mState != State::NeedsAddress) + { + err = CHIP_ERROR_INCORRECT_STATE; + } + else + { + self->MoveToState(State::ResolvingAddress); + err = self->LookupPeerAddress(); + if (err == CHIP_NO_ERROR) + { + return; + } + } + + // Give up; we're either in a bad state or could not start a lookup. + self->DequeueConnectionCallbacks(err); + // Do not touch `self` instance anymore; it has been destroyed in DequeueConnectionCallbacks. +} +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + } // namespace chip diff --git a/src/app/OperationalSessionSetup.h b/src/app/OperationalSessionSetup.h index 825278ab771cab..029d355e25ef21 100644 --- a/src/app/OperationalSessionSetup.h +++ b/src/app/OperationalSessionSetup.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -158,7 +159,6 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, } mClientPool = clientPool; - mSystemLayer = params.exchangeMgr->GetSessionManager()->SystemLayer(); mPeerId = peerId; mReleaseDelegate = releaseDelegate; mState = State::NeedsAddress; @@ -224,6 +224,11 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, void OnNodeAddressResolved(const PeerId & peerId, const AddressResolve::ResolveResult & result) override; void OnNodeAddressResolutionFailed(const PeerId & peerId, CHIP_ERROR reason) override; +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // Update our remaining attempt count to be at least the given value. + void UpdateAttemptCount(uint8_t attemptCount); +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + private: enum class State : uint8_t { @@ -237,7 +242,6 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, CASEClientInitParams mInitParams; CASEClientPoolDelegate * mClientPool = nullptr; - System::Layer * mSystemLayer; // mCASEClient is only non-null if we are in State::Connecting or just // allocated it as part of an attempt to enter State::Connecting. @@ -261,6 +265,11 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, bool mPerformingAddressUpdate = false; +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + uint8_t mRemainingAttempts = 0; + uint8_t mAttemptsDone = 0; +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + void MoveToState(State aTargetState); CHIP_ERROR EstablishConnection(const ReliableMessageProtocolConfig & config); @@ -301,6 +310,18 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, * This function will set new IP address, port and MRP retransmission intervals of the device. */ void UpdateDeviceData(const Transport::PeerAddress & addr, const ReliableMessageProtocolConfig & config); + +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + /** + * Schedule a setup reattempt, if possible. + */ + CHIP_ERROR ScheduleSessionSetupReattempt(); + + /** + * Helper for our backoff retry timer. + */ + static void TrySetupAgain(System::Layer * systemLayer, void * state); +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES }; } // namespace chip diff --git a/src/controller/CHIPDeviceController.cpp b/src/controller/CHIPDeviceController.cpp index a5bd64a78b98fa..f0caac0d738907 100644 --- a/src/controller/CHIPDeviceController.cpp +++ b/src/controller/CHIPDeviceController.cpp @@ -2441,7 +2441,12 @@ void DeviceCommissioner::PerformCommissioningStep(DeviceProxy * proxy, Commissio // If there is an error, CommissioningStageComplete will be called from OnDeviceConnectionFailureFn. auto scopedPeerId = GetPeerScopedId(proxy->GetDeviceId()); mSystemState->CASESessionMgr()->FindOrEstablishSession(scopedPeerId, &mOnDeviceConnectedCallback, - &mOnDeviceConnectionFailureCallback); + &mOnDeviceConnectionFailureCallback +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + , + /* attemptCount = */ 3 +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + ); } break; case CommissioningStage::kSendComplete: { diff --git a/src/include/platform/CHIPDeviceConfig.h b/src/include/platform/CHIPDeviceConfig.h index 4b2d93013988b0..72cdf6e36c620d 100755 --- a/src/include/platform/CHIPDeviceConfig.h +++ b/src/include/platform/CHIPDeviceConfig.h @@ -1282,6 +1282,38 @@ #define CHIP_DEVICE_CONFIG_PAIRING_SECONDARY_INSTRUCTION "" #endif +/** + * CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + * + * If 1, enable support for automatic CASE establishment retries. + */ +#ifndef CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES +#define CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES 1 +#endif + +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + +/** + * CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_INITIAL_DELAY_SECONDS + * + * The initial retry delay, in seconds, for our automatic CASE retries. + */ +#ifndef CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_INITIAL_DELAY_SECONDS +#define CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_INITIAL_DELAY_SECONDS 1 +#endif + +/** + * CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_MAX_BACKOFF + * + * The maximum number of times we back off, by a factor of 2 each time, from our + * initial CASE retry interval before we plateau. + */ +#ifndef CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_MAX_BACKOFF +#define CHIP_DEVICE_CONFIG_AUTOMATIC_CASE_RETRY_MAX_BACKOFF 5 +#endif + +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + // -------------------- App Platform Configuration -------------------- /**