From 86a67122d42f18d3729e0b94906fda969625b85e Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Wed, 29 Mar 2023 12:43:02 -0400 Subject: [PATCH] Fix crashes during CASE establishment. (#25868) * Fix crashes during CASE establishment. There were two potential sources of crashes: 1) TryNextResult on the resolver scheduled an async task that could not be canceled. If, while that task was pending, the OperationalSessionSetup was destroyed (which could happen if another session for the same peer had become active in the meantime and another connection attempt happened) we would end up with use-after-free. 2) ScheduleSessionSetupReattempt scheduled a timer that was never being canceled. Similar to above, if the OperationalSessionSetup got destroyed before the timer fired we could end up with use-after-free. The fix for the first problem is to make TryNextResult synchronous, instead of scheduling an async task. The fix for the second problem is to introduce a new state of OperationalSessionSetup that's used while waiting for the timer and to ensure that the timer is canceled when leaving that state or if the OperationalSessionSetup is destroyed. To preserve existing behavior, if we are in the "waiting for timer" state and new connection attempt happens and there is a session we can attach to, then we immediately move to the Connected state (and cancel the timer). * Address review comments. --- src/app/OperationalSessionSetup.cpp | 64 +++++++++++++------ src/app/OperationalSessionSetup.h | 8 +++ src/lib/address_resolve/AddressResolve.h | 33 ++++++---- .../AddressResolve_DefaultImpl.cpp | 14 ++-- .../AddressResolve_DefaultImpl.h | 1 - 5 files changed, 80 insertions(+), 40 deletions(-) diff --git a/src/app/OperationalSessionSetup.cpp b/src/app/OperationalSessionSetup.cpp index 62551d66c22247..2e9d572a18618e 100644 --- a/src/app/OperationalSessionSetup.cpp +++ b/src/app/OperationalSessionSetup.cpp @@ -53,6 +53,14 @@ void OperationalSessionSetup::MoveToState(State aTargetState) ChipLogDetail(Discovery, "OperationalSessionSetup[%u:" ChipLogFormatX64 "]: State change %d --> %d", mPeerId.GetFabricIndex(), ChipLogValueX64(mPeerId.GetNodeId()), to_underlying(mState), to_underlying(aTargetState)); + +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + if (mState == State::WaitingForRetry) + { + CancelSessionSetupReattempt(); + } +#endif + mState = aTargetState; if (aTargetState != State::Connecting) @@ -64,7 +72,9 @@ void OperationalSessionSetup::MoveToState(State aTargetState) bool OperationalSessionSetup::AttachToExistingSecureSession() { - VerifyOrReturnError(mState == State::NeedsAddress || mState == State::ResolvingAddress || mState == State::HasAddress, false); + VerifyOrReturnError(mState == State::NeedsAddress || mState == State::ResolvingAddress || mState == State::HasAddress || + mState == State::WaitingForRetry, + false); auto sessionHandle = mInitParams.sessionManager->FindSecureSessionForNode(mPeerId, MakeOptional(Transport::SecureSession::Type::kCASE)); @@ -119,6 +129,7 @@ void OperationalSessionSetup::Connect(Callback::Callback * on break; case State::ResolvingAddress: + case State::WaitingForRetry: isConnected = AttachToExistingSecureSession(); break; @@ -320,20 +331,27 @@ void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error) { #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES // Make a copy of the ReliableMessageProtocolConfig, since our - // mCaseClient is about to go away. + // mCaseClient is about to go away once we change state. ReliableMessageProtocolConfig remoteMprConfig = mCASEClient->GetRemoteMRPIntervals(); #endif + // Move to the ResolvingAddress state, in case we have more results, + // since we expect to receive results in that state. + MoveToState(State::ResolvingAddress); if (CHIP_NO_ERROR == Resolver::Instance().TryNextResult(mAddressLookupHandle)) { #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES - // Our retry is going to be immediate, once the event loop spins. + // Our retry has already been kicked off. NotifyRetryHandlers(error, remoteMprConfig, System::Clock::kZero); #endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES - MoveToState(State::ResolvingAddress); return; } + // Moving back to the Connecting state would be a bit of a lie, since we + // don't have an mCASEClient. Just go back to NeedsAddress, since + // that's really where we are now. + MoveToState(State::NeedsAddress); + #if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES if (mRemainingAttempts > 0) { @@ -341,6 +359,7 @@ void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error) CHIP_ERROR err = ScheduleSessionSetupReattempt(reattemptDelay); if (err == CHIP_NO_ERROR) { + MoveToState(State::WaitingForRetry); NotifyRetryHandlers(error, remoteMprConfig, reattemptDelay); return; } @@ -406,6 +425,10 @@ OperationalSessionSetup::~OperationalSessionSetup() // Make sure we don't leak it. mClientPool->Release(mCASEClient); } + +#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES + CancelSessionSetupReattempt(); +#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES } CHIP_ERROR OperationalSessionSetup::LookupPeerAddress() @@ -553,27 +576,32 @@ CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock: return err; } +void OperationalSessionSetup::CancelSessionSetupReattempt() +{ + // If we can't get a system layer, there is no way for us to cancel things + // at this point, but hopefully that's because everything is torn down + // anyway and hence the timer will not fire. + auto * sessionManager = mInitParams.exchangeMgr->GetSessionManager(); + VerifyOrReturn(sessionManager != nullptr); + + auto * systemLayer = sessionManager->SystemLayer(); + VerifyOrReturn(systemLayer != nullptr); + + systemLayer->CancelTimer(TrySetupAgain, this); +} + void OperationalSessionSetup::TrySetupAgain(System::Layer * systemLayer, void * state) { auto * self = static_cast(state); - CHIP_ERROR err = CHIP_NO_ERROR; - - if (self->mState != State::NeedsAddress) - { - err = CHIP_ERROR_INCORRECT_STATE; - } - else + self->MoveToState(State::ResolvingAddress); + CHIP_ERROR err = self->LookupPeerAddress(); + if (err == CHIP_NO_ERROR) { - self->MoveToState(State::ResolvingAddress); - err = self->LookupPeerAddress(); - if (err == CHIP_NO_ERROR) - { - return; - } + return; } - // Give up; we're either in a bad state or could not start a lookup. + // Give up; we could not start a lookup. self->DequeueConnectionCallbacks(err); // Do not touch `self` instance anymore; it has been destroyed in DequeueConnectionCallbacks. } diff --git a/src/app/OperationalSessionSetup.h b/src/app/OperationalSessionSetup.h index cb3f624ba6724e..60f0f0ecaba5b7 100644 --- a/src/app/OperationalSessionSetup.h +++ b/src/app/OperationalSessionSetup.h @@ -253,6 +253,8 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, HasAddress, // Have an address, CASE handshake not started yet. Connecting, // CASE handshake in progress. SecureConnected, // CASE session established. + WaitingForRetry, // No address known, but a retry is pending. Added at + // end to make logs easier to understand. }; CASEClientInitParams mInitParams; @@ -335,6 +337,12 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate, */ CHIP_ERROR ScheduleSessionSetupReattempt(System::Clock::Seconds16 & timerDelay); + /** + * Cancel a scheduled setup reattempt, if we can (i.e. if we still have + * access to the SystemLayer). + */ + void CancelSessionSetupReattempt(); + /** * Helper for our backoff retry timer. */ diff --git a/src/lib/address_resolve/AddressResolve.h b/src/lib/address_resolve/AddressResolve.h index 11c0959e3357df..8034aed7c1f28a 100644 --- a/src/lib/address_resolve/AddressResolve.h +++ b/src/lib/address_resolve/AddressResolve.h @@ -204,19 +204,30 @@ class Resolver /// in progress) virtual CHIP_ERROR LookupNode(const NodeLookupRequest & request, Impl::NodeLookupHandle & handle) = 0; - /// Inform the Lookup handle that the previous node lookup was not sufficient - /// for the purpose of the caller (e.g establishing a session fails with the - /// result of the previous lookup), and that more data is needed. + /// Inform the Lookup handle that the previous node lookup was not + /// sufficient for the purpose of the caller (e.g establishing a session + /// fails with the result of the previous lookup), and that more data is + /// needed. /// - /// If this returns CHIP_NO_ERROR, the following is expected: - /// - The listener OnNodeAddressResolved will be called with the additional data. - /// - handle must NOT be destroyed while the lookup is in progress (it - /// is part of an internal 'lookup list') - /// - handle must NOT be reused (the lookup is done on a per-node basis - /// and maintains lookup data internally while the operation is still - /// in progress) + /// This method must be called on a handle that is no longer active to + /// succeed. + /// + /// If the handle is no longer active and has results that have not been + /// delivered to the listener yet, the listener's OnNodeAddressResolved will + /// be called synchronously before the method returns. Note that depending + /// on the listener implementation this can end up destroying the handle + /// and/or the listener. + /// + /// This method will return CHIP_NO_ERROR if and only if it has called + /// OnNodeAddressResolved. + /// + /// This method will return CHIP_ERROR_INCORRECT_STATE if the handle is + /// still active. + /// + /// This method will return CHIP_ERROR_WELL_EMPTY if there are no more + /// results. /// - /// If no additional data is available at the time of the request, it returns CHIP_ERROR_WELL_EMPTY. + /// This method may return other errors in some cases. virtual CHIP_ERROR TryNextResult(Impl::NodeLookupHandle & handle) = 0; /// Stops an active lookup request. diff --git a/src/lib/address_resolve/AddressResolve_DefaultImpl.cpp b/src/lib/address_resolve/AddressResolve_DefaultImpl.cpp index a6b1bbde423dfb..10e50cbd734cfe 100644 --- a/src/lib/address_resolve/AddressResolve_DefaultImpl.cpp +++ b/src/lib/address_resolve/AddressResolve_DefaultImpl.cpp @@ -187,20 +187,14 @@ CHIP_ERROR Resolver::LookupNode(const NodeLookupRequest & request, Impl::NodeLoo CHIP_ERROR Resolver::TryNextResult(Impl::NodeLookupHandle & handle) { - VerifyOrReturnError(mSystemLayer != nullptr, CHIP_ERROR_INCORRECT_STATE); VerifyOrReturnError(!mActiveLookups.Contains(&handle), CHIP_ERROR_INCORRECT_STATE); VerifyOrReturnError(handle.HasLookupResult(), CHIP_ERROR_WELL_EMPTY); - return mSystemLayer->ScheduleWork(&OnTryNextResult, static_cast(&handle)); -} - -void Resolver::OnTryNextResult(System::Layer * layer, void * context) -{ - auto handle = static_cast(context); - auto listener = handle->GetListener(); - auto peerId = handle->GetRequest().GetPeerId(); - auto result = handle->TakeLookupResult(); + auto listener = handle.GetListener(); + auto peerId = handle.GetRequest().GetPeerId(); + auto result = handle.TakeLookupResult(); listener->OnNodeAddressResolved(peerId, result); + return CHIP_NO_ERROR; } CHIP_ERROR Resolver::CancelLookup(Impl::NodeLookupHandle & handle, FailureCallback cancel_method) diff --git a/src/lib/address_resolve/AddressResolve_DefaultImpl.h b/src/lib/address_resolve/AddressResolve_DefaultImpl.h index a70e3466326716..ac6d7dd8d1f4d0 100644 --- a/src/lib/address_resolve/AddressResolve_DefaultImpl.h +++ b/src/lib/address_resolve/AddressResolve_DefaultImpl.h @@ -183,7 +183,6 @@ class Resolver : public ::chip::AddressResolve::Resolver, public Dnssd::Operatio private: static void OnResolveTimer(System::Layer * layer, void * context) { static_cast(context)->HandleTimer(); } - static void OnTryNextResult(System::Layer * layer, void * context); /// Timer on lookup node events: min and max search times. void HandleTimer();