Skip to content

Commit

Permalink
Fix crashes during CASE establishment. (#25868)
Browse files Browse the repository at this point in the history
* Fix crashes during CASE establishment.

There were two potential sources of crashes:

1) TryNextResult on the resolver scheduled an async task that could not be
canceled.  If, while that task was pending, the OperationalSessionSetup was
destroyed (which could happen if another session for the same peer had become
active in the meantime and another connection attempt happened) we would end up
with use-after-free.

2) ScheduleSessionSetupReattempt scheduled a timer that was never being
canceled.  Similar to above, if the OperationalSessionSetup got destroyed before
the timer fired we could end up with use-after-free.

The fix for the first problem is to make TryNextResult synchronous, instead of
scheduling an async task.

The fix for the second problem is to introduce a new state of
OperationalSessionSetup that's used while waiting for the timer and to ensure
that the timer is canceled when leaving that state or if the
OperationalSessionSetup is destroyed.

To preserve existing behavior, if we are in the "waiting for timer" state and
new connection attempt happens and there is a session we can attach to, then we
immediately move to the Connected state (and cancel the timer).

* Address review comments.
  • Loading branch information
bzbarsky-apple authored and pull[bot] committed Jul 21, 2023
1 parent 7975552 commit 86a6712
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 40 deletions.
64 changes: 46 additions & 18 deletions src/app/OperationalSessionSetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ void OperationalSessionSetup::MoveToState(State aTargetState)
ChipLogDetail(Discovery, "OperationalSessionSetup[%u:" ChipLogFormatX64 "]: State change %d --> %d",
mPeerId.GetFabricIndex(), ChipLogValueX64(mPeerId.GetNodeId()), to_underlying(mState),
to_underlying(aTargetState));

#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
if (mState == State::WaitingForRetry)
{
CancelSessionSetupReattempt();
}
#endif

mState = aTargetState;

if (aTargetState != State::Connecting)
Expand All @@ -64,7 +72,9 @@ void OperationalSessionSetup::MoveToState(State aTargetState)

bool OperationalSessionSetup::AttachToExistingSecureSession()
{
VerifyOrReturnError(mState == State::NeedsAddress || mState == State::ResolvingAddress || mState == State::HasAddress, false);
VerifyOrReturnError(mState == State::NeedsAddress || mState == State::ResolvingAddress || mState == State::HasAddress ||
mState == State::WaitingForRetry,
false);

auto sessionHandle =
mInitParams.sessionManager->FindSecureSessionForNode(mPeerId, MakeOptional(Transport::SecureSession::Type::kCASE));
Expand Down Expand Up @@ -119,6 +129,7 @@ void OperationalSessionSetup::Connect(Callback::Callback<OnDeviceConnected> * on
break;

case State::ResolvingAddress:
case State::WaitingForRetry:
isConnected = AttachToExistingSecureSession();
break;

Expand Down Expand Up @@ -320,27 +331,35 @@ void OperationalSessionSetup::OnSessionEstablishmentError(CHIP_ERROR error)
{
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
// Make a copy of the ReliableMessageProtocolConfig, since our
// mCaseClient is about to go away.
// mCaseClient is about to go away once we change state.
ReliableMessageProtocolConfig remoteMprConfig = mCASEClient->GetRemoteMRPIntervals();
#endif

// Move to the ResolvingAddress state, in case we have more results,
// since we expect to receive results in that state.
MoveToState(State::ResolvingAddress);
if (CHIP_NO_ERROR == Resolver::Instance().TryNextResult(mAddressLookupHandle))
{
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
// Our retry is going to be immediate, once the event loop spins.
// Our retry has already been kicked off.
NotifyRetryHandlers(error, remoteMprConfig, System::Clock::kZero);
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
MoveToState(State::ResolvingAddress);
return;
}

// Moving back to the Connecting state would be a bit of a lie, since we
// don't have an mCASEClient. Just go back to NeedsAddress, since
// that's really where we are now.
MoveToState(State::NeedsAddress);

#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
if (mRemainingAttempts > 0)
{
System::Clock::Seconds16 reattemptDelay;
CHIP_ERROR err = ScheduleSessionSetupReattempt(reattemptDelay);
if (err == CHIP_NO_ERROR)
{
MoveToState(State::WaitingForRetry);
NotifyRetryHandlers(error, remoteMprConfig, reattemptDelay);
return;
}
Expand Down Expand Up @@ -406,6 +425,10 @@ OperationalSessionSetup::~OperationalSessionSetup()
// Make sure we don't leak it.
mClientPool->Release(mCASEClient);
}

#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
CancelSessionSetupReattempt();
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
}

CHIP_ERROR OperationalSessionSetup::LookupPeerAddress()
Expand Down Expand Up @@ -553,27 +576,32 @@ CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock:
return err;
}

void OperationalSessionSetup::CancelSessionSetupReattempt()
{
// If we can't get a system layer, there is no way for us to cancel things
// at this point, but hopefully that's because everything is torn down
// anyway and hence the timer will not fire.
auto * sessionManager = mInitParams.exchangeMgr->GetSessionManager();
VerifyOrReturn(sessionManager != nullptr);

auto * systemLayer = sessionManager->SystemLayer();
VerifyOrReturn(systemLayer != nullptr);

systemLayer->CancelTimer(TrySetupAgain, this);
}

void OperationalSessionSetup::TrySetupAgain(System::Layer * systemLayer, void * state)
{
auto * self = static_cast<OperationalSessionSetup *>(state);

CHIP_ERROR err = CHIP_NO_ERROR;

if (self->mState != State::NeedsAddress)
{
err = CHIP_ERROR_INCORRECT_STATE;
}
else
self->MoveToState(State::ResolvingAddress);
CHIP_ERROR err = self->LookupPeerAddress();
if (err == CHIP_NO_ERROR)
{
self->MoveToState(State::ResolvingAddress);
err = self->LookupPeerAddress();
if (err == CHIP_NO_ERROR)
{
return;
}
return;
}

// Give up; we're either in a bad state or could not start a lookup.
// Give up; we could not start a lookup.
self->DequeueConnectionCallbacks(err);
// Do not touch `self` instance anymore; it has been destroyed in DequeueConnectionCallbacks.
}
Expand Down
8 changes: 8 additions & 0 deletions src/app/OperationalSessionSetup.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,8 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate,
HasAddress, // Have an address, CASE handshake not started yet.
Connecting, // CASE handshake in progress.
SecureConnected, // CASE session established.
WaitingForRetry, // No address known, but a retry is pending. Added at
// end to make logs easier to understand.
};

CASEClientInitParams mInitParams;
Expand Down Expand Up @@ -335,6 +337,12 @@ class DLL_EXPORT OperationalSessionSetup : public SessionDelegate,
*/
CHIP_ERROR ScheduleSessionSetupReattempt(System::Clock::Seconds16 & timerDelay);

/**
* Cancel a scheduled setup reattempt, if we can (i.e. if we still have
* access to the SystemLayer).
*/
void CancelSessionSetupReattempt();

/**
* Helper for our backoff retry timer.
*/
Expand Down
33 changes: 22 additions & 11 deletions src/lib/address_resolve/AddressResolve.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,19 +204,30 @@ class Resolver
/// in progress)
virtual CHIP_ERROR LookupNode(const NodeLookupRequest & request, Impl::NodeLookupHandle & handle) = 0;

/// Inform the Lookup handle that the previous node lookup was not sufficient
/// for the purpose of the caller (e.g establishing a session fails with the
/// result of the previous lookup), and that more data is needed.
/// Inform the Lookup handle that the previous node lookup was not
/// sufficient for the purpose of the caller (e.g establishing a session
/// fails with the result of the previous lookup), and that more data is
/// needed.
///
/// If this returns CHIP_NO_ERROR, the following is expected:
/// - The listener OnNodeAddressResolved will be called with the additional data.
/// - handle must NOT be destroyed while the lookup is in progress (it
/// is part of an internal 'lookup list')
/// - handle must NOT be reused (the lookup is done on a per-node basis
/// and maintains lookup data internally while the operation is still
/// in progress)
/// This method must be called on a handle that is no longer active to
/// succeed.
///
/// If the handle is no longer active and has results that have not been
/// delivered to the listener yet, the listener's OnNodeAddressResolved will
/// be called synchronously before the method returns. Note that depending
/// on the listener implementation this can end up destroying the handle
/// and/or the listener.
///
/// This method will return CHIP_NO_ERROR if and only if it has called
/// OnNodeAddressResolved.
///
/// This method will return CHIP_ERROR_INCORRECT_STATE if the handle is
/// still active.
///
/// This method will return CHIP_ERROR_WELL_EMPTY if there are no more
/// results.
///
/// If no additional data is available at the time of the request, it returns CHIP_ERROR_WELL_EMPTY.
/// This method may return other errors in some cases.
virtual CHIP_ERROR TryNextResult(Impl::NodeLookupHandle & handle) = 0;

/// Stops an active lookup request.
Expand Down
14 changes: 4 additions & 10 deletions src/lib/address_resolve/AddressResolve_DefaultImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,20 +187,14 @@ CHIP_ERROR Resolver::LookupNode(const NodeLookupRequest & request, Impl::NodeLoo

CHIP_ERROR Resolver::TryNextResult(Impl::NodeLookupHandle & handle)
{
VerifyOrReturnError(mSystemLayer != nullptr, CHIP_ERROR_INCORRECT_STATE);
VerifyOrReturnError(!mActiveLookups.Contains(&handle), CHIP_ERROR_INCORRECT_STATE);
VerifyOrReturnError(handle.HasLookupResult(), CHIP_ERROR_WELL_EMPTY);

return mSystemLayer->ScheduleWork(&OnTryNextResult, static_cast<void *>(&handle));
}

void Resolver::OnTryNextResult(System::Layer * layer, void * context)
{
auto handle = static_cast<Impl::NodeLookupHandle *>(context);
auto listener = handle->GetListener();
auto peerId = handle->GetRequest().GetPeerId();
auto result = handle->TakeLookupResult();
auto listener = handle.GetListener();
auto peerId = handle.GetRequest().GetPeerId();
auto result = handle.TakeLookupResult();
listener->OnNodeAddressResolved(peerId, result);
return CHIP_NO_ERROR;
}

CHIP_ERROR Resolver::CancelLookup(Impl::NodeLookupHandle & handle, FailureCallback cancel_method)
Expand Down
1 change: 0 additions & 1 deletion src/lib/address_resolve/AddressResolve_DefaultImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ class Resolver : public ::chip::AddressResolve::Resolver, public Dnssd::Operatio

private:
static void OnResolveTimer(System::Layer * layer, void * context) { static_cast<Resolver *>(context)->HandleTimer(); }
static void OnTryNextResult(System::Layer * layer, void * context);

/// Timer on lookup node events: min and max search times.
void HandleTimer();
Expand Down

0 comments on commit 86a6712

Please sign in to comment.