Skip to content

Commit

Permalink
Fix lifetime of Darwin SubscriptionCallback to avoid shutdown crashes.
Browse files Browse the repository at this point in the history
The basic issue we could run into is that the Matter stack would shut down
while our async block was still running on our client queue, and by the time
the "delete this object" block was queued on the Matter queue that queue would
be paused.  Then if the stack was restarted the queue would be unpaused, and
the deletion of the ReadClient would happen early in stack startup, when things
were not in a good state yet.

The fix is to make sure we queue the async deletion without going through the
client queue first, and avoid doing the async bits altogether when we can (when
the subscription itself errors out).

Fixes project-chip#22320
  • Loading branch information
bzbarsky-apple committed Aug 31, 2022
1 parent 08830a5 commit e3bb221
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 41 deletions.
52 changes: 31 additions & 21 deletions src/darwin/Framework/CHIP/MTRBaseDevice.mm
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,13 @@ - (void)invalidateCASESession
void OnSubscriptionEstablished(SubscriptionId aSubscriptionId) override;

void ReportData();
void ReportError(CHIP_ERROR err);
void ReportError(const StatusIB & status);
void ReportError(NSError * _Nullable err);

// Report an error, which may be due to issues in our own internal state or
// due to the OnError callback happening.
//
// aCancelSubscription should be false for the OnError case, since it will
// be immediately followed by OnDone and we want to do the deletion there.
void ReportError(CHIP_ERROR aError, bool aCancelSubscription = true);

private:
dispatch_queue_t mQueue;
Expand All @@ -347,11 +351,13 @@ - (void)invalidateCASESession
NSMutableArray * _Nullable mAttributeReports = nil;
NSMutableArray * _Nullable mEventReports = nil;

// Our lifetime management is a little complicated. On error we
// attempt to delete the ReadClient, but asynchronously. While
// that's pending, someone else (e.g. an error it runs into) could
// delete it too. And if someone else does attempt to delete it, we want to
// make sure we delete ourselves as well.
// Our lifetime management is a little complicated. On errors that don't
// originate with the ReadClient we attempt to delete ourselves (and hence
// the ReadClient), but asynchronously, because the ReadClient API doesn't
// allow sync deletion under callbacks other than OnDone. While that's
// pending, something else (e.g. an error it runs into) could end up calling
// OnDone on us. And generally if OnDone is called we want to delete
// ourselves as well.
//
// To handle this, enforce the following rules:
//
Expand Down Expand Up @@ -1604,7 +1610,7 @@ - (instancetype)initWithPath:(const ConcreteEventPath &)path
{
// If OnError is called after OnReportBegin, we should report the collected data
ReportData();
ReportError([MTRError errorForCHIPErrorCode:aError]);
ReportError(aError, /* aCancelSubscription = */ false);
}

void SubscriptionCallback::OnDone(ReadClient *)
Expand Down Expand Up @@ -1647,14 +1653,11 @@ - (instancetype)initWithPath:(const ConcreteEventPath &)path
}
}

void SubscriptionCallback::ReportError(CHIP_ERROR err) { ReportError([MTRError errorForCHIPErrorCode:err]); }

void SubscriptionCallback::ReportError(const StatusIB & status) { ReportError([MTRError errorForIMStatus:status]); }

void SubscriptionCallback::ReportError(NSError * _Nullable err)
void SubscriptionCallback::ReportError(CHIP_ERROR aError, bool aCancelSubscription)
{
auto * err = [MTRError errorForCHIPErrorCode:aError];
if (!err) {
// Very strange... Someone tried to create a MTRError for a success status?
// Very strange... Someone tried to report a success status as an error?
return;
}

Expand All @@ -1675,15 +1678,22 @@ - (instancetype)initWithPath:(const ConcreteEventPath &)path
if (onDoneHandler) {
onDoneHandler();
}
});

// Deletion of our ReadClient (and hence of ourselves, since the
// ReadClient has a pointer to us) needs to happen on the Matter work
// queue.
if (aCancelSubscription) {
// We can't synchronously delete ourselves, because we're inside one of
// the ReadClient callbacks and we need to outlive the callback's
// execution. Queue an async deletion on the Matter queue (where we are
// running already).
//
// If we now get OnDone, we will ignore that, since we have the deletion
// posted already, but that's OK even during shutdown: since we are
// queueing the deletion now, it will be processed before the Matter queue
// gets paused, which is fairly early in the shutdown process.
mHaveQueuedDeletion = true;
dispatch_async(DeviceLayer::PlatformMgrImpl().GetWorkQueue(), ^{
delete myself;
});
});

mHaveQueuedDeletion = true;
}
}
} // anonymous namespace
50 changes: 30 additions & 20 deletions src/darwin/Framework/CHIP/MTRDevice.mm
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,13 @@ - (id)strongObject
CHIP_ERROR OnResubscriptionNeeded(ReadClient * apReadClient, CHIP_ERROR aTerminationCause) override;

void ReportData();
void ReportError(CHIP_ERROR err);
void ReportError(const StatusIB & status);
void ReportError(NSError * _Nullable err);

// Report an error, which may be due to issues in our own internal state or
// due to the OnError callback happening.
//
// aCancelSubscription should be false for the OnError case, since it will
// be immediately followed by OnDone and we want to do the deletion there.
void ReportError(CHIP_ERROR aError, bool aCancelSubscription = true);

private:
dispatch_queue_t mQueue;
Expand All @@ -168,11 +172,13 @@ - (id)strongObject
NSMutableArray * _Nullable mAttributeReports = nil;
NSMutableArray * _Nullable mEventReports = nil;

// Our lifetime management is a little complicated. On error we
// attempt to delete the ReadClient, but asynchronously. While
// that's pending, someone else (e.g. an error it runs into) could
// delete it too. And if someone else does attempt to delete it, we want to
// make sure we delete ourselves as well.
// Our lifetime management is a little complicated. On errors that don't
// originate with the ReadClient we attempt to delete ourselves (and hence
// the ReadClient), but asynchronously, because the ReadClient API doesn't
// allow sync deletion under callbacks other than OnDone. While that's
// pending, something else (e.g. an error it runs into) could end up calling
// OnDone on us. And generally if OnDone is called we want to delete
// ourselves as well.
//
// To handle this, enforce the following rules:
//
Expand Down Expand Up @@ -835,7 +841,7 @@ - (void)setExpectedValues:(NSArray<NSDictionary<NSString *, id> *> *)values expe
{
// If OnError is called after OnReportBegin, we should report the collected data
ReportData();
ReportError([MTRError errorForCHIPErrorCode:aError]);
ReportError(aError, /* aCancelSubscription = */ false);
}

void SubscriptionCallback::OnDone(ReadClient *)
Expand Down Expand Up @@ -883,12 +889,9 @@ - (void)setExpectedValues:(NSArray<NSDictionary<NSString *, id> *> *)values expe
return apReadClient->DefaultResubscribePolicy(aTerminationCause);
}

void SubscriptionCallback::ReportError(CHIP_ERROR err) { ReportError([MTRError errorForCHIPErrorCode:err]); }

void SubscriptionCallback::ReportError(const StatusIB & status) { ReportError([MTRError errorForIMStatus:status]); }

void SubscriptionCallback::ReportError(NSError * _Nullable err)
void SubscriptionCallback::ReportError(CHIP_ERROR aError, bool aCancelSubscription)
{
auto * err = [MTRError errorForCHIPErrorCode:aError];
if (!err) {
// Very strange... Someone tried to create a MTRError for a success status?
return;
Expand All @@ -911,15 +914,22 @@ - (void)setExpectedValues:(NSArray<NSDictionary<NSString *, id> *> *)values expe
if (onDoneHandler) {
onDoneHandler();
}
});

// Deletion of our ReadClient (and hence of ourselves, since the
// ReadClient has a pointer to us) needs to happen on the Matter work
// queue.
if (aCancelSubscription) {
// We can't synchronously delete ourselves, because we're inside one of
// the ReadClient callbacks and we need to outlive the callback's
// execution. Queue an async deletion on the Matter queue (where we are
// running already).
//
// If we now get OnDone, we will ignore that, since we have the deletion
// posted already, but that's OK even during shutdown: since we are
// queueing the deletion now, it will be processed before the Matter queue
// gets paused, which is fairly early in the shutdown process.
mHaveQueuedDeletion = true;
dispatch_async(DeviceLayer::PlatformMgrImpl().GetWorkQueue(), ^{
delete myself;
});
});

mHaveQueuedDeletion = true;
}
}
} // anonymous namespace

0 comments on commit e3bb221

Please sign in to comment.