From e3bb221032d8711659b71826e0a391a837950b3d Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Wed, 31 Aug 2022 17:06:58 -0400 Subject: [PATCH] Fix lifetime of Darwin SubscriptionCallback to avoid shutdown crashes. The basic issue we could run into is that the Matter stack would shut down while our async block was still running on our client queue, and by the time the "delete this object" block was queued on the Matter queue that queue would be paused. Then if the stack was restarted the queue would be unpaused, and the deletion of the ReadClient would happen early in stack startup, when things were not in a good state yet. The fix is to make sure we queue the async deletion without going through the client queue first, and avoid doing the async bits altogether when we can (when the subscription itself errors out). Fixes https://github.com/project-chip/connectedhomeip/issues/22320 --- src/darwin/Framework/CHIP/MTRBaseDevice.mm | 52 +++++++++++++--------- src/darwin/Framework/CHIP/MTRDevice.mm | 50 ++++++++++++--------- 2 files changed, 61 insertions(+), 41 deletions(-) diff --git a/src/darwin/Framework/CHIP/MTRBaseDevice.mm b/src/darwin/Framework/CHIP/MTRBaseDevice.mm index 62b699dffcbdfa..a2e2cfad38b745 100644 --- a/src/darwin/Framework/CHIP/MTRBaseDevice.mm +++ b/src/darwin/Framework/CHIP/MTRBaseDevice.mm @@ -331,9 +331,13 @@ - (void)invalidateCASESession void OnSubscriptionEstablished(SubscriptionId aSubscriptionId) override; void ReportData(); - void ReportError(CHIP_ERROR err); - void ReportError(const StatusIB & status); - void ReportError(NSError * _Nullable err); + + // Report an error, which may be due to issues in our own internal state or + // due to the OnError callback happening. + // + // aCancelSubscription should be false for the OnError case, since it will + // be immediately followed by OnDone and we want to do the deletion there. + void ReportError(CHIP_ERROR aError, bool aCancelSubscription = true); private: dispatch_queue_t mQueue; @@ -347,11 +351,13 @@ - (void)invalidateCASESession NSMutableArray * _Nullable mAttributeReports = nil; NSMutableArray * _Nullable mEventReports = nil; - // Our lifetime management is a little complicated. On error we - // attempt to delete the ReadClient, but asynchronously. While - // that's pending, someone else (e.g. an error it runs into) could - // delete it too. And if someone else does attempt to delete it, we want to - // make sure we delete ourselves as well. + // Our lifetime management is a little complicated. On errors that don't + // originate with the ReadClient we attempt to delete ourselves (and hence + // the ReadClient), but asynchronously, because the ReadClient API doesn't + // allow sync deletion under callbacks other than OnDone. While that's + // pending, something else (e.g. an error it runs into) could end up calling + // OnDone on us. And generally if OnDone is called we want to delete + // ourselves as well. // // To handle this, enforce the following rules: // @@ -1604,7 +1610,7 @@ - (instancetype)initWithPath:(const ConcreteEventPath &)path { // If OnError is called after OnReportBegin, we should report the collected data ReportData(); - ReportError([MTRError errorForCHIPErrorCode:aError]); + ReportError(aError, /* aCancelSubscription = */ false); } void SubscriptionCallback::OnDone(ReadClient *) @@ -1647,14 +1653,11 @@ - (instancetype)initWithPath:(const ConcreteEventPath &)path } } -void SubscriptionCallback::ReportError(CHIP_ERROR err) { ReportError([MTRError errorForCHIPErrorCode:err]); } - -void SubscriptionCallback::ReportError(const StatusIB & status) { ReportError([MTRError errorForIMStatus:status]); } - -void SubscriptionCallback::ReportError(NSError * _Nullable err) +void SubscriptionCallback::ReportError(CHIP_ERROR aError, bool aCancelSubscription) { + auto * err = [MTRError errorForCHIPErrorCode:aError]; if (!err) { - // Very strange... Someone tried to create a MTRError for a success status? + // Very strange... Someone tried to report a success status as an error? return; } @@ -1675,15 +1678,22 @@ - (instancetype)initWithPath:(const ConcreteEventPath &)path if (onDoneHandler) { onDoneHandler(); } + }); - // Deletion of our ReadClient (and hence of ourselves, since the - // ReadClient has a pointer to us) needs to happen on the Matter work - // queue. + if (aCancelSubscription) { + // We can't synchronously delete ourselves, because we're inside one of + // the ReadClient callbacks and we need to outlive the callback's + // execution. Queue an async deletion on the Matter queue (where we are + // running already). + // + // If we now get OnDone, we will ignore that, since we have the deletion + // posted already, but that's OK even during shutdown: since we are + // queueing the deletion now, it will be processed before the Matter queue + // gets paused, which is fairly early in the shutdown process. + mHaveQueuedDeletion = true; dispatch_async(DeviceLayer::PlatformMgrImpl().GetWorkQueue(), ^{ delete myself; }); - }); - - mHaveQueuedDeletion = true; + } } } // anonymous namespace diff --git a/src/darwin/Framework/CHIP/MTRDevice.mm b/src/darwin/Framework/CHIP/MTRDevice.mm index 937734b497f836..bf7565bf3ac0d3 100644 --- a/src/darwin/Framework/CHIP/MTRDevice.mm +++ b/src/darwin/Framework/CHIP/MTRDevice.mm @@ -151,9 +151,13 @@ - (id)strongObject CHIP_ERROR OnResubscriptionNeeded(ReadClient * apReadClient, CHIP_ERROR aTerminationCause) override; void ReportData(); - void ReportError(CHIP_ERROR err); - void ReportError(const StatusIB & status); - void ReportError(NSError * _Nullable err); + + // Report an error, which may be due to issues in our own internal state or + // due to the OnError callback happening. + // + // aCancelSubscription should be false for the OnError case, since it will + // be immediately followed by OnDone and we want to do the deletion there. + void ReportError(CHIP_ERROR aError, bool aCancelSubscription = true); private: dispatch_queue_t mQueue; @@ -168,11 +172,13 @@ - (id)strongObject NSMutableArray * _Nullable mAttributeReports = nil; NSMutableArray * _Nullable mEventReports = nil; - // Our lifetime management is a little complicated. On error we - // attempt to delete the ReadClient, but asynchronously. While - // that's pending, someone else (e.g. an error it runs into) could - // delete it too. And if someone else does attempt to delete it, we want to - // make sure we delete ourselves as well. + // Our lifetime management is a little complicated. On errors that don't + // originate with the ReadClient we attempt to delete ourselves (and hence + // the ReadClient), but asynchronously, because the ReadClient API doesn't + // allow sync deletion under callbacks other than OnDone. While that's + // pending, something else (e.g. an error it runs into) could end up calling + // OnDone on us. And generally if OnDone is called we want to delete + // ourselves as well. // // To handle this, enforce the following rules: // @@ -835,7 +841,7 @@ - (void)setExpectedValues:(NSArray *> *)values expe { // If OnError is called after OnReportBegin, we should report the collected data ReportData(); - ReportError([MTRError errorForCHIPErrorCode:aError]); + ReportError(aError, /* aCancelSubscription = */ false); } void SubscriptionCallback::OnDone(ReadClient *) @@ -883,12 +889,9 @@ - (void)setExpectedValues:(NSArray *> *)values expe return apReadClient->DefaultResubscribePolicy(aTerminationCause); } -void SubscriptionCallback::ReportError(CHIP_ERROR err) { ReportError([MTRError errorForCHIPErrorCode:err]); } - -void SubscriptionCallback::ReportError(const StatusIB & status) { ReportError([MTRError errorForIMStatus:status]); } - -void SubscriptionCallback::ReportError(NSError * _Nullable err) +void SubscriptionCallback::ReportError(CHIP_ERROR aError, bool aCancelSubscription) { + auto * err = [MTRError errorForCHIPErrorCode:aError]; if (!err) { // Very strange... Someone tried to create a MTRError for a success status? return; @@ -911,15 +914,22 @@ - (void)setExpectedValues:(NSArray *> *)values expe if (onDoneHandler) { onDoneHandler(); } + }); - // Deletion of our ReadClient (and hence of ourselves, since the - // ReadClient has a pointer to us) needs to happen on the Matter work - // queue. + if (aCancelSubscription) { + // We can't synchronously delete ourselves, because we're inside one of + // the ReadClient callbacks and we need to outlive the callback's + // execution. Queue an async deletion on the Matter queue (where we are + // running already). + // + // If we now get OnDone, we will ignore that, since we have the deletion + // posted already, but that's OK even during shutdown: since we are + // queueing the deletion now, it will be processed before the Matter queue + // gets paused, which is fairly early in the shutdown process. + mHaveQueuedDeletion = true; dispatch_async(DeviceLayer::PlatformMgrImpl().GetWorkQueue(), ^{ delete myself; }); - }); - - mHaveQueuedDeletion = true; + } } } // anonymous namespace