From 1016136dfd5105baf4b09bf8eddada1e4af2688f Mon Sep 17 00:00:00 2001 From: Jeff Tung <100387939+jtung-apple@users.noreply.github.com> Date: Thu, 8 Dec 2022 19:04:41 -0800 Subject: [PATCH] [Darwin] MTRDevice should retry subscription when getSessionForNode fails (#23981) * [Darwin] MTRDevice should retry subscription when getSessionForNode fails * Address PR conversations * Address PR comment * Address static_cast comment properly * Fix integer signed issue and MTRDeviceController cleanup crash --- src/darwin/Framework/CHIP/MTRDevice.mm | 323 +++++++++++------- .../Framework/CHIP/MTRDeviceController.mm | 6 + .../Framework/CHIP/MTRDevice_Internal.h | 3 + 3 files changed, 210 insertions(+), 122 deletions(-) diff --git a/src/darwin/Framework/CHIP/MTRDevice.mm b/src/darwin/Framework/CHIP/MTRDevice.mm index 68da0859fe0659..38a52bba989719 100644 --- a/src/darwin/Framework/CHIP/MTRDevice.mm +++ b/src/darwin/Framework/CHIP/MTRDevice.mm @@ -130,12 +130,18 @@ - (id)strongObject #pragma mark - MTRDevice @interface MTRDevice () @property (nonatomic, readonly) os_unfair_lock lock; // protects the caches and device state +@property (nonatomic) chip::FabricIndex fabricIndex; @property (nonatomic) MTRWeakReference> * weakDelegate; @property (nonatomic) dispatch_queue_t delegateQueue; @property (nonatomic) NSArray *> * unreportedEvents; @property (nonatomic) BOOL subscriptionActive; +#define MTRDEVICE_SUBSCRIPTION_ATTEMPT_MIN_WAIT_SECONDS (1) +#define MTRDEVICE_SUBSCRIPTION_ATTEMPT_MAX_WAIT_SECONDS (3600) +@property (nonatomic) uint32_t lastSubscriptionAttemptWait; +@property (nonatomic) BOOL reattemptingSubscription; + // Read cache is attributePath => NSDictionary of value. // See MTRDeviceResponseHandler definition for value dictionary details. @property (nonatomic) NSMutableDictionary * readCache; @@ -154,16 +160,23 @@ - (instancetype)initWithNodeID:(NSNumber *)nodeID controller:(MTRDeviceControlle if (self = [super init]) { _lock = OS_UNFAIR_LOCK_INIT; _nodeID = [nodeID copy]; + _fabricIndex = controller.fabricIndex; _deviceController = controller; _queue = dispatch_queue_create("com.apple.matter.framework.device.workqueue", DISPATCH_QUEUE_SERIAL); _readCache = [NSMutableDictionary dictionary]; _expectedValueCache = [NSMutableDictionary dictionary]; _asyncCallbackWorkQueue = [[MTRAsyncCallbackWorkQueue alloc] initWithContext:self queue:_queue]; _state = MTRDeviceStateUnknown; + MTR_LOG_INFO("%@ init with hex nodeID 0x%016llX", self, _nodeID.unsignedLongLongValue); } return self; } +- (NSString *)description +{ + return [NSString stringWithFormat:@"[fabric: %u, nodeID: %@]", self, _fabricIndex, _nodeID]; +} + + (instancetype)deviceWithNodeID:(NSNumber *)nodeID controller:(MTRDeviceController *)controller { return [controller deviceForNodeID:nodeID]; @@ -177,11 +190,22 @@ + (instancetype)deviceWithNodeID:(NSNumber *)nodeID controller:(MTRDeviceControl - (void)setDelegate:(id)delegate queue:(dispatch_queue_t)queue { + MTR_LOG_INFO("%@ setDelegate %@", self, delegate); os_unfair_lock_lock(&self->_lock); _weakDelegate = [MTRWeakReference weakReferenceWithObject:delegate]; _delegateQueue = queue; - [self setupSubscription]; + [self _setupSubscription]; + + os_unfair_lock_unlock(&self->_lock); +} + +- (void)invalidate +{ + MTR_LOG_INFO("%@ invalidate", self); + os_unfair_lock_lock(&self->_lock); + + _weakDelegate = nil; os_unfair_lock_unlock(&self->_lock); } @@ -190,6 +214,9 @@ - (void)_handleSubscriptionEstablished { os_unfair_lock_lock(&self->_lock); + // reset subscription attempt wait time when subscription succeeds + _lastSubscriptionAttemptWait = 0; + _state = MTRDeviceStateReachable; id delegate = _weakDelegate.strongObject; if (delegate) { @@ -206,6 +233,7 @@ - (void)_handleSubscriptionError:(NSError *)error os_unfair_lock_lock(&self->_lock); _subscriptionActive = NO; + _unreportedEvents = nil; id delegate = _weakDelegate.strongObject; if (delegate) { @@ -235,7 +263,43 @@ - (void)_handleResubscriptionNeeded - (void)_handleSubscriptionReset { - // TODO: logic to reattempt subscription with exponential back off + os_unfair_lock_lock(&self->_lock); + // if there is no delegate then also do not retry + id delegate = _weakDelegate.strongObject; + if (!delegate) { + MTR_LOG_INFO("%@ no delegate - do not reattempt subscription", self); + os_unfair_lock_unlock(&self->_lock); + return; + } + + // don't schedule multiple retries + if (self.reattemptingSubscription) { + MTR_LOG_INFO("%@ already reattempting subscription", self); + os_unfair_lock_unlock(&self->_lock); + return; + } + + self.reattemptingSubscription = YES; + + if (_lastSubscriptionAttemptWait < MTRDEVICE_SUBSCRIPTION_ATTEMPT_MIN_WAIT_SECONDS) { + _lastSubscriptionAttemptWait = MTRDEVICE_SUBSCRIPTION_ATTEMPT_MIN_WAIT_SECONDS; + } else { + _lastSubscriptionAttemptWait *= 2; + if (_lastSubscriptionAttemptWait > MTRDEVICE_SUBSCRIPTION_ATTEMPT_MAX_WAIT_SECONDS) { + _lastSubscriptionAttemptWait = MTRDEVICE_SUBSCRIPTION_ATTEMPT_MAX_WAIT_SECONDS; + } + } + + MTR_LOG_INFO("%@ scheduling to reattempt subscription in %u seconds", self, _lastSubscriptionAttemptWait); + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, (int64_t)(_lastSubscriptionAttemptWait * NSEC_PER_SEC)), self.queue, ^{ + os_unfair_lock_lock(&self->_lock); + MTR_LOG_INFO("%@ reattempting subscription", self); + self.reattemptingSubscription = NO; + [self _setupSubscription]; + os_unfair_lock_unlock(&self->_lock); + }); + + os_unfair_lock_unlock(&self->_lock); } // assume lock is held @@ -256,6 +320,7 @@ - (void)_handleAttributeReport:(NSArray *> *)attrib { os_unfair_lock_lock(&self->_lock); + // _getAttributesToReportWithReportedValues will log attribute paths reported [self _reportAttributes:[self _getAttributesToReportWithReportedValues:attributeReport]]; os_unfair_lock_unlock(&self->_lock); @@ -284,8 +349,11 @@ - (void)_handleEventReport:(NSArray *> *)eventRepor os_unfair_lock_unlock(&self->_lock); } -- (void)setupSubscription +// assume lock is held +- (void)_setupSubscription { + os_unfair_lock_assert_owner(&self->_lock); + // for now just subscribe once if (_subscriptionActive) { return; @@ -293,113 +361,113 @@ - (void)setupSubscription _subscriptionActive = YES; - [_deviceController - getSessionForNode:_nodeID.unsignedLongLongValue - completion:^(chip::Messaging::ExchangeManager * _Nullable exchangeManager, - const chip::Optional & session, NSError * _Nullable error) { - if (error != nil) { - MTR_LOG_INFO("MTRDevice getSessionForNode error %@", error); - dispatch_async(self.queue, ^{ - [self _handleSubscriptionError:error]; - }); - return; - } - - // Wildcard endpoint, cluster, attribute, event. - auto attributePath = std::make_unique(); - auto eventPath = std::make_unique(); - // We want to get event reports at the minInterval, not the maxInterval. - eventPath->mIsUrgentEvent = true; - ReadPrepareParams readParams(session.Value()); - - readParams.mMinIntervalFloorSeconds = 0; - // Select a max interval based on the device's claimed idle sleep interval. - auto idleSleepInterval = std::chrono::duration_cast( - session.Value()->GetRemoteMRPConfig().mIdleRetransTimeout); - if (idleSleepInterval.count() < MTR_DEVICE_SUBSCRIPTION_MAX_INTERVAL_MIN) { - idleSleepInterval = System::Clock::Seconds32(MTR_DEVICE_SUBSCRIPTION_MAX_INTERVAL_MIN); - } - if (idleSleepInterval.count() > MTR_DEVICE_SUBSCRIPTION_MAX_INTERVAL_MAX) { - idleSleepInterval = System::Clock::Seconds32(MTR_DEVICE_SUBSCRIPTION_MAX_INTERVAL_MAX); - } - readParams.mMaxIntervalCeilingSeconds = static_cast(idleSleepInterval.count()); - - readParams.mpAttributePathParamsList = attributePath.get(); - readParams.mAttributePathParamsListSize = 1; - readParams.mpEventPathParamsList = eventPath.get(); - readParams.mEventPathParamsListSize = 1; - readParams.mKeepSubscriptions = true; - readParams.mIsFabricFiltered = false; - attributePath.release(); - eventPath.release(); - - std::unique_ptr callback; - std::unique_ptr readClient; - std::unique_ptr clusterStateCache; - callback = std::make_unique( - ^(NSArray * value) { - MTR_LOG_INFO("MTRDevice got attribute report %@", value); - dispatch_async(self.queue, ^{ - // OnAttributeData (after OnReportEnd) - [self _handleAttributeReport:value]; - }); - }, - ^(NSArray * value) { - MTR_LOG_INFO("MTRDevice got event report %@", value); - dispatch_async(self.queue, ^{ - // OnEventReport (after OnReportEnd) - [self _handleEventReport:value]; - }); - }, - ^(NSError * error) { - MTR_LOG_INFO("MTRDevice got subscription error %@", error); - dispatch_async(self.queue, ^{ - // OnError - [self _handleSubscriptionError:error]; - }); - }, - ^(NSError * error, NSNumber * resubscriptionDelay) { - MTR_LOG_INFO("MTRDevice got resubscription error %@ delay %@", error, resubscriptionDelay); - dispatch_async(self.queue, ^{ - // OnResubscriptionNeeded - [self _handleResubscriptionNeeded]; - }); - }, - ^(void) { - MTR_LOG_INFO("MTRDevice got subscription established"); - dispatch_async(self.queue, ^{ - // OnSubscriptionEstablished - [self _handleSubscriptionEstablished]; - }); - }, - ^(void) { - MTR_LOG_INFO("MTRDevice got subscription done"); - dispatch_async(self.queue, ^{ - // OnDone - [self _handleSubscriptionReset]; - }); - }); - readClient = std::make_unique(InteractionModelEngine::GetInstance(), exchangeManager, - callback->GetBufferedCallback(), ReadClient::InteractionType::Subscribe); - - // SendAutoResubscribeRequest cleans up the params, even on failure. - CHIP_ERROR err = readClient->SendAutoResubscribeRequest(std::move(readParams)); - - if (err != CHIP_NO_ERROR) { - NSError * error = [MTRError errorForCHIPErrorCode:err]; - MTR_LOG_INFO("MTRDevice SendAutoResubscribeRequest error %@", error); - dispatch_async(self.queue, ^{ - [self _handleSubscriptionError:error]; - }); - - return; - } - - // Callback and ReadClient will be deleted when OnDone is called or an error is - // encountered. - callback->AdoptReadClient(std::move(readClient)); - callback.release(); - }]; + [_deviceController getSessionForNode:_nodeID.unsignedLongLongValue + completion:^(chip::Messaging::ExchangeManager * _Nullable exchangeManager, + const chip::Optional & session, NSError * _Nullable error) { + if (error != nil) { + MTR_LOG_INFO("%@ getSessionForNode error %@", self, error); + dispatch_async(self.queue, ^{ + [self _handleSubscriptionError:error]; + [self _handleSubscriptionReset]; + }); + return; + } + + // Wildcard endpoint, cluster, attribute, event. + auto attributePath = std::make_unique(); + auto eventPath = std::make_unique(); + // We want to get event reports at the minInterval, not the maxInterval. + eventPath->mIsUrgentEvent = true; + ReadPrepareParams readParams(session.Value()); + + readParams.mMinIntervalFloorSeconds = 0; + // Select a max interval based on the device's claimed idle sleep interval. + auto idleSleepInterval = std::chrono::duration_cast( + session.Value()->GetRemoteMRPConfig().mIdleRetransTimeout); + if (idleSleepInterval.count() < MTR_DEVICE_SUBSCRIPTION_MAX_INTERVAL_MIN) { + idleSleepInterval = System::Clock::Seconds32(MTR_DEVICE_SUBSCRIPTION_MAX_INTERVAL_MIN); + } + if (idleSleepInterval.count() > MTR_DEVICE_SUBSCRIPTION_MAX_INTERVAL_MAX) { + idleSleepInterval = System::Clock::Seconds32(MTR_DEVICE_SUBSCRIPTION_MAX_INTERVAL_MAX); + } + readParams.mMaxIntervalCeilingSeconds = static_cast(idleSleepInterval.count()); + + readParams.mpAttributePathParamsList = attributePath.get(); + readParams.mAttributePathParamsListSize = 1; + readParams.mpEventPathParamsList = eventPath.get(); + readParams.mEventPathParamsListSize = 1; + readParams.mKeepSubscriptions = true; + readParams.mIsFabricFiltered = false; + attributePath.release(); + eventPath.release(); + + std::unique_ptr callback; + std::unique_ptr readClient; + std::unique_ptr clusterStateCache; + callback = std::make_unique( + ^(NSArray * value) { + MTR_LOG_INFO("%@ got attribute report %@", self, value); + dispatch_async(self.queue, ^{ + // OnAttributeData (after OnReportEnd) + [self _handleAttributeReport:value]; + }); + }, + ^(NSArray * value) { + MTR_LOG_INFO("%@ got event report %@", self, value); + dispatch_async(self.queue, ^{ + // OnEventReport (after OnReportEnd) + [self _handleEventReport:value]; + }); + }, + ^(NSError * error) { + MTR_LOG_INFO("%@ got subscription error %@", self, error); + dispatch_async(self.queue, ^{ + // OnError + [self _handleSubscriptionError:error]; + }); + }, + ^(NSError * error, NSNumber * resubscriptionDelay) { + MTR_LOG_INFO("%@ got resubscription error %@ delay %@", self, error, resubscriptionDelay); + dispatch_async(self.queue, ^{ + // OnResubscriptionNeeded + [self _handleResubscriptionNeeded]; + }); + }, + ^(void) { + MTR_LOG_INFO("%@ got subscription established", self); + dispatch_async(self.queue, ^{ + // OnSubscriptionEstablished + [self _handleSubscriptionEstablished]; + }); + }, + ^(void) { + MTR_LOG_INFO("%@ got subscription done", self); + dispatch_async(self.queue, ^{ + // OnDone + [self _handleSubscriptionReset]; + }); + }); + readClient = std::make_unique(InteractionModelEngine::GetInstance(), exchangeManager, + callback->GetBufferedCallback(), ReadClient::InteractionType::Subscribe); + + // SendAutoResubscribeRequest cleans up the params, even on failure. + CHIP_ERROR err = readClient->SendAutoResubscribeRequest(std::move(readParams)); + + if (err != CHIP_NO_ERROR) { + NSError * error = [MTRError errorForCHIPErrorCode:err]; + MTR_LOG_INFO("%@ SendAutoResubscribeRequest error %@", self, error); + dispatch_async(self.queue, ^{ + [self _handleSubscriptionError:error]; + }); + + return; + } + + // Callback and ReadClient will be deleted when OnDone is called or an error is + // encountered. + callback->AdoptReadClient(std::move(readClient)); + callback.release(); + }]; } #pragma mark Device Interactions @@ -408,8 +476,7 @@ - (void)setupSubscription attributeID:(NSNumber *)attributeID params:(MTRReadParams *)params { - NSString * logPrefix = [NSString - stringWithFormat:@"MTRDevice read %u %@ %@ %@", _deviceController.fabricIndex, endpointID, clusterID, attributeID]; + NSString * logPrefix = [NSString stringWithFormat:@"%@ read %@ %@ %@", self, endpointID, clusterID, attributeID]; // Create work item, set ready handler to perform task, then enqueue the work MTRAsyncCallbackQueueWorkItem * workItem = [[MTRAsyncCallbackQueueWorkItem alloc] initWithQueue:self.queue]; MTRAsyncCallbackReadyHandler readyHandler = ^(MTRDevice * device, NSUInteger retryCount) { @@ -460,8 +527,7 @@ - (void)writeAttributeWithEndpointID:(NSNumber *)endpointID expectedValueInterval:(NSNumber *)expectedValueInterval timedWriteTimeout:(NSNumber * _Nullable)timeout { - NSString * logPrefix = [NSString - stringWithFormat:@"MTRDevice write %u %@ %@ %@", _deviceController.fabricIndex, endpointID, clusterID, attributeID]; + NSString * logPrefix = [NSString stringWithFormat:@"%@ write %@ %@ %@", self, endpointID, clusterID, attributeID]; if (timeout) { timeout = MTRClampedNumber(timeout, @(1), @(UINT16_MAX)); } @@ -505,8 +571,7 @@ - (void)invokeCommandWithEndpointID:(NSNumber *)endpointID queue:(dispatch_queue_t)queue completion:(MTRDeviceResponseHandler)completion { - NSString * logPrefix = [NSString - stringWithFormat:@"MTRDevice command %u %@ %@ %@", _deviceController.fabricIndex, endpointID, clusterID, commandID]; + NSString * logPrefix = [NSString stringWithFormat:@"%@ command %@ %@ %@", self, endpointID, clusterID, commandID]; if (timeout) { timeout = MTRClampedNumber(timeout, @(1), @(UINT16_MAX)); } @@ -586,6 +651,7 @@ - (void)_checkExpiredExpectedValues // remove from expected value cache and report attributes as needed NSMutableArray * attributesToReport = [NSMutableArray array]; + NSMutableArray * attributePathsToReport = [NSMutableArray array]; for (MTRPair * attributeInfo in attributeInfoToRemove) { // compare with known value and mark for report if different MTRAttributePath * attributePath = attributeInfo.first; @@ -594,10 +660,14 @@ - (void)_checkExpiredExpectedValues if (cachedAttributeDataValue && ![self _attributeDataValue:attributeDataValue isEqualToDataValue:cachedAttributeDataValue]) { [attributesToReport addObject:@{ MTRAttributePathKey : attributePath, MTRDataKey : cachedAttributeDataValue }]; + [attributePathsToReport addObject:attributePath]; } _expectedValueCache[attributePath] = nil; } + + // log attribute paths + MTR_LOG_INFO("%@ report from expired expected values %@", self, attributePathsToReport); [self _reportAttributes:attributesToReport]; // Have a reasonable minimum wait time for expiration timers @@ -654,8 +724,8 @@ - (void)_performScheduledExpirationCheck return cachedAttributeValue; } else { // TODO: when not found in cache, generated default values should be used - MTR_LOG_INFO( - "_attributeValueDictionaryForAttributePath: could not find cached attribute values for attribute %@", attributePath); + MTR_LOG_INFO("%@ _attributeValueDictionaryForAttributePath: could not find cached attribute values for attribute %@", self, + attributePath); } os_unfair_lock_unlock(&self->_lock); @@ -676,6 +746,7 @@ - (NSArray *)_getAttributesToReportWithReportedValues:(NSArray_lock); NSMutableArray * attributesToReport = [NSMutableArray array]; + NSMutableArray * attributePathsToReport = [NSMutableArray array]; for (NSDictionary * attributeReponseValue in reportedAttributeValues) { MTRAttributePath * attributePath = attributeReponseValue[MTRAttributePathKey]; NSDictionary * attributeDataValue = attributeReponseValue[MTRDataKey]; @@ -692,7 +763,7 @@ - (NSArray *)_getAttributesToReportWithReportedValues:(NSArray_lock); NSMutableArray * attributesToReport = [NSMutableArray array]; + NSMutableArray * attributePathsToReport = [NSMutableArray array]; for (NSDictionary * attributeReponseValue in expectedAttributeValues) { MTRAttributePath * attributePath = attributeReponseValue[MTRAttributePathKey]; NSDictionary * attributeDataValue = attributeReponseValue[MTRDataKey]; @@ -758,9 +833,12 @@ - (NSArray *)_getAttributesToReportWithNewExpectedValues:(NSArray *> *)values expe NSDate * expirationTime = [NSDate dateWithTimeIntervalSinceNow:expectedValueInterval.doubleValue / 1000]; os_unfair_lock_lock(&self->_lock); - NSArray * attributesToReport = [self _getAttributesToReportWithNewExpectedValues:values expirationTime:expirationTime]; + // _getAttributesToReportWithNewExpectedValues will log attribute paths reported + NSArray * attributesToReport = [self _getAttributesToReportWithNewExpectedValues:values expirationTime:expirationTime]; [self _reportAttributes:attributesToReport]; [self _checkExpiredExpectedValues]; diff --git a/src/darwin/Framework/CHIP/MTRDeviceController.mm b/src/darwin/Framework/CHIP/MTRDeviceController.mm index 168a23fb32ac19..c9446ffcbbde0b 100644 --- a/src/darwin/Framework/CHIP/MTRDeviceController.mm +++ b/src/darwin/Framework/CHIP/MTRDeviceController.mm @@ -188,6 +188,11 @@ - (void)cleanup delete _deviceControllerDelegateBridge; _deviceControllerDelegateBridge = nullptr; } + + for (MTRDevice * device in [self.nodeIDToDeviceMap allValues]) { + [device invalidate]; + } + [self.nodeIDToDeviceMap removeAllObjects]; } - (BOOL)startup:(MTRDeviceControllerStartupParamsInternal *)startupParams @@ -530,6 +535,7 @@ - (void)removeDevice:(MTRDevice *)device os_unfair_lock_lock(&_deviceMapLock); MTRDevice * deviceToRemove = self.nodeIDToDeviceMap[device.nodeID]; if (deviceToRemove == device) { + [deviceToRemove invalidate]; self.nodeIDToDeviceMap[device.nodeID] = nil; } else { MTR_LOG_ERROR("Error: Cannot remove device %p with nodeID %llu", device, device.nodeID.unsignedLongLongValue); diff --git a/src/darwin/Framework/CHIP/MTRDevice_Internal.h b/src/darwin/Framework/CHIP/MTRDevice_Internal.h index f752ed5632c680..c1611acaa39e8f 100644 --- a/src/darwin/Framework/CHIP/MTRDevice_Internal.h +++ b/src/darwin/Framework/CHIP/MTRDevice_Internal.h @@ -33,6 +33,9 @@ typedef void (^MTRDevicePerformAsyncBlock)(MTRBaseDevice * baseDevice); - (void)setExpectedValues:(NSArray *> *)values expectedValueInterval:(NSNumber *)expectedValueIntervalMs; +// called by controller to clean up and shutdown +- (void)invalidate; + @property (nonatomic, readonly) MTRDeviceController * deviceController; @property (nonatomic, readonly, copy) NSNumber * nodeID; // Queue used for various internal bookkeeping work. In general endWork calls