From e8b4fe8e8bd3a9abb853554cc050351f12f55447 Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Thu, 25 May 2023 01:29:38 -0400 Subject: [PATCH] Fix ThreadSanitizer failure in controller factory. The failure looks like this: WARNING: ThreadSanitizer: race on NSMutableArray (pid=11619) Read-only access of NSMutableArray at 0x7b0c0005f5b0 by thread T3: #0 -[__NSArrayM countByEnumeratingWithState:objects:count:] :2 (CoreFoundation:x86_64+0x4a338) #1 -[MTRDeviceControllerFactory(InternalMethods) operationalInstanceAdded:] MTRDeviceControllerFactory.mm:855 (Matter:x86_64+0x1fd2a) #2 MTROperationalBrowser::OnBrowse(_DNSServiceRef_t*, unsigned int, unsigned int, int, char const*, char const*, char const*, void*) MTROperationalBrowser.mm:100 (Matter:x86_64+0x20ee63c) #3 handle_browse_response :2 (libsystem_dnssd.dylib:x86_64+0x3733) #4 _dispatch_client_callout :2 (libdispatch.dylib:x86_64+0x3316) Previous modifying access of NSMutableArray at 0x7b0c0005f5b0 by main thread: #0 -[__NSArrayM addObject:] :2 (CoreFoundation:x86_64+0x2457a) #1 -[MTRDeviceControllerFactory createController] MTRDeviceControllerFactory.mm:719 (Matter:x86_64+0x1cee3) #2 -[MTRDeviceControllerFactory createControllerOnExistingFabric:error:] MTRDeviceControllerFactory.mm:534 (Matter:x86_64+0x19792) The basic problem is that we are in the middle of adding an object to _controllers on the API consumer thread when on the Matter thread we get our browse notification. The changes here don't aim to lock around all access to _controllers, but just to make sure that our mutations of it can't race with the access on the Matter thread. More coarse locking would need to be done very carefully, given the amount of dispath_sync to the Matter thread we have going on. --- .../CHIP/MTRDeviceControllerFactory.mm | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm b/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm index 93f7abeb91a979..8853f8d58b4923 100644 --- a/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm +++ b/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm @@ -34,6 +34,8 @@ #import "MTRPersistentStorageDelegateBridge.h" #import "NSDataSpanConversion.h" +#import + #include #include #include @@ -88,6 +90,20 @@ @interface MTRDeviceControllerFactory () @property () chip::Credentials::DeviceAttestationVerifier * deviceAttestationVerifier; @property (readonly) BOOL advertiseOperational; @property (nonatomic, readonly) Credentials::IgnoreCertificateValidityPeriodPolicy * certificateValidityPolicy; +// Lock used to serialize access to the "controllers" array, since it needs to +// be touched from both whatever queue is starting controllers and from the +// Matter queue. The way this lock is used assumes that: +// +// 1) The only mutating accesses to the controllers array happen from outside +// the Matter queue (which is a good assumption, because those functions do +// sync dispatch to the Matter queue). +// 2) It's our API consumer's responsibility to serialize access to us from +// outside. +// +// This means that we only take the lock around mutations of the array and +// accesses to the array that are from code running on the Matter queue. + +@property (nonatomic, readonly) os_unfair_lock controllersLock; - (BOOL)findMatchingFabric:(FabricTable &)fabricTable params:(MTRDeviceControllerStartupParams *)params @@ -123,6 +139,7 @@ - (instancetype)init _running = NO; _chipWorkQueue = DeviceLayer::PlatformMgrImpl().GetWorkQueue(); _controllerFactory = &DeviceControllerFactory::GetInstance(); + _controllersLock = OS_UNFAIR_LOCK_INIT; _sessionKeystore = new chip::Crypto::RawKeySessionKeystore(); if ([self checkForInitError:(_sessionKeystore != nullptr) logMsg:kErrorSessionKeystoreInit]) { @@ -716,7 +733,9 @@ - (MTRDeviceController * _Nullable)createController // Add the controller to _controllers now, so if we fail partway through its // startup we will still do the right cleanups. + os_unfair_lock_lock(&_controllersLock); [_controllers addObject:controller]; + os_unfair_lock_unlock(&_controllersLock); return controller; } @@ -808,7 +827,9 @@ - (void)controllerShuttingDown:(MTRDeviceController *)controller }); } + os_unfair_lock_lock(&_controllersLock); [_controllers removeObject:controller]; + os_unfair_lock_unlock(&_controllersLock); if ([_controllers count] == 0) { dispatch_sync(_chipWorkQueue, ^{ @@ -852,7 +873,13 @@ - (nullable MTRDeviceController *)runningControllerForFabricIndex:(chip::FabricI - (void)operationalInstanceAdded:(chip::PeerId &)operationalID { - for (MTRDeviceController * controller in _controllers) { + assertChipStackLockedByCurrentThread(); + + os_unfair_lock_lock(&_controllersLock); + NSArray * controllersCopy = [_controllers copy]; + os_unfair_lock_unlock(&_controllersLock); + + for (MTRDeviceController * controller in controllersCopy) { auto * compressedFabricId = controller.compressedFabricID; if (compressedFabricId != nil && compressedFabricId.unsignedLongLongValue == operationalID.GetCompressedFabricId()) { ChipLogProgress(Controller, "Notifying controller at fabric index %u about new operational node 0x" ChipLogFormatX64,