[ECO-5095][CHA-RL*] Implement logging for RoomLifecycleManager

1. Added private property logger with thread specific dynamic context 2. Added trace, debug, error logging for private doRetry method 3. Added trace, debug, error logging for attach method
ably · Dec 11, 2024 · e686835 · e686835
1 parent 11180ae
commit e686835
Showing 1 changed file with 69 additions and 22 deletions.
diff --git a/chat-android/src/main/java/com/ably/chat/RoomLifecycleManager.kt b/chat-android/src/main/java/com/ably/chat/RoomLifecycleManager.kt
@@ -117,8 +117,12 @@ internal class RoomLifecycleManager(
     private val roomScope: CoroutineScope,
     private val statusLifecycle: DefaultRoomLifecycle,
     private val contributors: List<ContributesToRoomLifecycle>,
-    private val logger: Logger,
+    roomLogger: Logger,
 ) {
+    private val logger = roomLogger.withContext(
+        "RoomLifecycleManager",
+        dynamicContext = mapOf("scope" to { Thread.currentThread().name }),
+    )
 
     /**
      * AtomicCoroutineScope makes sure all operations are atomic and run with given priority.
@@ -183,63 +187,84 @@ internal class RoomLifecycleManager(
      * @returns Returns when the room is attached, or the room enters a failed state.
      * Spec: CHA-RL5
      */
-    @Suppress("CognitiveComplexMethod", "ThrowsCount")
+    @Suppress("CognitiveComplexMethod", "ThrowsCount", "LongMethod")
     private suspend fun doRetry(contributor: ContributesToRoomLifecycle) {
+        logger.trace("doRetry();")
+        // Handle the channel wind-down.
+        logger.debug("doRetry(); winding down channels except for feature: ${contributor.featureName}")
         // CHA-RL5a - Handle the channel wind-down for other channels
         var result = kotlin.runCatching { doChannelWindDown(contributor) }
         while (result.isFailure) {
             // CHA-RL5c - If in doing the wind down, we've entered failed state, then it's game over anyway
             if (this.statusLifecycle.status === RoomStatus.Failed) {
+                logger.error("doRetry(); channels wind down failed, RoomStatus is FAILED", result.exceptionOrNull())
                 throw result.exceptionOrNull() ?: IllegalStateException("room is in a failed state")
             }
+            logger.warn("doRetry(); channels wind down failed, retrying in $retryDurationInMs ms", result.exceptionOrNull())
             delay(retryDurationInMs)
             result = kotlin.runCatching { doChannelWindDown(contributor) }
         }
+        logger.debug("doRetry(); channels wind down successfully completed")
 
         // A helper that allows us to retry the attach operation
         val doAttachWithRetry: suspend () -> Unit = {
+            logger.trace("doRetry.doAttachWithRetry();")
             coroutineScope {
+                logger.debug("doRetry.doAttachWithRetry(); attempting to attach all feature channels")
                 statusLifecycle.setStatus(RoomStatus.Attaching)
                 val attachmentResult = doAttach()
 
                 // CHA-RL5c - If we're in failed, then we should wind down all the channels, eventually - but we're done here
                 if (attachmentResult.status === RoomStatus.Failed) {
+                    logger.error("doRetry.doAttachWithRetry(); failed to attach channels, RoomStatus is Failed", attachmentResult.exception)
                     atomicCoroutineScope.async(LifecycleOperationPrecedence.Internal.priority) {
                         runDownChannelsOnFailedAttach()
                     }
+                    logger.warn("doRetry.doAttachWithRetry(); started winding down all channels internally")
                     return@coroutineScope
                 }
 
                 // If we're in suspended, then we should wait for the channel to reattach, but wait for it to do so
                 if (attachmentResult.status === RoomStatus.Suspended) {
-                    val failedFeature = attachmentResult.failedFeature
-                        ?: throw lifeCycleException(
-                            "no failed feature in doRetry",
-                            ErrorCode.RoomLifecycleError,
-                        )
+                    logger.warn(
+                        "doRetry.doAttachWithRetry(); failed to attach channels, RoomStatus is Suspended",
+                        attachmentResult.exception,
+                    )
+                    val failedFeature = attachmentResult.failedFeature ?: throw lifeCycleException(
+                        "no failed feature in doRetry",
+                        ErrorCode.RoomLifecycleError,
+                    )
+                    logger.warn("doRetry.doAttachWithRetry(); Retrying attach due to suspended feature: ${failedFeature.featureName}")
                     // No need to catch errors, rather they should propagate to caller method
                     return@coroutineScope doRetry(failedFeature)
                 }
+
                 // We attached, huzzah!
+                logger.debug("doRetry.doAttachWithRetry(); Room ATTACH success, RoomStatus: ${statusLifecycle.status}")
             }
         }
 
         // If given suspended contributor channel has reattached, then we can retry the attach
         if (contributor.channel.state == ChannelState.attached) {
+            logger.debug("doRetry(); feature: ${contributor.featureName} reattached, retrying attach for others")
             return doAttachWithRetry()
         }
 
         // CHA-RL5d - Otherwise, wait for our suspended contributor channel to re-attach and try again
         try {
+            logger.debug("doRetry(); waiting to channel ATTACHED for feature: ${contributor.featureName}")
             listenToChannelAttachOrFailure(contributor)
+            logger.debug("doRetry(); waiting success, feature: ${contributor.featureName} ATTACHED, retrying attach for others")
             delay(retryDurationInMs) // Let other channels get into ATTACHING state
-            // Attach successful
-            return doAttachWithRetry()
         } catch (ex: AblyException) {
             // CHA-RL5c - Channel attach failed
+            logger.error("doRetry(); waiting failed for channel attach, feature: ${contributor.featureName}", ex)
+            // No need for channels wind down, since it's taken care at the start of doRetry()
             statusLifecycle.setStatus(RoomStatus.Failed, ex.errorInfo)
             throw ex
         }
+        // Attach successful, try attach for other channels
+        return doAttachWithRetry()
     }
 
     /**
@@ -292,59 +317,60 @@ internal class RoomLifecycleManager(
      */
     @Suppress("ThrowsCount")
     internal suspend fun attach() {
+        logger.trace("attach();")
         val deferredAttach = atomicCoroutineScope.async(LifecycleOperationPrecedence.AttachOrDetach.priority) { // CHA-RL1d
             if (statusLifecycle.status == RoomStatus.Attached) { // CHA-RL1a
+                logger.debug("attach(); room is already attached")
                 return@async
             }
 
             if (statusLifecycle.status == RoomStatus.Releasing) { // CHA-RL1b
-                throw lifeCycleException(
-                    "unable to attach room; room is releasing",
-                    ErrorCode.RoomIsReleasing,
-                )
+                logger.error("attach(); attach failed, room is in releasing state")
+                throw lifeCycleException("unable to attach room; room is releasing", ErrorCode.RoomIsReleasing)
             }
 
             if (statusLifecycle.status == RoomStatus.Released) { // CHA-RL1c
-                throw lifeCycleException(
-                    "unable to attach room; room is released",
-                    ErrorCode.RoomIsReleased,
-                )
+                logger.error("attach(); attach failed, room is in released state")
+                throw lifeCycleException("unable to attach room; room is released", ErrorCode.RoomIsReleased)
             }
 
             // At this point, we force the room status to be attaching
             clearAllTransientDetachTimeouts()
             operationInProgress = true
+            logger.debug("attach(); transitioned room into ATTACHING state")
             statusLifecycle.setStatus(RoomStatus.Attaching) // CHA-RL1e
 
             val attachResult = doAttach()
 
             // CHA-RL1h4 - If we're in a failed state, then we should wind down all the channels, eventually
             if (attachResult.status === RoomStatus.Failed) {
+                logger.error("attach(); failed to attach channels, RoomStatus is Failed", attachResult.exception)
                 // CHA-RL1h5 - detach all remaining channels
                 atomicCoroutineScope.async(LifecycleOperationPrecedence.Internal.priority) {
                     runDownChannelsOnFailedAttach()
                 }
+                logger.warn("attach(); started winding down all channels internally")
                 throw attachResult.exception // CHA-RL1h1
             }
 
             // CHA-RL1h1, CHA-RL1h2 - If we're in suspended, then this attach should fail, but we'll retry after a short delay async
             if (attachResult.status === RoomStatus.Suspended) {
+                logger.warn("attach(); failed to attach channels, RoomStatus is Suspended", attachResult.exception)
                 if (attachResult.failedFeature == null) {
-                    throw lifeCycleException(
-                        "no failed feature in attach",
-                        ErrorCode.RoomLifecycleError,
-                    )
+                    throw lifeCycleException("no failed feature in attach", ErrorCode.RoomLifecycleError)
                 }
                 attachResult.failedFeature?.let {
                     // CHA-RL1h3 - Enter recovery for failed room feature/contributor
                     atomicCoroutineScope.async(LifecycleOperationPrecedence.Internal.priority) {
                         doRetry(it)
                     }
+                    logger.warn("attach(); retrying attach due to suspended feature: ${it.featureName}")
                 }
                 throw attachResult.exception // CHA-RL1h1
             }
 
             // We attached, finally!
+            logger.debug("attach(): room attached successfully")
         }
 
         deferredAttach.await()
@@ -357,12 +383,17 @@ internal class RoomLifecycleManager(
      * Spec: CHA-RL1f, CHA-RL1g, CHA-RL1h
      */
     private suspend fun doAttach(): RoomAttachmentResult {
+        logger.trace("doAttach();")
         val attachResult = DefaultRoomAttachmentResult()
+        logger.debug("doAttach(); trying to attach all features: ${contributors.joinToString { it.featureName }}")
         for (feature in contributors) { // CHA-RL1f - attach each feature sequentially
             try {
+                logger.debug("doAttach(); attaching feature: ${feature.featureName}")
                 feature.channel.attachCoroutine()
                 firstAttachesCompleted[feature] = true
+                logger.debug("doAttach(); attached feature: ${feature.featureName}")
             } catch (ex: Throwable) { // CHA-RL1h - handle channel attach failure
+                logger.error("doAttach(); attach failed for feature: ${feature.featureName}", ex)
                 attachResult.throwable = ex
                 attachResult.failedFeatureField = feature
                 attachResult.errorField = lifeCycleErrorInfo(
@@ -387,17 +418,20 @@ internal class RoomLifecycleManager(
 
                 // Regardless of whether we're suspended or failed, run-down the other channels
                 // The wind-down procedure will take Precedence over any user-driven actions
+                logger.warn("doAttach(); transitioning room into ${attachResult.status.name} state")
                 statusLifecycle.setStatus(attachResult)
                 return attachResult
             }
         }
 
         // CHA-RL1g, We successfully attached all the channels - set our status to attached, start listening changes in channel status
+        logger.debug("doAttach(); attach success for all features: ${contributors.joinToString { it.featureName }}")
         this.statusLifecycle.setStatus(attachResult)
         this.operationInProgress = false
 
         // Iterate the pending discontinuity events and trigger them
         for ((contributor, error) in pendingDiscontinuityEvents) {
+            logger.warn("doAttach(); emitting discontinuity event for feature: ${contributor.featureName}")
             contributor.discontinuityDetected(error)
         }
         pendingDiscontinuityEvents.clear()
@@ -407,9 +441,11 @@ internal class RoomLifecycleManager(
     /**
      * If we've failed to attach, then we're in the failed state and all that is left to do is to detach all the channels.
      * Spec: CHA-RL1h5, CHA-RL1h6
-     * @returns Returns only when all channels are detached. Doesn't throw exception.
+     * @returns Returns only when all channels are either detached or failed. Doesn't throw exception.
      */
     private suspend fun runDownChannelsOnFailedAttach() {
+        logger.trace("runDownChannelsOnFailedAttach();")
+        logger.debug("runDownChannelsOnFailedAttach(); attempting to detach all channels")
         // At this point, we have control over the channel lifecycle, so we can hold onto it until things are resolved
         // Keep trying to detach the channels until they're all detached.
         var channelWindDown = kotlin.runCatching { doChannelWindDown() }
@@ -419,6 +455,7 @@ internal class RoomLifecycleManager(
             delay(retryDurationInMs)
             channelWindDown = kotlin.runCatching { doChannelWindDown() }
         }
+        logger.debug("runDownChannelsOnFailedAttach(); success, all channels are either detached or failed")
     }
 
     /**
@@ -431,11 +468,13 @@ internal class RoomLifecycleManager(
      */
     @Suppress("CognitiveComplexMethod", "ComplexCondition")
     private suspend fun doChannelWindDown(except: ContributesToRoomLifecycle? = null) = coroutineScope {
+        logger.trace("doChannelWindDown();")
         contributors.map { contributor: ContributesToRoomLifecycle ->
             async {
                 // CHA-RL5a1 - If its the contributor we want to wait for a conclusion on, then we should not detach it
                 // Unless we're in a failed state, in which case we should detach it
                 if (contributor.channel === except?.channel && statusLifecycle.status !== RoomStatus.Failed) {
+                    logger.debug("doChannelWindDown(); ignoring feature: ${contributor.featureName} as per except param")
                     return@async
                 }
                 // If the room's already in the failed state, or it's releasing, we should not detach a failed channel
@@ -446,12 +485,15 @@ internal class RoomLifecycleManager(
                         ) &&
                     contributor.channel.state === ChannelState.failed
                 ) {
+                    logger.debug("doChannelWindDown(); ignoring feature: ${contributor.featureName} since it's already in failed state")
                     return@async
                 }
 
                 try {
+                    logger.debug("doChannelWindDown(); detaching feature: ${contributor.featureName}")
                     contributor.channel.detachCoroutine()
                 } catch (throwable: Throwable) {
+                    logger.warn("doChannelWindDown(); failed to detach feature: ${contributor.featureName}", throwable)
                     // CHA-RL2h2 - If the contributor is in a failed state and we're not ignoring failed states, we should fail the room
                     if (
                         contributor.channel.state === ChannelState.failed &&
@@ -482,6 +524,7 @@ internal class RoomLifecycleManager(
      */
     @Suppress("ThrowsCount")
     internal suspend fun detach() {
+        logger.trace("detach();")
         val deferredDetach = atomicCoroutineScope.async(LifecycleOperationPrecedence.AttachOrDetach.priority) { // CHA-RL2i
             // CHA-RL2a - If we're already detached, this is a no-op
             if (statusLifecycle.status === RoomStatus.Detached) {
@@ -529,6 +572,7 @@ internal class RoomLifecycleManager(
      * Spec: CHA-RL2f
      */
     private suspend fun doDetach() {
+        logger.trace("doDetach();")
         var channelWindDown = kotlin.runCatching { doChannelWindDown() }
         var firstContributorFailedError: AblyException? = null
         while (channelWindDown.isFailure) { // CHA-RL2h
@@ -564,6 +608,7 @@ internal class RoomLifecycleManager(
      * Spec: CHA-RL3
      */
     internal suspend fun release() {
+        logger.trace("release();")
         val deferredRelease = atomicCoroutineScope.async(LifecycleOperationPrecedence.Release.priority) { // CHA-RL3k
             // CHA-RL3a - If we're already released, this is a no-op
             if (statusLifecycle.status === RoomStatus.Released) {
@@ -595,6 +640,7 @@ internal class RoomLifecycleManager(
      *  Spec: CHA-RL3f, CHA-RL3d
      */
     private suspend fun releaseChannels() {
+        logger.trace("releaseChannels();")
         var contributorsReleased = kotlin.runCatching { doRelease() }
         while (contributorsReleased.isFailure) {
             // Wait a short period and then try again
@@ -610,6 +656,7 @@ internal class RoomLifecycleManager(
      */
     @Suppress("RethrowCaughtException")
     private suspend fun doRelease() = coroutineScope {
+        logger.trace("doRelease();")
         contributors.map { contributor: ContributesToRoomLifecycle ->
             async {
                 // CHA-RL3e - Failed channels, we can ignore