From 8ad81806cfa3c40068b5543d311bc47453ebb257 Mon Sep 17 00:00:00 2001
From: Boris Zbarsky <bzbarsky@apple.com>
Date: Fri, 2 Sep 2022 13:30:39 -0400
Subject: [PATCH] Fix timer cancellation to be reliable in LayerImplSelect.

To minimize risk, the changes here keep the "grab all the timers we
should fire, then fire them" setup instead of switching to the "fire
the timers one at a time" approach LayerImplFreeRTOS uses.

The fix consists of the following parts:

1) Store the list of timers to fire in a member, so we can cancel things from
   that list as needed.
2) To avoid canceling things scheduled by ScheduleWork, remove the CancelTimer
   call in ScheduleWork.  This does mean we now allow multiple timers for the
   same callback+appState in the timer list, if they were created by
   ScheduleWork, but that should be OK, since the only reason that pair needs to
   be unique is to allow cancellation and we never want to cancel the things
   ScheduleWork schedules.  As a followup we should stop using the timer list
   for ScheduleWork altogether and use ScheduleLambda like LayerImplFreeRTOS
   does, but that involves fixing the unit tests that call ScheduleWork without
   actually running the platfor manager event loop and expect it to work
   somehow.

TestRead was failing the sanity assert for not losing track of timers
to fire, because it was spinning a nested event loop.  The changes to
that test stop it from doing that.

Fixes https://github.com/project-chip/connectedhomeip/issues/19387
Fixes https://github.com/project-chip/connectedhomeip/issues/22160
---
 src/controller/tests/data_model/TestRead.cpp | 48 ++++++++++-----
 src/system/SystemLayerImplFreeRTOS.cpp       | 24 +++++++-
 src/system/SystemLayerImplSelect.cpp         | 36 +++++++++--
 src/system/SystemLayerImplSelect.h           |  3 +
 src/system/tests/BUILD.gn                    |  1 +
 src/system/tests/TestSystemClock.cpp         |  2 +-
 src/system/tests/TestSystemScheduleWork.cpp  | 39 ++++++++----
 src/system/tests/TestSystemTimer.cpp         | 64 ++++++++++++++++++++
 8 files changed, 183 insertions(+), 34 deletions(-)

diff --git a/src/controller/tests/data_model/TestRead.cpp b/src/controller/tests/data_model/TestRead.cpp
index cc314e60469cd2..8648d6399e1652 100644
--- a/src/controller/tests/data_model/TestRead.cpp
+++ b/src/controller/tests/data_model/TestRead.cpp
@@ -298,6 +298,11 @@ class TestReadInteraction : public app::ReadHandler::ApplicationCallback
     // succeed.
     static void MultipleReadHelper(nlTestSuite * apSuite, TestContext & aCtx, size_t aReadCount);
 
+    // Helper for MultipleReadHelper that does not spin the event loop, so we
+    // don't end up with nested event loops.
+    static void MultipleReadHelperGuts(nlTestSuite * apSuite, TestContext & aCtx, size_t aReadCount, uint32_t & aNumSuccessCalls,
+                                       uint32_t & aNumFailureCalls);
+
     // Establish the given number of subscriptions, then issue the given number
     // of reads in parallel and wait for them all to succeed.
     static void SubscribeThenReadHelper(nlTestSuite * apSuite, TestContext & aCtx, size_t aSubscribeCount, size_t aReadCount);
@@ -2484,6 +2489,9 @@ void TestReadInteraction::SubscribeThenReadHelper(nlTestSuite * apSuite, TestCon
     uint32_t numSuccessCalls                 = 0;
     uint32_t numSubscriptionEstablishedCalls = 0;
 
+    uint32_t numReadSuccessCalls = 0;
+    uint32_t numReadFailureCalls = 0;
+
     responseDirective = kSendDataResponse;
 
     // Passing of stack variables by reference is only safe because of synchronous completion of the interaction. Otherwise, it's
@@ -2501,12 +2509,12 @@ void TestReadInteraction::SubscribeThenReadHelper(nlTestSuite * apSuite, TestCon
         NL_TEST_ASSERT(apSuite, false);
     };
 
-    auto onSubscriptionEstablishedCb = [&numSubscriptionEstablishedCalls, &apSuite, &aCtx, aSubscribeCount,
-                                        aReadCount](const app::ReadClient & readClient) {
+    auto onSubscriptionEstablishedCb = [&numSubscriptionEstablishedCalls, &apSuite, &aCtx, aSubscribeCount, aReadCount,
+                                        &numReadSuccessCalls, &numReadFailureCalls](const app::ReadClient & readClient) {
         numSubscriptionEstablishedCalls++;
         if (numSubscriptionEstablishedCalls == aSubscribeCount)
         {
-            MultipleReadHelper(apSuite, aCtx, aReadCount);
+            MultipleReadHelperGuts(apSuite, aCtx, aReadCount, numReadSuccessCalls, numReadFailureCalls);
         }
     };
 
@@ -2522,40 +2530,50 @@ void TestReadInteraction::SubscribeThenReadHelper(nlTestSuite * apSuite, TestCon
 
     NL_TEST_ASSERT(apSuite, numSuccessCalls == aSubscribeCount);
     NL_TEST_ASSERT(apSuite, numSubscriptionEstablishedCalls == aSubscribeCount);
+    NL_TEST_ASSERT(apSuite, numReadSuccessCalls == aReadCount);
+    NL_TEST_ASSERT(apSuite, numReadFailureCalls == 0);
 }
 
-void TestReadInteraction::MultipleReadHelper(nlTestSuite * apSuite, TestContext & aCtx, size_t aReadCount)
+// The guts of MultipleReadHelper which take references to the success/failure
+// counts to modify and assume the consumer will be spinning the event loop.
+void TestReadInteraction::MultipleReadHelperGuts(nlTestSuite * apSuite, TestContext & aCtx, size_t aReadCount,
+                                                 uint32_t & aNumSuccessCalls, uint32_t & aNumFailureCalls)
 {
-    auto sessionHandle       = aCtx.GetSessionBobToAlice();
-    uint32_t numSuccessCalls = 0;
-    uint32_t numFailureCalls = 0;
+    NL_TEST_ASSERT(apSuite, aNumSuccessCalls == 0);
+    NL_TEST_ASSERT(apSuite, aNumFailureCalls == 0);
+
+    auto sessionHandle = aCtx.GetSessionBobToAlice();
 
     responseDirective = kSendDataResponse;
 
     uint16_t firstExpectedResponse = totalReadCount + 1;
 
-    // Passing of stack variables by reference is only safe because of synchronous completion of the interaction. Otherwise, it's
-    // not safe to do so.
-    auto onFailureCb = [&apSuite, &numFailureCalls](const app::ConcreteDataAttributePath * attributePath, CHIP_ERROR aError) {
-        numFailureCalls++;
+    auto onFailureCb = [apSuite, &aNumFailureCalls](const app::ConcreteDataAttributePath * attributePath, CHIP_ERROR aError) {
+        aNumFailureCalls++;
 
         NL_TEST_ASSERT(apSuite, attributePath == nullptr);
     };
 
     for (size_t i = 0; i < aReadCount; ++i)
     {
-        // Passing of stack variables by reference is only safe because of synchronous completion of the interaction. Otherwise,
-        // it's not safe to do so.
-        auto onSuccessCb = [&numSuccessCalls, &apSuite, firstExpectedResponse,
+        auto onSuccessCb = [&aNumSuccessCalls, apSuite, firstExpectedResponse,
                             i](const app::ConcreteDataAttributePath & attributePath, const auto & dataResponse) {
             NL_TEST_ASSERT(apSuite, dataResponse == firstExpectedResponse + i);
-            numSuccessCalls++;
+            aNumSuccessCalls++;
         };
 
         NL_TEST_ASSERT(apSuite,
                        Controller::ReadAttribute<TestCluster::Attributes::Int16u::TypeInfo>(
                            &aCtx.GetExchangeManager(), sessionHandle, kTestEndpointId, onSuccessCb, onFailureCb) == CHIP_NO_ERROR);
     }
+}
+
+void TestReadInteraction::MultipleReadHelper(nlTestSuite * apSuite, TestContext & aCtx, size_t aReadCount)
+{
+    uint32_t numSuccessCalls = 0;
+    uint32_t numFailureCalls = 0;
+
+    MultipleReadHelperGuts(apSuite, aCtx, aReadCount, numSuccessCalls, numFailureCalls);
 
     aCtx.DrainAndServiceIO();
 
diff --git a/src/system/SystemLayerImplFreeRTOS.cpp b/src/system/SystemLayerImplFreeRTOS.cpp
index 79d892b30c54d4..1a9692302c9c48 100644
--- a/src/system/SystemLayerImplFreeRTOS.cpp
+++ b/src/system/SystemLayerImplFreeRTOS.cpp
@@ -88,10 +88,32 @@ CHIP_ERROR LayerImplFreeRTOS::ScheduleWork(TimerCompleteCallback onComplete, voi
 {
     VerifyOrReturnError(mLayerState.IsInitialized(), CHIP_ERROR_INCORRECT_STATE);
 
+    // Ideally we would not use a timer here at all, but if we try to just
+    // ScheduleLambda the lambda needs to capture the following:
+    // 1) onComplete
+    // 2) appState
+    // 3) The `this` pointer, because onComplete needs to be passed a pointer to
+    //    the System::Layer.
+    //
+    // On a 64-bit system that's 24 bytes, but lambdas passed to ScheduleLambda
+    // are capped at CHIP_CONFIG_LAMBDA_EVENT_SIZE which is 16 bytes.
+    //
+    // So for now use a timer as a poor-man's closure that captures `this` and
+    // onComplete and appState in a single pointer, so we fit inside the size
+    // limit.
+    //
+    // TODO: We could do something here where we compile-time condition on the
+    // sizes of things and use a direct ScheduleLambda if it would fit and this
+    // setup otherwise.
     TimerList::Node * timer = mTimerPool.Create(*this, SystemClock().GetMonotonicTimestamp(), onComplete, appState);
     VerifyOrReturnError(timer != nullptr, CHIP_ERROR_NO_MEMORY);
 
-    return ScheduleLambda([this, timer] { this->mTimerPool.Invoke(timer); });
+    CHIP_ERROR err = ScheduleLambda([this, timer] { this->mTimerPool.Invoke(timer); });
+    if (err != CHIP_NO_ERROR)
+    {
+        mTimerPool.Release(timer);
+    }
+    return err;
 }
 
 /**
diff --git a/src/system/SystemLayerImplSelect.cpp b/src/system/SystemLayerImplSelect.cpp
index 98f4fe71547826..f0601357b2afcb 100644
--- a/src/system/SystemLayerImplSelect.cpp
+++ b/src/system/SystemLayerImplSelect.cpp
@@ -170,6 +170,10 @@ void LayerImplSelect::CancelTimer(TimerCompleteCallback onComplete, void * appSt
     VerifyOrReturn(mLayerState.IsInitialized());
 
     TimerList::Node * timer = mTimerList.Remove(onComplete, appState);
+    if (timer == nullptr)
+    {
+        timer = mExpiredTimers.Remove(onComplete, appState);
+    }
     VerifyOrReturn(timer != nullptr);
 
 #if CHIP_SYSTEM_CONFIG_USE_DISPATCH
@@ -199,8 +203,31 @@ CHIP_ERROR LayerImplSelect::ScheduleWork(TimerCompleteCallback onComplete, void
     }
 #endif // CHIP_SYSTEM_CONFIG_USE_DISPATCH
 
-    CancelTimer(onComplete, appState);
-
+    // Ideally we would not use a timer here at all, but if we try to just
+    // ScheduleLambda the lambda needs to capture the following:
+    // 1) onComplete
+    // 2) appState
+    // 3) The `this` pointer, because onComplete needs to be passed a pointer to
+    //    the System::Layer.
+    //
+    // On a 64-bit system that's 24 bytes, but lambdas passed to ScheduleLambda
+    // are capped at CHIP_CONFIG_LAMBDA_EVENT_SIZE which is 16 bytes.
+    //
+    // So for now use a timer as a poor-man's closure that captures `this` and
+    // onComplete and appState in a single pointer, so we fit inside the size
+    // limit.
+    //
+    // TODO: We could do something here where we compile-time condition on the
+    // sizes of things and use a direct ScheduleLambda if it would fit and this
+    // setup otherwise.
+    //
+    // TODO: But also, unit tests seem to do SystemLayer::ScheduleWork without
+    // actually running a useful event loop (in the PlatformManager sense),
+    // which breaks if we use ScheduleLambda here, since that does rely on the
+    // PlatformManager event loop. So for now, keep scheduling an expires-ASAP
+    // timer, but just make sure we don't cancel existing timers with the same
+    // callback and appState, so ScheduleWork invocations don't stomp on each
+    // other.
     TimerList::Node * timer = mTimerPool.Create(*this, SystemClock().GetMonotonicTimestamp(), onComplete, appState);
     VerifyOrReturnError(timer != nullptr, CHIP_ERROR_NO_MEMORY);
 
@@ -469,9 +496,10 @@ void LayerImplSelect::HandleEvents()
 
     // Obtain the list of currently expired timers. Any new timers added by timer callback are NOT handled on this pass,
     // since that could result in infinite handling of new timers blocking any other progress.
-    TimerList expiredTimers = mTimerList.ExtractEarlier(Clock::Timeout(1) + SystemClock().GetMonotonicTimestamp());
+    VerifyOrDieWithMsg(mExpiredTimers.Empty(), DeviceLayer, "Re-entry into HandleEvents from a timer callback?");
+    mExpiredTimers          = mTimerList.ExtractEarlier(Clock::Timeout(1) + SystemClock().GetMonotonicTimestamp());
     TimerList::Node * timer = nullptr;
-    while ((timer = expiredTimers.PopEarliest()) != nullptr)
+    while ((timer = mExpiredTimers.PopEarliest()) != nullptr)
     {
         mTimerPool.Invoke(timer);
     }
diff --git a/src/system/SystemLayerImplSelect.h b/src/system/SystemLayerImplSelect.h
index f193a8860d961c..c2dd3994d1720e 100644
--- a/src/system/SystemLayerImplSelect.h
+++ b/src/system/SystemLayerImplSelect.h
@@ -101,6 +101,9 @@ class LayerImplSelect : public LayerSocketsLoop
 
     TimerPool<TimerList::Node> mTimerPool;
     TimerList mTimerList;
+    // List of expired times being processed right now.  Stored in a member so
+    // we can cancel them.
+    TimerList mExpiredTimers;
     timeval mNextTimeout;
 
     // Members for select loop
diff --git a/src/system/tests/BUILD.gn b/src/system/tests/BUILD.gn
index 47c97b57caa9e1..d10b3de9ef50c2 100644
--- a/src/system/tests/BUILD.gn
+++ b/src/system/tests/BUILD.gn
@@ -26,6 +26,7 @@ chip_test_suite("tests") {
     "TestSystemErrorStr.cpp",
     "TestSystemPacketBuffer.cpp",
     "TestSystemScheduleLambda.cpp",
+    "TestSystemScheduleWork.cpp",
     "TestSystemTimer.cpp",
     "TestSystemWakeEvent.cpp",
     "TestTimeSource.cpp",
diff --git a/src/system/tests/TestSystemClock.cpp b/src/system/tests/TestSystemClock.cpp
index 0853973b4f5846..1ac5b994a2edb2 100644
--- a/src/system/tests/TestSystemClock.cpp
+++ b/src/system/tests/TestSystemClock.cpp
@@ -118,7 +118,7 @@ static const nlTest sTests[] =
 int TestSystemClock(void)
 {
     nlTestSuite theSuite = {
-        "chip-timesource", &sTests[0], nullptr /* setup */, nullptr /* teardown */
+        "chip-systemclock", &sTests[0], nullptr /* setup */, nullptr /* teardown */
     };
 
     // Run test suit againt one context.
diff --git a/src/system/tests/TestSystemScheduleWork.cpp b/src/system/tests/TestSystemScheduleWork.cpp
index db43fcf36a78e7..6c858905f50b76 100644
--- a/src/system/tests/TestSystemScheduleWork.cpp
+++ b/src/system/tests/TestSystemScheduleWork.cpp
@@ -22,18 +22,25 @@
 #include <nlunit-test.h>
 #include <platform/CHIPDeviceLayer.h>
 
-// Test input data.
+static void IncrementIntCounter(chip::System::Layer *, void * state)
+{
+    ++(*static_cast<int *>(state));
+}
 
-static void CheckScheduleLambda(nlTestSuite * inSuite, void * aContext)
+static void StopEventLoop(chip::System::Layer *, void *)
 {
-    bool * called = new bool(false);
-    chip::DeviceLayer::SystemLayer().ScheduleLambda([called] {
-        *called = true;
-        chip::DeviceLayer::PlatformMgr().StopEventLoopTask();
-    });
+    chip::DeviceLayer::PlatformMgr().StopEventLoopTask();
+}
+
+static void CheckScheduleWorkTwice(nlTestSuite * inSuite, void * aContext)
+{
+    int * callCount = new int(0);
+    NL_TEST_ASSERT(inSuite, chip::DeviceLayer::SystemLayer().ScheduleWork(IncrementIntCounter, callCount) == CHIP_NO_ERROR);
+    NL_TEST_ASSERT(inSuite, chip::DeviceLayer::SystemLayer().ScheduleWork(IncrementIntCounter, callCount) == CHIP_NO_ERROR);
+    NL_TEST_ASSERT(inSuite, chip::DeviceLayer::SystemLayer().ScheduleWork(StopEventLoop, nullptr) == CHIP_NO_ERROR);
     chip::DeviceLayer::PlatformMgr().RunEventLoop();
-    NL_TEST_ASSERT(inSuite, *called);
-    delete called;
+    NL_TEST_ASSERT(inSuite, *callCount == 2);
+    delete callCount;
 }
 
 // Test Suite
@@ -44,7 +51,7 @@ static void CheckScheduleLambda(nlTestSuite * inSuite, void * aContext)
 // clang-format off
 static const nlTest sTests[] =
 {
-    NL_TEST_DEF("System::TestScheduleLambda", CheckScheduleLambda),
+    NL_TEST_DEF("System::TestScheduleWorkTwice", CheckScheduleWorkTwice),
     NL_TEST_SENTINEL()
 };
 // clang-format on
@@ -55,7 +62,7 @@ static int TestTeardown(void * aContext);
 // clang-format off
 static nlTestSuite kTheSuite =
 {
-    "chip-system-schedule-lambda",
+    "chip-system-schedule-work",
     &sTests[0],
     TestSetup,
     TestTeardown
@@ -67,6 +74,11 @@ static nlTestSuite kTheSuite =
  */
 static int TestSetup(void * aContext)
 {
+    if (chip::Platform::MemoryInit() != CHIP_NO_ERROR)
+    {
+        return FAILURE;
+    }
+
     if (chip::DeviceLayer::PlatformMgr().InitChipStack() != CHIP_NO_ERROR)
         return FAILURE;
 
@@ -80,10 +92,11 @@ static int TestSetup(void * aContext)
 static int TestTeardown(void * aContext)
 {
     chip::DeviceLayer::PlatformMgr().Shutdown();
+    chip::Platform::MemoryShutdown();
     return (SUCCESS);
 }
 
-int TestSystemScheduleLambda(void)
+int TestSystemScheduleWork(void)
 {
     // Run test suit againt one lContext.
     nlTestRunner(&kTheSuite, nullptr);
@@ -91,4 +104,4 @@ int TestSystemScheduleLambda(void)
     return nlTestRunnerStats(&kTheSuite);
 }
 
-CHIP_REGISTER_TEST_SUITE(TestSystemScheduleLambda)
+CHIP_REGISTER_TEST_SUITE(TestSystemScheduleWork)
diff --git a/src/system/tests/TestSystemTimer.cpp b/src/system/tests/TestSystemTimer.cpp
index 58ff87e85cd8dd..e92cc8a03d5346 100644
--- a/src/system/tests/TestSystemTimer.cpp
+++ b/src/system/tests/TestSystemTimer.cpp
@@ -257,6 +257,69 @@ void CheckOrder(nlTestSuite * inSuite, void * aContext)
     Clock::Internal::SetSystemClockForTesting(savedClock);
 }
 
+void CheckCancellation(nlTestSuite * inSuite, void * aContext)
+{
+    if (!LayerEvents<LayerImpl>::HasServiceEvents())
+        return;
+
+    TestContext & testContext = *static_cast<TestContext *>(aContext);
+    Layer & systemLayer       = *testContext.mLayer;
+    nlTestSuite * const suite = testContext.mTestSuite;
+
+    struct TestState
+    {
+        TestState(Layer & systemLayer) : systemLayer(systemLayer) {}
+
+        void Record(char c)
+        {
+            size_t n = strlen(record);
+            if (n + 1 < sizeof(record))
+            {
+                record[n++] = c;
+                record[n]   = 0;
+            }
+        }
+        static void A(Layer * layer, void * state)
+        {
+            auto self = static_cast<TestState *>(state);
+            self->Record('A');
+            self->systemLayer.CancelTimer(B, state);
+            self->systemLayer.CancelTimer(D, state);
+        }
+        static void B(Layer * layer, void * state) { static_cast<TestState *>(state)->Record('B'); }
+        static void C(Layer * layer, void * state)
+        {
+            auto self = static_cast<TestState *>(state);
+            self->Record('C');
+            self->systemLayer.CancelTimer(E, state);
+        }
+        static void D(Layer * layer, void * state) { static_cast<TestState *>(state)->Record('D'); }
+        static void E(Layer * layer, void * state) { static_cast<TestState *>(state)->Record('E'); }
+        char record[6] = { 0 };
+
+        Layer & systemLayer;
+    };
+    TestState testState(systemLayer);
+    NL_TEST_ASSERT(suite, testState.record[0] == 0);
+
+    Clock::ClockBase * const savedClock = &SystemClock();
+    Clock::Internal::MockClock mockClock;
+    Clock::Internal::SetSystemClockForTesting(&mockClock);
+
+    using namespace Clock::Literals;
+    systemLayer.StartTimer(0_ms, TestState::A, &testState);
+    systemLayer.StartTimer(0_ms, TestState::B, &testState);
+    systemLayer.StartTimer(20_ms, TestState::C, &testState);
+    systemLayer.StartTimer(30_ms, TestState::D, &testState);
+    systemLayer.StartTimer(50_ms, TestState::E, &testState);
+
+    mockClock.AdvanceMonotonic(100_ms);
+    LayerEvents<LayerImpl>::ServiceEvents(systemLayer);
+    NL_TEST_ASSERT(suite, strcmp(testState.record, "AC") == 0);
+
+    Clock::Internal::SetSystemClockForTesting(savedClock);
+}
+
 // Test the implementation helper classes TimerPool, TimerList, and TimerData.
 namespace chip {
 namespace System {
@@ -416,6 +479,7 @@ static const nlTest sTests[] =
     NL_TEST_DEF("Timer::TestOverflow",             CheckOverflow),
     NL_TEST_DEF("Timer::TestTimerStarvation",      CheckStarvation),
     NL_TEST_DEF("Timer::TestTimerOrder",           CheckOrder),
+    NL_TEST_DEF("Timer::TestTimerCancellation",    CheckCancellation),
     NL_TEST_DEF("Timer::TestTimerPool",            chip::System::TestTimer::CheckTimerPool),
     NL_TEST_SENTINEL()
 };