Skip to content

Commit

Permalink
Add a watchdog timer to catch OTA process getting stuck (#16542)
Browse files Browse the repository at this point in the history
  • Loading branch information
isiu-apple authored and pull[bot] committed Aug 21, 2023
1 parent e8a71d8 commit 18df898
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 19 deletions.
84 changes: 69 additions & 15 deletions src/app/clusters/ota-requestor/GenericOTARequestorDriver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ void GenericOTARequestorDriver::Init(OTARequestorInterface * requestor, OTAImage
else
{
// Start the first periodic query timer
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
}
}

Expand Down Expand Up @@ -118,17 +118,23 @@ bool GenericOTARequestorDriver::ProviderLocationsEqual(const ProviderLocationTyp

void GenericOTARequestorDriver::HandleError(UpdateFailureState state, CHIP_ERROR error) {}

void GenericOTARequestorDriver::HandleIdleStateExit()
{
// Start watchdog timer to monitor new Query Image session
StartSelectedTimer(SelectedTimer::kWatchdogTimer);
}

void GenericOTARequestorDriver::HandleIdleState(IdleStateReason reason)
{
switch (reason)
{
case IdleStateReason::kUnknown:
ChipLogProgress(SoftwareUpdate, "Unknown idle state reason so set the periodic timer for a next attempt");
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
break;
case IdleStateReason::kIdle:
// There is no current OTA update in progress so start the periodic query timer
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
break;
case IdleStateReason::kInvalidSession:
// An invalid session is detected which may be temporary so try to query the same provider again
Expand Down Expand Up @@ -201,7 +207,7 @@ void GenericOTARequestorDriver::UpdateNotFound(UpdateNotFoundReason reason, Syst
else
{
ChipLogProgress(SoftwareUpdate, "UpdateNotFound, not scheduling further retries");
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
}
}

Expand Down Expand Up @@ -240,7 +246,7 @@ void GenericOTARequestorDriver::UpdateDiscontinued()
UpdateCancelled();

// Restart the periodic default provider timer
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
}

// Cancel all OTA update timers
Expand Down Expand Up @@ -336,15 +342,15 @@ void GenericOTARequestorDriver::SendQueryImage()
UpdateCancelled();

// Default provider timer only runs when there is no ongoing query/update; must stop it now.
// TriggerImmediateQueryInternal() will cause the state to change from kIdle
StopDefaultProviderTimer();
// TriggerImmediateQueryInternal() will cause the state to change from kIdle, which will start
// the Watchdog timer. (Clean this up with Issue#16151)

mProviderRetryCount++;

DeviceLayer::SystemLayer().ScheduleLambda([this] { mRequestor->TriggerImmediateQueryInternal(); });
}

void GenericOTARequestorDriver::DefaultProviderTimerHandler(System::Layer * systemLayer, void * appState)
void GenericOTARequestorDriver::PeriodicQueryTimerHandler(System::Layer * systemLayer, void * appState)
{
ChipLogProgress(SoftwareUpdate, "Default Provider timer handler is invoked");

Expand All @@ -353,7 +359,7 @@ void GenericOTARequestorDriver::DefaultProviderTimerHandler(System::Layer * syst
bool listExhausted = false;
if (GetNextProviderLocation(providerLocation, listExhausted) != true)
{
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
return;
}

Expand All @@ -362,28 +368,76 @@ void GenericOTARequestorDriver::DefaultProviderTimerHandler(System::Layer * syst
SendQueryImage();
}

void GenericOTARequestorDriver::StartDefaultProviderTimer()
void GenericOTARequestorDriver::StartPeriodicQueryTimer()
{
ChipLogProgress(SoftwareUpdate, "Starting the Default Provider timer, timeout: %u seconds",
ChipLogProgress(SoftwareUpdate, "Starting the periodic query timer, timeout: %u seconds",
(unsigned int) mPeriodicQueryTimeInterval);
ScheduleDelayedAction(
System::Clock::Seconds32(mPeriodicQueryTimeInterval),
[](System::Layer *, void * context) {
(static_cast<GenericOTARequestorDriver *>(context))->DefaultProviderTimerHandler(nullptr, context);
(static_cast<GenericOTARequestorDriver *>(context))->PeriodicQueryTimerHandler(nullptr, context);
},
this);
}

void GenericOTARequestorDriver::StopDefaultProviderTimer()
void GenericOTARequestorDriver::StopPeriodicQueryTimer()
{
ChipLogProgress(SoftwareUpdate, "Stopping the Default Provider timer");
ChipLogProgress(SoftwareUpdate, "Stopping the Periodic Query timer");
CancelDelayedAction(
[](System::Layer *, void * context) {
(static_cast<GenericOTARequestorDriver *>(context))->DefaultProviderTimerHandler(nullptr, context);
(static_cast<GenericOTARequestorDriver *>(context))->PeriodicQueryTimerHandler(nullptr, context);
},
this);
}

void GenericOTARequestorDriver::WatchdogTimerHandler(System::Layer * systemLayer, void * appState)
{
ChipLogError(SoftwareUpdate, "Watchdog timer detects state stuck at %u. Cancelling download and resetting state.",
to_underlying(mRequestor->GetCurrentUpdateState()));

// Something went wrong and OTA requestor is stuck in a non-idle state for too long.
// Let's just cancel download, reset state, and re-start periodic query timer.
UpdateDiscontinued();
mRequestor->CancelImageUpdate();
StartPeriodicQueryTimer();
}

void GenericOTARequestorDriver::StartWatchdogTimer()
{
ChipLogProgress(SoftwareUpdate, "Starting the watchdog timer, timeout: %u seconds", (unsigned int) mWatchdogTimeInterval);
ScheduleDelayedAction(
System::Clock::Seconds32(mWatchdogTimeInterval),
[](System::Layer *, void * context) {
(static_cast<GenericOTARequestorDriver *>(context))->WatchdogTimerHandler(nullptr, context);
},
this);
}

void GenericOTARequestorDriver::StopWatchdogTimer()
{
ChipLogProgress(SoftwareUpdate, "Stopping the watchdog timer");
CancelDelayedAction(
[](System::Layer *, void * context) {
(static_cast<GenericOTARequestorDriver *>(context))->WatchdogTimerHandler(nullptr, context);
},
this);
}

void GenericOTARequestorDriver::StartSelectedTimer(SelectedTimer timer)
{
switch (timer)
{
case SelectedTimer::kPeriodicQueryTimer:
StopWatchdogTimer();
StartPeriodicQueryTimer();
break;
case SelectedTimer::kWatchdogTimer:
StopPeriodicQueryTimer();
StartWatchdogTimer();
break;
}
}

/**
* Returns the next available Provider location. The algorithm is to simply loop through the list of DefaultOtaProviders as a
* circular list and return the next value (based on the last used provider). If the list of DefaultOtaProviders is empty, FALSE is
Expand Down
12 changes: 9 additions & 3 deletions src/app/clusters/ota-requestor/GenericOTARequestorDriver.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class GenericOTARequestorDriver : public OTARequestorDriver
bool CanConsent() override;
uint16_t GetMaxDownloadBlockSize() override;
void HandleError(UpdateFailureState state, CHIP_ERROR error) override;
void HandleIdleStateExit() override;
void HandleIdleState(IdleStateReason reason) override;
void UpdateAvailable(const UpdateDescription & update, System::Clock::Seconds32 delay) override;
void UpdateNotFound(UpdateNotFoundReason reason, System::Clock::Seconds32 delay) override;
Expand All @@ -71,9 +72,13 @@ class GenericOTARequestorDriver : public OTARequestorDriver
bool GetNextProviderLocation(ProviderLocationType & providerLocation, bool & listExhausted) override;

protected:
void StartDefaultProviderTimer();
void StopDefaultProviderTimer();
void DefaultProviderTimerHandler(System::Layer * systemLayer, void * appState);
void StartPeriodicQueryTimer();
void StopPeriodicQueryTimer();
void PeriodicQueryTimerHandler(System::Layer * systemLayer, void * appState);
void StartWatchdogTimer();
void StopWatchdogTimer();
void WatchdogTimerHandler(System::Layer * systemLayer, void * appState);
void StartSelectedTimer(SelectedTimer timer);
void ScheduleDelayedAction(System::Clock::Seconds32 delay, System::TimerCompleteCallback action, void * aAppState);
void CancelDelayedAction(System::TimerCompleteCallback action, void * aAppState);
bool ProviderLocationsEqual(const ProviderLocationType & a, const ProviderLocationType & b);
Expand All @@ -82,6 +87,7 @@ class GenericOTARequestorDriver : public OTARequestorDriver
OTAImageProcessorInterface * mImageProcessor = nullptr;
uint32_t mOtaStartDelaySec = 0;
uint32_t mPeriodicQueryTimeInterval = (24 * 60 * 60); // Timeout for querying providers on the default OTA provider list
uint32_t mWatchdogTimeInterval = (6 * 60 * 60); // Timeout (in seconds) for checking if Requestor has reverted back to idle mode
// Maximum number of times to retry a BUSY OTA provider before moving to the next available one
static constexpr uint8_t kMaxBusyProviderRetryCount = 3;
uint8_t mProviderRetryCount; // Track retry count for the current provider
Expand Down
7 changes: 6 additions & 1 deletion src/app/clusters/ota-requestor/OTARequestor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ void OTARequestor::CancelImageUpdate()

mOtaRequestorDriver->UpdateCancelled();

RecordNewUpdateState(OTAUpdateStateEnum::kIdle, OTAChangeReasonEnum::kUnknown);
Reset();
}

CHIP_ERROR OTARequestor::GetUpdateStateProgressAttribute(EndpointId endpointId, app::DataModel::Nullable<uint8_t> & progress)
Expand Down Expand Up @@ -658,13 +658,18 @@ void OTARequestor::RecordNewUpdateState(OTAUpdateStateEnum newState, OTAChangeRe
}
OtaRequestorServerOnStateTransition(mCurrentUpdateState, newState, reason, targetSoftwareVersion);

// Issue#16151 tracks re-factoring error and state transitioning handling.
if ((newState == OTAUpdateStateEnum::kIdle) && (mCurrentUpdateState != OTAUpdateStateEnum::kIdle))
{
IdleStateReason idleStateReason = MapErrorToIdleStateReason(error);

// Inform the driver that the OTARequestor has entered the Idle state
mOtaRequestorDriver->HandleIdleState(idleStateReason);
}
else if ((mCurrentUpdateState == OTAUpdateStateEnum::kIdle) && (newState != OTAUpdateStateEnum::kIdle))
{
mOtaRequestorDriver->HandleIdleStateExit();
}

mCurrentUpdateState = newState;
}
Expand Down
10 changes: 10 additions & 0 deletions src/app/clusters/ota-requestor/OTARequestorDriver.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ enum class IdleStateReason
kInvalidSession,
};

// The current selected OTA Requestor timer to be running
enum class SelectedTimer
{
kPeriodicQueryTimer,
kWatchdogTimer,
};

// Interface class to abstract the OTA-related business logic. Each application
// must implement this interface. All calls must be non-blocking unless stated otherwise
class OTARequestorDriver
Expand All @@ -87,6 +94,9 @@ class OTARequestorDriver
/// Called when an error occurs at any OTA requestor operation
virtual void HandleError(UpdateFailureState state, CHIP_ERROR error) = 0;

/// Called when OTA Requestor has exited the Idle state for which the driver may need to take various actions
virtual void HandleIdleStateExit() = 0;

// Called when the OTA Requestor has entered the Idle state for which the driver may need to take various actions
virtual void HandleIdleState(IdleStateReason reason) = 0;

Expand Down

0 comments on commit 18df898

Please sign in to comment.