Skip to content

Commit

Permalink
Fix SoftwareFault and GeneralFault events are missed randomly (#19064)
Browse files Browse the repository at this point in the history
* Fix SoftwareFault and GeneralFault events are missed randomly

* Remove GeneralDiagnosticsDelegate and implement event API in cluster server

* Refactor new APIs from Server into  cluster server

* Address review comments

* Update examples/all-clusters-app/linux/main-common.cpp

Co-authored-by: Boris Zbarsky <[email protected]>

* Rename Server to GeneralDiagnosticsServer to avoid namespace conflict

Co-authored-by: Boris Zbarsky <[email protected]>
  • Loading branch information
yufengwangca and bzbarsky-apple authored Jun 7, 2022
1 parent a005156 commit c868f81
Show file tree
Hide file tree
Showing 13 changed files with 473 additions and 476 deletions.
204 changes: 204 additions & 0 deletions examples/all-clusters-app/linux/main-common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@
#include <app-common/zap-generated/attribute-type.h>
#include <app-common/zap-generated/attributes/Accessors.h>
#include <app/CommandHandler.h>
#include <app/clusters/general-diagnostics-server/general-diagnostics-server.h>
#include <app/clusters/identify-server/identify-server.h>
#include <app/clusters/network-commissioning/network-commissioning.h>
#include <app/clusters/software-diagnostics-server/software-diagnostics-server.h>
#include <app/server/Server.h>
#include <app/util/af.h>
#include <lib/support/CHIPMem.h>
#include <new>
#include <platform/DiagnosticDataProvider.h>
#include <platform/Linux/NetworkCommissioningDriver.h>
#include <platform/PlatformManager.h>
#include <system/SystemPacketBuffer.h>
Expand All @@ -41,6 +44,203 @@ using namespace chip::DeviceLayer;

namespace {
static LowPowerManager lowPowerManager;

/**
 * Reports whether the server side of a cluster is present on at least one endpoint.
 *
 * @param clusterId  Identifier of the cluster to look for.
 * @return true when EnabledEndpointsWithServerCluster(clusterId) yields at least
 *         one endpoint, false when it yields none.
 */
bool IsClusterPresentOnAnyEndpoint(ClusterId clusterId)
{
    for (auto endpointId : EnabledEndpointsWithServerCluster(clusterId))
    {
        // Only existence matters: the first hit settles the answer, so there is
        // no need to keep walking the remaining endpoints.
        IgnoreUnusedVariable(endpointId);
        return true;
    }

    return false;
}

/**
 * Should be called when a software fault takes place on the Node.
 *
 * Builds a SoftwareFaultStruct (id = process id, name = the id rendered as a
 * decimal string, faultRecording = textual local time) and hands it to
 * SoftwareDiagnosticsServer::OnSoftwareFaultDetect(). Does nothing when the
 * SoftwareDiagnostics cluster is not present on any endpoint.
 *
 * @param arg  Not used by this handler.
 */
void HandleSoftwareFaultEvent(intptr_t arg)
{
    if (!IsClusterPresentOnAnyEndpoint(Clusters::SoftwareDiagnostics::Id))
    {
        return;
    }

    Clusters::SoftwareDiagnostics::Structs::SoftwareFaultStruct::Type softwareFault;
    char threadName[kMaxThreadNameLength + 1];

    // The process id serves both as the fault id and, in decimal text form, as the reported name.
    softwareFault.id = static_cast<uint64_t>(getpid());
    Platform::CopyString(threadName, std::to_string(softwareFault.id).c_str());
    softwareFault.name = CharSpan::fromCharString(threadName);

    // The fault recording is the current local time as text. std::localtime() may
    // return nullptr on failure, so never pass such a result on to std::asctime()
    // or strlen(); in that case the recording is simply left at its default value.
    std::time_t now        = std::time(nullptr);
    const auto * localNow  = std::localtime(&now);
    const char * timestamp = (localNow != nullptr) ? std::asctime(localNow) : nullptr;
    if (timestamp != nullptr)
    {
        softwareFault.faultRecording = ByteSpan(Uint8::from_const_char(timestamp), strlen(timestamp));
    }

    // NOTE(review): name and faultRecording are spans over threadName (a stack buffer)
    // and over the buffer returned by std::asctime(); this presumably relies on
    // OnSoftwareFaultDetect() consuming the struct before returning — confirm.
    Clusters::SoftwareDiagnosticsServer::Instance().OnSoftwareFaultDetect(softwareFault);
}

/**
 * Should be called when a general fault takes place on the Node.
 *
 * Dispatches on the event id carried in @p arg and notifies the
 * GeneralDiagnosticsServer of a hardware, radio or network fault change.
 * When CHIP_CONFIG_TEST is set, fixed "previous" and "current" fault lists are
 * used; otherwise both lists are passed as default-constructed. Does nothing
 * when the GeneralDiagnostics cluster is not present on any endpoint.
 *
 * @param arg  Event id (one of the GeneralDiagnostics *FaultChange event ids),
 *             transported as an intptr_t.
 */
void HandleGeneralFaultEvent(intptr_t arg)
{
    uint32_t eventId = static_cast<uint32_t>(arg);

    if (!IsClusterPresentOnAnyEndpoint(Clusters::GeneralDiagnostics::Id))
    {
        return;
    }

    if (eventId == Clusters::GeneralDiagnostics::Events::HardwareFaultChange::Id)
    {
        GeneralFaults<kMaxHardwareFaults> previous;
        GeneralFaults<kMaxHardwareFaults> current;

#if CHIP_CONFIG_TEST
        // On Linux Simulation, set following hardware faults statically.
        ReturnOnFailure(previous.add(EMBER_ZCL_HARDWARE_FAULT_TYPE_RADIO));
        ReturnOnFailure(previous.add(EMBER_ZCL_HARDWARE_FAULT_TYPE_POWER_SOURCE));

        ReturnOnFailure(current.add(EMBER_ZCL_HARDWARE_FAULT_TYPE_RADIO));
        ReturnOnFailure(current.add(EMBER_ZCL_HARDWARE_FAULT_TYPE_SENSOR));
        ReturnOnFailure(current.add(EMBER_ZCL_HARDWARE_FAULT_TYPE_POWER_SOURCE));
        ReturnOnFailure(current.add(EMBER_ZCL_HARDWARE_FAULT_TYPE_USER_INTERFACE_FAULT));
#endif
        Clusters::GeneralDiagnosticsServer::Instance().OnHardwareFaultsDetect(previous, current);
    }
    else if (eventId == Clusters::GeneralDiagnostics::Events::RadioFaultChange::Id)
    {
        GeneralFaults<kMaxRadioFaults> previous;
        GeneralFaults<kMaxRadioFaults> current;

#if CHIP_CONFIG_TEST
        // On Linux Simulation, set following radio faults statically.
        ReturnOnFailure(previous.add(EMBER_ZCL_RADIO_FAULT_TYPE_WI_FI_FAULT));
        ReturnOnFailure(previous.add(EMBER_ZCL_RADIO_FAULT_TYPE_THREAD_FAULT));

        ReturnOnFailure(current.add(EMBER_ZCL_RADIO_FAULT_TYPE_WI_FI_FAULT));
        ReturnOnFailure(current.add(EMBER_ZCL_RADIO_FAULT_TYPE_CELLULAR_FAULT));
        ReturnOnFailure(current.add(EMBER_ZCL_RADIO_FAULT_TYPE_THREAD_FAULT));
        ReturnOnFailure(current.add(EMBER_ZCL_RADIO_FAULT_TYPE_NFC_FAULT));
#endif
        Clusters::GeneralDiagnosticsServer::Instance().OnRadioFaultsDetect(previous, current);
    }
    else if (eventId == Clusters::GeneralDiagnostics::Events::NetworkFaultChange::Id)
    {
        GeneralFaults<kMaxNetworkFaults> previous;
        GeneralFaults<kMaxNetworkFaults> current;

#if CHIP_CONFIG_TEST
        // On Linux Simulation, set following network faults statically.
        ReturnOnFailure(previous.add(EMBER_ZCL_NETWORK_FAULT_TYPE_HARDWARE_FAILURE));
        ReturnOnFailure(previous.add(EMBER_ZCL_NETWORK_FAULT_TYPE_NETWORK_JAMMED));

        ReturnOnFailure(current.add(EMBER_ZCL_NETWORK_FAULT_TYPE_HARDWARE_FAILURE));
        ReturnOnFailure(current.add(EMBER_ZCL_NETWORK_FAULT_TYPE_NETWORK_JAMMED));
        ReturnOnFailure(current.add(EMBER_ZCL_NETWORK_FAULT_TYPE_CONNECTION_FAILED));
#endif
        Clusters::GeneralDiagnosticsServer::Instance().OnNetworkFaultsDetect(previous, current);
    }
    else
    {
        // eventId is unsigned: print it with an unsigned conversion rather than %d.
        ChipLogError(DeviceLayer, "Unknown event ID:%u", static_cast<unsigned>(eventId));
    }
}

// When the shell is enabled, don't intercept signals, since doing so prevents the user from
// using expected commands like CTRL-C to quit the application (see issue #17845).
// We should stop using signals for those faults and move to a different notification
// mechanism, such as a pipe (see issue #19114).
#if !defined(ENABLE_CHIP_SHELL)
/**
 * Signal handler for the signals that simulate a reboot or termination of the Node.
 *
 * Maps the received signal to a BootReasonType and then asks the Server to
 * dispatch its shutdown and stop the event loop. A signal that is not listed
 * in the switch is treated as a programming error and aborts via chipDie().
 *
 * @param signum  Number of the signal that was delivered.
 */
void OnRebootSignalHandler(int signum)
{
    ChipLogDetail(DeviceLayer, "Caught signal %d", signum);

    // The BootReason attribute SHALL indicate the reason for the Node's most recent boot; the real use case
    // for this attribute is an embedded system. In Linux simulation, we use different signals to tell the
    // currently running process to terminate with different reasons.
    // NOTE(review): bootReason is assigned below but not read again inside this function.
    BootReasonType bootReason = BootReasonType::kUnspecified;
    switch (signum)
    {
    case SIGVTALRM:
        bootReason = BootReasonType::kPowerOnReboot;
        break;
    case SIGALRM:
        bootReason = BootReasonType::kBrownOutReset;
        break;
    case SIGILL:
        bootReason = BootReasonType::kSoftwareWatchdogReset;
        break;
    case SIGTRAP:
        bootReason = BootReasonType::kHardwareWatchdogReset;
        break;
    case SIGIO:
        bootReason = BootReasonType::kSoftwareUpdateCompleted;
        break;
    case SIGINT:
        bootReason = BootReasonType::kSoftwareReset;
        break;
    case SIGTERM:
        // SIGTERM is registered for this handler as well; treat it as a plain
        // termination request with no specific boot reason instead of letting
        // it fall into the fatal default branch.
        bootReason = BootReasonType::kUnspecified;
        break;
    default:
        IgnoreUnusedVariable(bootReason);
        ChipLogError(NotSpecified, "Unhandled signal: Should never happens");
        chipDie();
        break;
    }

    Server::GetInstance().DispatchShutDownAndStopEventLoop();
}

/**
 * Signal handler for the signal that simulates a software fault.
 *
 * Logs the signal, verifies that it is SIGUSR1, and schedules
 * HandleSoftwareFaultEvent through PlatformMgr().ScheduleWork() rather than
 * calling it directly from the handler.
 *
 * @param signum  Number of the signal that was delivered.
 */
void OnSoftwareFaultSignalHandler(int signum)
{
ChipLogDetail(DeviceLayer, "Caught signal %d", signum);

// Only SIGUSR1 is expected here; any other value fails the VerifyOrDie check.
VerifyOrDie(signum == SIGUSR1);
PlatformMgr().ScheduleWork(HandleSoftwareFaultEvent);
}

/**
 * Signal handler for the signals that simulate general (hardware, radio,
 * network) fault changes.
 *
 * Translates the signal into the matching GeneralDiagnostics event id and
 * schedules HandleGeneralFaultEvent with that id as its argument. Any other
 * signal is logged as an error and aborts via chipDie().
 *
 * @param signum  Number of the signal that was delivered.
 */
void OnGeneralFaultSignalHandler(int signum)
{
    ChipLogDetail(DeviceLayer, "Caught signal %d", signum);

    uint32_t faultEventId = 0;
    if (signum == SIGUSR2)
    {
        faultEventId = Clusters::GeneralDiagnostics::Events::HardwareFaultChange::Id;
    }
    else if (signum == SIGHUP)
    {
        faultEventId = Clusters::GeneralDiagnostics::Events::RadioFaultChange::Id;
    }
    else if (signum == SIGTTIN)
    {
        faultEventId = Clusters::GeneralDiagnostics::Events::NetworkFaultChange::Id;
    }
    else
    {
        ChipLogError(NotSpecified, "Unhandled signal: Should never happens");
        chipDie();
    }

    // The event id travels to the work item packed into the intptr_t argument.
    PlatformMgr().ScheduleWork(HandleGeneralFaultEvent, static_cast<intptr_t>(faultEventId));
}

void SetupSignalHandlers()
{
// sigaction is not used here because Tsan interceptors seems to
// never dispatch the signals on darwin.
signal(SIGALRM, OnRebootSignalHandler);
signal(SIGVTALRM, OnRebootSignalHandler);
signal(SIGILL, OnRebootSignalHandler);
signal(SIGTRAP, OnRebootSignalHandler);
signal(SIGTERM, OnRebootSignalHandler);
signal(SIGIO, OnRebootSignalHandler);
signal(SIGINT, OnRebootSignalHandler);
signal(SIGUSR1, OnSoftwareFaultSignalHandler);
signal(SIGUSR2, OnGeneralFaultSignalHandler);
signal(SIGHUP, OnGeneralFaultSignalHandler);
signal(SIGTTIN, OnGeneralFaultSignalHandler);
}
#endif // !defined(ENABLE_CHIP_SHELL)

} // namespace

bool emberAfBasicClusterMfgSpecificPingCallback(chip::app::CommandHandler * commandObj)
Expand Down Expand Up @@ -118,6 +318,10 @@ Clusters::NetworkCommissioning::Instance sNullNetworkCommissioningInstance(kNetw

void ApplicationInit()
{
#if !defined(ENABLE_CHIP_SHELL)
SetupSignalHandlers();
#endif // !defined(ENABLE_CHIP_SHELL)

(void) kNetworkCommissioningEndpointMain;
// Enable secondary endpoint only when we need it, this should be applied to all platforms.
emberAfEndpointEnableDisable(kNetworkCommissioningEndpointSecondary, false);
Expand Down
59 changes: 0 additions & 59 deletions examples/platform/linux/AppMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,61 +110,6 @@ void EventHandler(const DeviceLayer::ChipDeviceEvent * event, intptr_t arg)
}
}

// when the shell is enabled, don't intercept signals since it prevents the user from
// using expected commands like CTRL-C to quit the application. (see issue #17845)
#if !defined(ENABLE_CHIP_SHELL)
/**
 * Signal handler for the signals that simulate a reboot or termination of the Node.
 *
 * Maps the received signal to a BootReasonType and then asks the Server to
 * dispatch its shutdown and stop the event loop. A signal that is not listed
 * in the switch is treated as a programming error and aborts via chipDie().
 *
 * @param signum  Number of the signal that was delivered.
 */
void OnSignalHandler(int signum)
{
    ChipLogDetail(DeviceLayer, "Caught signal %d", signum);

    // The BootReason attribute SHALL indicate the reason for the Node's most recent boot; the real use case
    // for this attribute is an embedded system. In Linux simulation, we use different signals to tell the
    // currently running process to terminate with different reasons.
    // NOTE(review): bootReason is assigned below but not read again inside this function.
    BootReasonType bootReason = BootReasonType::kUnspecified;
    switch (signum)
    {
    case SIGVTALRM:
        bootReason = BootReasonType::kPowerOnReboot;
        break;
    case SIGALRM:
        bootReason = BootReasonType::kBrownOutReset;
        break;
    case SIGILL:
        bootReason = BootReasonType::kSoftwareWatchdogReset;
        break;
    case SIGTRAP:
        bootReason = BootReasonType::kHardwareWatchdogReset;
        break;
    case SIGIO:
        bootReason = BootReasonType::kSoftwareUpdateCompleted;
        break;
    case SIGINT:
        bootReason = BootReasonType::kSoftwareReset;
        break;
    case SIGTERM:
        // SIGTERM is registered for this handler as well; treat it as a plain
        // termination request with no specific boot reason instead of letting
        // it fall into the fatal default branch.
        bootReason = BootReasonType::kUnspecified;
        break;
    default:
        IgnoreUnusedVariable(bootReason);
        ChipLogError(NotSpecified, "Unhandled signal: Should never happens");
        chipDie();
        break;
    }

    Server::GetInstance().DispatchShutDownAndStopEventLoop();
}

/**
 * Installs OnSignalHandler for every signal used to simulate a reboot or
 * termination of the Node.
 */
void SetupSignalHandlers()
{
    // signal() is used instead of sigaction() because Tsan interceptors seem to
    // never dispatch the signals on darwin.
    const int handledSignals[] = { SIGALRM, SIGVTALRM, SIGILL, SIGTRAP, SIGTERM, SIGIO, SIGINT };
    for (int handledSignal : handledSignals)
    {
        signal(handledSignal, OnSignalHandler);
    }
}
#endif // !defined(ENABLE_CHIP_SHELL)

void Cleanup()
{
#if CHIP_CONFIG_TRANSPORT_TRACE_ENABLED
Expand Down Expand Up @@ -375,10 +320,6 @@ void ChipLinuxAppMainLoop()
#endif // defined(ENABLE_CHIP_SHELL)
#endif // CHIP_DEVICE_CONFIG_ENABLE_BOTH_COMMISSIONER_AND_COMMISSIONEE

#if !defined(ENABLE_CHIP_SHELL)
SetupSignalHandlers();
#endif // !defined(ENABLE_CHIP_SHELL)

ApplicationInit();

DeviceLayer::PlatformMgr().RunEventLoop();
Expand Down
Loading

0 comments on commit c868f81

Please sign in to comment.