Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Properly handle crashes/reboots during FabricTable commit #20010

Merged
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ bool emberAfAdministratorCommissioningClusterOpenCommissioningWindowCallback(

FabricIndex fabricIndex = commandObj->GetAccessingFabricIndex();
FabricInfo * fabricInfo = Server::GetInstance().GetFabricTable().FindFabricWithIndex(fabricIndex);
auto & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
auto & commissionMgr = Server::GetInstance().GetCommissioningWindowManager();

VerifyOrExit(fabricInfo != nullptr, status.Emplace(StatusCode::EMBER_ZCL_STATUS_CODE_PAKE_PARAMETER_ERROR));
Expand Down Expand Up @@ -169,7 +169,7 @@ bool emberAfAdministratorCommissioningClusterOpenBasicCommissioningWindowCallbac

FabricIndex fabricIndex = commandObj->GetAccessingFabricIndex();
FabricInfo * fabricInfo = Server::GetInstance().GetFabricTable().FindFabricWithIndex(fabricIndex);
auto & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
auto & commissionMgr = Server::GetInstance().GetCommissioningWindowManager();

VerifyOrExit(fabricInfo != nullptr, status.Emplace(StatusCode::EMBER_ZCL_STATUS_CODE_PAKE_PARAMETER_ERROR));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ bool emberAfGeneralCommissioningClusterArmFailSafeCallback(app::CommandHandler *
const Commands::ArmFailSafe::DecodableType & commandData)
{
MATTER_TRACE_EVENT_SCOPE("ArmFailSafe", "GeneralCommissioning");
FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
Commands::ArmFailSafeResponse::Type response;

ChipLogProgress(FailSafe, "GeneralCommissioning: Received ArmFailSafe (%us)",
Expand Down Expand Up @@ -196,9 +196,7 @@ bool emberAfGeneralCommissioningClusterArmFailSafeCallback(app::CommandHandler *
}
else
{
CheckSuccess(
failSafeContext.ArmFailSafe(accessingFabricIndex, System::Clock::Seconds16(commandData.expiryLengthSeconds)),
Failure);
CheckSuccess(failSafeContext.ArmFailSafe(accessingFabricIndex, commandData.expiryLengthSeconds), Failure);
Breadcrumb::Set(commandPath.mEndpointId, commandData.breadcrumb);
response.errorCode = CommissioningError::kOk;
commandObj->AddResponse(commandPath, response);
Expand All @@ -219,9 +217,9 @@ bool emberAfGeneralCommissioningClusterCommissioningCompleteCallback(
{
MATTER_TRACE_EVENT_SCOPE("CommissioningComplete", "GeneralCommissioning");

DeviceControlServer * server = &DeviceLayer::DeviceControlServer::DeviceControlSvr();
auto & failSafe = server->GetFailSafeContext();
auto & fabricTable = Server::GetInstance().GetFabricTable();
DeviceControlServer * devCtrl = &DeviceLayer::DeviceControlServer::DeviceControlSvr();
auto & failSafe = Server::GetInstance().GetFailSafeContext();
auto & fabricTable = Server::GetInstance().GetFabricTable();

ChipLogProgress(FailSafe, "GeneralCommissioning: Received CommissioningComplete");

Expand Down Expand Up @@ -267,7 +265,7 @@ bool emberAfGeneralCommissioningClusterCommissioningCompleteCallback(
*/
failSafe.DisarmFailSafe();
CheckSuccess(
server->PostCommissioningCompleteEvent(handle->AsSecureSession()->GetPeerNodeId(), handle->GetFabricIndex()),
devCtrl->PostCommissioningCompleteEvent(handle->AsSecureSession()->GetPeerNodeId(), handle->GetFabricIndex()),
Failure);

Breadcrumb::Set(commandPath.mEndpointId, 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <app/CommandHandlerInterface.h>
#include <app/InteractionModelEngine.h>
#include <app/clusters/general-commissioning-server/general-commissioning-server.h>
#include <app/server/Server.h>
#include <app/util/attribute-storage.h>
#include <lib/support/SafeInt.h>
#include <lib/support/ThreadOperationalDataset.h>
Expand Down Expand Up @@ -301,7 +302,7 @@ void FillDebugTextAndNetworkIndex(Commands::NetworkConfigResponse::Type & respon

bool CheckFailSafeArmed(CommandHandlerInterface::HandlerContext & ctx)
{
DeviceLayer::FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = chip::Server::GetInstance().GetFailSafeContext();

if (failSafeContext.IsFailSafeArmed(ctx.mCommandHandler.GetAccessingFabricIndex()))
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,13 @@
#include <lib/support/ScopedBuffer.h>
#include <lib/support/logging/CHIPLogging.h>
#include <platform/CHIPDeviceLayer.h>
#include <platform/DeviceControlServer.h>
#include <string.h>
#include <trace/trace.h>
#if CHIP_CRYPTO_HSM
#include <crypto/hsm/CHIPCryptoPALHsm.h>
#endif

using namespace chip;
using namespace ::chip::DeviceLayer;
using namespace ::chip::Transport;
using namespace chip::app;
using namespace chip::app::Clusters;
Expand Down Expand Up @@ -339,14 +337,6 @@ void FailSafeCleanup(const chip::DeviceLayer::ChipDeviceEvent * event)
ChipLogError(Zcl, "OpCreds: failed to delete fabric at index %u: %" CHIP_ERROR_FORMAT, fabricIndex, err.Format());
}
}

// If an UpdateNOC command had been successfully invoked, revert the state of operational key pair, NOC and ICAC for that
// Fabric to the state prior to the Fail-Safe timer being armed, for the Fabric Index that was the subject of the UpdateNOC
// command.
if (event->FailSafeTimerExpired.updateNocCommandHasBeenInvoked)
{
// TODO: Revert the state of operational key pair, NOC and ICAC
}
}

void OnPlatformEventHandler(const chip::DeviceLayer::ChipDeviceEvent * event, intptr_t arg)
Expand Down Expand Up @@ -650,8 +640,8 @@ bool emberAfOperationalCredentialsClusterAddNOCCallback(app::CommandHandler * co
FabricInfo * newFabricInfo = nullptr;
auto & fabricTable = Server::GetInstance().GetFabricTable();

auto * secureSession = commandObj->GetExchangeContext()->GetSessionHandle()->AsSecureSession();
FailSafeContext & failSafeContext = DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto * secureSession = commandObj->GetExchangeContext()->GetSessionHandle()->AsSecureSession();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();

uint8_t compressed_fabric_id_buffer[sizeof(uint64_t)];
MutableByteSpan compressed_fabric_id(compressed_fabric_id_buffer);
Expand Down Expand Up @@ -737,8 +727,7 @@ bool emberAfOperationalCredentialsClusterAddNOCCallback(app::CommandHandler * co

// The Fabric Index associated with the armed fail-safe context SHALL be updated to match the Fabric
// Index just allocated.
err = failSafeContext.SetAddNocCommandInvoked(newFabricIndex);
VerifyOrExit(err == CHIP_NO_ERROR, nonDefaultStatus = Status::Failure);
failSafeContext.SetAddNocCommandInvoked(newFabricIndex);

// Done all intermediate steps, we are now successful
needRevert = false;
Expand Down Expand Up @@ -816,16 +805,15 @@ bool emberAfOperationalCredentialsClusterUpdateNOCCallback(app::CommandHandler *

auto nocResponse = OperationalCertStatus::kSuccess;
auto nonDefaultStatus = Status::Success;
bool needRevert = false;

CHIP_ERROR err = CHIP_NO_ERROR;
FabricIndex fabricIndex = 0;

ChipLogProgress(Zcl, "OpCreds: Received an UpdateNOC command");

auto & fabricTable = Server::GetInstance().GetFabricTable();
FailSafeContext & failSafeContext = DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
FabricInfo * fabricInfo = RetrieveCurrentFabric(commandObj);
auto & fabricTable = Server::GetInstance().GetFabricTable();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
FabricInfo * fabricInfo = RetrieveCurrentFabric(commandObj);

bool csrWasForUpdateNoc = false; //< Output param of HasPendingOperationalKey
bool hasPendingKey = fabricTable.HasPendingOperationalKey(csrWasForUpdateNoc);
Expand All @@ -849,15 +837,8 @@ bool emberAfOperationalCredentialsClusterUpdateNOCCallback(app::CommandHandler *
err = fabricTable.UpdatePendingFabricWithOperationalKeystore(fabricIndex, NOCValue, ICACValue.ValueOr(ByteSpan{}));
VerifyOrExit(err == CHIP_NO_ERROR, nocResponse = ConvertToNOCResponseStatus(err));

// From here if we error-out, we should revert the fabric table pending updates
needRevert = true;

// Flag on the fail-safe context that the UpdateNOC command was invoked.
err = failSafeContext.SetUpdateNocCommandInvoked();
VerifyOrExit(err == CHIP_NO_ERROR, nocResponse = ConvertToNOCResponseStatus(err));

// Done all intermediate steps, we are now successful
needRevert = false;
failSafeContext.SetUpdateNocCommandInvoked();

// We might have a new operational identity, so we should start advertising
// it right away. Also, we need to withdraw our old operational identity.
Expand All @@ -866,11 +847,6 @@ bool emberAfOperationalCredentialsClusterUpdateNOCCallback(app::CommandHandler *

// Attribute notification was already done by fabric table
exit:
if (needRevert)
{
fabricTable.RevertPendingOpCertsExceptRoot();
}

// We have an NOC response
if (nonDefaultStatus == Status::Success)
{
Expand Down Expand Up @@ -1039,8 +1015,8 @@ bool emberAfOperationalCredentialsClusterCSRRequestCallback(app::CommandHandler
// logs by the end. We use finalStatus as our overall success marker, not error
CHIP_ERROR err = CHIP_ERROR_INVALID_ARGUMENT;

auto & fabricTable = Server::GetInstance().GetFabricTable();
FailSafeContext & failSafeContext = DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & fabricTable = Server::GetInstance().GetFabricTable();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();

auto & CSRNonce = commandData.CSRNonce;
bool isForUpdateNoc = commandData.isForUpdateNOC.ValueOr(false);
Expand Down Expand Up @@ -1157,8 +1133,8 @@ bool emberAfOperationalCredentialsClusterAddTrustedRootCertificateCallback(
// logs by the end. We use finalStatus as our overall success marker, not error
CHIP_ERROR err = CHIP_ERROR_INVALID_ARGUMENT;

auto & rootCertificate = commandData.rootCertificate;
FailSafeContext & failSafeContext = DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & rootCertificate = commandData.rootCertificate;
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();

ChipLogProgress(Zcl, "OpCreds: Received an AddTrustedRootCertificate command");

Expand Down
2 changes: 2 additions & 0 deletions src/app/server/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ static_library("server") {
"Dnssd.h",
"EchoHandler.cpp",
"EchoHandler.h",
"FailSafeContext.cpp",
"FailSafeContext.h",
"OnboardingCodesUtil.cpp",
"OnboardingCodesUtil.h",
"Server.cpp",
Expand Down
8 changes: 4 additions & 4 deletions src/app/server/CommissioningWindowManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ void CommissioningWindowManager::OnSessionEstablished(const SessionHandle & sess

StopAdvertisement(/* aShuttingDown = */ false);

DeviceLayer::FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
// This should never be armed because we don't allow CASE sessions to arm the failsafe when the commissioning window is open and
// we check that the failsafe is not armed before opening the commissioning window. None the less, it is good to double-check.
CHIP_ERROR err = CHIP_NO_ERROR;
Expand All @@ -170,7 +170,7 @@ void CommissioningWindowManager::OnSessionEstablished(const SessionHandle & sess
}
else
{
err = failSafeContext.ArmFailSafe(kUndefinedFabricId, System::Clock::Seconds16(60));
err = failSafeContext.ArmFailSafe(kUndefinedFabricId, 60);
if (err != CHIP_NO_ERROR)
{
ChipLogError(AppServer, "Error arming failsafe on PASE session establishment completion");
Expand All @@ -194,7 +194,7 @@ CHIP_ERROR CommissioningWindowManager::OpenCommissioningWindow(Seconds16 commiss
{
VerifyOrReturnError(commissioningTimeout <= MaxCommissioningTimeout() && commissioningTimeout >= MinCommissioningTimeout(),
CHIP_ERROR_INVALID_ARGUMENT);
DeviceLayer::FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
VerifyOrReturnError(!failSafeContext.IsFailSafeArmed(), CHIP_ERROR_INCORRECT_STATE);

ReturnErrorOnFailure(Dnssd::ServiceAdvertiser::Instance().UpdateCommissionableInstanceName());
Expand Down Expand Up @@ -471,7 +471,7 @@ void CommissioningWindowManager::OnSessionReleased()

void CommissioningWindowManager::ExpireFailSafeIfArmed()
{
DeviceLayer::FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
if (failSafeContext.IsFailSafeArmed())
{
failSafeContext.ForceFailSafeTimerExpiry();
Expand Down
113 changes: 10 additions & 103 deletions src/platform/FailSafeContext.cpp → src/app/server/FailSafeContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,15 @@
* Provides the implementation of the FailSafeContext object.
*/

#include <platform/FailSafeContext.h>

#include <lib/support/DefaultStorageKeyAllocator.h>
#include <lib/support/SafeInt.h>
#include <platform/ConfigurationManager.h>
#include <platform/internal/CHIPDeviceLayerInternal.h>

namespace chip {
namespace DeviceLayer {
#include "FailSafeContext.h"

using namespace chip::DeviceLayer;

namespace {
constexpr TLV::Tag kFabricIndexTag = TLV::ContextTag(0);
constexpr TLV::Tag kAddNocCommandTag = TLV::ContextTag(1);
constexpr TLV::Tag kUpdateNocCommandTag = TLV::ContextTag(2);
} // anonymous namespace
namespace chip {
namespace app {

void FailSafeContext::HandleArmFailSafeTimer(System::Layer * layer, void * aAppState)
{
Expand Down Expand Up @@ -89,7 +84,7 @@ void FailSafeContext::ScheduleFailSafeCleanup(FabricIndex fabricIndex, bool addN
PlatformMgr().ScheduleWork(HandleDisarmFailSafe, reinterpret_cast<intptr_t>(this));
}

CHIP_ERROR FailSafeContext::ArmFailSafe(FabricIndex accessingFabricIndex, System::Clock::Timeout expiryLength)
CHIP_ERROR FailSafeContext::ArmFailSafe(FabricIndex accessingFabricIndex, uint16_t expiryLengthSeconds)
bzbarsky-apple marked this conversation as resolved.
Show resolved Hide resolved
{
CHIP_ERROR err = CHIP_NO_ERROR;
bool cancelTimersIfError = false;
Expand All @@ -100,9 +95,8 @@ CHIP_ERROR FailSafeContext::ArmFailSafe(FabricIndex accessingFabricIndex, System
cancelTimersIfError = true;
}

SuccessOrExit(err = DeviceLayer::SystemLayer().StartTimer(expiryLength, HandleArmFailSafeTimer, this));
SuccessOrExit(err = CommitToStorage());
SuccessOrExit(err = ConfigurationMgr().SetFailSafeArmed(true));
SuccessOrExit(
err = DeviceLayer::SystemLayer().StartTimer(System::Clock::Seconds16(expiryLengthSeconds), HandleArmFailSafeTimer, this));

mFailSafeArmed = true;
mFabricIndex = accessingFabricIndex;
Expand All @@ -124,96 +118,9 @@ void FailSafeContext::DisarmFailSafe()

ResetState();

if (ConfigurationMgr().SetFailSafeArmed(false) != CHIP_NO_ERROR)
{
ChipLogError(FailSafe, "Failed to set FailSafeArmed config to false");
}

if (DeleteFromStorage() != CHIP_NO_ERROR)
{
ChipLogError(FailSafe, "Failed to delete FailSafeContext from configuration");
}

ChipLogProgress(FailSafe, "Fail-safe cleanly disarmed");
}

CHIP_ERROR FailSafeContext::SetAddNocCommandInvoked(FabricIndex nocFabricIndex)
{
mAddNocCommandHasBeenInvoked = true;
mFabricIndex = nocFabricIndex;

ReturnErrorOnFailure(CommitToStorage());

return CHIP_NO_ERROR;
}

CHIP_ERROR FailSafeContext::SetUpdateNocCommandInvoked()
{
mUpdateNocCommandHasBeenInvoked = true;

ReturnErrorOnFailure(CommitToStorage());

return CHIP_NO_ERROR;
}

CHIP_ERROR FailSafeContext::CommitToStorage()
{
DefaultStorageKeyAllocator keyAlloc;
uint8_t buf[FailSafeContextTLVMaxSize()];
TLV::TLVWriter writer;
writer.Init(buf);

TLV::TLVType outerType;
ReturnErrorOnFailure(writer.StartContainer(TLV::AnonymousTag(), TLV::kTLVType_Structure, outerType));
ReturnErrorOnFailure(writer.Put(kFabricIndexTag, mFabricIndex));

// TODO: Stop storing this, and just make the fail-safe context volatile and sweep-up stale data next boot on partial commits
ReturnErrorOnFailure(writer.Put(kAddNocCommandTag, mAddNocCommandHasBeenInvoked));
ReturnErrorOnFailure(writer.Put(kUpdateNocCommandTag, mUpdateNocCommandHasBeenInvoked));
ReturnErrorOnFailure(writer.EndContainer(outerType));

const auto failSafeContextTLVLength = writer.GetLengthWritten();
VerifyOrReturnError(CanCastTo<uint16_t>(failSafeContextTLVLength), CHIP_ERROR_BUFFER_TOO_SMALL);

return PersistedStorage::KeyValueStoreMgr().Put(keyAlloc.FailSafeContextKey(), buf,
static_cast<uint16_t>(failSafeContextTLVLength));
}

CHIP_ERROR FailSafeContext::LoadFromStorage(FabricIndex & fabricIndex, bool & addNocCommandInvoked, bool & updateNocCommandInvoked)
{
DefaultStorageKeyAllocator keyAlloc;
uint8_t buf[FailSafeContextTLVMaxSize()];
ReturnErrorOnFailure(PersistedStorage::KeyValueStoreMgr().Get(keyAlloc.FailSafeContextKey(), buf, sizeof(buf)));

TLV::ContiguousBufferTLVReader reader;
reader.Init(buf, sizeof(buf));
ReturnErrorOnFailure(reader.Next(TLV::kTLVType_Structure, TLV::AnonymousTag()));

TLV::TLVType containerType;
ReturnErrorOnFailure(reader.EnterContainer(containerType));

ReturnErrorOnFailure(reader.Next(kFabricIndexTag));
ReturnErrorOnFailure(reader.Get(fabricIndex));

ReturnErrorOnFailure(reader.Next(kAddNocCommandTag));
ReturnErrorOnFailure(reader.Get(addNocCommandInvoked));

ReturnErrorOnFailure(reader.Next(kUpdateNocCommandTag));
ReturnErrorOnFailure(reader.Get(updateNocCommandInvoked));

ReturnErrorOnFailure(reader.VerifyEndOfContainer());
ReturnErrorOnFailure(reader.ExitContainer(containerType));

return CHIP_NO_ERROR;
}

CHIP_ERROR FailSafeContext::DeleteFromStorage()
{
DefaultStorageKeyAllocator keyAlloc;

return PersistedStorage::KeyValueStoreMgr().Delete(keyAlloc.FailSafeContextKey());
}

void FailSafeContext::ForceFailSafeTimerExpiry()
{
if (!IsFailSafeArmed())
Expand All @@ -228,5 +135,5 @@ void FailSafeContext::ForceFailSafeTimerExpiry()
FailSafeTimerExpired();
}

} // namespace DeviceLayer
} // namespace app
} // namespace chip
Loading