From 58a343cb606be8af5e8af5d4b78c2d711c03d1e8 Mon Sep 17 00:00:00 2001 From: Tennessee Carmel-Veilleux Date: Thu, 7 Jul 2022 16:22:36 -0400 Subject: [PATCH] Fix UpdateNOC session invalidation and opcreds timing - UpdateNOC did not clear session of previous fabric like spec intended (#20379) - All OpCreds cluster slow commands did not try to early-ack to avoid MRP timeouts (#19132) Fixes #20379 Fixes #19132 This PR: - Fixes UpdateNOC expiring all sessions for the updated node - Adds `FlushAcksRightAwayOnSlowCommand` to CommandHandler to flush acks on slow commands - Adds Python tests for UpdateNOC behavior of session expiring - Adds `ExpireSessions` to Python for testing Testing done: - Unit tests all pass - Cert tests pass - With the session clearing, previous Python tests failed, until I fixed them with the new `ExpireSessions` API - Observed standalone acks immediately sent on opcreds cluster commands --- .../chip-tool/commands/tests/TestCommand.cpp | 2 +- src/app/CommandHandler.h | 19 ++++++ .../operational-credentials-server.cpp | 60 ++++++++++++++----- src/app/server/CommissioningWindowManager.cpp | 2 +- src/controller/CHIPDeviceController.cpp | 2 +- src/controller/CHIPDeviceController.h | 3 + .../ChipDeviceController-ScriptBinding.cpp | 11 ++++ src/controller/python/chip/ChipDeviceCtrl.py | 18 ++++++ .../python/test/test_scripts/base.py | 2 + src/messaging/ExchangeContext.cpp | 2 +- src/transport/SessionManager.cpp | 14 ++--- src/transport/SessionManager.h | 6 +- 12 files changed, 113 insertions(+), 28 deletions(-) diff --git a/examples/chip-tool/commands/tests/TestCommand.cpp b/examples/chip-tool/commands/tests/TestCommand.cpp index 89a3d00135e510..64a833ab3176d7 100644 --- a/examples/chip-tool/commands/tests/TestCommand.cpp +++ b/examples/chip-tool/commands/tests/TestCommand.cpp @@ -41,7 +41,7 @@ CHIP_ERROR TestCommand::WaitForCommissionee(const char * identity, // or is just starting out fresh outright. Let's make sure we're not re-using any cached CASE sessions // that will now be stale and mismatched with the peer, causing subsequent interactions to fail. // - GetCommissioner(identity).SessionMgr()->ExpireAllPairings(chip::ScopedNodeId(value.nodeId, fabricIndex)); + GetCommissioner(identity).SessionMgr()->ExpireAllSessions(chip::ScopedNodeId(value.nodeId, fabricIndex)); SetIdentity(identity); return GetCommissioner(identity).GetConnectedDevice(value.nodeId, &mOnDeviceConnectedCallback, diff --git a/src/app/CommandHandler.h b/src/app/CommandHandler.h index e983e15a2feee9..200feee67a1347 100644 --- a/src/app/CommandHandler.h +++ b/src/app/CommandHandler.h @@ -227,6 +227,25 @@ class CommandHandler */ Messaging::ExchangeContext * GetExchangeContext() const { return mpExchangeCtx; } + /** + * @brief Flush acks right away for a slow command + * + * Some commands that do heavy lifting of storage/crypto should + * ack right away to improve reliability and reduce needless retries. This + * method can be manually called in commands that are especially slow to + * immediately schedule an acknowledgement (if needed) since the delayed + * stand-alone ack timer may actually not hit soon enough due to blocking command + * execution. + * + */ + void FlushAcksRightAwayOnSlowCommand() + { + VerifyOrReturn(mpExchangeCtx != nullptr); + auto * msgContext = mpExchangeCtx->GetReliableMessageContext(); + VerifyOrReturn(msgContext != nullptr); + msgContext->FlushAcks(); + } + Access::SubjectDescriptor GetSubjectDescriptor() const { return mpExchangeCtx->GetSessionHandle()->GetSubjectDescriptor(); } private: diff --git a/src/app/clusters/operational-credentials-server/operational-credentials-server.cpp b/src/app/clusters/operational-credentials-server/operational-credentials-server.cpp index 3127a3f75b0c08..19c68ba82f9f9d 100644 --- a/src/app/clusters/operational-credentials-server/operational-credentials-server.cpp +++ b/src/app/clusters/operational-credentials-server/operational-credentials-server.cpp @@ -264,7 +264,7 @@ CHIP_ERROR DeleteFabricFromTable(FabricIndex fabricIndex) void CleanupSessionsForFabric(SessionManager & sessionMgr, FabricIndex fabricIndex) { InteractionModelEngine::GetInstance()->CloseTransactionsFromFabricIndex(fabricIndex); - sessionMgr.ExpireAllPairingsForFabric(fabricIndex); + sessionMgr.ExpireAllSessionsForFabric(fabricIndex); } void FailSafeCleanup(const chip::DeviceLayer::ChipDeviceEvent * event) @@ -278,13 +278,16 @@ void FailSafeCleanup(const chip::DeviceLayer::ChipDeviceEvent * event) // Session Context at the Server. if (event->FailSafeTimerExpired.addNocCommandHasBeenInvoked || event->FailSafeTimerExpired.updateNocCommandHasBeenInvoked) { - CASESessionManager * caseSessionManager = Server::GetInstance().GetCASESessionManager(); - if (caseSessionManager) + // TODO(#19259): The following scope will no longer need to exist after #19259 is fixed { - const FabricInfo * fabricInfo = Server::GetInstance().GetFabricTable().FindFabricWithIndex(fabricIndex); - VerifyOrReturn(fabricInfo != nullptr); + CASESessionManager * caseSessionManager = Server::GetInstance().GetCASESessionManager(); + if (caseSessionManager) + { + const FabricInfo * fabricInfo = Server::GetInstance().GetFabricTable().FindFabricWithIndex(fabricIndex); + VerifyOrReturn(fabricInfo != nullptr); - caseSessionManager->ReleaseSessionsForFabric(fabricInfo->GetFabricIndex()); + caseSessionManager->ReleaseSessionsForFabric(fabricInfo->GetFabricIndex()); + } } SessionManager & sessionMgr = Server::GetInstance().GetSecureSessionManager(); @@ -416,6 +419,8 @@ bool emberAfOperationalCredentialsClusterRemoveFabricCallback(app::CommandHandle return true; } + commandObj->FlushAcksRightAwayOnSlowCommand(); + CHIP_ERROR err = DeleteFabricFromTable(fabricBeingRemoved); SuccessOrExit(err); @@ -602,6 +607,7 @@ bool emberAfOperationalCredentialsClusterAddNOCCallback(app::CommandHandler * co auto nocResponse = OperationalCertStatus::kSuccess; auto nonDefaultStatus = Status::Success; bool needRevert = false; + int line = 0; CHIP_ERROR err = CHIP_NO_ERROR; FabricIndex newFabricIndex = kUndefinedFabricIndex; @@ -620,11 +626,11 @@ bool emberAfOperationalCredentialsClusterAddNOCCallback(app::CommandHandler * co ChipLogProgress(Zcl, "OpCreds: Received an AddNOC command"); - VerifyOrExit(NOCValue.size() <= Credentials::kMaxCHIPCertLength, nonDefaultStatus = Status::InvalidCommand); + VerifyOrExit(NOCValue.size() <= Credentials::kMaxCHIPCertLength, {nonDefaultStatus = Status::InvalidCommand; line = __LINE__; }); VerifyOrExit(!ICACValue.HasValue() || ICACValue.Value().size() <= Credentials::kMaxCHIPCertLength, - nonDefaultStatus = Status::InvalidCommand); - VerifyOrExit(ipkValue.size() == Crypto::CHIP_CRYPTO_SYMMETRIC_KEY_LENGTH_BYTES, nonDefaultStatus = Status::InvalidCommand); - VerifyOrExit(IsVendorIdValidOperationally(adminVendorId), nonDefaultStatus = Status::InvalidCommand); + {nonDefaultStatus = Status::InvalidCommand; line = __LINE__; }); + VerifyOrExit(ipkValue.size() == Crypto::CHIP_CRYPTO_SYMMETRIC_KEY_LENGTH_BYTES, {nonDefaultStatus = Status::InvalidCommand; line = __LINE__; }); + VerifyOrExit(IsVendorIdValidOperationally(adminVendorId), {nonDefaultStatus = Status::InvalidCommand; line = __LINE__; }); VerifyOrExit(failSafeContext.IsFailSafeArmed(commandObj->GetAccessingFabricIndex()), nonDefaultStatus = Status::FailsafeRequired); @@ -638,6 +644,9 @@ bool emberAfOperationalCredentialsClusterAddNOCCallback(app::CommandHandler * co // Internal error that would prevent IPK from being added VerifyOrExit(groupDataProvider != nullptr, nonDefaultStatus = Status::Failure); + // Flush acks before really slow work + commandObj->FlushAcksRightAwayOnSlowCommand(); + // TODO: Add support for calling AddNOC without a prior AddTrustedRootCertificate if // the root properly matches an existing one. @@ -758,7 +767,7 @@ bool emberAfOperationalCredentialsClusterAddNOCCallback(app::CommandHandler * co else { commandObj->AddStatus(commandPath, nonDefaultStatus); - ChipLogError(Zcl, "OpCreds: Failed AddNOC request with IM error 0x%02x", to_underlying(nonDefaultStatus)); + ChipLogError(Zcl, "OpCreds: Failed AddNOC request with IM error 0x%02x, line %d", to_underlying(nonDefaultStatus), line); } return true; @@ -803,6 +812,9 @@ bool emberAfOperationalCredentialsClusterUpdateNOCCallback(app::CommandHandler * VerifyOrExit(fabricInfo != nullptr, nocResponse = ConvertToNOCResponseStatus(CHIP_ERROR_INSUFFICIENT_PRIVILEGE)); fabricIndex = fabricInfo->GetFabricIndex(); + // Flush acks before really slow work + commandObj->FlushAcksRightAwayOnSlowCommand(); + err = fabricTable.UpdatePendingFabricWithOperationalKeystore(fabricIndex, NOCValue, ICACValue.ValueOr(ByteSpan{})); VerifyOrExit(err == CHIP_NO_ERROR, nocResponse = ConvertToNOCResponseStatus(err)); @@ -830,6 +842,17 @@ bool emberAfOperationalCredentialsClusterUpdateNOCCallback(app::CommandHandler * else { ChipLogProgress(Zcl, "OpCreds: UpdateNOC successful."); + + // On success, revoke all CASE sessions on the fabric hosting the exchange. + // From spec: + // + // All internal data reflecting the prior operational identifier of the Node within the Fabric + // SHALL be revoked and removed, to an outcome equivalent to the disappearance of the prior Node, + // except for the ongoing CASE session context, which SHALL temporarily remain valid until the + // `NOCResponse` has been successfully delivered or until the next transport-layer error, so + // that the response can be received by the Administrator invoking the command. + + commandObj->GetExchangeContext()->AbortAllOtherCommunicationOnFabric(); } } // No NOC response - Failed constraints @@ -921,6 +944,10 @@ bool emberAfOperationalCredentialsClusterAttestationRequestCallback(app::Command const ByteSpan kEmptyFirmwareInfo; ChipLogProgress(Zcl, "OpCreds: Received an AttestationRequest command"); + + // Flush acks before really slow work + commandObj->FlushAcksRightAwayOnSlowCommand(); + Credentials::DeviceAttestationCredentialsProvider * dacProvider = Credentials::GetDeviceAttestationCredentialsProvider(); VerifyOrExit(attestationNonce.size() == Credentials::kExpectedAttestationNonceSize, finalStatus = Status::InvalidCommand); @@ -1020,6 +1047,9 @@ bool emberAfOperationalCredentialsClusterCSRRequestCallback(app::CommandHandler VerifyOrExit(failSafeContext.IsFailSafeArmed(commandObj->GetAccessingFabricIndex()), finalStatus = Status::FailsafeRequired); VerifyOrExit(!failSafeContext.NocCommandHasBeenInvoked(), finalStatus = Status::ConstraintError); + // Flush acks before really slow work + commandObj->FlushAcksRightAwayOnSlowCommand(); + // Prepare NOCSRElements structure { constexpr size_t csrLength = Crypto::kMAX_CSR_Length; @@ -1143,10 +1173,12 @@ bool emberAfOperationalCredentialsClusterAddTrustedRootCertificateCallback( // be useful in the context. VerifyOrExit(!failSafeContext.NocCommandHasBeenInvoked(), finalStatus = Status::ConstraintError); - // TODO: Handle checking for byte-to-byte match with existing fabrics - // before allowing the add + // Flush acks before really slow work + commandObj->FlushAcksRightAwayOnSlowCommand(); + + // TODO(#17208): Handle checking for byte-to-byte match with existing fabrics before allowing the add - // TODO: Validate cert signature prior to setting. + // TODO(#17208): Validate cert signature prior to setting. err = fabricTable.AddNewPendingTrustedRootCert(rootCertificate); VerifyOrExit(err != CHIP_ERROR_NO_MEMORY, finalStatus = Status::ResourceExhausted); diff --git a/src/app/server/CommissioningWindowManager.cpp b/src/app/server/CommissioningWindowManager.cpp index 4aa945c860805e..039bf7da5105d3 100644 --- a/src/app/server/CommissioningWindowManager.cpp +++ b/src/app/server/CommissioningWindowManager.cpp @@ -55,7 +55,7 @@ void CommissioningWindowManager::OnPlatformEvent(const DeviceLayer::ChipDeviceEv DeviceLayer::SystemLayer().CancelTimer(HandleCommissioningWindowTimeout, this); mCommissioningTimeoutTimerArmed = false; Cleanup(); - mServer->GetSecureSessionManager().ExpireAllPASEPairings(); + mServer->GetSecureSessionManager().ExpireAllPASESessions(); // That should have cleared out mPASESession. #if CONFIG_NETWORK_LAYER_BLE mServer->GetBleLayerObject()->CloseAllBleConnections(); diff --git a/src/controller/CHIPDeviceController.cpp b/src/controller/CHIPDeviceController.cpp index a1dec221a8f0bd..2991a32dbbb3cb 100644 --- a/src/controller/CHIPDeviceController.cpp +++ b/src/controller/CHIPDeviceController.cpp @@ -298,7 +298,7 @@ void DeviceController::Shutdown() // sessions. It just shuts down any ongoing CASE session establishment // we're in the middle of as initiator. Maybe it should shut down // existing sessions too? - mSystemState->SessionMgr()->ExpireAllPairingsForFabric(mFabricIndex); + mSystemState->SessionMgr()->ExpireAllSessionsForFabric(mFabricIndex); FabricTable * fabricTable = mSystemState->Fabrics(); if (fabricTable != nullptr) diff --git a/src/controller/CHIPDeviceController.h b/src/controller/CHIPDeviceController.h index 158a30d620cb2c..3ac9f77ebfa2d7 100644 --- a/src/controller/CHIPDeviceController.h +++ b/src/controller/CHIPDeviceController.h @@ -288,11 +288,14 @@ class DLL_EXPORT DeviceController : public AbstractDnssdDiscoveryController return mSystemState->Fabrics(); } + // TODO(#20452): This should be removed/renamed once #20452 is fixed void ReleaseOperationalDevice(NodeId remoteNodeId); OperationalCredentialsDelegate * GetOperationalCredentialsDelegate() { return mOperationalCredentialsDelegate; } /** + * TODO(#20452): This needs to be refactored to reflect what it actually does (which is not disconnecting anything) + * * TEMPORARY - DO NOT USE or if you use please request review on why/how to * officially support such an API. * diff --git a/src/controller/python/ChipDeviceController-ScriptBinding.cpp b/src/controller/python/ChipDeviceController-ScriptBinding.cpp index 0e2342e3eca833..88874fa8be9b8a 100644 --- a/src/controller/python/ChipDeviceController-ScriptBinding.cpp +++ b/src/controller/python/ChipDeviceController-ScriptBinding.cpp @@ -186,6 +186,8 @@ ChipError::StorageType pychip_GetConnectedDeviceByNodeId(chip::Controller::Devic DeviceAvailableFunc callback); ChipError::StorageType pychip_GetDeviceBeingCommissioned(chip::Controller::DeviceCommissioner * devCtrl, chip::NodeId nodeId, CommissioneeDeviceProxy ** proxy); +ChipError::StorageType pychip_ExpireSessions(chip::Controller::DeviceCommissioner * devCtrl, chip::NodeId nodeId); + uint64_t pychip_GetCommandSenderHandle(chip::DeviceProxy * device); // CHIP Stack objects ChipError::StorageType pychip_BLEMgrImpl_ConfigureBle(uint32_t bluetoothAdapterId); @@ -639,6 +641,15 @@ ChipError::StorageType pychip_GetDeviceBeingCommissioned(chip::Controller::Devic return devCtrl->GetDeviceBeingCommissioned(nodeId, proxy).AsInteger(); } +// This is a method called VERY seldom, just for RemoveFabric/UpdateNOC +ChipError::StorageType pychip_ExpireSessions(chip::Controller::DeviceCommissioner * devCtrl, chip::NodeId nodeId) +{ + VerifyOrReturnError((devCtrl != nullptr) && (devCtrl->SessionMgr() != nullptr), CHIP_ERROR_INVALID_ARGUMENT.AsInteger()); + (void)devCtrl->ReleaseOperationalDevice(nodeId); + devCtrl->SessionMgr()->ExpireAllSessions(ScopedNodeId(nodeId, devCtrl->GetFabricIndex())); + return CHIP_NO_ERROR.AsInteger(); +} + ChipError::StorageType pychip_DeviceCommissioner_CloseBleConnection(chip::Controller::DeviceCommissioner * devCtrl) { #if CONFIG_NETWORK_LAYER_BLE diff --git a/src/controller/python/chip/ChipDeviceCtrl.py b/src/controller/python/chip/ChipDeviceCtrl.py index 0cfa53f3bd8002..4679bfd72d8d2b 100644 --- a/src/controller/python/chip/ChipDeviceCtrl.py +++ b/src/controller/python/chip/ChipDeviceCtrl.py @@ -270,6 +270,21 @@ def CloseBLEConnection(self): self.devCtrl) ) + def ExpireSessions(self, nodeid): + """Close all sessions with `nodeid` (if any existed) so that sessions get re-established. + + This is needed to properly handle operations that invalidate a node's state, such as + UpdateNOC. + + WARNING: ONLY CALL THIS IF YOU UNDERSTAND THE SIDE-EFFECTS + """ + self.CheckIsActive() + + res = self._ChipStack.Call(lambda: self._dmLib.pychip_ExpireSessions(self.devCtrl, nodeid)) + if res != 0: + raise self._ChipStack.ErrorToException(res) + + # TODO: This needs to be called MarkSessionDefunct def CloseSession(self, nodeid): self.CheckIsActive() @@ -1106,6 +1121,9 @@ def _InitLib(self): c_void_p, c_uint64, c_void_p] self._dmLib.pychip_GetDeviceBeingCommissioned.restype = c_uint32 + self._dmLib.pychip_ExpireSessions.argtypes = [c_void_p, c_uint64] + self._dmLib.pychip_ExpireSessions.restype = c_uint32 + self._dmLib.pychip_DeviceCommissioner_CloseBleConnection.argtypes = [ c_void_p] self._dmLib.pychip_DeviceCommissioner_CloseBleConnection.restype = c_uint32 diff --git a/src/controller/python/test/test_scripts/base.py b/src/controller/python/test/test_scripts/base.py index 1bceb3d7e748e8..d8dc8999ac7b5d 100644 --- a/src/controller/python/test/test_scripts/base.py +++ b/src/controller/python/test/test_scripts/base.py @@ -418,6 +418,7 @@ async def TestAddUpdateRemoveFabric(self, nodeid: int): return False await tempDevCtrl.SendCommand(nodeid, 0, Clusters.OperationalCredentials.Commands.UpdateNOC(chainForUpdateNOC.nocBytes, chainForUpdateNOC.icacBytes)) + tempDevCtrl.ExpireSessions(nodeid) await tempDevCtrl.SendCommand(nodeid, 0, Clusters.GeneralCommissioning.Commands.CommissioningComplete()) afterUpdateNocFabricCount = await self._GetCommissonedFabricCount(nodeid) if afterUpdateNocFabricCount != afterAddNocFabricCount: @@ -446,6 +447,7 @@ async def TestAddUpdateRemoveFabric(self, nodeid: int): self.logger.error("IssueNOCChain failed to generate valid cert chain for UpdateNOC with new node ID") return False updateNocResponse = await tempDevCtrl.SendCommand(nodeid, 0, Clusters.OperationalCredentials.Commands.UpdateNOC(chainForSecondUpdateNOC.nocBytes, chainForSecondUpdateNOC.icacBytes)) + tempDevCtrl.ExpireSessions(nodeid) await tempDevCtrl.SendCommand(newNodeIdForUpdateNoc, 0, Clusters.GeneralCommissioning.Commands.CommissioningComplete()) afterUpdateNocWithNewNodeIdFabricCount = await self._GetCommissonedFabricCount(nodeid) if afterUpdateNocFabricCount != afterUpdateNocWithNewNodeIdFabricCount: diff --git a/src/messaging/ExchangeContext.cpp b/src/messaging/ExchangeContext.cpp index 1e66101304a611..6cb36ec1a0e90d 100644 --- a/src/messaging/ExchangeContext.cpp +++ b/src/messaging/ExchangeContext.cpp @@ -588,7 +588,7 @@ void ExchangeContext::AbortAllOtherCommunicationOnFabric() SetIgnoreSessionRelease(true); - GetExchangeMgr()->GetSessionManager()->ExpireAllPairingsForFabric(mSession->GetFabricIndex()); + GetExchangeMgr()->GetSessionManager()->ExpireAllSessionsForFabric(mSession->GetFabricIndex()); mSession.GrabExpiredSession(session.Value()); diff --git a/src/transport/SessionManager.cpp b/src/transport/SessionManager.cpp index 8842c728b4c08e..3a7bd44a3ce38f 100644 --- a/src/transport/SessionManager.cpp +++ b/src/transport/SessionManager.cpp @@ -121,7 +121,7 @@ void SessionManager::Shutdown() /** * @brief Notification that a fabric was removed. - * This function doesn't call ExpireAllPairingsForFabric + * This function doesn't call ExpireAllSessionsForFabric * since the CASE session might still be open to send a response * on the removed fabric. */ @@ -342,7 +342,7 @@ CHIP_ERROR SessionManager::SendPreparedMessage(const SessionHandle & sessionHand return CHIP_ERROR_INCORRECT_STATE; } -void SessionManager::ExpireAllPairings(const ScopedNodeId & node) +void SessionManager::ExpireAllSessions(const ScopedNodeId & node) { mSecureSessions.ForEachSession([&](auto session) { if (session->GetPeer() == node) @@ -353,11 +353,11 @@ void SessionManager::ExpireAllPairings(const ScopedNodeId & node) }); } -void SessionManager::ExpireAllPairingsForFabric(FabricIndex fabric) +void SessionManager::ExpireAllSessionsForFabric(FabricIndex fabricIndex) { - ChipLogDetail(Inet, "Expiring all connections for fabric %d!!", fabric); + ChipLogDetail(Inet, "Expiring all sessions for fabric 0x%x!!", static_cast(fabricIndex)); mSecureSessions.ForEachSession([&](auto session) { - if (session->GetFabricIndex() == fabric) + if (session->GetFabricIndex() == fabricIndex) { session->MarkForEviction(); } @@ -365,9 +365,9 @@ void SessionManager::ExpireAllPairingsForFabric(FabricIndex fabric) }); } -void SessionManager::ExpireAllPASEPairings() +void SessionManager::ExpireAllPASESessions() { - ChipLogDetail(Inet, "Expiring all PASE pairings"); + ChipLogDetail(Inet, "Expiring all PASE sessions"); mSecureSessions.ForEachSession([&](auto session) { if (session->GetSecureSessionType() == Transport::SecureSession::Type::kPASE) { diff --git a/src/transport/SessionManager.h b/src/transport/SessionManager.h index 095c6d6dd60c5f..633d0e9eed0785 100644 --- a/src/transport/SessionManager.h +++ b/src/transport/SessionManager.h @@ -174,9 +174,9 @@ class DLL_EXPORT SessionManager : public TransportMgrDelegate, public FabricTabl Optional AllocateSession(Transport::SecureSession::Type secureSessionType, const ScopedNodeId & sessionEvictionHint); - void ExpireAllPairings(const ScopedNodeId & node); - void ExpireAllPairingsForFabric(FabricIndex fabric); - void ExpireAllPASEPairings(); + void ExpireAllSessions(const ScopedNodeId & node); + void ExpireAllSessionsForFabric(FabricIndex fabricIndex); + void ExpireAllPASESessions(); /** * @brief