From fc29093ad2fa41c2333a9f0c90a07358eeb8682a Mon Sep 17 00:00:00 2001
From: Tiago Castro
Date: Thu, 7 Mar 2024 15:47:10 +0000
Subject: [PATCH 1/3] fix(agents/ha/node): disconnect controller async

The nvmeadm crate we depend on is blocking, and certain operations, such
as disconnecting the controller, can take quite some time. For now we
work around this by running the disconnect on a tokio blocking task.

todo: Ideally we should either make nvmeadm async or somehow provide an
async alternative, which won't be great as the code will likely get
duplicated...

Signed-off-by: Tiago Castro
---
 .../agents/src/bin/ha/node/server.rs | 93 +++++++++----------
 1 file changed, 45 insertions(+), 48 deletions(-)

diff --git a/control-plane/agents/src/bin/ha/node/server.rs b/control-plane/agents/src/bin/ha/node/server.rs
index d2241d978..308dd9fbd 100755
--- a/control-plane/agents/src/bin/ha/node/server.rs
+++ b/control-plane/agents/src/bin/ha/node/server.rs
@@ -97,52 +97,48 @@ impl NodeAgentSvc {
 }
 
 /// Disconnect cached NVMe controller.
-fn disconnect_controller(ctrlr: &NvmeController, new_path: String) -> Result<(), SvcError> {
-    let parsed_path = parse_uri(new_path.as_str())?;
-
-    match get_nvme_path_entry(&ctrlr.path) {
+async fn disconnect_controller(ctrlr: &NvmeController, new_path: String) -> Result<(), SvcError> {
+    let new_path_uri = parse_uri(new_path.as_str())?;
+    let path = &ctrlr.path;
+    match get_nvme_path_entry(path) {
         Some(pbuf) => {
             let subsystem = Subsystem::new(pbuf.path()).map_err(|_| SvcError::NotFound {
                 kind: ResourceKind::NvmeSubsystem,
-                id: ctrlr.path.to_owned(),
+                id: path.to_owned(),
             })?;
 
             // sanity check to make sure this information is still up to date!
-            if subsystem.nqn != parsed_path.nqn {
+            if subsystem.nqn != new_path_uri.nqn {
                 return Err(SvcError::UnexpectedSubsystemNqn {
                     nqn: subsystem.nqn,
-                    expected_nqn: parsed_path.nqn,
+                    expected_nqn: new_path_uri.nqn,
                     path: subsystem.name,
                 });
             }
 
             if subsystem
                 .address
-                .match_host_port(parsed_path.host(), &parsed_path.port().to_string())
+                .match_host_port(new_path_uri.host(), &new_path_uri.port().to_string())
             {
-                tracing::info!(path = ctrlr.path, "Not disconnecting same NVMe controller");
+                tracing::info!(path, "Not disconnecting same NVMe controller");
                 Ok(())
             } else {
-                tracing::info!(path = ctrlr.path, "Disconnecting NVMe controller");
+                tracing::info!(path, "Disconnecting NVMe controller");
                 // clarification: we're not disconnecting the subsystem, but rather the controller
-                subsystem.disconnect().map_err(|e| SvcError::Internal {
-                    details: format!(
-                        "Failed to disconnect NVMe controller {}: {:?}",
-                        ctrlr.path, e,
-                    ),
+                tokio::task::block_in_place(move || subsystem.disconnect()).map_err(|error| {
+                    SvcError::Internal {
+                        details: format!("Failed to disconnect NVMe controller {path}: {error:?}"),
+                    }
                 })
             }
         }
         None => {
-            tracing::error!(
-                path = ctrlr.path,
-                "Failed to get system path for controller"
-            );
+            tracing::error!(path, "Failed to get system path for controller");
 
             Err(SvcError::NotFound {
                 kind: ResourceKind::NvmePath,
-                id: ctrlr.path.to_owned(),
+                id: path.to_owned(),
             })
         }
     }
@@ -193,14 +189,12 @@ impl NodeAgentSvc {
             ) {
                 Ok(subsystem) => {
                     tracing::info!(new_path, "Successfully connected to NVMe target");
-                    subsystem
-                }
-                Err(error) => {
-                    return Err(SubsystemNotFound {
-                        nqn,
-                        details: error.to_string(),
-                    });
+                    Ok(subsystem)
                 }
+                Err(error) => Err(SubsystemNotFound {
+                    nqn: nqn.clone(),
+                    details: error.to_string(),
+                }),
             },
             Err(error) => {
                 tracing::error!(
                     new_path,
                     %error,
                    "Failed to connect to new NVMe target"
                 );
-                let nvme_err = format!(
-                    "Failed to connect to new NVMe target: {}, new path: {}, nqn: {}",
-                    error.full_string(),
-                    new_path,
-                    nqn
+                let details = format!(
+                    "Failed to connect to new NVMe target: {error}, new path: {new_path}, nqn: {nqn}",
+                    error = error.full_string(),
                 );
-                return Err(SvcError::NvmeConnectError { details: nvme_err });
+                Err(SvcError::NvmeConnectError { details })
             }
-        };
+        }?;
 
         let now = Instant::now();
         let timeout = self.path_connection_timeout;
@@ -301,16 +293,21 @@ impl NodeAgentSvc {
                 }
             };
 
-            // Remove the subsystem.
-            if let Err(error) = curr_subsystem.disconnect() {
-                tracing::error!(
-                    ?curr_subsystem,
-                    ?error,
-                    "Failed to disconnect NVMe subsystem"
-                );
-            } else {
-                tracing::info!(?curr_subsystem, "NVMe subsystem successfully disconnected");
-            }
+            tokio::task::block_in_place(move || {
+                // Remove the subsystem.
+                if let Err(error) = curr_subsystem.disconnect() {
+                    tracing::error!(
+                        ?curr_subsystem,
+                        ?error,
+                        "Failed to disconnect NVMe subsystem"
+                    );
+                } else {
+                    tracing::info!(
+                        ?curr_subsystem,
+                        "NVMe subsystem successfully disconnected"
+                    );
+                }
+            });
         } else {
             tracing::info!(?subsystem, "Keeping NVMe subsystem after connect timeout");
         }
@@ -330,9 +327,9 @@ impl NodeAgentOperations for NodeAgentSvc {
         request: &dyn ReplacePathInfo,
         _context: Option,
     ) -> Result<(), ReplyError> {
-        tracing::info!("Replacing failed NVMe path: {:?}", request);
+        tracing::info!("Replacing failed NVMe path: {request:?}");
         // Lookup NVMe controller whose path has failed.
-        let ctrlr = self
+        let ctrlrs = self
             .path_cache
             .lookup_controllers(request.target_nqn())
             .await
@@ -359,8 +356,8 @@ impl NodeAgentOperations for NodeAgentSvc {
         // path has been successfully created, so having the first failed path in addition
         // to the second healthy one is OK: just display a warning and proceed as if
         // the call has completed successfully.
-        for ctrl in ctrlr {
-            if let Err(error) = disconnect_controller(&ctrl, request.new_path()) {
+        for ctrl in ctrlrs {
+            if let Err(error) = disconnect_controller(&ctrl, request.new_path()).await {
                 tracing::warn!(
                     uri=%request.new_path(),
                     %error,

From fd81be5a493377c47d3a71a28b3b35b767f7ba6b Mon Sep 17 00:00:00 2001
From: Tiago Castro
Date: Thu, 7 Mar 2024 19:28:20 +0000
Subject: [PATCH 2/3] fix(agents/ha/cluster): increase path replacement timeout

In a system test run we've seen the disconnect of an old path take 50s to
complete. We're not sure why this is the case, as we cannot reproduce it
yet, but for now we can increase the timeout. This is probably fine: if we
can't talk to the node where the app is, then we can't do much more anyway.

We should also consider whether we need to wait for the disconnect at all.
If we can find out that the disconnect has been started, then we can
probably just complete it asynchronously and succeed the replace!
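For illustration only, a minimal sketch of that idea (the helper name and
error handling below are placeholders, not code from this patch) could hand
the blocking nvmeadm disconnect to tokio's blocking thread pool and return
without awaiting the result:

    // Hypothetical sketch: fire-and-forget the blocking disconnect so the
    // path replacement does not have to wait for it to complete.
    fn spawn_disconnect(subsystem: Subsystem) {
        // `Subsystem` is the same nvmeadm type used above; it is moved into
        // the blocking task and dropped once the disconnect finishes.
        tokio::task::spawn_blocking(move || {
            if let Err(error) = subsystem.disconnect() {
                tracing::warn!(?error, "Background NVMe disconnect failed");
            }
        });
    }

The trade-off is that the disconnect result is no longer reported to the
caller, so a failed teardown would only show up in the logs.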
Signed-off-by: Tiago Castro
---
 control-plane/stor-port/src/transport_api/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/control-plane/stor-port/src/transport_api/mod.rs b/control-plane/stor-port/src/transport_api/mod.rs
index 243f52e57..d6dab6e8b 100644
--- a/control-plane/stor-port/src/transport_api/mod.rs
+++ b/control-plane/stor-port/src/transport_api/mod.rs
@@ -524,7 +524,7 @@ impl Default for RequestMinTimeout {
             pool: Duration::from_secs(20),
             nexus_shutdown: Duration::from_secs(15),
             nexus_snapshot: Duration::from_secs(30),
-            nvme_reconnect: Duration::from_secs(12),
+            nvme_reconnect: Duration::from_secs(62),
         }
     }
 }

From 03033b848830fef0e01841f9848ac319715246c7 Mon Sep 17 00:00:00 2001
From: Tiago Castro
Date: Fri, 8 Mar 2024 13:57:05 +0000
Subject: [PATCH 3/3] fix(agent/core): increase pool import timeout

Signed-off-by: Tiago Castro
---
 control-plane/grpc/src/context.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/control-plane/grpc/src/context.rs b/control-plane/grpc/src/context.rs
index 66057dbd7..d23cd6057 100644
--- a/control-plane/grpc/src/context.rs
+++ b/control-plane/grpc/src/context.rs
@@ -67,7 +67,7 @@ pub fn timeout_grpc(op_id: MessageId, timeout_opts: TimeoutOptions) -> Duration
         MessageIdVs::DestroyReplicaSnapshot => min_timeouts.replica_snapshot(),
 
         MessageIdVs::CreatePool => min_timeouts.pool(),
-        MessageIdVs::ImportPool => min_timeouts.pool(),
+        MessageIdVs::ImportPool => min_timeouts.pool() * 3,
         MessageIdVs::DestroyPool => min_timeouts.pool(),
 
         MessageIdVs::ReplacePathInfo => min_timeouts.nvme_reconnect(),