diff --git a/io-engine/src/bdev/nexus/nexus_bdev.rs b/io-engine/src/bdev/nexus/nexus_bdev.rs
index 46b73117e..8e49f7a5b 100644
--- a/io-engine/src/bdev/nexus/nexus_bdev.rs
+++ b/io-engine/src/bdev/nexus/nexus_bdev.rs
@@ -1143,6 +1143,20 @@ impl<'n> Nexus<'n> {
         unsafe { Pin::new_unchecked(self.bdev_mut()) }
     }
 
+    /// Gets a nexus reference from an untyped bdev.
+    /// # Warning:
+    /// No checks are performed (e.g. bdev module name check), as it is assumed
+    /// that the provided bdev is a nexus bdev.
+    #[inline(always)]
+    pub(crate) unsafe fn unsafe_from_untyped_bdev(
+        bdev: spdk_rs::UntypedBdev,
+    ) -> &'n Nexus<'n> {
+        spdk_rs::Bdev::<Nexus<'n>>::unsafe_from_inner_ptr(
+            bdev.unsafe_inner_ptr() as *mut _,
+        )
+        .data()
+    }
+
     /// Sets the required alignment of the Nexus.
     pub(crate) unsafe fn set_required_alignment(
         self: Pin<&mut Self>,
diff --git a/io-engine/src/bdev/nexus/nexus_bdev_children.rs b/io-engine/src/bdev/nexus/nexus_bdev_children.rs
index 889cada33..2e5eca333 100644
--- a/io-engine/src/bdev/nexus/nexus_bdev_children.rs
+++ b/io-engine/src/bdev/nexus/nexus_bdev_children.rs
@@ -326,9 +326,10 @@ impl<'n> Nexus<'n> {
         // Close and remove the child.
         let res = match self.lookup_child(uri) {
             Some(child) => {
-                // Remove child from the I/O path.
+                // Detach the child from the I/O path, and close its handles.
                 if let Some(device) = child.get_device_name() {
-                    self.disconnect_device(&device).await;
+                    self.detach_device(&device).await;
+                    self.disconnect_all_detached_devices().await;
                 }
 
                 // Close child's device.
@@ -974,18 +975,39 @@ impl<'n> Nexus<'n> {
             return Ok(());
         }
 
-        // Disconnect the device from all the channels.
+        // Detach the device from all the channels.
+        //
         // After disconnecting, the device will no longer be used by the
        // channels, and all I/Os failing due to this device will eventually
        // resubmit and succeeded (if any healthy children are left).
-        self.disconnect_device(&dev).await;
+        //
+        // Device disconnection is done in two steps (detach, then disconnect)
+        // in order to prevent an I/O race when retiring a device.
+        self.detach_device(&dev).await;
+
+        // Disconnect the devices with failed controllers _before_ pause,
+        // otherwise pause would get stuck. Keep all controllers that are
+        // _not_ failed (e.g., in the case I/O failed due to ENOSPC).
+        self.traverse_io_channels_async((), |channel, _| {
+            channel.disconnect_detached_devices(|h| h.is_ctrlr_failed());
+        })
+        .await;
 
-        // Destroy (close) the device. The subsystem must be paused to do this
-        // properly.
+        // Disconnect, destroy and close the device. The subsystem must be
+        // paused to do this properly.
         {
             debug!("{self:?}: retire: pausing...");
-            self.as_mut().pause().await?;
-            debug!("{self:?}: retire: pausing ok");
+            let res = self.as_mut().pause().await;
+            match &res {
+                Ok(_) => debug!("{self:?}: retire: pausing ok"),
+                Err(e) => warn!("{self:?}: retire: pausing failed: {e}"),
+            };
+
+            // Disconnect all previously detached device handles. This has
+            // to be done after the pause to prevent an I/O race.
+            self.disconnect_all_detached_devices().await;
+
+            res?;
 
             self.child_retire_destroy_device(&dev).await;
 
@@ -1055,20 +1077,39 @@ impl<'n> Nexus<'n> {
         Ok(())
     }
 
-    /// Disconnects a device from all I/O channels.
-    pub(crate) async fn disconnect_device(&self, dev: &str) {
+    /// Detaches the device's handles from all I/O channels.
+    ///
+    /// The detached handles must be disconnected and dropped by a
+    /// `disconnect_detached_devices()` call.
+    ///
+    /// Device disconnection is done in two steps (detach, then disconnect) in
+    /// order to prevent an I/O race when retiring a device.
+    pub(crate) async fn detach_device(&self, dev: &str) {
         if !self.has_io_device {
             return;
         }
 
-        debug!("{self:?}: disconnecting '{dev}' from all channels ...");
+        debug!("{self:?}: detaching '{dev}' from all channels...");
 
         self.traverse_io_channels_async(dev, |channel, dev| {
-            channel.disconnect_device(dev);
+            channel.detach_device(dev);
+        })
+        .await;
+
+        debug!("{self:?}: '{dev}' detached from all I/O channels");
+    }
+
+    /// Disconnects all the detached devices on all I/O channels by dropping
+    /// their handles.
+    pub(crate) async fn disconnect_all_detached_devices(&self) {
+        debug!("{self:?}: disconnecting all detached devices ...");
+
+        self.traverse_io_channels_async((), |channel, _| {
+            channel.disconnect_detached_devices(|_| true);
         })
         .await;
 
-        debug!("{self:?}: '{dev}' disconnected from all I/O channels");
+        debug!("{self:?}: disconnected all detached devices");
     }
 
     /// Destroys the device being retired.
@@ -1143,7 +1184,8 @@ impl<'n> Nexus<'n> {
 
         // Step 1: Close I/O channels for all children.
         for dev in nexus.child_devices() {
-            nexus.disconnect_device(&dev).await;
+            nexus.detach_device(&dev).await;
+            nexus.disconnect_all_detached_devices().await;
 
             device_cmd_queue().enqueue(DeviceCommand::RetireDevice {
                 nexus_name: nexus.name.clone(),
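
A minimal, self-contained sketch of the detach-then-disconnect ordering that the retire path above relies on. The `FakeHandle` and `FakeChannel` types are invented for illustration only and are not io-engine APIs; the real code operates on boxed `BlockDeviceHandle` objects across per-core I/O channels.

// Standalone illustration of the two-step retire ordering: detach first,
// drop failed-controller handles before the pause, drop the rest after.
struct FakeHandle {
    name: String,
    ctrlr_failed: bool,
}

struct FakeChannel {
    active: Vec<FakeHandle>,
    detached: Vec<FakeHandle>,
}

impl FakeChannel {
    // Step 1: take the handle out of the I/O path but keep it alive.
    fn detach(&mut self, name: &str) {
        if let Some(i) = self.active.iter().position(|h| h.name == name) {
            let h = self.active.remove(i);
            self.detached.push(h);
        }
    }

    // Step 2: drop the detached handles selected by the predicate.
    fn disconnect_detached(&mut self, pred: impl Fn(&FakeHandle) -> bool) {
        self.detached.retain(|h| !pred(h));
    }
}

fn main() {
    let mut ch = FakeChannel {
        active: vec![
            FakeHandle { name: "good".into(), ctrlr_failed: false },
            FakeHandle { name: "bad".into(), ctrlr_failed: true },
        ],
        detached: Vec::new(),
    };

    // Retire "bad": detach it from the I/O path first ...
    ch.detach("bad");
    // ... drop failed-controller handles before the subsystem pause,
    // so the pause cannot get stuck on them ...
    ch.disconnect_detached(|h| h.ctrlr_failed);
    // ... (the pause would happen here) ...
    // ... then drop whatever detached handles remain.
    ch.disconnect_detached(|_| true);

    assert_eq!(ch.active.len(), 1);
    assert!(ch.detached.is_empty());
}

Keeping a non-failed handle alive across the pause mirrors the patch: only handles whose controller has already failed are dropped early, everything else is disconnected once the subsystem is paused.
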
diff --git a/io-engine/src/bdev/nexus/nexus_channel.rs b/io-engine/src/bdev/nexus/nexus_channel.rs
index 92fb3fef4..4f63ec87c 100644
--- a/io-engine/src/bdev/nexus/nexus_channel.rs
+++ b/io-engine/src/bdev/nexus/nexus_channel.rs
@@ -17,6 +17,7 @@ use spdk_rs::Thread;
 pub struct NexusChannel<'n> {
     writers: Vec<Box<dyn BlockDeviceHandle>>,
     readers: Vec<Box<dyn BlockDeviceHandle>>,
+    detached: Vec<Box<dyn BlockDeviceHandle>>,
     io_logs: Vec<IOLogChannel>,
     previous_reader: UnsafeCell<usize>,
     fail_fast: u32,
@@ -123,6 +124,7 @@ impl<'n> NexusChannel<'n> {
         Self {
             writers,
             readers,
+            detached: Vec::new(),
             io_logs: nexus.io_log_channels(),
             previous_reader: UnsafeCell::new(0),
             nexus: unsafe { nexus.pinned_mut() },
@@ -209,16 +211,57 @@ impl<'n> NexusChannel<'n> {
         }
     }
 
-    /// Disconnects a child device from the I/O path.
-    pub fn disconnect_device(&mut self, device_name: &str) {
+    /// Detaches a child device from this I/O channel, moving the device's
+    /// handles to the list of detached devices to disconnect later.
+    ///
+    /// The detached handles must be disconnected and dropped by a
+    /// `disconnect_detached_devices()` call.
+    pub(super) fn detach_device(&mut self, device_name: &str) {
         self.previous_reader = UnsafeCell::new(0);
 
-        self.readers
-            .retain(|c| c.get_device().device_name() != device_name);
-        self.writers
-            .retain(|c| c.get_device().device_name() != device_name);
+        if let Some(d) = self
+            .readers
+            .iter()
+            .position(|c| c.get_device().device_name() == device_name)
+        {
+            let t = self.readers.remove(d);
+            self.detached.push(t);
+        }
+
+        if let Some(d) = self
+            .writers
+            .iter()
+            .position(|c| c.get_device().device_name() == device_name)
+        {
+            let t = self.writers.remove(d);
+            self.detached.push(t);
+        }
+
+        debug!("{self:?}: device '{device_name}' detached");
+    }
+
+    /// Disconnects previously detached device handles by dropping them.
+    /// Devices to drop are filtered by the given predicate: true to drop
+    /// a device, false to keep it.
+    pub(super) fn disconnect_detached_devices<F>(&mut self, mut drop_pred: F)
+    where
+        F: FnMut(&dyn BlockDeviceHandle) -> bool,
+    {
+        let n = self.detached.len();
+        info!("{self:?}: disconnecting {n} detached device handles...");
+
+        self.detached.retain(|h| !drop_pred(h.as_ref()));
 
-        debug!("{self:?}: device '{device_name}' disconnected");
+        let m = self.detached.len();
+        if m == 0 {
+            info!("{self:?}: all detached device handles disconnected");
+        } else {
+            let d = n - m;
+            info!(
+                "{self:?}: {d} detached device handle(s) disconnected, \
+                {m} remain(s)"
+            );
+        }
     }
 
     /// Refreshing our channels simply means that we either have a child going
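
A compact sketch of the predicate-driven disconnection performed by `disconnect_detached_devices()` above, assuming only boxed trait objects. The `Handle` trait and `Nvme` type are stand-ins invented for this example.

// Drop only the detached handles selected by the caller's predicate.
trait Handle {
    fn is_ctrlr_failed(&self) -> bool;
}

struct Nvme {
    failed: bool,
}

impl Handle for Nvme {
    fn is_ctrlr_failed(&self) -> bool {
        self.failed
    }
}

fn disconnect_detached<F>(detached: &mut Vec<Box<dyn Handle>>, mut drop_pred: F)
where
    F: FnMut(&dyn Handle) -> bool,
{
    // Keep only the handles the predicate does not select for dropping.
    detached.retain(|h| !drop_pred(h.as_ref()));
}

fn main() {
    let mut detached: Vec<Box<dyn Handle>> =
        vec![Box::new(Nvme { failed: true }), Box::new(Nvme { failed: false })];

    // First pass drops only handles whose controller has failed.
    disconnect_detached(&mut detached, |h| h.is_ctrlr_failed());
    assert_eq!(detached.len(), 1);

    // Second pass drops everything that is still detached.
    disconnect_detached(&mut detached, |_| true);
    assert!(detached.is_empty());
}
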
diff --git a/io-engine/src/bdev/nexus/nexus_io.rs b/io-engine/src/bdev/nexus/nexus_io.rs
index 8caaa9c9a..317f18200 100644
--- a/io-engine/src/bdev/nexus/nexus_io.rs
+++ b/io-engine/src/bdev/nexus/nexus_io.rs
@@ -642,7 +642,8 @@ impl<'n> NexusBio<'n> {
         // set the IO as failed in the submission stage.
         self.ctx_mut().failed += 1;
 
-        self.channel_mut().disconnect_device(&device);
+        self.channel_mut().detach_device(&device);
+        self.channel_mut().disconnect_detached_devices(|_| true);
 
         if let Some(log) = self.fault_device(
             &device,
diff --git a/io-engine/src/bdev/nvmx/controller_inner.rs b/io-engine/src/bdev/nvmx/controller_inner.rs
index 1b40c6b58..35d3b8ec9 100644
--- a/io-engine/src/bdev/nvmx/controller_inner.rs
+++ b/io-engine/src/bdev/nvmx/controller_inner.rs
@@ -300,6 +300,8 @@ impl SpdkNvmeController {
         }
     }
 
+    /// Returns a pointer to the underlying SPDK struct.
+    #[inline(always)]
     pub fn as_ptr(&self) -> *mut spdk_nvme_ctrlr {
         self.0.as_ptr()
     }
diff --git a/io-engine/src/bdev/nvmx/handle.rs b/io-engine/src/bdev/nvmx/handle.rs
index 073196a7e..879247b5e 100644
--- a/io-engine/src/bdev/nvmx/handle.rs
+++ b/io-engine/src/bdev/nvmx/handle.rs
@@ -1523,6 +1523,11 @@ impl BlockDeviceHandle for NvmeDeviceHandle {
         let id = inner.ext_host_id();
         Ok(*id)
     }
+
+    /// Determines if the underlying controller has failed.
+    fn is_ctrlr_failed(&self) -> bool {
+        self.ctrlr.is_failed
+    }
 }
 
 impl Drop for NvmeDeviceHandle {
diff --git a/io-engine/src/core/block_device.rs b/io-engine/src/core/block_device.rs
index d0a4e24d0..5a0a0703f 100644
--- a/io-engine/src/core/block_device.rs
+++ b/io-engine/src/core/block_device.rs
@@ -476,6 +476,11 @@ pub trait BlockDeviceHandle {
         cb: IoCompletionCallback,
         cb_arg: IoCompletionCallbackArg,
     ) -> Result<(), CoreError>;
+
+    /// Determines if the underlying controller has failed.
+    fn is_ctrlr_failed(&self) -> bool {
+        false
+    }
 }
 
 fn block_device_io_completion(
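
A sketch of the default-method pattern behind `is_ctrlr_failed()` above: the trait defaults to `false`, so only handle types that can actually observe a failed controller need to override it. `FileHandle` and `NvmeHandle` are hypothetical stand-ins, not io-engine types.

// Default-method pattern: non-NVMe handles never report a failed controller.
trait BlockHandle {
    fn is_ctrlr_failed(&self) -> bool {
        false
    }
}

struct FileHandle;

impl BlockHandle for FileHandle {}

struct NvmeHandle {
    ctrlr_failed: bool,
}

impl BlockHandle for NvmeHandle {
    fn is_ctrlr_failed(&self) -> bool {
        self.ctrlr_failed
    }
}

fn main() {
    assert!(!FileHandle.is_ctrlr_failed());
    assert!(NvmeHandle { ctrlr_failed: true }.is_ctrlr_failed());
}
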
diff --git a/io-engine/src/subsys/nvmf/subsystem.rs b/io-engine/src/subsys/nvmf/subsystem.rs
index 80d68f2c3..e716dfd4c 100644
--- a/io-engine/src/subsys/nvmf/subsystem.rs
+++ b/io-engine/src/subsys/nvmf/subsystem.rs
@@ -1,4 +1,5 @@
 use std::{
+    convert::TryFrom,
     ffi::{c_void, CString},
     fmt::{self, Debug, Display, Formatter},
     mem::size_of,
@@ -60,11 +61,12 @@ use spdk_rs::{
 };
 
 use crate::{
-    bdev::{nexus::nexus_lookup_nqn, nvmx::NVME_CONTROLLERS},
+    bdev::{nexus::NEXUS_MODULE_NAME, nvmx::NVME_CONTROLLERS, Nexus},
     constants::{NVME_CONTROLLER_MODEL_ID, NVME_NQN_PREFIX},
     core::{Bdev, Reactors, UntypedBdev},
     eventing::{EventMetaGen, EventWithMeta},
     ffihelper::{cb_arg, done_cb, AsStr, FfiResult, IntoCString},
+    lvs::Lvol,
     subsys::{
         make_subsystem_serial,
         nvmf::{transport::TransportId, Error, NVMF_TGT},
@@ -236,70 +238,54 @@ impl NvmfSubsystem {
 
     /// Subsystem event handlers.
     extern "C" fn nvmf_subsystem_event_handler(
-        subsystem: *mut spdk_nvmf_subsystem,
+        subsys: *mut spdk_nvmf_subsystem,
         event: spdk_nvmf_subsystem_event,
         ctx: *mut c_void,
         _cb_arg: *mut c_void,
     ) {
-        let subsystem = NvmfSubsystem::from(subsystem);
-        let nqn = subsystem.get_nqn();
-        let nexus = nexus_lookup_nqn(&nqn);
+        let s = NvmfSubsystem::from(subsys);
 
         let event = NvmfSubsystemEvent::from_cb_args(event, ctx);
-        debug!("NVMF subsystem event {subsystem:?}: {event:?}");
+        debug!("NVMF subsystem event {s:?}: {event:?}");
 
-        match event {
-            NvmfSubsystemEvent::HostConnect(ctrlr) => {
-                info!(
-                    "Subsystem '{nqn}': host connected: '{host}'",
-                    host = ctrlr.hostnqn()
-                );
+        let nqn_tgt = NqnTarget::lookup(&s.get_nqn());
+        if matches!(nqn_tgt, NqnTarget::None) {
+            warn!(
+                "NVMF subsystem event {s:?}: {event:?}: \
+                target for event NQN not found"
+            );
+        }
 
-                ctrlr
-                    .event(EventAction::NvmeConnect, subsystem.meta())
-                    .generate();
+        match event {
+            NvmfSubsystemEvent::HostConnect(c) => {
+                c.event(EventAction::NvmeConnect, s.meta()).generate();
 
-                if let Some(nex) = nexus {
-                    nex.add_initiator(&ctrlr.hostnqn());
-                    subsystem.host_connect_nexus(ctrlr);
-                } else {
-                    subsystem.host_connect_replica(ctrlr);
+                match nqn_tgt {
+                    NqnTarget::Nexus(n) => s.host_connect_nexus(c, n),
+                    NqnTarget::Replica(r) => s.host_connect_replica(c, r),
+                    NqnTarget::None => {}
                 }
             }
-            NvmfSubsystemEvent::HostDisconnect(ctrlr) => {
-                info!(
-                    "Subsystem '{nqn}': host disconnected: '{host}'",
-                    host = ctrlr.hostnqn()
-                );
+            NvmfSubsystemEvent::HostDisconnect(c) => {
+                c.event(EventAction::NvmeDisconnect, s.meta()).generate();
 
-                ctrlr
-                    .event(EventAction::NvmeDisconnect, subsystem.meta())
-                    .generate();
-
-                if let Some(nex) = nexus {
-                    nex.rm_initiator(&ctrlr.hostnqn());
-                    subsystem.host_disconnect_nexus(ctrlr);
-                } else {
-                    subsystem.host_disconnect_replica(ctrlr);
+                match nqn_tgt {
+                    NqnTarget::Nexus(n) => s.host_disconnect_nexus(c, n),
+                    NqnTarget::Replica(r) => s.host_disconnect_replica(c, r),
+                    NqnTarget::None => {}
                 }
             }
-            NvmfSubsystemEvent::HostKeepAliveTimeout(ctrlr) => {
-                warn!(
-                    "Subsystem '{nqn}': host keep alive timeout: '{host}'",
-                    host = ctrlr.hostnqn()
-                );
-
-                ctrlr
-                    .event(EventAction::NvmeKeepAliveTimeout, subsystem.meta())
+            NvmfSubsystemEvent::HostKeepAliveTimeout(c) => {
+                c.event(EventAction::NvmeKeepAliveTimeout, s.meta())
                     .generate();
 
-                if let Some(nex) = nexus {
-                    nex.initiator_keep_alive_timeout(&ctrlr.hostnqn());
+                match nqn_tgt {
+                    NqnTarget::Nexus(n) => s.host_kato_nexus(c, n),
+                    NqnTarget::Replica(r) => s.host_kato_replica(c, r),
+                    NqnTarget::None => {}
                 }
             }
-            NvmfSubsystemEvent::Unknown => {
-                // ignore unknown events
-            }
+            NvmfSubsystemEvent::Unknown => {} // ignore unknown events
         }
     }
 
@@ -329,7 +315,16 @@ impl NvmfSubsystem {
     }
 
     /// Called upon a host connection to a nexus.
-    fn host_connect_nexus(&self, ctrlr: NvmfController) {
+    fn host_connect_nexus(&self, ctrlr: NvmfController, nex: &Nexus) {
+        info!(
+            "Host '{host}' connected to subsystem '{subsys}' on \
+            nexus '{nex:?}'",
+            host = ctrlr.hostnqn(),
+            subsys = self.get_nqn(),
+        );
+
+        nex.add_initiator(&ctrlr.hostnqn());
+
         unsafe {
             spdk_nvmf_ctrlr_set_cpl_error_cb(
                 ctrlr.0.as_ptr(),
@@ -340,7 +335,16 @@ impl NvmfSubsystem {
     }
 
     /// Called upon a host disconnection from a nexus.
-    fn host_disconnect_nexus(&self, ctrlr: NvmfController) {
+    fn host_disconnect_nexus(&self, ctrlr: NvmfController, nex: &Nexus) {
+        info!(
+            "Host '{host}' disconnected from subsystem '{subsys}' on \
+            nexus '{nex:?}'",
+            host = ctrlr.hostnqn(),
+            subsys = self.get_nqn(),
+        );
+
+        nex.rm_initiator(&ctrlr.hostnqn());
+
         unsafe {
             spdk_nvmf_ctrlr_set_cpl_error_cb(
                 ctrlr.0.as_ptr(),
@@ -350,6 +354,18 @@ impl NvmfSubsystem {
         }
     }
 
+    /// Called upon a host keep alive timeout (KATO) on a nexus.
+    fn host_kato_nexus(&self, ctrlr: NvmfController, nex: &Nexus) {
+        warn!(
+            "Host '{host}': keep alive timeout on subsystem '{subsys}' on \
+            nexus '{nex:?}'",
+            host = ctrlr.hostnqn(),
+            subsys = self.get_nqn(),
+        );
+
+        nex.initiator_keep_alive_timeout(&ctrlr.hostnqn());
+    }
+
     /// Completion error callback for replicas.
     unsafe extern "C" fn replica_cpl_error_cb(
         req: *mut spdk_nvmf_request,
@@ -375,7 +391,14 @@ impl NvmfSubsystem {
     }
 
     /// Called upon a host connection to a replica.
-    fn host_connect_replica(&self, ctrlr: NvmfController) {
+    fn host_connect_replica(&self, ctrlr: NvmfController, lvol: Lvol) {
+        info!(
+            "Host '{host}' connected to subsystem '{subsys}' on \
+            replica '{lvol:?}'",
+            host = ctrlr.hostnqn(),
+            subsys = self.get_nqn(),
+        );
+
         unsafe {
             spdk_nvmf_ctrlr_set_cpl_error_cb(
                 ctrlr.0.as_ptr(),
@@ -386,7 +409,14 @@ impl NvmfSubsystem {
     }
 
     /// Called upon a host disconnection from a replica.
-    fn host_disconnect_replica(&self, ctrlr: NvmfController) {
+    fn host_disconnect_replica(&self, ctrlr: NvmfController, lvol: Lvol) {
+        info!(
+            "Host '{host}' disconnected from subsystem '{subsys}' on \
+            replica '{lvol:?}'",
+            host = ctrlr.hostnqn(),
+            subsys = self.get_nqn(),
+        );
+
         unsafe {
             spdk_nvmf_ctrlr_set_cpl_error_cb(
                 ctrlr.0.as_ptr(),
@@ -396,6 +426,16 @@ impl NvmfSubsystem {
         }
     }
 
+    /// Called upon a host keep alive timeout (KATO) on a replica.
+    fn host_kato_replica(&self, ctrlr: NvmfController, lvol: Lvol) {
+        warn!(
+            "Host '{host}': keep alive timeout on subsystem '{subsys}' on \
+            replica '{lvol:?}'",
+            host = ctrlr.hostnqn(),
+            subsys = self.get_nqn(),
+        );
+    }
+
     /// create a new subsystem where the NQN is based on the UUID
     pub fn new(uuid: &str) -> Result<Self, Error> {
         let nqn = make_nqn(uuid).into_cstring();
@@ -1076,3 +1116,41 @@ impl NvmfSubsystem {
 fn make_nqn(id: &str) -> String {
     format!("{NVME_NQN_PREFIX}:{id}")
 }
+
+/// NQN target.
+pub enum NqnTarget<'a> {
+    Nexus(&'a Nexus<'a>),
+    Replica(Lvol),
+    None,
+}
+
+impl<'a> NqnTarget<'a> {
+    pub fn lookup(nqn: &str) -> Self {
+        let Some(bdev) = UntypedBdev::bdev_first() else {
+            return Self::None;
+        };
+
+        let parts: Vec<&str> = nqn.split(':').collect();
+        if parts.len() != 2 || parts[0] != NVME_NQN_PREFIX {
+            return Self::None;
+        }
+
+        let name = parts[1];
+
+        for b in bdev.into_iter() {
+            match b.driver() {
+                NEXUS_MODULE_NAME if b.name() == name => {
+                    return Self::Nexus(unsafe {
+                        Nexus::unsafe_from_untyped_bdev(*b)
+                    });
+                }
+                "lvol" if b.name() == name => {
+                    return Lvol::try_from(b).map_or(Self::None, Self::Replica)
+                }
+                _ => {}
+            }
+        }
+
+        Self::None
+    }
+}
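
A standalone sketch of the NQN-to-target dispatch idea behind `NqnTarget::lookup()` above. The `NQN_PREFIX` value and the `targets` map are invented for illustration; the real lookup compares against the `NVME_NQN_PREFIX` constant and walks the bdev list instead of a map.

// Parse "<prefix>:<bdev name>" and resolve it to a target kind.
use std::collections::HashMap;

#[derive(Debug, PartialEq)]
enum Target {
    Nexus,
    Replica,
}

// Placeholder prefix for the sketch only.
const NQN_PREFIX: &str = "nqn.example.prefix";

fn lookup<'a>(nqn: &str, targets: &'a HashMap<String, Target>) -> Option<&'a Target> {
    let parts: Vec<&str> = nqn.split(':').collect();
    if parts.len() != 2 || parts[0] != NQN_PREFIX {
        return None;
    }
    targets.get(parts[1])
}

fn main() {
    let mut targets = HashMap::new();
    targets.insert("nexus-1".to_string(), Target::Nexus);
    targets.insert("replica-1".to_string(), Target::Replica);

    assert_eq!(
        lookup("nqn.example.prefix:nexus-1", &targets),
        Some(&Target::Nexus)
    );
    assert_eq!(
        lookup("nqn.example.prefix:replica-1", &targets),
        Some(&Target::Replica)
    );
    assert_eq!(lookup("bogus", &targets), None);
}
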