diff --git a/control-plane/agents/core/src/core/reconciler/volume/hot_spare.rs b/control-plane/agents/core/src/core/reconciler/volume/hot_spare.rs index bceb67b11..9a35dd932 100644 --- a/control-plane/agents/core/src/core/reconciler/volume/hot_spare.rs +++ b/control-plane/agents/core/src/core/reconciler/volume/hot_spare.rs @@ -48,21 +48,24 @@ async fn hot_spare_reconcile( volume_spec: &Arc>, context: &PollContext, ) -> PollResult { - let uuid = volume_spec.lock().uuid.clone(); - let volume_state = context.registry().get_volume_state(&uuid).await?; let _guard = match volume_spec.operation_guard(OperationMode::ReconcileStart) { Ok(guard) => guard, Err(_) => return PollResult::Ok(PollerState::Busy), }; - let mode = OperationMode::ReconcileStep; - - if !volume_spec.lock().policy.self_heal { + let volume_spec_cln = volume_spec.lock().clone(); + if !volume_spec_cln.policy.self_heal { return PollResult::Ok(PollerState::Idle); } - if !volume_spec.lock().status.created() { + if !volume_spec_cln.status.created() { return PollResult::Ok(PollerState::Idle); } + let volume_state = context + .registry() + .get_volume_spec_state(volume_spec_cln) + .await?; + let mode = OperationMode::ReconcileStep; + match volume_state.status { VolumeStatus::Online => volume_replica_count_reconciler(volume_spec, context, mode).await, VolumeStatus::Unknown | VolumeStatus::Degraded => { diff --git a/control-plane/agents/core/src/nexus/specs.rs b/control-plane/agents/core/src/nexus/specs.rs index 006a4bb20..01ed35418 100644 --- a/control-plane/agents/core/src/nexus/specs.rs +++ b/control-plane/agents/core/src/nexus/specs.rs @@ -5,7 +5,7 @@ use crate::core::{ }; use common::errors::{NexusNotFound, SvcError}; use common_lib::{ - mbus_api::{ErrorChain, ResourceKind}, + mbus_api::ResourceKind, types::v0::{ message_bus::{ AddNexusChild, AddNexusReplica, Child, ChildUri, CreateNexus, DestroyNexus, Nexus, @@ -16,7 +16,7 @@ use common_lib::{ nexus::{NexusOperation, NexusSpec}, nexus_child::NexusChild, replica::ReplicaSpec, - OperationMode, SpecStatus, SpecTransaction, TraceSpan, + OperationMode, SpecStatus, SpecTransaction, }, }, }; @@ -433,37 +433,23 @@ impl ResourceSpecsLocked { .ok_or(SvcError::ReplicaNotFound { replica_id: replica.uuid().clone(), })?; - let pool_id = replica_spec.lock().pool.clone(); - match Self::get_pool_node(registry, pool_id).await { - Some(node) => { - if let Err(error) = self - .disown_and_destroy_replica(registry, &node, replica.uuid()) - .await - { - nexus_spec.lock().clone().error_span(|| { - tracing::error!( - replica.uuid = %replica.uuid(), - error = %error.full_string(), - "Failed to disown and destroy the replica" - ) - }); - } - } - None => { - // The replica node was not found (perhaps because it is offline). - // The replica can't be destroyed because the node isn't there. - // Instead, disown the replica from the volume and let the garbage - // collector destroy it later. - nexus_spec.lock().clone().warn_span(|| { - tracing::warn!( - replica.uuid = %replica.uuid(), - "Failed to find the node where the replica is hosted" + if !replica_spec.lock().owners.owned_by_a_nexus() { + let owner_volume = { + let replica_spec = replica_spec.lock(); + replica_spec.owners.volume().cloned() + }; + if let Some(owner) = owner_volume { + if let Some(volume) = self.get_locked_volume(&owner) { + self.remove_unused_volume_replica( + registry, + &volume, + replica.uuid(), + mode, ) - }); - let _ = self.disown_volume_replica(registry, &replica_spec).await; + .await?; + } } } - Ok(()) } result => result, diff --git a/control-plane/agents/core/src/volume/registry.rs b/control-plane/agents/core/src/volume/registry.rs index df8b5cc75..7946377e6 100644 --- a/control-plane/agents/core/src/volume/registry.rs +++ b/control-plane/agents/core/src/volume/registry.rs @@ -22,6 +22,17 @@ impl Registry { .await } + /// Get the volume state for the specified volume spec. + pub(crate) async fn get_volume_spec_state( + &self, + volume_spec: VolumeSpec, + ) -> Result { + let replica_specs = self.specs().get_cloned_volume_replicas(&volume_spec.uuid); + + self.get_volume_state_with_replicas(&volume_spec, &replica_specs) + .await + } + /// Get the volume state for the specified volume #[tracing::instrument(level = "info", skip(self, volume_spec, replicas))] pub(crate) async fn get_volume_state_with_replicas( diff --git a/control-plane/agents/core/src/volume/specs.rs b/control-plane/agents/core/src/volume/specs.rs index 9b7ad39af..a1d032a14 100644 --- a/control-plane/agents/core/src/volume/specs.rs +++ b/control-plane/agents/core/src/volume/specs.rs @@ -655,6 +655,8 @@ impl ResourceSpecsLocked { { Ok(_) => Ok(()), Err(error) if !request.force() => Err(error), + Err(error @ SvcError::Conflict {}) => Err(error), + Err(error @ SvcError::StoreSave { .. }) => Err(error), Err(error) => { let node_online = match registry.get_node_wrapper(&nexus_clone.node).await { Ok(node) => { @@ -1053,6 +1055,17 @@ impl ResourceSpecsLocked { .await { Ok(_) => Ok(()), + Err(SvcError::GrpcRequestError { + source, + request, + resource, + }) if source.code() == tonic::Code::DeadlineExceeded => { + Err(SvcError::GrpcRequestError { + source, + request, + resource, + }) + } Err(error) => { if let Some(replica) = self.get_replica(&replica.uuid) { let mut replica = replica.lock(); @@ -1333,26 +1346,6 @@ impl ResourceSpecsLocked { .await } - /// Disown and destroy the replica from its volume - pub(crate) async fn disown_and_destroy_replica( - &self, - registry: &Registry, - node: &NodeId, - replica_uuid: &ReplicaId, - ) -> Result<(), SvcError> { - if let Some(replica) = self.get_replica(replica_uuid) { - // disown it from the volume first, so at the very least it can be garbage collected - // at a later point if the node is not accessible - self.disown_volume_replica(registry, &replica).await?; - self.destroy_volume_replica(registry, Some(node), &replica) - .await - } else { - Err(SvcError::ReplicaNotFound { - replica_id: replica_uuid.to_owned(), - }) - } - } - /// Remove volume by its `id` pub(super) fn remove_volume(&self, id: &VolumeId) { let mut specs = self.write();