fix: delete resources pending deletion when the store is online
When the persistent store (etcd) is not online, we may not be able to delete resources, so they
remain stuck in the deleting state.
Mimic the volume dirty spec reconciler for the other resources and simply delete them if the
creation was not successful.

Destroy resources in the deleting state from their garbage collection reconciler.

Resolves: CAS-1300
(cherry picked from commit 1c156bc)

# Conflicts:
#	control-plane/agents/core/src/core/reconciler/nexus/garbage_collector.rs
#	control-plane/agents/core/src/core/reconciler/replica/mod.rs
#	control-plane/agents/core/src/pool/specs.rs
tiagolobocastro committed Jun 7, 2022
1 parent eb3866f commit bb7ad2f
Show file tree
Hide file tree
Showing 10 changed files with 370 additions and 277 deletions.
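The change is easiest to read as one pattern applied to nexuses, replicas and volumes alike: if a spec is stuck in the deleting state, destroy it from the garbage collector. Below is a minimal, self-contained sketch of that pattern; SpecStatus, ResourceSpec and PollerState are toy stand-ins for the crate's real types, not its API.

#[derive(Clone, Copy, PartialEq, Debug)]
enum SpecStatus {
    Created,
    Deleting, // destroy was requested but never completed (e.g. etcd was offline)
}

struct ResourceSpec {
    status: SpecStatus,
}

#[derive(Debug, PartialEq)]
enum PollerState {
    Idle,
    Busy,
}

/// Garbage-collection step: a spec left in `Deleting` is destroyed outright,
/// mirroring how the dirty volume spec reconciler retries work that could not
/// be persisted while the store was offline.
fn destroy_deleting(spec: &ResourceSpec) -> PollerState {
    if spec.status == SpecStatus::Deleting {
        // The real reconcilers send a destroy request to the owning node and
        // then remove the spec from the registry and the persistent store.
        return PollerState::Busy;
    }
    PollerState::Idle
}

fn main() {
    let stuck = ResourceSpec { status: SpecStatus::Deleting };
    assert_eq!(destroy_deleting(&stuck), PollerState::Busy);

    let healthy = ResourceSpec { status: SpecStatus::Created };
    assert_eq!(destroy_deleting(&healthy), PollerState::Idle);
}

Each garbage collector below gains such a destroy-deleting step, and the persistent-store poller stops retrying dirty specs while etcd is unreachable.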
8 changes: 8 additions & 0 deletions common/src/types/v0/store/nexus.rs
@@ -388,3 +388,11 @@ impl From<NexusSpec> for DestroyNexus {
         }
     }
 }
+impl From<&NexusSpec> for DestroyNexus {
+    fn from(nexus: &NexusSpec) -> Self {
+        Self {
+            node: nexus.node.clone(),
+            uuid: nexus.uuid.clone(),
+        }
+    }
+}
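The borrowed conversion lets a reconciler build the destroy request while only holding a reference to the spec, cloning just the two request fields instead of the whole spec. A toy version with illustrative stand-in types (not the crate's real definitions):

struct NexusSpec {
    node: String,
    uuid: String,
}

struct DestroyNexus {
    node: String,
    uuid: String,
}

impl From<&NexusSpec> for DestroyNexus {
    fn from(nexus: &NexusSpec) -> Self {
        Self {
            node: nexus.node.clone(),
            uuid: nexus.uuid.clone(),
        }
    }
}

fn main() {
    let spec = NexusSpec { node: "node-1".into(), uuid: "nexus-1".into() };
    // Only the request fields are cloned; `spec` remains usable afterwards.
    let request = DestroyNexus::from(&spec);
    assert_eq!(request.node, spec.node);
    assert_eq!(request.uuid, spec.uuid);
}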
control-plane/agents/core/src/core/reconciler/nexus/garbage_collector.rs
@@ -53,10 +53,10 @@ async fn nexus_garbage_collector(
     nexus_spec: &Arc<Mutex<NexusSpec>>,
     context: &PollContext,
 ) -> PollResult {
-    let mode = OperationMode::ReconcileStep;
     let results = vec![
         destroy_orphaned_nexus(nexus_spec, context).await,
-        destroy_disowned_nexus(nexus_spec, context, mode).await,
+        destroy_deleting_nexus(nexus_spec, context).await,
+        destroy_disowned_nexus(nexus_spec, context).await,
     ];
     GarbageCollector::squash_results(results)
 }
@@ -101,53 +101,80 @@ async fn destroy_orphaned_nexus(
 /// Given a control plane managed nexus
 /// When a nexus is not owned by a volume
 /// Then it should eventually be destroyed
-#[tracing::instrument(level = "debug", skip(nexus_spec, context, mode), fields(nexus.uuid = %nexus_spec.lock().uuid, request.reconcile = true))]
+#[tracing::instrument(level = "debug", skip(nexus_spec, context), fields(nexus.uuid = %nexus_spec.lock().uuid, request.reconcile = true))]
 async fn destroy_disowned_nexus(
     nexus_spec: &Arc<Mutex<NexusSpec>>,
     context: &PollContext,
-    mode: OperationMode,
 ) -> PollResult {
     let _guard = match nexus_spec.operation_guard(OperationMode::ReconcileStart) {
         Ok(guard) => guard,
         Err(_) => return PollResult::Ok(PollerState::Busy),
     };
 
-    let nexus_node = {
+    let not_owned = {
         let nexus = nexus_spec.lock();
-        let destroy = nexus.managed && !nexus.owned();
-        let mut nexus_node = None;
-        if destroy {
-            nexus_node = Some(nexus.node.clone())
-        }
-        nexus_node
+        nexus.managed && !nexus.owned()
     };
 
-    if let Some(nexus_node) = nexus_node {
-        let node_online = matches!(context.registry().get_node_wrapper(&nexus_node).await, Ok(node) if node.read().await.is_online());
-        if node_online {
-            let nexus_clone = nexus_spec.lock().clone();
-            let nexus_uuid = nexus_clone.uuid.clone();
-            async {
-                nexus_clone.warn_span(|| tracing::warn!("Attempting to destroy disowned nexus"));
-                let request = DestroyNexus::from(nexus_clone.clone());
-                match context
-                    .specs()
-                    .destroy_nexus(context.registry(), &request, true, mode)
-                    .await {
-                    Ok(_) => {
-                        nexus_clone.info_span(|| tracing::info!("Successfully destroyed disowned nexus"));
-                        Ok(())
-                    }
-                    Err(error) => {
-                        nexus_clone.error_span(|| tracing::error!(error = %error, "Failed to destroy disowned nexus"));
-                        Err(error)
-                    }
-                }
-            }
-            .instrument(tracing::info_span!("destroy_disowned_nexus", nexus.uuid = %nexus_uuid, request.reconcile = true))
+    if not_owned {
+        destroy_nexus(nexus_spec, context, OperationMode::ReconcileStep)
+            .instrument(tracing::info_span!("destroy_disowned_nexus", nexus.uuid = %nexus_spec.lock().uuid, request.reconcile = true))
             .await?;
-        }
     }
 
     PollResult::Ok(PollerState::Idle)
 }
+
+/// Given a control plane nexus
+/// When a nexus destruction fails
+/// Then it should eventually be destroyed
+#[tracing::instrument(level = "debug", skip(nexus_spec, context), fields(nexus.uuid = %nexus_spec.lock().uuid, request.reconcile = true))]
+async fn destroy_deleting_nexus(
+    nexus_spec: &Arc<Mutex<NexusSpec>>,
+    context: &PollContext,
+) -> PollResult {
+    let _guard = match nexus_spec.operation_guard(OperationMode::ReconcileStart) {
+        Ok(guard) => guard,
+        Err(_) => return PollResult::Ok(PollerState::Busy),
+    };
+
+    let deleting = nexus_spec.lock().status().deleting();
+    if deleting {
+        destroy_nexus(nexus_spec, context, OperationMode::ReconcileStep)
+            .instrument(tracing::info_span!("destroy_deleting_nexus", nexus.uuid = %nexus_spec.lock().uuid, request.reconcile = true))
+            .await?;
+    }
+
+    PollResult::Ok(PollerState::Idle)
+}
+
+#[tracing::instrument(level = "trace", skip(nexus_spec, context, mode), fields(nexus.uuid = %nexus_spec.lock().uuid, request.reconcile = true))]
+async fn destroy_nexus(
+    nexus_spec: &Arc<Mutex<NexusSpec>>,
+    context: &PollContext,
+    mode: OperationMode,
+) -> PollResult {
+    let node = nexus_spec.lock().node.clone();
+    let node_online = matches!(context.registry().get_node_wrapper(&node).await, Ok(node) if node.read().await.is_online());
+    if node_online {
+        let nexus_clone = nexus_spec.lock().clone();
+        nexus_clone.warn_span(|| tracing::warn!("Attempting to destroy nexus"));
+        let request = DestroyNexus::from(&nexus_clone);
+        match context
+            .specs()
+            .destroy_nexus(context.registry(), &request, true, mode)
+            .await
+        {
+            Ok(_) => {
+                nexus_clone.info_span(|| tracing::info!("Successfully destroyed nexus"));
+                Ok(PollerState::Idle)
+            }
+            Err(error) => {
+                nexus_clone
+                    .error_span(|| tracing::error!(error = %error, "Failed to destroy nexus"));
+                Err(error)
+            }
+        }
+    } else {
+        Ok(PollerState::Idle)
+    }
+}
17 changes: 10 additions & 7 deletions control-plane/agents/core/src/core/reconciler/persistent_store.rs
@@ -17,14 +17,17 @@ impl PersistentStoreReconciler {
 impl TaskPoller for PersistentStoreReconciler {
     async fn poll(&mut self, context: &PollContext) -> PollResult {
         let specs = context.specs();
-        let dirty_replicas = specs.reconcile_dirty_replicas(context.registry()).await;
-        let dirty_nexuses = specs.reconcile_dirty_nexuses(context.registry()).await;
-        let dirty_volumes = specs.reconcile_dirty_volumes(context.registry()).await;
+        if context.registry().store_online().await {
+            let dirty_pools = specs.reconcile_dirty_pools(context.registry()).await;
+            let dirty_replicas = specs.reconcile_dirty_replicas(context.registry()).await;
+            let dirty_nexuses = specs.reconcile_dirty_nexuses(context.registry()).await;
+            let dirty_volumes = specs.reconcile_dirty_volumes(context.registry()).await;
 
-        if dirty_nexuses || dirty_replicas || dirty_volumes {
-            PollResult::Ok(PollerState::Busy)
-        } else {
-            PollResult::Ok(PollerState::Idle)
+            if dirty_nexuses || dirty_replicas || dirty_volumes || dirty_pools {
+                return PollResult::Ok(PollerState::Busy);
+            }
         }
+
+        PollResult::Ok(PollerState::Idle)
     }
 }
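In other words, the poller now flushes dirty specs only while etcd is reachable, and reports Busy until everything has been persisted. A self-contained sketch of that gating with toy types (Registry and its methods here are illustrative, not the real registry API):

#[derive(Debug, PartialEq)]
enum PollerState {
    Idle,
    Busy,
}

struct Registry {
    store_online: bool,
    dirty_specs: Vec<String>, // specs whose etcd writes previously failed
}

impl Registry {
    fn store_online(&self) -> bool {
        self.store_online
    }

    /// Retry the failed store writes; returns true while any spec is still dirty.
    fn reconcile_dirty_specs(&mut self) -> bool {
        self.dirty_specs.retain(|_spec| false); // pretend every retry succeeds
        !self.dirty_specs.is_empty()
    }
}

fn poll(registry: &mut Registry) -> PollerState {
    // Only attempt the flush when the store is reachable; retrying writes
    // against an offline store would fail and waste the poll period.
    if registry.store_online() {
        if registry.reconcile_dirty_specs() {
            return PollerState::Busy;
        }
    }
    PollerState::Idle
}

fn main() {
    let mut registry = Registry {
        store_online: true,
        dirty_specs: vec!["volume-1".to_string()],
    };
    assert_eq!(poll(&mut registry), PollerState::Idle);
}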
99 changes: 73 additions & 26 deletions control-plane/agents/core/src/core/reconciler/replica/mod.rs
@@ -2,17 +2,14 @@
 mod tests;
 
 use crate::core::{
-    specs::{OperationSequenceGuard, SpecOperations},
+    specs::{OperationSequenceGuard, ResourceSpecsLocked, SpecOperations},
     task_poller::{
         PollContext, PollEvent, PollResult, PollTimer, PollTriggerEvent, PollerState, TaskPoller,
     },
 };
-use common_lib::types::v0::{
-    message_bus::ReplicaOwners,
-    store::{replica::ReplicaSpec, OperationMode},
-};
+use common_lib::types::v0::store::{replica::ReplicaSpec, OperationMode};
 use parking_lot::Mutex;
-use std::{ops::Deref, sync::Arc};
+use std::sync::Arc;
 
 /// Replica reconciler
 #[derive(Debug)]
@@ -32,12 +29,15 @@ impl ReplicaReconciler {
 #[async_trait::async_trait]
 impl TaskPoller for ReplicaReconciler {
     async fn poll(&mut self, context: &PollContext) -> PollResult {
-        let mut results = vec![];
+        let replicas = context.specs().get_replicas();
+        let mut results = Vec::with_capacity(replicas.len());
 
-        for replica in context.specs().get_replicas() {
-            results.push(disown_missing_owners(context, &replica).await);
-            results.push(destroy_orphaned_replicas(context, &replica).await);
+        for replica in replicas {
+            results.push(remove_missing_owners(&replica, context).await);
+            results.push(destroy_orphaned_replica(&replica, context).await);
+            results.push(destroy_deleting_replica(&replica, context).await);
         }
+
         Self::squash_results(results)
     }
 
Expand All @@ -56,17 +56,20 @@ impl TaskPoller for ReplicaReconciler {
/// Remove replica owners who no longer exist.
/// In the event that the replicas become orphaned (have no owners) they will be destroyed by the
/// 'destroy_orphaned_replicas' reconcile loop.
async fn disown_missing_owners(
context: &PollContext,
async fn remove_missing_owners(
replica: &Arc<Mutex<ReplicaSpec>>,
context: &PollContext,
) -> PollResult {
let specs = context.specs();

// If we obtain the operation guard no one else can be modifying the replica spec.
if let Ok(_guard) = replica.operation_guard(OperationMode::ReconcileStart) {
let replica_spec = replica.lock().clone();
let owned = {
let replica_spec = replica.lock();
replica_spec.managed && replica_spec.owned()
};

if replica_spec.managed && replica_spec.owned() {
if owned {
let replica_spec = replica.lock().clone();
let mut owner_removed = false;
let owners = &replica_spec.owners;

@@ -106,35 +109,79 @@ async fn remove_missing_owners(
 
 /// Destroy orphaned replicas.
 /// Orphaned replicas are those that are managed but which don't have any owners.
-async fn destroy_orphaned_replicas(
-    context: &PollContext,
+async fn destroy_orphaned_replica(
     replica: &Arc<Mutex<ReplicaSpec>>,
+    context: &PollContext,
 ) -> PollResult {
     let _guard = match replica.operation_guard(OperationMode::ReconcileStart) {
         Ok(guard) => guard,
         Err(_) => return PollResult::Ok(PollerState::Busy),
     };
 
-    let replica_spec = replica.lock().deref().clone();
-    if replica_spec.managed && !replica_spec.owned() {
+    let destroy_owned = {
+        let replica = replica.lock();
+        replica.managed && !replica.owned()
+    };
+
+    if destroy_owned {
+        destroy_replica(replica, context).await
+    } else {
+        PollResult::Ok(PollerState::Idle)
+    }
+}
+
+/// Given a control plane replica
+/// When its destruction fails
+/// Then it should eventually be destroyed
+async fn destroy_deleting_replica(
+    replica_spec: &Arc<Mutex<ReplicaSpec>>,
+    context: &PollContext,
+) -> PollResult {
+    let _guard = match replica_spec.operation_guard(OperationMode::ReconcileStart) {
+        Ok(guard) => guard,
+        Err(_) => return PollResult::Ok(PollerState::Busy),
+    };
+
+    let deleting = replica_spec.lock().status().deleting();
+    if deleting {
+        destroy_replica(replica_spec, context).await
+    } else {
+        PollResult::Ok(PollerState::Idle)
+    }
+}
+
+#[tracing::instrument(level = "debug", skip(replica_spec, context), fields(replica.uuid = %replica_spec.lock().uuid, request.reconcile = true))]
+async fn destroy_replica(
+    replica_spec: &Arc<Mutex<ReplicaSpec>>,
+    context: &PollContext,
+) -> PollResult {
+    let pool_id = replica_spec.lock().pool.clone();
+    if let Some(node) = ResourceSpecsLocked::get_pool_node(context.registry(), pool_id).await {
+        let replica_clone = replica_spec.lock().clone();
         match context
             .specs()
-            .destroy_replica_spec(
+            .destroy_replica(
                 context.registry(),
-                &replica_spec,
-                ReplicaOwners::default(),
-                false,
+                &ResourceSpecsLocked::destroy_replica_request(
+                    replica_clone,
+                    Default::default(),
+                    &node,
+                ),
+                true,
                 OperationMode::ReconcileStep,
             )
             .await
         {
             Ok(_) => {
-                tracing::info!(replica.uuid=%replica_spec.uuid, "Successfully destroyed orphaned replica");
+                tracing::info!(replica.uuid=%replica_spec.lock().uuid, "Successfully destroyed replica");
                 PollResult::Ok(PollerState::Idle)
             }
             Err(e) => {
-                tracing::trace!(replica.uuid=%replica_spec.uuid, error=%e, "Failed to destroy orphaned replica");
+                tracing::trace!(replica.uuid=%replica_spec.lock().uuid, error=%e, "Failed to destroy replica");
                 PollResult::Err(e)
             }
         }
     } else {
-        PollResult::Ok(PollerState::Idle)
+        PollResult::Ok(PollerState::Busy)
     }
 }
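Every reconcile function above first takes an operation guard and reports Busy when it cannot, so reconcilers never block on or race a resource that is already being modified. A rough, self-contained analogy using std::sync::Mutex::try_lock in place of the crate's OperationSequenceGuard (toy types, not the real API):

use std::sync::{Arc, Mutex};

#[derive(Debug, PartialEq)]
enum PollerState {
    Idle,
    Busy,
}

struct ReplicaSpec {
    deleting: bool,
}

fn destroy_deleting_replica(replica: &Arc<Mutex<ReplicaSpec>>) -> PollerState {
    // `try_lock` stands in for `operation_guard(OperationMode::ReconcileStart)`:
    // if the spec is busy elsewhere, report Busy rather than block the poller.
    let guard = match replica.try_lock() {
        Ok(guard) => guard,
        Err(_) => return PollerState::Busy,
    };

    if guard.deleting {
        // The real code issues the destroy request to the owning node here.
        return PollerState::Busy;
    }
    PollerState::Idle
}

fn main() {
    let replica = Arc::new(Mutex::new(ReplicaSpec { deleting: true }));
    assert_eq!(destroy_deleting_replica(&replica), PollerState::Busy);
}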
@@ -9,11 +9,12 @@ use common_lib::types::v0::store::{volume::VolumeSpec, OperationMode, TraceSpan,
 use crate::core::specs::SpecOperations;
 use common::errors::SvcError;
 use common_lib::types::v0::{
-    message_bus::VolumeStatus,
+    message_bus::{DestroyVolume, VolumeStatus},
     store::{nexus_persistence::NexusInfo, replica::ReplicaSpec},
 };
 use parking_lot::Mutex;
 use std::sync::Arc;
+use tracing::Instrument;
 
 /// Volume Garbage Collector reconciler
 #[derive(Debug)]
@@ -34,6 +35,7 @@ impl TaskPoller for GarbageCollector {
     async fn poll(&mut self, context: &PollContext) -> PollResult {
         let mut results = vec![];
         for volume in context.specs().get_locked_volumes() {
+            results.push(destroy_deleting_volume(&volume, context).await);
             results.push(disown_unused_nexuses(&volume, context).await);
             results.push(disown_unused_replicas(&volume, context).await);
         }
@@ -54,6 +56,52 @@ impl TaskPoller for GarbageCollector {
     }
 }
 
+#[tracing::instrument(level = "trace", skip(volume_spec, context), fields(volume.uuid = %volume_spec.lock().uuid, request.reconcile = true))]
+async fn destroy_deleting_volume(
+    volume_spec: &Arc<Mutex<VolumeSpec>>,
+    context: &PollContext,
+) -> PollResult {
+    let _guard = match volume_spec.operation_guard(OperationMode::ReconcileStart) {
+        Ok(guard) => guard,
+        Err(_) => return PollResult::Ok(PollerState::Busy),
+    };
+
+    let deleting = volume_spec.lock().status().deleting();
+    if deleting {
+        destroy_volume(volume_spec, context, OperationMode::ReconcileStep)
+            .instrument(tracing::info_span!("destroy_deleting_volume", volume.uuid = %volume_spec.lock().uuid, request.reconcile = true))
+            .await
+    } else {
+        PollResult::Ok(PollerState::Idle)
+    }
+}
+
+async fn destroy_volume(
+    volume_spec: &Arc<Mutex<VolumeSpec>>,
+    context: &PollContext,
+    mode: OperationMode,
+) -> PollResult {
+    let uuid = volume_spec.lock().uuid.clone();
+    match context
+        .specs()
+        .destroy_volume(context.registry(), &DestroyVolume::new(&uuid), mode)
+        .await
+    {
+        Ok(_) => {
+            volume_spec
+                .lock()
+                .info_span(|| tracing::info!("Successfully destroyed volume"));
+            Ok(PollerState::Idle)
+        }
+        Err(error) => {
+            volume_spec
+                .lock()
+                .error_span(|| tracing::error!(error = %error, "Failed to destroy volume"));
+            Err(error)
+        }
+    }
+}
+
 /// Given a volume
 /// When any of its nexuses are no longer used
 /// Then they should be disowned