Skip to content

Commit

Permalink
Create common scaffolding for saga undo tests (#3904)
Browse files Browse the repository at this point in the history
Several of our saga undo tests have previously been found to have
defects in which the test verifies that all attempts to execute a saga
fail but doesn't verify that saga executions fail at the expected
failure point. This loses test coverage, since the tests don't actually
execute all of the undo actions of the saga under test.

To try to prevent this problem, add a set of saga test helpers that
provide common logic for writing undo tests. The scaffold functions
handle the business of constructing a DAG, deciding where to inject an
error, and verifying that errors occur in the right places. The callers
provide a saga type and factory functions that run before and after each
saga execution to set up state, provide saga parameters, check
invariants, and clean up after each test iteration.

Convert existing tests of these types to use the new scaffolds. This
revealed that the no-pantry version of the disk snapshot test has a
similar bug: the saga requires the disk being snapshotted to be attached
to an instance, but the test wasn't creating an instance, so the saga
never got past its "look up the instance that owns the disk" step. Fix
this issue.

Add an additional scaffold that repeats nodes in a successful saga. This
case is less fragile than the undo case, but this gets rid of some
copy-pasted code.

Finally, coalesce some common instance operations (start, stop, delete,
simulate) that were used by multiple saga tests into the test helpers.

This is a test-only change, tested both by running `cargo test` and
finding no errors and by introducing bugs into some sagas and verifying
that tests run with the scaffolds catch those bugs.

Fixes #3896.
  • Loading branch information
gjcolombo authored Aug 24, 2023
1 parent b644199 commit 3412cb1
Show file tree
Hide file tree
Showing 11 changed files with 869 additions and 739 deletions.
178 changes: 29 additions & 149 deletions nexus/src/app/sagas/disk_create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -795,9 +795,6 @@ pub(crate) mod test {
use omicron_common::api::external::IdentityMetadataCreateParams;
use omicron_common::api::external::Name;
use omicron_sled_agent::sim::SledAgent;
use slog::error;
use slog::Logger;
use std::num::NonZeroU32;
use uuid::Uuid;

type ControlPlaneTestContext =
Expand Down Expand Up @@ -866,41 +863,6 @@ pub(crate) mod test {
assert_eq!(disk.project_id, project_id);
}

async fn no_stuck_sagas(log: &Logger, datastore: &DataStore) -> bool {
use crate::db::model::saga_types::SagaNodeEvent;

let saga_node_events: Vec<SagaNodeEvent> = datastore
.pool_for_tests()
.await
.unwrap()
.transaction_async(|conn| async move {
use crate::db::schema::saga_node_event::dsl;

conn.batch_execute_async(
nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL,
)
.await
.unwrap();

Ok::<_, crate::db::TransactionError<()>>(
dsl::saga_node_event
.filter(dsl::event_type.eq(String::from("undo_failed")))
.select(SagaNodeEvent::as_select())
.load_async::<SagaNodeEvent>(&conn)
.await
.unwrap(),
)
})
.await
.unwrap();

for saga_node_event in &saga_node_events {
error!(log, "saga {:?} is stuck!", saga_node_event.saga_id);
}

saga_node_events.is_empty()
}

async fn no_disk_records_exist(datastore: &DataStore) -> bool {
use crate::db::model::Disk;
use crate::db::schema::disk::dsl;
Expand Down Expand Up @@ -1020,7 +982,6 @@ pub(crate) mod test {
let sled_agent = &cptestctx.sled_agent.sled_agent;
let datastore = cptestctx.server.apictx().nexus.datastore();

assert!(no_stuck_sagas(&cptestctx.logctx.log, datastore).await);
assert!(no_disk_records_exist(datastore).await);
assert!(no_volume_records_exist(datastore).await);
assert!(
Expand All @@ -1046,40 +1007,23 @@ pub(crate) mod test {
let client = &cptestctx.external_client;
let nexus = &cptestctx.server.apictx().nexus;
let project_id = create_org_and_project(&client).await;

// Build the saga DAG with the provided test parameters
let opctx = test_opctx(cptestctx);

let params = new_test_params(&opctx, project_id);
let dag = create_saga_dag::<SagaDiskCreate>(params).unwrap();

for node in dag.get_nodes() {
// Create a new saga for this node.
info!(
log,
"Creating new saga which will fail at index {:?}", node.index();
"node_name" => node.name().as_ref(),
"label" => node.label(),
);
let runnable_saga =
nexus.create_runnable_saga(dag.clone()).await.unwrap();

// Inject an error instead of running the node.
//
// This should cause the saga to unwind.
nexus
.sec()
.saga_inject_error(runnable_saga.id(), node.index())
.await
.unwrap();
nexus
.run_saga(runnable_saga)
.await
.expect_err("Saga should have failed");

// Check that no partial artifacts of disk creation exist:
verify_clean_slate(&cptestctx, &test).await;
}
crate::app::sagas::test_helpers::action_failure_can_unwind::<
SagaDiskCreate,
_,
_,
>(
nexus,
|| Box::pin(async { new_test_params(&opctx, project_id) }),
|| {
Box::pin(async {
verify_clean_slate(&cptestctx, &test).await;
})
},
log,
)
.await;
}

#[nexus_test(server = crate::Server)]
Expand All @@ -1092,61 +1036,18 @@ pub(crate) mod test {
let client = &cptestctx.external_client;
let nexus = &cptestctx.server.apictx.nexus;
let project_id = create_org_and_project(&client).await;

// Build the saga DAG with the provided test parameters
let opctx = test_opctx(&cptestctx);

let params = new_test_params(&opctx, project_id);
let dag = create_saga_dag::<SagaDiskCreate>(params).unwrap();

// The "undo_node" should always be immediately preceding the
// "error_node".
for (undo_node, error_node) in
dag.get_nodes().zip(dag.get_nodes().skip(1))
{
// Create a new saga for this node.
info!(
log,
"Creating new saga which will fail at index {:?}", error_node.index();
"node_name" => error_node.name().as_ref(),
"label" => error_node.label(),
);

let runnable_saga =
nexus.create_runnable_saga(dag.clone()).await.unwrap();

// Inject an error instead of running the node.
//
// This should cause the saga to unwind.
nexus
.sec()
.saga_inject_error(runnable_saga.id(), error_node.index())
.await
.unwrap();

// Inject a repetition for the node being undone.
//
// This means it is executing twice while unwinding.
nexus
.sec()
.saga_inject_repeat(
runnable_saga.id(),
undo_node.index(),
steno::RepeatInjected {
action: NonZeroU32::new(1).unwrap(),
undo: NonZeroU32::new(2).unwrap(),
},
)
.await
.unwrap();

nexus
.run_saga(runnable_saga)
.await
.expect_err("Saga should have failed");

verify_clean_slate(&cptestctx, &test).await;
}
crate::app::sagas::test_helpers::action_failure_can_unwind_idempotently::<
SagaDiskCreate,
_,
_
>(
nexus,
|| Box::pin(async { new_test_params(&opctx, project_id) }),
|| Box::pin(async { verify_clean_slate(&cptestctx, &test).await; }),
log
).await;
}

async fn destroy_disk(cptestctx: &ControlPlaneTestContext) {
Expand Down Expand Up @@ -1181,31 +1082,10 @@ pub(crate) mod test {

let params = new_test_params(&opctx, project_id);
let dag = create_saga_dag::<SagaDiskCreate>(params).unwrap();

let runnable_saga =
nexus.create_runnable_saga(dag.clone()).await.unwrap();

// Cause all actions to run twice. The saga should succeed regardless!
for node in dag.get_nodes() {
nexus
.sec()
.saga_inject_repeat(
runnable_saga.id(),
node.index(),
steno::RepeatInjected {
action: NonZeroU32::new(2).unwrap(),
undo: NonZeroU32::new(1).unwrap(),
},
)
.await
.unwrap();
}

// Verify that the saga's execution succeeded.
nexus
.run_saga(runnable_saga)
.await
.expect("Saga should have succeeded");
crate::app::sagas::test_helpers::actions_succeed_idempotently(
nexus, dag,
)
.await;

destroy_disk(&cptestctx).await;
verify_clean_slate(&cptestctx, &test).await;
Expand Down
30 changes: 4 additions & 26 deletions nexus/src/app/sagas/disk_delete.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@ pub(crate) mod test {
use nexus_test_utils_macros::nexus_test;
use nexus_types::external_api::params;
use omicron_common::api::external::Name;
use std::num::NonZeroU32;
use uuid::Uuid;

type ControlPlaneTestContext =
Expand Down Expand Up @@ -267,31 +266,10 @@ pub(crate) mod test {
volume_id: disk.volume_id,
};
let dag = create_saga_dag::<SagaDiskDelete>(params).unwrap();

let runnable_saga =
nexus.create_runnable_saga(dag.clone()).await.unwrap();

// Cause all actions to run twice. The saga should succeed regardless!
for node in dag.get_nodes() {
nexus
.sec()
.saga_inject_repeat(
runnable_saga.id(),
node.index(),
steno::RepeatInjected {
action: NonZeroU32::new(2).unwrap(),
undo: NonZeroU32::new(1).unwrap(),
},
)
.await
.unwrap();
}

// Verify that the saga's execution succeeded.
nexus
.run_saga(runnable_saga)
.await
.expect("Saga should have succeeded");
crate::app::sagas::test_helpers::actions_succeed_idempotently(
nexus, dag,
)
.await;

crate::app::sagas::disk_create::test::verify_clean_slate(
&cptestctx, &test,
Expand Down
Loading

0 comments on commit 3412cb1

Please sign in to comment.