diff --git a/go-chaos/cmd/worker.go b/go-chaos/cmd/worker.go index 005c3e1c1..30ffb496f 100644 --- a/go-chaos/cmd/worker.go +++ b/go-chaos/cmd/worker.go @@ -25,7 +25,8 @@ import ( worker "github.com/zeebe-io/zeebe-chaos/go-chaos/worker" ) -const jobType = "zbchaos" +const jobTypeZbChaos = "zbchaos" +const jobTypeReadExperiments = "readExperiments" func init() { rootCmd.AddCommand(workerCommand) @@ -54,7 +55,8 @@ func start_worker(cmd *cobra.Command, args []string) { } // Allow only one job at a time, otherwise job handling might interfere (e.g. override global vars) - jobWorker := client.NewJobWorker().JobType(jobType).Handler(handleZbChaosJob).MaxJobsActive(1).Open() + jobWorker := client.NewJobWorker().JobType(jobTypeZbChaos).Handler(handleZbChaosJob).MaxJobsActive(1).Open() + client.NewJobWorker().JobType(jobTypeReadExperiments).Handler(handleZbChaosJob).MaxJobsActive(1).Open() jobWorker.AwaitClose() } diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/development/leader-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/development/leader-restart/experiment.json new file mode 100644 index 000000000..f6e7d9556 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/development/leader-restart/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart gracefully experiment", + "description": "Zeebe should be fault-tolerant. 
Zeebe should recover after a partition leader was restarted gracefully.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 1", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Restart leader of partition 1 gracefully", + "provider": { + "type": "process", + "path": "shutdown-gracefully-partition.sh", + "arguments": [ "Leader", "1" ] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/development/leader-terminate/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/development/leader-terminate/experiment.json new file mode 100644 index 000000000..349da86c8 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/development/leader-terminate/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart non-graceful experiment", + "description": "Zeebe should be fault-tolerant. 
We expect that Zeebe can handle non-graceful leader restarts.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 1", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate leader of partition 1 non-gracefully", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Leader", "1" ] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/development/msg-correlation/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/development/msg-correlation/experiment.json new file mode 100644 index 000000000..8fd1d57ef --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/development/msg-correlation/experiment.json @@ -0,0 +1,56 @@ +{ + "version": "0.1.0", + "title": "Zeebe message correlation experiment", + "description": "Zeebe message correlation should work even if the leader was restarted on which the message was published.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Publish message to partition one", + "tolerance": 0, + "provider": { + "type": "process", + "path": "publish-message.sh", + "timeout": 900 + } + }, + { + "type": 
"action", + "name": "Terminate leader of partition 1 non-gracefully", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Leader", "1" ] + } + }, + { + "name": "Should be able to create a process and await the message correlation", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "await-message-correlation.sh", + "timeout": 900 + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/development/multiple-leader-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/development/multiple-leader-restart/experiment.json new file mode 100644 index 000000000..60f999b5d --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/development/multiple-leader-restart/experiment.json @@ -0,0 +1,110 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart multiple times experiment", + "description": "Zeebe should be able to handle multiple leader changes in short period.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + }, + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": 
"Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + }, + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + }, + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + }, + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/development/stress-cpu-on-broker/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/development/stress-cpu-on-broker/experiment.json new file mode 100644 index 000000000..d44de17ff --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/development/stress-cpu-on-broker/experiment.json @@ -0,0 +1,46 @@ +{ + "version": "0.1.0", + "title": "CPU stress on a Broker", + "description": "The CPU stress on an arbitrary node should not cause any failures. 
We should be able to start and complete instances.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 1", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Stress CPU on Broker", + "provider": { + "type": "process", + "path": "stress-cpu.sh" + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/development/worker-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/development/worker-restart/experiment.json new file mode 100644 index 000000000..91cc72f26 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/development/worker-restart/experiment.json @@ -0,0 +1,51 @@ +{ + "version": "0.1.0", + "title": "Zeebe Worker restart experiment", + "description": "Zeebe Workers should be able to reconnect after restart.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create a process and await the result", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "await-processes-with-result.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Restart worker pod", + "tolerance": 0, + "provider": { + "type": 
"process", + "path": "terminate-workers.sh", + "timeout": 900 + }, + "pauses": { + "after": 5 + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/deployment-distribution/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/deployment-distribution/experiment.json new file mode 100644 index 000000000..591ca0d71 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/deployment-distribution/experiment.json @@ -0,0 +1,106 @@ +{ + "version": "0.1.0", + "title": "Zeebe deployment distribution", + "description": "Zeebe deployment distribution should be fault-tolerant. Zeebe should be able to handle network outages and fail-overs and distribute the deployments after partitions are available again.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Enable net_admin capabilities", + "provider": { + "type": "process", + "path": "apply_net_admin.sh" + }, + "pauses": { + "after": 180 + } + }, + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "type": "action", + "name": "Create network partition between leaders", + "provider": { + "type": "process", + "path": "disconnect-leaders-one-way.sh" + } + }, + { + "type": "action", + "name": "Deploy different deployment versions.", + "provider": { + "type": "process", + "path": "deploy-different-versions.sh", + "arguments": ["Follower", "3"] + } + }, + { + "type": "action", + "name": "Delete network partition", + "provider": { + "type": "process", + "path": 
"connect-leaders.sh" + } + }, + { + "type": "probe", + "name": "Create process instance of latest version on partition one", + "tolerance": 0, + "provider": { + "type": "process", + "path": "start-instance-on-partition-with-version.sh", + "arguments": ["1", "10"], + "timeout": 900 + } + }, + { + "type": "probe", + "name": "Create process instance of latest version on partition two", + "tolerance": 0, + "provider": { + "type": "process", + "path": "start-instance-on-partition-with-version.sh", + "arguments": ["2", "10"], + "timeout": 900 + } + }, + { + "type": "probe", + "name": "Create process instance of latest version on partition three", + "tolerance": 0, + "provider": { + "type": "process", + "path": "start-instance-on-partition-with-version.sh", + "arguments": ["3", "10"], + "timeout": 900 + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/follower-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/follower-restart/experiment.json new file mode 100644 index 000000000..29925f71b --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/follower-restart/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe follower graceful restart experiment", + "description": "Zeebe should be fault-tolerant. 
Zeebe should be able to handle follower restarts.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 3", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Restart follower of partition 3 gracefully", + "provider": { + "type": "process", + "path": "shutdown-gracefully-partition.sh", + "arguments": ["Follower", "3"] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/follower-terminate/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/follower-terminate/experiment.json new file mode 100644 index 000000000..56a960b3e --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/follower-terminate/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe follower restart non-graceful experiment", + "description": "Zeebe should be fault-tolerant. 
Zeebe should be able to handle follower terminations.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 3", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate follower of partition 3", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Follower", "3"] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/leader-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/leader-restart/experiment.json new file mode 100644 index 000000000..8faea14fa --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/leader-restart/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart gracefully experiment", + "description": "Zeebe should be fault-tolerant. 
Zeebe should recover after a partition leader was restarted gracefully.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 3", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Restart leader of partition 3 gracefully", + "provider": { + "type": "process", + "path": "shutdown-gracefully-partition.sh", + "arguments": [ "Leader", "3" ] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/leader-terminate/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/leader-terminate/experiment.json new file mode 100644 index 000000000..0bc9bcd29 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/leader-terminate/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart non-graceful experiment", + "description": "Zeebe should be fault-tolerant. 
We expect that Zeebe can handle non-graceful leader restarts.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 3", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate leader of partition 3 non-gracefully", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Leader", "3" ] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/msg-correlation/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/msg-correlation/experiment.json new file mode 100644 index 000000000..8fd1d57ef --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/msg-correlation/experiment.json @@ -0,0 +1,56 @@ +{ + "version": "0.1.0", + "title": "Zeebe message correlation experiment", + "description": "Zeebe message correlation should work even if the leader was restarted on which the message was published.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Publish message to partition one", + "tolerance": 0, + "provider": { + "type": "process", + "path": "publish-message.sh", + "timeout": 900 + } + }, + { + "type": 
"action", + "name": "Terminate leader of partition 1 non-gracefully", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Leader", "1" ] + } + }, + { + "name": "Should be able to create a process and await the message correlation", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "await-message-correlation.sh", + "timeout": 900 + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/multiple-leader-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/multiple-leader-restart/experiment.json new file mode 100644 index 000000000..60f999b5d --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/multiple-leader-restart/experiment.json @@ -0,0 +1,110 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart multiple times experiment", + "description": "Zeebe should be able to handle multiple leader changes in short period.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + }, + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": 
"Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + }, + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + }, + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + }, + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/stress-cpu-on-broker/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/stress-cpu-on-broker/experiment.json new file mode 100644 index 000000000..a94e09430 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/stress-cpu-on-broker/experiment.json @@ -0,0 +1,46 @@ +{ + "version": "0.1.0", + "title": "CPU stress on a Broker", + "description": "The CPU stress on an arbitrary node should not cause any failures. 
We should be able to start and complete instances.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 3", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Stress CPU on Broker", + "provider": { + "type": "process", + "path": "stress-cpu.sh" + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/worker-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/worker-restart/experiment.json new file mode 100644 index 000000000..4a48e055d --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-l/worker-restart/experiment.json @@ -0,0 +1,51 @@ +{ + "version": "0.1.0", + "title": "Zeebe Worker restart experiment", + "description": "Zeebe Workers should be able to reconnect after restart.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create a process and await the result", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "await-processes-with-result.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Restart worker pod", + "tolerance": 0, + "provider": { + "type": 
"process", + "path": "terminate-workers.sh", + "timeout": 900 + }, + "pauses": { + "after": 5 + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/follower-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/follower-restart/experiment.json new file mode 100644 index 000000000..29925f71b --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/follower-restart/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe follower graceful restart experiment", + "description": "Zeebe should be fault-tolerant. Zeebe should be able to handle follower restarts.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 3", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Restart follower of partition 3 gracefully", + "provider": { + "type": "process", + "path": "shutdown-gracefully-partition.sh", + "arguments": ["Follower", "3"] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/follower-terminate/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/follower-terminate/experiment.json new file mode 100644 index 000000000..56a960b3e --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/follower-terminate/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe follower restart non-graceful experiment", + 
"description": "Zeebe should be fault-tolerant. Zeebe should be able to handle follower terminations.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 3", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate follower of partition 3", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Follower", "3"] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/leader-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/leader-restart/experiment.json new file mode 100644 index 000000000..8faea14fa --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/leader-restart/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart gracefully experiment", + "description": "Zeebe should be fault-tolerant. 
Zeebe should recover after a partition leader was restarted gracefully.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 3", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Restart leader of partition 3 gracefully", + "provider": { + "type": "process", + "path": "shutdown-gracefully-partition.sh", + "arguments": [ "Leader", "3" ] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/leader-terminate/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/leader-terminate/experiment.json new file mode 100644 index 000000000..0bc9bcd29 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/leader-terminate/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart non-graceful experiment", + "description": "Zeebe should be fault-tolerant. 
We expect that Zeebe can handle non-graceful leader restarts.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 3", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate leader of partition 3 non-gracefully", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Leader", "3" ] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/msg-correlation/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/msg-correlation/experiment.json new file mode 100644 index 000000000..8fd1d57ef --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/msg-correlation/experiment.json @@ -0,0 +1,56 @@ +{ + "version": "0.1.0", + "title": "Zeebe message correlation experiment", + "description": "Zeebe message correlation should work even if the leader was restarted on which the message was published.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Publish message to partition one", + "tolerance": 0, + "provider": { + "type": "process", + "path": "publish-message.sh", + "timeout": 900 + } + }, + { + "type": 
"action", + "name": "Terminate leader of partition 1 non-gracefully", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Leader", "1" ] + } + }, + { + "name": "Should be able to create a process and await the message correlation", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "await-message-correlation.sh", + "timeout": 900 + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/multiple-leader-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/multiple-leader-restart/experiment.json new file mode 100644 index 000000000..60f999b5d --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/multiple-leader-restart/experiment.json @@ -0,0 +1,110 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart multiple times experiment", + "description": "Zeebe should be able to handle multiple leader changes in short period.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + }, + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": 
"Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + }, + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + }, + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + }, + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/stress-cpu-on-broker/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/stress-cpu-on-broker/experiment.json new file mode 100644 index 000000000..a94e09430 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/stress-cpu-on-broker/experiment.json @@ -0,0 +1,46 @@ +{ + "version": "0.1.0", + "title": "CPU stress on a Broker", + "description": "The cpu stress on an arbitrary node should not cause any failures. 
We should be able to start and complete instances.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 3", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Stress CPU on Broker", + "provider": { + "type": "process", + "path": "stress-cpu.sh" + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/worker-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/worker-restart/experiment.json new file mode 100644 index 000000000..4a48e055d --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-m/worker-restart/experiment.json @@ -0,0 +1,51 @@ +{ + "version": "0.1.0", + "title": "Zeebe Worker restart experiment", + "description": "Zeebe Workers should be able to reconnect after restart.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create a process and await the result", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "await-processes-with-result.sh", + "arguments": "3", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Restart worker pod", + "tolerance": 0, + "provider": { + "type": 
"process", + "path": "terminate-workers.sh", + "timeout": 900 + }, + "pauses": { + "after": 5 + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/deployment-distribution/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/deployment-distribution/experiment.json new file mode 100644 index 000000000..591ca0d71 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/deployment-distribution/experiment.json @@ -0,0 +1,106 @@ +{ + "version": "0.1.0", + "title": "Zeebe deployment distribution", + "description": "Zeebe deployment distribution should be fault-tolerant. Zeebe should be able to handle network outages and fail-overs and distribute the deployments after partitions are available again.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Enable net_admin capabilities", + "provider": { + "type": "process", + "path": "apply_net_admin.sh" + }, + "pauses": { + "after": 180 + } + }, + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "type": "action", + "name": "Create network partition between leaders", + "provider": { + "type": "process", + "path": "disconnect-leaders-one-way.sh" + } + }, + { + "type": "action", + "name": "Deploy different deployment versions.", + "provider": { + "type": "process", + "path": "deploy-different-versions.sh", + "arguments": ["Follower", "3"] + } + }, + { + "type": "action", + "name": "Delete network partition", + "provider": { + "type": "process", + "path": 
"connect-leaders.sh" + } + }, + { + "type": "probe", + "name": "Create process instance of latest version on partition one", + "tolerance": 0, + "provider": { + "type": "process", + "path": "start-instance-on-partition-with-version.sh", + "arguments": ["1", "10"], + "timeout": 900 + } + }, + { + "type": "probe", + "name": "Create process instance of latest version on partition two", + "tolerance": 0, + "provider": { + "type": "process", + "path": "start-instance-on-partition-with-version.sh", + "arguments": ["2", "10"], + "timeout": 900 + } + }, + { + "type": "probe", + "name": "Create process instance of latest version on partition three", + "tolerance": 0, + "provider": { + "type": "process", + "path": "start-instance-on-partition-with-version.sh", + "arguments": ["3", "10"], + "timeout": 900 + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/follower-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/follower-restart/experiment.json new file mode 100644 index 000000000..41618c43f --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/follower-restart/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe follower graceful restart experiment", + "description": "Zeebe should be fault-tolerant. 
Zeebe should be able to handle follower restarts.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 1", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Restart follower of partition 1 gracefully", + "provider": { + "type": "process", + "path": "shutdown-gracefully-partition.sh", + "arguments": ["Follower", "1"] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/follower-terminate/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/follower-terminate/experiment.json new file mode 100644 index 000000000..0831f68eb --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/follower-terminate/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe follower restart non-graceful experiment", + "description": "Zeebe should be fault-tolerant. 
Zeebe should be able to handle followers terminations.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 1", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate follower of partition 1", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Follower", "1"] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/leader-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/leader-restart/experiment.json new file mode 100644 index 000000000..f6e7d9556 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/leader-restart/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart gracefully experiment", + "description": "Zeebe should be fault-tolerant. 
Zeebe should recover after a partition leader was restarted gracefully.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 1", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate leader of partition 1", + "provider": { + "type": "process", + "path": "shutdown-gracefully-partition.sh", + "arguments": [ "Leader", "1" ] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/leader-terminate/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/leader-terminate/experiment.json new file mode 100644 index 000000000..349da86c8 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/leader-terminate/experiment.json @@ -0,0 +1,47 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart non-graceful experiment", + "description": "Zeebe should be fault-tolerant. 
We expect that Zeebe can handle non-graceful leader restarts.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 1", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate leader of partition 1 non-gracefully", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Leader", "1" ] + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/msg-correlation/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/msg-correlation/experiment.json new file mode 100644 index 000000000..8fd1d57ef --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/msg-correlation/experiment.json @@ -0,0 +1,56 @@ +{ + "version": "0.1.0", + "title": "Zeebe message correlation experiment", + "description": "Zeebe message correlation should work even if the leader was restarted on which the message was published.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Publish message to partition one", + "tolerance": 0, + "provider": { + "type": "process", + "path": "publish-message.sh", + "timeout": 900 + } + }, + { + "type": 
"action", + "name": "Terminate leader of partition 1 non-gracefully", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": [ "Leader", "1" ] + } + }, + { + "name": "Should be able to create a process and await the message correlation", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "await-message-correlation.sh", + "timeout": 900 + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/multiple-leader-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/multiple-leader-restart/experiment.json new file mode 100644 index 000000000..60f999b5d --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/multiple-leader-restart/experiment.json @@ -0,0 +1,110 @@ +{ + "version": "0.1.0", + "title": "Zeebe Leader restart multiple times experiment", + "description": "Zeebe should be able to handle multiple leader changes in short period.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + }, + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": 
"Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + }, + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + }, + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition one", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + }, + { + "type": "action", + "name": "Terminate leader of partition one", + "provider": { + "type": "process", + "path": "terminate-partition.sh", + "arguments": ["Leader", "1"], + "status": "0" + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/stress-cpu-on-broker/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/stress-cpu-on-broker/experiment.json new file mode 100644 index 000000000..d44de17ff --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/stress-cpu-on-broker/experiment.json @@ -0,0 +1,46 @@ +{ + "version": "0.1.0", + "title": "CPU stress on a Broker", + "description": "The cpu stress on an arbitrary node should not cause any failures. 
We should be able to start and complete instances.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create process instances on partition 1", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-steady-state.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Stress CPU on Broker", + "provider": { + "type": "process", + "path": "stress-cpu.sh" + } + } + ], + "rollbacks": [] +} diff --git a/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/worker-restart/experiment.json b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/worker-restart/experiment.json new file mode 100644 index 000000000..91cc72f26 --- /dev/null +++ b/go-chaos/internal/chaos-experiments/camunda-cloud/production-s/worker-restart/experiment.json @@ -0,0 +1,51 @@ +{ + "version": "0.1.0", + "title": "Zeebe Worker restart experiment", + "description": "Zeebe Workers should be able to reconnect after restart.", + "contributions": { + "reliability": "high", + "availability": "high" + }, + "steady-state-hypothesis": { + "title": "Zeebe is alive", + "probes": [ + { + "name": "All pods should be ready", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "verify-readiness.sh", + "timeout": 900 + } + }, + { + "name": "Should be able to create a process and await the result", + "type": "probe", + "tolerance": 0, + "provider": { + "type": "process", + "path": "await-processes-with-result.sh", + "arguments": "1", + "timeout": 900 + } + } + ] + }, + "method": [ + { + "type": "action", + "name": "Restart worker pod", + "tolerance": 0, + "provider": { + "type": 
"process", + "path": "terminate-workers.sh", + "timeout": 900 + }, + "pauses": { + "after": 5 + } + } + ], + "rollbacks": [] +}
diff --git a/go-chaos/internal/chaos-experiments/chaos_experiments.go b/go-chaos/internal/chaos-experiments/chaos_experiments.go
new file mode 100644
index 000000000..60ee02f08
--- /dev/null
+++ b/go-chaos/internal/chaos-experiments/chaos_experiments.go
@@ -0,0 +1,59 @@
+// Package chaos_experiments provides the chaos experiments, which are embedded into the binary.
+package chaos_experiments
+
+import (
+	"embed"
+	"encoding/json"
+	"fmt"
+	"strings"
+)
+
+// chaosContent holds our static camunda cloud chaos experiments, which are copied with the go:embed directive
+//
+//go:embed camunda-cloud/*
+var chaosContent embed.FS
+
+// experimentFileName is the file which has to exist in every experiment directory.
+const experimentFileName = "experiment.json"
+
+// Experiments holds the decoded experiment.json documents of one cluster plan.
+type Experiments struct {
+	Experiments []map[string]interface{}
+}
+
+// ReadExperimentsForClusterPlan returns all embedded experiments of the given cluster plan.
+//
+// The cluster plan name is normalized (spaces removed, lower-cased) to find the
+// corresponding directory, e.g. "Production - S" is read from camunda-cloud/production-s.
+// Each sub directory is expected to contain an experiment.json file.
+//
+// On any error no experiments are returned, callers never see a partial result.
+func ReadExperimentsForClusterPlan(clusterPlan string) (Experiments, error) {
+	normalizedClusterPlan := strings.ToLower(strings.ReplaceAll(clusterPlan, " ", ""))
+	rootPath := fmt.Sprintf("camunda-cloud/%s", normalizedClusterPlan)
+	dirEntries, err := chaosContent.ReadDir(rootPath)
+	if err != nil {
+		return Experiments{}, err
+	}
+
+	experiments := Experiments{}
+	for _, dir := range dirEntries {
+		if !dir.IsDir() {
+			continue
+		}
+
+		experimentPath := fmt.Sprintf("%s/%s/%s", rootPath, dir.Name(), experimentFileName)
+		experimentBytes, err := chaosContent.ReadFile(experimentPath)
+		if err != nil {
+			return Experiments{}, err
+		}
+
+		var experiment map[string]interface{}
+		if err := json.Unmarshal(experimentBytes, &experiment); err != nil {
+			return Experiments{}, fmt.Errorf("decoding %s: %w", experimentPath, err)
+		}
+		experiments.Experiments = append(experiments.Experiments, experiment)
+	}
+
+	return experiments, nil
+}
diff --git a/go-chaos/internal/chaos-experiments/chaos_experiments_test.go b/go-chaos/internal/chaos-experiments/chaos_experiments_test.go
new file mode 100644
index 000000000..314b5a758
--- /dev/null
+++ b/go-chaos/internal/chaos-experiments/chaos_experiments_test.go
@@ -0,0 +1,24 @@
+package chaos_experiments
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func Test_ShouldReadExperiments(t *testing.T) {
+	// given
+	clusterPlan := "Production - S"
+
+	// when
+	experimentsForClusterPlan, err := ReadExperimentsForClusterPlan(clusterPlan)
+
+	// then
+	require.NoError(t, err)
+	// require instead of assert, the index accesses below would panic on a shorter slice
+	require.Greater(t, len(experimentsForClusterPlan.Experiments), 2)
+	assert.NotEqual(t, experimentsForClusterPlan.Experiments[0], experimentsForClusterPlan.Experiments[1])
+	// the experiment directories are read sorted by name, deployment-distribution is the first one
+	assert.Equal(t, "Zeebe deployment distribution", experimentsForClusterPlan.Experiments[0]["title"])
+}
diff --git a/go-chaos/worker/chaos_worker.go b/go-chaos/worker/chaos_worker.go
index 8b9c309e6..5c95be04f 100644
--- a/go-chaos/worker/chaos_worker.go
+++ b/go-chaos/worker/chaos_worker.go
@@ -20,6 +20,7 @@ import (
 
 	"github.com/camunda/zeebe/clients/go/v8/pkg/entities"
 	"github.com/camunda/zeebe/clients/go/v8/pkg/worker"
+	chaos_experiments "github.com/zeebe-io/zeebe-chaos/go-chaos/internal/chaos-experiments"
 )
 
 type CommandRunner func([]string, context.Context) error
@@ -39,6 +40,7 @@ type AuthenticationProvider struct {
 }
 
 type ZbChaosVariables struct {
+	ClusterPlan           *string
 	ClusterId             *string
 	Provider              ChaosProvider
 	AuthenticationDetails AuthenticationProvider
@@ -74,3 +76,44 @@ func HandleZbChaosJob(client worker.JobClient, job entities.Job, commandRunner C
 
 	_, _ = client.NewCompleteJobCommand().JobKey(job.Key).Send(ctx)
 }
+
+// HandleReadExperiments completes the job with the chaos experiments of the
+// cluster plan, which is given by the job variable clusterPlan.
+//
+// The job is failed without remaining retries, if the variables can't be
+// parsed, the cluster plan is missing, or the experiments can't be read or
+// serialized.
+func HandleReadExperiments(client worker.JobClient, job entities.Job) {
+	ctx := context.Background()
+
+	// None of the failures below can be resolved by a retry, so the job is always failed with zero retries
+	failJob := func(errorMsg string) {
+		_, _ = client.NewFailJobCommand().JobKey(job.Key).Retries(0).ErrorMessage(errorMsg).Send(ctx)
+	}
+
+	var jobVariables ZbChaosVariables
+	if err := job.GetVariablesAs(&jobVariables); err != nil {
+		failJob(err.Error())
+		return
+	}
+
+	// ClusterPlan is a pointer and stays nil if the variable is not part of the payload
+	if jobVariables.ClusterPlan == nil {
+		failJob("expected the variable clusterPlan to be set, but it is missing")
+		return
+	}
+
+	experiments, err := chaos_experiments.ReadExperimentsForClusterPlan(*jobVariables.ClusterPlan)
+	if err != nil {
+		failJob(err.Error())
+		return
+	}
+
+	command, err := client.NewCompleteJobCommand().JobKey(job.Key).VariablesFromObject(experiments)
+	if err != nil {
+		failJob(err.Error())
+		return
+	}
+
+	_, _ = command.Send(ctx)
+}
diff --git a/go-chaos/worker/chaos_worker_test.go b/go-chaos/worker/chaos_worker_test.go
index 6a0ce730e..17c528439 100644
--- a/go-chaos/worker/chaos_worker_test.go
+++ b/go-chaos/worker/chaos_worker_test.go
@@ -24,6 +24,7 @@ import (
 	"github.com/camunda/zeebe/clients/go/v8/pkg/pb"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	chaos_experiments "github.com/zeebe-io/zeebe-chaos/go-chaos/internal/chaos-experiments"
 )
 
 func Test_ShouldFailToHandleJobWithoutPayload(t *testing.T) {
@@ -47,6 +48,24 @@ func Test_ShouldFailToHandleJobWithoutPayload(t *testing.T) {
 	assert.Equal(t, 0, fakeJobClient.RetriesVal)
 }
 
+func Test_ShouldFailToHandleReadExperimentsJobWithoutPayload(t *testing.T) {
+	// given
+	fakeJobClient := &FakeJobClient{}
+	job := entities.Job{
+		&pb.ActivatedJob{
+			Key: 123,
+		},
+	}
+
+	// when
+	HandleReadExperiments(fakeJobClient, job)
+
+	// then
+	assert.True(t, fakeJobClient.Failed)
+	assert.Equal(t, 123, fakeJobClient.Key)
+	assert.Equal(t, 0, fakeJobClient.RetriesVal)
+}
+
 func Test_ShouldHandleCommand(t *testing.T) {
 	// given
 	fakeJobClient := &FakeJobClient{}
@@ -76,6 +95,66 @@ func Test_ShouldHandleCommand(t *testing.T) {
 	assert.Equal(t, expectedArgs, appliedArgs)
 }
 
+func Test_ShouldSendExperimentsForClusterPlan(t *testing.T) {
+	// given
+	fakeJobClient := &FakeJobClient{}
+	job := entities.Job{
+		&pb.ActivatedJob{
+			Key:       123,
+			Variables: "{\"clusterPlan\":\"Production - S\"}",
+		},
+	}
+
+	// when
+	HandleReadExperiments(fakeJobClient, job)
+
+	// then
+	assert.True(t, fakeJobClient.Succeeded)
+	assert.Equal(t, 123, fakeJobClient.Key)
+	experiments, err := chaos_experiments.ReadExperimentsForClusterPlan("Production - S")
+	require.NoError(t, err)
+	assert.Equal(t, experiments, fakeJobClient.Variables)
+}
+
+func Test_ShouldFailWhenNoClusterPlanForReadExperimentsJob(t *testing.T) {
+	// given
+	fakeJobClient := &FakeJobClient{}
+	job := entities.Job{
+		&pb.ActivatedJob{
+			Key:       123,
+			Variables: "{\"clusterPlan\":\"noop\"}",
+		},
+	}
+
+	// when
+	HandleReadExperiments(fakeJobClient, job)
+
+	// then
+	assert.True(t, fakeJobClient.Failed)
+	assert.Equal(t, 123, fakeJobClient.Key)
+	assert.Equal(t, "open camunda-cloud/noop: file does not exist", fakeJobClient.ErrorMsg)
+}
+
+func Test_ShouldFailWhenClusterPlanIsMissingForReadExperimentsJob(t *testing.T) {
+	// given
+	fakeJobClient := &FakeJobClient{}
+	job := entities.Job{
+		&pb.ActivatedJob{
+			Key:       123,
+			Variables: "{}",
+		},
+	}
+
+	// when
+	HandleReadExperiments(fakeJobClient, job)
+
+	// then
+	assert.True(t, fakeJobClient.Failed)
+	assert.Equal(t, 123, fakeJobClient.Key)
+	assert.Equal(t, 0, fakeJobClient.RetriesVal)
+	assert.Contains(t, fakeJobClient.ErrorMsg, "clusterPlan")
+}
+
 func Test_ShouldFailJobWhenHandleFails(t *testing.T) {
 	// given
 	fakeJobClient := &FakeJobClient{}
diff --git a/go-chaos/worker/fake.go b/go-chaos/worker/fake.go
index f2a9a4fb9..1593e9ab4 100644
--- a/go-chaos/worker/fake.go
+++ b/go-chaos/worker/fake.go
@@ -30,6 +30,7 @@ type FakeJobClient struct {
 	ErrorMsg   string
 	Failed     bool
 	Succeeded  bool
+	Variables  interface{}
 }
 
 type FakeCompleteClient struct {
@@ -53,6 +54,12 @@ func (f *FakeCompleteClient) Send(ctx context.Context) (*pb.CompleteJobResponse,
 	return &pb.CompleteJobResponse{}, nil
 }
 
+// VariablesFromObject records the given variables on the fake job client, so tests can verify them.
+func (f *FakeCompleteClient) VariablesFromObject(v interface{}) (commands.DispatchCompleteJobCommand, error) {
+	f.JobClient.Variables = v
+	return f, nil
+}
+
 // Fake FAIL Client
 
 func (f *FakeJobClient) NewFailJobCommand() commands.FailJobCommandStep1 {
@@ -78,6 +85,12 @@ func (f *FakeFailClient) Retries(retries int32) commands.FailJobCommandStep3 {
 	return f
 }
 
+// ErrorMessage records the given error message on the fake job client, so tests can verify it.
+func (f *FakeFailClient) ErrorMessage(errorMsg string) commands.FailJobCommandStep3 {
+	f.JobClient.ErrorMsg = errorMsg
+	return f
+}
+
 func (f *FakeFailClient) Send(ctx context.Context) (*pb.FailJobResponse, error) {
 	return &pb.FailJobResponse{}, nil
 }