Skip to content

Commit

Permalink
feat: verify cluster can survive dataloss of one broker at a time. (#275)
Browse files Browse the repository at this point in the history

After a broker has recovered from the loss of its disk, the cluster
should be able to survive another broker's disk loss. After a series of
disk losses, one broker at a time, the cluster should not suffer data
loss. We verify this by creating instances of the process that was
deployed before the disk loss.

In this experiment we don't have to call `zbchaos dataloss prepare`
because there is no need to add init containers. Since we are only
deleting the data of one broker at a time, the pod can be restarted
immediately.

related to #4
  • Loading branch information
deepthidevaki authored Dec 7, 2022
2 parents f62c5b7 + f4e80fd commit b77c0ce
Showing 1 changed file with 145 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
{
  "version": "0.1.0",
  "title": "Zeebe dataloss experiment",
  "description": "Zeebe should be able to handle data loss of one broker at a time.",
  "contributions": {
    "reliability": "high",
    "availability": "high"
  },
  "steady-state-hypothesis": {
    "title": "Zeebe is alive",
    "probes": [
      {
        "name": "All pods should be ready",
        "type": "probe",
        "tolerance": 0,
        "provider": {
          "type": "process",
          "path": "zbchaos",
          "arguments": [
            "verify", "readiness"
          ],
          "timeout": 900
        }
      }
    ]
  },
  "method": [
    {
      "type": "action",
      "name": "Deploy process",
      "provider": {
        "type": "process",
        "path": "zbchaos",
        "arguments": ["deploy", "process"],
        "timeout": 900
      }
    },
    {
      "type": "action",
      "name": "Delete data of broker 0 and restart the pod",
      "provider": {
        "type": "process",
        "path": "zbchaos",
        "arguments": ["dataloss", "delete", "--nodeId=0"]
      },
      "pauses": {
        "after": 60
      }
    },
    {
      "type": "probe",
      "name": "Broker 0 can recover after data loss",
      "provider": {
        "type": "process",
        "path": "zbchaos",
        "arguments": ["verify", "readiness"],
        "timeout": 900
      }
    },
    {
      "type": "action",
      "name": "Delete data of broker 1 and restart the pod",
      "provider": {
        "type": "process",
        "path": "zbchaos",
        "arguments": ["dataloss", "delete", "--nodeId=1"]
      },
      "pauses": {
        "after": 60
      }
    },
    {
      "type": "probe",
      "name": "Broker 1 can recover after data loss",
      "provider": {
        "type": "process",
        "path": "zbchaos",
        "arguments": ["verify", "readiness"],
        "timeout": 900
      }
    },
    {
      "type": "action",
      "name": "Delete data of broker 2 and restart the pod",
      "provider": {
        "type": "process",
        "path": "zbchaos",
        "arguments": ["dataloss", "delete", "--nodeId=2"]
      },
      "pauses": {
        "after": 60
      }
    },
    {
      "type": "probe",
      "name": "Broker 2 can recover after data loss",
      "provider": {
        "type": "process",
        "path": "zbchaos",
        "arguments": ["verify", "readiness"],
        "timeout": 900
      }
    },
    {
      "name": "There is no data loss. Should be able to create process instances on partition 1",
      "type": "probe",
      "tolerance": 0,
      "provider": {
        "type": "process",
        "path": "zbchaos",
        "arguments": [
          "verify", "instance-creation", "--partitionId=1"
        ],
        "timeout": 900
      }
    },
    {
      "name": "There is no data loss. Should be able to create process instances on partition 2",
      "type": "probe",
      "tolerance": 0,
      "provider": {
        "type": "process",
        "path": "zbchaos",
        "arguments": [
          "verify", "instance-creation", "--partitionId=2"
        ],
        "timeout": 900
      }
    },
    {
      "name": "There is no data loss. Should be able to create process instances on partition 3",
      "type": "probe",
      "tolerance": 0,
      "provider": {
        "type": "process",
        "path": "zbchaos",
        "arguments": [
          "verify", "instance-creation", "--partitionId=3"
        ],
        "timeout": 900
      }
    }
  ],
  "rollbacks": []
}

0 comments on commit b77c0ce

Please sign in to comment.