Skip to content

Commit

Permalink
Merge pull request #87543 from cockroachdb/blathers/backport-release-…
Browse files Browse the repository at this point in the history
…22.1-80526

release-22.1: roachtest: allow more errors for REGION survivability DRT
  • Loading branch information
irfansharif authored Sep 8, 2022
2 parents 96ce6c9 + 63792ed commit f819c84
Showing 1 changed file with 18 additions and 11 deletions.
29 changes: 18 additions & 11 deletions pkg/cmd/roachtest/tests/tpcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,21 @@ func registerTPCC(r registry.Registry) {
if err != nil {
return tpccChaosEventProcessor{}, err
}
// We see a slow trickle of errors after a server has been force shutdown due
// to queries before the shutdown not fully completing. You can inspect this
// by looking at the workload logs and correlating the errors with the
// prometheus graphs.
// The errors seen can be of the form:
// * ERROR: inbox communication error: rpc error: code = Canceled
// desc = context canceled (SQLSTATE 58C01)
// Setting this allows some errors to occur.
allowedErrorsMultiplier := 5
if tc.survivalGoal == "region" {
// REGION failures last a bit longer after a region has gone down.
allowedErrorsMultiplier *= 20
}
maxErrorsDuringUptime := warehousesPerRegion * tpcc.NumWorkersPerWarehouse * allowedErrorsMultiplier

return tpccChaosEventProcessor{
workloadInstances: workloadInstances,
workloadNodeIP: prometheusNodeIP[0],
Expand All @@ -624,17 +639,9 @@ func registerTPCC(r registry.Registry) {
"orderStatus",
"stockLevel",
},
ch: chaosEventCh,
promClient: promv1.NewAPI(client),
// We see a slow trickle of errors after a server has been force shutdown due
// to queries before the shutdown not fully completing. You can inspect this
// by looking at the workload logs and corresponding the errors with the
// prometheus graphs.
// The errors seen can be be of the form:
// * ERROR: inbox communication error: rpc error: code = Canceled
// desc = context canceled (SQLSTATE 58C01)
// Setting this allows some errors to occur.
maxErrorsDuringUptime: warehousesPerRegion * tpcc.NumWorkersPerWarehouse,
ch: chaosEventCh,
promClient: promv1.NewAPI(client),
maxErrorsDuringUptime: maxErrorsDuringUptime,
// "delivery" does not trigger often.
allowZeroSuccessDuringUptime: true,
}, nil
Expand Down

0 comments on commit f819c84

Please sign in to comment.