Skip to content

Commit

Permalink
[ResponseOps] Revisit connector action retry back-off (#173779)
Browse files Browse the repository at this point in the history
Resolves #172518

## Summary

Updates the retry delay calculation to cap the delay at 1hr and
introduces jitter.

### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios


### To verify
- Create a rule and then force a retry failure
- Verify that the retry follows the pattern below:

  Attempt 1: now
  Attempt 2: 30s after the first attempt
  Attempt 3: 0 - 5m after the second attempt
  Attempt 4: 0 - 10m after the third attempt
  Attempt 5: 0 - 20m after the fourth attempt
  Attempt 6: 0 - 40m after the fifth attempt
  Attempt n: 0 - 1hr for all other attempts
  • Loading branch information
doakalexi authored Jan 3, 2024
1 parent 39f1561 commit c4821c5
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import {
import { schema } from '@kbn/config-schema';
import { RequeueInvalidTasksConfig } from '../config';

const baseDelay = 5 * 60 * 1000;
const executionContext = executionContextServiceMock.createSetupContract();
const minutesFromNow = (mins: number): Date => secondsFromNow(mins * 60);
const mockRequeueInvalidTasksConfig = {
Expand Down Expand Up @@ -315,8 +316,16 @@ describe('TaskManagerRunner', () => {
expect(instance.attempts).toEqual(initialAttempts + 1);
expect(instance.status).toBe('running');
expect(instance.startedAt!.getTime()).toEqual(Date.now());
const expectedRunAt = Date.now() + calculateDelay(initialAttempts + 1);
expect(instance.retryAt!.getTime()).toEqual(expectedRunAt + timeoutMinutes * 60 * 1000);

const minRunAt = Date.now();
const maxRunAt = minRunAt + baseDelay * Math.pow(2, initialAttempts - 1);
expect(instance.retryAt!.getTime()).toBeGreaterThanOrEqual(
minRunAt + timeoutMinutes * 60 * 1000
);
expect(instance.retryAt!.getTime()).toBeLessThanOrEqual(
maxRunAt + timeoutMinutes * 60 * 1000
);

expect(instance.enabled).not.toBeDefined();
});

Expand Down Expand Up @@ -347,9 +356,13 @@ describe('TaskManagerRunner', () => {
expect(store.update).toHaveBeenCalledTimes(1);
const instance = store.update.mock.calls[0][0];

const expectedRetryAt = new Date(Date.now() + calculateDelay(initialAttempts + 1));
expect(instance.retryAt!.getTime()).toEqual(
new Date(expectedRetryAt.getTime() + timeoutMinutes * 60 * 1000).getTime()
const minRunAt = Date.now();
const maxRunAt = minRunAt + baseDelay * Math.pow(2, initialAttempts - 1);
expect(instance.retryAt!.getTime()).toBeGreaterThanOrEqual(
minRunAt + timeoutMinutes * 60 * 1000
);
expect(instance.retryAt!.getTime()).toBeLessThanOrEqual(
maxRunAt + timeoutMinutes * 60 * 1000
);
expect(instance.enabled).not.toBeDefined();
});
Expand Down Expand Up @@ -742,8 +755,12 @@ describe('TaskManagerRunner', () => {
const instance = store.update.mock.calls[0][0];

expect(instance.id).toEqual(id);
const expectedRunAt = new Date(Date.now() + calculateDelay(initialAttempts));
expect(instance.runAt.getTime()).toEqual(expectedRunAt.getTime());

const minRunAt = Date.now();
const maxRunAt = minRunAt + baseDelay * Math.pow(2, initialAttempts - 2);
expect(instance.runAt.getTime()).toBeGreaterThanOrEqual(minRunAt);
expect(instance.runAt.getTime()).toBeLessThanOrEqual(maxRunAt);

expect(instance.params).toEqual({ a: 'b' });
expect(instance.state).toEqual({ hey: 'there' });
expect(instance.enabled).not.toBeDefined();
Expand Down Expand Up @@ -1098,8 +1115,11 @@ describe('TaskManagerRunner', () => {
expect(store.update).toHaveBeenCalledTimes(1);
const instance = store.update.mock.calls[0][0];

const expectedRunAt = new Date(Date.now() + calculateDelay(initialAttempts));
expect(instance.runAt.getTime()).toEqual(expectedRunAt.getTime());
const minRunAt = Date.now();
const maxRunAt = minRunAt + baseDelay * Math.pow(2, initialAttempts - 2);
expect(instance.runAt.getTime()).toBeGreaterThanOrEqual(minRunAt);
expect(instance.runAt.getTime()).toBeLessThanOrEqual(maxRunAt);

expect(instance.enabled).not.toBeDefined();
});

Expand Down Expand Up @@ -2542,6 +2562,26 @@ describe('TaskManagerRunner', () => {
`Error encountered when running onTaskRemoved() hook for testbar2 "foo": Fail`
);
});

describe('calculateDelay', () => {
it('returns 30s on the first attempt', () => {
expect(calculateDelay(1)).toBe(30000);
});

it('returns delay with jitter', () => {
const delay = calculateDelay(5);
// with jitter should be random between 0 and 40 min (inclusive)
expect(delay).toBeGreaterThanOrEqual(0);
expect(delay).toBeLessThanOrEqual(2400000);
});

it('returns delay capped at 1 hour', () => {
const delay = calculateDelay(10);
// with jitter should be random between 0 and 1 hr (inclusive)
expect(delay).toBeGreaterThanOrEqual(0);
expect(delay).toBeLessThanOrEqual(60 * 60 * 1000);
});
});
});

interface TestOpts {
Expand Down
12 changes: 8 additions & 4 deletions x-pack/plugins/task_manager/server/task_running/task_runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import apm from 'elastic-apm-node';
import { v4 as uuidv4 } from 'uuid';
import { withSpan } from '@kbn/apm-utils';
import { defaults, flow, identity, isUndefined, omit } from 'lodash';
import { defaults, flow, identity, isUndefined, omit, random } from 'lodash';
import { ExecutionContextStart, Logger, SavedObjectsErrorHelpers } from '@kbn/core/server';
import { UsageCounter } from '@kbn/usage-collection-plugin/server';
import moment from 'moment';
Expand Down Expand Up @@ -931,12 +931,16 @@ export function asRan(task: InstanceOf<TaskRunningStage.RAN, RanTask>): RanTask
}

export function calculateDelay(attempts: number) {
// Return 30s for the first retry attempt
if (attempts === 1) {
return 30 * 1000; // 30s
return 30 * 1000;
} else {
// get multiples of 5 min
const defaultBackoffPerFailure = 5 * 60 * 1000;
return defaultBackoffPerFailure * Math.pow(2, attempts - 2);
const maxDelay = 60 * 60 * 1000;
// For each remaining attempt return an exponential delay with jitter that is capped at 1 hour.
// We adjust the attempts by 2 to ensure that delay starts at 5m for the second retry attempt
// and increases exponentially from there.
return random(Math.min(maxDelay, defaultBackoffPerFailure * Math.pow(2, attempts - 2)));
}
}

Expand Down

0 comments on commit c4821c5

Please sign in to comment.