Skip to content

Commit

Permalink
[Alerting] Smarter retry interval for ES Connectivity errors (#123642)
Browse files Browse the repository at this point in the history
* Checking for es connectivity errors and adjusting retry interval accordingly

* PR feedback

Co-authored-by: Kibana Machine <[email protected]>
  • Loading branch information
ymao1 and kibanamachine authored Jan 27, 2022
1 parent 11537ea commit 0d951bc
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* 2.0.
*/

import { isAlertSavedObjectNotFoundError } from './is_alert_not_found_error';
import { isAlertSavedObjectNotFoundError, isEsUnavailableError } from './is_alerting_error';
import { ErrorWithReason } from './error_with_reason';
import { SavedObjectsErrorHelpers } from '../../../../../src/core/server';
import uuid from 'uuid';
Expand All @@ -31,3 +31,24 @@ describe('isAlertSavedObjectNotFoundError', () => {
expect(isAlertSavedObjectNotFoundError(error, id)).toBe(true);
});
});

describe('isEsUnavailableError', () => {
const id = uuid.v4();
const errorSONF = SavedObjectsErrorHelpers.createGenericNotFoundEsUnavailableError('alert', id);

test('identifies es unavailable errors', () => {
// ensure the error created by SO parses as a string with the format we expect
expect(`${errorSONF}`.includes(`alert/${id}`)).toBe(true);

expect(isEsUnavailableError(errorSONF, id)).toBe(true);
});

test('identifies generic errors', () => {
expect(isEsUnavailableError(new Error(`not found`), id)).toBe(false);
});

test('identifies es unavailable errors wrapped in an ErrorWithReason', () => {
const error = new ErrorWithReason(AlertExecutionStatusErrorReasons.Read, errorSONF);
expect(isEsUnavailableError(error, id)).toBe(true);
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@
import { SavedObjectsErrorHelpers } from '../../../../../src/core/server';
import { isErrorWithReason } from './error_with_reason';

export function isAlertSavedObjectNotFoundError(err: Error, alertId: string) {
export function isAlertSavedObjectNotFoundError(err: Error, ruleId: string) {
// if this is an error with a reason, the actual error needs to be extracted
const actualError = isErrorWithReason(err) ? err.error : err;

return (
SavedObjectsErrorHelpers.isNotFoundError(actualError) && `${actualError}`.includes(alertId)
);
return SavedObjectsErrorHelpers.isNotFoundError(actualError) && `${actualError}`.includes(ruleId);
}

export function isEsUnavailableError(err: Error, ruleId: string) {
// if this is an error with a reason, the actual error needs to be extracted
const actualError = isErrorWithReason(err) ? err.error : err;
return SavedObjectsErrorHelpers.isEsUnavailableError(actualError);
}
56 changes: 56 additions & 0 deletions x-pack/plugins/alerting/server/task_runner/task_runner.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3557,6 +3557,62 @@ describe('Task Runner', () => {
});
});

test('reschedules for next schedule interval if es connectivity error encountered and schedule interval is less than connectivity retry', async () => {
rulesClient.get.mockImplementation(() => {
throw SavedObjectsErrorHelpers.createGenericNotFoundEsUnavailableError('alert', '1');
});

const taskRunner = new TaskRunner(
ruleType,
mockedTaskInstance,
taskRunnerFactoryInitializerParams
);

encryptedSavedObjectsClient.getDecryptedAsInternalUser.mockResolvedValue({
id: '1',
type: 'alert',
attributes: {
apiKey: Buffer.from('123:abc').toString('base64'),
enabled: true,
},
references: [],
});

const runnerResult = await taskRunner.run();
expect(runnerResult.schedule!.interval).toEqual(mockedTaskInstance.schedule!.interval);
});

test('reschedules for smaller interval if es connectivity error encountered and schedule interval is greater than connectivity retry', async () => {
rulesClient.get.mockImplementation(() => {
throw SavedObjectsErrorHelpers.createGenericNotFoundEsUnavailableError('alert', '1');
});

const taskRunner = new TaskRunner(
ruleType,
{
...mockedTaskInstance,
schedule: {
interval: '1d',
},
},
taskRunnerFactoryInitializerParams
);

encryptedSavedObjectsClient.getDecryptedAsInternalUser.mockResolvedValue({
id: '1',
type: 'alert',
attributes: {
apiKey: Buffer.from('123:abc').toString('base64'),
enabled: true,
},
references: [],
});

const runnerResult = await taskRunner.run();

expect(runnerResult.schedule!.interval).toEqual('5m');
});

test('correctly logs warning when Alert Task Runner throws due to failing to fetch the alert in a space', async () => {
rulesClient.get.mockImplementation(() => {
throw SavedObjectsErrorHelpers.createGenericNotFoundError('alert', '1');
Expand Down
17 changes: 15 additions & 2 deletions x-pack/plugins/alerting/server/task_runner/task_runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ import { promiseResult, map, Resultable, asOk, asErr, resolveErr } from '../lib/
import { taskInstanceToAlertTaskInstance } from './alert_task_instance';
import { EVENT_LOG_ACTIONS } from '../plugin';
import { IEvent, IEventLogger, SAVED_OBJECT_REL_PRIMARY } from '../../../event_log/server';
import { isAlertSavedObjectNotFoundError } from '../lib/is_alert_not_found_error';
import { isAlertSavedObjectNotFoundError, isEsUnavailableError } from '../lib/is_alerting_error';
import { RulesClient } from '../rules_client';
import { partiallyUpdateAlert } from '../saved_objects';
import {
Expand All @@ -52,6 +52,7 @@ import {
AlertInstanceState,
AlertInstanceContext,
WithoutReservedActionGroups,
parseDuration,
} from '../../common';
import { NormalizedRuleType, UntypedNormalizedRuleType } from '../rule_type_registry';
import { getEsErrorMessage } from '../lib/errors';
Expand All @@ -62,6 +63,7 @@ import {
import { createAbortableEsClientFactory } from '../lib/create_abortable_es_client_factory';

const FALLBACK_RETRY_INTERVAL = '5m';
const CONNECTIVITY_RETRY_INTERVAL = '5m';
const MONITORING_HISTORY_LIMIT = 200;

// 1,000,000 nanoseconds in 1 millisecond
Expand Down Expand Up @@ -773,7 +775,18 @@ export class TaskRunner<
);
throwUnrecoverableError(error);
}
return { interval: taskSchedule?.interval ?? FALLBACK_RETRY_INTERVAL };

let retryInterval = taskSchedule?.interval ?? FALLBACK_RETRY_INTERVAL;

// Set retry interval smaller for ES connectivity errors
if (isEsUnavailableError(error, ruleId)) {
retryInterval =
parseDuration(retryInterval) > parseDuration(CONNECTIVITY_RETRY_INTERVAL)
? CONNECTIVITY_RETRY_INTERVAL
: retryInterval;
}

return { interval: retryInterval };
}),
monitoring: ruleMonitoring,
};
Expand Down

0 comments on commit 0d951bc

Please sign in to comment.