diff --git a/src/core/server/metrics/logging/get_event_loop_threshold_log.test.ts b/src/core/server/metrics/logging/get_event_loop_threshold_log.test.ts new file mode 100644 index 0000000000000..6c43d2db8f94f --- /dev/null +++ b/src/core/server/metrics/logging/get_event_loop_threshold_log.test.ts @@ -0,0 +1,43 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +import type { OpsMetrics } from '..'; +import { getEventLoopThresholdLog } from './get_event_loop_threshold_log'; + +function createMockOpsMetrics(eventLoopDelay: number | undefined) { + return { + process: { + event_loop_delay: eventLoopDelay, + }, + } as Pick; +} + +describe('getEventLoopThresholdLog', () => { + it('returns empty message on undefined `process.event_loop_delay`', () => { + const thresholdMs = 200; + const mockDelay = undefined; + const result = getEventLoopThresholdLog(createMockOpsMetrics(mockDelay), thresholdMs); + expect(result.message).toBe(''); + }); + + it('returns empty message when `process.event_loop_delay` is less than threshold', () => { + const thresholdMs = 200; + const mockDelay = 190; + const result = getEventLoopThresholdLog(createMockOpsMetrics(mockDelay), thresholdMs); + expect(result.message).toBe(''); + }); + + it('returns message when `process.event_loop_delay` exceeds the threshold', () => { + const thresholdMs = 200; + const mockDelay = 500; + const result = getEventLoopThresholdLog(createMockOpsMetrics(mockDelay), thresholdMs); + expect(result.message).toMatchInlineSnapshot( + `"Event loop delay threshold exceeded 200ms. Recieved 500.000"` + ); + }); +}); diff --git a/src/core/server/metrics/logging/get_event_loop_threshold_log.ts b/src/core/server/metrics/logging/get_event_loop_threshold_log.ts new file mode 100644 index 0000000000000..d033252b5100f --- /dev/null +++ b/src/core/server/metrics/logging/get_event_loop_threshold_log.ts @@ -0,0 +1,35 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +import numeral from '@elastic/numeral'; +import type { OpsMetrics } from '..'; + +/** + * Generates a message to be logged when event_loop_delay exceeds defined threshold. + * + * @internal + */ +export function getEventLoopThresholdLog( + metrics: Pick, + thresholdMs: number +) { + const { process } = metrics; + // Event loop delay is in ms + const eventLoopDelayVal = process?.event_loop_delay; + let message = ''; + + if (eventLoopDelayVal && eventLoopDelayVal > thresholdMs) { + message = `Event loop delay threshold exceeded ${thresholdMs}ms. Recieved ${numeral( + eventLoopDelayVal + ).format('0.000')}`; + } + + return { + message, + }; +} diff --git a/src/core/server/metrics/logging/index.ts b/src/core/server/metrics/logging/index.ts index 3c1144f486984..7699a5603bb2d 100644 --- a/src/core/server/metrics/logging/index.ts +++ b/src/core/server/metrics/logging/index.ts @@ -7,3 +7,4 @@ */ export { getEcsOpsMetricsLog } from './get_ops_metrics_log'; +export { getEventLoopThresholdLog } from './get_event_loop_threshold_log'; diff --git a/src/core/server/metrics/metrics_service.test.ts b/src/core/server/metrics/metrics_service.test.ts index d7de41fd7ccf7..c1bd1201f67f5 100644 --- a/src/core/server/metrics/metrics_service.test.ts +++ b/src/core/server/metrics/metrics_service.test.ts @@ -17,6 +17,7 @@ import { loggingSystemMock } from '../logging/logging_system.mock'; import { take } from 'rxjs/operators'; const testInterval = 100; +const testEventLoopDelayThreshold = 50; const dummyMetrics = { metricA: 'value', metricB: 'otherValue' }; @@ -30,7 +31,10 @@ describe('MetricsService', () => { jest.useFakeTimers(); const configService = configServiceMock.create({ - atPath: { interval: moment.duration(testInterval) }, + atPath: { + interval: moment.duration(testInterval), + eventLoopDelayThreshold: moment.duration(testEventLoopDelayThreshold), + }, }); const coreContext = mockCoreContext.create({ logger, configService }); metricsService = new MetricsService(coreContext); @@ -214,6 +218,43 @@ describe('MetricsService', () => { ] `); }); + + it('warns when process.event_loop_delay exceeds threshold', async () => { + const firstMetrics = { + process: { + event_loop_delay: 30, + }, + }; + const secondMetrics = { + process: { + event_loop_delay: 100, + }, + }; + + const opsLogger = logger.get('metrics', 'ops'); + + mockOpsCollector.collect + .mockResolvedValueOnce(firstMetrics) + .mockResolvedValueOnce(secondMetrics); + await metricsService.setup({ http: httpMock }); + const { getOpsMetrics$ } = await metricsService.start(); + + const nextEmission = async () => { + jest.advanceTimersByTime(testInterval); + const emission = await getOpsMetrics$().pipe(take(1)).toPromise(); + await new Promise((resolve) => process.nextTick(resolve)); + return emission; + }; + + await nextEmission(); + const opsLogs = loggingSystemMock.collect(opsLogger).warn; + expect(opsLogs.length).toEqual(1); + expect(opsLogs[0]).toMatchInlineSnapshot(` + Array [ + "Event loop delay threshold exceeded 50ms. Recieved 100.000", + ] + `); + }); }); describe('#stop', () => { diff --git a/src/core/server/metrics/metrics_service.ts b/src/core/server/metrics/metrics_service.ts index 78e4dd98f93d6..613cc189e608a 100644 --- a/src/core/server/metrics/metrics_service.ts +++ b/src/core/server/metrics/metrics_service.ts @@ -15,7 +15,7 @@ import { InternalHttpServiceSetup } from '../http'; import { InternalMetricsServiceSetup, InternalMetricsServiceStart, OpsMetrics } from './types'; import { OpsMetricsCollector } from './ops_metrics_collector'; import { opsConfig, OpsConfigType } from './ops_config'; -import { getEcsOpsMetricsLog } from './logging'; +import { getEcsOpsMetricsLog, getEventLoopThresholdLog } from './logging'; interface MetricsServiceSetupDeps { http: InternalHttpServiceSetup; @@ -37,26 +37,32 @@ export class MetricsService } public async setup({ http }: MetricsServiceSetupDeps): Promise { - const config = await this.coreContext.configService + const { + cGroupOverrides, + interval, + eventLoopDelayThreshold, + } = await this.coreContext.configService .atPath(opsConfig.path) .pipe(first()) .toPromise(); + const thresholdMs = eventLoopDelayThreshold.asMilliseconds(); + this.metricsCollector = new OpsMetricsCollector(http.server, { logger: this.logger, - ...config.cGroupOverrides, + ...cGroupOverrides, }); - await this.refreshMetrics(); + await this.refreshMetrics(thresholdMs); this.collectInterval = setInterval(() => { - this.refreshMetrics(); - }, config.interval.asMilliseconds()); + this.refreshMetrics(thresholdMs); + }, interval.asMilliseconds()); const metricsObservable = this.metrics$.asObservable(); this.service = { - collectionInterval: config.interval.asMilliseconds(), + collectionInterval: interval.asMilliseconds(), getOpsMetrics$: () => metricsObservable, }; @@ -71,10 +77,15 @@ export class MetricsService return this.service; } - private async refreshMetrics() { + private async refreshMetrics(thresholdMs: number) { const metrics = await this.metricsCollector!.collect(); const { message, meta } = getEcsOpsMetricsLog(metrics); + const { message: eventLoopThresholdMessage } = getEventLoopThresholdLog(metrics, thresholdMs); + this.opsMetricsLogger.debug(message!, meta); + if (eventLoopThresholdMessage) { + this.opsMetricsLogger.warn(eventLoopThresholdMessage); + } this.metricsCollector!.reset(); this.metrics$.next(metrics); } diff --git a/src/core/server/metrics/ops_config.ts b/src/core/server/metrics/ops_config.ts index 886900b854f9f..5c18e655ff816 100644 --- a/src/core/server/metrics/ops_config.ts +++ b/src/core/server/metrics/ops_config.ts @@ -12,6 +12,7 @@ export const opsConfig = { path: 'ops', schema: schema.object({ interval: schema.duration({ defaultValue: '5s' }), + eventLoopDelayThreshold: schema.duration({ defaultValue: '350ms' }), cGroupOverrides: schema.object({ cpuPath: schema.maybe(schema.string()), cpuAcctPath: schema.maybe(schema.string()),