Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Metrics.Ops] log event loop delays #103478

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

import type { OpsMetrics } from '..';
import { getEventLoopThresholdLog } from './get_event_loop_threshold_log';

function createMockOpsMetrics(eventLoopDelay: number | undefined) {
return {
process: {
event_loop_delay: eventLoopDelay,
},
} as Pick<OpsMetrics, 'process'>;
}

describe('getEventLoopThresholdLog', () => {
it('returns empty message on undefined `process.event_loop_delay`', () => {
const thresholdMs = 200;
const mockDelay = undefined;
const result = getEventLoopThresholdLog(createMockOpsMetrics(mockDelay), thresholdMs);
expect(result.message).toBe('');
});

it('returns empty message when `process.event_loop_delay` is less than threshold', () => {
const thresholdMs = 200;
const mockDelay = 190;
const result = getEventLoopThresholdLog(createMockOpsMetrics(mockDelay), thresholdMs);
expect(result.message).toBe('');
});

it('returns message when `process.event_loop_delay` exceeds the threshold', () => {
const thresholdMs = 200;
const mockDelay = 500;
const result = getEventLoopThresholdLog(createMockOpsMetrics(mockDelay), thresholdMs);
expect(result.message).toMatchInlineSnapshot(
`"Event loop delay threshold exceeded 200ms. Recieved 500.000"`
);
});
});
35 changes: 35 additions & 0 deletions src/core/server/metrics/logging/get_event_loop_threshold_log.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

import numeral from '@elastic/numeral';
import type { OpsMetrics } from '..';

/**
* Generates a message to be logged when event_loop_delay exceeds defined threshold.
*
* @internal
*/
export function getEventLoopThresholdLog(
metrics: Pick<OpsMetrics, 'process'>,
thresholdMs: number
) {
const { process } = metrics;
// Event loop delay is in ms
const eventLoopDelayVal = process?.event_loop_delay;
let message = '';

if (eventLoopDelayVal && eventLoopDelayVal > thresholdMs) {
message = `Event loop delay threshold exceeded ${thresholdMs}ms. Recieved ${numeral(
eventLoopDelayVal
).format('0.000')}`;
}

return {
message,
};
}
1 change: 1 addition & 0 deletions src/core/server/metrics/logging/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
*/

export { getEcsOpsMetricsLog } from './get_ops_metrics_log';
export { getEventLoopThresholdLog } from './get_event_loop_threshold_log';
43 changes: 42 additions & 1 deletion src/core/server/metrics/metrics_service.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { loggingSystemMock } from '../logging/logging_system.mock';
import { take } from 'rxjs/operators';

const testInterval = 100;
const testEventLoopDelayThreshold = 50;

const dummyMetrics = { metricA: 'value', metricB: 'otherValue' };

Expand All @@ -30,7 +31,10 @@ describe('MetricsService', () => {
jest.useFakeTimers();

const configService = configServiceMock.create({
atPath: { interval: moment.duration(testInterval) },
atPath: {
interval: moment.duration(testInterval),
eventLoopDelayThreshold: moment.duration(testEventLoopDelayThreshold),
},
});
const coreContext = mockCoreContext.create({ logger, configService });
metricsService = new MetricsService(coreContext);
Expand Down Expand Up @@ -214,6 +218,43 @@ describe('MetricsService', () => {
]
`);
});

it('warns when process.event_loop_delay exceeds threshold', async () => {
const firstMetrics = {
process: {
event_loop_delay: 30,
},
};
const secondMetrics = {
process: {
event_loop_delay: 100,
},
};

const opsLogger = logger.get('metrics', 'ops');

mockOpsCollector.collect
.mockResolvedValueOnce(firstMetrics)
.mockResolvedValueOnce(secondMetrics);
await metricsService.setup({ http: httpMock });
const { getOpsMetrics$ } = await metricsService.start();

const nextEmission = async () => {
jest.advanceTimersByTime(testInterval);
const emission = await getOpsMetrics$().pipe(take(1)).toPromise();
await new Promise((resolve) => process.nextTick(resolve));
return emission;
};

await nextEmission();
const opsLogs = loggingSystemMock.collect(opsLogger).warn;
expect(opsLogs.length).toEqual(1);
expect(opsLogs[0]).toMatchInlineSnapshot(`
Array [
"Event loop delay threshold exceeded 50ms. Recieved 100.000",
]
`);
});
});

describe('#stop', () => {
Expand Down
27 changes: 19 additions & 8 deletions src/core/server/metrics/metrics_service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import { InternalHttpServiceSetup } from '../http';
import { InternalMetricsServiceSetup, InternalMetricsServiceStart, OpsMetrics } from './types';
import { OpsMetricsCollector } from './ops_metrics_collector';
import { opsConfig, OpsConfigType } from './ops_config';
import { getEcsOpsMetricsLog } from './logging';
import { getEcsOpsMetricsLog, getEventLoopThresholdLog } from './logging';

interface MetricsServiceSetupDeps {
http: InternalHttpServiceSetup;
Expand All @@ -37,26 +37,32 @@ export class MetricsService
}

public async setup({ http }: MetricsServiceSetupDeps): Promise<InternalMetricsServiceSetup> {
const config = await this.coreContext.configService
const {
cGroupOverrides,
interval,
eventLoopDelayThreshold,
} = await this.coreContext.configService
.atPath<OpsConfigType>(opsConfig.path)
.pipe(first())
.toPromise();

const thresholdMs = eventLoopDelayThreshold.asMilliseconds();

this.metricsCollector = new OpsMetricsCollector(http.server, {
logger: this.logger,
...config.cGroupOverrides,
...cGroupOverrides,
});

await this.refreshMetrics();
await this.refreshMetrics(thresholdMs);

this.collectInterval = setInterval(() => {
this.refreshMetrics();
}, config.interval.asMilliseconds());
this.refreshMetrics(thresholdMs);
}, interval.asMilliseconds());

const metricsObservable = this.metrics$.asObservable();

this.service = {
collectionInterval: config.interval.asMilliseconds(),
collectionInterval: interval.asMilliseconds(),
getOpsMetrics$: () => metricsObservable,
};

Expand All @@ -71,10 +77,15 @@ export class MetricsService
return this.service;
}

private async refreshMetrics() {
private async refreshMetrics(thresholdMs: number) {
const metrics = await this.metricsCollector!.collect();
const { message, meta } = getEcsOpsMetricsLog(metrics);
const { message: eventLoopThresholdMessage } = getEventLoopThresholdLog(metrics, thresholdMs);

this.opsMetricsLogger.debug(message!, meta);
if (eventLoopThresholdMessage) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we may need to throttle this or else it may be logged very frequently (however often metrics are refreshed which I think is 30s by default).

this.opsMetricsLogger.warn(eventLoopThresholdMessage);
}
this.metricsCollector!.reset();
this.metrics$.next(metrics);
}
Expand Down
1 change: 1 addition & 0 deletions src/core/server/metrics/ops_config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export const opsConfig = {
path: 'ops',
schema: schema.object({
interval: schema.duration({ defaultValue: '5s' }),
eventLoopDelayThreshold: schema.duration({ defaultValue: '350ms' }),
cGroupOverrides: schema.object({
cpuPath: schema.maybe(schema.string()),
cpuAcctPath: schema.maybe(schema.string()),
Expand Down