Skip to content

Commit

Permalink
feat: monitor event loop lag (#11127)
Browse files Browse the repository at this point in the history
This PR adds an event loop monitor that can be viewed in Grafana.
  • Loading branch information
alexghr authored Jan 9, 2025
1 parent 1c23662 commit 422f125
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 0 deletions.
49 changes: 49 additions & 0 deletions yarn-project/telemetry-client/src/event_loop_monitor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import { promiseWithResolvers } from '@aztec/foundation/promise';
import { Timer } from '@aztec/foundation/timer';

import { EVENT_LOOP_LAG } from './metrics.js';
import { type Meter, type ObservableGauge, type ObservableResult, ValueType } from './telemetry.js';

/**
 * Reports event loop lag through an observable gauge.
 *
 * While started, the monitor keeps a callback registered on the gauge. When that
 * callback runs it measures how long a `setImmediate` callback takes to be
 * scheduled and records the delay in microseconds.
 */
export class EventLoopMonitor {
  private readonly eventLoopLag: ObservableGauge;
  /** True while `measureLag` is registered on the gauge. */
  private started = false;

  /**
   * @param meter - Meter used to create the event loop lag gauge.
   */
  constructor(meter: Meter) {
    this.eventLoopLag = meter.createObservableGauge(EVENT_LOOP_LAG, {
      unit: 'us',
      valueType: ValueType.INT,
      description: 'How busy is the event loop',
    });
  }

  /**
   * Registers the lag measurement callback on the gauge.
   * Calling this on an already started monitor is a no-op, so the callback is never registered twice.
   */
  start(): void {
    if (this.started) {
      return;
    }
    this.started = true;
    this.eventLoopLag.addCallback(this.measureLag);
  }

  /**
   * Removes the lag measurement callback from the gauge.
   * Calling this on a monitor that is not started is a no-op.
   */
  stop(): void {
    if (!this.started) {
      return;
    }
    this.started = false;
    this.eventLoopLag.removeCallback(this.measureLag);
  }

  /**
   * Measures the delay until the next `setImmediate` callback runs and records it on the gauge.
   * Declared as an arrow function property so the same function reference is passed to both
   * `addCallback` and `removeCallback`.
   *
   * @param obs - Observable result that receives the measured lag, floored to an integer.
   */
  private measureLag = async (obs: ObservableResult): Promise<void> => {
    const timer = new Timer();
    const { promise, resolve } = promiseWithResolvers<number>();
    // how long does it take to schedule the next macro task?
    // if this number spikes then we're (1) either blocking the event loop with long running sync code
    // or (2) spamming the event loop with micro tasks
    setImmediate(() => {
      resolve(timer.us());
    });

    const lag = await promise;
    obs.observe(Math.floor(lag));
  };
}
2 changes: 2 additions & 0 deletions yarn-project/telemetry-client/src/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,5 @@ export const PROOF_VERIFIER_COUNT = 'aztec.proof_verifier.count';

export const VALIDATOR_RE_EXECUTION_TIME = 'aztec.validator.re_execution_time';
export const VALIDATOR_FAILED_REEXECUTION_COUNT = 'aztec.validator.failed_reexecution_count';

// Name of the observable gauge created by EventLoopMonitor; that gauge is registered with unit 'us'.
export const EVENT_LOOP_LAG = 'aztec.event_loop_lag';
9 changes: 9 additions & 0 deletions yarn-project/telemetry-client/src/otel.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@ import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace
import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from '@opentelemetry/semantic-conventions';

import { type TelemetryClientConfig } from './config.js';
import { EventLoopMonitor } from './event_loop_monitor.js';
import { linearBuckets } from './histogram_utils.js';
import { registerOtelLoggerProvider } from './otel_logger_provider.js';
import { getOtelResource } from './otel_resource.js';
import { type Gauge, type TelemetryClient } from './telemetry.js';

export class OpenTelemetryClient implements TelemetryClient {
hostMetrics: HostMetrics | undefined;
eventLoopMonitor: EventLoopMonitor | undefined;
targetInfo: Gauge | undefined;
private meters: Map<string, Meter> = new Map<string, Meter>();
private tracers: Map<string, Tracer> = new Map<string, Tracer>();
Expand Down Expand Up @@ -87,6 +89,10 @@ export class OpenTelemetryClient implements TelemetryClient {
meterProvider: this.meterProvider,
});

this.eventLoopMonitor = new EventLoopMonitor(
this.meterProvider.getMeter(this.resource.attributes[ATTR_SERVICE_NAME] as string),
);

// See these two links for more information on providing target information:
// https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/#resource-attributes
// https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#supporting-target-metadata-in-both-push-based-and-pull-based-systems
Expand All @@ -96,6 +102,7 @@ export class OpenTelemetryClient implements TelemetryClient {

this.targetInfo.record(1, this.resource.attributes);
this.hostMetrics.start();
this.eventLoopMonitor.start();
}

public isEnabled() {
Expand All @@ -111,6 +118,8 @@ export class OpenTelemetryClient implements TelemetryClient {
}

public async stop() {
this.eventLoopMonitor?.stop();

const flushAndShutdown = async (provider: { forceFlush: () => Promise<void>; shutdown: () => Promise<void> }) => {
await provider.forceFlush();
await provider.shutdown();
Expand Down

0 comments on commit 422f125

Please sign in to comment.