Skip to content

Commit

Permalink
feat: persistence in helm chart for validator and boot node (#10543)
Browse files Browse the repository at this point in the history
chore: give validators/boot-nodes 100Gi in network configs
feat: allow metrics to be instantly flushed
chore: flush archiver metrics on startup
feat: allow making range queries to prometheus in tests
  • Loading branch information
just-mitch authored and lucasxia01 committed Dec 11, 2024
1 parent d2259ff commit ed67b4c
Show file tree
Hide file tree
Showing 17 changed files with 120 additions and 18 deletions.
13 changes: 13 additions & 0 deletions spartan/aztec-network/templates/boot-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ spec:
matchLabels:
{{- include "aztec-network.selectorLabels" . | nindent 6 }}
app: boot-node
volumeClaimTemplates:
- metadata:
name: boot-node-data
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: {{ .Values.bootNode.storageSize }}
template:
metadata:
labels:
Expand Down Expand Up @@ -119,6 +127,8 @@ spec:
mountPath: /shared/p2p
- name: config
mountPath: /shared/config
- name: boot-node-data
mountPath: {{ .Values.bootNode.dataDir }}
{{- if .Values.bootNode.deployContracts }}
- name: scripts-output
mountPath: /shared/contracts
Expand Down Expand Up @@ -182,6 +192,9 @@ spec:
emptyDir: {}
- name: config
emptyDir: {}
# NOTE(review): this template also declares a volumeClaimTemplates entry named
# boot-node-data. Per the StatefulSet API, a claim template takes precedence over
# a pod-template volume with the same name, so this entry is presumably
# redundant — confirm before relying on or removing it.
- name: boot-node-data
persistentVolumeClaim:
claimName: boot-node-data
{{- if .Values.bootNode.deployContracts }}
- name: scripts
configMap:
Expand Down
14 changes: 13 additions & 1 deletion spartan/aztec-network/templates/validator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ spec:
matchLabels:
{{- include "aztec-network.selectorLabels" . | nindent 6 }}
app: validator
volumeClaimTemplates:
- metadata:
name: validator-data
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: {{ .Values.validator.storageSize }}
template:
metadata:
labels:
Expand Down Expand Up @@ -53,7 +61,6 @@ spec:
{{- end }}
if [ "{{ .Values.validator.dynamicBootNode }}" = "true" ]; then
# Get the list of pod IPs for the validator service
echo "{{ include "aztec-network.pxeUrl" . }}" > /shared/pxe/pxe_url
else
until curl --silent --head --fail "${BOOT_NODE_HOST}/status" > /dev/null; do
Expand Down Expand Up @@ -136,6 +143,8 @@ spec:
mountPath: /shared/p2p
- name: config
mountPath: /shared/config
- name: validator-data
mountPath: {{ .Values.validator.dataDir }}
env:
- name: POD_IP
valueFrom:
Expand Down Expand Up @@ -197,6 +206,9 @@ spec:
emptyDir: {}
- name: config
emptyDir: {}
# NOTE(review): this template also declares a volumeClaimTemplates entry named
# validator-data. Per the StatefulSet API, a claim template takes precedence over
# a pod-template volume with the same name, so this entry is presumably
# redundant — confirm before relying on or removing it.
- name: validator-data
persistentVolumeClaim:
claimName: validator-data
---
# If this is not a public network, create a headless service for StatefulSet DNS entries
{{ if not .Values.network.public }}
Expand Down
6 changes: 5 additions & 1 deletion spartan/aztec-network/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ bootNode:
outboxAddress: ""
feeJuiceAddress: ""
feeJuicePortalAddress: ""
storage: "8Gi"
stakingAssetAddress: ""
storageSize: "1Gi"
dataDir: "/data"

validator:
# If true, the validator will use its peers to serve as the boot node.
Expand Down Expand Up @@ -108,6 +110,8 @@ validator:
requests:
memory: "2Gi"
cpu: "200m"
storageSize: "1Gi"
dataDir: "/data"

proverNode:
externalHost: ""
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/values/4-validators-with-metrics.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ validator:
- 0x90F79bf6EB2c4f870365E785982E1f101E93b906
validator:
disabled: false
sequencer:
enforceTimeTable: false

bootNode:
validator:
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/values/exp-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ images:
pullPolicy: Always

validator:
storageSize: "100Gi"
replicas: 48
validatorKeys:
- 0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80
Expand Down Expand Up @@ -124,6 +125,7 @@ validator:

bootNode:
peerIdPrivateKey: 080212200ba8451c6d62b03c4441f0a466c0bce7a3a595f2cf50a055ded3305c77aa3af0
storageSize: "100Gi"
validator:
disabled: true

Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/values/rc-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ telemetry:
otelCollectorEndpoint: http://35.197.100.168:4318

validator:
storageSize: "100Gi"
replicas: 48
validatorKeys:
- 0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80
Expand Down Expand Up @@ -125,6 +126,7 @@ bootNode:
peerIdPrivateKey: 080212200ba8451c6d62b03c4441f0a466c0bce7a3a595f2cf50a055ded3305c77aa3af0
validator:
disabled: true
storageSize: "100Gi"

proverAgent:
replicas: 8
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/values/rc-2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ telemetry:

validator:
replicas: 48
storageSize: "100Gi"
validatorKeys:
- 0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80
- 0x59c6995e998f97a5a0044966f0945389dc9e86dae88c7a8412f4603b6b78690d
Expand Down Expand Up @@ -122,6 +123,7 @@ validator:
disabled: false

bootNode:
storageSize: "100Gi"
peerIdPrivateKey: 080212200ba8451c6d62b03c4441f0a466c0bce7a3a595f2cf50a055ded3305c77aa3af0
validator:
disabled: true
Expand Down
2 changes: 1 addition & 1 deletion yarn-project/archiver/src/archiver/archiver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ export class Archiver implements ArchiveSource {
pollingIntervalMs: config.archiverPollingIntervalMS ?? 10_000,
batchSize: config.archiverBatchSize ?? 100,
},
new ArchiverInstrumentation(telemetry, () => archiverStore.estimateSize()),
await ArchiverInstrumentation.new(telemetry, () => archiverStore.estimateSize()),
{ l1StartBlock, l1GenesisTime, epochDuration, slotDuration, ethereumSlotDuration },
);
await archiver.start(blockUntilSynced);
Expand Down
19 changes: 18 additions & 1 deletion yarn-project/archiver/src/archiver/instrumentation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@ export class ArchiverInstrumentation {
private blockHeight: Gauge;
private blockSize: Gauge;
private syncDuration: Histogram;
private l1BlocksSynced: UpDownCounter;
private proofsSubmittedDelay: Histogram;
private proofsSubmittedCount: UpDownCounter;
private dbMetrics: LmdbMetrics;

private log = createLogger('archiver:instrumentation');

constructor(private telemetry: TelemetryClient, lmdbStats?: LmdbStatsCallback) {
private constructor(private telemetry: TelemetryClient, lmdbStats?: LmdbStatsCallback) {
const meter = telemetry.getMeter('Archiver');
this.blockHeight = meter.createGauge(Metrics.ARCHIVER_BLOCK_HEIGHT, {
description: 'The height of the latest block processed by the archiver',
Expand Down Expand Up @@ -59,6 +60,11 @@ export class ArchiverInstrumentation {
},
});

this.l1BlocksSynced = meter.createUpDownCounter(Metrics.ARCHIVER_L1_BLOCKS_SYNCED, {
description: 'Number of blocks synced from L1',
valueType: ValueType.INT,
});

this.dbMetrics = new LmdbMetrics(
meter,
{
Expand All @@ -77,13 +83,24 @@ export class ArchiverInstrumentation {
);
}

/**
 * Async factory for the instrumentation.
 *
 * Builds the instance, records a zero delta on the L1-blocks-synced counter,
 * and waits for the telemetry client to flush before handing the instance back.
 *
 * @param telemetry - Telemetry client passed to the constructor and flushed here.
 * @param lmdbStats - Optional LMDB stats callback, forwarded to the constructor as-is.
 * @returns The constructed instrumentation, once the flush has completed.
 */
public static async new(telemetry: TelemetryClient, lmdbStats?: LmdbStatsCallback) {
  const instrumentation = new ArchiverInstrumentation(telemetry, lmdbStats);
  // NOTE(review): presumably emitted so the counter is exported with an initial
  // value right at startup — confirm.
  instrumentation.l1BlocksSynced.add(0);
  await instrumentation.telemetry.flush();
  return instrumentation;
}

public isEnabled(): boolean {
return this.telemetry.isEnabled();
}

public processNewBlocks(syncTimePerBlock: number, blocks: L2Block[]) {
this.syncDuration.record(Math.ceil(syncTimePerBlock));
this.blockHeight.record(Math.max(...blocks.map(b => b.number)));
this.l1BlocksSynced.add(blocks.length);
for (const block of blocks) {
this.blockSize.record(block.body.txEffects.length);
}
Expand Down
2 changes: 1 addition & 1 deletion yarn-project/end-to-end/scripts/network_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -180,5 +180,5 @@ docker run --rm --network=host \
-e GRAFANA_PASSWORD=$GRAFANA_PASSWORD \
-e DEBUG=${DEBUG:-""} \
-e LOG_JSON=1 \
-e LOG_LEVEL=verbose \
-e LOG_LEVEL=${LOG_LEVEL:-"verbose"} \
aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG $TEST
30 changes: 26 additions & 4 deletions yarn-project/end-to-end/src/quality_of_service/alert_checker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ import * as yaml from 'js-yaml';
export interface AlertConfig {
alert: string;
expr: string;
start?: number;
end?: number;
step?: number;
for: string;
labels: Record<string, string>;
annotations: Record<string, string>;
Expand All @@ -18,7 +21,7 @@ export interface AlertCheckerConfig {

// This config is good if you're running the otel-lgtm stack locally
const DEFAULT_CONFIG: AlertCheckerConfig = {
grafanaEndpoint: 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1/query',
grafanaEndpoint: 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1',
grafanaCredentials: 'admin:admin',
};

Expand All @@ -41,10 +44,29 @@ export class AlertChecker {
return data.alerts;
}

private async queryGrafana(expr: string): Promise<number> {
private async queryGrafana({ expr, start, end, step }: AlertConfig): Promise<number> {
const credentials = Buffer.from(this.config.grafanaCredentials).toString('base64');

const response = await fetch(`${this.config.grafanaEndpoint}?query=${encodeURIComponent(expr)}`, {
let query = `query=${encodeURIComponent(expr)}`;
let action = 'query';

if (start) {
action = 'query_range';
query += `&start=${start}`;
}

if (end) {
query += `&end=${end}`;
}

if (step) {
query += `&step=${step}`;
}

const urlString = `${this.config.grafanaEndpoint}/${action}?${query}`;
this.logger.debug(`Querying Grafana: ${urlString}`);

const response = await fetch(urlString, {
headers: {
Authorization: `Basic ${credentials}`,
},
Expand All @@ -65,7 +87,7 @@ export class AlertChecker {
for (const alert of alerts) {
this.logger.info(`Checking alert: ${JSON.stringify(alert)}`);

const metricValue = await this.queryGrafana(alert.expr);
const metricValue = await this.queryGrafana(alert);
this.logger.info(`Metric value: ${metricValue}`);
if (metricValue > 0) {
this.logger.error(`Alert ${alert.alert} triggered! Value: ${metricValue}`);
Expand Down
22 changes: 15 additions & 7 deletions yarn-project/end-to-end/src/spartan/gating-passive.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ const qosAlerts: AlertConfig[] = [
for: '10m',
annotations: {},
},
{
// Checks that we are not syncing from scratch each time we reboot
alert: 'ArchiverL1BlocksSynced',
expr: 'rate(aztec_archiver_l1_blocks_synced[1m]) > 0.5',
labels: { severity: 'error' },
for: '10m',
annotations: {},
},
];

const config = setupEnvironment(process.env);
Expand All @@ -52,6 +60,12 @@ describe('a test that passively observes the network in the presence of network
const MAX_MISSED_SLOT_PERCENT = 0.6;

afterAll(async () => {
await startPortForward({
resource: `svc/metrics-grafana`,
namespace: 'metrics',
containerPort: config.CONTAINER_METRICS_PORT,
hostPort: config.HOST_METRICS_PORT,
});
await runAlertCheck(config, qosAlerts, debugLogger);
});

Expand All @@ -69,12 +83,6 @@ describe('a test that passively observes the network in the presence of network
hostPort: HOST_ETHEREUM_PORT,
});

await startPortForward({
resource: `svc/metrics-grafana`,
namespace: 'metrics',
containerPort: config.CONTAINER_METRICS_PORT,
hostPort: config.HOST_METRICS_PORT,
});
const client = await createCompatibleClient(PXE_URL, debugLogger);
const ethCheatCodes = new EthCheatCodes(ETHEREUM_HOST);
const rollupCheatCodes = new RollupCheatCodes(
Expand All @@ -93,7 +101,7 @@ describe('a test that passively observes the network in the presence of network
// note, don't forget that normally an epoch doesn't need epochDuration worth of blocks,
// but here we do double duty:
// we want a handful of blocks, and we want to pass the epoch boundary
await awaitL2BlockNumber(rollupCheatCodes, epochDuration, 60 * 5, debugLogger);
await awaitL2BlockNumber(rollupCheatCodes, epochDuration, 60 * 6, debugLogger);

let deploymentOutput: string = '';
deploymentOutput = await applyNetworkShaping({
Expand Down
2 changes: 1 addition & 1 deletion yarn-project/end-to-end/src/spartan/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ const k8sLocalConfigSchema = z.object({
HOST_METRICS_PORT: z.coerce.number().min(1, 'HOST_METRICS_PORT env variable must be set'),
CONTAINER_METRICS_PORT: z.coerce.number().default(80),
GRAFANA_PASSWORD: z.string().min(1, 'GRAFANA_PASSWORD env variable must be set'),
METRICS_API_PATH: z.string().default('/api/datasources/proxy/uid/spartan-metrics-prometheus/api/v1/query'),
METRICS_API_PATH: z.string().default('/api/datasources/proxy/uid/spartan-metrics-prometheus/api/v1'),
SPARTAN_DIR: z.string().min(1, 'SPARTAN_DIR env variable must be set'),
K8S: z.literal('local'),
});
Expand Down
1 change: 1 addition & 0 deletions yarn-project/telemetry-client/src/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ export const MEMPOOL_PROVER_QUOTE_COUNT = 'aztec.mempool.prover_quote_count';
export const MEMPOOL_PROVER_QUOTE_SIZE = 'aztec.mempool.prover_quote_size';

export const ARCHIVER_SYNC_DURATION = 'aztec.archiver.sync_duration';
export const ARCHIVER_L1_BLOCKS_SYNCED = 'aztec.archiver.l1_blocks_synced';
export const ARCHIVER_BLOCK_HEIGHT = 'aztec.archiver.block_height';
export const ARCHIVER_BLOCK_SIZE = 'aztec.archiver.block_size';
export const ARCHIVER_ROLLUP_PROOF_DELAY = 'aztec.archiver.rollup_proof_delay';
Expand Down
4 changes: 4 additions & 0 deletions yarn-project/telemetry-client/src/noop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ export class NoopTelemetryClient implements TelemetryClient {
return Promise.resolve();
}

/**
 * No-op flush: performs no work and returns an already-resolved promise.
 */
flush(): Promise<void> {
return Promise.resolve();
}

isEnabled() {
return false;
}
Expand Down
8 changes: 8 additions & 0 deletions yarn-project/telemetry-client/src/otel.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,14 @@ export class OpenTelemetryClient implements TelemetryClient {
return true;
}

/**
 * Force-flushes the meter and logger providers, plus the trace provider when
 * it is a NodeTracerProvider, and resolves once all of them have finished.
 * Rejects if any individual flush rejects.
 */
public async flush() {
  const pending = [this.meterProvider.forceFlush(), this.loggerProvider.forceFlush()];
  // Only a NodeTracerProvider is flushed here; any other trace provider is skipped.
  if (this.traceProvider instanceof NodeTracerProvider) {
    pending.push(this.traceProvider.forceFlush());
  }
  await Promise.all(pending);
}

public async stop() {
const flushAndShutdown = async (provider: { forceFlush: () => Promise<void>; shutdown: () => Promise<void> }) => {
await provider.forceFlush();
Expand Down
7 changes: 6 additions & 1 deletion yarn-project/telemetry-client/src/telemetry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import {
import * as Attributes from './attributes.js';
import * as Metrics from './metrics.js';

export { ValueType, Span } from '@opentelemetry/api';
export { Span, ValueType } from '@opentelemetry/api';

type ValuesOf<T> = T extends Record<string, infer U> ? U : never;

Expand Down Expand Up @@ -115,6 +115,11 @@ export interface TelemetryClient {
* Stops the telemetry client.
*/
stop(): Promise<void>;

/**
* Flushes the telemetry client.
*/
flush(): Promise<void>;
}

/** Objects that adhere to this interface can use @trackSpan */
Expand Down

0 comments on commit ed67b4c

Please sign in to comment.