From 3e1c02efed2bc10b5f88f3017f9940eb68533510 Mon Sep 17 00:00:00 2001 From: just-mitch <68168980+just-mitch@users.noreply.github.com> Date: Thu, 10 Oct 2024 15:26:51 +0100 Subject: [PATCH] feat: stable deployments for spartan (#9147) A collection of fixes for the spartan deployment under the "default" and "3-validators" values. The main change is that the 16- and 48-validator values files need the metrics chart to be deployed, but the smaller ones don't. This is so that we can run the KIND tests in CI without metrics (which may be unwise, since if a test fails we lose all logs). The main fix is to sleep before the boot node and validators come up, which allows time for the k8s services to assign them DNS names. Also remove the resource limits on the nodes so they run faster. --------- Co-authored-by: ludamad --- scripts/ci/get_e2e_jobs.sh | 3 ++- spartan/aztec-network/templates/_helpers.tpl | 13 ++++++--- .../aztec-network/templates/boot-node.yaml | 5 +++- .../aztec-network/templates/l2-contracts.yaml | 2 +- .../aztec-network/templates/prover-node.yaml | 25 +++++++++++++++++ spartan/aztec-network/templates/pxe.yaml | 11 ++++++++ .../aztec-network/templates/validator.yaml | 27 ++++++++++++++++++- spartan/aztec-network/values.yaml | 26 ++++++------------ .../aztec-network/values/16-validators.yaml | 10 +++++++ .../aztec-network/values/48-validators.yaml | 10 +++++++ yarn-project/end-to-end/Earthfile | 4 +-- .../end-to-end/scripts/network_test.sh | 22 ++++++++++++--- 12 files changed, 128 insertions(+), 30 deletions(-) diff --git a/scripts/ci/get_e2e_jobs.sh b/scripts/ci/get_e2e_jobs.sh index dbccca50fa2..7203bdfa432 100755 --- a/scripts/ci/get_e2e_jobs.sh +++ b/scripts/ci/get_e2e_jobs.sh @@ -7,7 +7,7 @@ cd "$(dirname "$0")"/../.. 
BRANCH=$1 LABELS=$2 -# Define the allow_list +# Define the jobs that will run on every PR allow_list=( "e2e-2-pxes" "e2e-authwit" @@ -26,6 +26,7 @@ allow_list=( "e2e-cheat-codes" "e2e-prover-fake-proofs" "e2e-lending-contract" + "kind-network-smoke" ) # Add labels from input to the allow_list diff --git a/spartan/aztec-network/templates/_helpers.tpl b/spartan/aztec-network/templates/_helpers.tpl index 087d8403356..ebc3bc5ff4f 100644 --- a/spartan/aztec-network/templates/_helpers.tpl +++ b/spartan/aztec-network/templates/_helpers.tpl @@ -67,13 +67,20 @@ http://{{ include "aztec-network.fullname" . }}-metrics.{{ .Release.Namespace }} {{- end -}} {{- define "aztec-network.otelCollectorMetricsEndpoint" -}} -http://metrics-opentelemetry-collector.metrics:4318/v1/metrics +{{- if .Values.telemetry.enabled -}} +{{- if .Values.telemetry.otelCollectorEndpoint -}} +{{- .Values.telemetry.otelCollectorEndpoint -}}/v1/metrics +{{- end -}} +{{- end -}} {{- end -}} {{- define "aztec-network.otelCollectorTracesEndpoint" -}} -http://metrics-opentelemetry-collector.metrics:4318/v1/traces +{{- if .Values.telemetry.enabled -}} +{{- if .Values.telemetry.otelCollectorEndpoint -}} +{{- .Values.telemetry.otelCollectorEndpoint -}}/v1/traces +{{- end -}} +{{- end -}} {{- end -}} - {{- define "helpers.flag" -}} diff --git a/spartan/aztec-network/templates/boot-node.yaml b/spartan/aztec-network/templates/boot-node.yaml index 4f9e4454f46..2b65ae2e6d2 100644 --- a/spartan/aztec-network/templates/boot-node.yaml +++ b/spartan/aztec-network/templates/boot-node.yaml @@ -31,11 +31,13 @@ spec: sleep 5 done echo "Ethereum node is ready!" + {{- if .Values.telemetry.enabled }} until curl --head --silent {{ include "aztec-network.otelCollectorMetricsEndpoint" . }} > /dev/null; do echo "Waiting for OpenTelemetry collector..." sleep 5 done echo "OpenTelemetry collector is ready!" 
+ {{- end }} - name: deploy-contracts image: {{ .Values.images.aztec.image }} command: @@ -56,10 +58,11 @@ spec: - name: boot-node image: {{ .Values.images.aztec.image }} command: + # sleep to allow dns name to be resolvable [ "/bin/bash", "-c", - "source /shared/contracts.env && env && node --no-warnings /usr/src/yarn-project/aztec/dest/bin/index.js start --node --archiver --sequencer --pxe", + "sleep 10 && source /shared/contracts.env && env && node --no-warnings /usr/src/yarn-project/aztec/dest/bin/index.js start --node --archiver --sequencer --pxe", ] livenessProbe: exec: diff --git a/spartan/aztec-network/templates/l2-contracts.yaml b/spartan/aztec-network/templates/l2-contracts.yaml index 9d3275f9309..cb81b52adaa 100644 --- a/spartan/aztec-network/templates/l2-contracts.yaml +++ b/spartan/aztec-network/templates/l2-contracts.yaml @@ -46,7 +46,7 @@ metadata: data: deploy-contracts.sh: | #!/bin/sh - set -e + set -ex # Run the deploy-l1-contracts command and capture the output output=$(node --no-warnings /usr/src/yarn-project/aztec/dest/bin/index.js deploy-l1-contracts --validators {{ join "," .Values.validator.validatorAddresses | quote }}) diff --git a/spartan/aztec-network/templates/prover-node.yaml b/spartan/aztec-network/templates/prover-node.yaml index 9f8c1028557..ac7c23f6d7e 100644 --- a/spartan/aztec-network/templates/prover-node.yaml +++ b/spartan/aztec-network/templates/prover-node.yaml @@ -18,6 +18,30 @@ spec: app: prover-node spec: initContainers: + - name: wait-for-boot-node + image: {{ .Values.images.curl.image }} + command: + - /bin/sh + - -c + - | + until curl -s -X POST -H 'Content-Type: application/json' \ + -d '{"jsonrpc":"2.0","method":"web3_clientVersion","params":[],"id":67}' \ + {{ include "aztec-network.ethereumHost" . }} | grep -q anvil; do + echo "Waiting for Ethereum node..." + sleep 5 + done + echo "Ethereum node is ready!" 
+ {{- if .Values.telemetry.enabled }} + until curl --head --silent {{ include "aztec-network.otelCollectorMetricsEndpoint" . }} > /dev/null; do + echo "Waiting for OpenTelemetry collector..." + sleep 5 + done + echo "OpenTelemetry collector is ready!" + {{- end }} + until curl --head --silent {{ include "aztec-network.bootNodeUrl" . }}/status; do + echo "Waiting for boot node..." + sleep 5 + done - name: configure-prover-env image: "{{ .Values.images.aztec.image }}" imagePullPolicy: {{ .Values.images.aztec.pullPolicy }} @@ -33,6 +57,7 @@ spec: env: - name: ETHEREUM_HOST value: {{ include "aztec-network.ethereumHost" . | quote }} + containers: - name: prover-node image: "{{ .Values.images.aztec.image }}" diff --git a/spartan/aztec-network/templates/pxe.yaml b/spartan/aztec-network/templates/pxe.yaml index 4066b796e2d..281160e4d2d 100644 --- a/spartan/aztec-network/templates/pxe.yaml +++ b/spartan/aztec-network/templates/pxe.yaml @@ -17,6 +17,17 @@ spec: {{- include "aztec-network.selectorLabels" . | nindent 8 }} app: pxe spec: + initContainers: + - name: wait-for-boot-node + image: {{ .Values.images.curl.image }} + command: + - /bin/sh + - -c + - | + until curl --head --silent {{ include "aztec-network.bootNodeUrl" . }}/status; do + echo "Waiting for boot node..." 
+ sleep 5 + done containers: - name: pxe image: "{{ .Values.images.aztec.image }}" diff --git a/spartan/aztec-network/templates/validator.yaml b/spartan/aztec-network/templates/validator.yaml index 900c23b3cca..efde67dc633 100644 --- a/spartan/aztec-network/templates/validator.yaml +++ b/spartan/aztec-network/templates/validator.yaml @@ -21,6 +21,31 @@ spec: # We expect the validators to have already been added to the smart contract by this point - but this container still needs # to be run in order to get the values initContainers: + - name: wait-for-boot-node + image: {{ .Values.images.curl.image }} + command: + - /bin/sh + - -c + - | + until curl -s -X POST -H 'Content-Type: application/json' \ + -d '{"jsonrpc":"2.0","method":"web3_clientVersion","params":[],"id":67}' \ + {{ include "aztec-network.ethereumHost" . }} | grep -q anvil; do + echo "Waiting for Ethereum node..." + sleep 5 + done + echo "Ethereum node is ready!" + {{- if .Values.telemetry.enabled }} + until curl --head --silent {{ include "aztec-network.otelCollectorMetricsEndpoint" . }} > /dev/null; do + echo "Waiting for OpenTelemetry collector..." + sleep 5 + done + echo "OpenTelemetry collector is ready!" + {{- end }} + until curl --head --silent {{ include "aztec-network.bootNodeUrl" . }}/status; do + echo "Waiting for boot node..." 
+ sleep 5 + done + - name: configure-validator-env image: "{{ .Values.images.aztec.image }}" imagePullPolicy: {{ .Values.images.aztec.pullPolicy }} @@ -50,7 +75,7 @@ spec: command: - "/bin/bash" - "-c" - - "source /shared/contracts.env && env && node --no-warnings /usr/src/yarn-project/aztec/dest/bin/index.js start --node --archiver --sequencer" + - "sleep 10 && source /shared/contracts.env && env && node --no-warnings /usr/src/yarn-project/aztec/dest/bin/index.js start --node --archiver --sequencer" volumeMounts: - name: shared-volume mountPath: /shared diff --git a/spartan/aztec-network/values.yaml b/spartan/aztec-network/values.yaml index 4b9a711ba7b..913e8bea1bd 100644 --- a/spartan/aztec-network/values.yaml +++ b/spartan/aztec-network/values.yaml @@ -2,6 +2,10 @@ network: public: false enableBots: true +telemetry: + enabled: false + otelCollectorEndpoint: + images: aztec: image: aztecprotocol/aztec @@ -31,13 +35,14 @@ bootNode: requests: memory: "2Gi" cpu: "200m" - limits: - memory: "4Gi" - cpu: "4" storage: "8Gi" validator: replicas: 1 + validatorKeys: + - 0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80 + validatorAddresses: + - 0xf39Fd6e51aad88F6F4ce6aB8827279cffFb92266 service: p2pPort: 40400 nodePort: 8080 @@ -54,9 +59,6 @@ validator: requests: memory: "2Gi" cpu: "200m" - limits: - memory: "28Gi" - cpu: "7" storage: "8Gi" proverNode: @@ -71,9 +73,6 @@ proverNode: requests: memory: "2Gi" cpu: "200m" - limits: - memory: "120Gi" - cpu: "15" storage: "8Gi" pxe: @@ -93,9 +92,6 @@ pxe: requests: memory: "2Gi" cpu: "200m" - limits: - memory: "4Gi" - cpu: "1" bot: logLevel: "debug" @@ -124,9 +120,6 @@ bot: requests: memory: "2Gi" cpu: "200m" - limits: - memory: "4Gi" - cpu: "1" ethereum: replicas: 1 @@ -152,7 +145,4 @@ ethereum: requests: memory: "2Gi" cpu: "200m" - limits: - memory: "4Gi" - cpu: "1" storage: "8Gi" diff --git a/spartan/aztec-network/values/16-validators.yaml b/spartan/aztec-network/values/16-validators.yaml index 
1856034dd05..c44e8c72c39 100644 --- a/spartan/aztec-network/values/16-validators.yaml +++ b/spartan/aztec-network/values/16-validators.yaml @@ -1,3 +1,13 @@ +########## +# BEWARE # +########## +# You need to deploy the metrics helm chart before using this values file. +# head to spartan/metrics and run `./install.sh` +# (then `./forward.sh` if you want to see it) +telemetry: + enabled: true + otelCollectorEndpoint: http://metrics-opentelemetry-collector.metrics:4318 + bootNode: sequencer: minTxsPerBlock: 4 diff --git a/spartan/aztec-network/values/48-validators.yaml b/spartan/aztec-network/values/48-validators.yaml index 3dd5a2cadf3..ded4e3f1361 100644 --- a/spartan/aztec-network/values/48-validators.yaml +++ b/spartan/aztec-network/values/48-validators.yaml @@ -1,3 +1,13 @@ +########## +# BEWARE # +########## +# You need to deploy the metrics helm chart before using this values file. +# head to spartan/metrics and run `./install.sh` +# (then `./forward.sh` if you want to see it) +telemetry: + enabled: true + otelCollectorEndpoint: http://metrics-opentelemetry-collector.metrics:4318 + validator: debug: "aztec:*,-aztec:avm_simulator:*,-aztec:libp2p_service" replicas: 48 diff --git a/yarn-project/end-to-end/Earthfile b/yarn-project/end-to-end/Earthfile index a8346f103d8..1c0ce337a3b 100644 --- a/yarn-project/end-to-end/Earthfile +++ b/yarn-project/end-to-end/Earthfile @@ -292,12 +292,12 @@ e2e-cli-wallet: LOCALLY RUN COMPOSE_FILE=scripts/docker-compose-wallet.yml ./scripts/e2e_compose_test.sh e2e_cli_wallet -network-smoke: +kind-network-smoke: ARG values_file LOCALLY RUN NAMESPACE=smoke FRESH_INSTALL=true VALUES_FILE=${values_file:-default.yaml} ./scripts/network_test.sh ./src/spartan/smoke.test.ts -network-transfer: +kind-network-transfer: ARG values_file LOCALLY RUN NAMESPACE=transfer FRESH_INSTALL=true VALUES_FILE=${values_file:-default.yaml} ./scripts/network_test.sh ./src/spartan/transfer.test.ts diff --git a/yarn-project/end-to-end/scripts/network_test.sh 
b/yarn-project/end-to-end/scripts/network_test.sh index 72cfef176ef..0b0bf816562 100755 --- a/yarn-project/end-to-end/scripts/network_test.sh +++ b/yarn-project/end-to-end/scripts/network_test.sh @@ -16,6 +16,13 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Main positional parameter TEST="$1" +REPO=$(git rev-parse --show-toplevel) +if [ "$(uname)" = "Linux" ] && [ "$(uname -m)" = "x86_64" ]; then + "$REPO"/spartan/scripts/setup_local_k8s.sh +else + echo "Not on x64 Linux, not installing k8s and helm." +fi + # Default values for environment variables VALUES_FILE="${VALUES_FILE:-default.yaml}" CHAOS_VALUES="${CHAOS_VALUES:-}" @@ -55,12 +62,13 @@ function show_status_until_pxe_ready() { } show_status_until_pxe_ready & +SHOW_STATUS_PID=$! # Install the Helm chart -helm upgrade --install spartan "$(git rev-parse --show-toplevel)/spartan/aztec-network/" \ +helm upgrade --install spartan "$REPO/spartan/aztec-network/" \ --namespace "$NAMESPACE" \ --create-namespace \ - --values "$(git rev-parse --show-toplevel)/spartan/aztec-network/values/$VALUES_FILE" \ + --values "$REPO/spartan/aztec-network/values/$VALUES_FILE" \ --set images.aztec.image="aztecprotocol/aztec:$AZTEC_DOCKER_TAG" \ --set ingress.enabled=true \ --wait \ @@ -71,8 +79,16 @@ kubectl wait pod -l app==pxe --for=condition=Ready -n "$NAMESPACE" --timeout=10m # tunnel in to get access directly to our PXE service in k8s (kubectl port-forward --namespace $NAMESPACE svc/spartan-aztec-network-pxe 9082:8080 2>/dev/null >/dev/null || true) & +PORT_FORWARD_PID=$! + +cleanup() { + echo "Cleaning up..." + kill $PORT_FORWARD_PID || true + kill $SHOW_STATUS_PID || true +} + +trap cleanup EXIT SIGINT SIGTERM -# run our test in the host network namespace (so we can access the above with localhost) docker run --rm --network=host \ -e PXE_URL=http://localhost:9082 \ -e DEBUG="aztec:*" \