From 988b2c6ae8b43e2075fe815a7c9cb31f3033530e Mon Sep 17 00:00:00 2001
From: Mitch
Date: Tue, 10 Dec 2024 12:33:57 -0500
Subject: [PATCH 1/4] feat: deploy metrics via CI and terraform

fix: don't use loadbalancer for loki
---
# Review notes for PATCH 1/4. They sit between the "---" marker and the first
# "diff --git" line, i.e. outside the commit message and outside the diff.
#
# What this patch does:
# - Adds .github/workflows/metrics-deploy.yml, a workflow_call /
#   workflow_dispatch workflow that authenticates to Google Cloud, fetches GKE
#   credentials, creates the Terraform state bucket if `gsutil ls` fails, then
#   runs terraform init, an optional destroy, plan and apply inside
#   spartan/terraform/deploy-metrics.
# - Adds that Terraform directory: main.tf declares a GCS backend, aliased
#   kubernetes/helm providers and one helm_release that installs the local
#   `metrics` chart with values read from ../../metrics/values/<VALUES_FILE>.
#   data.tf and outputs.tf are added containing a single empty line each.
# - Removes the Loki `gateway.service.type: LoadBalancer` override from
#   spartan/metrics/values/prod.yaml.
#
# NOTE(review): REGION is set to `us-west1-a`, which looks like a zone name,
#   and is passed to `gcloud ... get-credentials --region`; the state bucket is
#   created with `-l us-east4` -- confirm both are intended.
# NOTE(review): variables.tf defaults GKE_CLUSTER_CONTEXT to
#   `gke_testnet-440309_us-east4-a_spartan-gke`, while the workflow always
#   passes `gke_testnet-440309_us-west1-a_aztec-gke` -- confirm the default.
# NOTE(review): the comment above the job `env:` block says the variables are
#   based on the branch name, but every value comes from `inputs` or a constant;
#   CHART_PATH is not referenced by any step in this workflow.
# NOTE(review): `${{ env.* }}` / `${{ inputs.* }}` are expanded directly into
#   `run:` scripts; the job-level env values are presumably also available as
#   shell variables (e.g. "$NAMESPACE") -- consider using those instead.
# NOTE(review): the workflow_call description for `values_file` still says
#   `1-validators.yaml` here; PATCH 4/4 changes it to `prod.yaml`.
 .github/workflows/metrics-deploy.yml          | 143 ++++++++++++++++++
 spartan/metrics/values/prod.yaml              |   4 -
 spartan/terraform/deploy-metrics/data.tf      |   1 +
 spartan/terraform/deploy-metrics/main.tf      |  52 +++++++
 spartan/terraform/deploy-metrics/outputs.tf   |   1 +
 spartan/terraform/deploy-metrics/variables.tf |  15 ++
 6 files changed, 212 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/metrics-deploy.yml
 create mode 100644 spartan/terraform/deploy-metrics/data.tf
 create mode 100644 spartan/terraform/deploy-metrics/main.tf
 create mode 100644 spartan/terraform/deploy-metrics/outputs.tf
 create mode 100644 spartan/terraform/deploy-metrics/variables.tf

diff --git a/.github/workflows/metrics-deploy.yml b/.github/workflows/metrics-deploy.yml
new file mode 100644
index 00000000000..779e0cbad17
--- /dev/null
+++ b/.github/workflows/metrics-deploy.yml
@@ -0,0 +1,143 @@
+name: Aztec Metrics Stack Deployment
+
+on:
+  workflow_call:
+    inputs:
+      namespace:
+        description: The namespace to deploy to, e.g. metrics
+        required: true
+        type: string
+        default: metrics
+      values_file:
+        description: The values file to use, e.g. 1-validators.yaml
+        required: true
+        type: string
+        default: "prod.yaml"
+      respect_tf_lock:
+        description: Whether to respect the Terraform lock
+        required: false
+        type: string
+        default: "true"
+      run_terraform_destroy:
+        description: Whether to run terraform destroy before deploying
+        required: false
+        type: string
+        default: "false"
+      ref:
+        description: The branch name to deploy from
+        required: false
+        type: string
+        default: "master"
+    secrets:
+      GCP_SA_KEY:
+        required: true
+  workflow_dispatch:
+    inputs:
+      namespace:
+        description: The namespace to deploy to, e.g. metrics
+        required: true
+        default: metrics
+      values_file:
+        description: The values file to use, e.g. prod.yaml
+        required: true
+        default: "prod.yaml"
+      respect_tf_lock:
+        description: Whether to respect the Terraform lock
+        required: false
+        default: "true"
+      run_terraform_destroy:
+        description: Whether to run terraform destroy before deploying
+        required: false
+        default: "false"
+      ref:
+        description: The branch name to deploy from
+        required: false
+        default: "master"
+
+jobs:
+  metrics_deployment:
+    # This job will run on Ubuntu
+    runs-on: ubuntu-latest
+    concurrency:
+      group: deploy-${{ github.ref }} # Only one job per branch
+      cancel-in-progress: false # Allow previous deployment to complete to avoid corruption
+
+    # Set up a variable based on the branch name
+    env:
+      NAMESPACE: ${{ inputs.namespace }}
+      VALUES_FILE: ${{ inputs.values_file }}
+      CHART_PATH: ./spartan/metrics
+      CLUSTER_NAME: aztec-gke
+      REGION: us-west1-a
+      TF_STATE_BUCKET: aztec-terraform
+      GKE_CLUSTER_CONTEXT: gke_testnet-440309_us-west1-a_aztec-gke
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ inputs.ref }}
+
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}
+
+      - name: Set up Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
+
+      - name: Install GKE Auth Plugin
+        run: |
+          gcloud components install gke-gcloud-auth-plugin --quiet
+
+      - name: Configure kubectl with GKE cluster
+        run: |
+          gcloud container clusters get-credentials ${{ env.CLUSTER_NAME }} --region ${{ env.REGION }}
+
+      - name: Ensure Terraform state bucket exists
+        run: |
+          if ! gsutil ls gs://${{ env.TF_STATE_BUCKET }} >/dev/null 2>&1; then
+            echo "Creating GCS bucket for Terraform state..."
+            gsutil mb -l us-east4 gs://${{ env.TF_STATE_BUCKET }}
+            gsutil versioning set on gs://${{ env.TF_STATE_BUCKET }}
+          else
+            echo "Terraform state bucket already exists"
+          fi
+
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@v2
+        with:
+          terraform_version: "1.5.0" # Specify your desired version
+
+      - name: Terraform Init
+        working-directory: ./spartan/terraform/deploy-metrics
+        run: |
+          terraform init \
+            -backend-config="bucket=${{ env.TF_STATE_BUCKET }}" \
+            -backend-config="prefix=metrics-deploy/${{ env.REGION }}/${{ env.CLUSTER_NAME }}/${{ env.NAMESPACE }}/terraform.tfstate"
+
+      - name: Terraform Destroy
+        working-directory: ./spartan/terraform/deploy-metrics
+        if: ${{ inputs.run_terraform_destroy == 'true' }}
+        # Destroy fails if the resources are already destroyed, so we continue on error
+        continue-on-error: true
+        run: |
+          terraform destroy -auto-approve \
+            -var="RELEASE_NAME=${{ env.NAMESPACE }}" \
+            -var="VALUES_FILE=${{ env.VALUES_FILE }}" \
+            -var="GKE_CLUSTER_CONTEXT=${{ env.GKE_CLUSTER_CONTEXT }}" \
+            -lock=${{ inputs.respect_tf_lock }}
+
+      - name: Terraform Plan
+        working-directory: ./spartan/terraform/deploy-metrics
+        run: |
+          terraform plan \
+            -var="RELEASE_NAME=${{ env.NAMESPACE }}" \
+            -var="VALUES_FILE=${{ env.VALUES_FILE }}" \
+            -var="GKE_CLUSTER_CONTEXT=${{ env.GKE_CLUSTER_CONTEXT }}" \
+            -out=tfplan \
+            -lock=${{ inputs.respect_tf_lock }}
+
+      - name: Terraform Apply
+        working-directory: ./spartan/terraform/deploy-metrics
+        run: terraform apply -lock=${{ inputs.respect_tf_lock }} -auto-approve tfplan
diff --git a/spartan/metrics/values/prod.yaml b/spartan/metrics/values/prod.yaml
index 2da726d4431..04a0d9ccf66 100644
--- a/spartan/metrics/values/prod.yaml
+++ b/spartan/metrics/values/prod.yaml
@@ -57,7 +57,3 @@ loki:
     enabled: true
     persistence:
       size: 64Gi
-
-  gateway:
-    service:
-      type: LoadBalancer
diff --git a/spartan/terraform/deploy-metrics/data.tf b/spartan/terraform/deploy-metrics/data.tf
new file mode 100644
index 00000000000..8b137891791
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/data.tf
@@ -0,0 +1 @@
+
diff --git a/spartan/terraform/deploy-metrics/main.tf b/spartan/terraform/deploy-metrics/main.tf
new file mode 100644
index 00000000000..3d169a07e88
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/main.tf
@@ -0,0 +1,52 @@
+terraform {
+  backend "gcs" {
+    bucket = "aztec-terraform"
+    prefix = "terraform/state"
+  }
+  required_providers {
+    helm = {
+      source  = "hashicorp/helm"
+      version = "~> 2.16.1"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = "~> 2.24.0"
+    }
+  }
+}
+
+provider "kubernetes" {
+  alias          = "gke-cluster"
+  config_path    = "~/.kube/config"
+  config_context = var.GKE_CLUSTER_CONTEXT
+}
+
+provider "helm" {
+  alias = "gke-cluster"
+  kubernetes {
+    config_path    = "~/.kube/config"
+    config_context = var.GKE_CLUSTER_CONTEXT
+  }
+}
+
+# Aztec Helm release for gke-cluster
+resource "helm_release" "aztec-gke-cluster" {
+  provider          = helm.gke-cluster
+  name              = var.RELEASE_NAME
+  repository        = "../../"
+  chart             = "metrics"
+  namespace         = var.RELEASE_NAME
+  create_namespace  = true
+  upgrade_install   = true
+  dependency_update = true
+
+  # base values file
+  values = [file("../../metrics/values/${var.VALUES_FILE}")]
+
+
+  # Setting timeout and wait conditions
+  timeout       = 1200 # 20 minutes in seconds
+  wait          = true
+  wait_for_jobs = true
+
+}
diff --git a/spartan/terraform/deploy-metrics/outputs.tf b/spartan/terraform/deploy-metrics/outputs.tf
new file mode 100644
index 00000000000..8b137891791
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/outputs.tf
@@ -0,0 +1 @@
+
diff --git a/spartan/terraform/deploy-metrics/variables.tf b/spartan/terraform/deploy-metrics/variables.tf
new file mode 100644
index 00000000000..a0021f71cf4
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/variables.tf
@@ -0,0 +1,15 @@
+variable "GKE_CLUSTER_CONTEXT" {
+  description = "GKE cluster context"
+  type        = string
+  default     = "gke_testnet-440309_us-east4-a_spartan-gke"
+}
+
+variable "RELEASE_NAME" {
+  description = "Name of helm deployment and k8s namespace"
+  type        = string
+}
+
+variable "VALUES_FILE" {
+  description = "Name of the values file to use for deployment"
+  type        = string
+}

From 23a87bda95120426e90bb7234094910a4d666f32 Mon Sep 17 00:00:00 2001
From: Mitch
Date: Tue, 10 Dec 2024 13:42:54 -0500
Subject: [PATCH 2/4] fix: different loki/grafana config for prod/kind

---
# Review notes for PATCH 2/4 (outside the commit message and the diff).
#
# What this patch does:
# - Removes the opentelemetry-collector `config.exporters` section from the
#   chart's base values.yaml and adds an equivalent section to both
#   values/kind.yaml and values/prod.yaml.
# - kind.yaml sends logs to `metrics-loki.metrics:3100/otlp`; prod.yaml sends
#   them to `loki-write.metrics:3100/otlp`. The tempo and prometheus exporter
#   settings are the same in both files.
# - Adds a Grafana `datasources.yaml` block (Loki, Tempo, Prometheus) to both
#   files; the Loki URL is `metrics-loki.metrics` in kind and
#   `loki-read.metrics` in prod. In prod.yaml the existing `grafana:` section
#   is moved from the top of the file to the bottom.
# - Changes the Loki chart link in the values.yaml comment from
#   `.../grafana/loki-simple-scalable` to `.../grafana/loki`.
#
# NOTE(review): the last exporter line added to prod.yaml reads
#   `enabled: true# Enable and configure Grafana`. There is no whitespace
#   before `#`, so this is not a YAML comment and the value is not the boolean
#   `true`. PATCH 3/4 changes the line to `enabled: true`.
 spartan/metrics/values.yaml      | 16 +----------
 spartan/metrics/values/kind.yaml | 34 ++++++++++++++++++++++
 spartan/metrics/values/prod.yaml | 49 +++++++++++++++++++++++++-------
 3 files changed, 74 insertions(+), 25 deletions(-)

diff --git a/spartan/metrics/values.yaml b/spartan/metrics/values.yaml
index 6e75301c12b..df2ca87aa21 100644
--- a/spartan/metrics/values.yaml
+++ b/spartan/metrics/values.yaml
@@ -31,20 +31,6 @@ opentelemetry-collector:
     kubernetesAttributes:
       enabled: true
   config:
-    exporters:
-      # debug:
-      #   verbosity: detailed
-      otlphttp/logs:
-        endpoint: http://metrics-loki.metrics:3100/otlp
-      otlp/tempo:
-        endpoint: http://metrics-tempo.metrics:4317
-        tls:
-          insecure: true
-      prometheus:
-        endpoint: ${env:MY_POD_IP}:8889
-        metric_expiration: 5m
-        resource_to_telemetry_conversion:
-          enabled: true
     extensions:
       health_check:
         endpoint: ${env:MY_POD_IP}:13133
@@ -91,7 +77,7 @@ opentelemetry-collector:
             # - debug
 
 # Enable and configure the Loki subchart
-# https://artifacthub.io/packages/helm/grafana/loki-simple-scalable
+# https://artifacthub.io/packages/helm/grafana/loki
 # loki:
 # Nothing set here, because we need to use values from the values directory;
 # otherwise, things don't get overridden correctly.
diff --git a/spartan/metrics/values/kind.yaml b/spartan/metrics/values/kind.yaml
index c8b8a970b25..857d74af2b0 100644
--- a/spartan/metrics/values/kind.yaml
+++ b/spartan/metrics/values/kind.yaml
@@ -1,3 +1,20 @@
+opentelemetry-collector:
+  config:
+    exporters:
+      # debug:
+      #   verbosity: detailed
+      otlphttp/logs:
+        endpoint: http://metrics-loki.metrics:3100/otlp
+      otlp/tempo:
+        endpoint: http://metrics-tempo.metrics:4317
+        tls:
+          insecure: true
+      prometheus:
+        endpoint: ${env:MY_POD_IP}:8889
+        metric_expiration: 5m
+        resource_to_telemetry_conversion:
+          enabled: true
+
 loki:
   deploymentMode: SingleBinary
   loki:
@@ -23,3 +40,20 @@ loki:
     replicas: 0
   write:
     replicas: 0
+
+grafana:
+  datasources:
+    datasources.yaml:
+      apiVersion: 1
+      datasources:
+        - name: Loki
+          type: loki
+          url: http://metrics-loki.metrics:3100
+        - name: Tempo
+          type: tempo
+          url: http://metrics-tempo.metrics:3100
+        - name: Prometheus
+          type: prometheus
+          uid: spartan-metrics-prometheus
+          isDefault: true
+          url: http://metrics-prometheus-server.metrics:80
diff --git a/spartan/metrics/values/prod.yaml b/spartan/metrics/values/prod.yaml
index 04a0d9ccf66..55742b3748a 100644
--- a/spartan/metrics/values/prod.yaml
+++ b/spartan/metrics/values/prod.yaml
@@ -1,13 +1,3 @@
-# Enable and configure Grafana
-# https://artifacthub.io/packages/helm/grafana/grafana
-grafana:
-  service:
-    type: LoadBalancer
-  persistence:
-    type: pvc
-    enabled: true
-    size: "10Gi"
-
 opentelemetry-collector:
   ports:
     jaeger-compact:
@@ -15,6 +5,21 @@ opentelemetry-collector:
   service:
     enabled: true
     type: LoadBalancer
+  config:
+    exporters:
+      # debug:
+      #   verbosity: detailed
+      otlphttp/logs:
+        endpoint: http://loki-write.metrics:3100/otlp
+      otlp/tempo:
+        endpoint: http://metrics-tempo.metrics:4317
+        tls:
+          insecure: true
+      prometheus:
+        endpoint: ${env:MY_POD_IP}:8889
+        metric_expiration: 5m
+        resource_to_telemetry_conversion:
+          enabled: true# Enable and configure Grafana
 
 loki:
   loki:
@@ -57,3 +62,27 @@ loki:
     enabled: true
     persistence:
       size: 64Gi
+
+# https://artifacthub.io/packages/helm/grafana/grafana
+grafana:
+  service:
+    type: LoadBalancer
+  persistence:
+    type: pvc
+    enabled: true
+    size: "10Gi"
+  datasources:
+    datasources.yaml:
+      apiVersion: 1
+      datasources:
+        - name: Loki
+          type: loki
+          url: http://loki-read.metrics:3100
+        - name: Tempo
+          type: tempo
+          url: http://metrics-tempo.metrics:3100
+        - name: Prometheus
+          type: prometheus
+          uid: spartan-metrics-prometheus
+          isDefault: true
+          url: http://metrics-prometheus-server.metrics:80

From 2eeca7c7dce0fb42765d9e90952268eddee84483 Mon Sep 17 00:00:00 2001
From: Mitch
Date: Tue, 10 Dec 2024 13:57:07 -0500
Subject: [PATCH 3/4] fix: typos and force update

---
# Review notes for PATCH 3/4 (outside the commit message and the diff).
#
# What this patch does:
# - prod.yaml: replaces `enabled: true# Enable and configure Grafana` (added
#   by PATCH 2/4) with `enabled: true` under
#   `prometheus.resource_to_telemetry_conversion`.
# - main.tf: adds `force_update = true` to the `helm_release` resource.
 spartan/metrics/values/prod.yaml         | 2 +-
 spartan/terraform/deploy-metrics/main.tf | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/spartan/metrics/values/prod.yaml b/spartan/metrics/values/prod.yaml
index 55742b3748a..1e726bdec01 100644
--- a/spartan/metrics/values/prod.yaml
+++ b/spartan/metrics/values/prod.yaml
@@ -19,7 +19,7 @@ opentelemetry-collector:
         endpoint: ${env:MY_POD_IP}:8889
         metric_expiration: 5m
         resource_to_telemetry_conversion:
-          enabled: true# Enable and configure Grafana
+          enabled: true
 
 loki:
   loki:
diff --git a/spartan/terraform/deploy-metrics/main.tf b/spartan/terraform/deploy-metrics/main.tf
index 3d169a07e88..2a72457fb6e 100644
--- a/spartan/terraform/deploy-metrics/main.tf
+++ b/spartan/terraform/deploy-metrics/main.tf
@@ -39,6 +39,7 @@ resource "helm_release" "aztec-gke-cluster" {
   create_namespace  = true
   upgrade_install   = true
   dependency_update = true
+  force_update      = true
 
   # base values file
   values = [file("../../metrics/values/${var.VALUES_FILE}")]

From b4e7c558fbaa3de660be321bfb5046c9e7671b38 Mon Sep 17 00:00:00 2001
From: Mitch
Date: Tue, 10 Dec 2024 17:40:21 -0500
Subject: [PATCH 4/4] fix: example values file

chore: delete dead workflow
---
# Review notes for PATCH 4/4 (outside the commit message and the diff).
#
# What this patch does:
# - metrics-deploy.yml: changes the example in the workflow_call `values_file`
#   description from `1-validators.yaml` to `prod.yaml`, matching the
#   workflow_dispatch description and the input's default.
# - Deletes .github/workflows/metrics-deploys.yml (129 lines). The removed
#   workflow was triggered only by workflow_dispatch (its `push` trigger was
#   commented out); it built and pushed the aztec-prometheus, aztec-grafana and
#   aztec-otel images and ran terraform init/apply under ./metrics/*/terraform
#   using AWS credentials.
 .github/workflows/metrics-deploy.yml  |   2 +-
 .github/workflows/metrics-deploys.yml | 129 --------------------------
 2 files changed, 1 insertion(+), 130 deletions(-)
 delete mode 100644 .github/workflows/metrics-deploys.yml

diff --git a/.github/workflows/metrics-deploy.yml b/.github/workflows/metrics-deploy.yml
index 779e0cbad17..331317426fb 100644
--- a/.github/workflows/metrics-deploy.yml
+++ b/.github/workflows/metrics-deploy.yml
@@ -9,7 +9,7 @@ on:
         type: string
         default: metrics
       values_file:
-        description: The values file to use, e.g. 1-validators.yaml
+        description: The values file to use, e.g. prod.yaml
         required: true
         type: string
         default: "prod.yaml"
diff --git a/.github/workflows/metrics-deploys.yml b/.github/workflows/metrics-deploys.yml
deleted file mode 100644
index b3552e414d3..00000000000
--- a/.github/workflows/metrics-deploys.yml
+++ /dev/null
@@ -1,129 +0,0 @@
-name: Deploy metrics
-on:
-  # push:
-  #   branches: [devnet]
-  workflow_dispatch:
-    inputs: {}
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-env:
-  DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
-  GIT_COMMIT: ${{ github.sha }}
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-  # TF Vars
-  TF_VAR_DOCKERHUB_ACCOUNT: aztecprotocol
-  TF_VAR_GRAFANA_CLIENT_ID: ${{ secrets.GRAFANA_CLIENT_ID }}
-  TF_VAR_GRAFANA_CLIENT_SECRET: ${{ secrets.GRAFANA_CLIENT_SECRET }}
-  TF_VAR_IMAGE_TAG: ${{ github.sha }}
-
-jobs:
-  setup:
-    uses: ./.github/workflows/setup-runner.yml
-    with:
-      username: master
-      runner_type: builder-x86
-    secrets: inherit
-  build:
-    needs: setup
-    runs-on: ${{ github.actor }}-x86
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: "${{ env.GIT_COMMIT }}"
-          fetch-depth: 0
-      - uses: ./.github/ci-setup-action
-        with:
-          concurrency_key: build-metrics-${{ github.actor }}
-          dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-
-      - name: Check if metrics have changed
-        id: check_metrics_changes
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const { execSync } = require('child_process');
-            const changedFiles = execSync('git diff --name-only ${{ github.event.before }} ${{ github.sha }}').toString().split('\n');
-            const fileChanged = changedFiles.some(file => file.startsWith('metrics'));
-            return fileChanged
-
-      - name: Build & push prometheus
-        working-directory: ./metrics/prometheus
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          docker build -t aztecprotocol/aztec-prometheus .
-          docker tag aztecprotocol/aztec-prometheus aztecprotocol/aztec-prometheus:$GIT_COMMIT
-          docker push aztecprotocol/aztec-prometheus
-          docker push aztecprotocol/aztec-prometheus:$GIT_COMMIT
-
-      - name: Build & push grafana
-        working-directory: ./metrics/grafana
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          docker build -t aztecprotocol/aztec-grafana .
-          docker tag aztecprotocol/aztec-grafana aztecprotocol/aztec-grafana:$GIT_COMMIT
-          docker push aztecprotocol/aztec-grafana
-          docker push aztecprotocol/aztec-grafana:$GIT_COMMIT
-
-      - name: Build & push open telemetry
-        working-directory: ./metrics/otel
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          docker build -t aztecprotocol/aztec-otel .
-          docker tag aztecprotocol/aztec-otel aztecprotocol/aztec-otel:$GIT_COMMIT
-          docker push aztecprotocol/aztec-otel
-          docker push aztecprotocol/aztec-otel:$GIT_COMMIT
-
-  terraform_deploy:
-    runs-on: ubuntu-latest
-    needs: build
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: "${{ env.GIT_COMMIT }}"
-          fetch-depth: 0
-      - uses: ./.github/ci-setup-action
-      - uses: hashicorp/setup-terraform@v3
-        with:
-          terraform_version: 1.7.5
-
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-2
-
-      - name: Check if metrics have changed
-        id: check_metrics_changes
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const { execSync } = require('child_process');
-            const changedFiles = execSync('git diff --name-only ${{ github.event.before }} ${{ github.sha }}').toString().split('\n');
-            const fileChanged = changedFiles.some(file => file.startsWith('metrics'));
-            return fileChanged
-
-      - name: Deploy prometheus
-        working-directory: ./metrics/prometheus/terraform
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          terraform init -input=false -backend-config="key=aztec-prometheus"
-          terraform apply -input=false -auto-approve
-
-      - name: Deploy grafana
-        working-directory: ./metrics/grafana/terraform
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          terraform init -input=false -backend-config="key=aztec-grafana"
-          terraform apply -input=false -auto-approve
-
-      - name: Deploy open telemetry
-        working-directory: ./metrics/otel/terraform
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          terraform init -input=false -backend-config="key=aztec-otel"
-          terraform apply -input=false -auto-approve