From e21069db0b3fcdf5665b62bb821a5a1d2abe2cee Mon Sep 17 00:00:00 2001
From: just-mitch <68168980+just-mitch@users.noreply.github.com>
Date: Tue, 10 Dec 2024 18:04:16 -0500
Subject: [PATCH] feat: metrics via terraform (#10594)

Also, fix the prod deployment so that it uses the correct service
endpoints for the "scalable" loki config.

Have run the metrics deployment workflow, and the Loki datasource now
works as expected:

Screenshot 2024-12-10 at 14 11 17

fix #10191
fix #10439
---
 .github/workflows/metrics-deploy.yml          | 143 ++++++++++++++++++
 .github/workflows/metrics-deploys.yml         | 129 ----------------
 spartan/metrics/values.yaml                   |  16 +-
 spartan/metrics/values/kind.yaml              |  34 +++++
 spartan/metrics/values/prod.yaml              |  51 +++++--
 spartan/terraform/deploy-metrics/data.tf      |   1 +
 spartan/terraform/deploy-metrics/main.tf      |  53 +++++++
 spartan/terraform/deploy-metrics/outputs.tf   |   1 +
 spartan/terraform/deploy-metrics/variables.tf |  15 ++
 9 files changed, 286 insertions(+), 157 deletions(-)
 create mode 100644 .github/workflows/metrics-deploy.yml
 delete mode 100644 .github/workflows/metrics-deploys.yml
 create mode 100644 spartan/terraform/deploy-metrics/data.tf
 create mode 100644 spartan/terraform/deploy-metrics/main.tf
 create mode 100644 spartan/terraform/deploy-metrics/outputs.tf
 create mode 100644 spartan/terraform/deploy-metrics/variables.tf

diff --git a/.github/workflows/metrics-deploy.yml b/.github/workflows/metrics-deploy.yml
new file mode 100644
index 00000000000..331317426fb
--- /dev/null
+++ b/.github/workflows/metrics-deploy.yml
@@ -0,0 +1,143 @@
+name: Aztec Metrics Stack Deployment
+
+on:
+  workflow_call:
+    inputs:
+      namespace:
+        description: The namespace to deploy to, e.g. metrics
+        required: true
+        type: string
+        default: metrics
+      values_file:
+        description: The values file to use, e.g. prod.yaml
+        required: true
+        type: string
+        default: "prod.yaml"
+      respect_tf_lock:
+        description: Whether to respect the Terraform lock
+        required: false
+        type: string
+        default: "true"
+      run_terraform_destroy:
+        description: Whether to run terraform destroy before deploying
+        required: false
+        type: string
+        default: "false"
+      ref:
+        description: The branch name to deploy from
+        required: false
+        type: string
+        default: "master"
+    secrets:
+      GCP_SA_KEY:
+        required: true
+  workflow_dispatch:
+    inputs:
+      namespace:
+        description: The namespace to deploy to, e.g. metrics
+        required: true
+        default: metrics
+      values_file:
+        description: The values file to use, e.g. prod.yaml
+        required: true
+        default: "prod.yaml"
+      respect_tf_lock:
+        description: Whether to respect the Terraform lock
+        required: false
+        default: "true"
+      run_terraform_destroy:
+        description: Whether to run terraform destroy before deploying
+        required: false
+        default: "false"
+      ref:
+        description: The branch name to deploy from
+        required: false
+        default: "master"
+
+jobs:
+  metrics_deployment:
+    # This job will run on Ubuntu
+    runs-on: ubuntu-latest
+    concurrency:
+      group: deploy-${{ github.ref }} # Only one job per branch
+      cancel-in-progress: false # Allow previous deployment to complete to avoid corruption
+
+    # Set up a variable based on the branch name
+    env:
+      NAMESPACE: ${{ inputs.namespace }}
+      VALUES_FILE: ${{ inputs.values_file }}
+      CHART_PATH: ./spartan/metrics
+      CLUSTER_NAME: aztec-gke
+      REGION: us-west1-a
+      TF_STATE_BUCKET: aztec-terraform
+      GKE_CLUSTER_CONTEXT: gke_testnet-440309_us-west1-a_aztec-gke
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ inputs.ref }}
+
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}
+
+      - name: Set up Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
+
+      - name: Install GKE Auth Plugin
+        run: |
+          gcloud components install gke-gcloud-auth-plugin --quiet
+
+      - name: Configure kubectl with GKE cluster
+        run: |
+          gcloud container clusters get-credentials ${{ env.CLUSTER_NAME }} --region ${{ env.REGION }}
+
+      - name: Ensure Terraform state bucket exists
+        run: |
+          if ! gsutil ls gs://${{ env.TF_STATE_BUCKET }} >/dev/null 2>&1; then
+            echo "Creating GCS bucket for Terraform state..."
+            gsutil mb -l us-east4 gs://${{ env.TF_STATE_BUCKET }}
+            gsutil versioning set on gs://${{ env.TF_STATE_BUCKET }}
+          else
+            echo "Terraform state bucket already exists"
+          fi
+
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@v2
+        with:
+          terraform_version: "1.5.0" # Specify your desired version
+
+      - name: Terraform Init
+        working-directory: ./spartan/terraform/deploy-metrics
+        run: |
+          terraform init \
+            -backend-config="bucket=${{ env.TF_STATE_BUCKET }}" \
+            -backend-config="prefix=metrics-deploy/${{ env.REGION }}/${{ env.CLUSTER_NAME }}/${{ env.NAMESPACE }}/terraform.tfstate"
+
+      - name: Terraform Destroy
+        working-directory: ./spartan/terraform/deploy-metrics
+        if: ${{ inputs.run_terraform_destroy == 'true' }}
+        # Destroy fails if the resources are already destroyed, so we continue on error
+        continue-on-error: true
+        run: |
+          terraform destroy -auto-approve \
+            -var="RELEASE_NAME=${{ env.NAMESPACE }}" \
+            -var="VALUES_FILE=${{ env.VALUES_FILE }}" \
+            -var="GKE_CLUSTER_CONTEXT=${{ env.GKE_CLUSTER_CONTEXT }}" \
+            -lock=${{ inputs.respect_tf_lock }}
+
+      - name: Terraform Plan
+        working-directory: ./spartan/terraform/deploy-metrics
+        run: |
+          terraform plan \
+            -var="RELEASE_NAME=${{ env.NAMESPACE }}" \
+            -var="VALUES_FILE=${{ env.VALUES_FILE }}" \
+            -var="GKE_CLUSTER_CONTEXT=${{ env.GKE_CLUSTER_CONTEXT }}" \
+            -out=tfplan \
+            -lock=${{ inputs.respect_tf_lock }}
+
+      - name: Terraform Apply
+        working-directory: ./spartan/terraform/deploy-metrics
+        run: terraform apply -lock=${{ inputs.respect_tf_lock }} -auto-approve tfplan
diff --git a/.github/workflows/metrics-deploys.yml b/.github/workflows/metrics-deploys.yml
deleted file mode 100644
index b3552e414d3..00000000000
--- a/.github/workflows/metrics-deploys.yml
+++ /dev/null
@@ -1,129 +0,0 @@
-name: Deploy metrics
-on:
-  # push:
-  #   branches: [devnet]
-  workflow_dispatch:
-    inputs: {}
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-env:
-  DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
-  GIT_COMMIT: ${{ github.sha }}
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-  # TF Vars
-  TF_VAR_DOCKERHUB_ACCOUNT: aztecprotocol
-  TF_VAR_GRAFANA_CLIENT_ID: ${{ secrets.GRAFANA_CLIENT_ID }}
-  TF_VAR_GRAFANA_CLIENT_SECRET: ${{ secrets.GRAFANA_CLIENT_SECRET }}
-  TF_VAR_IMAGE_TAG: ${{ github.sha }}
-
-jobs:
-  setup:
-    uses: ./.github/workflows/setup-runner.yml
-    with:
-      username: master
-      runner_type: builder-x86
-    secrets: inherit
-  build:
-    needs: setup
-    runs-on: ${{ github.actor }}-x86
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: "${{ env.GIT_COMMIT }}"
-          fetch-depth: 0
-      - uses: ./.github/ci-setup-action
-        with:
-          concurrency_key: build-metrics-${{ github.actor }}
-          dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-
-      - name: Check if metrics have changed
-        id: check_metrics_changes
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const { execSync } = require('child_process');
-            const changedFiles = execSync('git diff --name-only ${{ github.event.before }} ${{ github.sha }}').toString().split('\n');
-            const fileChanged = changedFiles.some(file => file.startsWith('metrics'));
-            return fileChanged
-
-      - name: Build & push prometheus
-        working-directory: ./metrics/prometheus
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          docker build -t aztecprotocol/aztec-prometheus .
-          docker tag aztecprotocol/aztec-prometheus aztecprotocol/aztec-prometheus:$GIT_COMMIT
-          docker push aztecprotocol/aztec-prometheus
-          docker push aztecprotocol/aztec-prometheus:$GIT_COMMIT
-
-      - name: Build & push grafana
-        working-directory: ./metrics/grafana
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          docker build -t aztecprotocol/aztec-grafana .
-          docker tag aztecprotocol/aztec-grafana aztecprotocol/aztec-grafana:$GIT_COMMIT
-          docker push aztecprotocol/aztec-grafana
-          docker push aztecprotocol/aztec-grafana:$GIT_COMMIT
-
-      - name: Build & push open telemetry
-        working-directory: ./metrics/otel
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          docker build -t aztecprotocol/aztec-otel .
-          docker tag aztecprotocol/aztec-otel aztecprotocol/aztec-otel:$GIT_COMMIT
-          docker push aztecprotocol/aztec-otel
-          docker push aztecprotocol/aztec-otel:$GIT_COMMIT
-
-  terraform_deploy:
-    runs-on: ubuntu-latest
-    needs: build
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: "${{ env.GIT_COMMIT }}"
-          fetch-depth: 0
-      - uses: ./.github/ci-setup-action
-      - uses: hashicorp/setup-terraform@v3
-        with:
-          terraform_version: 1.7.5
-
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-2
-
-      - name: Check if metrics have changed
-        id: check_metrics_changes
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const { execSync } = require('child_process');
-            const changedFiles = execSync('git diff --name-only ${{ github.event.before }} ${{ github.sha }}').toString().split('\n');
-            const fileChanged = changedFiles.some(file => file.startsWith('metrics'));
-            return fileChanged
-
-      - name: Deploy prometheus
-        working-directory: ./metrics/prometheus/terraform
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          terraform init -input=false -backend-config="key=aztec-prometheus"
-          terraform apply -input=false -auto-approve
-
-      - name: Deploy grafana
-        working-directory: ./metrics/grafana/terraform
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          terraform init -input=false -backend-config="key=aztec-grafana"
-          terraform apply -input=false -auto-approve
-
-      - name: Deploy open telemetry
-        working-directory: ./metrics/otel/terraform
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          terraform init -input=false -backend-config="key=aztec-otel"
-          terraform apply -input=false -auto-approve
diff --git a/spartan/metrics/values.yaml b/spartan/metrics/values.yaml
index 6e75301c12b..df2ca87aa21 100644
--- a/spartan/metrics/values.yaml
+++ b/spartan/metrics/values.yaml
@@ -31,20 +31,6 @@ opentelemetry-collector:
     kubernetesAttributes:
       enabled: true
   config:
-    exporters:
-      # debug:
-      #   verbosity: detailed
-      otlphttp/logs:
-        endpoint: http://metrics-loki.metrics:3100/otlp
-      otlp/tempo:
-        endpoint: http://metrics-tempo.metrics:4317
-        tls:
-          insecure: true
-      prometheus:
-        endpoint: ${env:MY_POD_IP}:8889
-        metric_expiration: 5m
-        resource_to_telemetry_conversion:
-          enabled: true
     extensions:
       health_check:
         endpoint: ${env:MY_POD_IP}:13133
@@ -91,7 +77,7 @@ opentelemetry-collector:
             # - debug
 
 # Enable and configure the Loki subchart
-# https://artifacthub.io/packages/helm/grafana/loki-simple-scalable
+# https://artifacthub.io/packages/helm/grafana/loki
 # loki:
 # Nothing set here, because we need to use values from the values directory;
 # otherwise, things don't get overridden correctly.
diff --git a/spartan/metrics/values/kind.yaml b/spartan/metrics/values/kind.yaml
index c8b8a970b25..857d74af2b0 100644
--- a/spartan/metrics/values/kind.yaml
+++ b/spartan/metrics/values/kind.yaml
@@ -1,3 +1,20 @@
+opentelemetry-collector:
+  config:
+    exporters:
+      # debug:
+      #   verbosity: detailed
+      otlphttp/logs:
+        endpoint: http://metrics-loki.metrics:3100/otlp
+      otlp/tempo:
+        endpoint: http://metrics-tempo.metrics:4317
+        tls:
+          insecure: true
+      prometheus:
+        endpoint: ${env:MY_POD_IP}:8889
+        metric_expiration: 5m
+        resource_to_telemetry_conversion:
+          enabled: true
+
 loki:
   deploymentMode: SingleBinary
   loki:
@@ -23,3 +40,20 @@ loki:
     replicas: 0
   write:
     replicas: 0
+
+grafana:
+  datasources:
+    datasources.yaml:
+      apiVersion: 1
+      datasources:
+        - name: Loki
+          type: loki
+          url: http://metrics-loki.metrics:3100
+        - name: Tempo
+          type: tempo
+          url: http://metrics-tempo.metrics:3100
+        - name: Prometheus
+          type: prometheus
+          uid: spartan-metrics-prometheus
+          isDefault: true
+          url: http://metrics-prometheus-server.metrics:80
diff --git a/spartan/metrics/values/prod.yaml b/spartan/metrics/values/prod.yaml
index 2da726d4431..1e726bdec01 100644
--- a/spartan/metrics/values/prod.yaml
+++ b/spartan/metrics/values/prod.yaml
@@ -1,13 +1,3 @@
-# Enable and configure Grafana
-# https://artifacthub.io/packages/helm/grafana/grafana
-grafana:
-  service:
-    type: LoadBalancer
-  persistence:
-    type: pvc
-    enabled: true
-    size: "10Gi"
-
 opentelemetry-collector:
   ports:
     jaeger-compact:
@@ -15,6 +5,21 @@ opentelemetry-collector:
   service:
     enabled: true
     type: LoadBalancer
+  config:
+    exporters:
+      # debug:
+      #   verbosity: detailed
+      otlphttp/logs:
+        endpoint: http://loki-write.metrics:3100/otlp
+      otlp/tempo:
+        endpoint: http://metrics-tempo.metrics:4317
+        tls:
+          insecure: true
+      prometheus:
+        endpoint: ${env:MY_POD_IP}:8889
+        metric_expiration: 5m
+        resource_to_telemetry_conversion:
+          enabled: true
 
 loki:
   loki:
@@ -58,6 +63,26 @@ loki:
     persistence:
       size: 64Gi
 
-  gateway:
-    service:
-      type: LoadBalancer
+# https://artifacthub.io/packages/helm/grafana/grafana
+grafana:
+  service:
+    type: LoadBalancer
+  persistence:
+    type: pvc
+    enabled: true
+    size: "10Gi"
+  datasources:
+    datasources.yaml:
+      apiVersion: 1
+      datasources:
+        - name: Loki
+          type: loki
+          url: http://loki-read.metrics:3100
+        - name: Tempo
+          type: tempo
+          url: http://metrics-tempo.metrics:3100
+        - name: Prometheus
+          type: prometheus
+          uid: spartan-metrics-prometheus
+          isDefault: true
+          url: http://metrics-prometheus-server.metrics:80
diff --git a/spartan/terraform/deploy-metrics/data.tf b/spartan/terraform/deploy-metrics/data.tf
new file mode 100644
index 00000000000..8b137891791
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/data.tf
@@ -0,0 +1 @@
+
diff --git a/spartan/terraform/deploy-metrics/main.tf b/spartan/terraform/deploy-metrics/main.tf
new file mode 100644
index 00000000000..2a72457fb6e
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/main.tf
@@ -0,0 +1,53 @@
+terraform {
+  backend "gcs" {
+    bucket = "aztec-terraform"
+    prefix = "terraform/state"
+  }
+  required_providers {
+    helm = {
+      source  = "hashicorp/helm"
+      version = "~> 2.16.1"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = "~> 2.24.0"
+    }
+  }
+}
+
+provider "kubernetes" {
+  alias          = "gke-cluster"
+  config_path    = "~/.kube/config"
+  config_context = var.GKE_CLUSTER_CONTEXT
+}
+
+provider "helm" {
+  alias = "gke-cluster"
+  kubernetes {
+    config_path    = "~/.kube/config"
+    config_context = var.GKE_CLUSTER_CONTEXT
+  }
+}
+
+# Aztec Helm release for gke-cluster
+resource "helm_release" "aztec-gke-cluster" {
+  provider          = helm.gke-cluster
+  name              = var.RELEASE_NAME
+  repository        = "../../"
+  chart             = "metrics"
+  namespace         = var.RELEASE_NAME
+  create_namespace  = true
+  upgrade_install   = true
+  dependency_update = true
+  force_update      = true
+
+  # base values file
+  values = [file("../../metrics/values/${var.VALUES_FILE}")]
+
+
+  # Setting timeout and wait conditions
+  timeout       = 1200 # 20 minutes in seconds
+  wait          = true
+  wait_for_jobs = true
+
+}
diff --git a/spartan/terraform/deploy-metrics/outputs.tf b/spartan/terraform/deploy-metrics/outputs.tf
new file mode 100644
index 00000000000..8b137891791
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/outputs.tf
@@ -0,0 +1 @@
+
diff --git a/spartan/terraform/deploy-metrics/variables.tf b/spartan/terraform/deploy-metrics/variables.tf
new file mode 100644
index 00000000000..a0021f71cf4
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/variables.tf
@@ -0,0 +1,15 @@
+variable "GKE_CLUSTER_CONTEXT" {
+  description = "GKE cluster context"
+  type        = string
+  default     = "gke_testnet-440309_us-east4-a_spartan-gke"
+}
+
+variable "RELEASE_NAME" {
+  description = "Name of helm deployment and k8s namespace"
+  type        = string
+}
+
+variable "VALUES_FILE" {
+  description = "Name of the values file to use for deployment"
+  type        = string
+}