diff --git a/.github/workflows/metrics-deploy.yml b/.github/workflows/metrics-deploy.yml
new file mode 100644
index 00000000000..331317426fb
--- /dev/null
+++ b/.github/workflows/metrics-deploy.yml
@@ -0,0 +1,145 @@
+name: Aztec Metrics Stack Deployment
+
+on:
+  workflow_call:
+    inputs:
+      namespace:
+        description: The namespace to deploy to, e.g. metrics
+        required: true
+        type: string
+        default: metrics
+      values_file:
+        description: The values file to use, e.g. prod.yaml
+        required: true
+        type: string
+        default: "prod.yaml"
+      respect_tf_lock:
+        description: Whether to respect the Terraform lock
+        required: false
+        type: string
+        default: "true"
+      run_terraform_destroy:
+        description: Whether to run terraform destroy before deploying
+        required: false
+        type: string
+        default: "false"
+      ref:
+        description: The branch name to deploy from
+        required: false
+        type: string
+        default: "master"
+    secrets:
+      GCP_SA_KEY:
+        required: true
+  workflow_dispatch:
+    inputs:
+      namespace:
+        description: The namespace to deploy to, e.g. metrics
+        required: true
+        default: metrics
+      values_file:
+        description: The values file to use, e.g. prod.yaml
+        required: true
+        default: "prod.yaml"
+      respect_tf_lock:
+        description: Whether to respect the Terraform lock
+        required: false
+        default: "true"
+      run_terraform_destroy:
+        description: Whether to run terraform destroy before deploying
+        required: false
+        default: "false"
+      ref:
+        description: The branch name to deploy from
+        required: false
+        default: "master"
+
+jobs:
+  metrics_deployment:
+    # This job will run on Ubuntu
+    runs-on: ubuntu-latest
+    concurrency:
+      group: deploy-${{ github.ref }} # Only one job per branch
+      cancel-in-progress: false # Allow previous deployment to complete to avoid corruption
+
+    # Caller-supplied inputs are exposed as environment variables so that run steps
+    # read them as quoted shell variables; they are never expanded into script text.
+    env:
+      NAMESPACE: ${{ inputs.namespace }}
+      VALUES_FILE: ${{ inputs.values_file }}
+      RESPECT_TF_LOCK: ${{ inputs.respect_tf_lock }}
+      CHART_PATH: ./spartan/metrics
+      CLUSTER_NAME: aztec-gke
+      REGION: us-west1-a
+      TF_STATE_BUCKET: aztec-terraform
+      GKE_CLUSTER_CONTEXT: gke_testnet-440309_us-west1-a_aztec-gke
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+        with:
+          ref: ${{ inputs.ref }}
+
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}
+
+      - name: Set up Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
+
+      - name: Install GKE Auth Plugin
+        run: |
+          gcloud components install gke-gcloud-auth-plugin --quiet
+
+      - name: Configure kubectl with GKE cluster
+        run: |
+          gcloud container clusters get-credentials "${CLUSTER_NAME}" --region "${REGION}"
+
+      - name: Ensure Terraform state bucket exists
+        run: |
+          if ! gsutil ls "gs://${TF_STATE_BUCKET}" >/dev/null 2>&1; then
+            echo "Creating GCS bucket for Terraform state..."
+            gsutil mb -l us-east4 "gs://${TF_STATE_BUCKET}"
+            gsutil versioning set on "gs://${TF_STATE_BUCKET}"
+          else
+            echo "Terraform state bucket already exists"
+          fi
+
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@v2
+        with:
+          terraform_version: "1.5.0" # Specify your desired version
+
+      - name: Terraform Init
+        working-directory: ./spartan/terraform/deploy-metrics
+        run: |
+          terraform init \
+            -backend-config="bucket=${TF_STATE_BUCKET}" \
+            -backend-config="prefix=metrics-deploy/${REGION}/${CLUSTER_NAME}/${NAMESPACE}/terraform.tfstate"
+
+      - name: Terraform Destroy
+        working-directory: ./spartan/terraform/deploy-metrics
+        if: ${{ inputs.run_terraform_destroy == 'true' }}
+        # Destroy fails if the resources are already destroyed, so we continue on error
+        continue-on-error: true
+        run: |
+          terraform destroy -auto-approve \
+            -var="RELEASE_NAME=${NAMESPACE}" \
+            -var="VALUES_FILE=${VALUES_FILE}" \
+            -var="GKE_CLUSTER_CONTEXT=${GKE_CLUSTER_CONTEXT}" \
+            -lock="${RESPECT_TF_LOCK}"
+
+      - name: Terraform Plan
+        working-directory: ./spartan/terraform/deploy-metrics
+        run: |
+          terraform plan \
+            -var="RELEASE_NAME=${NAMESPACE}" \
+            -var="VALUES_FILE=${VALUES_FILE}" \
+            -var="GKE_CLUSTER_CONTEXT=${GKE_CLUSTER_CONTEXT}" \
+            -out=tfplan \
+            -lock="${RESPECT_TF_LOCK}"
+
+      - name: Terraform Apply
+        working-directory: ./spartan/terraform/deploy-metrics
+        run: terraform apply -lock="${RESPECT_TF_LOCK}" -auto-approve tfplan
diff --git a/.github/workflows/metrics-deploys.yml b/.github/workflows/metrics-deploys.yml
deleted file mode 100644
index b3552e414d3..00000000000
--- a/.github/workflows/metrics-deploys.yml
+++ /dev/null
@@ -1,129 +0,0 @@
-name: Deploy metrics
-on:
-  # push:
-  #   branches: [devnet]
-  workflow_dispatch:
-    inputs: {}
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-env:
-  DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
-  GIT_COMMIT: ${{ github.sha }}
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-  # TF Vars
-  TF_VAR_DOCKERHUB_ACCOUNT: aztecprotocol
-  TF_VAR_GRAFANA_CLIENT_ID: ${{ secrets.GRAFANA_CLIENT_ID }}
-  TF_VAR_GRAFANA_CLIENT_SECRET: ${{ secrets.GRAFANA_CLIENT_SECRET }}
-  TF_VAR_IMAGE_TAG: ${{ github.sha }}
-
-jobs:
-  setup:
-    uses: ./.github/workflows/setup-runner.yml
-    with:
-      username: master
-      runner_type: builder-x86
-    secrets: inherit
-  build:
-    needs: setup
-    runs-on: ${{ github.actor }}-x86
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: "${{ env.GIT_COMMIT }}"
-          fetch-depth: 0
-      - uses: ./.github/ci-setup-action
-        with:
-          concurrency_key: build-metrics-${{ github.actor }}
-          dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-
-      - name: Check if metrics have changed
-        id: check_metrics_changes
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const { execSync } = require('child_process');
-            const changedFiles = execSync('git diff --name-only ${{ github.event.before }} ${{ github.sha }}').toString().split('\n');
-            const fileChanged = changedFiles.some(file => file.startsWith('metrics'));
-            return fileChanged
-
-      - name: Build & push prometheus
-        working-directory: ./metrics/prometheus
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          docker build -t aztecprotocol/aztec-prometheus .
-          docker tag aztecprotocol/aztec-prometheus aztecprotocol/aztec-prometheus:$GIT_COMMIT
-          docker push aztecprotocol/aztec-prometheus
-          docker push aztecprotocol/aztec-prometheus:$GIT_COMMIT
-
-      - name: Build & push grafana
-        working-directory: ./metrics/grafana
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          docker build -t aztecprotocol/aztec-grafana .
-          docker tag aztecprotocol/aztec-grafana aztecprotocol/aztec-grafana:$GIT_COMMIT
-          docker push aztecprotocol/aztec-grafana
-          docker push aztecprotocol/aztec-grafana:$GIT_COMMIT
-
-      - name: Build & push open telemetry
-        working-directory: ./metrics/otel
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          docker build -t aztecprotocol/aztec-otel .
-          docker tag aztecprotocol/aztec-otel aztecprotocol/aztec-otel:$GIT_COMMIT
-          docker push aztecprotocol/aztec-otel
-          docker push aztecprotocol/aztec-otel:$GIT_COMMIT
-
-  terraform_deploy:
-    runs-on: ubuntu-latest
-    needs: build
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: "${{ env.GIT_COMMIT }}"
-          fetch-depth: 0
-      - uses: ./.github/ci-setup-action
-      - uses: hashicorp/setup-terraform@v3
-        with:
-          terraform_version: 1.7.5
-
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: eu-west-2
-
-      - name: Check if metrics have changed
-        id: check_metrics_changes
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const { execSync } = require('child_process');
-            const changedFiles = execSync('git diff --name-only ${{ github.event.before }} ${{ github.sha }}').toString().split('\n');
-            const fileChanged = changedFiles.some(file => file.startsWith('metrics'));
-            return fileChanged
-
-      - name: Deploy prometheus
-        working-directory: ./metrics/prometheus/terraform
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          terraform init -input=false -backend-config="key=aztec-prometheus"
-          terraform apply -input=false -auto-approve
-
-      - name: Deploy grafana
-        working-directory: ./metrics/grafana/terraform
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          terraform init -input=false -backend-config="key=aztec-grafana"
-          terraform apply -input=false -auto-approve
-
-      - name: Deploy open telemetry
-        working-directory: ./metrics/otel/terraform
-        if: steps.check_metrics_changes.outputs.result == 'true'
-        run: |
-          terraform init -input=false -backend-config="key=aztec-otel"
-          terraform apply -input=false -auto-approve
diff --git a/spartan/metrics/values.yaml b/spartan/metrics/values.yaml
index 6e75301c12b..df2ca87aa21 100644
--- a/spartan/metrics/values.yaml
+++ b/spartan/metrics/values.yaml
@@ -31,20 +31,6 @@ opentelemetry-collector:
     kubernetesAttributes:
       enabled: true
   config:
-    exporters:
-      # debug:
-      #   verbosity: detailed
-      otlphttp/logs:
-        endpoint: http://metrics-loki.metrics:3100/otlp
-      otlp/tempo:
-        endpoint: http://metrics-tempo.metrics:4317
-        tls:
-          insecure: true
-      prometheus:
-        endpoint: ${env:MY_POD_IP}:8889
-        metric_expiration: 5m
-        resource_to_telemetry_conversion:
-          enabled: true
     extensions:
       health_check:
         endpoint: ${env:MY_POD_IP}:13133
@@ -91,7 +77,7 @@ opentelemetry-collector:
         # - debug
 
 # Enable and configure the Loki subchart
-# https://artifacthub.io/packages/helm/grafana/loki-simple-scalable
+# https://artifacthub.io/packages/helm/grafana/loki
 # loki:
 # Nothing set here, because we need to use values from the values directory;
 # otherwise, things don't get overridden correctly.
diff --git a/spartan/metrics/values/kind.yaml b/spartan/metrics/values/kind.yaml
index c8b8a970b25..857d74af2b0 100644
--- a/spartan/metrics/values/kind.yaml
+++ b/spartan/metrics/values/kind.yaml
@@ -1,3 +1,20 @@
+opentelemetry-collector:
+  config:
+    exporters:
+      # debug:
+      #   verbosity: detailed
+      otlphttp/logs:
+        endpoint: http://metrics-loki.metrics:3100/otlp
+      otlp/tempo:
+        endpoint: http://metrics-tempo.metrics:4317
+        tls:
+          insecure: true
+      prometheus:
+        endpoint: ${env:MY_POD_IP}:8889
+        metric_expiration: 5m
+        resource_to_telemetry_conversion:
+          enabled: true
+
 loki:
   deploymentMode: SingleBinary
   loki:
@@ -23,3 +40,20 @@ loki:
     replicas: 0
   write:
     replicas: 0
+
+grafana:
+  datasources:
+    datasources.yaml:
+      apiVersion: 1
+      datasources:
+        - name: Loki
+          type: loki
+          url: http://metrics-loki.metrics:3100
+        - name: Tempo
+          type: tempo
+          url: http://metrics-tempo.metrics:3100
+        - name: Prometheus
+          type: prometheus
+          uid: spartan-metrics-prometheus
+          isDefault: true
+          url: http://metrics-prometheus-server.metrics:80
diff --git a/spartan/metrics/values/prod.yaml b/spartan/metrics/values/prod.yaml
index 2da726d4431..1e726bdec01 100644
--- a/spartan/metrics/values/prod.yaml
+++ b/spartan/metrics/values/prod.yaml
@@ -1,13 +1,3 @@
-# Enable and configure Grafana
-# https://artifacthub.io/packages/helm/grafana/grafana
-grafana:
-  service:
-    type: LoadBalancer
-  persistence:
-    type: pvc
-    enabled: true
-    size: "10Gi"
-
 opentelemetry-collector:
   ports:
     jaeger-compact:
@@ -15,6 +5,21 @@ opentelemetry-collector:
   service:
     enabled: true
     type: LoadBalancer
+  config:
+    exporters:
+      # debug:
+      #   verbosity: detailed
+      otlphttp/logs:
+        endpoint: http://loki-write.metrics:3100/otlp
+      otlp/tempo:
+        endpoint: http://metrics-tempo.metrics:4317
+        tls:
+          insecure: true
+      prometheus:
+        endpoint: ${env:MY_POD_IP}:8889
+        metric_expiration: 5m
+        resource_to_telemetry_conversion:
+          enabled: true
 
 loki:
   loki:
@@ -58,6 +63,26 @@ loki:
     persistence:
       size: 64Gi
 
-  gateway:
-    service:
-      type: LoadBalancer
+# https://artifacthub.io/packages/helm/grafana/grafana
+grafana:
+  service:
+    type: LoadBalancer
+  persistence:
+    type: pvc
+    enabled: true
+    size: "10Gi"
+  datasources:
+    datasources.yaml:
+      apiVersion: 1
+      datasources:
+        - name: Loki
+          type: loki
+          url: http://loki-read.metrics:3100
+        - name: Tempo
+          type: tempo
+          url: http://metrics-tempo.metrics:3100
+        - name: Prometheus
+          type: prometheus
+          uid: spartan-metrics-prometheus
+          isDefault: true
+          url: http://metrics-prometheus-server.metrics:80
diff --git a/spartan/terraform/deploy-metrics/data.tf b/spartan/terraform/deploy-metrics/data.tf
new file mode 100644
index 00000000000..8b137891791
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/data.tf
@@ -0,0 +1 @@
+
diff --git a/spartan/terraform/deploy-metrics/main.tf b/spartan/terraform/deploy-metrics/main.tf
new file mode 100644
index 00000000000..2a72457fb6e
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/main.tf
@@ -0,0 +1,53 @@
+terraform {
+  backend "gcs" {
+    bucket = "aztec-terraform"
+    prefix = "terraform/state"
+  }
+  required_providers {
+    helm = {
+      source  = "hashicorp/helm"
+      version = "~> 2.16.1"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = "~> 2.24.0"
+    }
+  }
+}
+
+provider "kubernetes" {
+  alias          = "gke-cluster"
+  config_path    = "~/.kube/config"
+  config_context = var.GKE_CLUSTER_CONTEXT
+}
+
+provider "helm" {
+  alias = "gke-cluster"
+  kubernetes {
+    config_path    = "~/.kube/config"
+    config_context = var.GKE_CLUSTER_CONTEXT
+  }
+}
+
+# Helm release of the local "metrics" chart; RELEASE_NAME is both the release name and the namespace
+resource "helm_release" "aztec-gke-cluster" {
+  provider          = helm.gke-cluster
+  name              = var.RELEASE_NAME
+  repository        = "../../"
+  chart             = "metrics"
+  namespace         = var.RELEASE_NAME
+  create_namespace  = true
+  upgrade_install   = true
+  dependency_update = true
+  force_update      = true
+
+  # Base values file, selected by VALUES_FILE from ../../metrics/values/
+  values = [file("../../metrics/values/${var.VALUES_FILE}")]
+
+
+  # Wait for the release's resources and jobs, bounded by the timeout below
+  timeout       = 1200 # seconds (20 minutes)
+  wait          = true
+  wait_for_jobs = true
+
+}
diff --git a/spartan/terraform/deploy-metrics/outputs.tf b/spartan/terraform/deploy-metrics/outputs.tf
new file mode 100644
index 00000000000..8b137891791
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/outputs.tf
@@ -0,0 +1 @@
+
diff --git a/spartan/terraform/deploy-metrics/variables.tf b/spartan/terraform/deploy-metrics/variables.tf
new file mode 100644
index 00000000000..a0021f71cf4
--- /dev/null
+++ b/spartan/terraform/deploy-metrics/variables.tf
@@ -0,0 +1,15 @@
+variable "GKE_CLUSTER_CONTEXT" {
+  description = "GKE cluster context"
+  type        = string
+  default     = "gke_testnet-440309_us-east4-a_spartan-gke"
+}
+
+variable "RELEASE_NAME" {
+  description = "Name of helm deployment and k8s namespace"
+  type        = string
+}
+
+variable "VALUES_FILE" {
+  description = "Name of the values file to use for deployment"
+  type        = string
+}