Skip to content

Commit

Permalink
feat: metrics via terraform (#10594)
Browse files Browse the repository at this point in the history
Also, fix the prod deployment so that it uses the correct service
endpoints for the "scalable" loki config.

I have run the metrics deployment workflow, and the Loki datasource now
works as expected:

<img width="1603" alt="Screenshot 2024-12-10 at 14 11 17"
src="https://github.com/user-attachments/assets/2ca0adb5-1dd0-480b-a16c-39e622e922c3">

fix #10191
fix #10439
  • Loading branch information
just-mitch authored Dec 10, 2024
1 parent 9eaa527 commit e21069d
Show file tree
Hide file tree
Showing 9 changed files with 286 additions and 157 deletions.
143 changes: 143 additions & 0 deletions .github/workflows/metrics-deploy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Workflow that deploys the Aztec metrics stack. It exposes two triggers,
# workflow_call and workflow_dispatch, which declare the same five inputs
# with the same defaults.
name: Aztec Metrics Stack Deployment

on:
  # Entry point when this workflow is reused from another workflow.
  workflow_call:
    inputs:
      namespace:
        description: The namespace to deploy to, e.g. metrics
        required: true
        type: string
        default: 'metrics'
      values_file:
        description: The values file to use, e.g. prod.yaml
        required: true
        type: string
        default: 'prod.yaml'
      # Kept as a string ('true' / 'false'), not a boolean: the job splices it
      # into the terraform `-lock=` flag.
      respect_tf_lock:
        description: Whether to respect the Terraform lock
        required: false
        type: string
        default: 'true'
      # Kept as a string: the job compares it against the literal 'true'.
      run_terraform_destroy:
        description: Whether to run terraform destroy before deploying
        required: false
        type: string
        default: 'false'
      ref:
        description: The branch name to deploy from
        required: false
        type: string
        default: 'master'
    secrets:
      # Forwarded to the Google Cloud authentication step as credentials_json.
      GCP_SA_KEY:
        required: true

  # Entry point for manual runs; mirrors the workflow_call inputs above
  # (no explicit `type` is declared on these).
  workflow_dispatch:
    inputs:
      namespace:
        description: The namespace to deploy to, e.g. metrics
        required: true
        default: 'metrics'
      values_file:
        description: The values file to use, e.g. prod.yaml
        required: true
        default: 'prod.yaml'
      respect_tf_lock:
        description: Whether to respect the Terraform lock
        required: false
        default: 'true'
      run_terraform_destroy:
        description: Whether to run terraform destroy before deploying
        required: false
        default: 'false'
      ref:
        description: The branch name to deploy from
        required: false
        default: 'master'

jobs:
  # Single deployment job. It authenticates to Google Cloud, fetches GKE
  # credentials, makes sure the Terraform state bucket exists, and then runs
  # terraform init / (optional) destroy / plan / apply inside
  # ./spartan/terraform/deploy-metrics.
  metrics_deployment:
    # This job will run on Ubuntu
    runs-on: ubuntu-latest
    concurrency:
      group: deploy-${{ github.ref }} # Only one job per ref
      cancel-in-progress: false # Allow previous deployment to complete to avoid corruption

    # Job-wide environment. Workflow inputs are surfaced here and the `run:`
    # scripts below read them as quoted shell variables ("$NAME") instead of
    # `${{ ... }}` expressions: expressions are pasted into the script text
    # before bash runs, so an input containing shell metacharacters would be
    # executed as code. Environment variables are passed as data only.
    env:
      NAMESPACE: ${{ inputs.namespace }}
      VALUES_FILE: ${{ inputs.values_file }}
      RESPECT_TF_LOCK: ${{ inputs.respect_tf_lock }}
      CHART_PATH: ./spartan/metrics
      CLUSTER_NAME: aztec-gke
      # NOTE(review): this value looks like a zone name but is passed to
      # `--region` below — confirm this is intended.
      REGION: us-west1-a
      TF_STATE_BUCKET: aztec-terraform
      GKE_CLUSTER_CONTEXT: gke_testnet-440309_us-west1-a_aztec-gke

    steps:
      - name: Checkout code
        uses: actions/checkout@v3
        with:
          ref: ${{ inputs.ref }}

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2

      - name: Install GKE Auth Plugin
        run: |
          gcloud components install gke-gcloud-auth-plugin --quiet

      - name: Configure kubectl with GKE cluster
        run: |
          gcloud container clusters get-credentials "$CLUSTER_NAME" --region "$REGION"

      # Creates the state bucket (with object versioning) only when listing it
      # fails; otherwise leaves the existing bucket untouched.
      - name: Ensure Terraform state bucket exists
        run: |
          if ! gsutil ls "gs://${TF_STATE_BUCKET}" >/dev/null 2>&1; then
            echo "Creating GCS bucket for Terraform state..."
            gsutil mb -l us-east4 "gs://${TF_STATE_BUCKET}"
            gsutil versioning set on "gs://${TF_STATE_BUCKET}"
          else
            echo "Terraform state bucket already exists"
          fi

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: "1.5.0" # Specify your desired version

      # The backend prefix is derived from REGION, CLUSTER_NAME and NAMESPACE.
      - name: Terraform Init
        working-directory: ./spartan/terraform/deploy-metrics
        run: |
          terraform init \
            -backend-config="bucket=${TF_STATE_BUCKET}" \
            -backend-config="prefix=metrics-deploy/${REGION}/${CLUSTER_NAME}/${NAMESPACE}/terraform.tfstate"

      - name: Terraform Destroy
        working-directory: ./spartan/terraform/deploy-metrics
        if: ${{ inputs.run_terraform_destroy == 'true' }}
        # Destroy fails if the resources are already destroyed, so we continue on error
        continue-on-error: true
        run: |
          terraform destroy -auto-approve \
            -var="RELEASE_NAME=${NAMESPACE}" \
            -var="VALUES_FILE=${VALUES_FILE}" \
            -var="GKE_CLUSTER_CONTEXT=${GKE_CLUSTER_CONTEXT}" \
            -lock="${RESPECT_TF_LOCK}"

      # Writes the plan to `tfplan`, which the apply step consumes.
      - name: Terraform Plan
        working-directory: ./spartan/terraform/deploy-metrics
        run: |
          terraform plan \
            -var="RELEASE_NAME=${NAMESPACE}" \
            -var="VALUES_FILE=${VALUES_FILE}" \
            -var="GKE_CLUSTER_CONTEXT=${GKE_CLUSTER_CONTEXT}" \
            -out=tfplan \
            -lock="${RESPECT_TF_LOCK}"

      - name: Terraform Apply
        working-directory: ./spartan/terraform/deploy-metrics
        run: terraform apply -lock="${RESPECT_TF_LOCK}" -auto-approve tfplan
129 changes: 0 additions & 129 deletions .github/workflows/metrics-deploys.yml

This file was deleted.

16 changes: 1 addition & 15 deletions spartan/metrics/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,6 @@ opentelemetry-collector:
kubernetesAttributes:
enabled: true
config:
exporters:
# debug:
# verbosity: detailed
otlphttp/logs:
endpoint: http://metrics-loki.metrics:3100/otlp
otlp/tempo:
endpoint: http://metrics-tempo.metrics:4317
tls:
insecure: true
prometheus:
endpoint: ${env:MY_POD_IP}:8889
metric_expiration: 5m
resource_to_telemetry_conversion:
enabled: true
extensions:
health_check:
endpoint: ${env:MY_POD_IP}:13133
Expand Down Expand Up @@ -91,7 +77,7 @@ opentelemetry-collector:
# - debug

# Enable and configure the Loki subchart
# https://artifacthub.io/packages/helm/grafana/loki-simple-scalable
# https://artifacthub.io/packages/helm/grafana/loki
# loki:
# Nothing set here, because we need to use values from the values directory;
# otherwise, things don't get overridden correctly.
Expand Down
34 changes: 34 additions & 0 deletions spartan/metrics/values/kind.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
# Exporter configuration for the opentelemetry-collector subchart.
opentelemetry-collector:
  config:
    exporters:
      # Uncomment for verbose collector output:
      # debug:
      #   verbosity: detailed

      # Logs exporter: OTLP over HTTP.
      otlphttp/logs:
        endpoint: http://metrics-loki.metrics:3100/otlp

      # Tempo exporter: OTLP with TLS verification disabled.
      otlp/tempo:
        endpoint: http://metrics-tempo.metrics:4317
        tls:
          insecure: true

      # Prometheus exporter bound to the address in MY_POD_IP, port 8889.
      prometheus:
        endpoint: '${env:MY_POD_IP}:8889'
        metric_expiration: 5m
        resource_to_telemetry_conversion:
          enabled: true

loki:
deploymentMode: SingleBinary
loki:
Expand All @@ -23,3 +40,20 @@ loki:
replicas: 0
write:
replicas: 0

# Grafana datasource provisioning: three datasources (Loki, Tempo,
# Prometheus), with Prometheus marked as the default.
grafana:
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: Loki
          type: loki
          url: http://metrics-loki.metrics:3100

        - name: Tempo
          type: tempo
          url: http://metrics-tempo.metrics:3100

        # The only datasource here with a fixed uid.
        - name: Prometheus
          type: prometheus
          uid: spartan-metrics-prometheus
          isDefault: true
          url: http://metrics-prometheus-server.metrics:80
Loading

0 comments on commit e21069d

Please sign in to comment.