Refactor ECS Agent & Alloy configurations (#7)
* feat: refactor alloy configs

* fix: remove unused input from alloy module

* fix(alloy): name for otlp exporter was invalid, fix

* fix: update casing for ecs-agent alloy config

* fix(alloy): get config working 🎉

* feat(ci): deploy on main branch

* fix(ci): drop ifs for testing porpoises

* fix(ci): add --quiet to aws s3 cp call

* fix(ci): reintroduce if: main branch checks

* fix(ci): reintroduce cachix action

* feat(ci): make the workflow cleaner to read

* feat(ci): job-end summary

* fix(ci): remove check to test with

* fix(ci): add check back in
hbjydev authored May 1, 2024
1 parent 5b9b7da commit def0bfb
Showing 9 changed files with 171 additions and 69 deletions.
37 changes: 27 additions & 10 deletions .github/workflows/flake.yaml
@@ -6,8 +6,8 @@ on:
     branches:
       - main
 
-# env:
-#   CACHIX_BINARY_CACHE: altf4llc-os
+env:
+  CACHIX_BINARY_CACHE: altf4llc-os
 
 jobs:
   check:
@@ -16,17 +16,20 @@ jobs:
       - uses: cachix/install-nix-action@v25
         with:
           enable_kvm: true
-      # - uses: cachix/cachix-action@v14
-      #   with:
-      #     authToken: ${{ secrets.ALTF4LLC_CACHIX_AUTH_TOKEN }}
-      #     name: ${{ env.CACHIX_BINARY_CACHE }}
+      - uses: cachix/cachix-action@v14
+        with:
+          authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
+          name: ${{ env.CACHIX_BINARY_CACHE }}
       - uses: actions/checkout@v4
       - run: nix develop -c just check
 
   build:
     needs:
       - check
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      id-token: write
     strategy:
       matrix:
         profile:
@@ -36,9 +39,23 @@
       - uses: cachix/install-nix-action@v25
         with:
           enable_kvm: true
-      # - uses: cachix/cachix-action@v14
-      #   with:
-      #     authToken: ${{ secrets.ALTF4LLC_CACHIX_AUTH_TOKEN }}
-      #     name: ${{ env.CACHIX_BINARY_CACHE }}
+
+      - uses: cachix/cachix-action@v14
+        with:
+          authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
+          name: ${{ env.CACHIX_BINARY_CACHE }}
+
       - uses: actions/checkout@v4
       - run: nix develop -c just build "${{ matrix.profile }}"
+
+      - uses: aws-actions/configure-aws-credentials@v4
+        if: github.ref == 'refs/heads/main'
+        with:
+          aws-region: us-west-2
+          role-to-assume: arn:aws:iam::677459762413:role/altf4llc-gha-vms-nix
+
+      - if: github.ref == 'refs/heads/main'
+        run: aws sts get-caller-identity
+
+      - if: github.ref == 'refs/heads/main'
+        run: nix develop -c just publish-ami "${{ matrix.profile }}"
87 changes: 87 additions & 0 deletions ci-build-publish.sh
@@ -0,0 +1,87 @@
#!/usr/bin/env bash
set -euo pipefail

bucket="$1"
profile="$2"

ci="${CI:-false}"

current_group=""
group() {
  # Starts a group (GitHub Actions)
  current_group="$1"
  if [[ "$ci" == "true" ]]; then
    echo "::group::$1"
  else
    echo "> $1"
  fi
}

endgroup() {
  # Ends the group (GitHub Actions)
  if [[ "$ci" == "true" ]]; then
    echo "::endgroup::"
  else
    echo "> Finished $current_group"
  fi
  current_group=""
}

ciout() {
  # Sets the value as a job output
  if [[ "$ci" == "true" ]]; then echo "$1=$2" >> "$GITHUB_OUTPUT"; fi
}

cisum() {
  # Appends a line to the job summary
  if [[ "$ci" == "true" ]]; then
    echo "$@" >> "$GITHUB_STEP_SUMMARY"
  fi
}

build_time=$(date +%s)
image_name="altf4llc-$profile-$build_time"
ciout image_name "$image_name"

group "Building source VHD"
derivation=$(just build "$profile")
output=$(echo "$derivation" | jq -r '.[].outputs.out')
image_path=$(cd "$output" && ls -- *.vhd)
endgroup

group "Uploading VHD to S3"
aws s3 cp "$output/$image_path" "s3://$bucket/$image_name.vhd" --quiet
endgroup

group "Importing VHD as snapshot in EC2"
task_id=$(aws ec2 import-snapshot --disk-container "Format=VHD,UserBucket={S3Bucket=$bucket,S3Key=$image_name.vhd}" --output json | jq -r ".ImportTaskId")

echo "Waiting for snapshot import to complete."
until [[ $(aws ec2 describe-import-snapshot-tasks --import-task-ids "$task_id" --output json | jq -r '.ImportSnapshotTasks[].SnapshotTaskDetail.Status') == "completed" ]]; do
  echo "Snapshot is not imported yet, waiting..."
  sleep 5
done

snapshot_id=$(aws ec2 describe-import-snapshot-tasks --import-task-ids "$task_id" --output json | jq -r '.ImportSnapshotTasks[].SnapshotTaskDetail.SnapshotId')

echo "New snapshot is $snapshot_id."
ciout snapshot_id "$snapshot_id"
endgroup

group "Registering new AMI"
ami_id=$(aws ec2 register-image --architecture x86_64 --ena-support --name "$image_name" --description "A NixOS AMI: $profile" --block-device-mappings "DeviceName=/dev/sda1,Ebs={SnapshotId=$snapshot_id}" --root-device-name /dev/sda1 --output json | jq -r .ImageId)
echo "AMI is registered: $ami_id"
ciout ami_id "$ami_id"
endgroup

group "Cleaning up image VHD from bucket"
aws s3 rm "s3://$bucket/$image_name.vhd"
endgroup

cisum "# :rocket: AMI build successful"
cisum ""
cisum "An image was successfully built for Nix profile \`$profile\`."
cisum ""
cisum "- Build time: \`$build_time\`"
cisum "- VHD import job ID: \`$task_id\`"
cisum "- AMI ID: \`$ami_id\`"
cisum "- Snapshot ID: \`$snapshot_id\`"
32 changes: 1 addition & 31 deletions justfile
@@ -7,34 +7,4 @@ build profile:
   nix build --json --print-build-logs --no-link '.#{{profile}}'
 
 publish-ami profile:
-  #!/usr/bin/env bash
-  set -euo pipefail
-  BUILD_TIME=$(date +%s)
-  IMAGE_NAME="altf4llc-{{profile}}-$BUILD_TIME"
-  DERIVATION=$(just build {{profile}})
-  OUTPUT=$(echo "$DERIVATION" | jq -r '.[].outputs.out')
-  IMAGE_PATH=$(cd "$OUTPUT" && ls *.vhd)
-
-  echo "Uploading VHD to S3."
-  aws s3 cp "$OUTPUT/$IMAGE_PATH" "s3://{{ami_bucket}}/$IMAGE_NAME.vhd"
-
-  echo "Starting snapshot import."
-  TASK_ID=$(aws ec2 import-snapshot --disk-container "Format=VHD,UserBucket={S3Bucket={{ami_bucket}},S3Key=$IMAGE_NAME.vhd}" --output json | jq -r ".ImportTaskId")
-
-  echo "Waiting for snapshot import to complete."
-  until [[ $(aws ec2 describe-import-snapshot-tasks --import-task-ids "$TASK_ID" --output json | jq -r '.ImportSnapshotTasks[].SnapshotTaskDetail.Status') == "completed" ]]; do
-    echo "Snapshot is not imported yet, waiting..."
-    sleep 5
-  done
-
-  SNAPSHOT_ID=$(aws ec2 describe-import-snapshot-tasks --import-task-ids "$TASK_ID" --output json | jq -r '.ImportSnapshotTasks[].SnapshotTaskDetail.SnapshotId')
-
-  echo "New snapshot is $SNAPSHOT_ID."
-
-  AMI_ID=$(aws ec2 register-image --architecture x86_64 --ena-support --name "$IMAGE_NAME" --description "A NixOS AMI: {{profile}}" --block-device-mappings "DeviceName=/dev/sda1,Ebs={SnapshotId=$SNAPSHOT_ID}" --root-device-name /dev/sda1 | jq .ImageId)
-
-  echo "AMI is registered: $AMI_ID"
-
-  echo "Cleaning up image VHD from bucket"
-  aws s3 rm "s3://{{ami_bucket}}/$IMAGE_NAME.vhd"
+  bash ./ci-build-publish.sh "{{ami_bucket}}" "{{profile}}"
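
Collapsing the recipe to a single delegation keeps the justfile declarative while the shell logic lives in one place shared by CI and local runs; `{{ami_bucket}}` and `{{profile}}` are still interpolated by `just`, so the script only ever sees plain positional arguments.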
42 changes: 31 additions & 11 deletions modules/mixins/alloy/config.alloy
@@ -1,5 +1,5 @@
-# We ship everything over OTLP
-otelcol.exporter.otlp "gc-fwd" {
+// We ship everything over OTLP
+otelcol.exporter.otlp "gc_fwd" {
   client {
     endpoint = "gc-fwd.altf4.internal:4317"
     tls {
@@ -8,38 +8,58 @@ otelcol.exporter.otlp "gc-fwd" {
   }
 }
 
-# Convert Prometheus data for OTLP
+// Convert Prometheus data for OTLP
 otelcol.receiver.prometheus "default" {
   output {
-    metrics = [otelcol.exporter.otlp.gc-fwd.input]
+    metrics = [otelcol.exporter.otlp.gc_fwd.input]
   }
 }
 
-# Convert Loki data for OTLP
+// Convert OTLP data for Prometheus
+otelcol.exporter.prometheus "default" {
+  forward_to = [prometheus.relabel.instance.receiver]
+}
+
+// Convert Loki data for OTLP
 otelcol.receiver.loki "default" {
   output {
-    logs = [otelcol.exporter.otlp.gc-fwd.input]
+    logs = [otelcol.exporter.otlp.gc_fwd.input]
   }
 }
 
-# Extract Systemd unit from journal entry
+// Extract Systemd unit from journal entry
 loki.relabel "journal" {
   forward_to = []
 
   rule {
     source_labels = ["__journal__systemd_unit"]
     target_label = "unit"
   }
+
+  rule {
+    source_labels = ["__journal_container_name"]
+    target_label = "container_name"
+  }
+
+  rule {
+    source_labels = ["__journal_image_name"]
+    target_label = "container_image"
+  }
+
+  rule {
+    source_labels = ["__journal_container_id"]
+    target_label = "container_id"
+  }
 }
 
-# Fetch journal entries
+// Fetch journal entries
 loki.source.journal "journal" {
   forward_to = [otelcol.receiver.loki.default.receiver]
   relabel_rules = loki.relabel.journal.rules
   labels = {component = "loki.source.journal"}
 }
 
-# Set instance label to the hostname
+// Set instance label to the hostname
 prometheus.relabel "instance" {
   forward_to = [otelcol.receiver.prometheus.default.receiver]
   rule {
@@ -48,7 +68,7 @@ prometheus.relabel "instance" {
   }
 }
 
-# Export system metrics
+// Export system metrics
 prometheus.exporter.unix "host" {
   procfs_path = "/host/proc"
   sysfs_path = "/host/sys"
@@ -60,7 +80,7 @@ prometheus.exporter.unix "host" {
   }
 }
 
-# Scrape system metrics
+// Scrape system metrics
 prometheus.scrape "host" {
   targets = prometheus.exporter.unix.host.targets
   forward_to = [prometheus.relabel.instance.receiver]
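
Naming mistakes like the `gc-fwd` label this commit renames to `gc_fwd` surface when Alloy loads the configuration, so a cheap pre-deploy check is to let a local binary parse and load the file. A minimal sketch, assuming a Grafana Alloy binary on PATH, run from the repository root, with a throwaway storage path:

```bash
# Syntax-only check: parses the file and exits non-zero on parse errors.
alloy fmt modules/mixins/alloy/config.alloy > /dev/null

# Full load: component labels and cross-references (e.g. the old
# otelcol.exporter.otlp.gc-fwd.input) are validated at startup.
# This is long-running; interrupt it once it starts cleanly.
alloy run --storage.path=/tmp/alloy-check modules/mixins/alloy/config.alloy
```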
2 changes: 1 addition & 1 deletion modules/mixins/alloy/default.nix
@@ -1,4 +1,4 @@
-{config, ...}: {
+{...}: {
   # see TODO further down
   imports = [../docker];
 
12 changes: 12 additions & 0 deletions modules/mixins/docker/config.alloy
@@ -0,0 +1,12 @@
prometheus.exporter.cadvisor "cadvisor" {
  docker_host = "unix:///var/run/docker.sock"
  storage_duration = "5m"
}

prometheus.scrape "cadvisor" {
  targets = prometheus.exporter.cadvisor.cadvisor.targets
  forward_to = [prometheus.relabel.instance.receiver]
  scrape_interval = "30s"
}

// vim:ft=hcl
7 changes: 7 additions & 0 deletions modules/mixins/docker/default.nix
@@ -1,4 +1,11 @@
 {...}: {
   virtualisation.docker.enable = true;
   virtualisation.oci-containers.backend = "docker";
+
+  # Monitoring
+  environment.etc."alloy/docker.alloy" = {
+    source = ./config.alloy;
+    mode = "0440";
+    user = "root";
+  };
 }
19 changes: 4 additions & 15 deletions modules/mixins/ecs-agent/config.alloy
@@ -1,15 +1,4 @@
-prometheus.exporter.cadvisor "cadvisor" {
-  docker_host = "unix:///var/run/docker.sock"
-  storage_duration = "5m"
-}
-
-prometheus.scrape "cadvisor" {
-  targets = prometheus.exporter.cadvisor.cadvisor.targets
-  forward_to = [prometheus.relabel.instance.receiver]
-  scrape_interval = "30s"
-}
-
-prometheus.scrape "ecs-agent" {
+prometheus.scrape "ecs_agent" {
   targets = [
     {"__address__" = "127.0.0.1:51680", instance = env("HOSTNAME")},
   ]
@@ -24,9 +13,9 @@ otelcol.receiver.otlp "otlp" {
   http {}
 
   output {
-    metrics = [prometheus.relabel.instance.receiver]
-    logs = [otelcol.receiver.loki.default.receiver]
-    traces = [otelcol.exporter.otlp.gc-fwd.input]
+    metrics = [otelcol.exporter.prometheus.default.input]
+    logs = [otelcol.exporter.otlp.gc_fwd.input]
+    traces = [otelcol.exporter.otlp.gc_fwd.input]
   }
 }
 
2 changes: 1 addition & 1 deletion modules/mixins/ecs-agent/default.nix
@@ -1,4 +1,4 @@
-{pkgs, ...}: {
+{...}: {
   imports = [
     ../docker
     ../alloy
