Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor ECS Agent & Alloy configurations #7

Merged
merged 14 commits into from
May 1, 2024
Merged
37 changes: 27 additions & 10 deletions .github/workflows/flake.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ on:
branches:
- main

# env:
# CACHIX_BINARY_CACHE: altf4llc-os
env:
CACHIX_BINARY_CACHE: altf4llc-os

jobs:
check:
Expand All @@ -16,17 +16,20 @@ jobs:
- uses: cachix/install-nix-action@v25
with:
enable_kvm: true
# - uses: cachix/cachix-action@v14
# with:
# authToken: ${{ secrets.ALTF4LLC_CACHIX_AUTH_TOKEN }}
# name: ${{ env.CACHIX_BINARY_CACHE }}
- uses: cachix/cachix-action@v14
with:
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
name: ${{ env.CACHIX_BINARY_CACHE }}
- uses: actions/checkout@v4
- run: nix develop -c just check

build:
needs:
- check
runs-on: ubuntu-latest
permissions:
contents: read
id-token: write
strategy:
matrix:
profile:
Expand All @@ -36,9 +39,23 @@ jobs:
- uses: cachix/install-nix-action@v25
with:
enable_kvm: true
# - uses: cachix/cachix-action@v14
# with:
# authToken: ${{ secrets.ALTF4LLC_CACHIX_AUTH_TOKEN }}
# name: ${{ env.CACHIX_BINARY_CACHE }}

- uses: cachix/cachix-action@v14
with:
authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
name: ${{ env.CACHIX_BINARY_CACHE }}

- uses: actions/checkout@v4
- run: nix develop -c just build "${{ matrix.profile }}"

- uses: aws-actions/configure-aws-credentials@v4
if: github.ref == 'refs/heads/main'
with:
aws-region: us-west-2
role-to-assume: arn:aws:iam::677459762413:role/altf4llc-gha-vms-nix

- if: github.ref == 'refs/heads/main'
run: aws sts get-caller-identity

- if: github.ref == 'refs/heads/main'
run: nix develop -c just publish-ami "${{ matrix.profile }}"
87 changes: 87 additions & 0 deletions ci-build-publish.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/env bash
# Build a NixOS VHD for a Nix profile, import it into EC2 as a snapshot,
# and register it as an AMI. Intended to run from CI (GitHub Actions) but
# also works locally; CI-only output helpers degrade to plain echo.
#
# Usage: ci-build-publish.sh <s3-bucket> <nix-profile>
set -euo pipefail

# Fail early with a usage message instead of an opaque `set -u`
# "unbound variable" error when arguments are missing.
if (( $# != 2 )); then
  echo "usage: ${0##*/} <s3-bucket> <nix-profile>" >&2
  exit 2
fi

bucket="$1"
profile="$2"

# GitHub Actions sets CI=true; everything CI-specific keys off this flag.
ci="${CI:-false}"

# Name of the currently open log group; consumed by endgroup() for the
# non-CI "Finished" message.
current_group=""

# Open a collapsible log group. On GitHub Actions this emits the
# ::group:: workflow command; locally it prints a plain "> " heading.
group() {
  current_group=$1
  case "$ci" in
    true) printf '::group::%s\n' "$current_group" ;;
    *)    printf '> %s\n' "$current_group" ;;
  esac
}

# Close the log group opened by group(). Emits the ::endgroup:: workflow
# command on GitHub Actions; locally, reports which step just finished.
endgroup() {
  if [[ "$ci" != "true" ]]; then
    echo "> Finished $current_group"
  else
    echo "::endgroup::"
  fi
  current_group=""
}

# Publish a step output "$1=$2" to GitHub Actions via $GITHUB_OUTPUT.
# No-op (exit 0) outside CI so the script stays usable locally.
ciout() {
  if [[ "$ci" != "true" ]]; then
    return 0
  fi
  printf '%s=%s\n' "$1" "$2" >> "$GITHUB_OUTPUT"
}

# Append one line (all arguments, space-joined) to the GitHub Actions job
# summary ($GITHUB_STEP_SUMMARY). Silently does nothing outside CI.
cisum() {
  if [[ "$ci" == "true" ]]; then
    printf '%s\n' "$*" >> "$GITHUB_STEP_SUMMARY"
  fi
}

# Unique image name: profile + build timestamp.
build_time=$(date +%s)
image_name="altf4llc-$profile-$build_time"
ciout image_name "$image_name"

group "Building source VHD"
# `just build` prints the nix build JSON; pull the store output path from it.
derivation=$(just build "$profile")
output=$(echo "$derivation" | jq -r '.[].outputs.out')
image_path=$(cd "$output" && ls -- *.vhd)
endgroup

group "Uploading VHD to S3"
aws s3 cp "$output/$image_path" "s3://$bucket/$image_name.vhd" --quiet
endgroup

group "Importing VHD as snapshot in EC2"
task_id=$(aws ec2 import-snapshot --disk-container "Format=VHD,UserBucket={S3Bucket=$bucket,S3Key=$image_name.vhd}" --output json | jq -r ".ImportTaskId")

echo "Waiting for snapshot import to complete."
# Poll until the import task finishes. Bail out if AWS reports the task
# as deleted/deleting so a failed import cannot spin forever.
while :; do
  status=$(aws ec2 describe-import-snapshot-tasks --import-task-ids "$task_id" --output json | jq -r '.ImportSnapshotTasks[].SnapshotTaskDetail.Status')
  case "$status" in
    completed)
      break
      ;;
    deleted|deleting)
      echo "Snapshot import $task_id failed (status: $status)." >&2
      exit 1
      ;;
    *)
      echo "Snapshot is not imported yet (status: $status), waiting..."
      sleep 5
      ;;
  esac
done

snapshot_id=$(aws ec2 describe-import-snapshot-tasks --import-task-ids "$task_id" --output json | jq -r '.ImportSnapshotTasks[].SnapshotTaskDetail.SnapshotId')

echo "New snapshot is $snapshot_id."
ciout snapshot_id "$snapshot_id"
endgroup

group "Registering new AMI"
# Fixes vs. the justfile this was extracted from:
#  - the description used a literal `{{profile}}` template placeholder,
#    which is meaningless in plain bash; interpolate $profile instead.
#  - `jq -r` so $ami_id is the bare ID, not a quoted JSON string.
ami_id=$(aws ec2 register-image --architecture x86_64 --ena-support --name "$image_name" --description "A NixOS AMI: $profile" --block-device-mappings "DeviceName=/dev/sda1,Ebs={SnapshotId=$snapshot_id}" --root-device-name /dev/sda1 | jq -r .ImageId)
echo "AMI is registered: $ami_id"
ciout ami_id "$ami_id"
endgroup

group "Cleaning up image VHD from bucket"
aws s3 rm "s3://$bucket/$image_name.vhd"
endgroup

cisum "# :rocket: AMI build successful"
cisum ""
cisum "An image was successfully built for Nix profile \`$profile\`."
cisum ""
cisum "- Build time: \`$build_time\`"
cisum "- VHD import job ID: \`$task_id\`"
cisum "- AMI ID: \`$ami_id\`"
cisum "- Snapshot ID: \`$snapshot_id\`"
32 changes: 1 addition & 31 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,34 +7,4 @@ build profile:
nix build --json --print-build-logs --no-link '.#{{profile}}'

publish-ami profile:
#!/usr/bin/env bash
set -euo pipefail
BUILD_TIME=$(date +%s)
IMAGE_NAME="altf4llc-{{profile}}-$BUILD_TIME"

DERIVATION=$(just build {{profile}})
OUTPUT=$(echo "$DERIVATION" | jq -r '.[].outputs.out')
IMAGE_PATH=$(cd "$OUTPUT" && ls *.vhd)

echo "Uploading VHD to S3."
aws s3 cp "$OUTPUT/$IMAGE_PATH" "s3://{{ami_bucket}}/$IMAGE_NAME.vhd"

echo "Starting snapshot import."
TASK_ID=$(aws ec2 import-snapshot --disk-container "Format=VHD,UserBucket={S3Bucket={{ami_bucket}},S3Key=$IMAGE_NAME.vhd}" --output json | jq -r ".ImportTaskId")

echo "Waiting for snapshot import to complete."
until [[ $(aws ec2 describe-import-snapshot-tasks --import-task-ids "$TASK_ID" --output json | jq -r '.ImportSnapshotTasks[].SnapshotTaskDetail.Status') == "completed" ]]; do
echo "Snapshot is not imported yet, waiting..."
sleep 5
done

SNAPSHOT_ID=$(aws ec2 describe-import-snapshot-tasks --import-task-ids "$TASK_ID" --output json | jq -r '.ImportSnapshotTasks[].SnapshotTaskDetail.SnapshotId')

echo "New snapshot is $SNAPSHOT_ID."

AMI_ID=$(aws ec2 register-image --architecture x86_64 --ena-support --name "$IMAGE_NAME" --description "A NixOS AMI: {{profile}}" --block-device-mappings "DeviceName=/dev/sda1,Ebs={SnapshotId=$SNAPSHOT_ID}" --root-device-name /dev/sda1 | jq .ImageId)

echo "AMI is registered: $AMI_ID"

echo "Cleaning up image VHD from bucket"
aws s3 rm "s3://{{ami_bucket}}/$IMAGE_NAME.vhd"
bash ./ci-build-publish.sh "{{ami_bucket}}" "{{profile}}"
42 changes: 31 additions & 11 deletions modules/mixins/alloy/config.alloy
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# We ship everything over OTLP
otelcol.exporter.otlp "gc-fwd" {
// We ship everything over OTLP
otelcol.exporter.otlp "gc_fwd" {
client {
endpoint = "gc-fwd.altf4.internal:4317"
tls {
Expand All @@ -8,38 +8,58 @@ otelcol.exporter.otlp "gc-fwd" {
}
}

# Convert Prometheus data for OTLP
// Convert Prometheus data for OTLP
otelcol.receiver.prometheus "default" {
output {
metrics = [otelcol.exporter.otlp.gc-fwd.input]
metrics = [otelcol.exporter.otlp.gc_fwd.input]
}
}

# Convert Loki data for OTLP
// Convert OTLP data for Prometheus
otelcol.exporter.prometheus "default" {
forward_to = [prometheus.relabel.instance.receiver]
}

// Convert Loki data for OTLP
otelcol.receiver.loki "default" {
output {
logs = [otelcol.exporter.otlp.gc-fwd.input]
logs = [otelcol.exporter.otlp.gc_fwd.input]
}
}

# Extract Systemd unit from journal entry
// Extract Systemd unit from journal entry
loki.relabel "journal" {
forward_to = []

rule {
source_labels = ["__journal__systemd_unit"]
target_label = "unit"
}

rule {
source_labels = ["__journal_container_name"]
target_label = "container_name"
}

rule {
source_labels = ["__journal_image_name"]
target_label = "container_image"
}

rule {
source_labels = ["__journal_container_id"]
target_label = "container_id"
}
}

# Fetch journal entries
// Fetch journal entries
loki.source.journal "journal" {
forward_to = [otelcol.receiver.loki.default.receiver]
relabel_rules = loki.relabel.journal.rules
labels = {component = "loki.source.journal"}
}

# Set instance label to the hostname
// Set instance label to the hostname
prometheus.relabel "instance" {
forward_to = [otelcol.receiver.prometheus.default.receiver]
rule {
Expand All @@ -48,7 +68,7 @@ prometheus.relabel "instance" {
}
}

# Export system metrics
// Export system metrics
prometheus.exporter.unix "host" {
procfs_path = "/host/proc"
sysfs_path = "/host/sys"
Expand All @@ -60,7 +80,7 @@ prometheus.exporter.unix "host" {
}
}

# Scrape system metrics
// Scrape system metrics
prometheus.scrape "host" {
targets = prometheus.exporter.unix.host.targets
forward_to = [prometheus.relabel.instance.receiver]
Expand Down
2 changes: 1 addition & 1 deletion modules/mixins/alloy/default.nix
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{config, ...}: {
{...}: {
# see TODO further down
imports = [../docker];

Expand Down
12 changes: 12 additions & 0 deletions modules/mixins/docker/config.alloy
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Export container metrics from the local Docker daemon via cAdvisor.
prometheus.exporter.cadvisor "cadvisor" {
docker_host = "unix:///var/run/docker.sock"
// How long cAdvisor keeps per-container stats in memory.
storage_duration = "5m"
}

// Scrape the cAdvisor exporter above every 30s and forward the samples to
// prometheus.relabel.instance — NOTE(review): that component is not defined
// in this file; it is expected to come from the alloy mixin config loaded
// alongside this one (confirm when changing the alloy mixin).
prometheus.scrape "cadvisor" {
targets = prometheus.exporter.cadvisor.cadvisor.targets
forward_to = [prometheus.relabel.instance.receiver]
scrape_interval = "30s"
}

// vim:ft=hcl
7 changes: 7 additions & 0 deletions modules/mixins/docker/default.nix
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
# Docker mixin: enables the Docker daemon and registers the Alloy config
# fragment that monitors it (cAdvisor exporter + scrape).
{...}: {
virtualisation.docker.enable = true;
# Run declarative OCI containers through Docker (not the default podman).
virtualisation.oci-containers.backend = "docker";

# Monitoring: install this mixin's Alloy fragment; presumably the Alloy
# service loads every file under /etc/alloy — confirm against the alloy mixin.
environment.etc."alloy/docker.alloy" = {
source = ./config.alloy;
mode = "0440";
user = "root";
};
}
19 changes: 4 additions & 15 deletions modules/mixins/ecs-agent/config.alloy
Original file line number Diff line number Diff line change
@@ -1,15 +1,4 @@
prometheus.exporter.cadvisor "cadvisor" {
docker_host = "unix:///var/run/docker.sock"
storage_duration = "5m"
}

prometheus.scrape "cadvisor" {
targets = prometheus.exporter.cadvisor.cadvisor.targets
forward_to = [prometheus.relabel.instance.receiver]
scrape_interval = "30s"
}

prometheus.scrape "ecs-agent" {
prometheus.scrape "ecs_agent" {
targets = [
{"__address__" = "127.0.0.1:51680", instance = env("HOSTNAME")},
]
Expand All @@ -24,9 +13,9 @@ otelcol.receiver.otlp "otlp" {
http {}

output {
metrics = [prometheus.relabel.instance.receiver]
logs = [otelcol.receiver.loki.default.receiver]
traces = [otelcol.exporter.otlp.gc-fwd.input]
metrics = [otelcol.exporter.prometheus.default.input]
logs = [otelcol.exporter.otlp.gc_fwd.input]
traces = [otelcol.exporter.otlp.gc_fwd.input]
}
}

Expand Down
2 changes: 1 addition & 1 deletion modules/mixins/ecs-agent/default.nix
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{pkgs, ...}: {
{...}: {
imports = [
../docker
../alloy
Expand Down