Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bare-metal GPU runtime class #1135

Merged
merged 2 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/e2e_manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ on:
options:
- genpolicy
- getdents
- gpu
- openssl
- policy
- regression
Expand All @@ -24,6 +25,7 @@ on:
options:
- AKS-CLH-SNP
- K3s-QEMU-SNP
- K3s-QEMU-SNP-GPU
- K3s-QEMU-TDX
skip-undeploy:
description: "Skip undeploy"
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/e2e_nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,16 @@ jobs:
- name: K3s-QEMU-TDX
runner: TDX
self-hosted: true
- name: K3s-QEMU-SNP-GPU
runner: SNP
self-hosted: true
test-name: [servicemesh, openssl, policy, workloadsecret, volumestatefulset]
include:
- platform:
name: K3s-QEMU-SNP-GPU
runner: SNP
self-hosted: true
test-name: [gpu]
fail-fast: false
name: "${{ matrix.platform.name }}"
uses: ./.github/workflows/e2e.yml
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ jobs:
coordinatorImg=$(nix run .#containers.push-coordinator -- "$container_registry/contrast/coordinator")
nodeInstallerMsftImg=$(nix run .#containers.push-node-installer-microsoft -- "$container_registry/contrast/node-installer-microsoft")
nodeInstallerKataImg=$(nix run .#containers.push-node-installer-kata -- "$container_registry/contrast/node-installer-kata")
# Push the GPU installer to its own repository so the pushed reference matches
# the node-installer-kata-gpu:latest image it is substituted for in the release.
nodeInstallerKataGPUImg=$(nix run .#containers.push-node-installer-kata-gpu -- "$container_registry/contrast/node-installer-kata-gpu")
initializerImg=$(nix run .#containers.push-initializer -- "$container_registry/contrast/initializer")
serviceMeshImg=$(nix run .#containers.push-service-mesh-proxy -- "$container_registry/contrast/service-mesh-proxy")
tardevSnapshotterImg=$(nix run .#containers.push-tardev-snapshotter -- "$container_registry/contrast/tardev-snapshotter")
Expand All @@ -256,6 +257,7 @@ jobs:
echo "coordinatorImg=$coordinatorImg" | tee -a "$GITHUB_ENV"
echo "nodeInstallerMsftImg=$nodeInstallerMsftImg" | tee -a "$GITHUB_ENV"
echo "nodeInstallerKataImg=$nodeInstallerKataImg" | tee -a "$GITHUB_ENV"
echo "nodeInstallerKataGPUImg=$nodeInstallerKataGPUImg" | tee -a "$GITHUB_ENV"
echo "initializerImg=$initializerImg" | tee -a "$GITHUB_ENV"
echo "serviceMeshImg=$serviceMeshImg" | tee -a "$GITHUB_ENV"
echo "tardevSnapshotterImg=$tardevSnapshotterImg" | tee -a "$GITHUB_ENV"
Expand All @@ -272,6 +274,7 @@ jobs:
echo "coordinatorImgTagged=$(tag "$coordinatorImg")" | tee -a "$GITHUB_ENV"
echo "nodeInstallerMsftImgTagged=$(tag "$nodeInstallerMsftImg")" | tee -a "$GITHUB_ENV"
echo "nodeInstallerKataImgTagged=$(tag "$nodeInstallerKataImg")" | tee -a "$GITHUB_ENV"
echo "nodeInstallerKataGPUImgTagged=$(tag "$nodeInstallerKataGPUImg")" | tee -a "$GITHUB_ENV"
echo "initializerImgTagged=$(tag "$initializerImg")" | tee -a "$GITHUB_ENV"
echo "serviceMeshImgTagged=$(tag "$serviceMeshImg")" | tee -a "$GITHUB_ENV"
echo "nydusPullImgTagged=$(tag "$nydusPullImg")" | tee -a "$GITHUB_ENV"
Expand All @@ -294,6 +297,7 @@ jobs:
echo "ghcr.io/edgelesssys/contrast/service-mesh-proxy:latest=$serviceMeshImgTagged"
echo "ghcr.io/edgelesssys/contrast/node-installer-microsoft:latest=$nodeInstallerMsftImgTagged"
echo "ghcr.io/edgelesssys/contrast/node-installer-kata:latest=$nodeInstallerKataImgTagged"
echo "ghcr.io/edgelesssys/contrast/node-installer-kata-gpu:latest=$nodeInstallerKataGPUImgTagged"
echo "ghcr.io/edgelesssys/contrast/tardev-snapshotter:latest=$tardevSnapshotterImgTagged"
echo "ghcr.io/edgelesssys/contrast/nydus-snapshotter:latest=$nydusSnapshotterImgTagged"
echo "ghcr.io/edgelesssys/contrast/nydus-pull:latest=$nydusPullImgTagged"
Expand Down
91 changes: 91 additions & 0 deletions e2e/gpu/gpu_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Copyright 2024 Edgeless Systems GmbH
// SPDX-License-Identifier: AGPL-3.0-only

//go:build e2e

package gpu

import (
"bytes"
"context"
"flag"
"os"
"testing"
"time"

"github.com/edgelesssys/contrast/e2e/internal/contrasttest"
"github.com/edgelesssys/contrast/internal/kuberesource"
"github.com/edgelesssys/contrast/internal/manifest"
"github.com/edgelesssys/contrast/internal/platforms"
"github.com/stretchr/testify/require"
)

const (
gpuPodName = "gpu-pod"
gpuName = "NVIDIA H100 PCIe"
)

// TestGPU runs e2e tests on a GPU-enabled Contrast.
//
// It deploys the resources returned by kuberesource.OpenSSL and
// kuberesource.CoordinatorBundle, runs the generate/apply/set/verify steps,
// then applies the GPU pod manifest from testdata and checks that nvidia-smi
// inside that pod reports the expected GPU name.
func TestGPU(t *testing.T) {
	platform, err := platforms.FromString(contrasttest.Flags.PlatformStr)
	require.NoError(t, err)
	ct := contrasttest.New(t)

	handler, err := manifest.RuntimeHandler(platform)
	require.NoError(t, err)

	// Assemble the deployment, patch in the platform's runtime handler and
	// add port forwarders before handing it to the test harness.
	deployment := append(kuberesource.OpenSSL(), kuberesource.CoordinatorBundle()...)
	deployment = kuberesource.PatchRuntimeHandlers(deployment, handler)
	deployment = kuberesource.AddPortForwarders(deployment)
	ct.Init(t, deployment)

	// Every setup step is a prerequisite for what follows, so the test
	// stops at the first one that fails.
	setupSteps := []struct {
		name    string
		run     func(*testing.T)
		failMsg string
	}{
		{"generate", ct.Generate, "contrast generate needs to succeed for subsequent tests"},
		{"apply", ct.Apply, "Kubernetes resources need to be applied for subsequent tests"},
		{"set", ct.Set, "contrast set needs to succeed for subsequent tests"},
		{"contrast verify", ct.Verify, "contrast verify needs to succeed for subsequent tests"},
	}
	for _, step := range setupSteps {
		require.True(t, t.Run(step.name, step.run), step.failMsg)
	}

	deployGPUPod := func(t *testing.T) {
		podYAML, err := os.ReadFile("./e2e/gpu/testdata/gpu-pod.yaml")
		require.NoError(t, err)

		// Fill in the per-test placeholders of the manifest template.
		podYAML = bytes.ReplaceAll(podYAML, []byte("@@REPLACE_NAMESPACE@@"), []byte(ct.Namespace))
		podYAML = bytes.ReplaceAll(podYAML, []byte("@@REPLACE_RUNTIME@@"), []byte(ct.RuntimeClassName))

		ct.ApplyFromYAML(t, podYAML)
	}

	require.True(t, t.Run("apply GPU pod", deployGPUPod), "GPU pod needs to deploy successfully for subsequent tests")

	t.Run("check GPU availability", func(t *testing.T) {
		require := require.New(t)

		// One deadline covers both waiting for the pod and the exec call.
		timeout := ct.FactorPlatformTimeout(5 * time.Minute)
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		defer cancel()

		waitErr := ct.Kubeclient.WaitForPod(ctx, ct.Namespace, gpuPodName)
		require.NoError(waitErr, "GPU pod %s did not start", gpuPodName)

		smiCmd := []string{"/bin/sh", "-c", "nvidia-smi"}
		stdout, stderr, execErr := ct.Kubeclient.Exec(ctx, ct.Namespace, gpuPodName, smiCmd)
		require.NoError(execErr, "stderr: %q", stderr)

		require.Contains(stdout, gpuName, "nvidia-smi output should contain %s", gpuName)
	})
}

// TestMain calls contrasttest.RegisterFlags and parses the command line
// before running the package's tests, then exits the process with the
// status code returned by m.Run.
func TestMain(m *testing.M) {
	contrasttest.RegisterFlags()
	flag.Parse()

	exitCode := m.Run()
	os.Exit(exitCode)
}
25 changes: 25 additions & 0 deletions e2e/gpu/testdata/gpu-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# TODO(msanft): Move this to internal/kuberesource/sets.go as soon as genpolicy
# support for GPU pods is added.
apiVersion: v1
kind: Pod
metadata:
name: gpu-pod
namespace: "@@REPLACE_NAMESPACE@@"
annotations:
# Allow-all policy
# TODO(msanft): Generate a policy dynamically once we support policy generation for GPU pods.
io.katacontainers.config.agent.policy: IyBDb3B5cmlnaHQgKGMpIDIwMjMgTWljcm9zb2Z0IENvcnBvcmF0aW9uCiMKIyBTUERYLUxpY2Vuc2UtSWRlbnRpZmllcjogQXBhY2hlLTIuMAojCgpwYWNrYWdlIGFnZW50X3BvbGljeQoKZGVmYXVsdCBBZGRBUlBOZWlnaGJvcnNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBBZGRTd2FwUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ2xvc2VTdGRpblJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IENvcHlGaWxlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlU2FuZGJveFJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IERlc3Ryb3lTYW5kYm94UmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgRXhlY1Byb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHZXRNZXRyaWNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgR2V0T09NRXZlbnRSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHdWVzdERldGFpbHNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBMaXN0SW50ZXJmYWNlc1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IExpc3RSb3V0ZXNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBNZW1Ib3RwbHVnQnlQcm9iZVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IE9ubGluZUNQVU1lbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFBhdXNlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUHVsbEltYWdlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUmVhZFN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZVN0YWxlVmlydGlvZnNTaGFyZU1vdW50c1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc2VlZFJhbmRvbURldlJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc3VtZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFNldEd1ZXN0RGF0ZVRpbWVSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTZXRQb2xpY3lSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTaWduYWxQcm9jZXNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgU3RhcnRDb250YWluZXJSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGFydFRyYWNpbmdSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGF0c0NvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFN0b3BUcmFjaW5nUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVHR5V2luUmVzaXplUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlRXBoZW1lcmFsTW91bnRzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlSW50ZXJmYWNlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlUm91dGVzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgV2FpdFByb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBXcml0ZVN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQo=
io.katacontainers.config.hypervisor.default_memory: "15258"
cdi.k8s.io/gpu: "nvidia.com/pgpu=0"
spec:
runtimeClassName: "@@REPLACE_RUNTIME@@"
restartPolicy: OnFailure
containers:
- name: vllm
image: ghcr.io/edgelesssys/contrast/ubuntu:24.04
env:
- name: NVIDIA_VISIBLE_DEVICES
value: all
resources:
limits:
"nvidia.com/GH100_H100_PCIE": 1
17 changes: 15 additions & 2 deletions e2e/internal/contrasttest/contrasttest.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ type ContrastTest struct {
ImageReplacementsFile string
Platform platforms.Platform
NamespaceFile string
RuntimeClassName string
Kubeclient *kubeclient.Kubeclient

// outputs of contrast subcommands
Expand All @@ -70,15 +71,21 @@ type ContrastTest struct {

// New creates a new ContrastTest bound to the given test.
//
// The platform is parsed from Flags.PlatformStr, and RuntimeClassName is set
// to the handler of the runtime class that kuberesource.ContrastRuntimeClass
// returns for that platform. If either lookup fails, or the runtime class
// carries no handler, the test is aborted immediately.
func New(t *testing.T) *ContrastTest {
	require := require.New(t)

	platform, err := platforms.FromString(Flags.PlatformStr)
	require.NoError(err)

	runtimeClass, err := kuberesource.ContrastRuntimeClass(platform)
	require.NoError(err)
	// Handler is a pointer: fail the test with a readable message instead of
	// panicking on a nil dereference when building the struct below.
	require.NotNil(runtimeClass.Handler, "runtime class for platform %q has no handler", Flags.PlatformStr)

	return &ContrastTest{
		Namespace:             MakeNamespace(t, Flags.NamespaceSuffix),
		WorkDir:               t.TempDir(),
		ImageReplacementsFile: Flags.ImageReplacementsFile,
		Platform:              platform,
		NamespaceFile:         Flags.NamespaceFile,
		RuntimeClassName:      *runtimeClass.Handler,
		Kubeclient:            kubeclient.NewForTest(t),
	}
}
Expand Down Expand Up @@ -283,9 +290,15 @@ func patchReferenceValues(k *kubeclient.Kubeclient, platform platforms.Platform)
// Apply reads resources.yml from the test's working directory and applies
// its contents to the Kubernetes test environment through ApplyFromYAML.
// The test is aborted if the file cannot be read.
func (ct *ContrastTest) Apply(t *testing.T) {
	resourcesPath := path.Join(ct.WorkDir, "resources.yml")

	manifests, err := os.ReadFile(resourcesPath)
	require.New(t).NoError(err)

	ct.ApplyFromYAML(t, manifests)
}

// ApplyFromYAML applies the given YAML to the Kubernetes test environment.
func (ct *ContrastTest) ApplyFromYAML(t *testing.T, yaml []byte) {
require := require.New(t)

objects, err := kubeapi.UnmarshalUnstructuredK8SResource(yaml)
require.NoError(err)

Expand Down
6 changes: 6 additions & 0 deletions internal/kuberesource/parts.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ func NodeInstaller(namespace string, platform platforms.Platform) (*NodeInstalle
snapshotterVolumes = tardevSnapshotterVolumes
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.MetalQEMUSNPGPU:
nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-kata:latest"
if platform == platforms.MetalQEMUSNPGPU {
nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-kata-gpu:latest"
}
containers = append(containers, nydusSnapshotter, nydusPull)
nydusSnapshotterVolumes = append(nydusSnapshotterVolumes,
Volume().
Expand All @@ -171,6 +174,9 @@ func NodeInstaller(namespace string, platform platforms.Platform) (*NodeInstalle
snapshotterVolumes = nydusSnapshotterVolumes
case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX:
nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-kata:latest"
if platform == platforms.K3sQEMUSNPGPU {
nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-kata-gpu:latest"
}
containers = append(containers, nydusSnapshotter, nydusPull)
nydusSnapshotterVolumes = append(nydusSnapshotterVolumes,
Volume().
Expand Down
7 changes: 6 additions & 1 deletion justfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,16 @@ node-installer platform=default_platform:
just push "tardev-snapshotter"
just push "node-installer-microsoft"
;;
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
just push "nydus-snapshotter"
just push "nydus-pull"
just push "node-installer-kata"
;;
"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP-GPU")
just push "nydus-snapshotter"
just push "nydus-pull"
just push "node-installer-kata-gpu"
;;
"AKS-PEER-SNP")
nix run -L .#scripts.deploy-caa -- \
--kustomization=./infra/azure-peerpods/kustomization.yaml \
Expand Down
63 changes: 36 additions & 27 deletions packages/by-name/contrast/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ let
subPackages = [
"e2e/genpolicy"
"e2e/getdents"
"e2e/gpu"
"e2e/openssl"
"e2e/servicemesh"
"e2e/release"
Expand Down Expand Up @@ -81,35 +82,43 @@ let
];
};

snpRefVals = {
snp =
let
launch-digest =
if kata.contrast-node-installer-image.debugRuntime then
kata.snp-launch-digest.override { debug = true; }
else
kata.snp-launch-digest;
in
[
{
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/milan.hex");
productName = "Milan";
}
{
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/genoa.hex");
productName = "Genoa";
}
];
};
snpRefValsWith =
{ gpu }:
{
snp =
let
os-image =
if gpu then
kata.contrast-node-installer-image.gpu.os-image
else
kata.contrast-node-installer-image.os-image;
launch-digest = kata.snp-launch-digest.override {
inherit os-image;
debug = kata.contrast-node-installer-image.debugRuntime;
};
in
[
{
trustedMeasurement = builtins.readFile "${launch-digest}/milan.hex";
productName = "Milan";
}
{
trustedMeasurement = builtins.readFile "${launch-digest}/genoa.hex";
productName = "Genoa";
}
];
};

snpRefVals = snpRefValsWith { gpu = false; };
snpGpuRefVals = snpRefValsWith { gpu = true; };

tdxRefVals = {
tdx = [
(
let
launch-digests =
if kata.contrast-node-installer-image.debugRuntime then
kata.tdx-launch-digests.override { debug = true; }
else
kata.tdx-launch-digests;
launch-digests = kata.tdx-launch-digests.override {
debug = kata.contrast-node-installer-image.debugRuntime;
};
in
{
mrTd = builtins.readFile "${launch-digests}/mrtd.hex";
Expand All @@ -135,9 +144,9 @@ let
"${k3s-qemu-tdx-handler}" = tdxRefVals;
"${rke2-qemu-tdx-handler}" = tdxRefVals;
"${metal-qemu-snp-handler}" = snpRefVals;
"${metal-qemu-snp-gpu-handler}" = snpRefVals;
"${metal-qemu-snp-gpu-handler}" = snpGpuRefVals;
"${k3s-qemu-snp-handler}" = snpRefVals;
"${k3s-qemu-snp-gpu-handler}" = snpRefVals;
"${k3s-qemu-snp-gpu-handler}" = snpGpuRefVals;
}
);

Expand Down
Loading