Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support bare-metal Kata GPU containers #1133

Merged
merged 6 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions nodeinstaller/internal/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer
config.Hypervisor["qemu"]["cold_plug_vfio"] = "root-port"
// GPU images tend to be larger, so give a better default timeout that
// allows for pulling those.
config.Agent["kata"]["dial_timeout"] = 600
config.Runtime["create_container_timeout"] = 600
}
default:
Expand Down
15 changes: 15 additions & 0 deletions overlays/nixpkgs.nix
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,19 @@ final: prev:
--set SOURCE_DATE_EPOCH 0
'';
});

# Fixes a dangling symlink in the libnvidia-container package that confuses
# the nvidia-container-toolkit.
# The override replaces postFixup: for each of the two libraries it removes
# any existing `$lib.so`, then links `$lib.so.1` to the versioned shared
# object and `$lib.so` to `$lib.so.1`.
# NOTE: the `prev` argument of the overrideAttrs lambda below shadows the
# overlay's `prev`; `prev.version` inside the lambda is read from the
# lambda's argument, not from the overlay's package set.
# TODO(msanft): Remove once https://github.com/NixOS/nixpkgs/pull/375291 is merged and
# pulled into Contrast.
libnvidia-container = prev.libnvidia-container.overrideAttrs (prev: {
postFixup = ''
# Recreate library symlinks which ldconfig would have created
for lib in libnvidia-container libnvidia-container-go; do
rm -f "$out/lib/$lib.so"
ln -s "$out/lib/$lib.so.${prev.version}" "$out/lib/$lib.so.1"
ln -s "$out/lib/$lib.so.1" "$out/lib/$lib.so"
done
'';
});
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Moritz Sanft <[email protected]>
Date: Fri, 3 Jan 2025 13:29:05 +0100
Subject: [PATCH] agent: remove CDI support

This reverts commits aa2e1a57bdd4b9e0d6c0810c05229d77278066c6
and 3995fe71f9b7b4304dc9cdfd842d296a633c15c3 while keeping the Cargo
lockfiles intact to allow for easier rebasing of this patch.
---
src/agent/src/device/mod.rs | 158 ------------------------------------
src/agent/src/rpc.rs | 13 +--
2 files changed, 1 insertion(+), 170 deletions(-)

diff --git a/src/agent/src/device/mod.rs b/src/agent/src/device/mod.rs
index 400b6f1386e1b4a1a4cda1e3e3da2f66640165c7..53e77d82c88912488ead9052f44e397345f8b8ad 100644
--- a/src/agent/src/device/mod.rs
+++ b/src/agent/src/device/mod.rs
@@ -11,9 +11,6 @@ use self::vfio_device_handler::{VfioApDeviceHandler, VfioPciDeviceHandler};
use crate::pci;
use crate::sandbox::Sandbox;
use anyhow::{anyhow, Context, Result};
-use cdi::annotations::parse_annotations;
-use cdi::cache::{new_cache, with_auto_refresh, CdiOption};
-use cdi::spec_dirs::with_spec_dirs;
use kata_types::device::DeviceHandlerManager;
use nix::sys::stat;
use oci::{LinuxDeviceCgroup, Spec};
@@ -28,8 +25,6 @@ use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::Mutex;
-use tokio::time;
-use tokio::time::Duration;
use tracing::instrument;

pub mod block_device_handler;
@@ -243,69 +238,6 @@ pub async fn add_devices(
update_spec_devices(logger, spec, dev_updates)
}

-#[instrument]
-pub async fn handle_cdi_devices(
- logger: &Logger,
- spec: &mut Spec,
- spec_dir: &str,
- cdi_timeout: u64,
-) -> Result<()> {
- if let Some(container_type) = spec
- .annotations()
- .as_ref()
- .and_then(|a| a.get("io.katacontainers.pkg.oci.container_type"))
- {
- if container_type == "pod_sandbox" {
- return Ok(());
- }
- }
-
- let (_, devices) = parse_annotations(spec.annotations().as_ref().unwrap())?;
-
- if devices.is_empty() {
- info!(logger, "no CDI annotations, no devices to inject");
- return Ok(());
- }
- // Explicitly set the cache options to disable auto-refresh and
- // to use the single spec dir "/var/run/cdi" for tests it can be overridden
- let options: Vec<CdiOption> = vec![with_auto_refresh(false), with_spec_dirs(&[spec_dir])];
- let cache: Arc<std::sync::Mutex<cdi::cache::Cache>> = new_cache(options);
-
- for _ in 0..=cdi_timeout {
- let inject_result = {
- // Lock cache within this scope, std::sync::Mutex has no Send
- // and await will not work with time::sleep
- let mut cache = cache.lock().unwrap();
- match cache.refresh() {
- Ok(_) => {}
- Err(e) => {
- return Err(anyhow!("error refreshing cache: {:?}", e));
- }
- }
- cache.inject_devices(Some(spec), devices.clone())
- };
-
- match inject_result {
- Ok(_) => {
- info!(
- logger,
- "all devices injected successfully, modified CDI container spec: {:?}", &spec
- );
- return Ok(());
- }
- Err(e) => {
- info!(logger, "error injecting devices: {:?}", e);
- println!("error injecting devices: {:?}", e);
- }
- }
- time::sleep(Duration::from_millis(1000)).await;
- }
- Err(anyhow!(
- "failed to inject devices after CDI timeout of {} seconds",
- cdi_timeout
- ))
-}
-
#[instrument]
async fn validate_device(
logger: &Logger,
@@ -1178,94 +1110,4 @@ mod tests {
assert!(name.is_ok(), "{}", name.unwrap_err());
assert_eq!(name.unwrap(), devname);
}
-
- #[tokio::test]
- async fn test_handle_cdi_devices() {
- let logger = slog::Logger::root(slog::Discard, o!());
- let mut spec = Spec::default();
-
- let mut annotations = HashMap::new();
- // cdi.k8s.io/vendor1_devices: vendor1.com/device=foo
- annotations.insert(
- "cdi.k8s.io/vfio17".to_string(),
- "kata.com/gpu=0".to_string(),
- );
- spec.set_annotations(Some(annotations));
-
- let temp_dir = tempdir().expect("Failed to create temporary directory");
- let cdi_file = temp_dir.path().join("kata.json");
-
- let cdi_version = "0.6.0";
- let kind = "kata.com/gpu";
- let device_name = "0";
- let annotation_whatever = "false";
- let annotation_whenever = "true";
- let inner_env = "TEST_INNER_ENV=TEST_INNER_ENV_VALUE";
- let outer_env = "TEST_OUTER_ENV=TEST_OUTER_ENV_VALUE";
- let inner_device = "/dev/zero";
- let outer_device = "/dev/null";
-
- let cdi_content = format!(
- r#"{{
- "cdiVersion": "{cdi_version}",
- "kind": "{kind}",
- "devices": [
- {{
- "name": "{device_name}",
- "annotations": {{
- "whatever": "{annotation_whatever}",
- "whenever": "{annotation_whenever}"
- }},
- "containerEdits": {{
- "env": [
- "{inner_env}"
- ],
- "deviceNodes": [
- {{
- "path": "{inner_device}"
- }}
- ]
- }}
- }}
- ],
- "containerEdits": {{
- "env": [
- "{outer_env}"
- ],
- "deviceNodes": [
- {{
- "path": "{outer_device}"
- }}
- ]
- }}
- }}"#
- );
-
- fs::write(&cdi_file, cdi_content).expect("Failed to write CDI file");
-
- let res =
- handle_cdi_devices(&logger, &mut spec, temp_dir.path().to_str().unwrap(), 0).await;
- println!("modfied spec {:?}", spec);
- assert!(res.is_ok(), "{}", res.err().unwrap());
-
- let linux = spec.linux().as_ref().unwrap();
- let devices = linux
- .resources()
- .as_ref()
- .unwrap()
- .devices()
- .as_ref()
- .unwrap();
- assert_eq!(devices.len(), 2);
-
- let env = spec.process().as_ref().unwrap().env().as_ref().unwrap();
-
- // find string TEST_OUTER_ENV in env
- let outer_env = env.iter().find(|e| e.starts_with("TEST_OUTER_ENV"));
- assert!(outer_env.is_some(), "TEST_OUTER_ENV not found in env");
-
- // find TEST_INNER_ENV in env
- let inner_env = env.iter().find(|e| e.starts_with("TEST_INNER_ENV"));
- assert!(inner_env.is_some(), "TEST_INNER_ENV not found in env");
- }
}
diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs
index 0a1c6d34adfffcbc3aef1b55a77556b8b82e85c0..b3888633744a718586069314a192c9c0fd92459e 100644
--- a/src/agent/src/rpc.rs
+++ b/src/agent/src/rpc.rs
@@ -58,7 +58,7 @@ use rustjail::process::ProcessOperations;
use crate::cdh;
use crate::device::block_device_handler::get_virtio_blk_pci_device_name;
use crate::device::network_device_handler::wait_for_net_interface;
-use crate::device::{add_devices, handle_cdi_devices, update_env_pci};
+use crate::device::{add_devices, update_env_pci};
use crate::features::get_build_features;
use crate::image::KATA_IMAGE_WORK_DIR;
use crate::linux_abi::*;
@@ -130,8 +130,6 @@ const ERR_NO_SANDBOX_PIDNS: &str = "Sandbox does not have sandbox_pidns";
// not available.
const IPTABLES_RESTORE_WAIT_SEC: u64 = 5;

-const CDI_TIMEOUT_LIMIT: u64 = 100;
-
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger()
@@ -226,15 +224,6 @@ impl AgentService {
// cannot predict everything from the caller.
add_devices(&sl(), &req.devices, &mut oci, &self.sandbox).await?;

- // In guest-kernel mode some devices need extra handling. Taking the
- // GPU as an example the shim will inject CDI annotations that will
- // be used by the kata-agent to do containerEdits according to the
- // CDI spec coming from a registry that is created on the fly by UDEV
- // or other entities for a specifc device.
- // In Kata we only consider the directory "/var/run/cdi", "/etc" may be
- // readonly
- handle_cdi_devices(&sl(), &mut oci, "/var/run/cdi", CDI_TIMEOUT_LIMIT).await?;
-
cdh_handler(&mut oci).await?;

// Both rootfs and volumes (invoked with --volume for instance) will
6 changes: 6 additions & 0 deletions packages/by-name/kata/kata-runtime/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ buildGoModule rec {
# See: https://github.com/kata-containers/kata-containers/pull/10719
# TODO(msanft): Remove once upstream PR is released.
./0018-runtime-use-actual-booleans-for-QMP-device_add-boole.patch

# Revert CDI support in kata-agent, as it breaks the legacy-mode GPU facilitation
# that we currently use.
# TODO(msanft): Get native CDI working, which will allow us to drop this patch / undo the revert.
# See https://dev.azure.com/Edgeless/Edgeless/_workitems/edit/5061
./0019-agent-remove-CDI-support.patch
];
};

Expand Down
54 changes: 52 additions & 2 deletions packages/nixos/gpu.nix
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,34 @@ let
});
};
});

# nix-store-mount-hook mounts the VM's nix store into the container.
# It reads the container state JSON from stdin, takes its `.bundle` path, and
# overlay-mounts /nix/store onto `<bundle>/rootfs/nix/store`. The upper and
# work directories are kept per container under
# /run/kata-containers/nix-overlays/<id>, where <id> is the basename of the
# bundle path.
# TODO(burgerdev): only do that for containers that actually get a GPU device.
nix-store-mount-hook = pkgs.writeShellApplication {
name = "nix-store-mount-hook";
# Packages put on the script's PATH.
# NOTE(review): presumably coreutils is for basename/mkdir, util-linux for
# mount, and jq for parsing the state JSON — confirm.
runtimeInputs = with pkgs; [
coreutils
util-linux
jq
];
# NOTE: the overlay's lowerdir list is "/nix/store:<target>", so both the VM's
# store and whatever already exists at the target path inside the rootfs are
# used as lower layers; writes go to the per-container upperdir.
text = ''
# Reads from the state JSON supplied on stdin.
bundle="$(jq -r .bundle)"
rootfs="$bundle/rootfs"
id="$(basename "$bundle")"

lower=/nix/store
target="$rootfs$lower"
mkdir -p "$target"

overlays="/run/kata-containers/nix-overlays/$id"
upperdir="$overlays/upperdir"
workdir="$overlays/workdir"
mkdir -p "$upperdir" "$workdir"

mount -t overlay -o "lowerdir=$lower:$target,upperdir=$upperdir,workdir=$workdir" none "$target"
'';
};
in

{
Expand All @@ -90,15 +118,37 @@ in
videoAcceleration = false;
};

# WARNING: Kata sets systemd's default target to `kata-containers.target`. Thus, some upstream services may not work out-of-the-box,
# as they are `WantedBy=multi-user.target` or similar. In such cases, the service needs to be adjusted to be `WantedBy=kata-containers.target`
# instead.

# Configure nvidia-persistenced for use with CC GPUs (e.g. H100).
# TODO(msanft): This needs to be adjusted for non-CC-GPUs.
# See: https://docs.nvidia.com/cc-deployment-guide-snp.pdf (Page 23 & 24)
systemd.services."nvidia-persistenced" = {
wantedBy = [ "kata-containers.target" ];
serviceConfig.ExecStart = lib.mkForce "${lib.getExe config.hardware.nvidia.package.persistenced} --uvm-persistence-mode --verbose";
};

# kata-containers.target needs to pull this in so that we get a valid
# CDI configuration inside the PodVM. This is not strictly necessary at the
# moment, as we currently use the legacy mode, but it will be required once we
# switch to CDI.
systemd.services."nvidia-container-toolkit-cdi-generator".wantedBy = [ "kata-containers.target" ];
msanft marked this conversation as resolved.
Show resolved Hide resolved

hardware.nvidia-container-toolkit.enable = true;

# Make NVIDIA the "default" graphics driver to replace Mesa,
# which saves us another Perl dependency.
hardware.graphics.package = nvidiaPackage;
hardware.graphics.package32 = nvidiaPackage;

image.repart.partitions."10-root".contents."/usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh".source =
lib.getExe pkgs.nvidia-ctk-oci-hook;
image.repart.partitions."10-root".contents = {
"/usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh".source =
lib.getExe pkgs.nvidia-ctk-oci-hook;
"/usr/share/oci/hooks/prestart/nix-store-mount-hook.sh".source = lib.getExe nix-store-mount-hook;
};

environment.systemPackages = [ pkgs.nvidia-ctk-with-config ];

boot.initrd.kernelModules = [
# Extra kernel modules required to talk to the GPU in CC-Mode.
Expand Down