diff --git a/runner/README.md b/runner/README.md index 330c6822f..ad06c3e78 100644 --- a/runner/README.md +++ b/runner/README.md @@ -97,6 +97,7 @@ These are nonexhaustive lists of external dependencies (executables, libraries) * `mountpoint` * `lsblk` * `mkfs.ext4` +* (NVIDIA GPU SSH fleet instances only) `nvidia-smi` * ... Debian/Ubuntu packages: `mount` (`mount`, `umount`), `util-linux` (`mountpoint`, `lsblk`), `e2fsprogs` (`mkfs.ext4`) diff --git a/runner/internal/shim/gpu.go b/runner/internal/shim/gpu.go index d2047c321..bdd61d800 100644 --- a/runner/internal/shim/gpu.go +++ b/runner/internal/shim/gpu.go @@ -8,14 +8,12 @@ import ( "io" "log" "os" - "os/exec" "strconv" "strings" execute "github.com/alexellis/go-execute/v2" ) -const nvidiaSmiImage = "dstackai/base:py3.13-0.6-cuda-12.1" const amdSmiImage = "un1def/amd-smi:6.2.2-0" type GpuVendor string @@ -36,7 +34,7 @@ func GetGpuVendor() GpuVendor { if _, err := os.Stat("/dev/kfd"); !errors.Is(err, os.ErrNotExist) { return Amd } - if _, err := exec.LookPath("nvidia-smi"); err == nil { + if _, err := os.Stat("/dev/nvidiactl"); !errors.Is(err, os.ErrNotExist) { return Nvidia } return NoVendor @@ -56,14 +54,8 @@ func getNvidiaGpuInfo() []GpuInfo { gpus := []GpuInfo{} cmd := execute.ExecTask{ - Command: "docker", - Args: []string{ - "run", - "--rm", - "--gpus", "all", - nvidiaSmiImage, - "nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv,nounits", - }, + Command: "nvidia-smi", + Args: []string{"--query-gpu=gpu_name,memory.total", "--format=csv,nounits"}, StreamStdio: false, } res, err := cmd.Execute(context.Background())