Skip to content

Commit

Permalink
Merge pull request #333 from nebius/mv_gpubench
Browse files Browse the repository at this point in the history
Move gpubench to worker image and bind mount it
  • Loading branch information
asteny authored Jan 20, 2025
2 parents 5dbc164 + 03ea722 commit d5a56ab
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 29 deletions.
28 changes: 0 additions & 28 deletions images/jail/jail.dockerfile
Original file line number Diff line number Diff line change
@@ -1,28 +1,3 @@
# BASE_IMAGE defined here for second multistage build
ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

# First stage: Build the gpubench application.
# Only the compiled binary (/app/gpubench) is taken from this stage via COPY --from,
# so the Go toolchain never reaches the final image.
FROM golang:1.22 AS gpubench_builder

# Build-time knobs, overridable with --build-arg.
# GO_LDFLAGS is forwarded to `go build -ldflags` (for example "-s -w").
ARG GO_LDFLAGS=""
ARG CGO_ENABLED=0
ARG GOOS=linux
ARG GOARCH=amd64

WORKDIR /app

# Copy the module manifests on their own so the dependency download layer
# is reused from cache until go.mod / go.sum change.
COPY jail/gpubench/go.mod jail/gpubench/go.sum ./

RUN go mod download

COPY jail/gpubench/main.go .

# GO_LDFLAGS is not an environment variable the go tool reads, so it has to be
# passed explicitly through -ldflags. The expansions are quoted so that values
# containing spaces stay a single argument instead of breaking the command line.
RUN GOOS="$GOOS" GOARCH="$GOARCH" CGO_ENABLED="$CGO_ENABLED" \
    go build -ldflags "$GO_LDFLAGS" -o gpubench .

#######################################################################################################################
# Second stage: Build jail image

ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

FROM $BASE_IMAGE AS jail
Expand Down Expand Up @@ -188,9 +163,6 @@ RUN mv /usr/bin/docker /usr/bin/docker.real
COPY jail/scripts/docker.sh /usr/bin/docker
RUN chmod +x /usr/bin/docker

# Copy binary that performs GPU benchmark
COPY --from=gpubench_builder /app/gpubench /usr/bin/

# Create directory for pivoting host's root
RUN mkdir -m 555 /mnt/host

Expand Down
28 changes: 28 additions & 0 deletions images/worker/slurmd.dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,28 @@
# BASE_IMAGE defined here for second multistage build
ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

# First stage: Build the gpubench application.
# Only the compiled binary (/app/gpubench) is taken from this stage via COPY --from,
# so the Go toolchain never reaches the final image.
FROM golang:1.22 AS gpubench_builder

# Build-time knobs, overridable with --build-arg.
# GO_LDFLAGS is forwarded to `go build -ldflags` (for example "-s -w").
ARG GO_LDFLAGS=""
ARG CGO_ENABLED=0
ARG GOOS=linux
ARG GOARCH=amd64

WORKDIR /app

# Copy the module manifests on their own so the dependency download layer
# is reused from cache until go.mod / go.sum change.
COPY jail/gpubench/go.mod jail/gpubench/go.sum ./

RUN go mod download

COPY jail/gpubench/main.go .

# GO_LDFLAGS is not an environment variable the go tool reads, so it has to be
# passed explicitly through -ldflags. The expansions are quoted so that values
# containing spaces stay a single argument instead of breaking the command line.
RUN GOOS="$GOOS" GOARCH="$GOARCH" CGO_ENABLED="$CGO_ENABLED" \
    go build -ldflags "$GO_LDFLAGS" -o gpubench .

#######################################################################################################################
# Second stage: Build worker image

ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

FROM $BASE_IMAGE AS worker_slurmd
Expand Down Expand Up @@ -134,6 +159,9 @@ RUN rm -rf /etc/update-motd.d/*
# Expose the port used for accessing slurmd
EXPOSE 6818

# Copy binary that performs GPU benchmark
COPY --from=gpubench_builder /app/gpubench /usr/bin/

# Create dir and file for multilog hack
RUN mkdir -p /var/log/slurm/multilog && \
touch /var/log/slurm/multilog/current && \
Expand Down
6 changes: 5 additions & 1 deletion images/worker/supervisord_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ if [ -n "${CGROUP_V2}" ]; then

if [ -n "${CGROUP_PATH}" ]; then
echo "cgroup v2 detected, creating cgroup for ${CGROUP_PATH}"
mkdir -p /sys/fs/cgroup/${CGROUP_PATH}/../system.slice
mkdir -p /sys/fs/cgroup/"${CGROUP_PATH}"/../system.slice
# TODO: uncomment this line once 24.11 has been tested. It is OOMKillStep for taskPluginParam
# echo "1" > /sys/fs/cgroup/${CGROUP_PATH}/../system.slice/memory.oom.group
else
Expand All @@ -33,6 +33,10 @@ for file in /mnt/slurm-configs/*; do
touch "/etc/slurm/$filename" && mount --bind "$file" "/etc/slurm/$filename"
done

# Expose the gpubench binary shipped in this container inside the jail
# by bind-mounting it over the same path under /mnt/jail.
echo "Bind-mount gpubenchmark from container to jail"
# A bind mount of a regular file needs an existing target, so create an empty
# placeholder first.
touch /mnt/jail/usr/bin/gpubench
mount --bind /usr/bin/gpubench /mnt/jail/usr/bin/gpubench

echo "Make ulimits as big as possible"
set_ulimit() {
local limit_option=$1
Expand Down

0 comments on commit d5a56ab

Please sign in to comment.