[Core][BugFix] Fix GPU detach when using docker container as runtime env #3436

Merged
12 commits merged on Apr 23, 2024
5 changes: 4 additions & 1 deletion in examples/job_queue/job_docker.yaml
@@ -8,6 +8,9 @@

name: job_docker

envs:
TIME_TO_SLEEP: 180

resources:
accelerators: T4:0.5
image_id: docker:ubuntu:20.04
@@ -18,7 +21,7 @@ setup: |
run: |
timestamp=$(date +%s)
conda env list
-for i in {1..180}; do
+for i in $(seq 1 $TIME_TO_SLEEP); do
echo "$timestamp $i"
sleep 1
done
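
The job's sleep duration is now driven by the TIME_TO_SLEEP environment variable (default 180) instead of a hard-coded loop bound, so the smoke test below can lengthen the job on slower clouds by passing --env TIME_TO_SLEEP=... at exec time. A minimal sketch of the assumed override semantics (illustration only, not SkyPilot's actual code):

    # Assumption: a value passed via `sky exec --env` overrides the YAML default.
    yaml_envs = {'TIME_TO_SLEEP': '180'}        # default from job_docker.yaml
    cli_envs = {'TIME_TO_SLEEP': '300'}         # e.g. `--env TIME_TO_SLEEP=300`
    effective_envs = {**yaml_envs, **cli_envs}  # the per-exec value wins
    assert effective_envs['TIME_TO_SLEEP'] == '300'
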
13 changes: 11 additions & 2 deletions in sky/provision/docker_utils.py
@@ -218,14 +218,23 @@ def initialize(self) -> str:
f'{specific_image}')
container_running = self._check_container_status()
if container_running:
-running_image = (self._run(
-check_docker_image(self.container_name, self.docker_cmd)))
+running_image = self._run(
+check_docker_image(self.container_name, self.docker_cmd))
if running_image != specific_image:
logger.error(
f'A container with name {self.container_name} is running '
f'image {running_image} instead of {specific_image} (which '
'was provided in the YAML)')
else:
# Edit docker config first to avoid disconnecting the container
# from GPUs when a systemctl command is called. This is a known
# issue with nvidia container toolkit:
# https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
self._run(
'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
'/etc/docker/daemon.json > /tmp/daemon.json;'
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
'sudo systemctl restart docker')
user_docker_run_options = self.docker_config.get('run_options', [])
start_command = docker_start_cmds(
specific_image,
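
The new else-branch rewrites /etc/docker/daemon.json to switch Docker to the cgroupfs cgroup driver before the container is started; with the default systemd driver, a later systemctl call can disconnect GPUs from a running container, the known nvidia-container-toolkit issue linked in the comment. A rough Python equivalent of the jq one-liner, given only to illustrate the config change (the actual fix is the shell command above):

    import json
    import subprocess

    DAEMON_JSON = '/etc/docker/daemon.json'

    def force_cgroupfs_driver() -> None:
        # Read the existing daemon config, or start from an empty one.
        try:
            with open(DAEMON_JSON) as f:
                config = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            config = {}
        # Same effect as: jq '.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]'
        config['exec-opts'] = ['native.cgroupdriver=cgroupfs']
        # Write to /tmp first, then move into place with sudo and restart the
        # daemon, mirroring the `> /tmp/daemon.json; sudo mv ...;
        # sudo systemctl restart docker` pattern in the shell command.
        with open('/tmp/daemon.json', 'w') as f:
            json.dump(config, f)
        subprocess.run(['sudo', 'mv', '/tmp/daemon.json', DAEMON_JSON], check=True)
        subprocess.run(['sudo', 'systemctl', 'restart', 'docker'], check=True)
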
11 changes: 11 additions & 0 deletions in sky/skylet/providers/command_runner.py
@@ -229,6 +229,17 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
f'The `image_env` is:\n{image_env}')
raise e

# Edit docker config first to avoid disconnecting the container
# from GPUs when a systemctl command is called. This is a known
# issue with nvidia container toolkit:
# https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
self.run(
'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
'/etc/docker/daemon.json > /tmp/daemon.json;'
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
'sudo systemctl restart docker',
run_env='host')

user_docker_run_options = self.docker_config.get(
'run_options', []) + self.docker_config.get(
f'{"head" if as_head else "worker"}_run_options', [])
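
command_runner.py applies the same daemon.json edit, but on the host (run_env='host') and before the user's docker run options are assembled. A hedged sanity-check sketch, not part of this PR, for confirming that the driver switched and the GPU stays attached (container_name is a placeholder for whatever container name the cluster uses):

    import subprocess

    def check_gpu_still_attached(container_name: str) -> None:
        # The Docker daemon should now report the cgroupfs driver.
        driver = subprocess.run(
            ['docker', 'info', '--format', '{{.CgroupDriver}}'],
            capture_output=True, text=True, check=True).stdout.strip()
        assert driver == 'cgroupfs', f'unexpected cgroup driver: {driver}'
        # nvidia-smi inside the container should still list the GPU(s); the
        # smoke test below greps its output for "Tesla T4".
        subprocess.run(['docker', 'exec', container_name, 'nvidia-smi'], check=True)
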
32 changes: 22 additions & 10 deletions in tests/test_smoke.py
@@ -1012,8 +1012,9 @@ def test_kubernetes_storage_mounts():
[
"docker:nvidia/cuda:11.8.0-devel-ubuntu18.04",
"docker:ubuntu:18.04",
-# Test image with python 3.11 installed by default.
-"docker:continuumio/miniconda3",
+# Test latest image with python 3.11 installed by default.
+# Does not work for python 3.12 due to ray's requirement for 3.11.
+'docker:continuumio/miniconda3:24.1.2-0',
])
def test_docker_storage_mounts(generic_cloud: str, image_id: str):
# Tests bucket mounting on docker container
@@ -1193,25 +1194,31 @@ def test_job_queue(generic_cloud: str):
[
"docker:nvidia/cuda:11.8.0-devel-ubuntu18.04",
"docker:ubuntu:18.04",
-# Test image with python 3.11 installed by default.
-"docker:continuumio/miniconda3",
+# Test latest image with python 3.11 installed by default.
+# Does not work for python 3.12 due to ray's requirement for 3.11.
+'docker:continuumio/miniconda3:24.1.2-0',
])
def test_job_queue_with_docker(generic_cloud: str, image_id: str):
name = _get_cluster_name() + image_id[len('docker:'):][:4]
total_timeout_minutes = 40 if generic_cloud == 'azure' else 15
time_to_sleep = 300 if generic_cloud == 'azure' else 180
test = Test(
'job_queue_with_docker',
[
f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml',
-f'sky exec {name} -n {name}-1 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
-f'sky exec {name} -n {name}-2 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
-f'sky exec {name} -n {name}-3 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
+f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
+f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
+f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING',
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING',
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING',
f'sky cancel -y {name} 2',
'sleep 5',
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING',
f'sky cancel -y {name} 3',
# Make sure the GPU is still visible to the container.
f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
f'sky logs {name} 4 --status',
f'sky stop -y {name}',
# Make sure the job status is preserved after stopping and starting the
# cluster. This is also a test for the docker container to be
@@ -1222,10 +1229,14 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str):
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED',
f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
f'sky logs {name} 4 --status',
f'sky logs {name} 5 --status',
f'sky logs {name} 6 --status',
# Make sure the GPU is still visible after a stop & start cycle.
f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
f'sky logs {name} 7 --status'
],
f'sky down -y {name}',
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

@@ -2812,8 +2823,9 @@ def test_aws_custom_image():
[
"docker:nvidia/cuda:11.8.0-devel-ubuntu18.04",
"docker:ubuntu:18.04",
-# Test image with python 3.11 installed by default.
-"docker:continuumio/miniconda3",
+# Test latest image with python 3.11 installed by default.
+# Does not work for python 3.12 due to ray's requirement for 3.11.
+'docker:continuumio/miniconda3:24.1.2-0',
])
def test_kubernetes_custom_image(image_id):
"""Test Kubernetes custom image"""