[Core][BugFix] Fix GPU detach when using docker container as runtime env #3436

Merged
12 commits merged on Apr 23, 2024
5 changes: 4 additions & 1 deletion in examples/job_queue/job_docker.yaml
@@ -8,6 +8,9 @@

name: job_docker

envs:
TIME_TO_SLEEP: 180

resources:
accelerators: T4:0.5
image_id: docker:ubuntu:20.04
@@ -18,7 +21,7 @@ setup: |
run: |
timestamp=$(date +%s)
conda env list
-for i in {1..180}; do
+for i in $(seq 1 $TIME_TO_SLEEP); do
echo "$timestamp $i"
sleep 1
done
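
The job's sleep duration is now driven by the TIME_TO_SLEEP environment variable (default 180) instead of a hard-coded loop bound, so the smoke test below can lengthen the job on slower clouds by passing --env TIME_TO_SLEEP=... at exec time. A minimal sketch of the assumed override semantics (illustration only, not SkyPilot's actual code):

    # Assumption: a value passed via `sky exec --env` overrides the YAML default.
    yaml_envs = {'TIME_TO_SLEEP': '180'}        # default from job_docker.yaml
    cli_envs = {'TIME_TO_SLEEP': '300'}         # e.g. `--env TIME_TO_SLEEP=300`
    effective_envs = {**yaml_envs, **cli_envs}  # the per-exec value wins
    assert effective_envs['TIME_TO_SLEEP'] == '300'
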
13 changes: 11 additions & 2 deletions in sky/provision/docker_utils.py
@@ -218,14 +218,23 @@ def initialize(self) -> str:
f'{specific_image}')
container_running = self._check_container_status()
if container_running:
-running_image = (self._run(
-check_docker_image(self.container_name, self.docker_cmd)))
+running_image = self._run(
+check_docker_image(self.container_name, self.docker_cmd))
if running_image != specific_image:
logger.error(
f'A container with name {self.container_name} is running '
f'image {running_image} instead of {specific_image} (which '
'was provided in the YAML)')
else:
# Edit docker config first to avoid disconnecting the container
# from GPUs when a systemctl command is called. This is a known
# issue with nvidia container toolkit:
# https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
self._run(
'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
'/etc/docker/daemon.json > /tmp/daemon.json;'
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
'sudo systemctl restart docker')
user_docker_run_options = self.docker_config.get('run_options', [])
start_command = docker_start_cmds(
specific_image,
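
The new else-branch rewrites /etc/docker/daemon.json to switch Docker to the cgroupfs cgroup driver before the container is started; with the default systemd driver, a later systemctl call can disconnect GPUs from a running container, the known nvidia-container-toolkit issue linked in the comment. A rough Python equivalent of the jq one-liner, given only to illustrate the config change (the actual fix is the shell command above):

    import json
    import subprocess

    DAEMON_JSON = '/etc/docker/daemon.json'

    def force_cgroupfs_driver() -> None:
        # Read the existing daemon config, or start from an empty one.
        try:
            with open(DAEMON_JSON) as f:
                config = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            config = {}
        # Same effect as: jq '.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]'
        config['exec-opts'] = ['native.cgroupdriver=cgroupfs']
        # Write to /tmp first, then move into place with sudo and restart the
        # daemon, mirroring the `> /tmp/daemon.json; sudo mv ...;
        # sudo systemctl restart docker` pattern in the shell command.
        with open('/tmp/daemon.json', 'w') as f:
            json.dump(config, f)
        subprocess.run(['sudo', 'mv', '/tmp/daemon.json', DAEMON_JSON], check=True)
        subprocess.run(['sudo', 'systemctl', 'restart', 'docker'], check=True)
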
11 changes: 11 additions & 0 deletions in sky/skylet/providers/command_runner.py
@@ -229,6 +229,17 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
f'The `image_env` is:\n{image_env}')
raise e

# Edit docker config first to avoid disconnecting the container
# from GPUs when a systemctl command is called. This is a known
# issue with nvidia container toolkit:
# https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
self.run(
'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
'/etc/docker/daemon.json > /tmp/daemon.json;'
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
'sudo systemctl restart docker',
run_env='host')

user_docker_run_options = self.docker_config.get(
'run_options', []) + self.docker_config.get(
f'{"head" if as_head else "worker"}_run_options', [])
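
command_runner.py applies the same daemon.json edit, but on the host (run_env='host') and before the user's docker run options are assembled. A hedged sanity-check sketch, not part of this PR, for confirming that the driver switched and the GPU stays attached (container_name is a placeholder for whatever container name the cluster uses):

    import subprocess

    def check_gpu_still_attached(container_name: str) -> None:
        # The Docker daemon should now report the cgroupfs driver.
        driver = subprocess.run(
            ['docker', 'info', '--format', '{{.CgroupDriver}}'],
            capture_output=True, text=True, check=True).stdout.strip()
        assert driver == 'cgroupfs', f'unexpected cgroup driver: {driver}'
        # nvidia-smi inside the container should still list the GPU(s); the
        # smoke test below greps its output for "Tesla T4".
        subprocess.run(['docker', 'exec', container_name, 'nvidia-smi'], check=True)
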
32 changes: 22 additions & 10 deletions in tests/test_smoke.py
@@ -1012,8 +1012,9 @@ def test_kubernetes_storage_mounts():
[
"docker:nvidia/cuda:11.8.0-devel-ubuntu18.04",
"docker:ubuntu:18.04",
-# Test image with python 3.11 installed by default.
-"docker:continuumio/miniconda3",
+# Test latest image with python 3.11 installed by default.
+# Does not work for python 3.12 due to ray's requirement for 3.11.
+'docker:continuumio/miniconda3:24.1.2-0',
])
def test_docker_storage_mounts(generic_cloud: str, image_id: str):
# Tests bucket mounting on docker container
@@ -1193,25 +1194,31 @@ def test_job_queue(generic_cloud: str):
[
"docker:nvidia/cuda:11.8.0-devel-ubuntu18.04",
"docker:ubuntu:18.04",
-# Test image with python 3.11 installed by default.
-"docker:continuumio/miniconda3",
+# Test latest image with python 3.11 installed by default.
+# Does not work for python 3.12 due to ray's requirement for 3.11.
+'docker:continuumio/miniconda3:24.1.2-0',
])
def test_job_queue_with_docker(generic_cloud: str, image_id: str):
name = _get_cluster_name() + image_id[len('docker:'):][:4]
total_timeout_minutes = 40 if generic_cloud == 'azure' else 15
time_to_sleep = 300 if generic_cloud == 'azure' else 180
test = Test(
'job_queue_with_docker',
[
f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml',
-f'sky exec {name} -n {name}-1 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
-f'sky exec {name} -n {name}-2 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
-f'sky exec {name} -n {name}-3 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
+f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
+f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
+f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING',
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING',
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING',
f'sky cancel -y {name} 2',
'sleep 5',
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING',
f'sky cancel -y {name} 3',
# Make sure the GPU is still visible to the container.
f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
f'sky logs {name} 4 --status',
f'sky stop -y {name}',
# Make sure the job status is preserved after stopping and starting the
# cluster. This is also a test for the docker container to be
@@ -1222,10 +1229,14 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str):
f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED',
f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
f'sky logs {name} 4 --status',
f'sky logs {name} 5 --status',
f'sky logs {name} 6 --status',
# Make sure the GPU is still visible after a stop & start cycle.
f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
f'sky logs {name} 7 --status'
],
f'sky down -y {name}',
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

@@ -2812,8 +2823,9 @@ def test_aws_custom_image():
[
"docker:nvidia/cuda:11.8.0-devel-ubuntu18.04",
"docker:ubuntu:18.04",
-# Test image with python 3.11 installed by default.
-"docker:continuumio/miniconda3",
+# Test latest image with python 3.11 installed by default.
+# Does not work for python 3.12 due to ray's requirement for 3.11.
+'docker:continuumio/miniconda3:24.1.2-0',
])
def test_kubernetes_custom_image(image_id):
"""Test Kubernetes custom image"""