From 31dc1c83b92423d9400b4c4d065f5cf4b9a0e41a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 22 Apr 2024 19:29:33 +0000 Subject: [PATCH] Fix jobs longer than 12 days --- sky/backends/cloud_vm_ray_backend.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 38bebf1e602..834eb397f67 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -268,7 +268,13 @@ def get_or_fail(futures, pg) -> List[int]: \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\" returncodes = [1] * len(futures) # Wait for 1 task to be ready. - ready, unready = ray.wait(futures) + ready = [] + # Call ray.wait again if ready is empty. This is because ray.wait + # with timeout=None only waits for 10**6 seconds (~11.6 days), so + # for a task running longer than 12 days it returns before the + # task is ready. Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846 + while not ready: + ready, unready = ray.wait(futures) idx = futures.index(ready[0]) returncodes[idx] = ray.get(ready[0]) while unready: