skypilot-org · Michaelvll · May 12, 2023 · May 11, 2023 · May 11, 2023 · May 11, 2023
diff --git a/sky/spot/controller.py b/sky/spot/controller.py
@@ -22,7 +22,10 @@
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 
-logger = sky_logging.init_logger(__name__)
+# Use the explicit logger name so that the logger is under the
+# `sky.spot.controller` namespace when executed directly, so as
+# to inherit the setup from the `sky` logger.
+logger = sky_logging.init_logger('sky.spot.controller')
 
 
 def _get_task_and_name(task_yaml: str) -> Tuple['sky.Task', str]:
@@ -36,6 +39,7 @@ class SpotController:
 
     def __init__(self, job_id: int, task_yaml: str,
                  retry_until_up: bool) -> None:
+
         self._job_id = job_id
         self._task, self._task_name = _get_task_and_name(task_yaml)
 
@@ -126,6 +130,12 @@ def _run(self):
                     self._task.num_nodes == 1):
                 continue
 
+            if job_status in [
+                    job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
+            ]:
+                # Add a grace period before checking for preemption, to avoid
+                # a false alarm of job failure.
+                time.sleep(5)
             # Pull the actual cluster status from the cloud provider to
             # determine whether the cluster is preempted.
             (cluster_status,
@@ -154,10 +164,13 @@ def _run(self):
                     logger.info(
                         'The user job failed. Please check the logs below.\n'
                         f'== Logs of the user job (ID: {self._job_id}) ==\n')
-                    self._backend.tail_logs(handle,
-                                            None,
-                                            spot_job_id=self._job_id)
-                    logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
+                    # TODO(zhwu): Download the logs, and stream them from the
+                    # local disk, instead of streaming them from the spot
+                    # cluster, to make it faster and more reliable.
+                    returncode = self._backend.tail_logs(
+                        handle, None, spot_job_id=self._job_id)
+                    logger.info(f'\n== End of logs (ID: {self._job_id}, '
+                                f'tail_logs returncode: {returncode}) ==')
                     spot_status_to_set = spot_state.SpotStatus.FAILED
                     if job_status == job_lib.JobStatus.FAILED_SETUP:
                         spot_status_to_set = spot_state.SpotStatus.FAILED_SETUP