Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Spot] Show logging from the controller and grace period for cluster status checking #1951

Merged
merged 12 commits into from
May 12, 2023
23 changes: 18 additions & 5 deletions sky/spot/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
from sky.utils import common_utils
from sky.utils import subprocess_utils

logger = sky_logging.init_logger(__name__)
# Use an explicit logger name so that, even when this module is executed
# directly, the logger stays under the `sky.spot.controller` namespace
# and inherits the setup from the `sky` logger.
logger = sky_logging.init_logger('sky.spot.controller')


def _get_task_and_name(task_yaml: str) -> Tuple['sky.Task', str]:
Expand All @@ -36,6 +39,7 @@ class SpotController:

def __init__(self, job_id: int, task_yaml: str,
retry_until_up: bool) -> None:

Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
self._job_id = job_id
self._task, self._task_name = _get_task_and_name(task_yaml)

Expand Down Expand Up @@ -126,6 +130,12 @@ def _run(self):
self._task.num_nodes == 1):
continue

if job_status in [
job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
]:
# Add a grace period before checking for preemption, to avoid a
# false alarm of job failure.
time.sleep(5)
# Pull the actual cluster status from the cloud provider to
# determine whether the cluster is preempted.
(cluster_status,
Expand Down Expand Up @@ -154,10 +164,13 @@ def _run(self):
logger.info(
'The user job failed. Please check the logs below.\n'
f'== Logs of the user job (ID: {self._job_id}) ==\n')
self._backend.tail_logs(handle,
None,
spot_job_id=self._job_id)
logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
# TODO(zhwu): Download the logs, and stream them from the
# local disk, instead of streaming them from the spot
# cluster, to make it faster and more reliable.
returncode = self._backend.tail_logs(
handle, None, spot_job_id=self._job_id)
logger.info(f'\n== End of logs (ID: {self._job_id}, '
f'tail_logs returncode: {returncode}) ==')
spot_status_to_set = spot_state.SpotStatus.FAILED
if job_status == job_lib.JobStatus.FAILED_SETUP:
spot_status_to_set = spot_state.SpotStatus.FAILED_SETUP
Expand Down