diff --git a/sky/skylet/job_lib.py b/sky/skylet/job_lib.py index 06b8636f6103..90253fbf146f 100644 --- a/sky/skylet/job_lib.py +++ b/sky/skylet/job_lib.py @@ -140,7 +140,7 @@ def colored_str(self): # to avoid race condition with `ray job` to make sure it job has been # correctly updated. # TODO(zhwu): This number should be tuned based on heuristics. -_PENDING_SUBMIT_TIMEOUT = 5 +_PENDING_SUBMIT_TIMEOUT = 60 _PRE_RESOURCE_STATUSES = [JobStatus.PENDING] @@ -541,19 +541,16 @@ def update_job_status(job_owner: str, ray_status = job_details[ray_job_id].status job_statuses[i] = _RAY_TO_JOB_STATUS_MAP[ray_status] if job_id in pending_jobs: - # Gives a 5 second timeout between job being submit from the - # pending queue until appearing in ray jobs - if pending_jobs[job_id]['submit'] > 0 and pending_jobs[job_id][ - 'submit'] < time.time() - _PENDING_SUBMIT_TIMEOUT: - continue if pending_jobs[job_id]['created_time'] < psutil.boot_time(): # The job is stale as it is created before the instance # is booted, e.g. the instance is rebooted. job_statuses[i] = JobStatus.FAILED - else: - # Set the job status to PENDING even though the job can be - # in any later status, because the code will take the max - # of this status and the status in the jobs table. + elif (pending_jobs[job_id]['submit'] > + max(0, time.time() - _PENDING_SUBMIT_TIMEOUT)): + # Gives a 60 second timeout between a job being submitted from + # the pending queue and it appearing in ray jobs. + # Reset the job status to PENDING even though it may not appear + # in the ray jobs, so that it will not be considered as stale. 
job_statuses[i] = JobStatus.PENDING assert len(job_statuses) == len(job_ids), (job_statuses, job_ids) diff --git a/sky/skylet/skylet.py b/sky/skylet/skylet.py index d396bb84628b..6bbb51e7a376 100644 --- a/sky/skylet/skylet.py +++ b/sky/skylet/skylet.py @@ -5,7 +5,10 @@ from sky import sky_logging from sky.skylet import events -logger = sky_logging.init_logger(__name__) +# Use the explicit logger name so that the logger is under the +# `sky.skylet.skylet` namespace when executed directly, so as +# to inherit the setup from the `sky` logger. +logger = sky_logging.init_logger('sky.skylet.skylet') logger.info('skylet started') EVENTS = [