Skip to content

Commit

Permalink
Make update job status more readable
Browse files Browse the repository at this point in the history
  • Loading branch information
Michaelvll committed Jun 6, 2023
1 parent 46b7c11 commit 505f9e3
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 10 deletions.
20 changes: 11 additions & 9 deletions sky/skylet/job_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def colored_str(self):
# to avoid a race condition with `ray job` and to make sure the job has been
# correctly updated.
# TODO(zhwu): This number should be tuned based on heuristics.
_PENDING_SUBMIT_TIMEOUT = 5
_PENDING_SUBMIT_GRACE_PERIOD = 60

_PRE_RESOURCE_STATUSES = [JobStatus.PENDING]

Expand Down Expand Up @@ -541,19 +541,21 @@ def update_job_status(job_owner: str,
ray_status = job_details[ray_job_id].status
job_statuses[i] = _RAY_TO_JOB_STATUS_MAP[ray_status]
if job_id in pending_jobs:
# Gives a 5-second timeout between a job being submitted from the
# pending queue and it appearing in ray jobs
if pending_jobs[job_id]['submit'] > 0 and pending_jobs[job_id][
'submit'] < time.time() - _PENDING_SUBMIT_TIMEOUT:
continue
if pending_jobs[job_id]['created_time'] < psutil.boot_time():
# The job is stale as it is created before the instance
# is booted, e.g. the instance is rebooted.
job_statuses[i] = JobStatus.FAILED
# Gives a 60-second grace period between a job being submitted from
# the pending table and it appearing in ray jobs.
if (pending_jobs[job_id]['submit'] > 0 and
pending_jobs[job_id]['submit'] <
time.time() - _PENDING_SUBMIT_GRACE_PERIOD):
# For jobs submitted outside of the grace period, we will
# consider the ray job status.
continue
else:
# Set the job status to PENDING even though the job can be
# in any later status, because the code will take the max
# of this status and the status in the jobs table.
# Reset the job status to PENDING even though it may not appear
# in the ray jobs, so that it will not be considered as stale.
job_statuses[i] = JobStatus.PENDING

assert len(job_statuses) == len(job_ids), (job_statuses, job_ids)
Expand Down
5 changes: 4 additions & 1 deletion sky/skylet/skylet.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
from sky import sky_logging
from sky.skylet import events

logger = sky_logging.init_logger(__name__)
# Use the explicit logger name so that the logger is under the
# `sky.skylet.skylet` namespace when executed directly, so as
# to inherit the setup from the `sky` logger.
logger = sky_logging.init_logger('sky.skylet.skylet')
logger.info('skylet started')

EVENTS = [
Expand Down

0 comments on commit 505f9e3

Please sign in to comment.