diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py
index a274c9d9b5a..2073a2f453f 100644
--- a/sky/backends/cloud_vm_ray_backend.py
+++ b/sky/backends/cloud_vm_ray_backend.py
@@ -119,6 +119,9 @@
     pathlib.Path(sky.__file__).resolve().parent / 'backends' /
     'monkey_patches' / 'monkey_patch_ray_up.py')
 
+# Restart skylet when the version does not match to keep the skylet up-to-date.
+_MAYBE_SKYLET_RESTART_CMD = 'python3 -m sky.skylet.attempt_skylet'
+
 
 def _get_cluster_config_template(cloud):
     cloud_to_template = {
@@ -198,13 +201,11 @@ def __init__(self):
     def add_prologue(self,
                      job_id: int,
                      spot_task: Optional['task_lib.Task'] = None,
-                     setup_cmd: Optional[str] = None,
-                     envs: Optional[Dict[str, str]] = None,
-                     setup_log_path: Optional[str] = None,
                      is_local: bool = False) -> None:
        assert not self._has_prologue, 'add_prologue() called twice?'
        self._has_prologue = True
        self.job_id = job_id
+       self.is_local = is_local
        # Should use 'auto' or 'ray://:10001' rather than
        # 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
        # Otherwise, ray will fail to get the placement group because of a bug
@@ -260,7 +261,6 @@ def add_prologue(self,
            inspect.getsource(log_lib.add_ray_env_vars),
            inspect.getsource(log_lib.run_bash_command_with_log),
            'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
-           f'setup_cmd = {setup_cmd!r}',
        ]
        # Currently, the codegen program is/can only be submitted to the head
        # node, due to using job_lib for updating job statuses, and using
@@ -272,46 +272,6 @@ def add_prologue(self,
                if hasattr(autostop_lib, 'set_last_active_time_to_now'):
                    autostop_lib.set_last_active_time_to_now()
            """))
-       if setup_cmd is not None:
-           self._code += [
-               textwrap.dedent(f"""\
-               _SETUP_CPUS = 0.0001
-               # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
-               # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
-               # We unset it so that user setup command may properly use this env var.
-               setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
-               job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
-               print({_CTRL_C_TIP_MESSAGE!r}, file=sys.stderr, flush=True)
-               total_num_nodes = len(ray.nodes())
-               setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
-               setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
-               ray.get(setup_pg.ready())
-               setup_workers = [run_bash_command_with_log \\
-                   .options(name='setup', num_cpus=_SETUP_CPUS, placement_group=setup_pg, placement_group_bundle_index=i) \\
-                   .remote(
-                       setup_cmd,
-                       os.path.expanduser({setup_log_path!r}),
-                       getpass.getuser(),
-                       job_id={self.job_id},
-                       env_vars={envs!r},
-                       stream_logs=True,
-                       with_ray=True,
-                       use_sudo={is_local},
-                   ) for i in range(total_num_nodes)]
-               setup_returncodes = ray.get(setup_workers)
-               if sum(setup_returncodes) != 0:
-                   job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
-                   # This waits for all streaming logs to finish.
-                   time.sleep(1)
-                   print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
-                         'return code list:{colorama.Style.RESET_ALL}',
-                         setup_returncodes,
-                         file=sys.stderr,
-                         flush=True)
-                   # Need this to set the job status in ray job to be FAILED.
-                   sys.exit(1)
-               """)
-           ]
        self._code += [
            f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
        ]
@@ -324,11 +284,14 @@ def add_prologue(self,
            f'{job_id}, {spot_task.name!r}, {resources_str!r})',
        ]
 
-   def add_gang_scheduling_placement_group(
+   def add_gang_scheduling_placement_group_and_setup(
        self,
        num_nodes: int,
        accelerator_dict: Optional[Dict[str, float]],
        stable_cluster_internal_ips: List[str],
+       setup_cmd: Optional[str] = None,
+       setup_log_path: Optional[str] = None,
+       envs: Optional[Dict[str, str]] = None,
    ) -> None:
        """Create the gang scheduling placement group for a Task.
@@ -336,8 +299,9 @@ def add_gang_scheduling_placement_group(
        variable is assigned in a deterministic order whenever a new task is
        added.
        """
-       assert self._has_prologue, ('Call add_prologue() before '
-                                   'add_gang_scheduling_placement_group().')
+       assert self._has_prologue, (
+           'Call add_prologue() before '
+           'add_gang_scheduling_placement_group_and_setup().')
        self._has_gang_scheduling = True
        self._num_nodes = num_nodes
@@ -370,7 +334,7 @@ def add_gang_scheduling_placement_group(
                plural = 's' if {num_nodes} > 1 else ''
                node_str = f'{num_nodes} node{{plural}}'
-               message = '' if setup_cmd is not None else {_CTRL_C_TIP_MESSAGE!r} + '\\n'
+               message = {_CTRL_C_TIP_MESSAGE!r} + '\\n'
                message += f'INFO: Waiting for task resources on {{node_str}}. This will block if the cluster is full.'
                print(message,
                      file=sys.stderr,
@@ -382,10 +346,62 @@ def add_gang_scheduling_placement_group(
                print('INFO: All task resources reserved.',
                      file=sys.stderr,
                      flush=True)
-               job_lib.set_job_started({self.job_id!r})
                """)
        ]
 
+       job_id = self.job_id
+       if setup_cmd is not None:
+           self._code += [
+               textwrap.dedent(f"""\
+               setup_cmd = {setup_cmd!r}
+               _SETUP_CPUS = 0.0001
+               # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
+               # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
+               # We unset it so that user setup command may properly use this env var.
+               setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
+               job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
+
+               # The schedule_step should be called after the job status is set to non-PENDING,
+               # otherwise, the scheduler will think the current job is not submitted yet, and
+               # skip the scheduling step.
+               job_lib.scheduler.schedule_step()
+
+               total_num_nodes = len(ray.nodes())
+               setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
+               setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
+               setup_workers = [run_bash_command_with_log \\
+                   .options(name='setup', num_cpus=_SETUP_CPUS, placement_group=setup_pg, placement_group_bundle_index=i) \\
+                   .remote(
+                       setup_cmd,
+                       os.path.expanduser({setup_log_path!r}),
+                       getpass.getuser(),
+                       job_id={self.job_id},
+                       env_vars={envs!r},
+                       stream_logs=True,
+                       with_ray=True,
+                       use_sudo={self.is_local},
+                   ) for i in range(total_num_nodes)]
+               setup_returncodes = ray.get(setup_workers)
+               if sum(setup_returncodes) != 0:
+                   job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+                   # This waits for all streaming logs to finish.
+                   time.sleep(1)
+                   print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
+                         'return code list:{colorama.Style.RESET_ALL}',
+                         setup_returncodes,
+                         file=sys.stderr,
+                         flush=True)
+                   # Need this to set the job status in ray job to be FAILED.
+                   sys.exit(1)
+               """)
+           ]
+
+       self._code.append(f'job_lib.set_job_started({self.job_id!r})')
+       if setup_cmd is None:
+           # Need to call schedule_step() to make sure the scheduler
+           # schedules the next pending job.
+           self._code.append('job_lib.scheduler.schedule_step()')
+
        # Export IP and node rank to the environment variables.
        self._code += [
            textwrap.dedent(f"""\
@@ -414,7 +430,7 @@ def register_run_fn(self, run_fn: str, run_fn_name: str) -> None:
            run_fn: The run function to be run on the remote cluster.
        """
        assert self._has_gang_scheduling, (
-           'Call add_gang_scheduling_placement_group() '
+           'Call add_gang_scheduling_placement_group_and_setup() '
            'before register_run_fn().')
        assert not self._has_register_run_fn, (
            'register_run_fn() called twice?')
@@ -436,7 +452,8 @@ def add_ray_task(self,
                     use_sudo: bool = False) -> None:
        """Generates code for a ray remote task that runs a bash command."""
        assert self._has_gang_scheduling, (
-           'Call add_gang_scheduling_placement_group() before add_ray_task().')
+           'Call add_gang_scheduling_placement_group_and_setup() before '
+           'add_ray_task().')
        assert (not self._has_register_run_fn or
                bash_script is None), ('bash_script should '
                                       'be None when run_fn is registered.')
@@ -549,7 +566,8 @@ def add_epilogue(self) -> None:
                if sum(returncodes) != 0:
                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
                    # This waits for all streaming logs to finish.
-                   time.sleep(1)
+                   job_lib.scheduler.schedule_step()
+                   time.sleep(0.5)
                    print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
                          'return code list:{colorama.Style.RESET_ALL}',
                          returncodes,
@@ -562,7 +580,8 @@ def add_epilogue(self) -> None:
                    sys.stderr.flush()
                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
                    # This waits for all streaming logs to finish.
-                   time.sleep(1)
+                   job_lib.scheduler.schedule_step()
+                   time.sleep(0.5)
                """)
        ]
@@ -2495,6 +2514,14 @@ def _update_after_cluster_provisioned(
            usage_lib.messages.usage.update_final_cluster_status(
                global_user_state.ClusterStatus.UP)
 
+       # For backward compatibility and robustness of skylet, it is restarted.
+       with log_utils.safe_rich_status('Updating remote skylet'):
+           self.run_on_head(
+               handle,
+               _MAYBE_SKYLET_RESTART_CMD,
+               use_cached_head_ip=False,
+           )
+
        # Update job queue to avoid stale jobs (when restarted), before
        # setting the cluster to be ready.
        if prev_cluster_status == global_user_state.ClusterStatus.INIT:
@@ -2755,15 +2782,30 @@ def _exec_code_on_head(
        else:
            job_submit_cmd = (
                'RAY_DASHBOARD_PORT=$(python -c "from sky.skylet import job_lib; print(job_lib.get_job_submission_port())" 2> /dev/null || echo 8265);'  # pylint: disable=line-too-long
-               f'{cd} && mkdir -p {remote_log_dir} && ray job submit '
+               f'{cd} && ray job submit '
                '--address=http://127.0.0.1:$RAY_DASHBOARD_PORT '
                f'--submission-id {ray_job_id} --no-wait '
                f'"{executable} -u {script_path} > {remote_log_path} 2>&1"')
+       mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
+                     f'touch {remote_log_path}')
+       code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
+       job_submit_cmd = mkdir_code + ' && ' + code
+
        returncode, stdout, stderr = self.run_on_head(handle,
                                                      job_submit_cmd,
                                                      stream_logs=False,
                                                      require_outputs=True)
+
+       if 'has no attribute' in stdout:
+           # Happens when someone calls `sky exec` but the remote is outdated,
+           # necessitating a call to `sky launch`.
+           with ux_utils.print_exception_no_traceback():
+               raise RuntimeError(
+                   f'{colorama.Fore.RED}SkyPilot runtime is stale on the '
+                   'remote cluster. To update, run: sky launch -c '
+                   f'{handle.cluster_name}{colorama.Style.RESET_ALL}')
+
        subprocess_utils.handle_returncode(returncode,
                                           job_submit_cmd,
                                           f'Failed to submit job {job_id}.',
@@ -3864,12 +3906,15 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
        is_local = isinstance(handle.launched_resources.cloud, clouds.Local)
        codegen.add_prologue(job_id,
                             spot_task=task.spot_task,
-                            setup_cmd=self._setup_cmd,
-                            envs=task.envs,
-                            setup_log_path=os.path.join(log_dir, 'setup.log'),
                             is_local=is_local)
-       codegen.add_gang_scheduling_placement_group(
-           1, accelerator_dict, stable_cluster_internal_ips=internal_ips)
+       codegen.add_gang_scheduling_placement_group_and_setup(
+           1,
+           accelerator_dict,
+           stable_cluster_internal_ips=internal_ips,
+           setup_cmd=self._setup_cmd,
+           setup_log_path=os.path.join(log_dir, 'setup.log'),
+           envs=task.envs,
+       )
 
        if callable(task.run):
            run_fn_code = textwrap.dedent(inspect.getsource(task.run))
@@ -3929,14 +3974,14 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
        is_local = isinstance(handle.launched_resources.cloud, clouds.Local)
        codegen.add_prologue(job_id,
                             spot_task=task.spot_task,
-                            setup_cmd=self._setup_cmd,
-                            envs=task.envs,
-                            setup_log_path=os.path.join(log_dir, 'setup.log'),
                             is_local=is_local)
-       codegen.add_gang_scheduling_placement_group(
+       codegen.add_gang_scheduling_placement_group_and_setup(
            num_actual_nodes,
            accelerator_dict,
-           stable_cluster_internal_ips=internal_ips)
+           stable_cluster_internal_ips=internal_ips,
+           setup_cmd=self._setup_cmd,
+           setup_log_path=os.path.join(log_dir, 'setup.log'),
+           envs=task.envs)
 
        if callable(task.run):
            run_fn_code = textwrap.dedent(inspect.getsource(task.run))
diff --git a/sky/execution.py b/sky/execution.py
index be4bcbe5d22..362a66d4156 100644
--- a/sky/execution.py
+++ b/sky/execution.py
@@ -261,6 +261,7 @@ def _execute(
            task.sync_storage_mounts()
 
    try:
+
        if Stage.PROVISION in stages:
            if handle is None:
                handle = backend.provision(task,
diff --git a/sky/skylet/attempt_skylet.py b/sky/skylet/attempt_skylet.py
new file mode 100644
index 00000000000..b526d0642f6
--- /dev/null
+++ b/sky/skylet/attempt_skylet.py
@@ -0,0 +1,41 @@
+"""Restarts skylet if version does not match"""
+
+import os
+import subprocess
+
+from sky.skylet import constants
+
+VERSION_FILE = os.path.expanduser(constants.SKYLET_VERSION_FILE)
+
+
+def restart_skylet():
+    # Kills old skylet if it is running
+    subprocess.run(
+        'ps aux | grep "sky.skylet.skylet" | grep "python3 -m"'
+        '| awk \'{print $2}\' | xargs kill >> ~/.sky/skylet.log 2>&1',
+        shell=True,
+        check=False)
+    subprocess.run(
+        'nohup python3 -m sky.skylet.skylet'
+        ' >> ~/.sky/skylet.log 2>&1 &',
+        shell=True,
+        check=True)
+    with open(VERSION_FILE, 'w') as v_f:
+        v_f.write(constants.SKYLET_VERSION)
+
+
+proc = subprocess.run(
+    'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep "python3 -m"',
+    shell=True,
+    check=False)
+
+running = (proc.returncode == 0)
+
+version_match = False
+if os.path.exists(VERSION_FILE):
+    with open(VERSION_FILE) as f:
+        if f.read().strip() == constants.SKYLET_VERSION:
+            version_match = True
+
+if not running or not version_match:
+    restart_skylet()
diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py
index 1d4e72dbec4..e260da766b9 100644
--- a/sky/skylet/constants.py
+++ b/sky/skylet/constants.py
@@ -26,3 +26,6 @@
    'command to initialize it locally: sky launch -c {cluster} \'\'')
 
 JOB_ID_ENV_VAR = 'SKYPILOT_JOB_ID'
+
+SKYLET_VERSION = '1'
+SKYLET_VERSION_FILE = '~/.sky/skylet_version'
diff --git a/sky/skylet/events.py b/sky/skylet/events.py
index 36649c7a9fe..bf8bf7f9ebc 100644
--- a/sky/skylet/events.py
+++ b/sky/skylet/events.py
@@ -1,5 +1,4 @@
 """skylet events"""
-import getpass
 import math
 import os
 import re
@@ -53,20 +52,12 @@ def _run(self):
        raise NotImplementedError
 
 
-class JobUpdateEvent(SkyletEvent):
-    """Skylet event for updating job status."""
+class JobSchedulerEvent(SkyletEvent):
+    """Skylet event for scheduling jobs"""
    EVENT_INTERVAL_SECONDS = 300
 
-    # Only update status of the jobs after this many seconds of job submission,
-    # to avoid race condition with `ray job` to make sure it job has been
-    # correctly updated.
-    # TODO(zhwu): This number should be tuned based on heuristics.
-    _SUBMITTED_GAP_SECONDS = 60
-
    def _run(self):
-        job_owner = getpass.getuser()
-        job_lib.update_status(job_owner,
-                              submitted_gap_sec=self._SUBMITTED_GAP_SECONDS)
+        job_lib.scheduler.schedule_step()
 
 
 class SpotJobUpdateEvent(SkyletEvent):
diff --git a/sky/skylet/job_lib.py b/sky/skylet/job_lib.py
index 498862e5be2..dc12601c0fe 100644
--- a/sky/skylet/job_lib.py
+++ b/sky/skylet/job_lib.py
@@ -6,13 +6,16 @@
 import json
 import os
 import pathlib
+import psutil
 import shlex
+import subprocess
 import time
 import typing
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import colorama
 import filelock
+import getpass
 
 from sky import sky_logging
 from sky.skylet import constants
@@ -62,6 +65,13 @@ def create_table(cursor, conn):
        run_timestamp TEXT CANDIDATE KEY,
        start_at FLOAT DEFAULT -1)""")
 
+    cursor.execute("""CREATE TABLE IF NOT EXISTS pending_jobs(
+        job_id INTEGER,
+        run_cmd TEXT,
+        submit INTEGER,
+        created_time INTEGER
+    )""")
+
    db_utils.add_column_to_table(cursor, conn, 'jobs', 'end_at', 'FLOAT')
    db_utils.add_column_to_table(cursor, conn, 'jobs', 'resources', 'TEXT')
@@ -83,15 +93,15 @@ class JobStatus(enum.Enum):
    # In the 'jobs' table, the `submitted_at` column will be set to the current
    # time, when the job is firstly created (in the INIT state).
    INIT = 'INIT'
+    # The job is waiting for the required resources. (`ray job status`
+    # shows RUNNING as the generated ray program has started, but blocked
+    # by the placement constraints.)
+    PENDING = 'PENDING'
    # Running the user's setup script (only in effect if --detach-setup is
    # set). Our update_job_status() can temporarily (for a short period) set
    # the status to SETTING_UP, if the generated ray program has not set
    # the status to PENDING or RUNNING yet.
    SETTING_UP = 'SETTING_UP'
-    # The job is waiting for the required resources. (`ray job status`
-    # shows RUNNING as the generated ray program has started, but blocked
-    # by the placement constraints.)
-    PENDING = 'PENDING'
    # The job is running.
    # In the 'jobs' table, the `start_at` column will be set to the current
    # time, when the job is firstly transitioned to RUNNING.
@@ -126,6 +136,77 @@ def colored_str(self):
        return f'{color}{self.value}{colorama.Style.RESET_ALL}'
 
 
+# Only update status of the jobs after this many seconds of job submission,
+# to avoid a race condition with `ray job` and make sure the job status has
+# been correctly updated.
+# TODO(zhwu): This number should be tuned based on heuristics.
+_PENDING_SUBMIT_GRACE_PERIOD = 60
+
+_PRE_RESOURCE_STATUSES = [JobStatus.PENDING]
+
+
+class JobScheduler:
+    """Base class for job scheduler"""
+
+    def queue(self, job_id: int, cmd: str) -> None:
+        _CURSOR.execute('INSERT INTO pending_jobs VALUES (?,?,?,?)',
+                        (job_id, cmd, 0, int(time.time())))
+        _CONN.commit()
+        set_status(job_id, JobStatus.PENDING)
+        self.schedule_step()
+
+    def remove_job_no_lock(self, job_id: int) -> None:
+        _CURSOR.execute(f'DELETE FROM pending_jobs WHERE job_id={job_id!r}')
+        _CONN.commit()
+
+    def _run_job(self, job_id: int, run_cmd: str):
+        _CURSOR.execute((f'UPDATE pending_jobs SET submit={int(time.time())} '
+                         f'WHERE job_id={job_id!r}'))
+        _CONN.commit()
+        subprocess.Popen(run_cmd, shell=True, stdout=subprocess.DEVNULL)
+
+    def schedule_step(self) -> None:
+        job_owner = getpass.getuser()
+        jobs = self._get_jobs()
+        if len(jobs) > 0:
+            update_status(job_owner)
+        # TODO(zhwu, mraheja): One optimization is to allow more than one job
+        # to stay in the pending state after `ray job submit`, so that a large
+        # number of jobs can be scheduled faster.
+        for job_id, run_cmd, submit, created_time in jobs:
+            with filelock.FileLock(_get_lock_path(job_id)):
+                status = get_status_no_lock(job_id)
+                if (status not in _PRE_RESOURCE_STATUSES or
+                        created_time < psutil.boot_time()):
+                    # Job doesn't exist, is running/cancelled, or was created
+                    # before the last reboot.
+                    self.remove_job_no_lock(job_id)
+                    continue
+                if submit:
+                    # Next job waiting for resources
+                    return
+                self._run_job(job_id, run_cmd)
+                return
+
+    def _get_jobs(self) -> List[Tuple[int, str, int, int]]:
+        """Returns the metadata for jobs in the pending jobs table
+
+        The information contains job_id, run command, submit time,
+        creation time.
+        """
+        raise NotImplementedError
+
+
+class FIFOScheduler(JobScheduler):
+    """First in first out job scheduler"""
+
+    def _get_jobs(self) -> List[Tuple[int, str, int, int]]:
+        return list(
+            _CURSOR.execute('SELECT * FROM pending_jobs ORDER BY job_id'))
+
+
+scheduler = FIFOScheduler()
+
 _JOB_STATUS_TO_COLOR = {
    JobStatus.INIT: colorama.Fore.BLUE,
    JobStatus.SETTING_UP: colorama.Fore.BLUE,
@@ -138,23 +219,24 @@ def colored_str(self):
 }
 
 _RAY_TO_JOB_STATUS_MAP = {
-    # These are intentionally set to one status before, because:
+    # These are intentionally set this way, because:
    # 1. when the ray status indicates the job is PENDING the generated
-    # python program should not be started yet, i.e. the job should be INIT.
+    # python program has been submitted via `ray job submit` from the job
+    # queue and is now PENDING.
    # 2. when the ray status indicates the job is RUNNING the job can be in
    # setup or resources may not be allocated yet, i.e. the job should be
-    # SETTING_UP.
-    # For case 2, update_job_status() would compare this mapped SETTING_UP to
+    # PENDING.
+    # For case 2, update_job_status() would compare this mapped PENDING to
    # the status in our jobs DB and take the max. This is because the job's
    # generated ray program is the only place that can determine a job has
    # reserved resources and actually started running: it will set the
-    # status in the DB to RUNNING.
+    # status in the DB to SETTING_UP or RUNNING.
    # If there is no setup specified in the task, as soon as it is started
    # (ray's status becomes RUNNING), i.e. it will be very rare that the job
    # will be set to SETTING_UP by the update_job_status, as our generated
    # ray program will set the status to PENDING immediately.
-    'PENDING': JobStatus.INIT,
-    'RUNNING': JobStatus.SETTING_UP,
+    'PENDING': JobStatus.PENDING,
+    'RUNNING': JobStatus.PENDING,
    'SUCCEEDED': JobStatus.SUCCEEDED,
    'FAILED': JobStatus.FAILED,
    'STOPPED': JobStatus.CANCELLED,
@@ -368,9 +450,9 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
    return records
 
 
-def _get_jobs(username: Optional[str],
-              status_list: Optional[List[JobStatus]] = None,
-              submitted_gap_sec: int = 0) -> List[Dict[str, Any]]:
+def _get_jobs(
+        username: Optional[str],
+        status_list: Optional[List[JobStatus]] = None) -> List[Dict[str, Any]]:
    if status_list is None:
        status_list = list(JobStatus)
    status_str_list = [status.value for status in status_list]
@@ -379,18 +461,17 @@ def _get_jobs(username: Optional[str],
            f"""\
            SELECT * FROM jobs
            WHERE status IN ({','.join(['?'] * len(status_list))})
-            AND submitted_at <= (?)
            ORDER BY job_id DESC""",
-            (*status_str_list, time.time() - submitted_gap_sec),
+            (*status_str_list,),
        )
    else:
        rows = _CURSOR.execute(
            f"""\
            SELECT * FROM jobs
            WHERE status IN ({','.join(['?'] * len(status_list))})
-            AND username=(?) AND submitted_at <= (?)
+            AND username=(?)
            ORDER BY job_id DESC""",
-            (*status_str_list, username, time.time() - submitted_gap_sec),
+            (*status_str_list, username),
        )
 
    records = _get_records_from_rows(rows)
@@ -409,6 +490,18 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
    return records
 
 
+def _get_pending_jobs():
+    rows = _CURSOR.execute(
+        'SELECT job_id, created_time, submit FROM pending_jobs')
+    rows = list(rows)
+    return {
+        job_id: {
+            'created_time': created_time,
+            'submit': submit
+        } for job_id, created_time, submit in rows
+    }
+
+
 def update_job_status(job_owner: str,
                      job_ids: List[int],
                      silent: bool = False) -> List[JobStatus]:
@@ -435,6 +528,7 @@ def update_job_status(job_owner: str,
    # which contains the job status (str) and submission_id (str).
    job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
 
+    pending_jobs = _get_pending_jobs()
    job_details = {}
    ray_job_ids_set = set(ray_job_ids)
    for job_detail in job_detail_lists:
@@ -442,9 +536,27 @@ def update_job_status(job_owner: str,
            job_details[job_detail.submission_id] = job_detail
    job_statuses: List[Optional[JobStatus]] = [None] * len(ray_job_ids)
    for i, ray_job_id in enumerate(ray_job_ids):
+        job_id = job_ids[i]
        if ray_job_id in job_details:
            ray_status = job_details[ray_job_id].status
            job_statuses[i] = _RAY_TO_JOB_STATUS_MAP[ray_status]
+        if job_id in pending_jobs:
+            if pending_jobs[job_id]['created_time'] < psutil.boot_time():
+                # The job is stale as it was created before the instance was
+                # booted, e.g. the instance has been rebooted.
+                job_statuses[i] = JobStatus.FAILED
+            # Give a 60-second grace period between a job being submitted from
+            # the pending table and it appearing in the ray jobs.
+            if (pending_jobs[job_id]['submit'] > 0 and
+                    pending_jobs[job_id]['submit'] <
+                    time.time() - _PENDING_SUBMIT_GRACE_PERIOD):
+                # For jobs submitted outside of the grace period, we will
+                # consider the ray job status.
+                continue
+            else:
+                # Reset the job status to PENDING even though it may not
+                # appear in the ray jobs, so that it will not be considered
+                # stale.
+                job_statuses[i] = JobStatus.PENDING
 
    assert len(job_statuses) == len(job_ids), (job_statuses, job_ids)
@@ -453,9 +565,6 @@
        # Per-job status lock is required because between the job status
        # query and the job status update, the job status in the databse
        # can be modified by the generated ray program.
-        # TODO(mraheja): remove pylint disabling when filelock version
-        # updated
-        # pylint: disable=abstract-class-instantiated
        with filelock.FileLock(_get_lock_path(job_id)):
            original_status = get_status_no_lock(job_id)
            assert original_status is not None, (job_id, status)
@@ -503,7 +612,7 @@ def fail_all_jobs_in_progress() -> None:
    _CONN.commit()
 
 
-def update_status(job_owner: str, submitted_gap_sec: int = 0) -> None:
+def update_status(job_owner: str) -> None:
    # This will be called periodically by the skylet to update the status
    # of the jobs in the database, to avoid stale job status.
    # NOTE: there might be a INIT job in the database set to FAILED by this
    # function, as the ray job status does not exist due to the app
    # not submitted yet. It will be then reset to PENDING / RUNNING when the
    # app starts.
    nonterminal_jobs = _get_jobs(username=None,
-                                 status_list=JobStatus.nonterminal_statuses(),
-                                 submitted_gap_sec=submitted_gap_sec)
+                                 status_list=JobStatus.nonterminal_statuses())
    nonterminal_job_ids = [job['job_id'] for job in nonterminal_jobs]
 
    update_job_status(job_owner, nonterminal_job_ids)
@@ -603,7 +711,7 @@ def cancel_jobs(job_owner: str, jobs: Optional[List[int]]) -> None:
    # jobs to CANCELLED.
    if jobs is None:
        job_records = _get_jobs(
-            None, [JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING])
+            None, [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING])
    else:
        job_records = _get_jobs_by_ids(jobs)
@@ -615,18 +723,24 @@ def cancel_jobs(job_owner: str, jobs: Optional[List[int]]) -> None:
    # ray cluster (tracked in #1262).
    for job in job_records:
        job_id = make_ray_job_id(job['job_id'], job_owner)
-        try:
-            job_client.stop_job(job_id)
-        except RuntimeError as e:
-            # If the job does not exist or if the request to the
-            # job server fails.
-            logger.warning(str(e))
-            continue
-
-        if job['status'] in [
-                JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
-        ]:
-            set_status(job['job_id'], JobStatus.CANCELLED)
+        # The job is locked to ensure that the pending queue does not start it
+        # while it is being cancelled.
+        with filelock.FileLock(_get_lock_path(job['job_id'])):
+            try:
+                job_client.stop_job(job_id)
+            except RuntimeError as e:
+                # If the request to the job server fails, we should not
+                # set the job to CANCELLED.
+                if 'does not exist' not in str(e):
+                    logger.warning(str(e))
+                    continue
+
+            if job['status'] in [
+                    JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING
+            ]:
+                _set_status_no_lock(job['job_id'], JobStatus.CANCELLED)
+
+    scheduler.schedule_step()
 
 
 def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
@@ -683,6 +797,13 @@ def add_job(cls, job_name: Optional[str], username: str, run_timestamp: str,
        ]
        return cls._build(code)
 
+    @classmethod
+    def queue_job(cls, job_id: int, cmd: str) -> str:
+        code = ['job_lib.scheduler.queue('
+                f'{job_id!r},'
+                f'{cmd!r})']
+        return cls._build(code)
+
    @classmethod
    def update_status(cls, job_owner: str) -> str:
        code = [
diff --git a/sky/skylet/skylet.py b/sky/skylet/skylet.py
index e6befdf72de..6bbb51e7a37 100644
--- a/sky/skylet/skylet.py
+++ b/sky/skylet/skylet.py
@@ -5,12 +5,15 @@
 from sky import sky_logging
 from sky.skylet import events
 
-logger = sky_logging.init_logger(__name__)
+# Use the explicit logger name so that the logger is under the
+# `sky.skylet.skylet` namespace when executed directly, so as
+logger = sky_logging.init_logger('sky.skylet.skylet') logger.info('skylet started') EVENTS = [ events.AutostopEvent(), - events.JobUpdateEvent(), + events.JobSchedulerEvent(), # The spot job update event should be after the job update event. # Otherwise, the abnormal spot job status update will be delayed # until the next job update event. diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 1d42b065853..117fe2325f2 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -595,7 +595,7 @@ def test_aws_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - f'sleep {events.JobUpdateEvent.EVENT_INTERVAL_SECONDS}', + f'sleep {events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS}', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', ], f'sky down -y {name}', @@ -624,7 +624,7 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - f'sleep {events.JobUpdateEvent.EVENT_INTERVAL_SECONDS}', + f'sleep {events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS}', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', ], f'sky down -y {name}', @@ -928,16 +928,16 @@ def test_job_queue_multinode(generic_cloud: str): f'sky launch -c {name} -n {name}-3 --detach-setup -d examples/job_queue/job_multinode.yaml', f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)', f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)', - f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-3 | grep SETTING_UP)', - 'sleep 90', f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-3 | grep PENDING)', + 'sleep 90', f'sky cancel -y {name} 1', 'sleep 5', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep SETTING_UP', f'sky cancel -y {name} 1 2 3', f'sky launch -c {name} -n {name}-4 --detach-setup -d examples/job_queue/job_multinode.yaml', # Test the job status is correctly set to SETTING_UP, during the setup is running, # and the job can be cancelled during the setup. + 'sleep 5', f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)', f'sky cancel -y {name} 4', f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)', @@ -960,12 +960,53 @@ def test_large_job_queue(generic_cloud: str): 'large_job_queue', [ f'sky launch -y -c {name} --cloud {generic_cloud}', - f'for i in `seq 1 75`; do sky exec {name} -d "echo $i; sleep 100000000"; done', + f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done', f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16', - 'sleep 20', + 'sleep 70', # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there should be 8 / 0.5 = 16 jobs running. # The first 16 jobs are canceled, so there should be 75 - 32 = 43 jobs PENDING. 
            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43',
+            # Make sure the jobs are scheduled in FIFO order
+            *[
+                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED'
+                for i in range(1, 17)
+            ],
+            *[
+                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING'
+                for i in range(17, 33)
+            ],
+            *[
+                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep PENDING'
+                for i in range(33, 75)
+            ],
+            f'sky cancel -y {name} 33 35 37 39 17 18 19',
+            *[
+                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep CANCELLED'
+                for i in range(33, 40, 2)
+            ],
+            'sleep 10',
+            *[
+                f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-{i} | grep RUNNING'
+                for i in [34, 36, 38]
+            ],
        ],
        f'sky down -y {name}',
        timeout=20 * 60,
    )
    run_one_test(test)
+
+
+@pytest.mark.no_lambda_cloud  # No Lambda Cloud VM has 8 CPUs
+def test_fast_large_job_queue(generic_cloud: str):
+    # This is to test that jobs can be scheduled quickly when there are many jobs in the queue.
+    name = _get_cluster_name()
+    test = Test(
+        'fast_large_job_queue',
+        [
+            f'sky launch -y -c {name} --cloud {generic_cloud}',
+            f'for i in `seq 1 32`; do sky exec {name} -n {name}-$i -d "echo $i"; done',
+            'sleep 60',
+            f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep SUCCEEDED | wc -l | grep 32',
+        ],
+        f'sky down -y {name}',
+        timeout=20 * 60,