From 3de0e56186ad753a1c407ebf14200535df022668 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 18 Mar 2024 08:05:48 +0000 Subject: [PATCH 01/11] Use original python path --- sky/backends/backend_utils.py | 3 ++- sky/backends/cloud_vm_ray_backend.py | 16 ++++++------- sky/clouds/aws.py | 3 ++- sky/execution.py | 1 + sky/provision/instance_setup.py | 28 +++++++++++++---------- sky/serve/serve_utils.py | 4 +++- sky/skylet/attempt_skylet.py | 8 ++++--- sky/skylet/autostop_lib.py | 3 ++- sky/skylet/constants.py | 33 +++++++++++++++++---------- sky/skylet/job_lib.py | 2 +- sky/skylet/log_lib.py | 2 +- sky/spot/spot_utils.py | 2 +- sky/templates/azure-ray.yml.j2 | 4 ++-- sky/templates/ibm-ray.yml.j2 | 4 ++-- sky/templates/lambda-ray.yml.j2 | 4 ++-- sky/templates/oci-ray.yml.j2 | 4 ++-- sky/templates/scp-ray.yml.j2 | 4 ++-- sky/templates/spot-controller.yaml.j2 | 2 +- tests/test_smoke.py | 9 -------- 19 files changed, 73 insertions(+), 63 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index dd7711fe326..5675de7260e 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -850,7 +850,7 @@ def write_cluster_config( # Dump the Ray ports to a file for Ray job submission dump_port_command = ( - f'python -c \'import json, os; json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, ' + f'{constants.SKY_PYTHON_CMD} -c \'import json, os; json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, ' f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\'' ) @@ -903,6 +903,7 @@ def write_cluster_config( 'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR, 'dump_port_command': dump_port_command, # Ray version. + 'RAY_CMD': constants.SKY_RAY_CMD, 'ray_version': constants.SKY_REMOTE_RAY_VERSION, # Command for waiting ray cluster to be ready on head. 
'ray_head_wait_initialized_command': diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 6bf65afac2d..8bee95897f5 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1933,7 +1933,7 @@ def _ensure_cluster_ray_started(self, handle: 'CloudVmRayResourceHandle', require_outputs=True) if returncode == 0: return - backend.run_on_head(handle, 'ray stop') + backend.run_on_head(handle, f'{constants.SKY_RAY_CMD} stop') # Runs `ray up ` with our monkey-patched launch hash # calculation. See the monkey patch file for why. @@ -3095,7 +3095,6 @@ def _exec_code_on_head( handle: CloudVmRayResourceHandle, codegen: str, job_id: int, - executable: str, detach_run: bool = False, spot_dag: Optional['dag.Dag'] = None, ) -> None: @@ -3122,16 +3121,16 @@ def _exec_code_on_head( remote_log_dir = self.log_dir remote_log_path = os.path.join(remote_log_dir, 'run.log') - assert executable == 'python3', executable cd = f'cd {SKY_REMOTE_WORKDIR}' job_submit_cmd = ( - 'RAY_DASHBOARD_PORT=$(python -c "from sky.skylet import job_lib; print(job_lib.get_job_submission_port())" 2> /dev/null || echo 8265);' # pylint: disable=line-too-long - f'{cd} && ray job submit ' + f'RAY_DASHBOARD_PORT=$({constants.SKY_PYTHON_CMD} -c "from sky.skylet import job_lib; print(job_lib.get_job_submission_port())" 2> /dev/null || echo 8265);' # pylint: disable=line-too-long + f'{cd} && {constants.SKY_RAY_CMD} job submit ' '--address=http://127.0.0.1:$RAY_DASHBOARD_PORT ' f'--submission-id {job_id}-$(whoami) --no-wait ' # Redirect stderr to /dev/null to avoid distracting error from ray. 
- f'"{executable} -u {script_path} > {remote_log_path} 2> /dev/null"') + f'"{constants.SKY_PYTHON_CMD} -u {script_path} > {remote_log_path} 2> /dev/null"' + ) mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && ' f'touch {remote_log_path}') @@ -3686,7 +3685,8 @@ def teardown_no_lock(self, # We do not check the return code, since Ray returns # non-zero return code when calling Ray stop, # even when the command was executed successfully. - self.run_on_head(handle, 'ray stop --force') + self.run_on_head(handle, + f'{constants.SKY_RAY_CMD} stop --force') except exceptions.FetchIPError: # This error is expected if the previous cluster IP is # failed to be found, @@ -4582,7 +4582,6 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, self._exec_code_on_head(handle, codegen.build(), job_id, - executable='python3', detach_run=detach_run, spot_dag=task.spot_dag) @@ -4646,6 +4645,5 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle, self._exec_code_on_head(handle, codegen.build(), job_id, - executable='python3', detach_run=detach_run, spot_dag=task.spot_dag) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 0433a6806fe..7b4c25b696d 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -16,6 +16,7 @@ from sky import skypilot_config from sky.adaptors import aws from sky.clouds import service_catalog +from sky.skylet import constants from sky.utils import common_utils from sky.utils import resources_utils from sky.utils import rich_utils @@ -286,7 +287,7 @@ def get_zone_shell_cmd(cls) -> Optional[str]: # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-identity-documents.html # pylint: disable=line-too-long command_str = ( 'curl -s http://169.254.169.254/latest/dynamic/instance-identity/document' # pylint: disable=line-too-long - ' | python3 -u -c "import sys, json; ' + f' | {constants.SKY_PYTHON_CMD} -u -c "import sys, json; ' 'print(json.load(sys.stdin)[\'availabilityZone\'])"') return command_str diff --git 
a/sky/execution.py b/sky/execution.py index 4e056075d49..bac7f27a48a 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -678,6 +678,7 @@ def spot_launch( 'dag_name': dag.name, 'retry_until_up': retry_until_up, 'remote_user_config_path': remote_user_config_path, + 'SKY_PYTHON_CMD': constants.SKY_PYTHON_CMD, **controller_utils.shared_controller_vars_to_fill( 'spot', remote_user_config_path=remote_user_config_path, diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index d193647f0de..265a8350592 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -34,15 +34,16 @@ 'do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;') _DUMP_RAY_PORTS = ( - 'python -c \'import json, os; ' + f'{constants.SKY_PYTHON_CMD} -c \'import json, os; ' f'json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, ' f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", ' 'encoding="utf-8"))\';') _RAY_PORT_COMMAND = ( - 'RAY_PORT=$(python -c "from sky.skylet import job_lib; ' - 'print(job_lib.get_ray_port())" 2> /dev/null || echo 6379);' - 'python -c "from sky.utils import common_utils; ' + f'RAY_PORT=$({constants.SKY_PYTHON_CMD} -c ' + '"from sky.skylet import job_lib; print(job_lib.get_ray_port())" ' + '2> /dev/null || echo 6379);' + f'{constants.SKY_PYTHON_CMD} -c "from sky.utils import common_utils; ' 'print(common_utils.encode_payload({\'ray_port\': $RAY_PORT}))"') # Command that calls `ray status` with SkyPilot's Ray port set. @@ -59,7 +60,8 @@ 'done;') # Restart skylet when the version does not match to keep the skylet up-to-date. -MAYBE_SKYLET_RESTART_CMD = 'python3 -m sky.skylet.attempt_skylet;' +MAYBE_SKYLET_RESTART_CMD = (f'{constants.SKY_PYTHON_CMD} -m ' + 'sky.skylet.attempt_skylet;') def _auto_retry(func): @@ -288,10 +290,11 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str], # the same credentials. Otherwise, `ray status` will fail to fetch the # available nodes. 
# Reference: https://github.com/skypilot-org/skypilot/issues/2441 - cmd = ('ray stop; unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; ' + cmd = (f'{constants.SKY_RAY_CMD} stop; ' + 'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; ' 'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ' - f'ray start --head {ray_options} || exit 1;' + _RAY_PRLIMIT + - _DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND) + f'{constants.SKY_RAY_CMD} start --head {ray_options} || exit 1;' + + _RAY_PRLIMIT + _DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND) logger.info(f'Running command on head node: {cmd}') # TODO(zhwu): add the output to log files. returncode, stdout, stderr = ssh_runner.run(cmd, @@ -356,10 +359,11 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool, # Unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY, see the comment in # `start_ray_on_head_node`. - cmd = (f'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; ' - 'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ' - f'ray start --disable-usage-stats {ray_options} || exit 1;' + - _RAY_PRLIMIT) + cmd = ( + f'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; ' + 'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ' + f'{constants.SKY_RAY_CMD} start --disable-usage-stats {ray_options} || ' + 'exit 1;' + _RAY_PRLIMIT) if no_restart: # We do not use ray status to check whether ray is running, because # on worker node, if the user started their own ray cluster, ray status diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index cd33c15b178..20bc3a34ddb 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -26,6 +26,7 @@ from sky import status_lib from sky.serve import constants from sky.serve import serve_state +from sky.skylet import constants as skylet_constants from sky.skylet import job_lib from sky.utils import common_utils from sky.utils import log_utils @@ -923,7 +924,8 @@ def stream_serve_process_logs(cls, service_name: str, def _build(cls, code: List[str]) -> str: code = cls._PREFIX + code generated_code = '; 
'.join(code) - return f'python3 -u -c {shlex.quote(generated_code)}' + return (f'{skylet_constants.SKY_PYTHON_CMD} ' + f'-u -c {shlex.quote(generated_code)}') @classmethod def update_service(cls, service_name: str, version: int, mode: str) -> str: diff --git a/sky/skylet/attempt_skylet.py b/sky/skylet/attempt_skylet.py index b02e69eb22d..ad5c26544fe 100644 --- a/sky/skylet/attempt_skylet.py +++ b/sky/skylet/attempt_skylet.py @@ -13,12 +13,13 @@ def restart_skylet(): # TODO(zhwu): make the killing graceful, e.g., use a signal to tell # skylet to exit, instead of directly killing it. subprocess.run( - 'ps aux | grep "sky.skylet.skylet" | grep "python3 -m"' + 'ps aux | grep "sky.skylet.skylet" | ' + f'grep "{constants.SKY_PYTHON_CMD} -m"' '| awk \'{print $2}\' | xargs kill >> ~/.sky/skylet.log 2>&1', shell=True, check=False) subprocess.run( - 'nohup python3 -m sky.skylet.skylet' + f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet' ' >> ~/.sky/skylet.log 2>&1 &', shell=True, check=True) @@ -27,7 +28,8 @@ def restart_skylet(): proc = subprocess.run( - 'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep "python3 -m"', + f'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | ' + f'grep "{constants.SKY_PYTHON_CMD} -m"', shell=True, check=False) diff --git a/sky/skylet/autostop_lib.py b/sky/skylet/autostop_lib.py index c69d98e528c..687c04f5211 100644 --- a/sky/skylet/autostop_lib.py +++ b/sky/skylet/autostop_lib.py @@ -8,6 +8,7 @@ from sky import sky_logging from sky.skylet import configs +from sky.skylet import constants from sky.utils import common_utils logger = sky_logging.init_logger(__name__) @@ -121,4 +122,4 @@ def is_autostopping(cls) -> str: def _build(cls, code: List[str]) -> str: code = cls._PREFIX + code code = ';'.join(code) - return f'python3 -u -c {shlex.quote(code)}' + return f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(code)}' diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 0c15adc8dd0..1a9f1137130 100644 --- 
a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -23,6 +23,13 @@ SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot' SKY_REMOTE_RAY_VERSION = '2.9.3' +# Use the same Python used for installing ray and skypilot. +SKY_PYTHON_PATH_FILE = '~/.sky/python_path' +SKY_PYTHON_CMD = f'$(cat {SKY_PYTHON_PATH_FILE} || which python3)' +_SKY_PYTHON_DIR_CMD = f'$(dirname {SKY_PYTHON_CMD})' +SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip' +SKY_RAY_CMD = f'{_SKY_PYTHON_DIR_CMD}/ray' + # The name for the environment variable that stores the unique ID of the # current task. This will stay the same across multiple recoveries of the # same spot task. @@ -78,7 +85,12 @@ 'bash Miniconda3-Linux-x86_64.sh -b && ' 'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && ' 'conda config --set auto_activate_base true); ' - 'grep "# >>> conda initialize >>>" ~/.bashrc || conda init;') + 'grep "# >>> conda initialize >>>" ~/.bashrc || conda init;' + '(type -a python | grep -q python3) || ' + 'echo \'alias python=python3\' >> ~/.bashrc;' + '(type -a pip | grep -q pip3) || echo \'alias pip=pip3\' >> ~/.bashrc;' + 'source ~/.bashrc;' + f'[ -f {SKY_PYTHON_PATH_FILE} ] || which python3 > {SKY_PYTHON_PATH_FILE};') _sky_version = str(version.parse(sky.__version__)) RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} ray status' @@ -86,11 +98,7 @@ # installed. {var} will be replaced with the actual value in # backend_utils.write_cluster_config. RAY_SKYPILOT_INSTALLATION_COMMANDS = ( - '(type -a python | grep -q python3) || ' - 'echo \'alias python=python3\' >> ~/.bashrc;' - '(type -a pip | grep -q pip3) || echo \'alias pip=pip3\' >> ~/.bashrc;' 'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;' - 'source ~/.bashrc;' # Backward compatibility for ray upgrade (#3248): do not upgrade ray if the # ray cluster is already running, to avoid the ray cluster being restarted. 
# @@ -104,14 +112,15 @@ # latest ray port 6380, but those existing cluster launched before #1790 # that has ray cluster on the default port 6379 will be upgraded and # restarted. - f'pip3 list | grep "ray " | grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null ' + f'{SKY_PIP_CMD} list | grep "ray " | ' + f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null ' f'|| {RAY_STATUS} || ' - f'pip3 install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long + f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long # END ray package check and installation - '{ pip3 list | grep "skypilot " && ' + f'{{ {SKY_PIP_CMD} list | grep "skypilot " && ' '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long - '{ pip3 uninstall skypilot -y; ' - 'pip3 install "$(echo ~/.sky/wheels/{sky_wheel_hash}/' + f'{{ {SKY_PIP_CMD} uninstall skypilot -y; ' + f'{SKY_PIP_CMD} install "$(echo ~/.sky/wheels/{{sky_wheel_hash}}/' f'skypilot-{_sky_version}*.whl)[{{cloud}}, remote]" && ' 'echo "{sky_wheel_hash}" > ~/.sky/wheels/current_sky_wheel_hash || ' 'exit 1; }; ' @@ -121,8 +130,8 @@ # The ray installation above can be skipped due to the existing ray cluster # for backward compatibility. In this case, we should not patch the ray # files. 
- f'pip3 list | grep "ray " | grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null ' - '&& { python3 -c "from sky.skylet.ray_patches import patch; patch()" ' + f'{SKY_PIP_CMD} list | grep "ray " | grep {{SKY_REMOTE_RAY_VERSION}} 2>&1 > /dev/null ' + f'&& {{ {SKY_PYTHON_CMD} -c "from sky.skylet.ray_patches import patch; patch()" ' '|| exit 1; };') # The name for the environment variable that stores SkyPilot user hash, which diff --git a/sky/skylet/job_lib.py b/sky/skylet/job_lib.py index 87a6d4806cb..d837e510033 100644 --- a/sky/skylet/job_lib.py +++ b/sky/skylet/job_lib.py @@ -933,4 +933,4 @@ def get_run_timestamp_with_globbing(cls, def _build(cls, code: List[str]) -> str: code = cls._PREFIX + code code = ';'.join(code) - return f'python3 -u -c {shlex.quote(code)}' + return f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(code)}' diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index 7d1b63eca94..46b8984e366 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -185,7 +185,7 @@ def run_with_log( os.path.dirname(os.path.abspath(job_lib.__file__)), 'subprocess_daemon.py') daemon_cmd = [ - 'python3', + constants.SKY_PYTHON_CMD, daemon_script, '--parent-pid', str(parent_pid), diff --git a/sky/spot/spot_utils.py b/sky/spot/spot_utils.py index 2c1d88c6d20..67ac0cf4ae8 100644 --- a/sky/spot/spot_utils.py +++ b/sky/spot/spot_utils.py @@ -741,7 +741,7 @@ def set_pending(cls, job_id: int, spot_dag: 'dag_lib.Dag') -> str: def _build(cls, code: List[str]) -> str: code = cls._PREFIX + code generated_code = '; '.join(code) - return f'python3 -u -c {shlex.quote(generated_code)}' + return f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(generated_code)}' def dump_job_table_cache(job_table: str): diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 21a1ac1214f..af507e90564 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -164,14 +164,14 @@ setup_commands: # current num items (num SSH connections): 2 
head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/ibm-ray.yml.j2 b/sky/templates/ibm-ray.yml.j2 index f455f400a61..c76f390d842 100644 --- 
a/sky/templates/ibm-ray.yml.j2 +++ b/sky/templates/ibm-ray.yml.j2 @@ -118,13 +118,13 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} 
worker_start_ray_commands: [] diff --git a/sky/templates/lambda-ray.yml.j2 b/sky/templates/lambda-ray.yml.j2 index 8f6f3580d1a..8effb5c2297 100644 --- a/sky/templates/lambda-ray.yml.j2 +++ b/sky/templates/lambda-ray.yml.j2 @@ -89,13 +89,13 @@ setup_commands: # Increment the following for catching performance bugs easier: # current num items (num SSH connections): 2 head_start_ray_commands: - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git 
a/sky/templates/oci-ray.yml.j2 b/sky/templates/oci-ray.yml.j2 index f3fceedd042..a6e6bf666aa 100644 --- a/sky/templates/oci-ray.yml.j2 +++ b/sky/templates/oci-ray.yml.j2 @@ -114,13 +114,13 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f 
raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/scp-ray.yml.j2 b/sky/templates/scp-ray.yml.j2 index f8e07c5283c..e44d3b46764 100644 --- a/sky/templates/scp-ray.yml.j2 +++ b/sky/templates/scp-ray.yml.j2 @@ -88,13 +88,13 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 
{{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/spot-controller.yaml.j2 b/sky/templates/spot-controller.yaml.j2 index 5181f9d4544..1d572495df4 100644 --- a/sky/templates/spot-controller.yaml.j2 +++ b/sky/templates/spot-controller.yaml.j2 @@ -18,7 +18,7 @@ setup: | # Dashboard. pip list | grep flask > /dev/null 2>&1 || pip install flask 2>&1 > /dev/null - ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.spot.dashboard.dashboard") || (nohup python3 -m sky.spot.dashboard.dashboard >> ~/.sky/spot-dashboard.log 2>&1 &)); + ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.spot.dashboard.dashboard") || (nohup {{ SKY_PYTHON_CMD }} -m sky.spot.dashboard.dashboard >> ~/.sky/spot-dashboard.log 2>&1 &)); run: | # Start the controller for the current spot job. diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 1c8c300a385..245a0d0243e 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3464,15 +3464,6 @@ def test_user_ray_cluster(generic_cloud: str): run_one_test(test) -_CODE_PREFIX = ['import sky'] - - -def _build(code: List[str]) -> str: - code = _CODE_PREFIX + code - code = ';'.join(code) - return f'python3 -u -c {shlex.quote(code)}' - - # ------- Testing the core API -------- # Most of the core APIs have been tested in the CLI tests. # These tests are for testing the return value of the APIs not fully used in CLI. 
From 3fe89238de68f525578b1f20af3e8a978ce30792 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 18 Mar 2024 08:25:40 +0000 Subject: [PATCH 02/11] Fix python --- sky/provision/instance_setup.py | 2 +- sky/skylet/constants.py | 3 ++- sky/skylet/log_lib.py | 7 ++++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index 265a8350592..fea1ee24687 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -448,7 +448,7 @@ def _internal_file_mounts(file_mounts: Dict, rc, stdout, stderr = runner.run(mkdir_command, log_path=log_path, - stream_logs=False, + stream_logs=True, require_outputs=True) subprocess_utils.handle_returncode( rc, diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 1a9f1137130..cbcdfbaf323 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -25,7 +25,8 @@ # Use the same Python used for installing ray and skypilot. SKY_PYTHON_PATH_FILE = '~/.sky/python_path' -SKY_PYTHON_CMD = f'$(cat {SKY_PYTHON_PATH_FILE} || which python3)' +SKY_GET_PYTHON_PATH_CMD = f'cat {SKY_PYTHON_PATH_FILE} || which python3' +SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})' _SKY_PYTHON_DIR_CMD = f'$(dirname {SKY_PYTHON_CMD})' SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip' SKY_RAY_CMD = f'{_SKY_PYTHON_DIR_CMD}/ray' diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index 46b8984e366..edd7d932e69 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -184,8 +184,13 @@ def run_with_log( daemon_script = os.path.join( os.path.dirname(os.path.abspath(job_lib.__file__)), 'subprocess_daemon.py') + python_path = subprocess.check_output( + constants.SKY_GET_PYTHON_PATH_CMD, + shell=True, + stderr=subprocess.DEVNULL, + encoding='utf-8').strip() daemon_cmd = [ - constants.SKY_PYTHON_CMD, + python_path, daemon_script, '--parent-pid', str(parent_pid), From debb4149b5282477e4b2d968fb7312b44afb084b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu 
Date: Mon, 18 Mar 2024 09:13:14 +0000 Subject: [PATCH 03/11] cmd --- sky/backends/backend_utils.py | 1 + sky/provision/instance_setup.py | 2 +- sky/skylet/constants.py | 2 +- sky/templates/gcp-ray.yml.j2 | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 5675de7260e..9099e1b999b 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -904,6 +904,7 @@ def write_cluster_config( 'dump_port_command': dump_port_command, # Ray version. 'RAY_CMD': constants.SKY_RAY_CMD, + 'SKY_PIP_CMD': constants.SKY_PIP_CMD, 'ray_version': constants.SKY_REMOTE_RAY_VERSION, # Command for waiting ray cluster to be ready on head. 'ray_head_wait_initialized_command': diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index fea1ee24687..86bd85a1bdb 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -49,7 +49,7 @@ # Command that calls `ray status` with SkyPilot's Ray port set. RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND = ( f'{_RAY_PORT_COMMAND}; ' - 'RAY_ADDRESS=127.0.0.1:$RAY_PORT ray status') + f'RAY_ADDRESS=127.0.0.1:$RAY_PORT {constants.SKY_RAY_CMD} status') # Command that waits for the ray status to be initialized. Otherwise, a later # `sky status -r` may fail due to the ray cluster not being ready. diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index cbcdfbaf323..86e77a67270 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -94,7 +94,7 @@ f'[ -f {SKY_PYTHON_PATH_FILE} ] || which python3 > {SKY_PYTHON_PATH_FILE};') _sky_version = str(version.parse(sky.__version__)) -RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} ray status' +RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status' # Install ray and skypilot on the remote cluster if they are not already # installed. {var} will be replaced with the actual value in # backend_utils.write_cluster_config. 
diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 586649e5ef1..b631a9b6103 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -191,7 +191,7 @@ setup_commands: source ~/.bashrc; {%- if tpu_vm %} test -f ~/miniconda3/etc/profile.d/conda.sh && source ~/miniconda3/etc/profile.d/conda.sh && conda activate base || true; - pip3 install --upgrade google-api-python-client; + {{ SKY_PIP_CMD }} install --upgrade google-api-python-client; {%- endif %} {%- if tpu_node_name %} grep "export TPU_NAME=" ~/.bashrc && echo "TPU_NAME already set" || echo "export TPU_NAME={{tpu_node_name}}" >> ~/.bashrc; From 39daf35d26e235bd84bb209c9f63df358294f8fb Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 18 Mar 2024 09:29:55 +0000 Subject: [PATCH 04/11] fix patch --- sky/provision/instance_setup.py | 2 +- sky/skylet/attempt_skylet.py | 3 +-- sky/skylet/constants.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index 86bd85a1bdb..81e13f54fd0 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -448,7 +448,7 @@ def _internal_file_mounts(file_mounts: Dict, rc, stdout, stderr = runner.run(mkdir_command, log_path=log_path, - stream_logs=True, + stream_logs=False, require_outputs=True) subprocess_utils.handle_returncode( rc, diff --git a/sky/skylet/attempt_skylet.py b/sky/skylet/attempt_skylet.py index ad5c26544fe..010532b4720 100644 --- a/sky/skylet/attempt_skylet.py +++ b/sky/skylet/attempt_skylet.py @@ -13,8 +13,7 @@ def restart_skylet(): # TODO(zhwu): make the killing graceful, e.g., use a signal to tell # skylet to exit, instead of directly killing it. 
subprocess.run( - 'ps aux | grep "sky.skylet.skylet" | ' - f'grep "{constants.SKY_PYTHON_CMD} -m"' + 'ps aux | grep "sky.skylet.skylet" | grep " -m "' '| awk \'{print $2}\' | xargs kill >> ~/.sky/skylet.log 2>&1', shell=True, check=False) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 86e77a67270..2bda1234681 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -131,7 +131,7 @@ # The ray installation above can be skipped due to the existing ray cluster # for backward compatibility. In this case, we should not patch the ray # files. - f'{SKY_PIP_CMD} list | grep "ray " | grep {{SKY_REMOTE_RAY_VERSION}} 2>&1 > /dev/null ' + f'{SKY_PIP_CMD} list | grep "ray " | grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null ' f'&& {{ {SKY_PYTHON_CMD} -c "from sky.skylet.ray_patches import patch; patch()" ' '|| exit 1; };') From ef0510b64abffea6f00ebf1614eb562bdc25fd72 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 18 Mar 2024 21:07:22 +0000 Subject: [PATCH 05/11] add test for custom default env --- tests/test_smoke.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 245a0d0243e..b174e415a8c 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -682,6 +682,28 @@ def test_image_no_conda(): ) run_one_test(test) +def test_custom_default_conda_env(generic_cloud: str): + name = _get_cluster_name() + test = Test( + 'custom_default_conda_env', + [ + f'sky launch -c {name} -y --cloud {generic_cloud} examples/test_custom_default_conda_env.yaml', + f'sky status -r {name} | grep "UP"', + f'sky logs {name} 1 --status', + f'sky logs {name} 1 --no-follow | grep -P "myenv\\s+\\*"', + f'sky exec {name} examples/test_custom_default_conda_env.yaml', + f'sky logs {name} 2 --status', + f'sky autostop -y -i 0 {name}', + 'sleep 60', + f'sky status -r {name} | grep "STOPPED"', + f'sky start -y {name}', + f'sky logs {name} 2 --no-follow | grep -P "myenv\\s+\\*"', + f'sky exec {name} 
examples/test_custom_default_conda_env.yaml', + f'sky logs {name} 3 --status', + ] + ) + run_one_test(test) + # ------------ Test stale job ------------ @pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation From eaa2ac22afb6bf8fe470955ba4462077aec8fb8a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 18 Mar 2024 21:22:35 +0000 Subject: [PATCH 06/11] Fix test --- tests/test_smoke.py | 34 +++++++++---------- .../test_custom_default_conda_env.yaml | 17 ++++++++++ 2 files changed, 33 insertions(+), 18 deletions(-) create mode 100644 tests/test_yamls/test_custom_default_conda_env.yaml diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b174e415a8c..b366c9042a0 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -682,26 +682,24 @@ def test_image_no_conda(): ) run_one_test(test) + def test_custom_default_conda_env(generic_cloud: str): name = _get_cluster_name() - test = Test( - 'custom_default_conda_env', - [ - f'sky launch -c {name} -y --cloud {generic_cloud} examples/test_custom_default_conda_env.yaml', - f'sky status -r {name} | grep "UP"', - f'sky logs {name} 1 --status', - f'sky logs {name} 1 --no-follow | grep -P "myenv\\s+\\*"', - f'sky exec {name} examples/test_custom_default_conda_env.yaml', - f'sky logs {name} 2 --status', - f'sky autostop -y -i 0 {name}', - 'sleep 60', - f'sky status -r {name} | grep "STOPPED"', - f'sky start -y {name}', - f'sky logs {name} 2 --no-follow | grep -P "myenv\\s+\\*"', - f'sky exec {name} examples/test_custom_default_conda_env.yaml', - f'sky logs {name} 3 --status', - ] - ) + test = Test('custom_default_conda_env', [ + f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml', + f'sky status -r {name} | grep "UP"', + f'sky logs {name} 1 --status', + f'sky logs {name} 1 --no-follow | grep -P "myenv\\s+\\*"', + f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', + f'sky logs {name} 2 --status', + 
f'sky autostop -y -i 0 {name}', + 'sleep 60', + f'sky status -r {name} | grep "STOPPED"', + f'sky start -y {name}', + f'sky logs {name} 2 --no-follow | grep -P "myenv\\s+\\*"', + f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', + f'sky logs {name} 3 --status', + ]) run_one_test(test) diff --git a/tests/test_yamls/test_custom_default_conda_env.yaml b/tests/test_yamls/test_custom_default_conda_env.yaml new file mode 100644 index 00000000000..2ab7934b737 --- /dev/null +++ b/tests/test_yamls/test_custom_default_conda_env.yaml @@ -0,0 +1,17 @@ +resources: + cpus: 2+ + +setup: | + conda activate myenv + if [ $? -ne 0 ]; then + conda create -n myenv python=3.7 -y + conda activate myenv + fi + + grep -qxF 'conda activate myenv' ~/.bashrc || echo "conda activate myenv" >> ~/.bashrc + pip install ray==2.6.0 + +run: | + conda env list + echo hi + echo bye From 7040972414c93fd17406c561f85176c94105d1d3 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 19 Mar 2024 05:04:04 +0000 Subject: [PATCH 07/11] address comments --- sky/backends/backend_utils.py | 4 ++-- sky/execution.py | 2 +- sky/skylet/attempt_skylet.py | 6 ++++-- sky/skylet/constants.py | 8 +++++++- sky/templates/azure-ray.yml.j2 | 4 ++-- sky/templates/gcp-ray.yml.j2 | 2 +- sky/templates/ibm-ray.yml.j2 | 4 ++-- sky/templates/lambda-ray.yml.j2 | 4 ++-- sky/templates/oci-ray.yml.j2 | 4 ++-- sky/templates/scp-ray.yml.j2 | 4 ++-- sky/templates/spot-controller.yaml.j2 | 2 +- tests/test_yamls/test_custom_default_conda_env.yaml | 4 ++++ 12 files changed, 30 insertions(+), 18 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 9099e1b999b..d0092978211 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -903,8 +903,8 @@ def write_cluster_config( 'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR, 'dump_port_command': dump_port_command, # Ray version. 
- 'RAY_CMD': constants.SKY_RAY_CMD, - 'SKY_PIP_CMD': constants.SKY_PIP_CMD, + 'sky_ray_cmd': constants.SKY_RAY_CMD, + 'sky_pip_cmd': constants.SKY_PIP_CMD, 'ray_version': constants.SKY_REMOTE_RAY_VERSION, # Command for waiting ray cluster to be ready on head. 'ray_head_wait_initialized_command': diff --git a/sky/execution.py b/sky/execution.py index bac7f27a48a..7b83a57077f 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -678,7 +678,7 @@ def spot_launch( 'dag_name': dag.name, 'retry_until_up': retry_until_up, 'remote_user_config_path': remote_user_config_path, - 'SKY_PYTHON_CMD': constants.SKY_PYTHON_CMD, + 'sky_python_cmd': constants.SKY_PYTHON_CMD, **controller_utils.shared_controller_vars_to_fill( 'spot', remote_user_config_path=remote_user_config_path, diff --git a/sky/skylet/attempt_skylet.py b/sky/skylet/attempt_skylet.py index 010532b4720..7060905e007 100644 --- a/sky/skylet/attempt_skylet.py +++ b/sky/skylet/attempt_skylet.py @@ -13,6 +13,9 @@ def restart_skylet(): # TODO(zhwu): make the killing graceful, e.g., use a signal to tell # skylet to exit, instead of directly killing it. subprocess.run( + # We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep + # because we need to handle backward compatibility with the old skylet + # started before #3326, which does not use the full path to python.
'ps aux | grep "sky.skylet.skylet" | grep " -m "' '| awk \'{print $2}\' | xargs kill >> ~/.sky/skylet.log 2>&1', shell=True, @@ -27,8 +30,7 @@ def restart_skylet(): proc = subprocess.run( - f'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | ' - f'grep "{constants.SKY_PYTHON_CMD} -m"', + f'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep " -m"', shell=True, check=False) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 2bda1234681..bf4145fe048 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -23,12 +23,18 @@ SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot' SKY_REMOTE_RAY_VERSION = '2.9.3' -# Use the same Python used for installing ray and skypilot. +# We store the absolute path of the python executable (/opt/conda/bin/python3) +# in this file, so that any future internal commands that need to use python +# can use this path. This is useful for the case where the user has a custom +# conda environment as a default environment, which is not the same as the one +# used for installing SkyPilot runtime (ray and skypilot). SKY_PYTHON_PATH_FILE = '~/.sky/python_path' SKY_GET_PYTHON_PATH_CMD = f'cat {SKY_PYTHON_PATH_FILE} || which python3' +# Python executable, e.g., /opt/conda/bin/python3 SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})' _SKY_PYTHON_DIR_CMD = f'$(dirname {SKY_PYTHON_CMD})' SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip' +# Ray executable, e.g., /opt/conda/bin/ray SKY_RAY_CMD = f'{_SKY_PYTHON_DIR_CMD}/ray' # The name for the environment variable that stores the unique ID of the diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index af507e90564..f1477d92132 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -164,14 +164,14 @@ setup_commands: # current num items (num SSH connections): 2 head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. 
- - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index b631a9b6103..c3d75015bc0 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ 
b/sky/templates/gcp-ray.yml.j2 @@ -191,7 +191,7 @@ setup_commands: source ~/.bashrc; {%- if tpu_vm %} test -f ~/miniconda3/etc/profile.d/conda.sh && source ~/miniconda3/etc/profile.d/conda.sh && conda activate base || true; - {{ SKY_PIP_CMD }} install --upgrade google-api-python-client; + {{ sky_pip_cmd }} install --upgrade google-api-python-client; {%- endif %} {%- if tpu_node_name %} grep "export TPU_NAME=" ~/.bashrc && echo "TPU_NAME already set" || echo "export TPU_NAME={{tpu_node_name}}" >> ~/.bashrc; diff --git a/sky/templates/ibm-ray.yml.j2 b/sky/templates/ibm-ray.yml.j2 index c76f390d842..cb527a85a55 100644 --- a/sky/templates/ibm-ray.yml.j2 +++ b/sky/templates/ibm-ray.yml.j2 @@ -118,13 +118,13 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. 
- - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/lambda-ray.yml.j2 b/sky/templates/lambda-ray.yml.j2 index 8effb5c2297..1aaf7edaddd 100644 --- a/sky/templates/lambda-ray.yml.j2 +++ b/sky/templates/lambda-ray.yml.j2 @@ -89,13 +89,13 @@ setup_commands: # Increment the following for catching performance bugs easier: # current num items (num SSH connections): 2 
head_start_ray_commands: - - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/oci-ray.yml.j2 b/sky/templates/oci-ray.yml.j2 index a6e6bf666aa..a15a53732b1 100644 --- a/sky/templates/oci-ray.yml.j2 +++ b/sky/templates/oci-ray.yml.j2 @@ -114,13 +114,13 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. 
# Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/scp-ray.yml.j2 b/sky/templates/scp-ray.yml.j2 index e44d3b46764..42126652920 100644 --- 
a/sky/templates/scp-ray.yml.j2 +++ b/sky/templates/scp-ray.yml.j2 @@ -88,13 +88,13 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ RAY_CMD }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ RAY_CMD }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 
--pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/spot-controller.yaml.j2 b/sky/templates/spot-controller.yaml.j2 index 1d572495df4..efe20d80d01 100644 --- a/sky/templates/spot-controller.yaml.j2 +++ b/sky/templates/spot-controller.yaml.j2 @@ -18,7 +18,7 @@ setup: | # Dashboard. pip list | grep flask > /dev/null 2>&1 || pip install flask 2>&1 > /dev/null - ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.spot.dashboard.dashboard") || (nohup {{ SKY_PYTHON_CMD }} -m sky.spot.dashboard.dashboard >> ~/.sky/spot-dashboard.log 2>&1 &)); + ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.spot.dashboard.dashboard") || (nohup {{ sky_python_cmd }} -m sky.spot.dashboard.dashboard >> ~/.sky/spot-dashboard.log 2>&1 &)); run: | # Start the controller for the current spot job. diff --git a/tests/test_yamls/test_custom_default_conda_env.yaml b/tests/test_yamls/test_custom_default_conda_env.yaml index 2ab7934b737..6726d2d8c2b 100644 --- a/tests/test_yamls/test_custom_default_conda_env.yaml +++ b/tests/test_yamls/test_custom_default_conda_env.yaml @@ -8,7 +8,11 @@ setup: | conda activate myenv fi + # Set user's conda environment as default, which does not have the SkyPilot + # runtime installed. grep -qxF 'conda activate myenv' ~/.bashrc || echo "conda activate myenv" >> ~/.bashrc + # Further install the older version of Ray to test SkyPilot being able to use + # the original environment to submit jobs. 
pip install ray==2.6.0 run: | From 8a03505fd2b50b7ba553976fb2e502eb7c7ad385 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 19 Mar 2024 05:05:22 +0000 Subject: [PATCH 08/11] format --- sky/skylet/attempt_skylet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/skylet/attempt_skylet.py b/sky/skylet/attempt_skylet.py index 7060905e007..609cfa09141 100644 --- a/sky/skylet/attempt_skylet.py +++ b/sky/skylet/attempt_skylet.py @@ -30,7 +30,7 @@ def restart_skylet(): proc = subprocess.run( - f'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep " -m"', + 'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep " -m"', shell=True, check=False) From 217cde68e46776ddfafb52e9a40e15ed6c44cb84 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 19 Mar 2024 06:02:33 +0000 Subject: [PATCH 09/11] Fix backward compat --- sky/skylet/constants.py | 2 +- sky/skylet/log_lib.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index bf4145fe048..65e23473993 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -29,7 +29,7 @@ # conda environment as a default environment, which is not the same as the one # used for installing SkyPilot runtime (ray and skypilot). 
SKY_PYTHON_PATH_FILE = '~/.sky/python_path' -SKY_GET_PYTHON_PATH_CMD = f'cat {SKY_PYTHON_PATH_FILE} || which python3' +SKY_GET_PYTHON_PATH_CMD = f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || which python3' # Python executable, e.g., /opt/conda/bin/python3 SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})' _SKY_PYTHON_DIR_CMD = f'$(dirname {SKY_PYTHON_CMD})' diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index edd7d932e69..c6e78e7264b 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -184,7 +184,16 @@ def run_with_log( daemon_script = os.path.join( os.path.dirname(os.path.abspath(job_lib.__file__)), 'subprocess_daemon.py') - python_path = subprocess.check_output( + if not hasattr(constants, 'SKY_GET_PYTHON_PATH_CMD'): + # Backward compatibility: for cluster started before #3326, this + # constant does not exist. Since we generate the job script + # in backends.cloud_vm_ray_backend with inspect, so the + # latest `run_with_log` will be used, but the `constants` is + # not updated. We fall back to `python3` in this case. + # TODO(zhwu): remove this after 0.7.0. + python_path = 'python3' + else: + python_path = subprocess.check_output( constants.SKY_GET_PYTHON_PATH_CMD, shell=True, stderr=subprocess.DEVNULL, encoding='utf-8').strip() daemon_cmd = [ - constants.SKY_PYTHON_CMD, + python_path, daemon_script, '--parent-pid', str(parent_pid), From bd9416f30a349906e24e9065e6acce794d4b51bd Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 19 Mar 2024 06:09:30 +0000 Subject: [PATCH 10/11] format --- sky/skylet/constants.py | 3 ++- sky/skylet/log_lib.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 65e23473993..d3b991ab7ac 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -29,7 +29,8 @@ # conda environment as a default environment, which is not the same as the one # used for installing SkyPilot runtime (ray and skypilot).
SKY_PYTHON_PATH_FILE = '~/.sky/python_path' -SKY_GET_PYTHON_PATH_CMD = f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || which python3' +SKY_GET_PYTHON_PATH_CMD = (f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || ' + 'which python3') # Python executable, e.g., /opt/conda/bin/python3 SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})' _SKY_PYTHON_DIR_CMD = f'$(dirname {SKY_PYTHON_CMD})' diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index c6e78e7264b..a1d040dbcdb 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -194,10 +194,10 @@ def run_with_log( python_path = 'python3' else: python_path = subprocess.check_output( - constants.SKY_GET_PYTHON_PATH_CMD, - shell=True, - stderr=subprocess.DEVNULL, - encoding='utf-8').strip() + constants.SKY_GET_PYTHON_PATH_CMD, + shell=True, + stderr=subprocess.DEVNULL, + encoding='utf-8').strip() daemon_cmd = [ python_path, daemon_script, From f0c2c23b7cf5269e11312723c5e0b7fad3f039d6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 18 Mar 2024 23:09:35 -0700 Subject: [PATCH 11/11] Update sky/backends/backend_utils.py Co-authored-by: Zongheng Yang --- sky/backends/backend_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index d0092978211..850dc2009ca 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -902,7 +902,7 @@ def write_cluster_config( 'ray_dashboard_port': constants.SKY_REMOTE_RAY_DASHBOARD_PORT, 'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR, 'dump_port_command': dump_port_command, - # Ray version. + # Sky-internal constants. 'sky_ray_cmd': constants.SKY_RAY_CMD, 'sky_pip_cmd': constants.SKY_PIP_CMD, 'ray_version': constants.SKY_REMOTE_RAY_VERSION,