[Core] Add ~/.local/bin to make `which ray` work if ray is installed in `~/.local` (#3368)

* save ray path

* only use path in file when non empty

* try grepping but fail

* hardcode ~/.local/bin

* format

* add comments

* Add comments

* format

* avoid backward compat test conflict

* fix sleep

* Fix the task ID for spot pipeline
Michaelvll authored Apr 15, 2024
1 parent e60eb73 commit 91d6d1b
Showing 4 changed files with 43 additions and 21 deletions.
21 changes: 17 additions & 4 deletions sky/skylet/constants.py
@@ -30,13 +30,15 @@
 # used for installing SkyPilot runtime (ray and skypilot).
 SKY_PYTHON_PATH_FILE = '~/.sky/python_path'
 SKY_RAY_PATH_FILE = '~/.sky/ray_path'
-SKY_GET_PYTHON_PATH_CMD = (f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
+SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
+                           f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
                            'which python3')
 # Python executable, e.g., /opt/conda/bin/python3
 SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
 SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
 # Ray executable, e.g., /opt/conda/bin/ray
-SKY_RAY_CMD = f'$(cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)'
+SKY_RAY_CMD = (f'$([ -s {SKY_RAY_PATH_FILE} ] && '
+               f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')

 # The name for the environment variable that stores the unique ID of the
 # current task. This will stay the same across multiple recoveries of the
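
The `[ -s FILE ]` guard matters because `cat` on a file that exists but is empty still exits 0, so the old `|| which ray` fallback never fired and the command substitution expanded to an empty string. A minimal standalone sketch of the difference, using a scratch file instead of the real ~/.sky/ray_path:

    RAY_PATH_FILE=/tmp/ray_path_demo
    : > "$RAY_PATH_FILE"    # create an empty file, mimicking a failed `which ray > ...`
    # Old behavior: `cat` succeeds on the empty file, so `which ray` never runs
    # and the substitution is empty.
    echo "old: $(cat $RAY_PATH_FILE 2> /dev/null || which ray)"
    # New behavior: `[ -s ... ]` fails on the empty file, so the command falls
    # back to whatever `which ray` finds on PATH (possibly nothing).
    echo "new: $([ -s $RAY_PATH_FILE ] && cat $RAY_PATH_FILE 2> /dev/null || which ray)"
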
@@ -108,6 +110,8 @@
 # backend_utils.write_cluster_config.
 RAY_SKYPILOT_INSTALLATION_COMMANDS = (
     'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;'
+    # Print the PATH in provision.log to help debug PATH issues.
+    'echo PATH=$PATH; '
     # Backward compatibility for ray upgrade (#3248): do not upgrade ray if the
     # ray cluster is already running, to avoid the ray cluster being restarted.
     #
@@ -124,9 +128,18 @@
     f'{SKY_PIP_CMD} list | grep "ray " | '
     f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
     f'|| {RAY_STATUS} || '
-    f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION};' # pylint: disable=line-too-long
+    f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
+    # In some envs, e.g. pip does not have permission to write under /opt/conda
+    # ray package will be installed under ~/.local/bin. If the user's PATH does
+    # not include ~/.local/bin (the pip install will have the output: `WARNING:
+    # The scripts ray, rllib, serve and tune are installed in '~/.local/bin'
+    # which is not on PATH.`), causing an empty SKY_RAY_PATH_FILE later.
+    #
+    # Here, we add ~/.local/bin to the end of the PATH to make sure the issues
+    # mentioned above are resolved.
+    'export PATH=$PATH:$HOME/.local/bin; '
     # Writes ray path to file if it does not exist or the file is empty.
-    f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE};'
+    f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE}; '
     # END ray package check and installation
     f'{{ {SKY_PIP_CMD} list | grep "skypilot " && '
     '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long
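
The scenario the new comments describe can be reproduced roughly as below; the pip command and the scratch file name are illustrative assumptions, not the exact provisioning steps:

    # When pip cannot write to the system/conda site-packages, a user-level
    # install (e.g. `python3 -m pip install --user 'ray[default]'`) drops the
    # `ray` script into ~/.local/bin and prints the WARNING quoted in the
    # comment above.
    which ray > /tmp/ray_path_demo || true   # empty file if ~/.local/bin is not on PATH
    # The fix: append (not prepend) ~/.local/bin, so an existing ray on PATH still wins.
    export PATH=$PATH:$HOME/.local/bin
    # Retry only if the recorded path is still empty; the file then holds
    # something like /home/<user>/.local/bin/ray.
    [ -s /tmp/ray_path_demo ] || which ray > /tmp/ray_path_demo
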
12 changes: 9 additions & 3 deletions sky/spot/controller.py
@@ -60,9 +60,15 @@ def __init__(self, job_id: int, dag_yaml: str,
         # the user can have the same id for multiple recoveries.
         # Example value: sky-2022-10-04-22-46-52-467694_my-spot-name_spot_id-17-0
         job_id_env_vars = []
-        for i in range(len(self._dag.tasks)):
-            task_name = self._dag_name
-            if len(self._dag.tasks) > 1:
+        for i, task in enumerate(self._dag.tasks):
+            if len(self._dag.tasks) <= 1:
+                task_name = self._dag_name
+            else:
+                task_name = task.name
+                # This is guaranteed by the spot_launch API, where we fill in
+                # the task.name with
+                # dag_utils.maybe_infer_and_fill_dag_and_task_names.
+                assert task_name is not None, self._dag
                 task_name = f'{self._dag_name}_{task_name}'
             job_id_env_var = common_utils.get_global_job_id(
                 self._backend.run_timestamp,
25 changes: 14 additions & 11 deletions tests/backward_compatibility_tests.sh
@@ -13,7 +13,7 @@
 # bash tests/backward_compatibility_tests.sh

 #!/bin/bash
-set -ev
+set -evx

 need_launch=${1:-0}
 start_from=${2:-0}
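
As a quick reference, the extra flag is plain bash tracing, nothing SkyPilot-specific:

    #!/bin/bash
    # -e: exit as soon as a command fails
    # -v: echo each script line as it is read
    # -x: additionally trace every command after expansion, e.g. `+ echo hi`,
    #     which is what the back-compat run now logs to ease debugging
    set -evx
    echo hi
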
@@ -160,30 +160,33 @@ fi

 # Test spot jobs to make sure existing jobs and new job can run correctly, after
 # the spot controller is updated.
+# Get a new uuid to avoid conflict with previous back-compat tests.
+uuid=$(uuidgen)
+SPOT_JOB_JOB_NAME=${CLUSTER_NAME}-${uuid:0:4}
 if [ "$start_from" -le 7 ]; then
 conda activate sky-back-compat-master
 rm -r ~/.sky/wheels || true
-sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${CLUSTER_NAME}-7-0 "echo hi; sleep 1000"
-sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${CLUSTER_NAME}-7-1 "echo hi; sleep 300"
+sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${SPOT_JOB_JOB_NAME}-7-0 "echo hi; sleep 1000"
+sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${SPOT_JOB_JOB_NAME}-7-1 "echo hi; sleep 300"
 conda activate sky-back-compat-current
 rm -r ~/.sky/wheels || true
-s=$(sky spot logs --no-follow -n ${CLUSTER_NAME}-7-1)
+s=$(sky spot logs --no-follow -n ${SPOT_JOB_JOB_NAME}-7-1)
 echo "$s"
 echo "$s" | grep " hi" || exit 1
-sky spot launch -d --cloud ${CLOUD} -y -n ${CLUSTER_NAME}-7-2 "echo hi; sleep 60"
-s=$(sky spot logs --no-follow -n ${CLUSTER_NAME}-7-2)
+sky spot launch -d --cloud ${CLOUD} -y -n ${SPOT_JOB_JOB_NAME}-7-2 "echo hi; sleep 40"
+s=$(sky spot logs --no-follow -n ${SPOT_JOB_JOB_NAME}-7-2)
 echo "$s"
 echo "$s" | grep " hi" || exit 1
-s=$(sky spot queue | grep ${CLUSTER_NAME}-7)
+s=$(sky spot queue | grep ${SPOT_JOB_JOB_NAME}-7)
 echo "$s"
 echo "$s" | grep "RUNNING" | wc -l | grep 3 || exit 1
-sky spot cancel -y -n ${CLUSTER_NAME}-7-0
-sky spot logs -n "${CLUSTER_NAME}-7-1"
-s=$(sky spot queue | grep ${CLUSTER_NAME}-7)
+sky spot cancel -y -n ${SPOT_JOB_JOB_NAME}-7-0
+sky spot logs -n "${SPOT_JOB_JOB_NAME}-7-1"
+s=$(sky spot queue | grep ${SPOT_JOB_JOB_NAME}-7)
 echo "$s"
 echo "$s" | grep "SUCCEEDED" | wc -l | grep 2 || exit 1
 echo "$s" | grep "CANCELLED" | wc -l | grep 1 || exit 1
 fi

 sky down ${CLUSTER_NAME}* -y
-sky spot cancel -n ${CLUSTER_NAME}* -y
+sky spot cancel -n ${SPOT_JOB_JOB_NAME}* -y
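
The renaming above keeps the spot job names unique across reruns; a self-contained sketch of the scheme (CLUSTER_NAME is a placeholder here):

    CLUSTER_NAME=back-compat-test
    # Take the first 4 hex characters of a fresh uuid so a rerun of the test
    # cannot collide with spot jobs left over from a previous run.
    uuid=$(uuidgen)
    SPOT_JOB_JOB_NAME=${CLUSTER_NAME}-${uuid:0:4}
    echo "${SPOT_JOB_JOB_NAME}-7-0"   # e.g. back-compat-test-1a2b-7-0
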
6 changes: 3 additions & 3 deletions tests/test_smoke.py
@@ -2169,7 +2169,7 @@ def test_spot(generic_cloud: str):
 @pytest.mark.no_kubernetes # Kubernetes does not have a notion of spot instances
 @pytest.mark.managed_spot
 def test_spot_pipeline(generic_cloud: str):
-    """Test a spot pipeline."""
+    """Test a spot pipeline."""
     name = _get_cluster_name()
     test = Test(
         'spot-pipeline',
@@ -2358,7 +2358,7 @@ def test_spot_pipeline_recovery_aws(aws_config_region):
             # separated by `-`.
             (
                 f'SPOT_JOB_ID=`cat /tmp/{name}-run-id | rev | '
-                'cut -d\'-\' -f2 | rev`;'
+                'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`;'
                 f'aws ec2 terminate-instances --region {region} --instance-ids $('
                 f'aws ec2 describe-instances --region {region} '
                 # TODO(zhwu): fix the name for spot cluster.
@@ -2407,7 +2407,7 @@ def test_spot_pipeline_recovery_gcp():
             # SKYPILOT_TASK_ID, which gets the second to last field
             # separated by `-`.
             (f'SPOT_JOB_ID=`cat /tmp/{name}-run-id | rev | '
-             f'cut -d\'-\' -f2 | rev`;{terminate_cmd}'),
+             f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'),
             'sleep 60',
             f'{_SPOT_QUEUE_WAIT}| grep {name} | head -n1 | grep "RECOVERING"',
             'sleep 200',
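
The reworked extraction takes the text after the last `_` and then the text before the first `-`, which yields the numeric spot job id. A standalone check with a made-up task ID in the format suggested by the controller.py comment above (the concrete value is an assumption, not captured output):

    # Hypothetical SKYPILOT_TASK_ID-style value; `17` stands in for the spot
    # job id and `0` for the task index within the pipeline.
    task_id='sky-2022-10-04-22-46-52-467694_my-spot-name_17-0'
    # rev | cut -d'_' -f1 | rev  -> everything after the last '_'   => 17-0
    # cut -d'-' -f1              -> everything before the first '-' => 17
    SPOT_JOB_ID=$(echo "$task_id" | rev | cut -d'_' -f1 | rev | cut -d'-' -f1)
    echo "$SPOT_JOB_ID"   # prints 17
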
