
[Core] Add ~/.local/bin to make `which ray` work if ray is installed in ~/.local #3368

Merged · 12 commits · Apr 15, 2024
21 changes: 17 additions & 4 deletions sky/skylet/constants.py
@@ -30,13 +30,15 @@
 # used for installing SkyPilot runtime (ray and skypilot).
 SKY_PYTHON_PATH_FILE = '~/.sky/python_path'
 SKY_RAY_PATH_FILE = '~/.sky/ray_path'
-SKY_GET_PYTHON_PATH_CMD = (f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
+SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
+                           f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
                            'which python3')
 # Python executable, e.g., /opt/conda/bin/python3
 SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
 SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
 # Ray executable, e.g., /opt/conda/bin/ray
-SKY_RAY_CMD = f'$(cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)'
+SKY_RAY_CMD = (f'$([ -s {SKY_RAY_PATH_FILE} ] && '
+               f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')

 # The name for the environment variable that stores the unique ID of the
 # current task. This will stay the same across multiple recoveries of the
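
Note on the `[ -s ... ]` guard: `cat` exits 0 on an empty file, so the old `cat ... || which ray` fallback never fires when the path file exists but is empty; the file has to be missing entirely for `||` to trigger. A minimal shell sketch (the path is illustrative):

    echo -n > /tmp/ray_path                                 # empty path file, as the old code could leave behind
    old=$(cat /tmp/ray_path 2> /dev/null || which ray)      # cat succeeds on the empty file, so the fallback is skipped
    echo "old: '$old'"                                      # old: '' -- the ray command expands to an empty string
    new=$([ -s /tmp/ray_path ] && cat /tmp/ray_path 2> /dev/null || which ray)
    echo "new: '$new'"                                      # new: '/usr/local/bin/ray' (wherever ray resolves)

Because `A && B || C` runs C whenever A or B fails, the new form falls back to `which ray` both when the file is missing and when it is empty.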
@@ -108,6 +110,8 @@
 # backend_utils.write_cluster_config.
 RAY_SKYPILOT_INSTALLATION_COMMANDS = (
     'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;'
+    # Print the PATH in provision.log to help debug PATH issues.
+    'echo PATH=$PATH; '
     # Backward compatibility for ray upgrade (#3248): do not upgrade ray if the
     # ray cluster is already running, to avoid the ray cluster being restarted.
     #
@@ -124,9 +128,18 @@
     f'{SKY_PIP_CMD} list | grep "ray " | '
     f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
     f'|| {RAY_STATUS} || '
-    f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION};'  # pylint: disable=line-too-long
+    f'{SKY_PIP_CMD} install --exists-action w -U ray[default]=={SKY_REMOTE_RAY_VERSION}; '  # pylint: disable=line-too-long
+    # In some envs, e.g. when pip does not have permission to write under
+    # /opt/conda, the ray package will be installed under ~/.local/bin. If the
+    # user's PATH does not include ~/.local/bin (the pip install prints
+    # `WARNING: The scripts ray, rllib, serve and tune are installed in
+    # '~/.local/bin' which is not on PATH.`), `which ray` finds nothing,
+    # causing an empty SKY_RAY_PATH_FILE later.
+    #
+    # Here, we append ~/.local/bin to the PATH to make sure the issues
+    # mentioned above are resolved.
+    'export PATH=$PATH:$HOME/.local/bin; '
Member commented:
Do I understand correctly:

  • The issue is in some envs, our own Ray is by default installed into ~/.local/bin
  • In that env, this path is not in $PATH by default
  • So, previously we touch the SKY_RAY_PATH_FILE file, which is empty
  • Bug is, later on when we do our_ray start, our_ray is basically empty string?

Collaborator (Author) replied:
Yes, this is exactly the issue. Tried to elaborate the comments a bit. PTAL : )

     # Writes ray path to file if it does not exist or the file is empty.
-    f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE};'
+    f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE}; '
     # END ray package check and installation
     f'{{ {SKY_PIP_CMD} list | grep "skypilot " && '
     '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || '  # pylint: disable=line-too-long
12 changes: 9 additions & 3 deletions sky/spot/controller.py
@@ -60,9 +60,15 @@ def __init__(self, job_id: int, dag_yaml: str,
         # the user can have the same id for multiple recoveries.
         # Example value: sky-2022-10-04-22-46-52-467694_my-spot-name_spot_id-17-0
         job_id_env_vars = []
-        for i in range(len(self._dag.tasks)):
-            task_name = self._dag_name
-            if len(self._dag.tasks) > 1:
+        for i, task in enumerate(self._dag.tasks):
+            if len(self._dag.tasks) <= 1:
+                task_name = self._dag_name
+            else:
+                task_name = task.name
+                # This is guaranteed by the spot_launch API, where we fill in
+                # the task.name with
+                # dag_utils.maybe_infer_and_fill_dag_and_task_names.
+                assert task_name is not None, self._dag
+                task_name = f'{self._dag_name}_{task_name}'
             job_id_env_var = common_utils.get_global_job_id(
                 self._backend.run_timestamp,
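
The effect of the controller change, mirrored as a small shell sketch (dag and task names are made up): a single-task dag keeps the dag name, while each task of a pipeline gets `<dag_name>_<task_name>`, so per-task job ids no longer collide across tasks:

    DAG_NAME='my-pipeline'
    TASKS=('train' 'eval')                  # hypothetical task names
    for task in "${TASKS[@]}"; do
        if [ "${#TASKS[@]}" -le 1 ]; then
            name="$DAG_NAME"                # single task: keep the dag name
        else
            name="${DAG_NAME}_${task}"      # pipeline: disambiguate per task
        fi
        echo "$name"                        # my-pipeline_train, my-pipeline_eval
    done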
25 changes: 14 additions & 11 deletions tests/backward_compatibility_tests.sh
@@ -13,7 +13,7 @@
 # bash tests/backward_compatibility_tests.sh

 #!/bin/bash
-set -ev
+set -evx

 need_launch=${1:-0}
 start_from=${2:-0}
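
For reference, the newly added -x flag makes bash print each command after expansion, which is what surfaces variable values such as PATH in the CI logs. A toy example:

    set -x
    greeting=hi
    echo "$greeting"
    # trace output:
    # + greeting=hi
    # + echo hi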
@@ -160,30 +160,33 @@ fi

 # Test spot jobs to make sure existing jobs and new job can run correctly, after
 # the spot controller is updated.
+# Get a new uuid to avoid conflict with previous back-compat tests.
+uuid=$(uuidgen)
+SPOT_JOB_JOB_NAME=${CLUSTER_NAME}-${uuid:0:4}
 if [ "$start_from" -le 7 ]; then
 conda activate sky-back-compat-master
 rm -r ~/.sky/wheels || true
-sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${CLUSTER_NAME}-7-0 "echo hi; sleep 1000"
-sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${CLUSTER_NAME}-7-1 "echo hi; sleep 300"
+sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${SPOT_JOB_JOB_NAME}-7-0 "echo hi; sleep 1000"
+sky spot launch -d --cloud ${CLOUD} -y --cpus 2 -n ${SPOT_JOB_JOB_NAME}-7-1 "echo hi; sleep 300"
 conda activate sky-back-compat-current
 rm -r ~/.sky/wheels || true
-s=$(sky spot logs --no-follow -n ${CLUSTER_NAME}-7-1)
+s=$(sky spot logs --no-follow -n ${SPOT_JOB_JOB_NAME}-7-1)
 echo "$s"
 echo "$s" | grep " hi" || exit 1
-sky spot launch -d --cloud ${CLOUD} -y -n ${CLUSTER_NAME}-7-2 "echo hi; sleep 60"
-s=$(sky spot logs --no-follow -n ${CLUSTER_NAME}-7-2)
+sky spot launch -d --cloud ${CLOUD} -y -n ${SPOT_JOB_JOB_NAME}-7-2 "echo hi; sleep 40"
+s=$(sky spot logs --no-follow -n ${SPOT_JOB_JOB_NAME}-7-2)
 echo "$s"
 echo "$s" | grep " hi" || exit 1
-s=$(sky spot queue | grep ${CLUSTER_NAME}-7)
+s=$(sky spot queue | grep ${SPOT_JOB_JOB_NAME}-7)
 echo "$s"
 echo "$s" | grep "RUNNING" | wc -l | grep 3 || exit 1
-sky spot cancel -y -n ${CLUSTER_NAME}-7-0
-sky spot logs -n "${CLUSTER_NAME}-7-1"
-s=$(sky spot queue | grep ${CLUSTER_NAME}-7)
+sky spot cancel -y -n ${SPOT_JOB_JOB_NAME}-7-0
+sky spot logs -n "${SPOT_JOB_JOB_NAME}-7-1"
+s=$(sky spot queue | grep ${SPOT_JOB_JOB_NAME}-7)
 echo "$s"
 echo "$s" | grep "SUCCEEDED" | wc -l | grep 2 || exit 1
 echo "$s" | grep "CANCELLED" | wc -l | grep 1 || exit 1
 fi

 sky down ${CLUSTER_NAME}* -y
-sky spot cancel -n ${CLUSTER_NAME}* -y
+sky spot cancel -n ${SPOT_JOB_JOB_NAME}* -y
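
The new job-name prefix relies on two small pieces of bash, sketched here: command substitution around `uuidgen` and substring expansion to keep the name short (the output is illustrative):

    uuid=$(uuidgen)                      # e.g. 1b4e28ba-2fa1-11d2-883f-0016d3cca427
    echo "${uuid:0:4}"                   # substring expansion: first four characters, e.g. 1b4e
    echo "my-cluster-${uuid:0:4}-7-0"    # hypothetical resulting job name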
6 changes: 3 additions & 3 deletions tests/test_smoke.py
@@ -2169,7 +2169,7 @@ def test_spot(generic_cloud: str):
 @pytest.mark.no_kubernetes  # Kubernetes does not have a notion of spot instances
 @pytest.mark.managed_spot
 def test_spot_pipeline(generic_cloud: str):
-    """Test a spot pipeline."""
+    """Test a spot pipeline."""
     name = _get_cluster_name()
     test = Test(
         'spot-pipeline',
@@ -2358,7 +2358,7 @@ def test_spot_pipeline_recovery_aws(aws_config_region):
         # separated by `-`.
         (
             f'SPOT_JOB_ID=`cat /tmp/{name}-run-id | rev | '
-            'cut -d\'-\' -f2 | rev`;'
+            'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`;'
             f'aws ec2 terminate-instances --region {region} --instance-ids $('
             f'aws ec2 describe-instances --region {region} '
             # TODO(zhwu): fix the name for spot cluster.
@@ -2407,7 +2407,7 @@ def test_spot_pipeline_recovery_gcp():
         # SKYPILOT_TASK_ID, which gets the second to last field
         # separated by `-`.
         (f'SPOT_JOB_ID=`cat /tmp/{name}-run-id | rev | '
-         f'cut -d\'-\' -f2 | rev`;{terminate_cmd}'),
+         f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'),
         'sleep 60',
         f'{_SPOT_QUEUE_WAIT}| grep {name} | head -n1 | grep "RECOVERING"',
         'sleep 200',