Skip to content

Commit

Permalink
[Onprem] Automatically install sky dependencies (#1116)
Browse files Browse the repository at this point in the history
* Remove root user, move ray cluster to admin

* Automatically install sky dependencies

* Fix admin alignment

* Fix PR

* Address Romil's comments

* F

* Addressed Romil's comments
  • Loading branch information
michaelzhiluo authored Oct 10, 2022
1 parent a48ad8c commit bd4f929
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 19 deletions.
54 changes: 37 additions & 17 deletions sky/backends/onprem_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from sky import global_user_state
from sky import sky_logging
from sky.backends import backend_utils
from sky.skylet import constants
from sky.utils import command_runner
from sky.utils import common_utils
from sky.utils import schemas
Expand Down Expand Up @@ -153,8 +154,9 @@ def get_local_cluster_config_or_error(cluster_name: str) -> Dict[str, Any]:
raise ValueError(f'Cluster config {local_file} not found.')


def check_local_installation(ips: List[str], auth_config: Dict[str, str]):
"""Checks if the Sky dependencies are properly installed on the machine.
def check_and_install_local_env(ips: List[str], auth_config: Dict[str, str]):
"""Checks if SkyPilot dependencies are present on the machine. Installs
them if not already installed.
This function checks for the following dependencies on the root user:
- Sky
Expand All @@ -174,29 +176,47 @@ def check_local_installation(ips: List[str], auth_config: Dict[str, str]):
ssh_credentials = (ssh_user, ssh_key, 'sky-admin-deploy')
runners = command_runner.SSHCommandRunner.make_runner_list(
ips, *ssh_credentials)
sky_ray_version = constants.SKY_REMOTE_RAY_VERSION

def _check_dependencies(runner: command_runner.SSHCommandRunner) -> None:
# Checks for global python3 installation.
def _install_and_check_dependencies(
runner: command_runner.SSHCommandRunner) -> None:
# Checks for python3 installation.
backend_utils.run_command_and_handle_ssh_failure(
runner,
'python3 --version',
runner, ('python3 --version'),
failure_message=f'Python3 is not installed on {runner.ip}.')

# Checks for global Ray installation (accessible by all users).
# Checks for pip3 installation.
backend_utils.run_command_and_handle_ssh_failure(
runner,
'ray --version',
runner, ('pip3 --version'),
failure_message=f'Pip3 is not installed on {runner.ip}.')

# If Ray does not exist, installs Ray.
backend_utils.run_command_and_handle_ssh_failure(
runner, (f'ray --version || '
f'(pip3 install ray[default]=={sky_ray_version})'),
failure_message=f'Ray is not installed on {runner.ip}.')

# Checks for global Sky installation (accessible by all users). When
# Sky's job submission code is ran on a user's account, Sky calls the
# Ray cluster to prepare the user's job. Due to Ray's limitations,
# this is ran under the admin's environment, which requires Sky to be
# installed globally.
# TODO(mluo): Make Sky admin only.
# If Ray exists, check Ray version. If the version does not match
# raise an error.
backend_utils.run_command_and_handle_ssh_failure(
runner,
f'ray --version | grep {sky_ray_version}',
failure_message=(
f'Ray (on {runner.ip}) does not match skypilot\'s'
f' requirement for ray=={sky_ray_version}.'
f' Make sure that the correct version of ray is installed.'))

# Checks for Sky installation. When Sky's job submission code is run
# on a user's account, Sky calls the Ray cluster to prepare the user's
# job. Due to Ray's limitations, this is run under the admin's
# environment, which requires Sky to be installed globally. NOTE: This
# package is installed from PyPI and may not contain any changes made
# since the last SkyPilot release. If required, please install
# SkyPilot from source on the onprem machine(s) before running sky
# admin deploy.
backend_utils.run_command_and_handle_ssh_failure(
runner,
'sky --help',
'sky --help || (pip3 install skypilot)',
failure_message=f'Sky is not installed on {runner.ip}.')

# Patches global Ray.
Expand All @@ -205,7 +225,7 @@ def _check_dependencies(runner: command_runner.SSHCommandRunner) -> None:
'import patch; patch()"'),
failure_message=f'Failed to patch ray on {runner.ip}.')

subprocess_utils.run_in_parallel(_check_dependencies, runners)
subprocess_utils.run_in_parallel(_install_and_check_dependencies, runners)


def get_local_cluster_accelerators(
Expand Down
4 changes: 2 additions & 2 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2403,10 +2403,10 @@ def admin_deploy(clusterspec_yaml: str):
len(ips), sky.Resources(sky.Local()))

# Check for Ray
click.secho(f'[{steps}/4] Checking on-premise environment\n',
click.secho(f'[{steps}/4] Installing on-premise dependencies\n',
fg='green',
nl=False)
onprem_utils.check_local_installation(ips, auth_config)
onprem_utils.check_and_install_local_env(ips, auth_config)
steps += 1

# Detect what GPUs the cluster has (which can be heterogeneous)
Expand Down

0 comments on commit bd4f929

Please sign in to comment.