Skip to content

Commit

Permalink
[FluidStack] PR Changes: Reformat files
Browse files Browse the repository at this point in the history
* Reformat after pull from skypilot/master

* Implement multi-node support for FluidStack

* Use CUDA installation on plain distro image

* Removed `check_disk_tier_enabled` from `fluidstack.py`
  • Loading branch information
mjibril committed Feb 19, 2024
1 parent 366c380 commit fc2ae25
Show file tree
Hide file tree
Showing 11 changed files with 49 additions and 39 deletions.
2 changes: 1 addition & 1 deletion sky/authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ def setup_fluidstack_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
client = fluidstack_utils.FluidstackClient()
public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
public_key = None
with open(public_key_path, 'r') as f:
with open(public_key_path, 'r', encoding='utf-8') as f:
public_key = f.read()
client.get_or_add_ssh_key(public_key)
config['auth']['ssh_public_key'] = PUBLIC_SSH_KEY_PATH
Expand Down
2 changes: 1 addition & 1 deletion sky/clouds/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# isort: split
from sky.clouds.aws import AWS
from sky.clouds.azure import Azure
from sky.clouds.fluidstack import Fluidstack
from sky.clouds.cudo import Cudo
from sky.clouds.fluidstack import Fluidstack
from sky.clouds.gcp import GCP
from sky.clouds.ibm import IBM
from sky.clouds.kubernetes import Kubernetes
Expand Down
19 changes: 5 additions & 14 deletions sky/clouds/fluidstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import requests

from sky import clouds
from sky import exceptions
from sky import status_lib
from sky.clouds import service_catalog
from sky.provision.fluidstack import fluidstack_utils
Expand Down Expand Up @@ -37,8 +36,8 @@ class Fluidstack(clouds.Cloud):

_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.STOP:
'FluidStack cloud does not support'
' stopping VMs. for all DCs',
'Stopping clusters in FluidStack'
' is not supported in SkyPilot',
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
'Migrating '
f'disk is not supported in {_REPR}.',
Expand All @@ -50,7 +49,7 @@ class Fluidstack(clouds.Cloud):
f'is not supported for {_REPR}.',
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
'Custom disk tiers'
f' are not supported in {_REPR}.',
f' is not supported in {_REPR}.',
clouds.CloudImplementationFeatures.OPEN_PORTS:
'Opening ports'
f'is not supported in {_REPR}.',
Expand Down Expand Up @@ -200,15 +199,13 @@ def make_deploy_resources_variables(
sudo apt-get -y install cuda-toolkit-12-3;
sudo apt-get install -y cuda-drivers;
sudo apt-get install -y python3-pip;
nvidia-smi;"""
nvidia-smi || sudo reboot;"""
return {
'instance_type': resources.instance_type,
'custom_resources': custom_resources,
'region': region.name,
'fluidstack_username': self.default_username(region.name),
'cuda_installation_commands':
cuda_installation_commands
if not fluidstack_utils.with_nvidia_drivers(region) else ''
'cuda_installation_commands': cuda_installation_commands,
}

def _get_feasible_launchable_resources(
Expand Down Expand Up @@ -304,12 +301,6 @@ def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
zone,
clouds='fluidstack')

@classmethod
def check_disk_tier_enabled(cls, instance_type: Optional[str],
disk_tier: DiskTier) -> None:
raise exceptions.NotSupportedError(
'FluidStack does not support disk tiers.')

@classmethod
def default_username(cls, region: str) -> str:
return {
Expand Down
2 changes: 1 addition & 1 deletion sky/clouds/service_catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

CloudFilter = Optional[Union[List[str], str]]
ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
'kubernetes', 'runpod', 'vsphere', 'cudo','fluidstack')
'kubernetes', 'runpod', 'vsphere', 'cudo', 'fluidstack')


def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs):
Expand Down
5 changes: 3 additions & 2 deletions sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ def create_catalog(output_dir: str) -> None:
plans = plans + [
plan for plan in custom_plans for plan in plans_from_custom_plan(plan)
]
with open(os.path.join(output_dir, 'vms.csv'), mode='w') as f:
with open(os.path.join(output_dir, 'vms.csv'), mode='w',
encoding='utf-8') as f:
writer = csv.writer(f, delimiter=',', quotechar='"')
writer.writerow([
'InstanceType',
Expand Down Expand Up @@ -153,5 +154,5 @@ def create_catalog(output_dir: str) -> None:
if __name__ == '__main__':

os.makedirs('fluidstack', exist_ok=True)
create_catalog('fluidstack/vms.csv')
create_catalog('fluidstack')
print('Fluidstack catalog saved to {}/vms.csv'.format('fluidstack'))
2 changes: 2 additions & 0 deletions sky/clouds/service_catalog/fluidstack_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,10 @@ def list_accelerators(
quantity_filter: Optional[int],
case_sensitive: bool = True,
all_regions: bool = False,
require_price: bool = True,
) -> Dict[str, List[common.InstanceTypeInfo]]:
"""Returns all instance types in Fluidstack offering GPUs."""
del require_price
return common.list_accelerators_impl('Fluidstack', _df, gpus_only,
name_filter, region_filter,
quantity_filter, case_sensitive,
Expand Down
2 changes: 1 addition & 1 deletion sky/provision/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from sky.provision import aws
from sky.provision import azure
from sky.provision import common
from sky.provision import fluidstack
from sky.provision import cudo
from sky.provision import fluidstack
from sky.provision import gcp
from sky.provision import kubernetes
from sky.provision import runpod
Expand Down
7 changes: 4 additions & 3 deletions sky/provision/fluidstack/fluidstack_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def get_key_suffix():


def read_contents(path: str) -> str:
with open(path, mode='r') as f:
with open(path, mode='r', encoding='utf-8') as f:
return f.read().strip()


Expand Down Expand Up @@ -50,6 +50,8 @@ def raise_fluidstack_error(response: requests.Response) -> None:

@functools.lru_cache()
def with_nvidia_drivers(region: str):
if region in ['norway_4_eu', 'generic_1_canada']:
return False
client = FluidstackClient()
plans = client.get_plans()
for plan in plans:
Expand Down Expand Up @@ -138,8 +140,7 @@ def create_instance(
f'Plan {instance_type} out of stock in region {region}')

ssh_key = self.get_or_add_ssh_key(ssh_pub_key)
os_id = 'Ubuntu 20.04 LTS' if not with_nvidia_drivers(
region) else 'Ubuntu 20.04 LTS (Nvidia)'
os_id = 'Ubuntu 20.04 LTS'
body = dict(plan=None if config else instance_type,
region=regions[region],
os=os_id,
Expand Down
25 changes: 15 additions & 10 deletions sky/provision/fluidstack/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any, Dict, List, Optional

from sky import authentication as auth
from sky import exceptions
from sky import sky_logging
from sky import status_lib
from sky.provision import common
Expand Down Expand Up @@ -31,13 +32,13 @@ def get_internal_ip(node_info: Dict[str, Any]) -> None:
result = runner.run(_GET_INTERNAL_IP_CMD,
require_outputs=True,
stream_logs=False)
# rc, stdout, stderr = result

if result[0] != 0:
logger.error('Failed get obtain private IP from node')
# subprocess_utils.handle_returncode(rc,
# _GET_INTERNAL_IP_CMD,
# 'Failed get obtain private IP from node',
# stderr=stdout + stderr)
# Some DCs do not have internal IPs and can fail when getting
# the IP. We set the `internal_ip` to the same as
# external IP. It should be fine as the `ray cluster`
# will also get and use that external IP in that case.
logger.debug('Failed get obtain private IP from node')
else:
node_info['internal_ip'] = result[1].strip()

Expand Down Expand Up @@ -223,10 +224,9 @@ def get_cluster_info(
if instance_info['hostname'].endswith('-head'):
head_instance_id = instance_id

return common.ClusterInfo(
instances=instances,
head_instance_id=head_instance_id,
)
return common.ClusterInfo(instances=instances,
head_instance_id=head_instance_id,
custom_ray_options={'use_external_ip': True})


def query_instances(
Expand Down Expand Up @@ -254,9 +254,14 @@ def query_instances(
'failed to create': status_lib.ClusterStatus.INIT,
'timeout error': status_lib.ClusterStatus.INIT,
'out of stock': status_lib.ClusterStatus.INIT,
'terminated': None,
}
statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
for inst_id, inst in instances.items():
if inst['status'] not in status_map:
with ux_utils.print_exception_no_traceback():
raise exceptions.ClusterStatusFetchingError(
f'Failed to parse status from Fluidstack: {inst["status"]}')
status = status_map.get(inst['status'], None)
if non_terminated_only and status is None:
continue
Expand Down
20 changes: 15 additions & 5 deletions sky/provision/instance_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,8 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
ray_options += f' --resources=\'{custom_resource}\''

if cluster_info.custom_ray_options:
if 'use_external_ip' in cluster_info.custom_ray_options:
cluster_info.custom_ray_options.pop('use_external_ip')
for key, value in cluster_info.custom_ray_options.items():
ray_options += f' --{key}={value}'

Expand Down Expand Up @@ -296,11 +298,19 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,

head_instance = cluster_info.get_head_instance()
assert head_instance is not None, cluster_info
head_private_ip = head_instance.internal_ip
use_external_ip = False
if cluster_info.custom_ray_options:
# Some cloud providers, e.g. fluidstack, cannot connect to the internal
# IP of the head node from the worker nodes. In this case, we need to
# use the external IP of the head node.
use_external_ip = cluster_info.custom_ray_options.pop(
'use_external_ip', False)
head_ip = (head_instance.internal_ip
if not use_external_ip else head_instance.external_ip)

ray_options = (f'--address={head_ip}:{constants.SKY_REMOTE_RAY_PORT} '
f'--object-manager-port=8076')

ray_options = (
f'--address={head_private_ip}:{constants.SKY_REMOTE_RAY_PORT} '
f'--object-manager-port=8076')
if custom_resource:
ray_options += f' --resources=\'{custom_resource}\''

Expand All @@ -321,7 +331,7 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
# Instead, we check whether the raylet process is running on gcs address
# that is connected to the head with the correct port.
cmd = (f'RAY_PORT={ray_port}; ps aux | grep "ray/raylet/raylet" | '
f'grep "gcs-address={head_private_ip}:${{RAY_PORT}}" || '
f'grep "gcs-address={head_ip}:${{RAY_PORT}}" || '
f'{{ {cmd} }}')
else:
cmd = 'ray stop; ' + cmd
Expand Down
2 changes: 1 addition & 1 deletion tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -1012,7 +1012,7 @@ def test_cli_logs(generic_cloud: str):
f'sky logs {name} * --sync-down',
f'sky logs {name} 1 | grep "{timestamp} 1"',
f'sky logs {name} | grep "{timestamp} 4"',
], f'sky down -y {name}', _get_timeout(generic_cloud))
], f'sky down -y {name}')
run_one_test(test)


Expand Down

0 comments on commit fc2ae25

Please sign in to comment.