skypilot-org · ewzeng · Jan 30, 2023 · Dec 9, 2022 · Dec 16, 2022 · Dec 19, 2022
diff --git a/sky/__init__.py b/sky/__init__.py
@@ -27,6 +27,7 @@
 AWS = clouds.AWS
 Azure = clouds.Azure
 GCP = clouds.GCP
+Lambda = clouds.Lambda
 Local = clouds.Local
 optimize = Optimizer.optimize
 
@@ -35,6 +36,7 @@
     'AWS',
     'Azure',
     'GCP',
+    'Lambda',
     'Local',
     'Optimizer',
     'OptimizeTarget',

diff --git a/sky/authentication.py b/sky/authentication.py
@@ -21,6 +21,7 @@
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
+from sky.skylet.providers.lambda_labs.lambda_utils import LambdaLabsClient
 
 logger = sky_logging.init_logger(__name__)
 
@@ -299,3 +300,26 @@ def setup_azure_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     config['file_mounts'] = file_mounts
 
     return config
+
+
+def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
+    """Registers an SSH key with Lambda Labs if needed and updates `config`.
+
+    Sets config['auth']['ssh_public_key'] and mounts the public key file.
+    """
+    get_or_generate_keys()
+
+    client = LambdaLabsClient()
+    if client.ssh_key_name is None:
+        with open(os.path.expanduser(PUBLIC_SSH_KEY_PATH), 'r') as key_file:
+            key_contents = key_file.read()
+        key_name = f'{common_utils.get_user_hash()}-sky-key'
+        client.set_ssh_key(key_name, key_contents)
+
+    # Keep the ~-relative path: Ray looks up the public key at the same
+    # path on both the local machine and the head node.
+    config['auth']['ssh_public_key'] = PUBLIC_SSH_KEY_PATH
+    mounts = config['file_mounts']
+    mounts[PUBLIC_SSH_KEY_PATH] = PUBLIC_SSH_KEY_PATH
+    config['file_mounts'] = mounts
+    return config
diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py
@@ -43,6 +43,7 @@
 from sky.backends import onprem_utils
 from sky.skylet import constants
 from sky.skylet import log_lib
+from sky.skylet.providers.lambda_labs.lambda_utils import LambdaLabsClient
 from sky.utils import common_utils
 from sky.utils import command_runner
 from sky.utils import env_options
@@ -891,6 +892,8 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
         config = auth.setup_gcp_authentication(config)
     elif isinstance(cloud, clouds.Azure):
         config = auth.setup_azure_authentication(config)
+    elif isinstance(cloud, clouds.Lambda):
+        config = auth.setup_lambda_authentication(config)
     else:
         assert isinstance(cloud, clouds.Local), cloud
         # Local cluster case, authentication is already filled by the user
@@ -1651,10 +1654,27 @@ def _query_status_azure(
     return _process_cli_query('Azure', cluster, query_cmd, '\t', status_map)
 
 
+def _query_status_lambda(
+        cluster: str,
+        ray_config: Dict[str, Any],  # pylint: disable=unused-argument
+) -> List[global_user_state.ClusterStatus]:
+    """Returns [status] for the listed VM named `cluster`, or [] if none."""
+    cluster_status = global_user_state.ClusterStatus
+    status_map = {'booting': cluster_status.INIT, 'active': cluster_status.UP}
+    # TODO(ewzeng): filter by hash_filter_string to be safe
+    # NOTE(review): a status missing from status_map raises KeyError below.
+    for vm in LambdaLabsClient().ls().get('data', []):
+        if vm['name'] != cluster:
+            continue
+        return [status_map[vm['status']]]
+    return []
+
+
 _QUERY_STATUS_FUNCS = {
     'AWS': _query_status_aws,
     'GCP': _query_status_gcp,
     'Azure': _query_status_azure,
+    'Lambda': _query_status_lambda,
 }
 
 

diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py
@@ -104,6 +104,7 @@ def _get_cluster_config_template(cloud):
         clouds.AWS: 'aws-ray.yml.j2',
         clouds.Azure: 'azure-ray.yml.j2',
         clouds.GCP: 'gcp-ray.yml.j2',
+        clouds.Lambda: 'lambda-ray.yml.j2',
         clouds.Local: 'local-ray.yml.j2',
     }
     return cloud_to_template[type(cloud)]
@@ -557,9 +558,9 @@ def __init__(self, log_dir: str, dag: 'dag.Dag',
     def _in_blocklist(self, cloud, region, zones):
         if region.name in self._blocked_regions:
             return True
-        # We do not keep track of zones in Azure and Local,
-        # as both clouds do not have zones.
-        if isinstance(cloud, (clouds.Azure, clouds.Local)):
+        # We do not keep track of zones in Azure, Lambda, and Local,
+        # as these clouds do not have zones.
+        if isinstance(cloud, (clouds.Azure, clouds.Local, clouds.Lambda)):
             return False
         assert zones, (cloud, region, zones)
         for zone in zones:
@@ -737,6 +738,32 @@ def _update_blocklist_on_azure_error(self, region, zones, stdout, stderr):
         else:
             self._blocked_regions.add(region.name)
 
+    def _update_blocklist_on_lambda_error(self, region, zones, stdout, stderr):
+        """Blocklists `region` on LambdaLabsError output; otherwise raises."""
+        del zones  # Unused.
+        stdout_lines = stdout.split('\n')
+        stderr_lines = stderr.split('\n')
+        errors = []
+        for line in stdout_lines + stderr_lines:
+            stripped = line.strip()
+            if 'LambdaLabsError:' in stripped:
+                errors.append(stripped)
+        if not errors:
+            # No recognizable error line: dump the raw output, then fail.
+            logger.info('====== stdout ======')
+            print('\n'.join(stdout_lines))
+            logger.info('====== stderr ======')
+            print('\n'.join(stderr_lines))
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('Errors occurred during provision; '
+                                   'check logs above.')
+
+        fmt = colorama.Style
+        logger.warning(f'Got error(s) in {region.name}:')
+        joined = '\n\t'.join(errors)
+        logger.warning(f'{fmt.DIM}\t{joined}{fmt.RESET_ALL}')
+        self._blocked_regions.add(region.name)
+
     def _update_blocklist_on_local_error(self, region, zones, stdout, stderr):
         del zones  # Unused.
         style = colorama.Style
@@ -789,6 +816,10 @@ def _update_blocklist_on_error(self, cloud, region, zones, stdout,
             return self._update_blocklist_on_azure_error(
                 region, zones, stdout, stderr)
 
+        if isinstance(cloud, clouds.Lambda):
+            return self._update_blocklist_on_lambda_error(
+                region, zones, stdout, stderr)
+
         if isinstance(cloud, clouds.Local):
             return self._update_blocklist_on_local_error(
                 region, zones, stdout, stderr)
@@ -818,6 +849,9 @@ def _yield_region_zones(self, to_provision: resources_lib.Resources,
                     elif cloud.is_same_cloud(clouds.Azure()):
                         region = config['provider']['location']
                         zones = None
+                    elif cloud.is_same_cloud(clouds.Lambda()):
+                        region = config['provider']['region']
+                        zones = None
                     elif cloud.is_same_cloud(clouds.Local()):
                         local_regions = clouds.Local.regions()
                         region = local_regions[0].name

diff --git a/sky/cli.py b/sky/cli.py
@@ -710,6 +710,21 @@ def _launch_with_confirm(
             confirm_shown = True
             click.confirm(prompt, default=True, abort=True, show_default=True)
 
+    # Lambda Labs does not support autostop or multiple nodes.
+    # If task.resources is None, cannot be Lambda Labs.
+    if task.resources:
+        for resource in task.resources:
+            if resource.cloud.is_same_cloud(sky.Lambda()):
+                if not down and idle_minutes_to_autostop is not None:
+                    with ux_utils.print_exception_no_traceback():
+                        raise exceptions.NotSupportedError(
+                            ('Lambda Labs does not support stopping '
+                             'instances.'))
+                elif task.num_nodes > 1:
+                    with ux_utils.print_exception_no_traceback():
+                        raise exceptions.NotSupportedError(
+                            ('Lambda Labs does not support --num-nodes > 1.'))
+
     if node_type is not None:
         if maybe_status != global_user_state.ClusterStatus.UP:
             click.secho(f'Setting up interactive node {cluster}...',

diff --git a/sky/clouds/__init__.py b/sky/clouds/__init__.py
@@ -6,13 +6,15 @@
 from sky.clouds.aws import AWS
 from sky.clouds.azure import Azure
 from sky.clouds.gcp import GCP
+from sky.clouds.lambda_labs import Lambda
 from sky.clouds.local import Local
 
 __all__ = [
     'AWS',
     'Azure',
     'Cloud',
     'GCP',
+    'Lambda',
     'Local',
     'Region',
     'Zone',