# [Core] Auto mapping for cluster name (#2403)

Changes from 54 commits.
```diff
@@ -891,6 +891,7 @@ def write_cluster_config(
     # task.best_resources may not be equal to to_provision if the user
     # is running a job with less resources than the cluster has.
     cloud = to_provision.cloud
+    assert cloud is not None, to_provision
     # This can raise a ResourcesUnavailableError, when the region/zones
     # requested does not appear in the catalog. It can be triggered when the
     # user changed the catalog file, while there is a cluster in the removed
```
```diff
@@ -982,10 +983,14 @@ def write_cluster_config(
         f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w"))\''
     )

+    cluster_name_on_cloud = common_utils.make_cluster_name_on_cloud(
+        cluster_name, max_length=cloud.max_cluster_name_length())
+
-    # Only using new security group names for clusters with ports specified.
-    default_aws_sg_name = f'sky-sg-{common_utils.user_and_hostname_hash()}'
-    if ports is not None:
-        default_aws_sg_name += f'-{common_utils.truncate_and_hash_cluster_name(cluster_name)}'
+    default_aws_sg_name = f'sky-sg-{cluster_name_on_cloud}'

     # Use a tmp file path to avoid incomplete YAML file being re-used in the
     # future.
     tmp_yaml_path = yaml_path + '.tmp'
```
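The hunk above delegates the name mapping to `common_utils.make_cluster_name_on_cloud`, whose body is not shown in this PR excerpt. A minimal sketch of the truncate-and-hash idea it relies on (the hashing scheme and prefix policy here are illustrative assumptions, not SkyPilot's exact implementation, which also handles user hashes and per-cloud character restrictions):

```python
import hashlib


def make_cluster_name_on_cloud(cluster_name: str, max_length: int) -> str:
    """Fit a cluster name within a cloud provider's length limit.

    Illustrative sketch: names short enough pass through unchanged;
    longer names are truncated and disambiguated with a hash suffix.
    """
    if len(cluster_name) <= max_length:
        return cluster_name
    # Append a short, deterministic hash so two long names that share a
    # prefix do not collide after truncation.
    digest = hashlib.md5(cluster_name.encode()).hexdigest()[:4]
    prefix_len = max_length - len(digest) - 1
    return f'{cluster_name[:prefix_len]}-{digest}'
```

Because the hash is derived from the full original name, the mapping is stable across invocations, which matters when the on-cloud name is later used to query or tear down instances.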
```diff
@@ -994,7 +999,7 @@ def write_cluster_config(
         dict(
             resources_vars,
             **{
-                'cluster_name': cluster_name,
+                'cluster_name_on_cloud': cluster_name_on_cloud,
                 'num_nodes': num_nodes,
                 'ports': ports,
                 'disk_size': to_provision.disk_size,
```
```diff
@@ -1083,6 +1088,12 @@ def write_cluster_config(
     with open(tmp_yaml_path, 'w') as f:
         f.write(restored_yaml_content)

+    # Read the cluster name from the tmp yaml file, to take the backward
+    # compatibility restoration above into account.
+    # TODO: remove this after 2 minor releases, 0.5.0.
+    yaml_config = common_utils.read_yaml(tmp_yaml_path)
+    config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
+
     # Optimization: copy the contents of source files in file_mounts to a
     # special dir, and upload that as the only file_mount instead. Delay
     # calling this optimization until now, when all source files have been
```
```diff
@@ -1809,7 +1820,9 @@ def _query_cluster_status_via_cloud_api(
         exceptions.ClusterStatusFetchingError: the cluster status cannot be
             fetched from the cloud provider.
     """
-    cluster_name = handle.cluster_name
+    cluster_name_on_cloud = handle.cluster_name_on_cloud
+    cluster_name_in_hint = common_utils.cluster_name_in_hint(
+        handle.cluster_name, cluster_name_on_cloud)
     # Use region and zone from the cluster config, instead of the
     # handle.launched_resources, because the latter may not be set
     # correctly yet.
```
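The hunk above introduces `cluster_name_in_hint`, used so that log and error messages remain traceable when the on-cloud name diverges from the user-facing one. A hedged sketch of what such a helper could look like (the exact formatting in SkyPilot may differ):

```python
def cluster_name_in_hint(display_name: str, name_on_cloud: str) -> str:
    """Format a cluster name for user-facing messages.

    Hypothetical helper mirroring the PR: when the on-cloud name was
    truncated or hashed, show both names so the user can correlate
    log output with resources visible in the cloud console.
    """
    if name_on_cloud == display_name:
        return repr(display_name)
    return f'{display_name!r} (name on cloud: {name_on_cloud!r})'
```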
```diff
@@ -1827,26 +1840,31 @@ def _query_cluster_status_via_cloud_api(
         cloud_name = repr(handle.launched_resources.cloud)
         try:
             node_status_dict = provision_lib.query_instances(
-                cloud_name, cluster_name, provider_config)
-            logger.debug(f'Querying {cloud_name} cluster {cluster_name!r} '
+                cloud_name, cluster_name_on_cloud, provider_config)
+            logger.debug(f'Querying {cloud_name} cluster '
+                         f'{cluster_name_in_hint} '
                          f'status:\n{pprint.pformat(node_status_dict)}')
             node_statuses = list(node_status_dict.values())
         except Exception as e:  # pylint: disable=broad-except
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.ClusterStatusFetchingError(
-                    f'Failed to query {cloud_name} cluster {cluster_name!r} '
-                    f'status: {e}')
+                    f'Failed to query {cloud_name} cluster '
+                    f'{cluster_name_in_hint} '
+                    f'status: {common_utils.format_exception(e, use_bracket=True)}'
+                )
     else:
         node_statuses = handle.launched_resources.cloud.query_status(
-            cluster_name, tag_filter_for_cluster(cluster_name), region, zone,
+            cluster_name_on_cloud,
+            tag_filter_for_cluster(cluster_name_on_cloud), region, zone,
             **kwargs)
     # GCP does not clean up preempted TPU VMs. We remove it ourselves.
     # TODO(wei-lin): handle multi-node cases.
     # TODO(zhwu): this should be moved into the GCP class, after we refactor
     # the cluster termination, as the preempted TPU VM should always be
     # removed.
     if kwargs.get('use_tpu_vm', False) and len(node_statuses) == 0:
-        logger.debug(f'Terminating preempted TPU VM cluster {cluster_name}')
+        logger.debug(
+            f'Terminating preempted TPU VM cluster {cluster_name_in_hint}')
         backend = backends.CloudVmRayBackend()
         # Do not use refresh cluster status during teardown, as that will
         # cause infinite recursion by calling cluster status refresh
```

Review thread on the `provision_lib.query_instances` call:

- It may be pretty easy for new code/contributors to call `provision_lib` funcs and pass […]. One possibility/bandage is to document this clearly at the top of `provision_lib`.
- Another potential way is to create a lightweight class and use it in `provision_lib` interfaces: […]
- This is a great point! I could not think of a better way to mitigate this. I would personally prefer keeping the […]. Another alternative (though more involved) is to rename the […].
- After offline discussion, we now changed the […].
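The review thread above suggests a lightweight class for `provision_lib` interfaces, so callers cannot accidentally pass the user-facing name where the on-cloud name is expected. A sketch of that suggestion (class and field names are assumptions, not the PR's final API):

```python
import dataclasses


@dataclasses.dataclass(frozen=True)
class ClusterName:
    """Pairs the user-facing cluster name with its on-cloud counterpart.

    Sketch of the reviewer's suggestion: by requiring this type in
    provision interfaces, a bare string can no longer be passed
    ambiguously, and each call site picks the field it needs explicitly.
    """
    display_name: str   # The name the user sees in CLI output and logs.
    name_on_cloud: str  # The (possibly truncated/hashed) provider-side name.

    def __str__(self) -> str:
        # Default to the user-facing name when formatted into messages.
        return self.display_name
```

A provision function would then take `cluster_name: ClusterName` and use `cluster_name.name_on_cloud` for cloud API calls while logging `str(cluster_name)`.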
Review thread on the AWS security group name change:

- Shall we keep the `common_utils.user_and_hostname_hash()` for the case when multiple users use the same cluster name?
- The `cluster_name_on_cloud` already contains the user hash, so it should be fine to not have the `user_and_hostname_hash()`?
- See above. This knowledge is leaked. Calling it `make_cluster_name_user_specific()` may mitigate it somewhat.
- There is an argument that decides whether to add the user hash to the name. I guess it is ok to call it `make_cluster_name_on_cloud`, without mentioning the user hash in the name?
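The thread above mentions an argument on the helper that decides whether the user hash is appended. A hedged sketch of that shape (the flag name, hash source, and hash length are assumptions for illustration):

```python
import hashlib
import os


def make_cluster_name_on_cloud(cluster_name: str,
                               add_user_hash: bool = True) -> str:
    """Optionally append a per-user hash to keep names unique across users.

    Hypothetical signature sketching the discussion: the user hash is
    derived here from the local username purely for illustration.
    """
    if not add_user_hash:
        return cluster_name
    user = os.environ.get('USER', 'unknown')
    user_hash = hashlib.md5(user.encode()).hexdigest()[:4]
    return f'{cluster_name}-{user_hash}'
```

With this shape, two users who both launch a cluster named `dev` get distinct on-cloud names, while resources that must be shared can opt out via `add_user_hash=False`.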