diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 4f43eaf62f81..d3959e155398 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2072,6 +2072,10 @@ def check_resources_fit_cluster(self, handle: ResourceHandle, The resources requested by the task should be smaller than the existing cluster. + + Raises: + exceptions.ResourcesMismatchError: If the resources in the task + do not match the existing cluster. """ assert len(task.resources) == 1, task.resources @@ -2141,12 +2145,16 @@ def _provision(self, Raises: exceptions.ClusterOwnerIdentityMismatchError: if the cluster 'cluster_name' exists and is owned by another user. + exceptions.InvalidClusterNameError: if the cluster name is invalid. + exceptions.ResourcesMismatchError: if the requested resources + do not match the existing cluster. exceptions.ResourcesUnavailableError: if the requested resources cannot be satisfied. The failover_history of the exception will be set as at least 1 exception from either our pre-checks (e.g., cluster name invalid) or a region/zone throwing resource unavailability. exceptions.CommandError: any ssh command error. + # TODO(zhwu): complete the list of exceptions. """ # FIXME: ray up for Azure with different cluster_names will overwrite # each other. @@ -3269,6 +3277,14 @@ def run_on_head( def _check_existing_cluster( self, task: task_lib.Task, to_provision: resources_lib.Resources, cluster_name: str) -> RetryingVmProvisioner.ToProvisionConfig: + """Checks if the cluster exists and returns the provision config. + + Raises: + exceptions.ResourcesMismatchError: If the resources in the task + do not match the existing cluster. + exceptions.InvalidClusterNameError: If the cluster name is invalid. + # TODO(zhwu): complete the list of exceptions.
+ """ prev_cluster_status, handle = ( backend_utils.refresh_cluster_status_handle( cluster_name, acquire_per_cluster_status_lock=False)) diff --git a/sky/execution.py b/sky/execution.py index 4cc79f58c5fa..ec2e9c247a73 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -393,6 +393,11 @@ def launch( Raises: exceptions.ClusterOwnerIdentityMismatchError: if the cluster is owned by another user. + exceptions.InvalidClusterNameError: if the cluster name is invalid. + exceptions.ResourcesMismatchError: if the requested resources + do not match the existing cluster. + exceptions.NotSupportedError: if required features are not supported + by the backend/cloud/cluster. exceptions.ResourcesUnavailableError: if the requested resources cannot be satisfied. The failover_history of the exception will be set as: @@ -402,7 +407,7 @@ def launch( 2. Non-empty: iff at least 1 exception from either our pre-checks (e.g., cluster name invalid) or a region/zone throwing resource unavailability. - exceptions.NotSupportedError: if the cluster name is reserved. + exceptions.CommandError: any ssh command error. Other exceptions may be raised depending on the backend. """ entrypoint = task diff --git a/sky/spot/recovery_strategy.py b/sky/spot/recovery_strategy.py index 0d72b338f094..538a23f3625c 100644 --- a/sky/spot/recovery_strategy.py +++ b/sky/spot/recovery_strategy.py @@ -246,6 +246,12 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]: detach_run=True, _is_launched_by_spot_controller=True) logger.info('Spot cluster launched.') + except exceptions.InvalidClusterNameError as e: + logger.error('Failure happened before provisioning. ' + f'{common_utils.format_exception(e)}') + if raise_on_failure: + raise exceptions.ProvisionPrechecksError(reasons=[e]) + return None except exceptions.ResourcesUnavailableError as e: # This is raised when the launch fails due to prechecks or # after failing over through all the candidates.