Skip to content

Commit

Permalink
[Spot] Fix spot failure reason when cloud is specified (#1714)
Browse files Browse the repository at this point in the history
* Fix spot failure reason when cloud is specified

* format

* fix

* format

* Update exceptions in docstr

* fix docstr

* Add error

* format
  • Loading branch information
Michaelvll authored Feb 23, 2023
1 parent 71697f7 commit c94b43b
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 1 deletion.
16 changes: 16 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2072,6 +2072,10 @@ def check_resources_fit_cluster(self, handle: ResourceHandle,
The resources requested by the task should be smaller than the existing
cluster.
Raises:
exceptions.ResourcesMismatchError: If the resources in the task
do not match the existing cluster.
"""
assert len(task.resources) == 1, task.resources

Expand Down Expand Up @@ -2141,12 +2145,16 @@ def _provision(self,
Raises:
exceptions.ClusterOwnerIdentityMismatchError: if the cluster
'cluster_name' exists and is owned by another user.
exceptions.InvalidClusterNameError: if the cluster name is invalid.
exceptions.ResourcesMismatchError: if the requested resources
do not match the existing cluster.
exceptions.ResourcesUnavailableError: if the requested resources
cannot be satisfied. The failover_history of the exception
will be set as at least 1 exception from either our pre-checks
(e.g., cluster name invalid) or a region/zone throwing
resource unavailability.
exceptions.CommandError: any ssh command error.
# TODO(zhwu): complete the list of exceptions.
"""
# FIXME: ray up for Azure with different cluster_names will overwrite
# each other.
Expand Down Expand Up @@ -3269,6 +3277,14 @@ def run_on_head(
def _check_existing_cluster(
self, task: task_lib.Task, to_provision: resources_lib.Resources,
cluster_name: str) -> RetryingVmProvisioner.ToProvisionConfig:
"""Checks if the cluster exists and returns the provision config.
Raises:
exceptions.ResourcesMismatchError: If the resources in the task
do not match the existing cluster.
exceptions.InvalidClusterNameError: If the cluster name is invalid.
# TODO(zhwu): complete the list of exceptions.
"""
prev_cluster_status, handle = (
backend_utils.refresh_cluster_status_handle(
cluster_name, acquire_per_cluster_status_lock=False))
Expand Down
7 changes: 6 additions & 1 deletion sky/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,11 @@ def launch(
Raises:
exceptions.ClusterOwnerIdentityMismatchError: if the cluster is
owned by another user.
exceptions.InvalidClusterNameError: if the cluster name is invalid.
exceptions.ResourcesMismatchError: if the requested resources
do not match the existing cluster.
exceptions.NotSupportedError: if required features are not supported
by the backend/cloud/cluster.
exceptions.ResourcesUnavailableError: if the requested resources
cannot be satisfied. The failover_history of the exception
will be set as:
Expand All @@ -402,7 +407,7 @@ def launch(
2. Non-empty: iff at least 1 exception from either
our pre-checks (e.g., cluster name invalid) or a region/zone
throwing resource unavailability.
exceptions.NotSupportedError: if the cluster name is reserved.
exceptions.CommandError: any ssh command error.
Other exceptions may be raised depending on the backend.
"""
entrypoint = task
Expand Down
6 changes: 6 additions & 0 deletions sky/spot/recovery_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,12 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
detach_run=True,
_is_launched_by_spot_controller=True)
logger.info('Spot cluster launched.')
except exceptions.InvalidClusterNameError as e:
logger.error('Failure happened before provisioning. '
f'{common_utils.format_exception(e)}')
if raise_on_failure:
raise exceptions.ProvisionPrechecksError(reasons=[e])
return None
except exceptions.ResourcesUnavailableError as e:
# This is raised when the launch fails due to prechecks or
# after failing over through all the candidates.
Expand Down

0 comments on commit c94b43b

Please sign in to comment.