From a67d0bffe1f5560c637bce4446688889238db948 Mon Sep 17 00:00:00 2001 From: Edward Zeng Date: Tue, 14 Mar 2023 15:00:20 -0700 Subject: [PATCH 1/2] Handle gpunode reuse --- sky/cli.py | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 0c8c0d58658..d9ab1526758 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -808,6 +808,36 @@ def _create_and_ssh_into_node( f'Name {cluster_name!r} taken by a local cluster and cannot ' f'be used for a {node_type}.') + backend = backend if backend is not None else backends.CloudVmRayBackend() + if not isinstance(backend, backends.CloudVmRayBackend): + raise click.UsageError('Interactive nodes are only supported for ' + f'{backends.CloudVmRayBackend.__name__} ' + f'backend. Got {type(backend).__name__}.') + + maybe_status, handle = backend_utils.refresh_cluster_status_handle( + cluster_name) + if maybe_status is not None: + if user_requested_resources: + # Reuse existing interactive node if resources = launched resources + same_resources = ( + resources.less_demanding_than(handle.launched_resources) and + handle.launched_resources.less_demanding_than(resources)) + if not same_resources: + name_arg = '' + if cluster_name != _default_interactive_node_name(node_type): + name_arg = f' -c {cluster_name}' + raise click.UsageError( + f'Relaunching existing interactive node {cluster_name!r} ' + 'with different resource specification. To login to ' + f'existing cluster, use {colorama.Style.BRIGHT}' + f'sky {node_type}{name_arg}{colorama.Style.RESET_ALL}. ' + f'To launch a new cluster, use {colorama.Style.BRIGHT}' + f'sky {node_type} -c NEW_NAME {colorama.Style.RESET_ALL}') + else: + # Use existing interactive node if it already exists and no user + # resources were specified. + resources = handle.launched_resources + # TODO: Add conda environment replication # should be setup = # 'conda env export | grep -v "^prefix: " > environment.yml' @@ -819,22 +849,6 @@ def _create_and_ssh_into_node( ) task.set_resources(resources) - backend = backend if backend is not None else backends.CloudVmRayBackend() - if not isinstance(backend, backends.CloudVmRayBackend): - raise click.UsageError('Interactive nodes are only supported for ' - f'{backends.CloudVmRayBackend.__name__} ' - f'backend. Got {type(backend).__name__}.') - maybe_status, _ = backend_utils.refresh_cluster_status_handle(cluster_name) - if maybe_status is not None and user_requested_resources: - name_arg = '' - if cluster_name != _default_interactive_node_name(node_type): - name_arg = f' -c {cluster_name}' - raise click.UsageError( - 'Resources cannot be specified for an existing interactive node ' - f'{cluster_name!r}. To login to the cluster, use: ' - f'{colorama.Style.BRIGHT}' - f'sky {node_type}{name_arg}{colorama.Style.RESET_ALL}') - _launch_with_confirm( task, backend, From 581ccf0998e934af50fa2bf9f720379a483aa938 Mon Sep 17 00:00:00 2001 From: Edward Zeng Date: Wed, 15 Mar 2023 17:51:00 -0700 Subject: [PATCH 2/2] Improve error message + enforcing same resources is too hard --- sky/cli.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index d9ab1526758..af7fea87cc0 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -818,23 +818,22 @@ def _create_and_ssh_into_node( cluster_name) if maybe_status is not None: if user_requested_resources: - # Reuse existing interactive node if resources = launched resources - same_resources = ( - resources.less_demanding_than(handle.launched_resources) and - handle.launched_resources.less_demanding_than(resources)) - if not same_resources: + if not resources.less_demanding_than(handle.launched_resources): name_arg = '' if cluster_name != _default_interactive_node_name(node_type): name_arg = f' -c {cluster_name}' raise click.UsageError( - f'Relaunching existing interactive node {cluster_name!r} ' - 'with different resource specification. To login to ' - f'existing cluster, use {colorama.Style.BRIGHT}' - f'sky {node_type}{name_arg}{colorama.Style.RESET_ALL}. ' - f'To launch a new cluster, use {colorama.Style.BRIGHT}' - f'sky {node_type} -c NEW_NAME {colorama.Style.RESET_ALL}') + f'Relaunching interactive node {cluster_name!r} with ' + 'mismatched resources.\n ' + f'Requested resources: {resources}\n ' + f'Launched resources: {handle.launched_resources}\n' + 'To login to existing cluster, use ' + f'{colorama.Style.BRIGHT}sky {node_type}{name_arg}' + f'{colorama.Style.RESET_ALL}. To launch a new cluster, ' + f'use {colorama.Style.BRIGHT}sky {node_type} -c NEW_NAME ' + f'{colorama.Style.RESET_ALL}') else: - # Use existing interactive node if it already exists and no user + # Use existing interactive node if it exists and no user # resources were specified. resources = handle.launched_resources