
CLI help str rewording. (#1821)
* CLI help str rewording.

* Updates.
concretevitamin authored Mar 30, 2023
1 parent 718951c commit 91be9f2
Showing 2 changed files with 103 additions and 56 deletions.
144 changes: 88 additions & 56 deletions sky/cli.py
@@ -1671,19 +1671,22 @@ def status(all: bool, refresh: bool, show_spot_jobs: bool, clusters: List[str]):
@usage_lib.entrypoint
def cost_report(all: bool): # pylint: disable=redefined-builtin
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
"""Show cost reports for each cluster.
"""Show estimated costs for launched clusters.
The following fields for each cluster are recorded: cluster name,
resources, launched time, duration that cluster was up,
and total cost.
For each cluster, this shows: cluster name, resources, launched time,
duration that cluster was up, and total estimated cost.
The estimated cost column indicates the price for the cluster based on the
- type of resources being used and the duration of use up until the call
- to status. This means if the cluster is UP, successive calls to report
- will show increasing price. The estimated cost is calculated based on
- the local cache of the cluster status, and may not be accurate for
- the cluster with autostop/use_spot set or terminated/stopped
- on the cloud console.
+ type of resources being used and the duration of use up until now. This
+ means if the cluster is UP, successive calls to cost-report will show
+ increasing price.
+ The estimated cost is calculated based on the local cache of the cluster
+ status, and may not be accurate for:
+ - clusters with autostop/use_spot set; or
+ - clusters that were terminated/stopped on the cloud console.
"""
cluster_records = core.cost_report()
nonreserved_cluster_records = []
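
A minimal usage sketch of the command whose help text is reworded above. ``sky cost-report`` itself is named in the docstring; the ``--all`` spelling is only inferred from the ``all: bool`` parameter and should be confirmed with ``sky cost-report --help``.

.. code-block:: bash

  # Show estimated costs for launched clusters; for clusters that are
  # still UP, repeated calls report an increasing total.
  sky cost-report

  # Assumed flag (inferred from the `all` parameter): show all clusters.
  sky cost-report --all
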
@@ -1778,8 +1781,8 @@ def queue(clusters: Sequence[str], skip_finished: bool, all_users: bool):
'-s',
is_flag=True,
default=False,
- help='Sync down the logs of the job (this is useful for distributed jobs to'
- 'download a separate log for each job from all the workers).')
+ help='Sync down the logs of a job to the local machine. For a distributed'
+ ' job, a separate log file from each worker will be downloaded.')
@click.option(
'--status',
is_flag=True,
@@ -1790,8 +1793,9 @@ def queue(clusters: Sequence[str], skip_finished: bool, all_users: bool):
'--follow/--no-follow',
is_flag=True,
default=True,
- help=('Follow the logs of the job. [default: --follow] '
- 'If --no-follow is specified, print the log so far and exit.'))
+ help=('Follow the logs of a job. '
+ 'If --no-follow is specified, print the log so far and exit. '
+ '[default: --follow]'))
@click.argument('cluster',
required=True,
type=str,
@@ -1813,12 +1817,14 @@ def logs(
1. If no flags are provided, tail the logs of the job_id specified. At most
one job_id can be provided.
- 2. If --status is specified, print the status of the job and exit with
- returncode 0 if the job is succeeded, or 1 otherwise. At most one job_id can
+ 2. If ``--status`` is specified, print the status of the job and exit with
+ returncode 0 if the job succeeded, or 1 otherwise. At most one job_id can
be specified.
- 3. If --sync-down is specified, the logs of the job will be downloaded from
- the cluster and saved to the local machine under `~/sky_logs`. Mulitple
- job_ids can be specified.
+ 3. If ``--sync-down`` is specified, the logs of the job will be downloaded
+ from the cluster and saved to the local machine under
+ ``~/sky_logs``. Multiple job_ids can be specified.
"""
if sync_down and status:
raise click.UsageError(
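
The three modes described in the reworded ``sky logs`` docstring above, as a hedged sketch (the cluster and job names are illustrative placeholders, not taken from this commit):

.. code-block:: bash

  # Tail (follow) the logs of job 1 on cluster `mycluster`.
  sky logs mycluster 1

  # Print the log so far and exit instead of following.
  sky logs --no-follow mycluster 1

  # Exit with code 0 if job 1 succeeded, 1 otherwise.
  sky logs --status mycluster 1

  # Download the logs of jobs 1 and 2 to ~/sky_logs on this machine.
  sky logs --sync-down mycluster 1 2
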
@@ -2881,11 +2887,14 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
@cli.command()
@usage_lib.entrypoint
def check():
"""Determine the set of clouds available to use.
"""Check which clouds are available to use.
This checks access credentials for all clouds supported by SkyPilot. If a
cloud is detected to be inaccessible, the reason and correction steps will
be shown.
This checks access credentials for AWS, Azure and GCP; on failure, it shows
the reason and suggests correction steps. Tasks will only run on clouds
that you have access to.
The enabled clouds are cached and form the "search space" to be considered
for each task.
"""
sky_check.check()
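
For reference, the command documented by the rewritten docstring above takes no arguments in this diff and is simply:

.. code-block:: bash

  # Verify cloud credentials; inaccessible clouds are reported together
  # with the reason and suggested correction steps.
  sky check
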

@@ -2916,15 +2925,15 @@ def show_gpus(
all: bool, # pylint: disable=redefined-builtin
cloud: Optional[str],
region: Optional[str]):
"""Show supported GPU/TPU/accelerators.
"""Show supported GPU/TPU/accelerators and their prices.
The names and counts shown can be set in the ``accelerators`` field in task
YAMLs, or in the ``--gpus`` flag in CLI commands. For example, if this
table shows 8x V100s are supported, then the string ``V100:8`` will be
accepted by the above.
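
A concrete illustration of the paragraph above (``V100:8`` is the docstring's own example; ``task.yaml`` is a hypothetical task file, and the detailed per-accelerator output is described in the next paragraph):

.. code-block:: bash

  # Detailed information for one accelerator type.
  sky show-gpus V100

  # Use a name:count string from the table when launching a task.
  sky launch --gpus V100:8 task.yaml
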
- To show the detailed information of a GPU/TPU type (which clouds offer it,
- the quantity in each VM type, etc.), use ``sky show-gpus <gpu>``.
+ To show the detailed information of a GPU/TPU type (its price, which clouds
+ offer it, the quantity in each VM type, etc.), use ``sky show-gpus <gpu>``.
To show all accelerators, including less common ones and their detailed
information, use ``sky show-gpus --all``.
@@ -3058,7 +3067,7 @@ def _output():
Expand Down Expand Up @@ -3058,7 +3067,7 @@ def _output():

@cli.group(cls=_NaturalOrderGroup)
def storage():
"""Storage related commands."""
"""SkyPilot Storage CLI."""
pass


@@ -3115,7 +3124,7 @@ def storage_delete(names: Sequence[str], all: bool): # pylint: disable=redefine

@cli.group(cls=_NaturalOrderGroup)
def admin():
"""Sky administrator commands for local clusters."""
"""SkyPilot On-prem administrator CLI."""
pass


@@ -3191,7 +3200,7 @@ def admin_deploy(clusterspec_yaml: str):

@cli.group(cls=_NaturalOrderGroup)
def spot():
"""Commands for managed spot jobs."""
"""Managed Spot commands (spot instances with auto-recovery)."""
pass


@@ -3354,32 +3363,56 @@ def spot_launch(
def spot_queue(all: bool, refresh: bool, skip_finished: bool):
"""Show statuses of managed spot jobs.
\b
Each spot job can have one of the following statuses:
\b
- - SUBMITTED: The job is submitted to the spot controller.
- - STARTING: The job is starting (starting a spot cluster).
- - RUNNING: The job is running.
- - RECOVERING: The spot cluster is recovering from a preemption.
- - SUCCEEDED: The job succeeded.
- - FAILED: The job failed due to an error from the job itself.
- - FAILED_NO_RESOURCES: The job failed due to resources being unavailable
-   after a maximum number of retry attempts.
- - FAILED_CONTROLLER: The job failed due to an unexpected error in the spot
-   controller.
- - CANCELLING: The job was requested to be cancelled by the user, and the
-   cancellation is in progress.
- - CANCELLED: The job was cancelled by the user.
- If the job failed, either due to user code or spot unavailability, the error
- log can be found with ``sky spot logs --controller job_id``.
+ - ``PENDING``: Job is waiting for a free slot on the spot controller to be
+   accepted.
+ - ``SUBMITTED``: Job is submitted to and accepted by the spot controller.
+ - ``STARTING``: Job is starting (provisioning a spot cluster).
+ - ``RUNNING``: Job is running.
+ - ``RECOVERING``: The spot cluster is recovering from a preemption.
+ - ``SUCCEEDED``: Job succeeded.
+ - ``CANCELLING``: Job was requested to be cancelled by the user, and the
+   cancellation is in progress.
+ - ``CANCELLED``: Job was cancelled by the user.
+ - ``FAILED``: Job failed due to an error from the job itself.
+ - ``FAILED_SETUP``: Job failed due to an error from the job's ``setup``
+   commands.
+ - ``FAILED_PRECHECKS``: Job failed due to an error from our prechecks, such
+   as an invalid cluster name or an infeasible resource being specified.
+ - ``FAILED_NO_RESOURCE``: Job failed due to resources being unavailable
+   after a maximum number of retries.
+ - ``FAILED_CONTROLLER``: Job failed due to an unexpected error in the spot
+   controller.
+ If the job failed, either due to user code or spot unavailability, the
+ error log can be found with ``sky spot logs --controller``, e.g.:
+ .. code-block:: bash
+   sky spot logs --controller job_id
+ This also shows the logs for provisioning and any preemption and recovery
+ attempts.
+ (Tip) To fetch job statuses every 60 seconds, use ``watch``:
+ .. code-block:: bash
+   watch -n60 sky spot queue
"""
click.secho('Fetching managed spot job statuses...', fg='yellow')
with log_utils.safe_rich_status('[cyan]Checking spot jobs[/]'):
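
The ``all``, ``refresh`` and ``skip_finished`` parameters above suggest the usual flag spellings, but those spellings are assumptions here rather than something this diff shows; verify with ``sky spot queue --help``:

.. code-block:: bash

  # List managed spot jobs and their statuses (see the list above).
  sky spot queue

  # Assumed flags inferred from the parameters above: refresh the cached
  # view and hide finished jobs.
  sky spot queue --refresh --skip-finished
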
@@ -3422,20 +3455,19 @@ def spot_queue(all: bool, refresh: bool, skip_finished: bool):
def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
"""Cancel managed spot jobs.
- You can provide either a job name or a list of job ids to be cancelled.
+ You can provide either a job name or a list of job IDs to be cancelled.
They are exclusive options.
Examples:
.. code-block:: bash
-   # Cancel managed spot job with name 'my-job'
-   $ sky spot cancel -n my-job
-   # Cancel managed spot jobs with IDs 1, 2, 3
-   $ sky spot cancel 1 2 3
+   # Cancel managed spot job with name 'my-job'
+   $ sky spot cancel -n my-job
+   \b
+   # Cancel managed spot jobs with IDs 1, 2, 3
+   $ sky spot cancel 1 2 3
"""

_, handle = spot_lib.is_spot_controller_up(
'All managed spot jobs should have finished.')
if handle is None:
@@ -3544,7 +3576,7 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:

@cli.group(cls=_NaturalOrderGroup)
def bench():
"""Sky Benchmark related commands."""
"""SkyPilot Benchmark CLI."""
pass


15 changes: 15 additions & 0 deletions sky/utils/cli_utils/status_utils.py
@@ -109,6 +109,21 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
reserved_group_name: Optional[str] = None):
"""Compute cluster table values and display for cost report.
+ For each cluster, this shows: cluster name, resources, launched time,
+ duration that cluster was up, and total estimated cost.
+ The estimated cost column indicates the price for the cluster based on the
+ type of resources being used and the duration of use up until now. This
+ means if the cluster is UP, successive calls to cost-report will show
+ increasing price.
+ The estimated cost is calculated based on the local cache of the cluster
+ status, and may not be accurate for:
+ - clusters with autostop/use_spot set; or
+ - clusters that were terminated/stopped on the cloud console.
Returns:
Number of pending auto{stop,down} clusters.
"""
