Show DEVICE_MEMORY in show-gpus for AWS & Lambda. (#1825)

* Show DEVICE_MEMORY in `show-gpus` for AWS & Lambda.
* Pylint.
* Address comments.
* Minor typo fixes.
skypilot-org · Apr 3, 2023 · 466691d
1 parent f8eeeea
commit 466691d
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 10 deletions.
diff --git a/sky/adaptors/gcp.py b/sky/adaptors/gcp.py
@@ -19,8 +19,8 @@ def wrapper(*args, **kwargs):
                 googleapiclient = _googleapiclient
                 google = _google
             except ImportError:
-                raise ImportError('Fail to import dependencies for GCP.'
-                                  'Try pip install "skypilot[gcp]"') from None
+                raise ImportError('Failed to import dependencies for GCP. '
+                                  'Try: pip install "skypilot[gcp]"') from None
         return func(*args, **kwargs)
 
     return wrapper

diff --git a/sky/cli.py b/sky/cli.py
@@ -2938,8 +2938,16 @@ def show_gpus(
     To show all accelerators, including less common ones and their detailed
     information, use ``sky show-gpus --all``.
 
-    NOTE: If region is not specified, the price displayed for each instance type
-    is the lowest across all regions for both on-demand and spot instances.
+    Definitions of certain fields:
+
+    * ``DEVICE_MEM``: Memory of a single device; does not depend on the device
+      count of the instance (VM).
+
+    * ``HOST_MEM``: Memory of the host instance (VM).
+
+    If ``--region`` is not specified, the price displayed for each instance
+    type is the lowest across all regions for both on-demand and spot
+    instances. There may be multiple regions with the same lowest price.
     """
     # validation for the --region flag
     if region is not None and cloud is None:
@@ -3012,8 +3020,9 @@ def _output():
                 'QTY',
                 'CLOUD',
                 'INSTANCE_TYPE',
+                'DEVICE_MEM',
                 'vCPUs',
-                'HOST_MEMORY',
+                'HOST_MEM',
                 'HOURLY_PRICE',
                 'HOURLY_SPOT_PRICE',
             ]
@@ -3032,7 +3041,9 @@ def _output():
                         cpu_str = str(int(cpu_count))
                     else:
                         cpu_str = f'{cpu_count:.1f}'
-                mem_str = f'{item.memory:.0f}GB' if not pd.isna(
+                device_memory_str = (f'{item.device_memory:.0f}GB' if
+                                     not pd.isna(item.device_memory) else '-')
+                host_memory_str = f'{item.memory:.0f}GB' if not pd.isna(
                     item.memory) else '-'
                 price_str = f'$ {item.price:.3f}' if not pd.isna(
                     item.price) else '-'
@@ -3044,8 +3055,9 @@ def _output():
                     item.accelerator_count,
                     item.cloud,
                     instance_type_str,
+                    device_memory_str,
                     cpu_str,
-                    mem_str,
+                    host_memory_str,
                     price_str,
                     spot_price_str,
                 ]

diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py
@@ -1,4 +1,5 @@
 """Common utilities for service catalog."""
+import ast
 import hashlib
 import os
 import time
@@ -31,6 +32,7 @@ class InstanceTypeInfo(NamedTuple):
     - accelerator_name: Canonical name of the accelerator. E.g. `V100`.
     - accelerator_count: Number of accelerators offered by this instance type.
     - cpu_count: Number of vCPUs offered by this instance type.
+    - device_memory: Device memory in GiB.
     - memory: Instance memory in GiB.
     - price: Regular instance price per hour (cheapest across all regions).
     - spot_price: Spot instance price per hour (cheapest across all regions).
@@ -41,6 +43,7 @@ class InstanceTypeInfo(NamedTuple):
     accelerator_name: str
     accelerator_count: int
     cpu_count: Optional[float]
+    device_memory: Optional[float]
     memory: Optional[float]
     price: float
     spot_price: float
@@ -178,8 +181,8 @@ def _get_candidate_str(loc: str, all_loc: List[str]) -> str:
 
     def _get_all_supported_regions_str() -> str:
         all_regions: List[str] = sorted(df['Region'].unique().tolist())
-        return \
-        f'\nList of supported {cloud_name} regions: {", ".join(all_regions)!r}'
+        return (f'\nList of supported {cloud_name} regions: '
+                f'{", ".join(all_regions)!r}')
 
     validated_region, validated_zone = region, zone
 
@@ -435,12 +438,25 @@ def list_accelerators_impl(
     """
     if gpus_only:
         df = df[~df['GpuInfo'].isna()]
+    df = df.copy()  # avoid column assignment warning
+
+    try:
+        gpu_info_df = df['GpuInfo'].apply(ast.literal_eval)
+        df['DeviceMemoryGiB'] = gpu_info_df.apply(
+            lambda row: row['Gpus'][0]['MemoryInfo']['SizeInMiB']) / 1024.0
+    except ValueError:
+        # TODO(zongheng,woosuk): GCP/Azure catalogs do not have well-formed
+        # GpuInfo fields. So the above will throw:
+        #  ValueError: malformed node or string: <_ast.Name object at ..>
+        df['DeviceMemoryGiB'] = None
+
     df = df[[
         'InstanceType',
         'AcceleratorName',
         'AcceleratorCount',
         'vCPUs',
-        'MemoryGiB',
+        'DeviceMemoryGiB',  # device memory
+        'MemoryGiB',  # host memory
         'Price',
         'SpotPrice',
         'Region',
@@ -470,6 +486,7 @@ def make_list_from_df(rows):
                 row['AcceleratorName'],
                 row['AcceleratorCount'],
                 row['vCPUs'],
+                row['DeviceMemoryGiB'],
                 row['MemoryGiB'],
                 row['Price'],
                 row['SpotPrice'],