skypilot-org · concretevitamin · Dec 7, 2023 · Dec 6, 2023 · Dec 6, 2023 · Michaelvll
diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py
@@ -36,6 +36,34 @@
 
 SINGLE_THREADED = False
 
+# Family name to SkyPilot GPU name mapping.
+#
+# When adding a new accelerator:
+# - The instance type is typically already fetched, but we need to find the
+#   family name and add it to this mapping.
+# - To inspect family names returned by Azure API, check the dataframes in
+#   get_all_regions_instance_types_df().
+FAMILY_NAME_TO_SKYPILOT_GPU_NAME = {
+    'standardNCFamily': 'K80',
+    'standardNCSv2Family': 'P100',
+    'standardNCSv3Family': 'V100',
+    'standardNCPromoFamily': 'K80',
+    'StandardNCASv3_T4Family': 'T4',
+    'standardNDSv2Family': 'V100-32GB',
+    'StandardNCADSA100v4Family': 'A100-80GB',
+    'standardNDAMSv4_A100Family': 'A100-80GB',
+    'StandardNDASv4_A100Family': 'A100',
+    'standardNVFamily': 'M60',
+    'standardNVSv2Family': 'M60',
+    'standardNVSv3Family': 'M60',
+    'standardNVPromoFamily': 'M60',
+    'standardNVSv4Family': 'Radeon MI25',
+    'standardNDSFamily': 'P40',
+    'StandardNVADSA10v5Family': 'A10',
+    'StandardNCadsH100v5Family': 'H100',
+    'standardNDSH100v5Family': 'H100',
+}
+
 
 def get_regions() -> List[str]:
     """Get all available regions."""
@@ -78,7 +106,7 @@ def get_pricing_url(region: Optional[str] = None) -> str:
 def get_pricing_df(region: Optional[str] = None) -> pd.DataFrame:
     all_items = []
     url = get_pricing_url(region)
-    print(f'Getting pricing for {region}')
+    print(f'Getting pricing for {region}, url: {url}')
     page = 0
     while url is not None:
         page += 1
@@ -125,29 +153,11 @@ def get_sku_df(region_set: Set[str]) -> pd.DataFrame:
 
 
 def get_gpu_name(family: str) -> Optional[str]:
-    gpu_data = {
-        'standardNCFamily': 'K80',
-        'standardNCSv2Family': 'P100',
-        'standardNCSv3Family': 'V100',
-        'standardNCPromoFamily': 'K80',
-        'StandardNCASv3_T4Family': 'T4',
-        'standardNDSv2Family': 'V100-32GB',
-        'StandardNCADSA100v4Family': 'A100-80GB',
-        'standardNDAMSv4_A100Family': 'A100-80GB',
-        'StandardNDASv4_A100Family': 'A100',
-        'standardNVFamily': 'M60',
-        'standardNVSv2Family': 'M60',
-        'standardNVSv3Family': 'M60',
-        'standardNVPromoFamily': 'M60',
-        'standardNVSv4Family': 'Radeon MI25',
-        'standardNDSFamily': 'P40',
-        'StandardNVADSA10v5Family': 'A10',
-    }
     # NP-series offer Xilinx U250 FPGAs which are not GPUs,
     # so we do not include them here.
     # https://docs.microsoft.com/en-us/azure/virtual-machines/np-series
     family = family.replace(' ', '')
-    return gpu_data.get(family)
+    return FAMILY_NAME_TO_SKYPILOT_GPU_NAME.get(family)
 
 
 def get_all_regions_instance_types_df(region_set: Set[str]):

diff --git a/sky/utils/accelerator_registry.py b/sky/utils/accelerator_registry.py
@@ -6,17 +6,22 @@
 # NOTE: Must include accelerators supported for local clusters.
 #
 # 1. What if a name is in this list, but not in any catalog?
+#
 # The name will be canonicalized, but the accelerator will not be supported.
 # Optimizer will print an error message.
+#
 # 2. What if a name is not in this list, but in a catalog?
+#
 # The list is simply an optimization to short-circuit the search in the catalog.
 # If the name is not found in the list, it will be searched in the catalog
 # with its case being ignored. If a match is found, the name will be
 # canonicalized to that in the catalog. Note that this lookup can be an
 # expensive operation, as it requires reading the catalog or making external
 # API calls (such as for Kubernetes). Thus it is desirable to keep this list
 # up-to-date with commonly used accelerators.
+#
 # 3. (For SkyPilot dev) What to do if I want to add a new accelerator?
+#
 # Append its case-sensitive canonical name to this list. The name must match
 # `AcceleratorName` in the service catalog, or what we define in
 # `onprem_utils.get_local_cluster_accelerators`.
@@ -42,6 +47,7 @@
     'Radeon MI25',
     'P4',
     'L4',
+    'H100',
 ]
 
 
@@ -72,11 +78,11 @@ def canonicalize_accelerator_name(accelerator: str) -> str:
     if len(names) == 1:
         return names[0]
 
-    # Do not print an error meessage here. Optimizer will handle it.
+    # Do not print an error message here. Optimizer will handle it.
     if len(names) == 0:
         return accelerator
 
-    # Currenlty unreachable.
+    # Currently unreachable.
     # This can happen if catalogs have the same accelerator with
     # different names (e.g., A10g and A10G).
     assert len(names) > 1