From 45341a8b3c081080e51c313e51c350fba72c61b7 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Mon, 21 Nov 2022 01:07:15 -0800
Subject: [PATCH] [Catalog] Remove hardcoded A2 pricing URL & Fix a bug in A2
 machine zones (#1426)

* Update no 16xA100-40GB zones

* [Catalog] Remove GCP A2 price URL & Fix GCP A100 zone issues

* Add more type annotations

* Minor

* yapf

* Do not add GCP URL prefix

* Minor

* Address comments

* Address comment1

* Minor

* Add comments about the case when a100.empty is True

* Assert not duplicated
---
 .../data_fetchers/fetch_gcp.py                | 261 +++++++++---------
 1 file changed, 123 insertions(+), 138 deletions(-)

diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
index a8a3a3bf2a9f..53a2a6488a28 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
@@ -6,13 +6,13 @@
 import argparse
 import os
 import re
+from typing import Dict, List, Optional, Tuple
 
 from lxml import html
 import pandas as pd
 import requests
 
 # pylint: disable=line-too-long
-GCP_URL = 'https://cloud.google.com'
 GCP_VM_PRICING_URL = 'https://cloud.google.com/compute/vm-instance-pricing'
 GCP_VM_ZONES_URL = 'https://cloud.google.com/compute/docs/regions-zones'
 GCP_GPU_PRICING_URL = 'https://cloud.google.com/compute/gpus-pricing'
@@ -40,49 +40,12 @@
     'K80': [1, 2, 4, 8],
 }
 
-# FIXME(woosuk): This URL can change.
-A2_PRICING_URL = '/compute/vm-instance-pricing_34568c2efd1858a89d6f5b0f1cdd171bbea1cdcba646e9771e6ef4028238086f.frame'  # pylint: disable=line-too-long
-A2_INSTANCE_TYPES = {
-    'a2-highgpu-1g': {
-        'vCPUs': 12,
-        'MemoryGiB': 85,
-    },
-    'a2-highgpu-2g': {
-        'vCPUs': 24,
-        'MemoryGiB': 170,
-    },
-    'a2-highgpu-4g': {
-        'vCPUs': 48,
-        'MemoryGiB': 340,
-    },
-    'a2-highgpu-8g': {
-        'vCPUs': 96,
-        'MemoryGiB': 680,
-    },
-    'a2-megagpu-16g': {
-        'vCPUs': 96,
-        'MemoryGiB': 1360,
-    },
-    'a2-ultragpu-1g': {
-        'vCPUs': 12,
-        'MemoryGiB': 170,
-    },
-    'a2-ultragpu-2g': {
-        'vCPUs': 24,
-        'MemoryGiB': 340,
-    },
-    'a2-ultragpu-4g': {
-        'vCPUs': 48,
-        'MemoryGiB': 680,
-    },
-    'a2-ultragpu-8g': {
-        'vCPUs': 96,
-        'MemoryGiB': 1360,
-    },
-}
-
-# Source: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones
-NO_A100_16G_ZONES = ['asia-northeast3-a', 'asia-northeast3-b', 'us-west4-b']
+# A2 VMs that support 16 A100 GPUs only appear in the following zones.
+# Source: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#limitations
+A2_MEGAGPU_16G_ZONES = [
+    'us-central1-a', 'us-central1-b', 'us-central1-c', 'us-central1-f',
+    'europe-west4-a', 'europe-west4-b', 'asia-southeast1-c'
+]
 
 # For the TPU catalog, we maintain our own location/pricing table.
 # NOTE: The CSV files do not completely align with the data in the websites.
@@ -114,13 +77,13 @@
 ]
 
 
-def get_iframe_sources(url):
+def get_iframe_sources(url: str) -> List[str]:
     page = requests.get(url)
     tree = html.fromstring(page.content)
     return tree.xpath('//iframe/@src')
 
 
-def get_regions(doc):
+def get_regions(doc: str) -> Dict[str, str]:
     # Get the dictionary of regions.
     # E.g., 'kr': 'asia-northeast3'
     regions = doc.xpath('//md-option')
@@ -134,7 +97,7 @@ def get_regions(doc):
 # TODO(woosuk): parallelize this function using Ray.
 # Currently, 'HTML parser error : Tag md-option invalid' is raised
 # when the function is parallelized by Ray.
-def get_vm_price_table(url):
+def get_vm_price_table(url: str) -> pd.DataFrame:
     page = requests.get(url)
     doc = html.fromstring(page.content)
     regions = get_regions(doc)
@@ -207,7 +170,7 @@ def get_vm_price_table(url):
     }
     df.rename(columns=column_remapping, inplace=True)
 
-    def parse_memory(memory_str):
+    def parse_memory(memory_str: str) -> float:
         if 'GB' in memory_str:
             return float(memory_str.replace('GB', ''))
         else:
@@ -215,7 +178,7 @@ def parse_memory(memory_str):
 
     pattern = re.compile(r'\$?(.*?)\s?/')
 
-    def parse_price(price_str):
+    def parse_price(price_str: str) -> float:
         if NOT_AVAILABLE_STR in price_str:
             return None
         try:
@@ -237,15 +200,11 @@ def parse_price(price_str):
     if df.empty:
         return None
 
-    instance_type = None
     if 'InstanceType' in df.columns:
         # Price table for pre-defined instance types.
-        instance_type = df['InstanceType'].iloc[0]
-        if instance_type in ['a2-highgpu-1g', 'a2-ultragpu-1g']:
-            # The A2 price table includes the GPU cost.
-            return None
+        # NOTE: The price of A2 machines includes the price of A100 GPUs,
+        # and thus is modified later by post_process_a2_price().
 
-        # Price table for specific VM types.
         df = df[[
             'InstanceType',
             'vCPUs',
@@ -268,7 +227,7 @@ def parse_price(price_str):
     return df
 
 
-def get_vm_zones(url):
+def get_vm_zones(url: str) -> pd.DataFrame:
     df = pd.read_html(url)[0]
     column_remapping = {
         'Zones': 'AvailabilityZone',
@@ -279,7 +238,7 @@ def get_vm_zones(url):
     # Remove unnecessary columns.
     df = df[['AvailabilityZone', 'MachineType']]
 
-    def parse_machine_type_list(list_str):
+    def parse_machine_type_list(list_str: str) -> List[str]:
         machine_types = list_str.split(', ')
         returns = []
         # Handle the typos in the GCP web page.
@@ -297,69 +256,40 @@ def parse_machine_type_list(list_str):
     # Explode the 'MachineType' column.
     df['MachineType'] = df['MachineType'].apply(parse_machine_type_list)
     df = df.explode('MachineType', ignore_index=True)
-    return df
 
+    # Check duplicates.
+    assert not df.duplicated().any()
+    return df
 
-def get_a2_df():
-    a2_pricing = get_vm_price_table(GCP_URL + A2_PRICING_URL)
-    cpu_pricing = a2_pricing[a2_pricing['Item'] == 'Predefined vCPUs']
-    memory_pricing = a2_pricing[a2_pricing['Item'] == 'Predefined Memory']
 
-    table = []
-    for region in a2_pricing['Region'].unique():
-        per_cpu_price = cpu_pricing[cpu_pricing['Region'] ==
-                                    region]['Price'].values[0]
-        per_cpu_spot_price = cpu_pricing[cpu_pricing['Region'] ==
-                                         region]['SpotPrice'].values[0]
-        per_memory_price = memory_pricing[memory_pricing['Region'] ==
-                                          region]['Price'].values[0]
-        per_memory_spot_price = memory_pricing[memory_pricing['Region'] ==
-                                               region]['SpotPrice'].values[0]
-
-        for instance_type, spec in A2_INSTANCE_TYPES.items():
-            cpu = spec['vCPUs']
-            memory = spec['MemoryGiB']
-            price = per_cpu_price * cpu + per_memory_price * memory
-            spot_price = (per_cpu_spot_price * cpu +
-                          per_memory_spot_price * memory)
-            table.append(
-                [instance_type, cpu, memory, price, spot_price, region])
-    a2_df = pd.DataFrame(table,
-                         columns=[
-                             'InstanceType',
-                             'vCPUs',
-                             'MemoryGiB',
-                             'Price',
-                             'SpotPrice',
-                             'Region',
-                         ])
-
-    a2_df['AcceleratorName'] = None
-    a2_df['AcceleratorCount'] = None
-    a2_df['GpuInfo'] = None
-    return a2_df
-
-
-def get_vm_df(region_prefix: str):
+def get_vm_df(region_prefix: str, a100_zones: List[str]) -> pd.DataFrame:
     """Generates the GCP service catalog for host VMs."""
     vm_price_table_urls = get_iframe_sources(GCP_VM_PRICING_URL)
     # Skip the table for "Suspended VM instances".
     vm_price_table_urls = vm_price_table_urls[:-1]
 
-    vm_dfs = [get_vm_price_table(GCP_URL + url) for url in vm_price_table_urls]
+    vm_dfs = [get_vm_price_table(url) for url in vm_price_table_urls]
     vm_dfs = [
         df for df in vm_dfs if df is not None and 'InstanceType' in df.columns
     ]
-
-    # Handle A2 instance types separately.
-    a2_df = get_a2_df()
-    vm_df = pd.concat(vm_dfs + [a2_df])
+    vm_df = pd.concat(vm_dfs)
 
     vm_zones = get_vm_zones(GCP_VM_ZONES_URL)
+    # Manually add A2 machines to the zones with A100 GPUs.
+    # This is necessary because GCP_VM_ZONES_URL may not be up to date.
+    df = pd.DataFrame.from_dict({
+        'AvailabilityZone': a100_zones,
+        'MachineType': 'A2',
+    })
+    vm_zones = pd.concat([vm_zones, df], ignore_index=True)
+    # vm_zones already includes some zones with A100 GPUs.
+    # When we merge it with a100_zones, we need to remove the duplicates.
+    vm_zones = vm_zones.drop_duplicates()
+
     # Remove regions not in the pricing data.
+    regions = vm_df['Region'].unique()
     zone_to_region = lambda x: x[:-2]
     vm_zones['Region'] = vm_zones['AvailabilityZone'].apply(zone_to_region)
-    regions = vm_df['Region'].unique()
     vm_zones = vm_zones[vm_zones['Region'].isin(regions)]
 
     # Define the MachineType column.
@@ -371,17 +301,18 @@ def get_vm_df(region_prefix: str):
 
     # Merge the dataframes.
     vm_df = pd.merge(vm_df, vm_zones, on=['Region', 'MachineType'])
+    # Check duplicates.
+    assert not vm_df[['InstanceType', 'AvailabilityZone']].duplicated().any()
 
     # Remove the MachineType column.
     vm_df.drop(columns=['MachineType'], inplace=True)
 
-    # Block non-US regions.
-    # FIXME(woosuk): Allow all regions.
+    # Drop regions without the given prefix.
     vm_df = vm_df[vm_df['Region'].str.startswith(region_prefix)]
     return vm_df
 
 
-def get_gpu_price_table(url):
+def get_gpu_price_table(url) -> pd.DataFrame:
     page = requests.get(url)
     doc = html.fromstring(page.content)
     regions = get_regions(doc)
@@ -443,7 +374,7 @@ def get_gpu_price_table(url):
     # Parse the prices.
     pattern = re.compile(r'\$?(.*?)\s?per GPU')
 
-    def parse_price(price_str):
+    def parse_price(price_str: str) -> float:
         if NOT_AVAILABLE_STR in price_str:
             return None
         try:
@@ -461,7 +392,7 @@ def parse_price(price_str):
     return df
 
 
-def get_gpu_zones(url):
+def get_gpu_zones(url) -> pd.DataFrame:
     page = requests.get(url)
     df = pd.read_html(page.text.replace('<br>', '\n'))[0]
     column_remapping = {
@@ -477,19 +408,14 @@ def get_gpu_zones(url):
     # Explode Availability Zone.
     df['AvailabilityZone'] = df['AvailabilityZone'].str.split(' ')
     df = df.explode('AvailabilityZone', ignore_index=True)
-
-    # Remove "(except a2-megagpu-16g)"
-    # The exceptional zones will be handled manually.
-    df['AcceleratorName'] = df['AcceleratorName'].apply(
-        lambda x: x.replace(' (except a2-megagpu-16g)', ''))
     return df
 
 
-def get_gpu_df(region_prefix: str):
+def get_gpu_df(region_prefix: str) -> pd.DataFrame:
     """Generates the GCP service catalog for GPUs."""
     gpu_price_table_url = get_iframe_sources(GCP_GPU_PRICING_URL)
     assert len(gpu_price_table_url) == 1
-    gpu_pricing = get_gpu_price_table(GCP_URL + gpu_price_table_url[0])
+    gpu_pricing = get_gpu_price_table(gpu_price_table_url[0])
     gpu_zones = get_gpu_zones(GCP_GPU_ZONES_URL)
 
     # Remove zones not in the pricing data.
@@ -520,10 +446,10 @@ def get_gpu_df(region_prefix: str):
     gpu_df['Price'] = gpu_df['AcceleratorCount'] * gpu_df['Price']
     gpu_df['SpotPrice'] = gpu_df['AcceleratorCount'] * gpu_df['SpotPrice']
 
-    # Consider the zones that do not have 16xA100 machines.
-    gpu_df = gpu_df[~(gpu_df['AvailabilityZone'].isin(NO_A100_16G_ZONES) &
-                      (gpu_df['AcceleratorName'] == 'A100') &
-                      (gpu_df['AcceleratorCount'] == 16))]
+    # 16xA100 is only supported in certain zones.
+    gpu_df = gpu_df[(gpu_df['AcceleratorName'] != 'A100') |
+                    (gpu_df['AcceleratorCount'] != 16) |
+                    (gpu_df['AvailabilityZone'].isin(A2_MEGAGPU_16G_ZONES))]
 
     # Add columns for the service catalog.
     gpu_df['InstanceType'] = None
@@ -531,13 +457,12 @@ def get_gpu_df(region_prefix: str):
     gpu_df['vCPUs'] = None
     gpu_df['MemoryGiB'] = None
 
-    # Block non-US regions.
-    # FIXME(woosuk): Allow all regions.
+    # Drop regions without the given prefix.
     gpu_df = gpu_df[gpu_df['Region'].str.startswith(region_prefix)]
     return gpu_df
 
 
-def get_tpu_df():
+def get_tpu_df() -> pd.DataFrame:
     """Generates the GCP service catalog for TPUs."""
     tpu_zones = pd.read_csv(GCP_TPU_ZONES_URL)
     tpu_pricing = pd.read_csv(GCP_TPU_PRICING_URL)
@@ -571,20 +496,67 @@ def get_tpu_df():
     return tpu_df
 
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--all-regions',
-        action='store_true',
-        help='Fetch all global regions, not just the U.S. ones.')
-    args = parser.parse_args()
-    region_prefix_filter = ALL_REGION_PREFIX if args.all_regions else US_REGION_PREFIX
-
-    processed_vm_df = get_vm_df(region_prefix_filter)
-    processed_gpu_df = get_gpu_df(region_prefix_filter)
-    processed_tpu_df = get_tpu_df()
-    catalog_df = pd.concat(
-        [processed_vm_df, processed_gpu_df, processed_tpu_df])
+def post_process_a2_price(catalog_df: pd.DataFrame) -> pd.DataFrame:
+    a100_df = catalog_df[catalog_df['AcceleratorName'].isin(
+        ['A100', 'A100-80GB'])]
+
+    def _deduct_a100_price(
+            row: pd.Series) -> Tuple[Optional[float], Optional[float]]:
+        instance_type = row['InstanceType']
+        if pd.isna(instance_type) or not instance_type.startswith('a2'):
+            return row['Price'], row['SpotPrice']
+
+        zone = row['AvailabilityZone']
+        a100_type = 'A100-80GB' if 'ultragpu' in instance_type else 'A100'
+        a100_count = int(instance_type.split('-')[-1][:-1])
+        a100 = a100_df[(a100_df['AcceleratorName'] == a100_type) &
+                       (a100_df['AcceleratorCount'] == a100_count) &
+                       (a100_df['AvailabilityZone'] == zone)]
+        if a100.empty:
+            # Invalid.
+            # The A2 VM is not actually supported in this zone.
+            # The row is dropped later.
+
+            # This happens because GCP_VM_PRICING_URL shows region-wise price,
+            # and GCP_VM_ZONES_URL only tells whether the zone has any A2 VM.
+            # Thus, for example, if zone X in a region only supports A100-40GB
+            # while another zone Y in the same region supports A100-80GB,
+            # it will appear in GCP_VM_PRICING_URL that the region supports
+            # both A100-40GB and A100-80GB. And in GCP_VM_ZONES_URL zone X
+            # will be said to support A2 VMs. In such a case, we do not know
+            # whether zone X supports both A100 GPUs or only one of them.
+            # We need to refer to GCP_GPU_ZONES_URL to know that zone X only
+            # supports A100-40GB. Thus, in get_vm_df(), we add both a2-highgpu
+            # (for A100-40GB) and a2-ultragpu (for A100-80GB) to zone X.
+            # Then in this post-processing step, we nullify the A2 VMs
+            # that are not supported in zone X.
+
+            # This also filters out a2-megagpu-16g VMs in zones that do not
+            # support 16xA100.
+            return None, None
+
+        price = row['Price'] - a100['Price'].iloc[0]
+        spot_price = row['SpotPrice'] - a100['SpotPrice'].iloc[0]
+        return price, spot_price
+
+    catalog_df[['Price', 'SpotPrice']] = catalog_df.apply(_deduct_a100_price,
+                                                          axis=1,
+                                                          result_type='expand')
+    # Remove invalid A2 instances.
+    catalog_df = catalog_df[catalog_df['InstanceType'].str.startswith('a2').
+                            ne(True) | (catalog_df['Price'].notna())]
+    return catalog_df
+
+
+def get_catalog_df(region_prefix: str) -> pd.DataFrame:
+    """Generates the GCP catalog by combining CPU, GPU, and TPU catalogs."""
+    gpu_df = get_gpu_df(region_prefix)
+    df = gpu_df[gpu_df['AcceleratorName'].isin(['A100', 'A100-80GB'])]
+    a100_zones = df['AvailabilityZone'].unique().tolist()
+    vm_df = get_vm_df(region_prefix, a100_zones)
+    tpu_df = get_tpu_df()
+    catalog_df = pd.concat([vm_df, gpu_df, tpu_df])
+    catalog_df = post_process_a2_price(catalog_df)
 
     # Filter out unsupported VMs from the catalog.
     for vm in UNSUPPORTED_VMS:
@@ -594,7 +566,20 @@ def get_tpu_df():
 
     # Reorder the columns.
     catalog_df = catalog_df[COLUMNS]
+    return catalog_df
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--all-regions',
+        action='store_true',
+        help='Fetch all global regions, not just the U.S. ones.')
+    args = parser.parse_args()
+
+    region_prefix_filter = ALL_REGION_PREFIX if args.all_regions else US_REGION_PREFIX
+    gcp_catalog_df = get_catalog_df(region_prefix_filter)
 
     os.makedirs('gcp', exist_ok=True)
-    catalog_df.to_csv('gcp/vms.csv', index=False)
+    gcp_catalog_df.to_csv('gcp/vms.csv', index=False)
     print('GCP Service Catalog saved to gcp/vms.csv')