Add all region option in catalog fetcher and speed up azure fetcher #1204

Merged on Oct 11, 2022 (31 commits). The diff below shows changes from 27 of the 31 commits.

Commits:
6411b3a  Port changes (Michaelvll, Oct 6, 2022)
7b81e38  format (Michaelvll, Oct 6, 2022)
27938ac  Merge branch 'master' of github.com:concretevitamin/sky-experiments i… (Michaelvll, Oct 6, 2022)
7546a32  add t2a exclusion back (Michaelvll, Oct 6, 2022)
4416ca6  fix A100 for GCP (Michaelvll, Oct 6, 2022)
dcca1e3  fix aws fetching for p4de.24xlarge (Michaelvll, Oct 6, 2022)
0c1c31d  Fill GPUInfo (Michaelvll, Oct 6, 2022)
deeba69  fix (Michaelvll, Oct 6, 2022)
820434f  address part of comments (Michaelvll, Oct 9, 2022)
8d06ad9  address comments (Michaelvll, Oct 9, 2022)
fcf00a3  add test for A100 (Michaelvll, Oct 9, 2022)
0fca40f  patch GpuInfo (Michaelvll, Oct 9, 2022)
7af5c32  Add generation info (Michaelvll, Oct 9, 2022)
a83ea77  Add capabilities back to azure and fix aws (Michaelvll, Oct 9, 2022)
a49becd  fix azure catalog (Michaelvll, Oct 9, 2022)
c603224  format (Michaelvll, Oct 9, 2022)
42d51d3  lint (Michaelvll, Oct 9, 2022)
54869e2  remove zone from azure (Michaelvll, Oct 9, 2022)
95d9355  fix azure (Michaelvll, Oct 9, 2022)
0251b00  Add analyze for csv (Michaelvll, Oct 9, 2022)
893de58  update catalog analysis (Michaelvll, Oct 9, 2022)
3bc4503  format (Michaelvll, Oct 9, 2022)
39b8a25  backward compatible for azure_catalog (Michaelvll, Oct 9, 2022)
fdb56c8  yapf (Michaelvll, Oct 9, 2022)
a802c85  fix GCP catalog (Michaelvll, Oct 9, 2022)
349caed  fix A100-80GB (Michaelvll, Oct 9, 2022)
1a83602  format (Michaelvll, Oct 9, 2022)
6e839db  increase version number (Michaelvll, Oct 10, 2022)
6c3acd2  only keep useful columns for aws (Michaelvll, Oct 11, 2022)
068d2e8  remove capabilities from azure (Michaelvll, Oct 11, 2022)
c543aaf  add az to AWS (Michaelvll, Oct 11, 2022)
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@ docs/build/
docs/_build/
build/
sky_logs/
+sky/clouds/service_catalog/data_fetchers/*.csv
7 changes: 6 additions & 1 deletion sky/clouds/gcp.py
@@ -235,7 +235,12 @@ def make_deploy_resources_variables(
             else:
                 # Convert to GCP names:
                 # https://cloud.google.com/compute/docs/gpus
-                resources_vars['gpu'] = 'nvidia-tesla-{}'.format(acc.lower())
+                if acc == 'A100-80GB':
+                    # A100-80GB has a different name pattern.
+                    resources_vars['gpu'] = 'nvidia-{}'.format(acc.lower())
+                else:
+                    resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
+                        acc.lower())
                 resources_vars['gpu_count'] = acc_count
                 if acc == 'K80':
                     # CUDA driver version 470.57.02, CUDA Library 11.4
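For reference, a minimal sketch of the naming rule this branch implements (gcp_gpu_name is a hypothetical helper, not part of the diff; GCP's accelerator names are listed at https://cloud.google.com/compute/docs/gpus):

    def gcp_gpu_name(acc: str) -> str:
        # A100-80GB is the one accelerator that does not follow the
        # 'nvidia-tesla-<name>' pattern on GCP.
        if acc == 'A100-80GB':
            return 'nvidia-{}'.format(acc.lower())  # nvidia-a100-80gb
        return 'nvidia-tesla-{}'.format(acc.lower())  # e.g. nvidia-tesla-v100

    assert gcp_gpu_name('A100-80GB') == 'nvidia-a100-80gb'
    assert gcp_gpu_name('V100') == 'nvidia-tesla-v100'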
4 changes: 4 additions & 0 deletions sky/clouds/service_catalog/azure_catalog.py
@@ -68,6 +68,10 @@ def get_region_zones_for_instance_type(instance_type: str,


def get_gen_version_from_instance_type(instance_type: str) -> Optional[int]:
+    if 'Generation' in _df.columns:
+        return _df[_df['InstanceType'] == instance_type]['Generation'].iloc[0]
+
+    # Backward compatibility for the older catalog.
     cell = _df[_df['InstanceType'] == instance_type]['capabilities'].iloc[0]
     cap_list = ast.literal_eval(cell)
     gen_version = None
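The fallback path above is truncated; it parses the stringified capabilities list from the older catalog. A minimal sketch of what that parsing might look like, assuming each cell holds a literal list of name/value dicts with a HyperVGenerations entry (the exact key names and the rest of the function are not shown in this diff):

    import ast

    def parse_gen_version(cell: str):
        # cell looks like "[{'name': 'HyperVGenerations', 'value': 'V1,V2'}]"
        for cap in ast.literal_eval(cell):
            if cap.get('name') == 'HyperVGenerations':
                # 'V1,V2' -> 2: report the highest supported generation.
                return max(int(v.lstrip('V')) for v in cap['value'].split(','))
        return None

    assert parse_gen_version("[{'name': 'HyperVGenerations', 'value': 'V1,V2'}]") == 2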
54 changes: 54 additions & 0 deletions sky/clouds/service_catalog/data_fetchers/analyze.py
@@ -0,0 +1,54 @@
from typing import List
import pandas as pd

from sky.clouds.service_catalog import common


def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame,
                  check_tuple: List[str]) -> pd.DataFrame:
    """Returns the rows that appear in new_df but not in original_df."""
    original_resources = original_df[check_tuple]
    new_resources = new_df[check_tuple]

    return new_resources.merge(
        original_resources, on=check_tuple, how='left',
        indicator=True)[lambda x: x['_merge'] == 'left_only'].sort_values(
            by=check_tuple)


CLOUD_CHECKS = {
    'aws': ['InstanceType', 'Region', 'AvailabilityZone'],
    'azure': ['InstanceType', 'Region'],
    'gcp': ['InstanceType', 'Region', 'AcceleratorName', 'AcceleratorCount']
}

table = {}

for cloud in CLOUD_CHECKS:
    result = {}
    print(f'=> Checking {cloud}')
    original_df = common.read_catalog(f'{cloud}.csv')
    new_df = pd.read_csv(f'{cloud}.csv')

    current_check_tuple = CLOUD_CHECKS[cloud]

    diff_df = resource_diff(original_df, new_df, current_check_tuple)
    diff_df.merge(new_df, on=current_check_tuple,
                  how='left').to_csv(f'{cloud}_diff.csv', index=False)

    result['#resources'] = len(diff_df)

    check_price = current_check_tuple + ['Price']
    diff_df = resource_diff(original_df, new_df, check_price)
    result['#prices'] = len(diff_df)

    check_price = current_check_tuple + ['SpotPrice']
    diff_df = resource_diff(original_df, new_df, check_price)
    result['#spot_prices'] = len(diff_df)

    table[cloud] = result

summary = pd.DataFrame(table).T
summary.to_csv('diff_summary.csv')
print(summary)
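resource_diff relies on pandas' merge indicator: after a left join, rows that exist only in the new catalog come back tagged 'left_only'. A toy example of the idiom, with made-up instance data:

    import pandas as pd

    old = pd.DataFrame({'InstanceType': ['m5.large'], 'Region': ['us-east-1']})
    new = pd.DataFrame({'InstanceType': ['m5.large', 'p4de.24xlarge'],
                        'Region': ['us-east-1', 'us-east-1']})
    keys = ['InstanceType', 'Region']
    merged = new.merge(old, on=keys, how='left', indicator=True)
    # Only the newly added p4de.24xlarge row is tagged 'left_only'.
    print(merged[merged['_merge'] == 'left_only'][keys])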
188 changes: 119 additions & 69 deletions sky/clouds/service_catalog/data_fetchers/fetch_aws.py
@@ -1,17 +1,43 @@
"""A script that queries AWS API to get instance types and pricing information.

This script takes about 1 minute to finish.
"""
import datetime
-from typing import Tuple
+from typing import Tuple, Union

import numpy as np
import pandas as pd
import ray

from sky.adaptors import aws

-REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
+ALL_REGIONS = [
+    'us-east-1',
+    'us-east-2',
+    'us-west-1',
+    'us-west-2',
+    'ca-central-1',
+    'eu-central-1',
+    'eu-west-1',
+    'eu-west-2',
+    'eu-south-1',
+    'eu-west-3',
+    'eu-north-1',
+    'me-south-1',
+    # 'me-central-1', # failed for no credential
+    'af-south-1',
+    'ap-east-1',
+    'ap-southeast-3',
+    # 'ap-south-1', # failed for no credential
+    'ap-northeast-3',
+    'ap-northeast-2',
+    'ap-southeast-1',
+    'ap-southeast-2',
+    'ap-northeast-1',
+]
+US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
+
+REGIONS = US_REGIONS
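The diff defaults to REGIONS = US_REGIONS; the "all region" option in the PR title presumably switches this to ALL_REGIONS. That wiring is not visible in this hunk, so the following is only a hypothetical sketch with an assumed flag name:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--all-regions', action='store_true',
                        help='Fetch all regions instead of only US ones.')
    args = parser.parse_args()
    REGIONS = ALL_REGIONS if args.all_regions else US_REGIONS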

# NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
# only available in this region, but it serves pricing information for all regions.
PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv'
@@ -44,12 +70,18 @@ def get_pricing_table(region: str) -> pd.DataFrame:
     print(f'{region} downloading pricing table')
     url = PRICING_TABLE_URL_FMT.format(region=region)
     df = pd.read_csv(url, skiprows=5, low_memory=False)
+    df.rename(columns={
+        'Instance Type': 'InstanceType',
+        'PricePerUnit': 'Price',
+    },
+              inplace=True)
     return df[(df['TermType'] == 'OnDemand') &
               (df['Operating System'] == 'Linux') &
               df['Pre Installed S/W'].isnull() &
               (df['CapacityStatus'] == 'Used') &
-              (df['Tenancy'].isin(['Host', 'Shared'])) &
-              df['PricePerUnit'] > 0].set_index('Instance Type')
+              (df['Tenancy'].isin(['Host', 'Shared'])) & (df['Price'] > 0)][[
+                  'InstanceType', 'Price', 'vCPU', 'Memory'
+              ]]


@ray.remote
@@ -62,78 +94,96 @@ def get_spot_pricing_table(region: str) -> pd.DataFrame:
     ret = []
     for response in response_iterator:
         ret = ret + response['SpotPriceHistory']
-    df = pd.DataFrame(ret).set_index(['InstanceType', 'AvailabilityZone'])
+    df = pd.DataFrame(ret)[['InstanceType', 'AvailabilityZone', 'SpotPrice']]
+    df = df.set_index(['InstanceType', 'AvailabilityZone'])
     return df


@ray.remote
-def get_instance_types_df(region: str) -> pd.DataFrame:
-    df, zone_df, pricing_df, spot_pricing_df = ray.get([
-        get_instance_types.remote(region),
-        get_availability_zones.remote(region),
-        get_pricing_table.remote(region),
-        get_spot_pricing_table.remote(region)
-    ])
-    print(f'{region} Processing dataframes')
-
-    def get_price(row):
-        t = row['InstanceType']
-        try:
-            return pricing_df.loc[t]['PricePerUnit']
-        except KeyError:
-            return np.nan
-
-    def get_spot_price(row):
-        instance = row['InstanceType']
-        zone = row['AvailabilityZone']
-        try:
-            return spot_pricing_df.loc[(instance, zone)]['SpotPrice']
-        except KeyError:
-            return np.nan
-
-    def get_acc_info(row) -> Tuple[str, float]:
-        accelerator = None
-        for col, info_key in [('GpuInfo', 'Gpus'),
-                              ('InferenceAcceleratorInfo', 'Accelerators'),
-                              ('FpgaInfo', 'Fpgas')]:
-            info = row.get(col)
-            if isinstance(info, dict):
-                accelerator = info[info_key][0]
-        if accelerator is None:
-            return None, np.nan
-        return accelerator['Name'], accelerator['Count']
-
-    def get_vcpus(row) -> float:
-        return float(row['VCpuInfo']['DefaultVCpus'])
-
-    def get_memory_gib(row) -> float:
-        return row['MemoryInfo']['SizeInMiB'] // 1024
-
-    def get_additional_columns(row):
-        acc_name, acc_count = get_acc_info(row)
-        # AWS p3dn.24xlarge offers a different V100 GPU.
-        # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/
-        if row['InstanceType'] == 'p3dn.24xlarge':
-            acc_name = 'V100-32GB'
-        return pd.Series({
-            'Price': get_price(row),
-            'SpotPrice': get_spot_price(row),
-            'AcceleratorName': acc_name,
-            'AcceleratorCount': acc_count,
-            'vCPUs': get_vcpus(row),
-            'MemoryGiB': get_memory_gib(row),
-        })
-
-    df['Region'] = region
-    df = df.merge(pd.DataFrame(zone_df), how='cross')
-    df = pd.concat([df, df.apply(get_additional_columns, axis='columns')],
-                   axis='columns')
+def get_instance_types_df(region: str) -> Union[str, pd.DataFrame]:
+    try:
+        df, zone_df, pricing_df, spot_pricing_df = ray.get([
+            get_instance_types.remote(region),
+            get_availability_zones.remote(region),
+            get_pricing_table.remote(region),
+            get_spot_pricing_table.remote(region),
+        ])
+        print(f'{region} Processing dataframes')
+
+        def get_acc_info(row) -> Tuple[str, float]:
+            accelerator = None
+            for col, info_key in [('GpuInfo', 'Gpus'),
+                                  ('InferenceAcceleratorInfo', 'Accelerators'),
+                                  ('FpgaInfo', 'Fpgas')]:
+                info = row.get(col)
+                if isinstance(info, dict):
+                    accelerator = info[info_key][0]
+            if accelerator is None:
+                return None, np.nan
+            return accelerator['Name'], accelerator['Count']
+
+        def get_vcpus(row) -> float:
+            if not np.isnan(row['vCPU']):
+                return float(row['vCPU'])
+            return float(row['VCpuInfo']['DefaultVCpus'])
+
+        def get_memory_gib(row) -> float:
+            if isinstance(row['MemoryInfo'], dict):
+                return row['MemoryInfo']['SizeInMiB'] // 1024
+            return int(row['Memory'].split(' GiB')[0])
+
+        def get_additional_columns(row) -> pd.Series:
+            acc_name, acc_count = get_acc_info(row)
+            # AWS p3dn.24xlarge offers a different V100 GPU.
+            # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/
+            if row['InstanceType'] == 'p3dn.24xlarge':
+                acc_name = 'V100-32GB'
+            if row['InstanceType'] == 'p4de.24xlarge':
+                acc_name = 'A100-80GB'
+                acc_count = 8
Comment (Collaborator): Just curious: why do we need to override acc_count from get_acc_info?

Reply (Michaelvll, author): describe_instance_types from the AWS API does not include p4de.24xlarge. Since we merge the pricing table with the instance-types table (which is where the GpuInfo used by get_acc_info comes from) via an outer join to pick up those missing instance types, their GpuInfo is null, so we have to set the accelerator name and count ourselves here.
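A toy illustration of that effect (made-up rows): after the outer merge, an instance type present only in the pricing table has NaN in every column that comes from describe_instance_types, including GpuInfo:

    import pandas as pd

    api_df = pd.DataFrame({
        'InstanceType': ['p3.2xlarge'],
        'GpuInfo': [{'Gpus': [{'Name': 'V100', 'Count': 1}]}],
    })
    pricing_df = pd.DataFrame({
        'InstanceType': ['p3.2xlarge', 'p4de.24xlarge'],
        'Price': [3.06, 40.96],
    })
    merged = api_df.merge(pricing_df, on=['InstanceType'], how='outer')
    # GpuInfo is NaN for p4de.24xlarge, hence the manual override above.
    print(merged[merged['GpuInfo'].isna()]['InstanceType'].tolist())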

+            return pd.Series({
+                'AcceleratorName': acc_name,
+                'AcceleratorCount': acc_count,
+                'vCPUs': get_vcpus(row),
+                'MemoryGiB': get_memory_gib(row),
+            })
+
+        # The AWS API may not return every instance type that appears in the
+        # pricing table, so merge the two dataframes with an outer join.
+        df = df.merge(pricing_df, on=['InstanceType'], how='outer')
+        df['Region'] = region
+        # Cartesian product of instance types and availability zones, so that
+        # we can join the spot pricing table per instance type and zone.
+        df = df.merge(pd.DataFrame(zone_df), how='cross')
+
+        # Add the spot price column by joining the spot pricing table.
+        df = df.merge(spot_pricing_df,
+                      left_on=['InstanceType', 'AvailabilityZone'],
+                      right_index=True,
+                      how='outer')
+
+        # Extract vCPUs, memory, and accelerator info from the columns.
+        df = pd.concat(
+            [df, df.apply(get_additional_columns, axis='columns')],
+            axis='columns')
+        # Patch the GpuInfo for p4de.24xlarge.
+        df.loc[df['InstanceType'] == 'p4de.24xlarge', 'GpuInfo'] = 'A100-80GB'
+    except Exception as e:
+        print(f'{region} failed with {e}')
+        return region
+    return df


 def get_all_regions_instance_types_df():
-    dfs = ray.get([get_instance_types_df.remote(r) for r in REGIONS])
-    df = pd.concat(dfs)
+    df_or_regions = ray.get([get_instance_types_df.remote(r) for r in REGIONS])
+    new_dfs = []
+    for df_or_region in df_or_regions:
+        if isinstance(df_or_region, str):
+            print(f'{df_or_region} failed')
+        else:
+            new_dfs.append(df_or_region)
+
+    df = pd.concat(new_dfs)
     df.sort_values(['InstanceType', 'Region'], inplace=True)
     return df
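For context, a plausible way to drive the fetcher (the script's entry point is not shown in this diff, so the invocation and output file name below are assumptions):

    import ray

    ray.init()
    df = get_all_regions_instance_types_df()
    df.to_csv('aws.csv', index=False)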
