[Global Regions] Add data fetchers into wheel #1406

Merged · 5 commits · Nov 17, 2022
13 changes: 7 additions & 6 deletions docs/source/reference/faq.rst
@@ -10,7 +10,7 @@ Frequently Asked Questions


Can I clone private GitHub repositories in a task's ``setup`` commands?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Yes, provided you have `set up SSH agent forwarding <https://docs.github.com/en/developers/overview/using-ssh-agent-forwarding>`_.
For example, run the following on your laptop:
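A minimal sketch of the usual agent setup (the key path is an assumption; use whichever key GitHub knows about):

eval "$(ssh-agent -s)"   # start an agent if one is not already running
ssh-add ~/.ssh/id_rsa    # assumed key path

With the agent holding your key, clusters launched from this machine can forward it for ``git clone`` during ``setup``.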
@@ -31,7 +31,7 @@ Then, any SkyPilot clusters launched from this machine would be able to clone pr
Note: cloning private repositories in the ``run`` commands is currently not supported.

How to mount additional files into a cloned repository?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you want to mount additional files into a path that will be ``git clone``-ed (either in ``setup`` or ``run``), cloning will fail and complain that the target path is not empty:

@@ -58,7 +58,7 @@ To get around this, mount the files to a different path, then symlink to them.
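A hedged sketch of that workaround in a task YAML (all paths and the repo URL are placeholders):

file_mounts:
  /tmp/extra-files: ./extra        # stage the files outside the repo path

setup: |
  git clone git@github.com:me/repo.git ~/repo
  ln -s /tmp/extra-files ~/repo/extra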


How to make SkyPilot clusters use my Weights & Biases credentials?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Install the wandb library on your laptop and login to your account via ``wandb login``.
Then, add the following lines in your task yaml file:
@@ -69,15 +69,15 @@ Then, add the following lines in your task yaml file:
~/.netrc: ~/.netrc
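For context, that entry lives under the task's ``file_mounts`` section; a minimal sketch:

file_mounts:
  ~/.netrc: ~/.netrc

``wandb login`` stores the API key in ``~/.netrc``, so mounting that file onto the cluster lets remote runs authenticate as you.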

How to update an existing cluster's ``file_mounts`` without rerunning ``setup``?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If you have edited the ``file_mounts`` section (e.g., by adding some files) and would like to have it reflected on an existing cluster, running ``sky launch -c <cluster> ..`` would work, but it would rerun the ``setup`` commands.

To avoid rerunning the ``setup`` commands, pass the ``--no-setup`` flag to ``sky launch``.
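For example (the cluster and YAML names are placeholders):

sky launch --no-setup -c mycluster task.yaml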


(Advanced) How to make SkyPilot use all global regions?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

By default, SkyPilot only supports the US regions on different clouds for convenience. If you want to utilize all global regions, please run the following command:

@@ -88,6 +88,7 @@ By default, SkyPilot only supports the US regions on different clouds for conven
# Fetch all regions for AWS
python -m sky.clouds.service_catalog.data_fetchers.fetch_aws --all-regions
# Fetch all regions for GCP
+pip install lxml
python -m sky.clouds.service_catalog.data_fetchers.fetch_gcp --all-regions
# Fetch all regions for Azure
python -m sky.clouds.service_catalog.data_fetchers.fetch_azure --all-regions
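Each fetcher writes its catalog into a per-cloud directory (``aws/vms.csv``, ``azure/vms.csv``, etc., per the fetcher code later in this diff). A quick hedged sanity check with pandas (the ``Region`` column name is an assumption):

import pandas as pd

df = pd.read_csv('aws/vms.csv')       # written by fetch_aws
print(sorted(df['Region'].unique()))  # should now include non-US regions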
@@ -97,7 +98,7 @@ To make your managed spot jobs potentially use all global regions, please log in


(Advanced) How to edit or update the regions or pricing information used by SkyPilot?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

SkyPilot stores regions and pricing information for different cloud resource types in CSV files known as
`"service catalogs" <https://github.com/skypilot-org/skypilot-catalog>`_.
Empty file.
16 changes: 9 additions & 7 deletions sky/clouds/service_catalog/data_fetchers/analyze.py
@@ -1,4 +1,5 @@
-import copy
+"""Compare the newly fetched catalog with the original."""

from typing import Tuple
import pandas as pd

@@ -28,23 +29,24 @@ def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame,
for cloud in CLOUD_CHECKS:
    result = {}
    print(f'=> Checking {cloud}')
-    original_df = common.read_catalog(f'{cloud}.csv')
-    new_df = pd.read_csv(f'{cloud}.csv')
+    original_catalog_df = common.read_catalog(f'{cloud}.csv')
+    new_catalog_df = pd.read_csv(f'{cloud}.csv')

    current_check_tuple = CLOUD_CHECKS[cloud]

-    diff_df = resource_diff(original_df, new_df, current_check_tuple)
-    diff_df.merge(new_df, on=current_check_tuple,
+    diff_df = resource_diff(original_catalog_df, new_catalog_df,
+                            current_check_tuple)
+    diff_df.merge(new_catalog_df, on=current_check_tuple,
                   how='left').to_csv(f'{cloud}_diff.csv', index=False)

    result['#resources'] = len(diff_df)

    check_price = current_check_tuple + ['Price']
-    diff_df = resource_diff(original_df, new_df, check_price)
+    diff_df = resource_diff(original_catalog_df, new_catalog_df, check_price)
    result['#prices'] = len(diff_df)

    check_price = current_check_tuple + ['SpotPrice']
-    diff_df = resource_diff(original_df, new_df, check_price)
+    diff_df = resource_diff(original_catalog_df, new_catalog_df, check_price)
    result['#spot_prices'] = len(diff_df)

    table[cloud] = result
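The body of resource_diff is collapsed above; a plausible sketch of such a diff using a pandas left merge with an indicator (the name and exact semantics are assumptions):

import pandas as pd

def resource_diff_sketch(original_df: pd.DataFrame, new_df: pd.DataFrame,
                         check_cols: list) -> pd.DataFrame:
    # Rows whose key columns appear in the original catalog but not in the
    # newly fetched one.
    merged = original_df.merge(new_df[check_cols].drop_duplicates(),
                               on=check_cols, how='left', indicator=True)
    return merged.loc[merged['_merge'] == 'left_only', check_cols]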
29 changes: 16 additions & 13 deletions sky/clouds/service_catalog/data_fetchers/fetch_aws.py
@@ -47,8 +47,9 @@
]

# NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
-# only available in this region, but it serves pricing information for all regions.
-PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv'
+# only available in this region, but it serves pricing information for all
+# regions.
+PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv' # pylint: disable=line-too-long


@ray.remote
@@ -143,7 +144,7 @@ def get_memory_gib(row) -> float:
def get_additional_columns(row) -> pd.Series:
    acc_name, acc_count = get_acc_info(row)
    # AWS p3dn.24xlarge offers a different V100 GPU.
-    # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/
+    # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/ # pylint: disable=line-too-long
    if row['InstanceType'] == 'p3dn.24xlarge':
        acc_name = 'V100-32GB'
    if row['InstanceType'] == 'p4de.24xlarge':
@@ -177,7 +178,7 @@ def get_additional_columns(row) -> pd.Series:
        # patch the GpuInfo for p4de.24xlarge
        df.loc[df['InstanceType'] == 'p4de.24xlarge', 'GpuInfo'] = 'A100-80GB'
        df = df[USEFUL_COLUMNS]
-    except Exception as e:
+    except Exception as e:  # pylint: disable=broad-except
        print(f'{region} failed with {e}')
        return region
    return df
@@ -224,13 +225,12 @@ def get_image_id(region: str, ubuntu_version: str, creation_date: str) -> str:
""",
shell=True)
except subprocess.CalledProcessError as e:
print(
f'Failed {region}, {ubuntu_version}, {creation_date}. Trying next date.'
)
print(f'Failed {region}, {ubuntu_version}, {creation_date}. '
'Trying next date.')
print(f'{type(e)}: {e}')
image_id = None
else:
image_id = image_id.decode("utf-8").strip()
image_id = image_id.decode('utf-8').strip()
return image_id


@@ -239,6 +239,7 @@ def get_image_row(region: str, ubuntu_version: str,
                  cpu_or_gpu: str) -> Tuple[str, str, str, str, str, str]:
    print(f'Getting image for {region}, {ubuntu_version}, {cpu_or_gpu}')
    creation_date = _GPU_TO_IMAGE_DATE[cpu_or_gpu]
+    date = None
    for date in creation_date:
        image_id = get_image_id(region, ubuntu_version, date)
        if image_id:
@@ -248,6 +249,8 @@
        print(
            f'Failed to find image for {region}, {ubuntu_version}, {cpu_or_gpu}'
        )
+    if date is None:
+        raise ValueError(f'Could not find the creation date for {cpu_or_gpu}.')
    tag = f'skypilot:{cpu_or_gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
    return tag, region, 'ubuntu', ubuntu_version, image_id, date
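The added `date = None` above is a loop sentinel: if the creation-date list is empty, the `for` body never runs, and the function can now fail with a clear `ValueError` instead of an `UnboundLocalError` at the `return`. A minimal illustration of the pattern:

dates = []   # imagine an empty creation-date list
date = None  # sentinel, mirroring the change above
for date in dates:
    break
if date is None:
    raise ValueError('no creation date to try')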

@@ -275,14 +278,14 @@ def get_all_regions_images_df() -> pd.DataFrame:
        help='Fetch all global regions, not just the U.S. ones.')
    args = parser.parse_args()

-    regions = ALL_REGIONS if args.all_regions else US_REGIONS
+    region_filter = ALL_REGIONS if args.all_regions else US_REGIONS

    ray.init()
-    df = get_all_regions_instance_types_df(regions)
+    instance_df = get_all_regions_instance_types_df(region_filter)
    os.makedirs('aws', exist_ok=True)
-    df.to_csv('aws/vms.csv', index=False)
+    instance_df.to_csv('aws/vms.csv', index=False)
    print('AWS Service Catalog saved to aws/vms.csv')

-    df = get_all_regions_images_df()
-    df.to_csv('aws/images.csv', index=False)
+    image_df = get_all_regions_images_df()
+    image_df.to_csv('aws/images.csv', index=False)
    print('AWS Images saved to aws/images.csv')
27 changes: 14 additions & 13 deletions sky/clouds/service_catalog/data_fetchers/fetch_azure.py
@@ -31,7 +31,8 @@
def get_regions() -> Tuple[str]:
    """Get all available regions."""
    proc = subprocess.run(
-        'az account list-locations --query "[?not_null(metadata.latitude)] .{RegionName:name , RegionDisplayName:regionalDisplayName}" -o json',
+        'az account list-locations --query "[?not_null(metadata.latitude)] '
+        '.{RegionName:name , RegionDisplayName:regionalDisplayName}" -o json',
        shell=True,
        check=True,
        stdout=subprocess.PIPE)
@@ -56,12 +57,12 @@ def get_regions() -> Tuple[str]:

def get_pricing_url(region: Optional[str] = None) -> str:
    filters = [
-        "serviceName eq 'Virtual Machines'",
-        "priceType eq 'Consumption'",
+        'serviceName eq \'Virtual Machines\'',
+        'priceType eq \'Consumption\'',
    ]
    if region is not None:
-        filters.append(f"armRegionName eq '{region}'")
+        filters.append(f'armRegionName eq \'{region}\'')
-    filters_str = urllib.parse.quote(" and ".join(filters))
+    filters_str = urllib.parse.quote(' and '.join(filters))
    return f'https://prices.azure.com/api/retail/prices?$filter={filters_str}'
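A hedged usage sketch for get_pricing_url: the Azure retail prices API pages its results, with `Items` and `NextPageLink` fields in each response (per the public API docs):

import requests

url = get_pricing_url('eastus')
items = []
while url:
    page = requests.get(url).json()
    items.extend(page['Items'])
    url = page.get('NextPageLink')  # None on the last page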


@@ -99,15 +100,15 @@ def get_all_regions_pricing_df(regions: Set[str]) -> pd.DataFrame:

@ray.remote
def get_sku_df(region_set: Set[str]) -> pd.DataFrame:
-    print(f'Fetching SKU list')
+    print('Fetching SKU list')
    # To get a complete list, --all option is necessary.
    proc = subprocess.run(
-        f'az vm list-skus --all',
+        'az vm list-skus --all',
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
    )
-    print(f'Done fetching SKUs')
+    print('Done fetching SKUs')
    items = json.loads(proc.stdout.decode('ascii'))
    filtered_items = []
    for item in items:
@@ -154,7 +155,7 @@ def get_all_regions_instance_types_df(region_set: Set[str]):
        get_all_regions_pricing_df.remote(region_set),
        get_sku_df.remote(region_set),
    ])
-    print(f'Processing dataframes')
+    print('Processing dataframes')
    df.drop_duplicates(inplace=True)

    df = df[df['unitPrice'] > 0]
@@ -248,10 +249,10 @@ def get_additional_columns(row):
    args = parser.parse_args()

    ray.init()
-    regions = get_regions() if args.all_regions else US_REGIONS
-    regions = set(regions)
+    region_filter = get_regions() if args.all_regions else US_REGIONS
+    region_filter = set(region_filter)

-    df = get_all_regions_instance_types_df(regions)
+    instance_df = get_all_regions_instance_types_df(region_filter)
    os.makedirs('azure', exist_ok=True)
-    df.to_csv('azure/vms.csv', index=False)
+    instance_df.to_csv('azure/vms.csv', index=False)
    print('Azure Service Catalog saved to azure/vms.csv')
23 changes: 13 additions & 10 deletions sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
@@ -11,6 +11,7 @@
import pandas as pd
import requests

+# pylint: disable=line-too-long
GCP_URL = 'https://cloud.google.com'
GCP_VM_PRICING_URL = 'https://cloud.google.com/compute/vm-instance-pricing'
GCP_VM_ZONES_URL = 'https://cloud.google.com/compute/docs/regions-zones'
@@ -319,7 +320,8 @@ def get_a2_df():
        cpu = spec['vCPUs']
        memory = spec['MemoryGiB']
        price = per_cpu_price * cpu + per_memory_price * memory
-        spot_price = per_cpu_spot_price * cpu + per_memory_spot_price * memory
+        spot_price = (per_cpu_spot_price * cpu +
+                      per_memory_spot_price * memory)
        table.append(
            [instance_type, cpu, memory, price, spot_price, region])
    a2_df = pd.DataFrame(table,
@@ -564,8 +566,8 @@ def get_tpu_df():
    # Add columns for the service catalog.
    tpu_df['InstanceType'] = None
    tpu_df['GpuInfo'] = tpu_df['AcceleratorName']
-    gpu_df['vCPUs'] = None
-    gpu_df['MemoryGiB'] = None
+    tpu_df['vCPUs'] = None
+    tpu_df['MemoryGiB'] = None
    return tpu_df


@@ -576,18 +578,19 @@ def get_tpu_df():
        action='store_true',
        help='Fetch all global regions, not just the U.S. ones.')
    args = parser.parse_args()
-    region_prefix = ALL_REGION_PREFIX if args.all_regions else US_REGION_PREFIX
+    region_prefix_filter = ALL_REGION_PREFIX if args.all_regions else US_REGION_PREFIX

-    vm_df = get_vm_df(region_prefix)
-    gpu_df = get_gpu_df(region_prefix)
-    tpu_df = get_tpu_df()
-    catalog_df = pd.concat([vm_df, gpu_df, tpu_df])
+    processed_vm_df = get_vm_df(region_prefix_filter)
+    processed_gpu_df = get_gpu_df(region_prefix_filter)
+    processed_tpu_df = get_tpu_df()
+    catalog_df = pd.concat(
+        [processed_vm_df, processed_gpu_df, processed_tpu_df])

    # Filter out unsupported VMs from the catalog.
    for vm in UNSUPPORTED_VMS:
        # NOTE: The `InstanceType` column can be NaN.
-        catalog_df = catalog_df[
-            catalog_df['InstanceType'].str.startswith(vm) != True]
+        catalog_df = catalog_df[catalog_df['InstanceType'].str.startswith(
+            vm).ne(True)]
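Why `.ne(True)` rather than `== False`: `str.startswith` yields NaN for rows whose `InstanceType` is NaN (e.g., TPUs), and `.ne(True)` keeps those rows while dropping only confirmed matches — the same behavior as the old `!= True`, minus the pylint singleton-comparison warning. A small demonstration:

import numpy as np
import pandas as pd

s = pd.Series(['a2-highgpu-1g', np.nan])
mask = s.str.startswith('a2')   # [True, NaN]
print(mask.ne(True).tolist())   # [False, True]: the NaN row is kept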

    # Reorder the columns.
    catalog_df = catalog_df[COLUMNS]
2 changes: 1 addition & 1 deletion sky/setup_files/MANIFEST.in
@@ -6,5 +6,5 @@ include sky/skylet/providers/gcp/*
include sky/skylet/ray_patches/*.patch
include sky/templates/*
include sky/setup_files/*
-include sky/utils/*
include sky/skylet/LICENCE
+exclude sky/clouds/service_catalog/data_fetchers/analyze.py