From 3bf64e7f7c154c0b94c2dca4544a4c364e20ded6 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Sat, 12 Nov 2022 15:53:44 -0800
Subject: [PATCH 1/5] Add data fetchers into wheel

---
 .../service_catalog/data_fetchers/__init__.py |  0
 .../service_catalog/data_fetchers/analyze.py  | 16 ++++++-----
 .../data_fetchers/fetch_aws.py                | 27 ++++++++++---------
 .../data_fetchers/fetch_azure.py              | 27 ++++++++++---------
 .../data_fetchers/fetch_gcp.py                | 22 ++++++++-------
 5 files changed, 49 insertions(+), 43 deletions(-)
 create mode 100644 sky/clouds/service_catalog/data_fetchers/__init__.py

diff --git a/sky/clouds/service_catalog/data_fetchers/__init__.py b/sky/clouds/service_catalog/data_fetchers/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/sky/clouds/service_catalog/data_fetchers/analyze.py b/sky/clouds/service_catalog/data_fetchers/analyze.py
index c9de9e9a06c..c31234da710 100644
--- a/sky/clouds/service_catalog/data_fetchers/analyze.py
+++ b/sky/clouds/service_catalog/data_fetchers/analyze.py
@@ -1,4 +1,5 @@
-import copy
+"""Analyze the new catalog fetched with the original."""
+
 from typing import Tuple
 
 import pandas as pd
@@ -28,23 +29,24 @@ def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame,
 for cloud in CLOUD_CHECKS:
     result = {}
     print(f'=> Checking {cloud}')
-    original_df = common.read_catalog(f'{cloud}.csv')
-    new_df = pd.read_csv(f'{cloud}.csv')
+    original_catalog_df = common.read_catalog(f'{cloud}.csv')
+    new_catalog_df = pd.read_csv(f'{cloud}.csv')
 
     current_check_tuple = CLOUD_CHECKS[cloud]
 
-    diff_df = resource_diff(original_df, new_df, current_check_tuple)
-    diff_df.merge(new_df, on=current_check_tuple,
+    diff_df = resource_diff(original_catalog_df, new_catalog_df,
+                            current_check_tuple)
+    diff_df.merge(new_catalog_df, on=current_check_tuple,
                   how='left').to_csv(f'{cloud}_diff.csv', index=False)
 
     result['#resources'] = len(diff_df)
 
     check_price = current_check_tuple + ['Price']
-    diff_df = resource_diff(original_df, new_df, check_price)
+    diff_df = resource_diff(original_catalog_df, new_catalog_df, check_price)
     result['#prices'] = len(diff_df)
 
     check_price = current_check_tuple + ['SpotPrice']
-    diff_df = resource_diff(original_df, new_df, check_price)
+    diff_df = resource_diff(original_catalog_df, new_catalog_df, check_price)
     result['#spot_prices'] = len(diff_df)
 
     table[cloud] = result
diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
index 24e08ebbc73..6bc905f2aa4 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
@@ -47,8 +47,9 @@
 ]
 
 # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
-# only available in this region, but it serves pricing information for all regions.
-PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv'
+# only available in this region, but it serves pricing information for all
+# regions.
+PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv'  # pylint: disable=line-too-long
 
 
 @ray.remote
@@ -143,7 +144,7 @@ def get_memory_gib(row) -> float:
     def get_additional_columns(row) -> pd.Series:
         acc_name, acc_count = get_acc_info(row)
         # AWS p3dn.24xlarge offers a different V100 GPU.
-        # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/
+        # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/  # pylint: disable=line-too-long
         if row['InstanceType'] == 'p3dn.24xlarge':
             acc_name = 'V100-32GB'
         if row['InstanceType'] == 'p4de.24xlarge':
@@ -177,7 +178,7 @@ def get_additional_columns(row) -> pd.Series:
         # patch the GpuInfo for p4de.24xlarge
         df.loc[df['InstanceType'] == 'p4de.24xlarge', 'GpuInfo'] = 'A100-80GB'
         df = df[USEFUL_COLUMNS]
-    except Exception as e:
+    except Exception as e:  # pylint: disable=broad-except
         print(f'{region} failed with {e}')
         return region
     return df
@@ -224,13 +225,12 @@ def get_image_id(region: str, ubuntu_version: str, creation_date: str) -> str:
         """,
                                            shell=True)
     except subprocess.CalledProcessError as e:
-        print(
-            f'Failed {region}, {ubuntu_version}, {creation_date}. Trying next date.'
-        )
+        print(f'Failed {region}, {ubuntu_version}, {creation_date}. '
+              'Trying next date.')
         print(f'{type(e)}: {e}')
         image_id = None
     else:
-        image_id = image_id.decode("utf-8").strip()
+        image_id = image_id.decode('utf-8').strip()
     return image_id
 
 
@@ -239,6 +239,7 @@ def get_image_row(region: str, ubuntu_version: str,
                   cpu_or_gpu: str) -> Tuple[str, str, str, str, str, str]:
     print(f'Getting image for {region}, {ubuntu_version}, {cpu_or_gpu}')
     creation_date = _GPU_TO_IMAGE_DATE[cpu_or_gpu]
+    date = None
     for date in creation_date:
         image_id = get_image_id(region, ubuntu_version, date)
         if image_id:
@@ -275,14 +276,14 @@ def get_all_regions_images_df() -> pd.DataFrame:
         help='Fetch all global regions, not just the U.S. ones.')
     args = parser.parse_args()
 
-    regions = ALL_REGIONS if args.all_regions else US_REGIONS
+    region_filter = ALL_REGIONS if args.all_regions else US_REGIONS
 
     ray.init()
-    df = get_all_regions_instance_types_df(regions)
+    instance_df = get_all_regions_instance_types_df(region_filter)
     os.makedirs('aws', exist_ok=True)
-    df.to_csv('aws/vms.csv', index=False)
+    instance_df.to_csv('aws/vms.csv', index=False)
     print('AWS Service Catalog saved to aws/vms.csv')
 
-    df = get_all_regions_images_df()
-    df.to_csv('aws/images.csv', index=False)
+    image_df = get_all_regions_images_df()
+    image_df.to_csv('aws/images.csv', index=False)
     print('AWS Images saved to aws/images.csv')
diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py
index 48572573743..594bb08b2b8 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py
@@ -31,7 +31,8 @@ def get_regions() -> Tuple[str]:
     """Get all available regions."""
     proc = subprocess.run(
-        'az account list-locations --query "[?not_null(metadata.latitude)] .{RegionName:name , RegionDisplayName:regionalDisplayName}" -o json',
+        'az account list-locations --query "[?not_null(metadata.latitude)] '
+        '.{RegionName:name , RegionDisplayName:regionalDisplayName}" -o json',
         shell=True,
         check=True,
         stdout=subprocess.PIPE)
@@ -56,12 +57,12 @@ def get_regions() -> Tuple[str]:
 def get_pricing_url(region: Optional[str] = None) -> str:
     filters = [
-        "serviceName eq 'Virtual Machines'",
-        "priceType eq 'Consumption'",
+        'serviceName eq \'Virtual Machines\'',
+        'priceType eq \'Consumption\'',
     ]
     if region is not None:
-        filters.append(f"armRegionName eq '{region}'")
-    filters_str = urllib.parse.quote(" and ".join(filters))
+        filters.append(f'armRegionName eq \'{region}\'')
+    filters_str = urllib.parse.quote(' and '.join(filters))
     return f'https://prices.azure.com/api/retail/prices?$filter={filters_str}'
@@ -99,15 +100,15 @@ def get_all_regions_pricing_df(regions: Set[str]) -> pd.DataFrame:
 
 @ray.remote
 def get_sku_df(region_set: Set[str]) -> pd.DataFrame:
-    print(f'Fetching SKU list')
+    print('Fetching SKU list')
     # To get a complete list, --all option is necessary.
     proc = subprocess.run(
-        f'az vm list-skus --all',
+        'az vm list-skus --all',
         shell=True,
         check=True,
         stdout=subprocess.PIPE,
     )
-    print(f'Done fetching SKUs')
+    print('Done fetching SKUs')
     items = json.loads(proc.stdout.decode('ascii'))
     filtered_items = []
     for item in items:
@@ -154,7 +155,7 @@ def get_all_regions_instance_types_df(region_set: Set[str]):
         get_all_regions_pricing_df.remote(region_set),
         get_sku_df.remote(region_set),
     ])
-    print(f'Processing dataframes')
+    print('Processing dataframes')
     df.drop_duplicates(inplace=True)
     df = df[df['unitPrice'] > 0]
 
@@ -248,10 +249,10 @@ def get_additional_columns(row):
     args = parser.parse_args()
 
     ray.init()
-    regions = get_regions() if args.all_regions else US_REGIONS
-    regions = set(regions)
+    region_filter = get_regions() if args.all_regions else US_REGIONS
+    region_filter = set(region_filter)
 
-    df = get_all_regions_instance_types_df(regions)
+    instance_df = get_all_regions_instance_types_df(region_filter)
     os.makedirs('azure', exist_ok=True)
-    df.to_csv('azure/vms.csv', index=False)
+    instance_df.to_csv('azure/vms.csv', index=False)
     print('Azure Service Catalog saved to azure/vms.csv')
diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
index 81cc02bd44a..9bb7c47bd45 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
@@ -11,6 +11,7 @@
 import pandas as pd
 import requests
 
+# pylint: disable=line-too-long
 GCP_URL = 'https://cloud.google.com'
 GCP_VM_PRICING_URL = 'https://cloud.google.com/compute/vm-instance-pricing'
 GCP_VM_ZONES_URL = 'https://cloud.google.com/compute/docs/regions-zones'
@@ -319,7 +320,8 @@ def get_a2_df():
             cpu = spec['vCPUs']
             memory = spec['MemoryGiB']
             price = per_cpu_price * cpu + per_memory_price * memory
-            spot_price = per_cpu_spot_price * cpu + per_memory_spot_price * memory
+            spot_price = (per_cpu_spot_price * cpu +
+                          per_memory_spot_price * memory)
             table.append(
                 [instance_type, cpu, memory, price, spot_price, region])
     a2_df = pd.DataFrame(table,
@@ -564,8 +566,8 @@ def get_tpu_df():
     # Add columns for the service catalog.
     tpu_df['InstanceType'] = None
     tpu_df['GpuInfo'] = tpu_df['AcceleratorName']
-    gpu_df['vCPUs'] = None
-    gpu_df['MemoryGiB'] = None
+    tpu_df['vCPUs'] = None
+    tpu_df['MemoryGiB'] = None
     return tpu_df
 
 
@@ -576,18 +578,18 @@ def get_tpu_df():
         action='store_true',
         help='Fetch all global regions, not just the U.S. ones.')
     args = parser.parse_args()
-    region_prefix = ALL_REGION_PREFIX if args.all_regions else US_REGION_PREFIX
+    region_prefix_filter = ALL_REGION_PREFIX if args.all_regions else US_REGION_PREFIX
 
-    vm_df = get_vm_df(region_prefix)
-    gpu_df = get_gpu_df(region_prefix)
-    tpu_df = get_tpu_df()
-    catalog_df = pd.concat([vm_df, gpu_df, tpu_df])
+    processed_vm_df = get_vm_df(region_prefix_filter)
+    processed_gpu_df = get_gpu_df(region_prefix_filter)
+    processed_tpu_df = get_tpu_df()
+    catalog_df = pd.concat([processed_vm_df, processed_gpu_df, processed_tpu_df])
 
     # Filter out unsupported VMs from the catalog.
     for vm in UNSUPPORTED_VMS:
         # NOTE: The `InstanceType` column can be NaN.
-        catalog_df = catalog_df[
-            catalog_df['InstanceType'].str.startswith(vm) != True]
+        catalog_df = catalog_df[not catalog_df['InstanceType'].str.startswith(vm
+                                                                              )]
 
     # Reorder the columns.
     catalog_df = catalog_df[COLUMNS]

From 809a8c2aac49d5700031f0086bd03f89a25bf9fb Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Sat, 12 Nov 2022 16:05:07 -0800
Subject: [PATCH 2/5] yapf

---
 sky/clouds/service_catalog/data_fetchers/fetch_gcp.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
index 9bb7c47bd45..e227c3b02cf 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
@@ -583,7 +583,8 @@ def get_tpu_df():
     processed_vm_df = get_vm_df(region_prefix_filter)
     processed_gpu_df = get_gpu_df(region_prefix_filter)
     processed_tpu_df = get_tpu_df()
-    catalog_df = pd.concat([processed_vm_df, processed_gpu_df, processed_tpu_df])
+    catalog_df = pd.concat(
+        [processed_vm_df, processed_gpu_df, processed_tpu_df])
 
     # Filter out unsupported VMs from the catalog.
     for vm in UNSUPPORTED_VMS:

From 51fdfd6cd34fdf9b553ed8d1dea156daa71752d8 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Tue, 15 Nov 2022 14:09:48 -0800
Subject: [PATCH 3/5] Fix gcp fetcher

---
 docs/source/reference/faq.rst                   | 13 +++++++------
 .../service_catalog/data_fetchers/fetch_gcp.py |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/source/reference/faq.rst b/docs/source/reference/faq.rst
index f5dc1bed368..3625aaf7e7f 100644
--- a/docs/source/reference/faq.rst
+++ b/docs/source/reference/faq.rst
@@ -10,7 +10,7 @@ Frequently Asked Questions
 
 
 Can I clone private GitHub repositories in a task's ``setup`` commands?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Yes, provided you have `set up SSH agent forwarding `_.
 For example, run the following on your laptop:
@@ -31,7 +31,7 @@ Then, any SkyPilot clusters launched from this machine would be able to clone pr
 Note: currently, cloning private repositories in the ``run`` commands is not supported yet.
 
 How to mount additional files into a cloned repository?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 If you want to mount additional files into a path that will be ``git clone``-ed (either in ``setup`` or ``run``), cloning will fail and complain that the target path is not empty:
@@ -58,7 +58,7 @@ To get around this, mount the files to a different path, then symlink to them.
 
 
 How to make SkyPilot clusters use my Weights & Biases credentials?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Install the wandb library on your laptop and login to your account via ``wandb login``.
 Then, add the following lines in your task yaml file:
@@ -69,7 +69,7 @@ Then, add the following lines in your task yaml file:
     ~/.netrc: ~/.netrc
 
 How to update an existing cluster's ``file_mounts`` without rerunning ``setup``?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 If you have edited the ``file_mounts`` section (e.g., by adding some files) and would like to have it reflected on an existing cluster, running ``sky launch -c ..`` would work, but it would rerun the ``setup`` commands.
@@ -77,7 +77,7 @@ To avoid rerunning the ``setup`` commands, pass the ``--no-setup`` flag to ``sky
 
 
 (Advanced) How to make SkyPilot use all global regions?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 By default, SkyPilot only supports the US regions on different clouds for convenience. If you want to utilize all global regions, please run the following command:
@@ -88,6 +88,7 @@ By default, SkyPilot only supports the US regions on different clouds for conven
     # Fetch all regions for AWS
     python -m sky.clouds.service_catalog.data_fetchers.fetch_aws --all-regions
     # Fetch all regions for GCP
+    pip install lxml
     python -m sky.clouds.service_catalog.data_fetchers.fetch_gcp --all-regions
     # Fetch all regions for Azure
     python -m sky.clouds.service_catalog.data_fetchers.fetch_azure --all-regions
@@ -97,7 +98,7 @@ To make your managed spot jobs potentially use all global regions, please log in
 
 
 (Advanced) How to edit or update the regions or pricing information used by SkyPilot?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 SkyPilot stores regions and pricing information for different cloud resource types in CSV files known as `"service catalogs" `_.
diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
index e227c3b02cf..a8a3a3bf2a9 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
@@ -589,8 +589,8 @@ def get_tpu_df():
     # Filter out unsupported VMs from the catalog.
     for vm in UNSUPPORTED_VMS:
         # NOTE: The `InstanceType` column can be NaN.
-        catalog_df = catalog_df[not catalog_df['InstanceType'].str.startswith(vm
-                                                                              )]
+        catalog_df = catalog_df[catalog_df['InstanceType'].str.startswith(
+            vm).ne(True)]
 
     # Reorder the columns.
     catalog_df = catalog_df[COLUMNS]

From 547d67c2e6804c03ecefb04484109d4088db73e6 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Wed, 16 Nov 2022 14:52:14 -0800
Subject: [PATCH 4/5] Add check

---
 sky/clouds/service_catalog/data_fetchers/fetch_aws.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
index 6bc905f2aa4..a2dfc0e99ab 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
@@ -249,6 +249,8 @@ def get_image_row(region: str, ubuntu_version: str,
         print(
             f'Failed to find image for {region}, {ubuntu_version}, {cpu_or_gpu}'
         )
+    if date is None:
+        raise ValueError(f'Could not find the creation date for {cpu_or_gpu}.')
     tag = f'skypilot:{cpu_or_gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
     return tag, region, 'ubuntu', ubuntu_version, image_id, date
 

From 891e1ca6fbd25f50e57b2d779c2387a34eddae94 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Wed, 16 Nov 2022 16:55:29 -0800
Subject: [PATCH 5/5] exclude analyze.py

---
 sky/setup_files/MANIFEST.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sky/setup_files/MANIFEST.in b/sky/setup_files/MANIFEST.in
index 8f0ed6bc271..25962c69116 100644
--- a/sky/setup_files/MANIFEST.in
+++ b/sky/setup_files/MANIFEST.in
@@ -6,5 +6,5 @@ include sky/skylet/providers/gcp/*
 include sky/skylet/ray_patches/*.patch
 include sky/templates/*
 include sky/setup_files/*
-include sky/utils/*
 include sky/skylet/LICENCE
+exclude sky/clouds/service_catalog/data_fetchers/analyze.py
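
A quick way to see the end result of this series is to list which data fetchers actually land inside a locally built wheel. The snippet below is a minimal sketch and not part of the patches; the dist/skypilot-*.whl glob is an assumption about where a local build places the wheel.

# Sketch: inspect a locally built wheel and list the packaged data fetchers.
# Assumes a wheel has already been built under dist/ (hypothetical path).
import glob
import zipfile

wheel_path = sorted(glob.glob('dist/skypilot-*.whl'))[-1]
with zipfile.ZipFile(wheel_path) as whl:
    names = whl.namelist()

fetchers = sorted(n for n in names if 'service_catalog/data_fetchers/' in n)
print('\n'.join(fetchers))
print('analyze.py packaged?',
      any(n.endswith('data_fetchers/analyze.py') for n in fetchers))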