From 341c73b0d39b5409712427b8daa428531a1d2643 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 23:11:30 -0700 Subject: [PATCH 1/9] set the default iops to be same as console for AWS --- sky/templates/aws-ray.yml.j2 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index da3fac0717a..3b4430e9a15 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -26,10 +26,12 @@ available_node_types: node_config: InstanceType: {{instance_type}} ImageId: {{image_id}} # Deep Learning AMI (Ubuntu 18.04); see aws.py. + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: VolumeSize: {{disk_size}} + Iops: 3000 # use default Iops for gp3 {% if use_spot %} InstanceMarketOptions: MarketType: spot @@ -49,6 +51,7 @@ available_node_types: - DeviceName: /dev/sda1 Ebs: VolumeSize: {{disk_size}} + Iops: 3000 # use default Iops for gp3 {% if use_spot %} InstanceMarketOptions: MarketType: spot From 1b55709db63be2a5fc85ee397e1dd53c6db7a188 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 23:44:44 -0700 Subject: [PATCH 2/9] fix --- sky/templates/aws-ray.yml.j2 | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 3b4430e9a15..0880be270ad 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -31,7 +31,10 @@ available_node_types: - DeviceName: /dev/sda1 Ebs: VolumeSize: {{disk_size}} - Iops: 3000 # use default Iops for gp3 + # use default Iops for gp3 + VolumeType: gp3 + Iops: 3000 + Throughput: 125 {% if use_spot %} InstanceMarketOptions: MarketType: spot @@ -51,7 +54,9 @@ available_node_types: - DeviceName: /dev/sda1 Ebs: VolumeSize: {{disk_size}} - Iops: 3000 # use default Iops for gp3 + VolumeType: gp3 + Iops: 3000 + Throughput: 125 {% if use_spot %} InstanceMarketOptions: MarketType: spot From 541174034f4b8ad035e6ae2193877afba126ccb7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 11 Oct 2022 22:25:49 -0700 Subject: [PATCH 3/9] add backward compatibility --- sky/backends/backend_utils.py | 54 ++++++++++++++++++++++++++++++++--- sky/utils/common_utils.py | 1 + 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 51acef19ef8..bb7fc32dbd8 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -31,6 +31,7 @@ from ray.autoscaler._private import util as ray_util import rich.console as rich_console import rich.progress as rich_progress +import yaml import sky from sky import authentication as auth @@ -109,12 +110,22 @@ # Remote dir that holds our runtime files. _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files' +_RAY_YAML_KEYS_TO_RESTORE_FOR_BACK_COMPATIBILITY = {'Ebs'} + def is_ip(s: str) -> bool: """Returns whether this string matches IP_ADDR_REGEX.""" return len(re.findall(IP_ADDR_REGEX, s)) == 1 +def _get_yaml_path_from_cluster_name(cluster_name: str, + prefix: str = SKY_USER_FILE_PATH) -> str: + output_path = pathlib.Path( + prefix).expanduser().resolve() / f'{cluster_name}.yml' + os.makedirs(output_path.parents[0], exist_ok=True) + return str(output_path) + + def fill_template(template_name: str, variables: Dict, output_path: Optional[str] = None, @@ -129,10 +140,8 @@ def fill_template(template_name: str, if output_path is None: assert ('cluster_name' in variables), ('cluster_name is required.') cluster_name = variables.get('cluster_name') - output_path = pathlib.Path( - output_prefix).expanduser() / f'{cluster_name}.yml' - os.makedirs(output_path.parents[0], exist_ok=True) - output_path = str(output_path) + output_path = _get_yaml_path_from_cluster_name(cluster_name, + output_prefix) output_path = os.path.abspath(output_path) # Add yaml file path to the template variables. @@ -655,6 +664,27 @@ def _remove_multinode_config( break +def _restore_original_block_for_yaml(new_yaml: str, old_yaml: str, + key_names: Set[str]) -> str: + """Restore the original block with key_name recursively.""" + + def _restore_block(new_block: Dict[str, Any], old_block: Dict[str, Any]): + for key, value in new_block.items(): + if key in key_names: + if key in old_block: + new_block[key] = old_block[key] + else: + del new_block[key] + elif isinstance(value, dict): + if key in old_block: + _restore_block(value, old_block[key]) + + new_config = yaml.safe_load(new_yaml) + old_config = yaml.safe_load(old_yaml) + _restore_block(new_config, old_config) + return common_utils.dump_yaml_str(new_config) + + # TODO: too many things happening here - leaky abstraction. Refactor. @timeline.event def write_cluster_config(to_provision: 'resources.Resources', @@ -700,6 +730,12 @@ def write_cluster_config(to_provision: 'resources.Resources', auth_config = onprem_utils.get_local_auth_config(cluster_name) region_name = resources_vars.get('region') + yaml_path = _get_yaml_path_from_cluster_name(cluster_name) + old_yaml_content = None + if os.path.exists(yaml_path): + with open(yaml_path, 'r') as f: + old_yaml_content = f.read() + yaml_path = fill_template( cluster_config_template, dict( @@ -750,6 +786,16 @@ def write_cluster_config(to_provision: 'resources.Resources', # been fully tested yet. _optimize_file_mounts(yaml_path) + # Restore the old yaml content for backward compatibility. + if old_yaml_content is not None: + with open(yaml_path, 'r') as f: + new_yaml_content = f.read() + restored_yaml_content = _restore_original_block_for_yaml( + old_yaml_content, new_yaml_content, + _RAY_YAML_KEYS_TO_RESTORE_FOR_BACK_COMPATIBILITY) + with open(yaml_path, 'w') as f: + f.write(restored_yaml_content) + usage_lib.messages.usage.update_ray_yaml(yaml_path) # For TPU nodes. TPU VMs do not need TPU_NAME. if (resources_vars.get('tpu_type') is not None and diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py index 6e5f7798681..a533415675d 100644 --- a/sky/utils/common_utils.py +++ b/sky/utils/common_utils.py @@ -13,6 +13,7 @@ import time import uuid from typing import Dict, List, Union + import yaml from sky import sky_logging From 9d870214868b8a8e59046e02eb341c4601c85acd Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 13 Oct 2022 00:18:22 -0700 Subject: [PATCH 4/9] Address comments --- sky/backends/backend_utils.py | 30 +++++++++++++++++++++++----- sky/backends/cloud_vm_ray_backend.py | 3 ++- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index bb7fc32dbd8..87216a68032 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -110,7 +110,18 @@ # Remote dir that holds our runtime files. _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files' -_RAY_YAML_KEYS_TO_RESTORE_FOR_BACK_COMPATIBILITY = {'Ebs'} +# Include the fields that will be used for generating tags that distinguishes the +# cluster in ray, to avoid the stopped cluster being discarded due to updates in +# the yaml template. +# Some notes on the fields: +# - 'provider' fields will be used for bootstrapping and insert more new items in +# 'node_config'. +# - keeping the auth is not enough becuase the content of the key file will be used +# for calculating the hash. +# TODO(zhwu): Keep in sync with the fields used in https://github.com/ray-project/ray/blob/e4ce38d001dbbe09cd21c497fedd03d692b2be3e/python/ray/autoscaler/_private/commands.py#L687-L701 +_RAY_YAML_KEYS_TO_RESTORE_FOR_BACK_COMPATIBILITY = { + 'provider', 'auth', 'node_config' +} def is_ip(s: str) -> bool: @@ -666,7 +677,12 @@ def _remove_multinode_config( def _restore_original_block_for_yaml(new_yaml: str, old_yaml: str, key_names: Set[str]) -> str: - """Restore the original block with key_name recursively.""" + """Replaces 'new' with 'old' for all keys in key_names. + + The replacement will be applied recursively and only for the blocks + with the key in key_names, and have the same ancestors in both 'new' + and 'old' YAML tree. + """ def _restore_block(new_block: Dict[str, Any], old_block: Dict[str, Any]): for key, value in new_block.items(): @@ -696,7 +712,8 @@ def write_cluster_config(to_provision: 'resources.Resources', region: Optional[clouds.Region] = None, zones: Optional[List[clouds.Zone]] = None, auth_config: Optional[Dict[str, str]] = None, - dryrun: bool = False) -> Dict[str, str]: + dryrun: bool = False, + force_overwrite: bool = True) -> Dict[str, str]: """Fills in cluster configuration templates and writes them out. Returns: {provisioner: path to yaml, the provisioning spec}. @@ -733,8 +750,11 @@ def write_cluster_config(to_provision: 'resources.Resources', yaml_path = _get_yaml_path_from_cluster_name(cluster_name) old_yaml_content = None if os.path.exists(yaml_path): - with open(yaml_path, 'r') as f: - old_yaml_content = f.read() + if force_overwrite: + os.unlink(yaml_path) + else: + with open(yaml_path, 'r') as f: + old_yaml_content = f.read() yaml_path = fill_template( cluster_config_template, diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 4043051196c..5f3f89a87c9 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -924,7 +924,8 @@ def _retry_region_zones(self, self._wheel_hash, region=region, zones=zones, - dryrun=dryrun) + dryrun=dryrun, + force_overwrite=prev_cluster_status is None) if dryrun: return cluster_config_file = config_dict['ray'] From 8e14254819a2f68d4ac0a5635bfeb18d063a6b24 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 13 Oct 2022 00:21:50 -0700 Subject: [PATCH 5/9] fix backward_compatibility_test --- tests/backward_comaptibility_tests.sh | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/backward_comaptibility_tests.sh b/tests/backward_comaptibility_tests.sh index d18b2bc05ed..a6c2ca751a6 100644 --- a/tests/backward_comaptibility_tests.sh +++ b/tests/backward_comaptibility_tests.sh @@ -12,7 +12,7 @@ conda create -n sky-back-compat-master -y --clone sky-dev conda create -n sky-back-compat-current -y --clone sky-dev conda activate sky-back-compat-master -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true pip uninstall skypilot -y cd sky-master @@ -21,39 +21,39 @@ pip install -e ".[all]" cd - conda activate sky-back-compat-current -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true pip uninstall -y skypilot pip install -e ".[all]" # exec + launch conda activate sky-back-compat-master -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true which sky sky launch --cloud gcp -c ${CLUSTER_NAME} examples/minimal.yaml conda activate sky-back-compat-current -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky exec --cloud gcp ${CLUSTER_NAME} examples/minimal.yaml sky launch --cloud gcp -c ${CLUSTER_NAME} examples/minimal.yaml sky queue # sky stop + sky start + sky exec conda activate sky-back-compat-master -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-2 examples/minimal.yaml conda activate sky-back-compat-current -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky stop -y ${CLUSTER_NAME}-2 sky start -y ${CLUSTER_NAME}-2 sky exec --cloud gcp ${CLUSTER_NAME}-2 examples/minimal.yaml # `sky autostop` + `sky status -r` conda activate sky-back-compat-master -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-3 examples/minimal.yaml conda activate sky-back-compat-current -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky autostop -y -i0 ${CLUSTER_NAME}-3 sleep 100 sky status -r | grep ${CLUSTER_NAME}-3 | grep STOPPED @@ -61,11 +61,11 @@ sky status -r | grep ${CLUSTER_NAME}-3 | grep STOPPED # (1 node) sky launch + sky exec + sky queue + sky logs conda activate sky-back-compat-master -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-4 examples/minimal.yaml sky stop -y ${CLUSTER_NAME}-4 conda activate sky-back-compat-current -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-4 examples/minimal.yaml sky queue ${CLUSTER_NAME}-4 sky logs ${CLUSTER_NAME}-4 1 --status @@ -75,11 +75,11 @@ sky logs ${CLUSTER_NAME}-4 2 # (1 node) sky start + sky exec + sky queue + sky logs conda activate sky-back-compat-master -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-5 examples/minimal.yaml sky stop -y ${CLUSTER_NAME}-5 conda activate sky-back-compat-current -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky start -y ${CLUSTER_NAME}-5 sky queue ${CLUSTER_NAME}-5 sky logs ${CLUSTER_NAME}-5 1 --status @@ -91,11 +91,11 @@ sky logs ${CLUSTER_NAME}-5 2 # (2 nodes) sky launch + sky exec + sky queue + sky logs conda activate sky-back-compat-master -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-6 examples/multi_hostname.yaml sky stop -y ${CLUSTER_NAME}-6 conda activate sky-back-compat-current -rm -r ~/.sky/sky_wheels || true +rm -r ~/.sky/wheels || true sky start -y ${CLUSTER_NAME}-6 sky queue ${CLUSTER_NAME}-6 sky logs ${CLUSTER_NAME}-6 1 --status From 1002c38668bf5711d12787552ba0d4fb3bb36d30 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 13 Oct 2022 00:29:12 -0700 Subject: [PATCH 6/9] Add backward test for discarding old cluster --- tests/backward_comaptibility_tests.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/backward_comaptibility_tests.sh b/tests/backward_comaptibility_tests.sh index a6c2ca751a6..333d0d9ee83 100644 --- a/tests/backward_comaptibility_tests.sh +++ b/tests/backward_comaptibility_tests.sh @@ -1,7 +1,7 @@ #!/bin/bash set -ex -CLUSTER_NAME="test-back-compat" +CLUSTER_NAME="test-back-compat-$USER" . $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh git clone https://github.com/skypilot-org/skypilot.git sky-master || true @@ -35,8 +35,10 @@ sky launch --cloud gcp -c ${CLUSTER_NAME} examples/minimal.yaml conda activate sky-back-compat-current rm -r ~/.sky/wheels || true sky exec --cloud gcp ${CLUSTER_NAME} examples/minimal.yaml -sky launch --cloud gcp -c ${CLUSTER_NAME} examples/minimal.yaml -sky queue +s=$(sky launch --cloud gcp -d -c ${CLUSTER_NAME} examples/minimal.yaml) +echo $s +echo $s | grep "Job ID: 3" +sky queue ${CLUSTER_NAME} # sky stop + sky start + sky exec conda activate sky-back-compat-master @@ -46,7 +48,9 @@ conda activate sky-back-compat-current rm -r ~/.sky/wheels || true sky stop -y ${CLUSTER_NAME}-2 sky start -y ${CLUSTER_NAME}-2 -sky exec --cloud gcp ${CLUSTER_NAME}-2 examples/minimal.yaml +s=$(sky exec --cloud gcp -d ${CLUSTER_NAME}-2 examples/minimal.yaml) +echo $s +echo $s | grep "Job ID: 2" # `sky autostop` + `sky status -r` conda activate sky-back-compat-master From 90f5362dd54c22c9aefd4ea592be17f3847bd04f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 13 Oct 2022 01:40:35 -0700 Subject: [PATCH 7/9] update backward --- tests/backward_comaptibility_tests.sh | 45 +++++++++++++++------------ 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/tests/backward_comaptibility_tests.sh b/tests/backward_comaptibility_tests.sh index 333d0d9ee83..d1e1a2058a3 100644 --- a/tests/backward_comaptibility_tests.sh +++ b/tests/backward_comaptibility_tests.sh @@ -1,38 +1,41 @@ #!/bin/bash set -ex +source ~/.bashrc CLUSTER_NAME="test-back-compat-$USER" -. $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh +source $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh git clone https://github.com/skypilot-org/skypilot.git sky-master || true # Create environment for compatibility tests -conda create -n sky-back-compat-master -y --clone sky-dev -conda create -n sky-back-compat-current -y --clone sky-dev +mamba > /dev/null 2>&1 || conda install -n base -c conda-forge mamba -y +mamba init +source ~/.bashrc +mamba env list | grep sky-back-compat-master || mamba create -n sky-back-compat-master -y python=3.9 -conda activate sky-back-compat-master +mamba activate sky-back-compat-master +mamba install -c conda-forge google-cloud-sdk -y rm -r ~/.sky/wheels || true -pip uninstall skypilot -y - cd sky-master git pull origin master pip install -e ".[all]" cd - -conda activate sky-back-compat-current +mamba env list | grep sky-back-compat-current || mamba create -n sky-back-compat-current -y --clone sky-back-compat-master +mamba activate sky-back-compat-current rm -r ~/.sky/wheels || true pip uninstall -y skypilot pip install -e ".[all]" # exec + launch -conda activate sky-back-compat-master +mamba activate sky-back-compat-master rm -r ~/.sky/wheels || true which sky -sky launch --cloud gcp -c ${CLUSTER_NAME} examples/minimal.yaml +sky launch --cloud gcp -y -c ${CLUSTER_NAME} examples/minimal.yaml -conda activate sky-back-compat-current +mamba activate sky-back-compat-current rm -r ~/.sky/wheels || true sky exec --cloud gcp ${CLUSTER_NAME} examples/minimal.yaml s=$(sky launch --cloud gcp -d -c ${CLUSTER_NAME} examples/minimal.yaml) @@ -41,10 +44,10 @@ echo $s | grep "Job ID: 3" sky queue ${CLUSTER_NAME} # sky stop + sky start + sky exec -conda activate sky-back-compat-master +mamba activate sky-back-compat-master rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-2 examples/minimal.yaml -conda activate sky-back-compat-current +mamba activate sky-back-compat-current rm -r ~/.sky/wheels || true sky stop -y ${CLUSTER_NAME}-2 sky start -y ${CLUSTER_NAME}-2 @@ -53,10 +56,10 @@ echo $s echo $s | grep "Job ID: 2" # `sky autostop` + `sky status -r` -conda activate sky-back-compat-master +mamba activate sky-back-compat-master rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-3 examples/minimal.yaml -conda activate sky-back-compat-current +mamba activate sky-back-compat-current rm -r ~/.sky/wheels || true sky autostop -y -i0 ${CLUSTER_NAME}-3 sleep 100 @@ -64,11 +67,11 @@ sky status -r | grep ${CLUSTER_NAME}-3 | grep STOPPED # (1 node) sky launch + sky exec + sky queue + sky logs -conda activate sky-back-compat-master +mamba activate sky-back-compat-master rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-4 examples/minimal.yaml sky stop -y ${CLUSTER_NAME}-4 -conda activate sky-back-compat-current +mamba activate sky-back-compat-current rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-4 examples/minimal.yaml sky queue ${CLUSTER_NAME}-4 @@ -78,11 +81,11 @@ sky logs ${CLUSTER_NAME}-4 1 sky logs ${CLUSTER_NAME}-4 2 # (1 node) sky start + sky exec + sky queue + sky logs -conda activate sky-back-compat-master +mamba activate sky-back-compat-master rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-5 examples/minimal.yaml sky stop -y ${CLUSTER_NAME}-5 -conda activate sky-back-compat-current +mamba activate sky-back-compat-current rm -r ~/.sky/wheels || true sky start -y ${CLUSTER_NAME}-5 sky queue ${CLUSTER_NAME}-5 @@ -94,11 +97,11 @@ sky logs ${CLUSTER_NAME}-5 2 --status sky logs ${CLUSTER_NAME}-5 2 # (2 nodes) sky launch + sky exec + sky queue + sky logs -conda activate sky-back-compat-master +mamba activate sky-back-compat-master rm -r ~/.sky/wheels || true sky launch --cloud gcp -y -c ${CLUSTER_NAME}-6 examples/multi_hostname.yaml sky stop -y ${CLUSTER_NAME}-6 -conda activate sky-back-compat-current +mamba activate sky-back-compat-current rm -r ~/.sky/wheels || true sky start -y ${CLUSTER_NAME}-6 sky queue ${CLUSTER_NAME}-6 @@ -108,3 +111,5 @@ sky exec --cloud gcp ${CLUSTER_NAME}-6 examples/multi_hostname.yaml sky queue ${CLUSTER_NAME}-6 sky logs ${CLUSTER_NAME}-6 2 --status sky logs ${CLUSTER_NAME}-6 2 + +sky down ${CLUSTER_NAME}* -y From e1b75c01ee0c42b836c15276d6d61cf2464ea9ff Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 13 Oct 2022 01:45:36 -0700 Subject: [PATCH 8/9] less output --- tests/backward_comaptibility_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/backward_comaptibility_tests.sh b/tests/backward_comaptibility_tests.sh index d1e1a2058a3..72baf0bc563 100644 --- a/tests/backward_comaptibility_tests.sh +++ b/tests/backward_comaptibility_tests.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -ex +set -ev source ~/.bashrc CLUSTER_NAME="test-back-compat-$USER" From a15841dd1538abcbe55a1da2dd1a453d0b8a0e2d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 13 Oct 2022 13:36:20 -0700 Subject: [PATCH 9/9] address comments --- sky/backends/backend_utils.py | 40 +++++++++++++--------------- sky/backends/cloud_vm_ray_backend.py | 3 ++- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 87216a68032..aee0e985ded 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -120,7 +120,7 @@ # for calculating the hash. # TODO(zhwu): Keep in sync with the fields used in https://github.com/ray-project/ray/blob/e4ce38d001dbbe09cd21c497fedd03d692b2be3e/python/ray/autoscaler/_private/commands.py#L687-L701 _RAY_YAML_KEYS_TO_RESTORE_FOR_BACK_COMPATIBILITY = { - 'provider', 'auth', 'node_config' + 'cluster_name', 'provider', 'auth', 'node_config' } @@ -675,8 +675,8 @@ def _remove_multinode_config( break -def _restore_original_block_for_yaml(new_yaml: str, old_yaml: str, - key_names: Set[str]) -> str: +def _replace_yaml_dicts(new_yaml: str, old_yaml: str, + key_names: Set[str]) -> str: """Replaces 'new' with 'old' for all keys in key_names. The replacement will be applied recursively and only for the blocks @@ -703,17 +703,18 @@ def _restore_block(new_block: Dict[str, Any], old_block: Dict[str, Any]): # TODO: too many things happening here - leaky abstraction. Refactor. @timeline.event -def write_cluster_config(to_provision: 'resources.Resources', - num_nodes: int, - cluster_config_template: str, - cluster_name: str, - local_wheel_path: pathlib.Path, - wheel_hash: str, - region: Optional[clouds.Region] = None, - zones: Optional[List[clouds.Zone]] = None, - auth_config: Optional[Dict[str, str]] = None, - dryrun: bool = False, - force_overwrite: bool = True) -> Dict[str, str]: +def write_cluster_config( + to_provision: 'resources.Resources', + num_nodes: int, + cluster_config_template: str, + cluster_name: str, + local_wheel_path: pathlib.Path, + wheel_hash: str, + region: Optional[clouds.Region] = None, + zones: Optional[List[clouds.Zone]] = None, + auth_config: Optional[Dict[str, str]] = None, + dryrun: bool = False, + keep_launch_fields_in_existing_config: bool = True) -> Dict[str, str]: """Fills in cluster configuration templates and writes them out. Returns: {provisioner: path to yaml, the provisioning spec}. @@ -749,12 +750,9 @@ def write_cluster_config(to_provision: 'resources.Resources', yaml_path = _get_yaml_path_from_cluster_name(cluster_name) old_yaml_content = None - if os.path.exists(yaml_path): - if force_overwrite: - os.unlink(yaml_path) - else: - with open(yaml_path, 'r') as f: - old_yaml_content = f.read() + if os.path.exists(yaml_path) and keep_launch_fields_in_existing_config: + with open(yaml_path, 'r') as f: + old_yaml_content = f.read() yaml_path = fill_template( cluster_config_template, @@ -810,7 +808,7 @@ def write_cluster_config(to_provision: 'resources.Resources', if old_yaml_content is not None: with open(yaml_path, 'r') as f: new_yaml_content = f.read() - restored_yaml_content = _restore_original_block_for_yaml( + restored_yaml_content = _replace_yaml_dicts( old_yaml_content, new_yaml_content, _RAY_YAML_KEYS_TO_RESTORE_FOR_BACK_COMPATIBILITY) with open(yaml_path, 'w') as f: diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 5f3f89a87c9..0ccaa21da84 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -925,7 +925,8 @@ def _retry_region_zones(self, region=region, zones=zones, dryrun=dryrun, - force_overwrite=prev_cluster_status is None) + keep_launch_fields_in_existing_config=prev_cluster_status + is not None) if dryrun: return cluster_config_file = config_dict['ray']