[Provisioner] Support multi level performance disk #1812

Merged · 41 commits · Apr 16, 2023

Commits
fe86508
GCP & AWS finished
cblmemo Mar 24, 2023
cfbe585
Azure finished
cblmemo Mar 25, 2023
d945c76
reformat code
cblmemo Mar 25, 2023
8e2dbc7
fix some of the problems mentioned in PR discussion
cblmemo Mar 27, 2023
dbf491c
fix wrong cloud disk type check and modify default disk type behaviour
cblmemo Mar 27, 2023
868e681
fix aws bug & add type notation for disk
cblmemo Mar 28, 2023
81e5f29
add aws throughput & reformat azure
cblmemo Apr 1, 2023
6980b2e
benchmark & performance alignment finished
cblmemo Apr 5, 2023
2df1bbb
reformat code
cblmemo Apr 5, 2023
f25d3da
fix test_cli
cblmemo Apr 8, 2023
318abff
better code style
cblmemo Apr 8, 2023
a95b631
add cli option
cblmemo Apr 8, 2023
b81ec67
reimplement check validate in disk_type: enable cloud=None
cblmemo Apr 8, 2023
2fe1c47
add interactive node cli
cblmemo Apr 8, 2023
ae89465
fix abort in `sky launch --disk-type high`
cblmemo Apr 9, 2023
f6316a7
rename to disk_tier
cblmemo Apr 9, 2023
b22e3c5
fix azure config
cblmemo Apr 9, 2023
7925edb
renaming in error message of {local, lambda}
cblmemo Apr 9, 2023
b6c734b
fix typo
cblmemo Apr 9, 2023
6b41a10
Update docs/source/reference/yaml-spec.rst
cblmemo Apr 10, 2023
12ddabf
Update sky/cli.py
cblmemo Apr 10, 2023
ee25c2c
Update sky/cli.py
cblmemo Apr 10, 2023
e108a19
Update sky/cli.py
cblmemo Apr 10, 2023
b94419b
Update sky/cli.py
cblmemo Apr 10, 2023
9f964ec
default to medium tier
cblmemo Apr 10, 2023
61d9ba5
remove unnecessary API
cblmemo Apr 10, 2023
6c8d757
reformat code
cblmemo Apr 10, 2023
bc45106
update resources schema
cblmemo Apr 10, 2023
f4b9f47
fix None->default_tier handle code style
cblmemo Apr 11, 2023
86d563e
add auto selection for instance type corresponding to disk_tier
cblmemo Apr 11, 2023
4edfd5c
use default Ds series to enable disk_tier=medium in Azure
cblmemo Apr 11, 2023
471c2c1
use s-series in default E too
cblmemo Apr 11, 2023
34e23e5
fix unittest on default instance type
cblmemo Apr 12, 2023
c590fa0
add aws unittest
cblmemo Apr 13, 2023
bcdb529
add gcp unittest
cblmemo Apr 13, 2023
ddd8c5d
fix None is s series bug
cblmemo Apr 13, 2023
654d9b6
add azure unittest
cblmemo Apr 13, 2023
20a8c91
better code style
cblmemo Apr 15, 2023
1b3165f
quick workaround for basic_a is s series
cblmemo Apr 15, 2023
8ab4d34
reimplement Azure._is_s_series
cblmemo Apr 15, 2023
f14d07f
fix typos
cblmemo Apr 16, 2023
7 changes: 7 additions & 0 deletions docs/source/reference/yaml-spec.rst
@@ -73,6 +73,13 @@ Available fields:
# have a large working directory or tasks that write out large outputs.
disk_size: 256

# Disk tier to use for OS (optional).
# Could be one of 'low', 'medium', or 'high' (default: 'medium'). Rough performance estimate:
# low: 500 IOPS; read 20 MB/s; write 40 MB/s
# medium: 3000 IOPS; read 220 MB/s; write 200 MB/s
# high: 6000 IOPS; read 340 MB/s; write 250 MB/s
disk_tier: 'medium'

# Additional accelerator metadata (optional); only used for TPU node
# and TPU VM.
# Example usage:
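The new `disk_tier` field above can also be set from SkyPilot's Python API. Below is a minimal usage sketch, assuming `sky.Resources` accepts a `disk_tier` argument with the same three values as the YAML spec (the exact Python API surface may differ from what this PR merges):

```python
# Usage sketch only: `disk_tier` on sky.Resources is assumed to mirror the
# YAML field added in this PR; task/launch calls follow SkyPilot's public API.
import sky

# Request a high-performance OS disk alongside the usual resource knobs.
resources = sky.Resources(cloud=sky.AWS(), disk_size=256, disk_tier='high')

# A trivial disk-heavy workload to observe the difference between tiers.
task = sky.Task(run='dd if=/dev/zero of=/tmp/out bs=1M count=1024 oflag=direct')
task.set_resources(resources)

sky.launch(task, cluster_name='disk-tier-demo')
```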
74 changes: 62 additions & 12 deletions sky/cli.py
@@ -221,6 +221,12 @@ def _interactive_node_cli_command(cli_func):
type=int,
required=False,
help=('OS disk size in GBs.'))
disk_tier = click.option('--disk-tier',
default=None,
type=str,
required=False,
help=('OS disk tier. Could be one of "low", '
'"medium", "high". Default: medium'))
no_confirm = click.option('--yes',
'-y',
is_flag=True,
@@ -299,6 +305,7 @@ def _interactive_node_cli_command(cli_func):
screen_option,
tmux_option,
disk_size,
disk_tier,
]
decorator = functools.reduce(lambda res, f: f(res),
reversed(click_decorators), cli_func)
@@ -588,7 +595,8 @@ def _parse_override_params(cloud: Optional[str] = None,
instance_type: Optional[str] = None,
use_spot: Optional[bool] = None,
image_id: Optional[str] = None,
disk_size: Optional[int] = None) -> Dict[str, Any]:
disk_size: Optional[int] = None,
disk_tier: Optional[str] = None) -> Dict[str, Any]:
"""Parses the override parameters into a dictionary."""
override_params: Dict[str, Any] = {}
if cloud is not None:
@@ -635,6 +643,8 @@ def _parse_override_params(cloud: Optional[str] = None,
override_params['image_id'] = image_id
if disk_size is not None:
override_params['disk_size'] = disk_size
if disk_tier is not None:
override_params['disk_tier'] = disk_tier
return override_params


@@ -974,6 +984,7 @@ def _make_task_from_entrypoint_with_overrides(
use_spot: Optional[bool] = None,
image_id: Optional[str] = None,
disk_size: Optional[int] = None,
disk_tier: Optional[str] = None,
env: Optional[List[Tuple[str, str]]] = None,
# spot launch specific
spot_recovery: Optional[str] = None,
@@ -1017,7 +1028,8 @@ def _make_task_from_entrypoint_with_overrides(
instance_type=instance_type,
use_spot=use_spot,
image_id=image_id,
disk_size=disk_size)
disk_size=disk_size,
disk_tier=disk_tier)
# Spot launch specific.
if spot_recovery is not None:
if spot_recovery.lower() == 'none':
@@ -1174,6 +1186,14 @@ def cli():
type=int,
required=False,
help=('OS disk size in GBs.'))
@click.option(
'--disk-tier',
default=None,
type=str,
required=False,
help=(
'OS disk tier. Could be one of "low", "medium", "high". Default: medium'
))
@click.option(
'--idle-minutes-to-autostop',
'-i',
@@ -1241,6 +1261,7 @@ def launch(
image_id: Optional[str],
env: List[Tuple[str, str]],
disk_size: Optional[int],
disk_tier: Optional[str],
idle_minutes_to_autostop: Optional[int],
down: bool, # pylint: disable=redefined-outer-name
retry_until_up: bool,
@@ -1287,6 +1308,7 @@ def launch(
image_id=image_id,
env=env,
disk_size=disk_size,
disk_tier=disk_tier,
)

backend: backends.Backend
@@ -2672,8 +2694,8 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
memory: Optional[str], gpus: Optional[str],
use_spot: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
disk_tier: Optional[str], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
"""Launch or attach to an interactive GPU node.

Examples:
@@ -2730,7 +2752,8 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
memory=memory,
accelerators=gpus,
use_spot=use_spot,
disk_size=disk_size)
disk_size=disk_size,
disk_tier=disk_tier)

_create_and_ssh_into_node(
'gpunode',
@@ -2754,8 +2777,9 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
instance_type: Optional[str], cpus: Optional[str],
memory: Optional[str], use_spot: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
disk_size: Optional[int], disk_tier: Optional[str],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
"""Launch or attach to an interactive CPU node.

Examples:
@@ -2808,7 +2832,8 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
cpus=cpus,
memory=memory,
use_spot=use_spot,
disk_size=disk_size)
disk_size=disk_size,
disk_tier=disk_tier)

_create_and_ssh_into_node(
'cpunode',
@@ -2833,8 +2858,9 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
memory: Optional[str], tpus: Optional[str],
use_spot: Optional[bool], tpu_vm: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
disk_size: Optional[int], disk_tier: Optional[str],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
"""Launch or attach to an interactive TPU node.

Examples:
@@ -2894,7 +2920,8 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
accelerators=tpus,
accelerator_args=accelerator_args,
use_spot=use_spot,
disk_size=disk_size)
disk_size=disk_size,
disk_tier=disk_tier)

_create_and_ssh_into_node(
'tpunode',
@@ -3277,6 +3304,14 @@ def spot():
type=int,
required=False,
help=('OS disk size in GBs.'))
@click.option(
'--disk-tier',
default=None,
type=str,
required=False,
help=(
'OS disk tier. Could be one of "low", "medium", "high". Default: medium'
))
@click.option(
'--detach-run',
'-d',
@@ -3320,6 +3355,7 @@ def spot_launch(
spot_recovery: Optional[str],
env: List[Tuple[str, str]],
disk_size: Optional[int],
disk_tier: Optional[str],
detach_run: bool,
retry_until_up: bool,
yes: bool,
@@ -3357,6 +3393,7 @@ def spot_launch(
image_id=image_id,
env=env,
disk_size=disk_size,
disk_tier=disk_tier,
spot_recovery=spot_recovery,
)

@@ -3653,6 +3690,14 @@ def bench():
type=int,
required=False,
help=('OS disk size in GBs.'))
@click.option(
'--disk-tier',
default=None,
type=str,
required=False,
help=(
'OS disk tier. Could be one of "low", "medium", "high". Default: medium'
))
@click.option(
'--idle-minutes-to-autostop',
'-i',
@@ -3684,6 +3729,7 @@ def benchmark_launch(
image_id: Optional[str],
env: List[Tuple[str, str]],
disk_size: Optional[int],
disk_tier: Optional[str],
idle_minutes_to_autostop: Optional[int],
yes: bool,
) -> None:
@@ -3740,6 +3786,9 @@ def benchmark_launch(
if disk_size is not None:
if any('disk_size' in candidate for candidate in candidates):
raise click.BadParameter(f'disk_size {message}')
if disk_tier is not None:
if any('disk_tier' in candidate for candidate in candidates):
raise click.BadParameter(f'disk_tier {message}')

# The user can specify the benchmark candidates in either of the two ways:
# 1. By specifying resources.candidates in the YAML.
@@ -3782,7 +3831,8 @@ def benchmark_launch(
gpus=override_gpu,
use_spot=use_spot,
image_id=image_id,
disk_size=disk_size)
disk_size=disk_size,
disk_tier=disk_tier)
resources_config.update(override_params)
if 'cloud' in resources_config:
cloud = resources_config.pop('cloud')
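To make the flag-to-override flow above concrete, here is a small self-contained sketch (not SkyPilot's actual CLI) of the same pattern: an optional `--disk-tier` option is only recorded as an override when the user passes it, mirroring `_parse_override_params`. It also uses `click.Choice` for the three tiers, a stricter alternative to the free-form `type=str` used in this PR:

```python
# Standalone illustration of the optional-flag -> override-dict pattern.
from typing import Any, Dict, Optional

import click


def parse_overrides(disk_size: Optional[int] = None,
                    disk_tier: Optional[str] = None) -> Dict[str, Any]:
    """Collect only the options the user explicitly passed."""
    overrides: Dict[str, Any] = {}
    if disk_size is not None:
        overrides['disk_size'] = disk_size
    if disk_tier is not None:
        overrides['disk_tier'] = disk_tier
    return overrides


@click.command()
@click.option('--disk-size', default=None, type=int,
              help='OS disk size in GBs.')
@click.option('--disk-tier', default=None,
              type=click.Choice(['low', 'medium', 'high']),
              help='OS disk tier. Default: medium.')
def launch(disk_size: Optional[int], disk_tier: Optional[str]) -> None:
    """Print the resource overrides that would be applied."""
    click.echo(parse_overrides(disk_size=disk_size, disk_tier=disk_tier))


if __name__ == '__main__':
    launch()
```

Running `python demo.py --disk-tier high` prints `{'disk_tier': 'high'}`; omitting the flag leaves the dict empty, so the task YAML's value (or the `medium` default) wins.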
37 changes: 33 additions & 4 deletions sky/clouds/aws.py
@@ -5,7 +5,7 @@
import os
import subprocess
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Any

from sky import clouds
from sky import exceptions
@@ -283,9 +283,11 @@ def is_same_cloud(self, other: clouds.Cloud):
def get_default_instance_type(
cls,
cpus: Optional[str] = None,
memory: Optional[str] = None) -> Optional[str]:
memory: Optional[str] = None,
disk_tier: Optional[str] = None) -> Optional[str]:
return service_catalog.get_default_instance_type(cpus=cpus,
memory=memory,
disk_tier=disk_tier,
clouds='aws')

# TODO: factor the following three methods, as they are the same logic
@@ -308,7 +310,7 @@ def get_vcpus_mem_from_instance_type(

def make_deploy_resources_variables(
self, resources: 'resources_lib.Resources', region: 'clouds.Region',
zones: Optional[List['clouds.Zone']]) -> Dict[str, Optional[str]]:
zones: Optional[List['clouds.Zone']]) -> Dict[str, Any]:
assert zones is not None, (region, zones)

region_name = region.name
@@ -331,6 +333,7 @@ def make_deploy_resources_variables(
'region': region_name,
'zones': ','.join(zone_names),
'image_id': image_id,
**AWS._get_disk_specs(r.disk_tier)
}

def get_feasible_launchable_resources(self,
@@ -361,7 +364,9 @@ def _make(instance_list):
if accelerators is None:
# Return a default instance type with the given number of vCPUs.
default_instance_type = AWS.get_default_instance_type(
cpus=resources.cpus, memory=resources.memory)
cpus=resources.cpus,
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
else:
@@ -614,3 +619,27 @@ def accelerator_in_region_or_zone(self,
zone: Optional[str] = None) -> bool:
return service_catalog.accelerator_in_region_or_zone(
accelerator, acc_count, region, zone, 'aws')

@classmethod
def check_disk_tier_enabled(cls, instance_type: str,
disk_tier: str) -> None:
del instance_type, disk_tier # unused

@classmethod
def _get_disk_type(cls, disk_tier: str) -> str:
return 'standard' if disk_tier == 'low' else 'gp3'

@classmethod
def _get_disk_specs(cls, disk_tier: Optional[str]) -> Dict[str, Any]:
tier = disk_tier or cls._DEFAULT_DISK_TIER
tier2iops = {
'high': 7000,
'medium': 3500,
'low': 0, # only gp3 is required to set iops
}
return {
'disk_tier': cls._get_disk_type(tier),
'disk_iops': tier2iops[tier],
'disk_throughput': tier2iops[tier] // 16,
'custom_disk_perf': tier != 'low',
}
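For context on where these tier-derived values go: `make_deploy_resources_variables` above returns them as template variables for provisioning. The sketch below is not this PR's code path, but it shows how the same gp3 settings would map onto EC2's `BlockDeviceMappings` when calling boto3 directly; the `iops // 16` throughput (for example, 3500 // 16 = 218 MB/s for the medium tier) stays well inside gp3's documented throughput-to-IOPS ratio. The AMI ID, instance type, and device name are placeholders.

```python
# Illustrative only: apply disk specs like those from AWS._get_disk_specs
# when launching an EC2 instance directly with boto3.
from typing import Any, Dict

import boto3


def launch_with_disk_specs(disk_specs: Dict[str, Any],
                           disk_size_gb: int = 256) -> str:
    ec2 = boto3.client('ec2', region_name='us-east-1')
    ebs: Dict[str, Any] = {
        'VolumeSize': disk_size_gb,
        'VolumeType': disk_specs['disk_tier'],  # 'gp3' or 'standard'
    }
    if disk_specs['custom_disk_perf']:
        # Explicit Iops/Throughput only apply to gp3 volumes in this sketch.
        ebs['Iops'] = disk_specs['disk_iops']
        ebs['Throughput'] = disk_specs['disk_throughput']
    response = ec2.run_instances(
        ImageId='ami-0123456789abcdef0',  # placeholder AMI
        InstanceType='m6i.2xlarge',       # placeholder instance type
        MinCount=1,
        MaxCount=1,
        BlockDeviceMappings=[{'DeviceName': '/dev/sda1', 'Ebs': ebs}],
    )
    return response['Instances'][0]['InstanceId']


# 'medium' tier per this PR: gp3 with 3500 IOPS and 3500 // 16 == 218 MB/s.
medium_specs = {'disk_tier': 'gp3', 'disk_iops': 3500,
                'disk_throughput': 218, 'custom_disk_perf': True}
# launch_with_disk_specs(medium_specs)  # needs AWS credentials to run
```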