Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Azure default image #2468

Merged
merged 16 commits into from
Nov 16, 2023
Merged
31 changes: 31 additions & 0 deletions sky/authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
is an exception, due to the limitation of the cloud provider. See the
comments in setup_lambda_authentication)
"""
import base64
import copy
import functools
import os
Expand Down Expand Up @@ -261,6 +262,36 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
return configure_ssh_info(config)


# In Azure, cloud-init script must be encoded in base64. See
# https://learn.microsoft.com/en-us/azure/virtual-machines/custom-data
# for more information. Here we decode it and replace the ssh user
# and public key content, then encode it back.
def setup_azure_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
    """Fills the SSH user and public key placeholders in an Azure config.

    Each node type stores its cloud-init script base64-encoded under
    ``azure_arm_parameters.cloudInitSetupCommands``; the script is decoded,
    has its placeholders substituted, and is re-encoded in place. The
    same placeholders are then substituted on the YAML dump of the whole
    config, which is parsed back into a dict.

    Args:
        config: Cluster config. The ``node_config`` entries under
            ``available_node_types`` are modified in place.

    Returns:
        The config parsed from the substituted YAML string.
    """
    _, public_key_path = get_or_generate_keys()
    with open(public_key_path, 'r') as key_file:
        public_key = key_file.read().strip()

    def _fill_placeholders(text: str) -> str:
        # The SSH user is looked up at substitution time, not up front.
        with_user = text.replace('skypilot:ssh_user',
                                 config['auth']['ssh_user'])
        return with_user.replace('skypilot:ssh_public_key_content',
                                 public_key)

    for node_type_config in config['available_node_types'].values():
        arm_parameters = (
            node_type_config['node_config']['azure_arm_parameters'])
        encoded_script = arm_parameters['cloudInitSetupCommands']
        # The script is base64-encoded, so the placeholders are only
        # visible after decoding.
        script = _fill_placeholders(
            base64.b64decode(encoded_script).decode('utf-8'))
        arm_parameters['cloudInitSetupCommands'] = base64.b64encode(
            script.encode('utf-8')).decode('utf-8')

    # Substitute the placeholders that appear outside the encoded script.
    return yaml.safe_load(
        _fill_placeholders(common_utils.dump_yaml_str(config)))


def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]:

get_or_generate_keys()
Expand Down
4 changes: 3 additions & 1 deletion sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1156,10 +1156,12 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
"""
config = common_utils.read_yaml(cluster_config_file)
# Check the availability of the cloud type.
if isinstance(cloud, (clouds.AWS, clouds.Azure, clouds.OCI, clouds.SCP)):
if isinstance(cloud, (clouds.AWS, clouds.OCI, clouds.SCP)):
config = auth.configure_ssh_info(config)
elif isinstance(cloud, clouds.GCP):
config = auth.setup_gcp_authentication(config)
elif isinstance(cloud, clouds.Azure):
config = auth.setup_azure_authentication(config)
elif isinstance(cloud, clouds.Lambda):
config = auth.setup_lambda_authentication(config)
elif isinstance(cloud, clouds.Kubernetes):
Expand Down
48 changes: 27 additions & 21 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,35 +136,39 @@ def get_default_instance_type(
def _get_image_config(self, gen_version, instance_type):
# az vm image list \
# --publisher microsoft-dsvm --all --output table
# nvidia-driver: 495.29.05, cuda: 11.5

# The latest image 2022.09.14/2022.08.11/22.06.10/22.05.11/
# 22.04.27/22.04.05 has even older nvidia driver 470.57.02,
# cuda: 11.4
# nvidia-driver: 535.54.03, cuda: 12.2
# see: https://github.com/Azure/azhpc-images/releases/tag/ubuntu-hpc-20230803
# All A100 instances are gen2, so they will always use
# the latest ubuntu-hpc:2204 image.
image_config = {
'image_publisher': 'microsoft-dsvm',
'image_offer': 'ubuntu-2004',
'image_sku': '2004-gen2',
'image_version': '21.11.04'
'image_offer': 'ubuntu-hpc',
'image_sku': '2204',
'image_version': '22.04.2023080201'
}
cblmemo marked this conversation as resolved.
Show resolved Hide resolved

# ubuntu-2004 v21.10.21 and v21.11.04 do not work on K80
# due to an NVIDIA driver issue.
# ubuntu-2004 v21.08.30, K80 requires image with old NVIDIA driver version
acc = self.get_accelerators_from_instance_type(instance_type)
if acc is not None:
acc_name = list(acc.keys())[0]
if acc_name == 'K80':
image_config['image_version'] = '21.08.30'

# ubuntu-2004 does not work on A100
if instance_type in [
'Standard_ND96asr_v4', 'Standard_ND96amsr_A100_v4'
]:
image_config['image_offer'] = 'ubuntu-hpc'
image_config['image_sku'] = '2004'
image_config['image_version'] = '20.04.2021120101'
image_config = {
'image_publisher': 'microsoft-dsvm',
'image_offer': 'ubuntu-2004',
'image_sku': '2004-gen2',
'image_version': '21.08.30'
}

# ubuntu-2004 v21.11.04 is the image we used for V1 Hyper-V instances
# before we changed the default image to ubuntu-hpc.
if gen_version == 'V1':
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
image_config['image_sku'] = '2004'
image_config = {
'image_publisher': 'microsoft-dsvm',
'image_offer': 'ubuntu-2004',
'image_sku': '2004',
'image_version': '21.11.04'
}

return image_config

@classmethod
Expand Down Expand Up @@ -251,13 +255,15 @@ def make_deploy_resources_variables(
# This script will modify /etc/ssh/sshd_config and add a bash script
# into .bashrc. The bash script will restart sshd if it has not been
# restarted; a restart is marked by the existence of the file /tmp/__restarted.
# Also, add default user to docker group.
# pylint: disable=line-too-long
cloud_init_setup_commands = base64.b64encode(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above: do we really need to encode the cloud-init script in base64? Is it possible to use plain text with some careful YAML syntax, e.g. using `: |` or `: |-`, to avoid unexpected newlines and indentation?

textwrap.dedent("""\
#cloud-config
runcmd:
- sed -i 's/#Banner none/Banner none/' /etc/ssh/sshd_config
- echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/azureuser/.bashrc
- echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/skypilot:ssh_user/.bashrc
- usermod -aG docker skypilot:ssh_user
write_files:
- path: /etc/apt/apt.conf.d/20auto-upgrades
content: |
Expand Down
40 changes: 32 additions & 8 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,13 +727,15 @@ def test_gcp_stale_job_manual_restart():
@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet
def test_env_check(generic_cloud: str):
name = _get_cluster_name()
total_timeout_minutes = 25 if generic_cloud == 'azure' else 15
test = Test(
'env_check',
[
f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup examples/env_check.yaml',
f'sky logs {name} 1 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

Expand Down Expand Up @@ -1134,6 +1136,7 @@ def test_scp_job_queue():
@pytest.mark.no_kubernetes # Kubernetes not support num_nodes > 1 yet
def test_job_queue_multinode(generic_cloud: str):
name = _get_cluster_name()
total_timeout_minutes = 30 if generic_cloud == 'azure' else 15
test = Test(
'job_queue_multinode',
[
Expand Down Expand Up @@ -1164,6 +1167,7 @@ def test_job_queue_multinode(generic_cloud: str):
f'sky logs {name} 7 --status',
],
f'sky down -y {name}',
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

Expand Down Expand Up @@ -1428,6 +1432,7 @@ def test_tpu_vm_pod():
@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet
def test_multi_hostname(generic_cloud: str):
name = _get_cluster_name()
total_timeout_minutes = 25 if generic_cloud == 'azure' else 15
test = Test(
'multi_hostname',
[
Expand All @@ -1438,6 +1443,7 @@ def test_multi_hostname(generic_cloud: str):
f'sky logs {name} 2 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

Expand Down Expand Up @@ -1573,6 +1579,12 @@ def test_azure_start_stop():
@pytest.mark.no_kubernetes # Kubernetes does not autostop yet
def test_autostop(generic_cloud: str):
name = _get_cluster_name()
# Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure
# the VM is stopped.
autostop_timeout = 600 if generic_cloud == 'azure' else 250
# Launching and starting Azure clusters can take a long time too. For example,
# restarting a stopped Azure cluster can take 7m. So we set the total timeout to 70m.
total_timeout_minutes = 70 if generic_cloud == 'azure' else 20
test = Test(
'autostop',
[
Expand All @@ -1587,7 +1599,7 @@ def test_autostop(generic_cloud: str):
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP',

# Ensure the cluster is STOPPED.
'sleep 250',
f'sleep {autostop_timeout}',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED',

# Ensure the cluster is UP and the autostop setting is reset ('-').
Expand All @@ -1605,7 +1617,7 @@ def test_autostop(generic_cloud: str):
f'sky autostop -y {name} -i 1', # Should restart the timer.
'sleep 45',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP',
'sleep 250',
f'sleep {autostop_timeout}',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED',

# Test restarting the idleness timer via exec:
Expand All @@ -1616,11 +1628,11 @@ def test_autostop(generic_cloud: str):
f'sky exec {name} echo hi', # Should restart the timer.
'sleep 45',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP',
'sleep 250',
f'sleep {autostop_timeout}',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED',
],
f'sky down -y {name}',
timeout=20 * 60,
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

Expand All @@ -1629,6 +1641,10 @@ def test_autostop(generic_cloud: str):
@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead.
def test_autodown(generic_cloud: str):
name = _get_cluster_name()
# Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure
# the VM is terminated.
autodown_timeout = 900 if generic_cloud == 'azure' else 240
total_timeout_minutes = 90 if generic_cloud == 'azure' else 20
test = Test(
'autodown',
[
Expand All @@ -1640,23 +1656,23 @@ def test_autodown(generic_cloud: str):
'sleep 45',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP',
# Ensure the cluster is terminated.
'sleep 200',
f'sleep {autodown_timeout}',
f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}',
f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml',
f'sky status | grep {name} | grep UP', # Ensure the cluster is UP.
f'sky exec {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml',
f'sky status | grep {name} | grep "1m (down)"',
'sleep 240',
f'sleep {autodown_timeout}',
# Ensure the cluster is terminated.
f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}',
f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml',
f'sky autostop -y {name} --cancel',
'sleep 240',
f'sleep {autodown_timeout}',
# Ensure the cluster is still UP.
f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && echo "$s" | grep {name} | grep UP',
],
f'sky down -y {name}',
timeout=20 * 60,
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

Expand Down Expand Up @@ -1797,6 +1813,7 @@ def test_cancel_ibm():


# ---------- Testing use-spot option ----------
@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand All @@ -1818,6 +1835,7 @@ def test_use_spot(generic_cloud: str):


# ---------- Testing managed spot ----------
@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down Expand Up @@ -1851,6 +1869,7 @@ def test_spot(generic_cloud: str):
run_one_test(test)


@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down Expand Up @@ -1890,6 +1909,7 @@ def test_spot_pipeline(generic_cloud: str):
run_one_test(test)


@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand All @@ -1913,6 +1933,7 @@ def test_spot_failed_setup(generic_cloud: str):
run_one_test(test)


@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down Expand Up @@ -2105,6 +2126,7 @@ def test_spot_pipeline_recovery_gcp():
run_one_test(test)


@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down Expand Up @@ -2326,6 +2348,7 @@ def test_spot_cancellation_gcp():


# ---------- Testing storage for managed spot ----------
@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down Expand Up @@ -2381,6 +2404,7 @@ def test_spot_tpu():


# ---------- Testing env for spot ----------
@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down