diff --git a/sky/authentication.py b/sky/authentication.py index 442c1abb372..d2d65835657 100644 --- a/sky/authentication.py +++ b/sky/authentication.py @@ -19,6 +19,7 @@ is an exception, due to the limitation of the cloud provider. See the comments in setup_lambda_authentication) """ +import base64 import copy import functools import os @@ -261,6 +262,36 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]: return configure_ssh_info(config) +# In Azure, cloud-init script must be encoded in base64. See +# https://learn.microsoft.com/en-us/azure/virtual-machines/custom-data +# for more information. Here we decode it and replace the ssh user +# and public key content, then encode it back. +def setup_azure_authentication(config: Dict[str, Any]) -> Dict[str, Any]: + _, public_key_path = get_or_generate_keys() + with open(public_key_path, 'r') as f: + public_key = f.read().strip() + for node_type in config['available_node_types']: + node_config = config['available_node_types'][node_type]['node_config'] + cloud_init = ( + node_config['azure_arm_parameters']['cloudInitSetupCommands']) + cloud_init = base64.b64decode(cloud_init).decode('utf-8') + cloud_init = cloud_init.replace('skypilot:ssh_user', + config['auth']['ssh_user']) + cloud_init = cloud_init.replace('skypilot:ssh_public_key_content', + public_key) + cloud_init = base64.b64encode( + cloud_init.encode('utf-8')).decode('utf-8') + node_config['azure_arm_parameters']['cloudInitSetupCommands'] = ( + cloud_init) + config_str = common_utils.dump_yaml_str(config) + config_str = config_str.replace('skypilot:ssh_user', + config['auth']['ssh_user']) + config_str = config_str.replace('skypilot:ssh_public_key_content', + public_key) + config = yaml.safe_load(config_str) + return config + + def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]: get_or_generate_keys() diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index d1e3381a800..4f8d5f37c27 100644 --- 
a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1156,10 +1156,12 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str): """ config = common_utils.read_yaml(cluster_config_file) # Check the availability of the cloud type. - if isinstance(cloud, (clouds.AWS, clouds.Azure, clouds.OCI, clouds.SCP)): + if isinstance(cloud, (clouds.AWS, clouds.OCI, clouds.SCP)): config = auth.configure_ssh_info(config) elif isinstance(cloud, clouds.GCP): config = auth.setup_gcp_authentication(config) + elif isinstance(cloud, clouds.Azure): + config = auth.setup_azure_authentication(config) elif isinstance(cloud, clouds.Lambda): config = auth.setup_lambda_authentication(config) elif isinstance(cloud, clouds.Kubernetes): diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 41f6862d9df..21731bdb0b3 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -134,37 +134,47 @@ def get_default_instance_type( clouds='azure') def _get_image_config(self, gen_version, instance_type): + # TODO(tian): images for Azure are not well organized. We should refactor + # it to images.csv like AWS. # az vm image list \ # --publisher microsoft-dsvm --all --output table - # nvidia-driver: 495.29.05, cuda: 11.5 - - # The latest image 2022.09.14/2022.08.11/22.06.10/22.05.11/ - # 22.04.27/22.04.05 has even older nvidia driver 470.57.02, - # cuda: 11.4 + # nvidia-driver: 535.54.03, cuda: 12.2 + # see: https://github.com/Azure/azhpc-images/releases/tag/ubuntu-hpc-20230803 + # All A100 instances are of gen2, so it will always use + the latest ubuntu-hpc:2204 image. image_config = { 'image_publisher': 'microsoft-dsvm', - 'image_offer': 'ubuntu-2004', - 'image_sku': '2004-gen2', - 'image_version': '21.11.04' + 'image_offer': 'ubuntu-hpc', + 'image_sku': '2204', + 'image_version': '22.04.2023080201' } - # ubuntu-2004 v21.10.21 and v21.11.04 do not work on K80 - due to an NVIDIA driver issue. 
+ # ubuntu-2004 v21.08.30, K80 requires image with old NVIDIA driver version acc = self.get_accelerators_from_instance_type(instance_type) if acc is not None: acc_name = list(acc.keys())[0] if acc_name == 'K80': - image_config['image_version'] = '21.08.30' - - # ubuntu-2004 does not work on A100 - if instance_type in [ - 'Standard_ND96asr_v4', 'Standard_ND96amsr_A100_v4' - ]: - image_config['image_offer'] = 'ubuntu-hpc' - image_config['image_sku'] = '2004' - image_config['image_version'] = '20.04.2021120101' + image_config = { + 'image_publisher': 'microsoft-dsvm', + 'image_offer': 'ubuntu-2004', + 'image_sku': '2004-gen2', + 'image_version': '21.08.30' + } + + # ubuntu-2004 v21.11.04, the previous image we used in the past for + V1 HyperV instances before we changed the default image to ubuntu-hpc. + In Azure, all instances with K80 (Standard_NC series), some + instances with M60 (Standard_NV series) and some cpu instances + (Basic_A, Standard_D, ...) are V1 instances. For these instances, + we use the previous image. if gen_version == 'V1': - image_config['image_sku'] = '2004' + image_config = { + 'image_publisher': 'microsoft-dsvm', + 'image_offer': 'ubuntu-2004', + 'image_sku': '2004', + 'image_version': '21.11.04' + } + return image_config @classmethod @@ -251,13 +261,15 @@ def make_deploy_resources_variables( # This script will modify /etc/ssh/sshd_config and add a bash script # into .bashrc. The bash script will restart sshd if it has not been # restarted, identified by a file /tmp/__restarted is existing. + # Also, add default user to docker group. # pylint: disable=line-too-long cloud_init_setup_commands = base64.b64encode( textwrap.dedent("""\
 #cloud-config runcmd: - sed -i 's/#Banner none/Banner none/' /etc/ssh/sshd_config - - echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/azureuser/.bashrc + - echo '\\nif [ ! 
-f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/skypilot:ssh_user/.bashrc + - usermod -aG docker skypilot:ssh_user write_files: - path: /etc/apt/apt.conf.d/20auto-upgrades content: | diff --git a/tests/test_smoke.py b/tests/test_smoke.py index aecd5371de0..cab52ab2f75 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -727,6 +727,7 @@ def test_gcp_stale_job_manual_restart(): @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_env_check(generic_cloud: str): name = _get_cluster_name() + total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 test = Test( 'env_check', [ @@ -734,6 +735,7 @@ def test_env_check(generic_cloud: str): f'sky logs {name} 1 --status', # Ensure the job succeeded. ], f'sky down -y {name}', + timeout=total_timeout_minutes * 60, ) run_one_test(test) @@ -1134,6 +1136,7 @@ def test_scp_job_queue(): @pytest.mark.no_kubernetes # Kubernetes not support num_nodes > 1 yet def test_job_queue_multinode(generic_cloud: str): name = _get_cluster_name() + total_timeout_minutes = 30 if generic_cloud == 'azure' else 15 test = Test( 'job_queue_multinode', [ @@ -1164,6 +1167,7 @@ def test_job_queue_multinode(generic_cloud: str): f'sky logs {name} 7 --status', ], f'sky down -y {name}', + timeout=total_timeout_minutes * 60, ) run_one_test(test) @@ -1428,6 +1432,7 @@ def test_tpu_vm_pod(): @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet def test_multi_hostname(generic_cloud: str): name = _get_cluster_name() + total_timeout_minutes = 25 if generic_cloud == 'azure' else 15 test = Test( 'multi_hostname', [ @@ -1438,6 +1443,7 @@ def test_multi_hostname(generic_cloud: str): f'sky logs {name} 2 --status', # Ensure the job succeeded. 
], f'sky down -y {name}', + timeout=total_timeout_minutes * 60, ) run_one_test(test) @@ -1573,6 +1579,12 @@ def test_azure_start_stop(): @pytest.mark.no_kubernetes # Kubernetes does not autostop yet def test_autostop(generic_cloud: str): name = _get_cluster_name() + # Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure + # the VM is stopped. + autostop_timeout = 600 if generic_cloud == 'azure' else 250 + # Launching and starting Azure clusters can take a long time too. e.g., restarting + # a stopped Azure cluster can take 7m. So we set the total timeout to 70m. + total_timeout_minutes = 70 if generic_cloud == 'azure' else 20 test = Test( 'autostop', [ @@ -1587,7 +1599,7 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - 'sleep 250', + f'sleep {autostop_timeout}', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', # Ensure the cluster is UP and the autostop setting is reset ('-'). @@ -1605,7 +1617,7 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 'sleep 45', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - 'sleep 250', + f'sleep {autostop_timeout}', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', # Test restarting the idleness timer via exec: @@ -1616,11 +1628,11 @@ def test_autostop(generic_cloud: str): f'sky exec {name} echo hi', # Should restart the timer. 
'sleep 45', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - 'sleep 250', + f'sleep {autostop_timeout}', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', ], f'sky down -y {name}', - timeout=20 * 60, + timeout=total_timeout_minutes * 60, ) run_one_test(test) @@ -1629,6 +1641,10 @@ def test_autostop(generic_cloud: str): @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead. def test_autodown(generic_cloud: str): name = _get_cluster_name() + # Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure + # the VM is terminated. + autodown_timeout = 900 if generic_cloud == 'azure' else 240 + total_timeout_minutes = 90 if generic_cloud == 'azure' else 20 test = Test( 'autodown', [ @@ -1640,23 +1656,23 @@ def test_autodown(generic_cloud: str): 'sleep 45', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is terminated. - 'sleep 200', + f'sleep {autodown_timeout}', f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', f'sky status | grep {name} | grep UP', # Ensure the cluster is UP. f'sky exec {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml', f'sky status | grep {name} | grep "1m (down)"', - 'sleep 240', + f'sleep {autodown_timeout}', # Ensure the cluster is terminated. 
f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}', f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml', f'sky autostop -y {name} --cancel', - 'sleep 240', + f'sleep {autodown_timeout}', # Ensure the cluster is still UP. f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && echo "$s" | grep {name} | grep UP', ], f'sky down -y {name}', - timeout=20 * 60, + timeout=total_timeout_minutes * 60, ) run_one_test(test) @@ -1797,6 +1813,7 @@ def test_cancel_ibm(): # ---------- Testing use-spot option ---------- +@pytest.mark.no_azure # Azure does not support spot instances @pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances @pytest.mark.no_ibm # IBM Cloud does not support spot instances @pytest.mark.no_scp # SCP does not support spot instances @@ -1818,6 +1835,7 @@ def test_use_spot(generic_cloud: str): # ---------- Testing managed spot ---------- +@pytest.mark.no_azure # Azure does not support spot instances @pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances @pytest.mark.no_ibm # IBM Cloud does not support spot instances @pytest.mark.no_scp # SCP does not support spot instances @@ -1851,6 +1869,7 @@ def test_spot(generic_cloud: str): run_one_test(test) +@pytest.mark.no_azure # Azure does not support spot instances @pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances @pytest.mark.no_ibm # IBM Cloud does not support spot instances @pytest.mark.no_scp # SCP does not support spot instances @@ -1890,6 +1909,7 @@ def test_spot_pipeline(generic_cloud: str): run_one_test(test) +@pytest.mark.no_azure # Azure does not support spot instances @pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances @pytest.mark.no_ibm # IBM Cloud does not support spot instances 
@pytest.mark.no_scp # SCP does not support spot instances @@ -1913,6 +1933,7 @@ def test_spot_failed_setup(generic_cloud: str): run_one_test(test) +@pytest.mark.no_azure # Azure does not support spot instances @pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances @pytest.mark.no_ibm # IBM Cloud does not support spot instances @pytest.mark.no_scp # SCP does not support spot instances @@ -2105,6 +2126,7 @@ def test_spot_pipeline_recovery_gcp(): run_one_test(test) +@pytest.mark.no_azure # Azure does not support spot instances @pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances @pytest.mark.no_ibm # IBM Cloud does not support spot instances @pytest.mark.no_scp # SCP does not support spot instances @@ -2326,6 +2348,7 @@ def test_spot_cancellation_gcp(): # ---------- Testing storage for managed spot ---------- +@pytest.mark.no_azure # Azure does not support spot instances @pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances @pytest.mark.no_ibm # IBM Cloud does not support spot instances @pytest.mark.no_scp # SCP does not support spot instances @@ -2381,6 +2404,7 @@ def test_spot_tpu(): # ---------- Testing env for spot ---------- +@pytest.mark.no_azure # Azure does not support spot instances @pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances @pytest.mark.no_ibm # IBM Cloud does not support spot instances @pytest.mark.no_scp # SCP does not support spot instances