From a19db13e446b19db40092a7c2c0f9fea7dbdf10c Mon Sep 17 00:00:00 2001 From: Chris Patterson Date: Tue, 28 Jan 2025 11:53:07 -0500 Subject: [PATCH] feat(self-test): add selftest and integration tests Add selftest.py, a standalone script meant to validate the functionality of azure-vm-utils when installed in an Azure VM. Add test_images.py, a pytest-based set of tests which tests marketplace and community images which feature azure-vm-utils baked in (e.g. debian 13, fedora 41) across a variety of vm sizes and types. It also features a test_custom() to allow for easy testing of any image & vm size combination via environment TEST_CUSTOM_IMAGES=image1,image2,... and TEST_CUSTOM_VM_SIZES=size1,size2... Instructions for usage added to README.md. Add python-autoformat and python-lint targets to cmake and update CI to use linters. Signed-off-by: Chris Patterson --- .github/workflows/main.yml | 6 + CMakeLists.txt | 1 + README.md | 53 +- cmake/python.cmake | 19 + selftest/__init__.py | 0 selftest/selftest.py | 869 +++++++++++++++++++++++++++++++++ selftest/test-requirements.txt | 12 + selftest/test_images.py | 331 +++++++++++++ 8 files changed, 1289 insertions(+), 2 deletions(-) create mode 100644 cmake/python.cmake create mode 100644 selftest/__init__.py create mode 100755 selftest/selftest.py create mode 100644 selftest/test-requirements.txt create mode 100755 selftest/test_images.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c930e14..f43edcc 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -72,3 +72,9 @@ jobs: - name: Check cppcheck run: | make -C build cppcheck + - name: Check python scripts + run: | + python -m venv venv + source venv/bin/activate + pip install -r selftest/test-requirements.txt + make -C build python-lint diff --git a/CMakeLists.txt b/CMakeLists.txt index b8e2117..e3d427a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ endif() include(cmake/cppcheck.cmake) 
include(cmake/clangformat.cmake) include(cmake/doc.cmake) +include(cmake/python.cmake) include(CTest) enable_testing() diff --git a/README.md b/README.md index 57917f4..0433b5c 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,55 @@ DEVNAME=/dev/nvme0n1 azure-nvme-id --udev Provides helpful symlinks in /dev/disk/azure for local, data, and os disks. +# Testing + +## selftest + +selftest is provided to validate the runtime environment of a VM. + +With azure-vm-utils installed in a VM on Azure, simply copy the selftest.py executable to the target and +execute with sudo. + +``` +scp selftest/selftest.py $ip: && ssh $ip -- sudo ./selftest.py +``` + +## test_images + +To help automate a spread of tests, test_images provides functional testing for a set of pre-existing images, +assuming azure-vm-utils is already installed. It depends on az-cli, ssh, and ssh-keygen to create VMs +and ssh into them to run the tests. + +To run tests against marketplace and community images with azure-vm-utils: + +``` +AZURE_SUBSCRIPTION_ID= \ +AZURE_LOCATION=eastus2 \ +pytest -v selftest +``` + +To run tests for custom images and vm sizes, test_custom() is provided and can be configured via environment. +TEST_CUSTOM_IMAGES and TEST_CUSTOM_VM_SIZES are comma-separated so multiple may be tested at a time. + +For example: + +``` +AZURE_SUBSCRIPTION_ID= \ +AZURE_LOCATION=eastus2 \ +TEST_CUSTOM_IMAGES=/my/image1,/my/image2,... \ +TEST_CUSTOM_VM_SIZES=Standard_D2ds_v5,Standard_D2ds_v6,... \ +pytest -v -k test_custom +``` + +For convenience, the default spread of VM sizes can be re-used for custom tests by setting one of the +following that are appropriate for the image(s) under test: + +``` +TEST_CUSTOM_VM_SIZES=DEFAULT_GEN1_VM_SIZES +TEST_CUSTOM_VM_SIZES=DEFAULT_GEN2_VM_SIZES +TEST_CUSTOM_VM_SIZES=DEFAULT_ARM64_VM_SIZES +``` + # Contributing This project welcomes contributions and suggestions. 
Most contributions require you to agree to a @@ -57,8 +106,8 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio ## Trademarks -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. diff --git a/cmake/python.cmake b/cmake/python.cmake new file mode 100644 index 0000000..5bc8e0f --- /dev/null +++ b/cmake/python.cmake @@ -0,0 +1,19 @@ +file(GLOB PYTHON_SOURCES */*.py) + +add_custom_target( + python-autoformat + COMMAND isort ${PYTHON_SOURCES} + COMMAND black ${PYTHON_SOURCES} + COMMAND autoflake -r --in-place --remove-unused-variables --remove-all-unused-imports --ignore-init-module-imports ${PYTHON_SOURCES} + COMMENT "Running autoformatting tools" +) + +add_custom_target( + python-lint + COMMAND isort --check-only ${PYTHON_SOURCES} + COMMAND black --check --diff ${PYTHON_SOURCES} + COMMAND mypy --ignore-missing-imports ${PYTHON_SOURCES} + COMMAND flake8 --ignore=W503,E501,E402 ${PYTHON_SOURCES} + COMMAND pylint ${PYTHON_SOURCES} + COMMENT "Running Python lint checks and formatting checks" +) diff --git a/selftest/__init__.py b/selftest/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/selftest/selftest.py b/selftest/selftest.py new file mode 100755 index 0000000..5505e9a --- /dev/null +++ b/selftest/selftest.py @@ -0,0 +1,869 @@ +#!/usr/bin/env python3 + +# 
-------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in the project root for license information. +# -------------------------------------------------------------------------- + +"""Azure VM utilities self-tests script.""" + +import argparse +import glob +import logging +import os +import re +import subprocess +from dataclasses import dataclass, field +from typing import Dict, List, Literal, Optional + +import requests + +logger = logging.getLogger("selftest") + + +# pylint: disable=line-too-long +# pylint: disable=too-many-instance-attributes +# pylint: disable=too-many-locals + + +@dataclass(eq=True, repr=True) +class SkuConfig: + """VM sku-specific configuration related to disks.""" + + vm_size: str + vm_size_type: Literal["arm64", "x64"] = "x64" + nvme_controller_toggle_supported: bool = ( + False # whether the sku supports NVMe controller toggle (Eb[d]s_v5) + ) + nvme_only: bool = False # NVMe-only skus (v6+) + nvme_id_enabled_local: bool = False # whether the sku supports NVMe ID locally + nvme_id_enabled_remote: bool = False # whether the sku supports NVMe ID remotely + nvme_local_disk_count: int = 0 + nvme_local_disk_size_gib: int = 0 + temp_disk_size_gib: int = 0 # SCSI temp/resource disk size in GiB + + +@dataclass(eq=True, repr=True) +class V6SkuConfig(SkuConfig): + """V6 VM sku-specific configuration related to disks.""" + + nvme_only: bool = True + nvme_id_enabled_local: bool = True + nvme_id_enabled_remote: bool = False + + +def gb_to_gib(size_gb: int) -> int: + """Roughly convert GB to GiB as sizes are documented in both ways.""" + return int(size_gb * (1000**3) / (1024**3)) + + +SKU_CONFIGS = { + "Standard_B2ts_v2": SkuConfig(vm_size="Standard_B2ts_v2"), + # "Standard_D2s_v3": SkuConfig(vm_size="Standard_D2s_v3", temp_disk_size_gib=16), + "Standard_D2s_v4": SkuConfig(vm_size="Standard_D2s_v4"), + "Standard_D2ds_v4": 
SkuConfig(vm_size="Standard_D2ds_v4", temp_disk_size_gib=75), + "Standard_D2s_v5": SkuConfig(vm_size="Standard_D2s_v5"), + "Standard_D2ds_v5": SkuConfig(vm_size="Standard_D2ds_v5", temp_disk_size_gib=75), + "Standard_D2ads_v5": SkuConfig(vm_size="Standard_D2ads_v5", temp_disk_size_gib=75), + "Standard_D16ads_v5": SkuConfig( + vm_size="Standard_D16ads_v5", temp_disk_size_gib=600 + ), + "Standard_L8s_v2": SkuConfig( + vm_size="Standard_L8s_v2", + temp_disk_size_gib=80, + nvme_local_disk_count=1, + nvme_local_disk_size_gib=gb_to_gib(1920), + ), + "Standard_L8s_v3": SkuConfig( + vm_size="Standard_L8s_v3", + temp_disk_size_gib=80, + nvme_local_disk_count=1, + nvme_local_disk_size_gib=gb_to_gib(1920), + ), + "Standard_L80s_v3": SkuConfig( + vm_size="Standard_L80s_v3", + nvme_controller_toggle_supported=True, + temp_disk_size_gib=800, + nvme_local_disk_count=10, + nvme_local_disk_size_gib=gb_to_gib(1920), + ), + "Standard_E2bs_v5": SkuConfig( + vm_size="Standard_E2bs_v5", nvme_controller_toggle_supported=True + ), + "Standard_E2bds_v5": SkuConfig( + vm_size="Standard_E2bds_v5", + nvme_controller_toggle_supported=True, + temp_disk_size_gib=75, + ), + "Standard_D2as_v6": V6SkuConfig(vm_size="Standard_D2als_v6"), + "Standard_D2ads_v6": V6SkuConfig( + vm_size="Standard_D2alds_v6", + nvme_local_disk_count=1, + nvme_local_disk_size_gib=110, + ), + "Standard_D16ads_v6": V6SkuConfig( + vm_size="Standard_D16ads_v6", + nvme_local_disk_count=2, + nvme_local_disk_size_gib=440, + ), + "Standard_D32ads_v6": V6SkuConfig( + vm_size="Standard_D32ads_v6", + nvme_local_disk_count=4, + nvme_local_disk_size_gib=440, + ), + "Standard_D2pls_v5": SkuConfig( + vm_size="Standard_D2pls_v5", + vm_size_type="arm64", + ), + "Standard_D2plds_v5": SkuConfig( + vm_size="Standard_D2plds_v5", + vm_size_type="arm64", + temp_disk_size_gib=75, + ), + "Standard_D8pls_v5": SkuConfig( + vm_size="Standard_D8pls_v5", + vm_size_type="arm64", + ), + "Standard_D8plds_v5": SkuConfig( + vm_size="Standard_D8plds_v5", + 
vm_size_type="arm64", + temp_disk_size_gib=300, + ), + "Standard_D2pls_v6": SkuConfig( + vm_size="Standard_D2pls_v6", + vm_size_type="arm64", + ), + "Standard_D2plds_v6": SkuConfig( + vm_size="Standard_D2plds_v6", + vm_size_type="arm64", + nvme_local_disk_count=1, + nvme_local_disk_size_gib=110, + ), + "Standard_D16pls_v6": SkuConfig( + vm_size="Standard_D16pls_v6", + vm_size_type="arm64", + ), + "Standard_D16plds_v6": SkuConfig( + vm_size="Standard_D16plds_v6", + vm_size_type="arm64", + nvme_local_disk_count=2, + nvme_local_disk_size_gib=440, + ), +} + + +def device_sort(devices: List[str]) -> List[str]: + """Natural sort for devices.""" + + def natural_sort_key(s: str): + # Natural sort by turning a string into a list of string and number chunks. + # e.g. "nvme0n10" -> ["nvme", 0, "n", 10] + return [ + int(text) if text.isdigit() else text for text in re.split("([0-9]+)", s) + ] + + return sorted(devices, key=natural_sort_key) + + +def get_disk_size_gb(disk_path: str) -> int: + """Get the size of the disk in GB.""" + try: + proc = subprocess.run( + ["lsblk", "-b", "-n", "-o", "SIZE", "-d", disk_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + ) + logger.debug("lsblk output: %r", proc) + size_bytes = int(proc.stdout.strip()) + size_gib = size_bytes // (1000**3) + return size_gib + except subprocess.CalledProcessError as error: + logger.error("error while fetching disk size: %r", error) + raise + except FileNotFoundError: + logger.error("lsblk command not found") + raise + + +def get_disk_size_gib(disk_path: str) -> int: + """Get the size of the disk in GiB.""" + try: + proc = subprocess.run( + ["lsblk", "-b", "-n", "-o", "SIZE", "-d", disk_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + ) + logger.debug("lsblk output: %r", proc) + size_bytes = int(proc.stdout.strip()) + size_gib = size_bytes // (1024**3) + return size_gib + except subprocess.CalledProcessError as error: + 
logger.error("error while fetching disk size: %r", error) + raise + except FileNotFoundError: + logger.error("lsblk command not found") + raise + + +def get_imds_metadata() -> Dict: + """Fetch IMDS metadata.""" + url = "http://169.254.169.254/metadata/instance?api-version=2021-02-01" + headers = {"Metadata": "true"} + + try: + response = requests.get(url, headers=headers, timeout=60) + response.raise_for_status() + metadata = response.json() + logger.debug("fetched IMDS metadata: %r", metadata) + return metadata + except requests.RequestException as error: + logger.error("error fetching IMDS metadata: %r", error) + raise + + +def get_local_nvme_disks() -> List[str]: + """Get all local NVMe disks.""" + local_disk_controllers = get_nvme_controllers_with_model( + "Microsoft NVMe Direct Disk" + ) + local_disk_controllers_v2 = get_nvme_controllers_with_model( + "Microsoft NVMe Direct Disk v2" + ) + + return device_sort( + [ + namespace + for controller in local_disk_controllers + local_disk_controllers_v2 + for namespace in get_nvme_namespace_devices(controller) + ] + ) + + +def get_remote_nvme_disks() -> List[str]: + """Get all remote NVMe disks.""" + remote_disk_controllers = get_nvme_controllers_with_model( + "MSFT NVMe Accelerator v1.0" + ) + + assert ( + len(remote_disk_controllers) <= 1 + ), f"unexpected number of remote controllers {remote_disk_controllers}" + return device_sort( + [ + namespace + for controller in remote_disk_controllers + for namespace in get_nvme_namespace_devices(controller) + ] + ) + + +def get_nvme_controllers_with_model(model: str) -> List[str]: + """Get a list of all NVMe controllers with the specified model.""" + nvme_controllers = [] + nvme_path = "/sys/class/nvme" + + for controller in glob.glob(os.path.join(nvme_path, "nvme*")): + logger.debug("checking controller: %s", controller) + model_path = os.path.join(controller, "model") + try: + with open(model_path, "r", encoding="utf-8") as file: + controller_model = file.read().strip() + 
logger.debug("controller: %s model: %s", controller, controller_model) + if controller_model == model: + controller_name = controller.split("/")[-1] + nvme_controllers.append(controller_name) + except FileNotFoundError: + logger.debug("model file not found: %s", model_path) + continue + + return device_sort(nvme_controllers) + + +def get_nvme_namespace_devices_with_model(model: str) -> List[str]: + """Get all NVMe namespace devices for a given NVMe controller model.""" + controllers = get_nvme_controllers_with_model(model) + logger.debug("controllers found for model=%s: %r", model, controllers) + return device_sort( + [ + namespace + for controller in controllers + for namespace in get_nvme_namespace_devices(controller) + ] + ) + + +def get_nvme_namespace_devices(controller: str) -> List[str]: + """Get all NVMe namespace devices for a given NVMe controller.""" + namespace_devices = [] + controller_name = controller.split("/")[-1] + nvme_path = f"/sys/class/nvme/{controller_name}" + + logger.debug("checking namespaces under %s", nvme_path) + for namespace in glob.glob(os.path.join(nvme_path, "nvme*")): + logger.debug("checking namespace device: %s", namespace) + if os.path.isdir(namespace): + device_name = namespace.split("/")[-1] + namespace_devices.append(device_name) + + return device_sort(namespace_devices) + + +def get_root_block_device() -> str: + """Get the root block device using findmnt.""" + try: + proc = subprocess.run( + ["findmnt", "-n", "-o", "SOURCE", "/"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + ) + logger.debug("findmnt output: %r", proc) + return proc.stdout.strip() + except subprocess.CalledProcessError as error: + logger.error("error while fetching root block device: %r", error) + raise + except FileNotFoundError: + logger.error("findmnt command not found") + raise + + +def get_scsi_resource_disk() -> Optional[str]: + """Get the SCSI resource disk device.""" + paths = [ + # cloud-init udev rules + 
"/dev/disk/cloud/azure_resource", + # gen2 + "/dev/disk/by-path/acpi-VMBUS:00-vmbus-f8b3781a1e824818a1c363d806ec15bb-lun-1", + # gen1 + "/dev/disk/by-path/acpi-VMBUS:01-vmbus-000000000001*-lun-0", + ] + + for path in paths: + if "*" in path: + matched_paths = glob.glob(path) + for matched_path in matched_paths: + resolved_path = os.path.realpath(matched_path) + if os.path.exists(resolved_path): + return resolved_path.split("/")[-1] + else: + if os.path.exists(path): + resolved_path = os.path.realpath(path) + if os.path.exists(resolved_path): + return resolved_path.split("/")[-1] + + logger.info("no SCSI resource disk found") + return None + + +DEV_DISK_AZURE_RESOURCE = "/dev/disk/azure/resource" + + +@dataclass(eq=True, repr=True) +class DiskInfo: + """Information about different types of disks present.""" + + root_device: str + dev_disk_azure_links: List[str] = field(default_factory=list) + dev_disk_azure_resource_disk: Optional[str] = None # resolved path + dev_disk_azure_resource_disk_size_gib: int = 0 + nvme_local_disk_size_gib: int = 0 + nvme_local_disks_v1: List[str] = field(default_factory=list) + nvme_local_disks_v2: List[str] = field(default_factory=list) + nvme_local_disks: List[str] = field(default_factory=list) + nvme_remote_data_disks: List[str] = field(default_factory=list) + nvme_remote_disks: List[str] = field(default_factory=list) + nvme_remote_os_disk: Optional[str] = None + root_device_is_nvme: bool = False + scsi_resource_disk: Optional[str] = None + scsi_resource_disk_size_gib: int = 0 + + @classmethod + def gather(cls) -> "DiskInfo": + """Gather disk information and return an instance of DiskInfo.""" + dev_disk_azure_links = device_sort( + [ + link + for link in glob.glob( + os.path.join("/dev/disk/azure", "**"), recursive=True + ) + if os.path.islink(link) + ] + ) + + dev_disk_azure_resource_disk = None + dev_disk_azure_resource_disk_size_gib = 0 + if os.path.exists(DEV_DISK_AZURE_RESOURCE): + dev_disk_azure_resource_disk = 
os.path.realpath(DEV_DISK_AZURE_RESOURCE) + dev_disk_azure_resource_disk_size_gib = get_disk_size_gib( + dev_disk_azure_resource_disk + ) + + nvme_local_disks_v1 = get_nvme_namespace_devices_with_model( + "Microsoft NVMe Direct Disk" + ) + nvme_local_disks_v2 = get_nvme_namespace_devices_with_model( + "Microsoft NVMe Direct Disk v2" + ) + nvme_local_disks = device_sort(nvme_local_disks_v1 + nvme_local_disks_v2) + nvme_local_disk_size_gib = 0 + if nvme_local_disks: + nvme_local_disk_size_gib = min( + get_disk_size_gib(f"/dev/{disk}") for disk in nvme_local_disks + ) + local_disk_max_size = max( + get_disk_size_gib(f"/dev/{disk}") for disk in nvme_local_disks + ) + assert ( + nvme_local_disk_size_gib == local_disk_max_size + ), f"local disk size mismatch: {nvme_local_disk_size_gib} != {local_disk_max_size} for {nvme_local_disks}" + + nvme_remote_disks = get_remote_nvme_disks() + if nvme_remote_disks: + nvme_remote_os_disk = nvme_remote_disks.pop(0) + nvme_remote_data_disks = nvme_remote_disks + else: + nvme_remote_os_disk = None + nvme_remote_data_disks = [] + + root_device = get_root_block_device() + root_device_is_nvme = root_device.startswith("/dev/nvme") + root_device = root_device.split("/")[-1] + + scsi_resource_disk = get_scsi_resource_disk() + scsi_resource_disk_size_gib = ( + get_disk_size_gib(f"/dev/{scsi_resource_disk}") if scsi_resource_disk else 0 + ) + + disk_info = cls( + dev_disk_azure_links=dev_disk_azure_links, + dev_disk_azure_resource_disk=dev_disk_azure_resource_disk, + dev_disk_azure_resource_disk_size_gib=dev_disk_azure_resource_disk_size_gib, + nvme_local_disk_size_gib=nvme_local_disk_size_gib, + nvme_local_disks_v1=nvme_local_disks_v1, + nvme_local_disks_v2=nvme_local_disks_v2, + nvme_local_disks=nvme_local_disks, + nvme_remote_os_disk=nvme_remote_os_disk, + nvme_remote_data_disks=nvme_remote_data_disks, + nvme_remote_disks=nvme_remote_disks, + root_device=root_device, + root_device_is_nvme=root_device_is_nvme, + 
scsi_resource_disk=scsi_resource_disk, + scsi_resource_disk_size_gib=scsi_resource_disk_size_gib, + ) + + logger.info("disks info: %r", disk_info) + return disk_info + + +@dataclass +class AzureNvmeIdDevice: + """Azure NVMe ID device.""" + + device: str + nvme_id: str + type: Optional[str] + index: Optional[int] + name: Optional[str] + extra: Dict[str, str] + + +@dataclass(repr=True) +class AzureNvmeIdInfo: + """Azure NVMe ID.""" + + azure_nvme_id_stdout: str + azure_nvme_id_stderr: str + azure_nvme_id_returncode: int + + azure_nvme_id_version_stdout: str + azure_nvme_id_version_stderr: str + azure_nvme_id_version_returncode: int + azure_nvme_id_version: str + + azure_nvme_id_disks: Dict[str, AzureNvmeIdDevice] + + def validate_azure_nvme_id(self, disk_info: DiskInfo) -> None: + """Validate azure-nvme-id outputs.""" + assert self.azure_nvme_id_returncode == 0, "azure-nvme-id failed" + if not os.path.exists("/sys/class/nvme"): + assert ( + self.azure_nvme_id_stderr + == "no NVMe devices in /sys/class/nvme: No such file or directory\n" + ), f"unexpected azure-nvme-id stderr without /sys/class/nvme: {self.azure_nvme_id_stderr}" + else: + assert ( + self.azure_nvme_id_stderr == "" + ), f"unexpected azure-nvme-id stderr: {self.azure_nvme_id_stderr}" + + disk_cfg: Optional[AzureNvmeIdDevice] = None + for device_name, disk_cfg in self.azure_nvme_id_disks.items(): + assert f"/dev/{device_name}" == disk_cfg.device + assert disk_cfg.device.startswith( + "/dev/nvme" + ), f"unexpected device: {disk_cfg}" + + for device_name in disk_info.nvme_local_disks_v2: + assert ( + device_name in self.azure_nvme_id_disks + ), f"missing azure-nvme-id for {device_name}" + disk_cfg = self.azure_nvme_id_disks.get(device_name) + assert disk_cfg, f"failed to find azure-nvme-id for {device_name}" + assert disk_cfg.type == "local", "unexpected local disk type {disk_cfg}" + assert disk_cfg.name, "unexpected local disk name {disk_cfg}" + assert disk_cfg.index, "unexpected local disk index 
{disk_cfg}" + assert disk_cfg.nvme_id, "unexpected local disk id {disk_cfg}" + assert not disk_cfg.extra, "unexpected local disk extra {disk_cfg}" + + for device_name in disk_info.nvme_remote_disks + disk_info.nvme_local_disks_v1: + assert ( + device_name in self.azure_nvme_id_disks + ), f"missing azure-nvme-id for {device_name}" + disk_cfg = self.azure_nvme_id_disks.get(device_name) + assert disk_cfg, f"failed to find azure-nvme-id for {device_name}" + assert not disk_cfg.type, "unexpected disk type {disk_cfg}" + assert not disk_cfg.name, "unexpected disk name {disk_cfg}" + assert not disk_cfg.index, "unexpected disk index {disk_cfg}" + assert not disk_cfg.nvme_id, "unexpected disk id {disk_cfg}" + assert not disk_cfg.extra, "unexpected disk extra {disk_cfg}" + + logger.info("validate_azure_nvmve_id: OK") + + def validate_azure_nvme_id_version(self) -> None: + """Validate azure-nvme-id --version outputs.""" + assert ( + self.azure_nvme_id_version_returncode == 0 + ), "azure-nvme-id --version failed" + assert ( + self.azure_nvme_id_version_stderr == "" + ), f"unexpected azure-nvme-id stderr: {self.azure_nvme_id_stderr}" + assert self.azure_nvme_id_version_stdout, "missing azure-nvme-id version stdout" + assert re.match( + r"azure-nvme-id [0v]\.*", self.azure_nvme_id_version_stdout.strip() + ), f"unexpected azure-nvme-id version stdout: {self.azure_nvme_id_version_stdout}" + assert re.match( + r"[0v]\.*", self.azure_nvme_id_version + ), f"unexpected azure-nvme-id version: {self.azure_nvme_id_version}" + + logger.info("validate_azure_nvme_id_version OK: %s", self.azure_nvme_id_version) + + def validate(self, disk_info: DiskInfo) -> None: + """Validate Azure NVMe ID output.""" + self.validate_azure_nvme_id_version() + self.validate_azure_nvme_id(disk_info) + + @classmethod + def gather(cls) -> "AzureNvmeIdInfo": + """Gather Azure NVMe ID information.""" + proc = subprocess.run(["azure-nvme-id"], capture_output=True, check=False) + azure_nvme_id_stdout = 
proc.stdout.decode("utf-8") + azure_nvme_id_stderr = proc.stderr.decode("utf-8") + azure_nvme_id_returncode = proc.returncode + + proc = subprocess.run( + ["azure-nvme-id", "--version"], capture_output=True, check=False + ) + azure_nvme_id_version_stdout = proc.stdout.decode("utf-8") + azure_nvme_id_version_stderr = proc.stderr.decode("utf-8") + azure_nvme_id_version_returncode = proc.returncode + azure_nvme_id_version = cls.parse_azure_nvme_id_version( + azure_nvme_id_version_stdout + ) + azure_nvme_id_disks = cls.parse_azure_nvme_id_output(azure_nvme_id_stdout) + + azure_nvme_id_info = cls( + azure_nvme_id_stdout=azure_nvme_id_stdout, + azure_nvme_id_stderr=azure_nvme_id_stderr, + azure_nvme_id_returncode=azure_nvme_id_returncode, + azure_nvme_id_version_stdout=azure_nvme_id_version_stdout, + azure_nvme_id_version_stderr=azure_nvme_id_version_stderr, + azure_nvme_id_version_returncode=azure_nvme_id_version_returncode, + azure_nvme_id_version=azure_nvme_id_version, + azure_nvme_id_disks=azure_nvme_id_disks, + ) + logger.info("azure-nvme-id info: %r", azure_nvme_id_info) + return azure_nvme_id_info + + @staticmethod + def parse_azure_nvme_id_output(output: str) -> Dict[str, AzureNvmeIdDevice]: + """Parse azure-nvme-id output. 
+ + Example output: + /dev/nvme0n1: + /dev/nvme0n2: + /dev/nvme0n3: + /dev/nvme1n1: type=local,index=1,name=nvme-440G-1 + /dev/nvme2n1: type=local,index=2,name=nvme-440G-2 + """ + devices = {} + + for line in output.splitlines(): + parts = line.strip().split(":", 1) + if parts[-1] == "": + parts.pop() + + device = parts[0].strip() + if len(parts) == 2: + nvme_id = parts[1].strip() + properties = dict(kv.split("=", 1) for kv in nvme_id.split(",")) + elif len(parts) == 1: + nvme_id = "" + properties = {} + else: + raise ValueError(f"unexpected azure-nvme-id output: {line}") + + device_type = properties.pop("type", None) + device_index = ( + int(properties.pop("index")) if "index" in properties else None + ) + device_name = properties.pop("name", None) + azure_nvme_id_device = AzureNvmeIdDevice( + device=device, + nvme_id=nvme_id, + type=device_type, + index=device_index, + name=device_name, + extra=properties, + ) + + key = device.split("/")[-1] + devices[key] = azure_nvme_id_device + + return devices + + @staticmethod + def parse_azure_nvme_id_version(azure_nvme_id_version_output: str) -> str: + """Parse azure-nvme-id version output and return version info.""" + parts = azure_nvme_id_version_output.strip().split(" ") + assert ( + len(parts) == 2 + ), f"unexpected azure-nvme-id version output: {azure_nvme_id_version_output}" + return parts[1] + + +class AzureVmUtilsValidator: + """Validate Azure VM utilities.""" + + def __init__(self) -> None: + self.azure_nvme_id_info = AzureNvmeIdInfo.gather() + self.disk_info = DiskInfo.gather() + self.imds_metadata = get_imds_metadata() + self.vm_size = self.imds_metadata.get("compute", {}).get("vmSize") + self.sku_config = SKU_CONFIGS.get(self.vm_size) + + logger.info("sku config: %r", self.sku_config) + + def validate_dev_disk_azure_links_data(self) -> None: + """Validate /dev/disk/azure/data links. + + All data disks should have by-lun if azure-vm-utils is installed. + Future variants of remote disks will include by-name. 
+ """ + imds_data_disks = ( + self.imds_metadata.get("compute", {}) + .get("storageProfile", {}) + .get("dataDisks", []) + ) + expected_data_disks = len(imds_data_disks) + data_disks = [ + link + for link in self.disk_info.dev_disk_azure_links + if link.startswith("/dev/disk/azure/data/by-lun") + ] + if self.disk_info.nvme_remote_disks: + assert len(data_disks) == len( + self.disk_info.nvme_remote_data_disks + ), f"unexpected number of data disks: {data_disks} configured={self.disk_info.nvme_remote_data_disks}" + + assert ( + len(data_disks) == expected_data_disks + ), f"unexpected number of data disks: {data_disks} IMDS configured={imds_data_disks} (note that IMDS may not be accurate)" + + # Verify disk sizes match up with IMDS configuration. + for imds_disk in imds_data_disks: + lun = imds_disk.get("lun") + expected_size_gb = int(imds_disk.get("diskSizeGB")) + disk_path = f"/dev/disk/azure/data/by-lun/{lun}" + actual_size_gb = get_disk_size_gb(disk_path) + assert ( + actual_size_gb == expected_size_gb + ), f"disk size mismatch for {disk_path}: expected {expected_size_gb} GB, found {actual_size_gb} GB" + + logger.info("validate_dev_disk_azure_links_data OK: %r", data_disks) + + def validate_dev_disk_azure_links_local(self) -> None: + """Validate /dev/disk/azure/local links. + + All local disks should have by-serial if azure-vm-utils is installed. + If NVMe id is supported, by-index and by-name will be available as well. 
+ """ + local_disks = sorted( + [ + link + for link in self.disk_info.dev_disk_azure_links + if link.startswith("/dev/disk/azure/local") + ] + ) + + for key in ["index", "name", "serial"]: + local_disks_by_key = sorted( + [ + link + for link in self.disk_info.dev_disk_azure_links + if link.startswith(f"/dev/disk/azure/local/by-{key}") + ] + ) + if key == "serial": + expected_count = len(self.disk_info.nvme_local_disks) + else: + expected_count = len(self.disk_info.nvme_local_disks_v2) + + assert ( + len(local_disks_by_key) == expected_count + ), f"unexpected number of local disks by-{key}: {local_disks_by_key} (expected {expected_count})" + assert ( + not self.sku_config + or not self.sku_config.nvme_id_enabled_local + or len(local_disks_by_key) == self.sku_config.nvme_local_disk_count + ), f"unexpected number of local disks by sku for by-{key}: {local_disks_by_key} (expected {expected_count})" + + if key == "name": + for disk in local_disks_by_key: + name = disk.split("/")[-1] + assert name.startswith( + "nvme-" + ), f"unexpected local disk name: {name}" + match = re.match(r"nvme-(\d+)G-(\d+)", name) + assert ( + match + ), f"local disk name does not conform to expected pattern: {name}" + size, index = match.groups() + assert ( + size.isdigit() and index.isdigit() + ), f"invalid size or index in local disk name: {name}" + + # Cross-check by-index links with by-name links. 
+ by_index_path = f"/dev/disk/azure/local/by-index/{index}" + assert os.path.realpath(by_index_path) == os.path.realpath( + disk + ), f"mismatch between by-index and by-name links: {by_index_path} != {disk}" + + logger.info("validate_dev_disk_azure_links_local OK: %r", local_disks) + + def validate_dev_disk_azure_links_os(self) -> None: + """Validate /dev/disk/azure/os link.""" + os_disk = "/dev/disk/azure/os" + assert os_disk in self.disk_info.dev_disk_azure_links, f"missing {os_disk}" + + logger.info("validate_dev_disk_azure_links_os OK: %r", os_disk) + + def validate_dev_disk_azure_links_resource(self) -> None: + """Validate /dev/disk/azure/resource link.""" + resource_disk = "/dev/disk/azure/resource" + if self.sku_config and self.sku_config.temp_disk_size_gib: + assert ( + resource_disk in self.disk_info.dev_disk_azure_links + ), f"missing {resource_disk}" + else: + assert ( + resource_disk not in self.disk_info.dev_disk_azure_links + ), f"unexpected {resource_disk}" + + logger.info("validate_dev_disk_azure_links_resource OK: %r", resource_disk) + + def validate_nvme_local_disks(self) -> None: + """Validate NVMe local disks.""" + logger.info("validate_nvme_local_disks OK: %r", self.disk_info.nvme_local_disks) + + def validate_scsi_resource_disk(self) -> None: + """Validate SCSI resource disk symlink and size.""" + assert ( + self.disk_info.scsi_resource_disk_size_gib + == self.disk_info.dev_disk_azure_resource_disk_size_gib + ), f"resource disk size mismatch: {self.disk_info}" + if self.disk_info.scsi_resource_disk: + assert ( + f"/dev/{self.disk_info.scsi_resource_disk}" + == self.disk_info.dev_disk_azure_resource_disk + ), f"unexpected resource disk path: {self.disk_info}" + else: + assert ( + self.disk_info.scsi_resource_disk is None + and self.disk_info.dev_disk_azure_resource_disk is None + ), f"unexpected resource disk path: {self.disk_info}" + + logger.info( + "validate_scsi_resource_disk OK: /dev/disk/azure/resource => %s", + 
self.disk_info.dev_disk_azure_resource_disk, + ) + + def validate_sku_config(self) -> None: + """Validate SKU config.""" + if not self.sku_config: + logger.warning( + "validate_sku_config SKIPPED: no sku configuration for VM size %r", + self.vm_size, + ) + return + + assert ( + self.sku_config.vm_size == self.vm_size + ), f"vm size mismatch: {self.sku_config.vm_size} != {self.vm_size}" + assert ( + len(self.disk_info.nvme_local_disks) + == self.sku_config.nvme_local_disk_count + ), f"local disk count mismatch: {len(self.disk_info.nvme_local_disks)} != {self.sku_config.nvme_local_disk_count}" + assert ( + self.disk_info.nvme_local_disk_size_gib + == self.sku_config.nvme_local_disk_size_gib + ), f"local disk size mismatch: {self.disk_info.nvme_local_disk_size_gib} != {self.sku_config.nvme_local_disk_size_gib}" + assert ( + self.disk_info.scsi_resource_disk_size_gib + == self.sku_config.temp_disk_size_gib + ), f"temp disk size mismatch: {self.disk_info.scsi_resource_disk_size_gib} != {self.sku_config.temp_disk_size_gib}" + assert ( + self.disk_info.dev_disk_azure_resource_disk_size_gib + == self.sku_config.temp_disk_size_gib + ), f"temp disk size mismatch: {self.disk_info.dev_disk_azure_resource_disk_size_gib} != {self.sku_config.temp_disk_size_gib}" + + logger.info("validate_sku_config OK: %r", self.sku_config) + + def validate(self) -> None: + """Run validations.""" + self.azure_nvme_id_info.validate(self.disk_info) + self.validate_dev_disk_azure_links_data() + self.validate_dev_disk_azure_links_local() + self.validate_dev_disk_azure_links_os() + self.validate_dev_disk_azure_links_resource() + self.validate_scsi_resource_disk() + self.validate_sku_config() + logger.info("success!") + + +def main() -> None: + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Azure VM utilities self-tests script." 
+ ) + parser.add_argument( + "--debug", + action="store_true", + help="Enable debug logging", + ) + args = parser.parse_args() + + if args.debug: + logging.basicConfig(format="%(message)s", level=logging.DEBUG) + else: + logging.basicConfig(format="%(message)s", level=logging.INFO) + + validator = AzureVmUtilsValidator() + validator.validate() + + +if __name__ == "__main__": + main() diff --git a/selftest/test-requirements.txt b/selftest/test-requirements.txt new file mode 100644 index 0000000..2acf960 --- /dev/null +++ b/selftest/test-requirements.txt @@ -0,0 +1,12 @@ +autoflake +black +flake8 +isort +mypy +pycodestyle +pyflakes +pylint +pytest +pytest-xdist +types-requests +typing_extensions diff --git a/selftest/test_images.py b/selftest/test_images.py new file mode 100755 index 0000000..11ff8e9 --- /dev/null +++ b/selftest/test_images.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python3 + +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in the project root for license information. +# -------------------------------------------------------------------------- + +"""Azure VM utilities self-tests script.""" + +import os +import shlex +import subprocess +import time +import uuid +from datetime import datetime, timedelta +from pathlib import Path +from typing import List + +import pytest + +from . 
from . import selftest

# pylint: disable=line-too-long
# pylint: disable=too-many-instance-attributes
# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments
# pylint: disable=unused-argument
# pylint: disable=attribute-defined-outside-init
# pylint: disable=redefined-outer-name

# Default VM-size spreads derived from the SKU configurations in selftest.py.
# v6 size families are excluded from the gen1 spread (newer x64 families are
# offered as gen2-only).
DEFAULT_GEN1_VM_SIZES = [
    sku.vm_size
    for sku in selftest.SKU_CONFIGS.values()
    if sku.vm_size_type == "x64" and not sku.vm_size.endswith("v6")
]

DEFAULT_GEN2_VM_SIZES = [
    sku.vm_size for sku in selftest.SKU_CONFIGS.values() if sku.vm_size_type == "x64"
]

DEFAULT_ARM64_VM_SIZES = [
    sku.vm_size for sku in selftest.SKU_CONFIGS.values() if sku.vm_size_type == "arm64"
]

# Custom images/sizes for test_custom(), comma-separated via environment.
# TEST_CUSTOM_VM_SIZES may also name one of the DEFAULT_* spreads above.
CUSTOM_IMAGES = [
    image for image in os.getenv("TEST_CUSTOM_IMAGES", "").split(",") if image
]
ENV_TEST_CUSTOM_VM_SIZES = os.getenv("TEST_CUSTOM_VM_SIZES", "")
if ENV_TEST_CUSTOM_VM_SIZES == "DEFAULT_GEN1_VM_SIZES":
    CUSTOM_VM_SIZES = DEFAULT_GEN1_VM_SIZES
elif ENV_TEST_CUSTOM_VM_SIZES == "DEFAULT_GEN2_VM_SIZES":
    CUSTOM_VM_SIZES = DEFAULT_GEN2_VM_SIZES
elif ENV_TEST_CUSTOM_VM_SIZES == "DEFAULT_ARM64_VM_SIZES":
    CUSTOM_VM_SIZES = DEFAULT_ARM64_VM_SIZES
else:
    # fix: the loop variable was previously named "image"; these are VM sizes.
    CUSTOM_VM_SIZES = [size for size in ENV_TEST_CUSTOM_VM_SIZES.split(",") if size]


def subprocess_run(
    cmd: List[str], check: bool = True
) -> "subprocess.CompletedProcess[str]":
    """Run a subprocess command and capture outputs as utf-8.

    Logs the command and its result; when check=True a failure is logged
    (including stdout/stderr) and CalledProcessError is re-raised.
    """
    print(f"executing command: {shlex.join(cmd)}")
    try:
        proc = subprocess.run(
            cmd,
            check=check,
            capture_output=True,
            text=True,
            encoding="utf-8",
        )
    except subprocess.CalledProcessError as error:
        print(
            f"error running command: {error} stdout={error.stdout} stderr={error.stderr}"
        )
        raise

    print(f"executed command: {proc}")
    return proc


@pytest.fixture
def ssh_key_path(tmp_path):
    """Generate a disposable SSH key pair, yielding the private-key path."""
    path = tmp_path / "id_rsa"
    subprocess.run(
        ["ssh-keygen", "-t", "rsa", "-b", "2048", "-f", str(path), "-N", ""], check=True
    )
    yield path


@pytest.fixture
def azure_subscription():
    """Get the Azure subscription ID.

    Accepts AZURE_SUBSCRIPTION (as documented in README.md), with
    AZURE_SUBSCRIPTION_ID supported as a fallback.
    """
    subscription = (
        os.getenv("AZURE_SUBSCRIPTION") or os.getenv("AZURE_SUBSCRIPTION_ID") or ""
    )
    assert (
        subscription
    ), "AZURE_SUBSCRIPTION (or AZURE_SUBSCRIPTION_ID) environment variable is required"
    yield subscription


@pytest.fixture
def azure_location():
    """Get the Azure location."""
    location = os.getenv("AZURE_LOCATION") or ""
    assert location, "AZURE_LOCATION environment variable is required"
    yield location


@pytest.fixture
def temp_resource_group(azure_subscription, azure_location):
    """Create a temporary resource group; delete it (async) on teardown.

    The group is tagged deleteAfter=<now+1h> so orphaned groups can be
    garbage-collected if teardown never runs.
    """
    resource_group_name = f"test-rg-{uuid.uuid4()}"
    delete_after = (datetime.utcnow() + timedelta(hours=1)).isoformat()
    subprocess.run(
        [
            "az",
            "group",
            "create",
            "--subscription",
            azure_subscription,
            "--location",
            azure_location,
            "--name",
            resource_group_name,
            "--tags",
            f"deleteAfter={delete_after}",
        ],
        check=True,
    )
    yield resource_group_name
    subprocess.run(
        ["az", "group", "delete", "--name", resource_group_name, "--yes", "--no-wait"],
        check=True,
    )


# Each test VM gets a fresh host key, so host-key prompts would hang a
# non-interactive run; skip host-key verification and avoid polluting
# known_hosts with throwaway entries.
SSH_OPTS = [
    "-o",
    "StrictHostKeyChecking=no",
    "-o",
    "UserKnownHostsFile=/dev/null",
]


class TestVMs:
    """Test VMs with different images and sizes."""

    @pytest.fixture(autouse=True)
    def setup(
        self,
        azure_location,
        azure_subscription,
        ssh_key_path,
        temp_resource_group,
        image,
        vm_size,
    ):
        """Initialize per-test attributes from fixtures and parametrization."""
        self.admin_username = "azureuser"
        self.azure_location = azure_location
        self.azure_subscription = azure_subscription
        self.image = image
        # selftest.py lives next to this file and is copied to the VM.
        self.selftest_script_path = (
            Path(os.path.abspath(__file__)).parent / "selftest.py"
        )
        self.ssh_key_path = ssh_key_path
        self.temp_resource_group = temp_resource_group
        self.vm_name = "test-vm"
        self.vm_size = vm_size

    def _create_vm(self) -> None:
        """Create the test VM with four small data disks; skip on failure.

        Quota/capacity errors are environmental, not product bugs, so the
        test is skipped (with the reason) rather than failed.
        """
        proc = subprocess_run(
            [
                "az",
                "vm",
                "create",
                "--subscription",
                self.azure_subscription,
                "--resource-group",
                self.temp_resource_group,
                "--name",
                self.vm_name,
                "--image",
                self.image,
                "--size",
                self.vm_size,
                "--admin-username",
                self.admin_username,
                "--ssh-key-value",
                f"{self.ssh_key_path}.pub",
                "--data-disk-sizes-gb",
                "1",
                "2",
                "3",
                "4",
                "--accept-term",
            ],
            check=False,
        )

        if proc.returncode != 0:
            if "QuotaExceeded" in proc.stderr:
                pytest.skip(
                    f"Unable to create VM due to QuotaExceeded for {self.vm_size}: {proc.stderr}"
                )

            if "SkuNotAvailable" in proc.stderr:
                pytest.skip(
                    f"Unable to create VM due to SkuNotAvailable for {self.vm_size}: {proc.stderr}"
                )

            pytest.skip(
                f"Unable to create VM: stdout={proc.stdout} stderr={proc.stderr}"
            )

    def _get_public_ip(self) -> str:
        """Poll az for the VM's public IP address, failing the test if absent."""
        proc = None
        for _ in range(10):
            proc = subprocess_run(
                [
                    "az",
                    "vm",
                    "list-ip-addresses",
                    "--subscription",
                    self.azure_subscription,
                    "--resource-group",
                    self.temp_resource_group,
                    "--name",
                    self.vm_name,
                    "--query",
                    "[0].virtualMachine.network.publicIpAddresses[0].ipAddress",
                    "--output",
                    "tsv",
                ],
                check=False,
            )
            public_ip_address = proc.stdout.strip()
            if public_ip_address:
                return public_ip_address
            time.sleep(1)

        pytest.fail(
            f"Unable to get public IP address of the VM: stdout={proc.stdout} stderr={proc.stderr}"
        )

    def _wait_for_ready(self, ssh_command: List[str]) -> None:
        """Wait for cloud-init and systemd to settle.

        systemctl is-system-running --wait blocks until the system settles,
        so each attempt is itself long-running; the retry bound prevents an
        unreachable VM from looping forever (the original loop was unbounded).
        """
        status = "unknown"
        for _ in range(30):
            subprocess_run(
                ssh_command + ["cloud-init", "status", "--wait"], check=False
            )
            proc = subprocess_run(
                ssh_command + ["systemctl", "is-system-running", "--wait"], check=False
            )
            status = proc.stdout.strip()
            if status in ("running", "degraded"):
                return

        pytest.fail(f"VM never became ready: last status={status!r}")

    def run_test(self) -> None:
        """Create VM and run self-tests."""
        target_script_path = f"/home/{self.admin_username}/selftest.py"

        self._create_vm()
        public_ip_address = self._get_public_ip()

        ssh_command = (
            [
                "ssh",
                "-i",
                self.ssh_key_path.as_posix(),
            ]
            + SSH_OPTS
            + [
                f"{self.admin_username}@{public_ip_address}",
                "--",
                "sudo",
            ]
        )

        self._wait_for_ready(ssh_command)

        # Capture the boot log to aid debugging before running the tests.
        subprocess_run(
            ssh_command + ["journalctl", "-o", "short-monotonic"], check=False
        )

        subprocess_run(
            [
                "scp",
                "-i",
                self.ssh_key_path.as_posix(),
            ]
            + SSH_OPTS
            + [
                self.selftest_script_path.as_posix(),
                f"{self.admin_username}@{public_ip_address}:{target_script_path}",
            ],
        )
        subprocess_run(ssh_command + [target_script_path, "--debug"], check=True)

    @pytest.mark.parametrize(
        "image",
        [
            "debian:debian-13-daily:13:latest",
            "debian:debian-sid-daily:sid:latest",
        ],
    )
    @pytest.mark.parametrize(
        "vm_size",
        DEFAULT_GEN1_VM_SIZES,
    )
    def test_gen1_x64(self, image, vm_size):
        """Test gen1 x64 images."""
        self.run_test()

    @pytest.mark.parametrize(
        "image",
        [
            "debian:debian-13-daily:13-gen2:latest",
            "debian:debian-sid-daily:sid-gen2:latest",
            "/CommunityGalleries/Fedora-5e266ba4-2250-406d-adad-5d73860d958f/Images/Fedora-Cloud-Rawhide-x64/versions/latest",
        ],
    )
    @pytest.mark.parametrize(
        "vm_size",
        DEFAULT_GEN2_VM_SIZES,
    )
    def test_gen2_x64(self, image, vm_size):
        """Test gen2 x64 images."""
        self.run_test()

    @pytest.mark.parametrize(
        "image",
        [
            "debian:debian-13-daily:13-arm64:latest",
            "debian:debian-sid-daily:sid-arm64:latest",
            "/CommunityGalleries/Fedora-5e266ba4-2250-406d-adad-5d73860d958f/Images/Fedora-Cloud-Rawhide-Arm64/versions/latest",
        ],
    )
    @pytest.mark.parametrize(
        "vm_size",
        DEFAULT_ARM64_VM_SIZES,
    )
    def test_arm64(self, image, vm_size):
        """Test arm64 images."""
        self.run_test()

    @pytest.mark.parametrize(
        "image",
        CUSTOM_IMAGES,
    )
    @pytest.mark.parametrize(
        "vm_size",
        CUSTOM_VM_SIZES,
    )
    def test_custom(self, image, vm_size):
        """Test custom images."""
        self.run_test()