agent versioning test_suite (Azure#2770)

* agent versioning test_suite * address PR comments * fix pylint warning * fix update assertion * fix pylint error
nagworld9 · Apr 21, 2023 · f9b9a7a · f9b9a7a
1 parent 09b4bda
commit f9b9a7a
Show file tree

Hide file tree

Showing 12 changed files with 446 additions and 3 deletions.
diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py
@@ -127,6 +127,7 @@ class WALAEventOperation:
     Update = "Update"
     VmSettings = "VmSettings"
     VmSettingsSummary = "VmSettingsSummary"
+    FeatureFlag = "FeatureFlag"
 
 
 SHOULD_ENCODE_MESSAGE_LEN = 80

diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py
@@ -32,7 +32,8 @@
 from azurelinuxagent.common import logger
 from azurelinuxagent.common.protocol.imds import get_imds_client
 from azurelinuxagent.common.utils import fileutil, textutil
-from azurelinuxagent.common.agent_supported_feature import get_supported_feature_by_name, SupportedFeatureNames
+from azurelinuxagent.common.agent_supported_feature import get_supported_feature_by_name, SupportedFeatureNames, \
+    get_agent_supported_features_list_for_crp
 from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator
 from azurelinuxagent.common.event import add_event, initialize_event_logger_vminfo_common_parameters, \
     WALAEventOperation, EVENTS_DIRECTORY
@@ -149,6 +150,9 @@ def __init__(self):
         # VM Size is reported via the heartbeat, default it here.
         self._vm_size = None
 
+        # Flag is Used to log if GA supports versioning on agent start
+        self._agent_supports_versioning_logged = False
+
         # these members are used to avoid reporting errors too frequently
         self._heartbeat_update_goal_state_error_count = 0
         self._update_goal_state_error_count = 0
@@ -523,6 +527,7 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler, agent_
             # status reporting should be done even when the goal state is not updated
             agent_update_status = agent_update_handler.get_vmagent_update_status()
             self._report_status(exthandlers_handler, agent_update_status)
+            self._log_agent_supports_versioning_or_not()
             return
 
         # check for agent updates
@@ -547,6 +552,9 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler, agent_
             agent_update_status = agent_update_handler.get_vmagent_update_status()
             self._report_status(exthandlers_handler, agent_update_status)
 
+            # Logging after agent reports supported feature flag so this msg in sync with report status
+            self._log_agent_supports_versioning_or_not()
+
             if self._processing_new_incarnation():
                 remote_access_handler.run()
 
@@ -610,6 +618,24 @@ def _report_extensions_summary(self, vm_status):
                 logger.warn(msg)
                 add_event(op=WALAEventOperation.GoalState, is_success=False, message=msg)
 
+    def _log_agent_supports_versioning_or_not(self):
+        def _log_event(msg):
+            logger.info(msg)
+            add_event(AGENT_NAME, op=WALAEventOperation.FeatureFlag, message=msg)
+        if not self._agent_supports_versioning_logged:
+            supports_ga_versioning = False
+            for _, feature in get_agent_supported_features_list_for_crp().items():
+                if feature.name == SupportedFeatureNames.GAVersioningGovernance:
+                    supports_ga_versioning = True
+                    break
+            if supports_ga_versioning:
+                msg = "Agent : {0} supports GA Versioning".format(CURRENT_VERSION)
+                _log_event(msg)
+            else:
+                msg = "Agent : {0} doesn't support GA Versioning".format(CURRENT_VERSION)
+                _log_event(msg)
+            self._agent_supports_versioning_logged = True
+
     def _on_initial_goal_state_completed(self, extensions_summary):
         fileutil.write_file(self._initial_goal_state_file_path(), ustr(extensions_summary))
         if conf.get_extensions_enabled() and self._goal_state_period != conf.get_goal_state_period():

diff --git a/tests_e2e/orchestrator/scripts/install-agent b/tests_e2e/orchestrator/scripts/install-agent
@@ -92,6 +92,9 @@ unzip.py "$package" "/var/lib/waagent/WALinuxAgent-$version"
 if [[ -e /etc/waagent.conf ]]; then
   sed -i 's/AutoUpdate.Enabled=n/AutoUpdate.Enabled=y/g' /etc/waagent.conf
 fi
+# By default GAUpdates flag set to True, so that agent go through update logic to look for new agents.
+# But in e2e tests this flag needs to be off in test version 9.9.9.9 to stop the agent updates, so that our scenarios run on 9.9.9.9.
+sed -i '$a GAUpdates.Enabled=n' /etc/waagent.conf
 
 #
 # Restart the service

diff --git a/tests_e2e/test_suites/agent_update.yml b/tests_e2e/test_suites/agent_update.yml
@@ -0,0 +1,5 @@
+name: "AgentUpdate"
+tests:
+  - "agent_update/rsm_update.py"
+images: "endorsed"
+location: "eastus2euap"
diff --git a/tests_e2e/tests/agent_update/__init__.py b/tests_e2e/tests/agent_update/__init__.py
diff --git a/tests_e2e/tests/agent_update/rsm_update.py b/tests_e2e/tests/agent_update/rsm_update.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# BVT for the agent update scenario
+#
+# The test verifies agent update for rsm workflow. This test covers three scenarios downgrade, upgrade and no update.
+# For each scenario, we intiaite the rsm request with target version and then verify agent updated to that target version.
+#
+import json
+
+import requests
+from azure.identity import DefaultAzureCredential
+from azure.mgmt.compute.models import VirtualMachine
+
+from tests_e2e.tests.lib.agent_test import AgentTest
+from tests_e2e.tests.lib.agent_test_context import AgentTestContext
+from tests_e2e.tests.lib.logging import log
+from tests_e2e.tests.lib.retry import retry_if_not_found
+from tests_e2e.tests.lib.ssh_client import SshClient
+from tests_e2e.tests.lib.virtual_machine import VmMachine
+
+
+class RsmUpdateBvt(AgentTest):
+
+    def __init__(self, context: AgentTestContext):
+        super().__init__(context)
+        self._ssh_client = SshClient(
+            ip_address=self._context.vm_ip_address,
+            username=self._context.username,
+            private_key_file=self._context.private_key_file)
+
+    def run(self) -> None:
+        # Allow agent to send supported feature flag
+        self._verify_agent_reported_supported_feature_flag()
+
+        log.info("*******Verifying the Agent Downgrade scenario*******")
+        self._mock_rsm_update("1.3.0.0")
+        self._prepare_agent()
+
+        # Verify downgrade scenario
+        self._verify_guest_agent_update("1.3.0.0")
+
+        # Verify upgrade scenario
+        log.info("*******Verifying the Agent Upgrade scenario*******")
+        self._mock_rsm_update("1.3.1.0")
+        self._verify_guest_agent_update("1.3.1.0")
+
+        # verify no version update. There is bug in CRP and will enable once it's fixed
+        # log.info("*******Verifying the no version update scenario*******")
+        # self._prepare_rsm_update("1.3.1.0")
+        # self._verify_guest_agent_update("1.3.1.0")
+
+    def _prepare_agent(self) -> None:
+        """
+        This method is to ensure agent is ready for accepting rsm updates. As part of that we update following flags
+        1) Changing daemon version since daemon has a hard check on agent version in order to update agent. It doesn't allow versions which are less than daemon version.
+        2) Updating GAFamily type "Test" and GAUpdates flag to process agent updates on test versions.
+        """
+        local_path = self._context.test_source_directory/"tests"/"scripts"/"agent-python"
+        remote_path = self._context.remote_working_directory/"agent-python"
+        self._ssh_client.copy(local_path, remote_path)
+        local_path = self._context.test_source_directory/"tests"/"scripts"/"agent-service"
+        remote_path = self._context.remote_working_directory/"agent-service"
+        self._ssh_client.copy(local_path, remote_path)
+        local_path = self._context.test_source_directory/"tests"/"scripts"/"agent-update-config"
+        remote_path = self._context.remote_working_directory/"agent-update-config"
+        self._ssh_client.copy(local_path, remote_path)
+        self._ssh_client.run_command(f"sudo {remote_path}")
+
+    @staticmethod
+    def _verify_agent_update_flag_enabled(vm: VmMachine) -> bool:
+        result: VirtualMachine = vm.get()
+        flag: bool = result.os_profile.linux_configuration.enable_vm_agent_platform_updates
+        if flag is None:
+            return False
+        return flag
+
+    def _enable_agent_update_flag(self, vm: VmMachine) -> None:
+        osprofile = {
+            "location": self._context.vm.location,  # location is required field
+            "properties": {
+                "osProfile": {
+                    "linuxConfiguration": {
+                        "enableVMAgentPlatformUpdates": True
+                    }
+                }
+            }
+        }
+        vm.create_or_update(osprofile)
+
+    def _mock_rsm_update(self, requested_version: str) -> None:
+        """
+        This method is to simulate the rsm request.
+        First we ensure the PlatformUpdates enabled in the vm and then make a request using rest api
+        """
+        vm: VmMachine = VmMachine(self._context.vm)
+        if not self._verify_agent_update_flag_enabled(vm):
+            # enable the flag
+            self._enable_agent_update_flag(vm)
+            log.info("Set the enableVMAgentPlatformUpdates flag to True")
+        else:
+            log.info("Already enableVMAgentPlatformUpdates flag set to True")
+
+        credential = DefaultAzureCredential()
+        token = credential.get_token("https://management.azure.com/.default")
+        headers = {'Authorization': 'Bearer ' + token.token, 'Content-Type': 'application/json'}
+        # Later this api call will be replaced by azure-python-sdk wrapper
+        # Todo: management endpoints are different for national clouds. we need to change this.
+        url = "https://management.azure.com/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.Compute/virtualMachines/{2}/" \
+              "UpgradeVMAgent?api-version=2022-08-01".format(self._context.vm.subscription, self._context.vm.resource_group, self._context.vm.name)
+        data = {
+            "target": "Microsoft.OSTCLinuxAgent.Test",
+            "targetVersion": requested_version
+        }
+
+        response = requests.post(url, data=json.dumps(data), headers=headers)
+        if response.status_code == 202:
+            log.info("RSM upgrade request accepted")
+        else:
+            raise Exception("Error occurred while RSM upgrade request. Status code : {0} and msg: {1}".format(response.status_code, response.content))
+
+    def _verify_guest_agent_update(self, requested_version: str) -> None:
+        """
+        Verify current agent version running on rsm requested version
+        """
+        def _check_agent_version(requested_version: str) -> bool:
+            stdout: str = self._ssh_client.run_command("sudo waagent --version")
+            expected_version = f"Goal state agent: {requested_version}"
+            if expected_version in stdout:
+                return True
+            else:
+                raise Exception("Guest agent didn't update to requested version {0} but found \n {1}. \n "
+                                             "To debug verify if CRP has upgrade operation around that time and also check if agent log has any errors ".format(requested_version, stdout))
+
+        log.info("Verifying agent updated to requested version")
+        retry_if_not_found(lambda: _check_agent_version(requested_version))
+        stdout: str = self._ssh_client.run_command("sudo waagent --version")
+        log.info(f"Verified agent updated to requested version. Current agent version running:\n {stdout}")
+
+    def _verify_agent_reported_supported_feature_flag(self):
+        """
+        RSM update rely on supported flag that agent sends to CRP.So, checking if GA reports feature flag from the agent log
+        """
+        def _check_agent_supports_versioning() -> bool:
+            found: str = self._ssh_client.run_command("grep -q 'Agent.*supports GA Versioning' /var/log/waagent.log && echo true || echo false").rstrip()
+            return True if found == "true" else False
+
+        log.info("Verifying agent reported supported feature flag")
+        found: bool = retry_if_not_found(_check_agent_supports_versioning)
+
+        if not found:
+            raise Exception("Agent failed to report supported feature flag, so skipping agent update validations")
+        else:
+            log.info("Successfully verified agent reported supported feature flag")
+
+
+if __name__ == "__main__":
+    RsmUpdateBvt.run_from_command_line()
diff --git a/tests_e2e/tests/lib/retry.py b/tests_e2e/tests/lib/retry.py
@@ -57,3 +57,23 @@ def retry_ssh_run(operation: Callable[[], Any]) -> Any:
                     raise
             log.warning("The operation failed, retrying in 30 secs.\n%s", e)
         time.sleep(30)
+
+
+def retry_if_not_found(operation: Callable[[], bool], attempts: int = 5) -> bool:
+    """
+    This method attempts the given operation retrying a few times
+    (after a short delay)
+    Note: Method used for operations which are return True or False
+    """
+    found: bool = False
+    while attempts > 0 and not found:
+        attempts -= 1
+        try:
+            found = operation()
+        except Exception:
+            if attempts == 0:
+                raise
+        if not found:
+            log.info("Current execution didn't find it, retrying in 30 secs.")
+        time.sleep(30)
+    return found
diff --git a/tests_e2e/tests/lib/ssh_client.py b/tests_e2e/tests/lib/ssh_client.py
@@ -83,3 +83,10 @@ def _copy(self, source: Path, target: Path, remote_source: bool, remote_target:
         command.extend([str(source), str(target)])
 
         shell.run_command(command)
+
+    def copy(self, local_path: Path, remote_path: Path):
+        """
+        Copy file from local to remote machine
+        """
+        destination = f"{self._username}@{self._ip_address}:{remote_path}"
+        shell.run_command(["scp", "-o", "StrictHostKeyChecking=no", "-i", self._private_key_file, local_path, destination])
diff --git a/tests_e2e/tests/lib/virtual_machine.py b/tests_e2e/tests/lib/virtual_machine.py
@@ -26,7 +26,7 @@
 from azure.core.polling import LROPoller
 from azure.identity import DefaultAzureCredential
 from azure.mgmt.compute import ComputeManagementClient
-from azure.mgmt.compute.models import VirtualMachineExtension, VirtualMachineScaleSetExtension, VirtualMachineInstanceView, VirtualMachineScaleSetInstanceView
+from azure.mgmt.compute.models import VirtualMachineExtension, VirtualMachineScaleSetExtension, VirtualMachineInstanceView, VirtualMachineScaleSetInstanceView, VirtualMachine, VirtualMachineScaleSetVM
 from azure.mgmt.resource import ResourceManagementClient
 from msrestazure.azure_cloud import Cloud
 
@@ -91,11 +91,39 @@ def _begin_restart(self) -> LROPoller:
         Derived classes must provide the implementation for this method using their corresponding begin_restart() implementation
         """
 
+    @abstractmethod
+    def get(self) -> Any:
+        """
+        Retrieves the information about the virtual machine or scale set
+        """
+
+    def create_or_update(self, parameters=None, timeout=5 * 60) -> None:
+        """
+        Creates or updates the virtual machine or scale set with custom settings
+        """
+        if parameters is None:
+            parameters = {}
+
+        log.info("Creating/Updating VM for %s", self._identifier)
+
+        poller: LROPoller = execute_with_retry(lambda: self._begin_create_or_update(parameters))
+
+        poller.wait(timeout=timeout)
+
+        if not poller.done():
+            raise TimeoutError(f"Failed to restart {self._identifier.name} after {timeout} seconds")
+
+    @abstractmethod
+    def _begin_create_or_update(self, parameters) -> Any:
+        """
+        Derived classes must provide the implementation for this method using their corresponding begin_create_or_update() implementation
+        """
+
     def __str__(self):
         return f"{self._identifier}"
 
 
-class VirtualMachine(VirtualMachineBaseClass):
+class VmMachine(VirtualMachineBaseClass):
     def get_instance_view(self) -> VirtualMachineInstanceView:
         log.info("Retrieving instance view for %s", self._identifier)
         return execute_with_retry(lambda: self._compute_client.virtual_machines.get(
@@ -110,11 +138,20 @@ def get_extensions(self) -> List[VirtualMachineExtension]:
             resource_group_name=self._identifier.resource_group,
             vm_name=self._identifier.name))
 
+    def get(self) -> VirtualMachine:
+        log.info("Retrieving vm information for %s", self._identifier)
+        return execute_with_retry(lambda: self._compute_client.virtual_machines.get(
+            resource_group_name=self._identifier.resource_group,
+            vm_name=self._identifier.name))
+
     def _begin_restart(self) -> LROPoller:
         return self._compute_client.virtual_machines.begin_restart(
             resource_group_name=self._identifier.resource_group,
             vm_name=self._identifier.name)
 
+    def _begin_create_or_update(self, parameters) -> LROPoller:
+        return self._compute_client.virtual_machines.begin_create_or_update(self._identifier.resource_group, self._identifier.name, parameters)
+
 
 class VmScaleSet(VirtualMachineBaseClass):
     def get_instance_view(self) -> VirtualMachineScaleSetInstanceView:
@@ -149,7 +186,25 @@ def get_extensions(self) -> List[VirtualMachineScaleSetExtension]:
             resource_group_name=self._identifier.resource_group,
             vm_scale_set_name=self._identifier.name))
 
+    def get(self) -> List[VirtualMachineScaleSetVM]:
+        log.info("Retrieving vm information for %s", self._identifier)
+        vmss_vm_list: List[VirtualMachineScaleSetVM] = []
+        for vm in execute_with_retry(lambda: self._compute_client.virtual_machine_scale_set_vms.list(self._identifier.resource_group, self._identifier.name)):
+            try:
+                vmss_vm: VirtualMachineScaleSetVM = execute_with_retry(self._compute_client.virtual_machine_scale_set_vms.get(
+                    resource_group_name=self._identifier.resource_group, vm_scale_set_name=self._identifier.name, instance_id=vm.instance_id))
+                vmss_vm_list.append(vmss_vm)
+
+            except Exception as e:
+                log.warning("Unable to retrieve vm information for scale set instance %s. Trying out other instances.\nError: %s", vm, e)
+
+        return vmss_vm_list
+
     def _begin_restart(self) -> LROPoller:
         return self._compute_client.virtual_machine_scale_sets.begin_restart(
             resource_group_name=self._identifier.resource_group,
             vm_scale_set_name=self._identifier.name)
+
+    def _begin_create_or_update(self, parameters) -> None:
+        # TODO: Revisit this implementation
+        return