Skip to content

Commit

Permalink
agent versioning test_suite (Azure#2770)
Browse files Browse the repository at this point in the history
* agent versioning test_suite

* address PR comments

* fix pylint warning

* fix update assertion

* fix pylint error
  • Loading branch information
nagworld9 committed Apr 21, 2023
1 parent 09b4bda commit f9b9a7a
Show file tree
Hide file tree
Showing 12 changed files with 446 additions and 3 deletions.
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ class WALAEventOperation:
Update = "Update"
VmSettings = "VmSettings"
VmSettingsSummary = "VmSettingsSummary"
FeatureFlag = "FeatureFlag"


SHOULD_ENCODE_MESSAGE_LEN = 80
Expand Down
28 changes: 27 additions & 1 deletion azurelinuxagent/ga/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@
from azurelinuxagent.common import logger
from azurelinuxagent.common.protocol.imds import get_imds_client
from azurelinuxagent.common.utils import fileutil, textutil
from azurelinuxagent.common.agent_supported_feature import get_supported_feature_by_name, SupportedFeatureNames
from azurelinuxagent.common.agent_supported_feature import get_supported_feature_by_name, SupportedFeatureNames, \
get_agent_supported_features_list_for_crp
from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator
from azurelinuxagent.common.event import add_event, initialize_event_logger_vminfo_common_parameters, \
WALAEventOperation, EVENTS_DIRECTORY
Expand Down Expand Up @@ -149,6 +150,9 @@ def __init__(self):
# VM Size is reported via the heartbeat, default it here.
self._vm_size = None

# Flag is Used to log if GA supports versioning on agent start
self._agent_supports_versioning_logged = False

# these members are used to avoid reporting errors too frequently
self._heartbeat_update_goal_state_error_count = 0
self._update_goal_state_error_count = 0
Expand Down Expand Up @@ -523,6 +527,7 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler, agent_
# status reporting should be done even when the goal state is not updated
agent_update_status = agent_update_handler.get_vmagent_update_status()
self._report_status(exthandlers_handler, agent_update_status)
self._log_agent_supports_versioning_or_not()
return

# check for agent updates
Expand All @@ -547,6 +552,9 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler, agent_
agent_update_status = agent_update_handler.get_vmagent_update_status()
self._report_status(exthandlers_handler, agent_update_status)

# Logging after agent reports supported feature flag so this msg in sync with report status
self._log_agent_supports_versioning_or_not()

if self._processing_new_incarnation():
remote_access_handler.run()

Expand Down Expand Up @@ -610,6 +618,24 @@ def _report_extensions_summary(self, vm_status):
logger.warn(msg)
add_event(op=WALAEventOperation.GoalState, is_success=False, message=msg)

def _log_agent_supports_versioning_or_not(self):
def _log_event(msg):
logger.info(msg)
add_event(AGENT_NAME, op=WALAEventOperation.FeatureFlag, message=msg)
if not self._agent_supports_versioning_logged:
supports_ga_versioning = False
for _, feature in get_agent_supported_features_list_for_crp().items():
if feature.name == SupportedFeatureNames.GAVersioningGovernance:
supports_ga_versioning = True
break
if supports_ga_versioning:
msg = "Agent : {0} supports GA Versioning".format(CURRENT_VERSION)
_log_event(msg)
else:
msg = "Agent : {0} doesn't support GA Versioning".format(CURRENT_VERSION)
_log_event(msg)
self._agent_supports_versioning_logged = True

def _on_initial_goal_state_completed(self, extensions_summary):
fileutil.write_file(self._initial_goal_state_file_path(), ustr(extensions_summary))
if conf.get_extensions_enabled() and self._goal_state_period != conf.get_goal_state_period():
Expand Down
3 changes: 3 additions & 0 deletions tests_e2e/orchestrator/scripts/install-agent
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ unzip.py "$package" "/var/lib/waagent/WALinuxAgent-$version"
if [[ -e /etc/waagent.conf ]]; then
sed -i 's/AutoUpdate.Enabled=n/AutoUpdate.Enabled=y/g' /etc/waagent.conf
fi
# By default GAUpdates flag set to True, so that agent go through update logic to look for new agents.
# But in e2e tests this flag needs to be off in test version 9.9.9.9 to stop the agent updates, so that our scenarios run on 9.9.9.9.
sed -i '$a GAUpdates.Enabled=n' /etc/waagent.conf

#
# Restart the service
Expand Down
5 changes: 5 additions & 0 deletions tests_e2e/test_suites/agent_update.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
name: "AgentUpdate"
tests:
- "agent_update/rsm_update.py"
images: "endorsed"
location: "eastus2euap"
Empty file.
175 changes: 175 additions & 0 deletions tests_e2e/tests/agent_update/rsm_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
#!/usr/bin/env python3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# BVT for the agent update scenario
#
# The test verifies agent update for rsm workflow. This test covers three scenarios downgrade, upgrade and no update.
# For each scenario, we intiaite the rsm request with target version and then verify agent updated to that target version.
#
import json

import requests
from azure.identity import DefaultAzureCredential
from azure.mgmt.compute.models import VirtualMachine

from tests_e2e.tests.lib.agent_test import AgentTest
from tests_e2e.tests.lib.agent_test_context import AgentTestContext
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.retry import retry_if_not_found
from tests_e2e.tests.lib.ssh_client import SshClient
from tests_e2e.tests.lib.virtual_machine import VmMachine


class RsmUpdateBvt(AgentTest):

def __init__(self, context: AgentTestContext):
super().__init__(context)
self._ssh_client = SshClient(
ip_address=self._context.vm_ip_address,
username=self._context.username,
private_key_file=self._context.private_key_file)

def run(self) -> None:
# Allow agent to send supported feature flag
self._verify_agent_reported_supported_feature_flag()

log.info("*******Verifying the Agent Downgrade scenario*******")
self._mock_rsm_update("1.3.0.0")
self._prepare_agent()

# Verify downgrade scenario
self._verify_guest_agent_update("1.3.0.0")

# Verify upgrade scenario
log.info("*******Verifying the Agent Upgrade scenario*******")
self._mock_rsm_update("1.3.1.0")
self._verify_guest_agent_update("1.3.1.0")

# verify no version update. There is bug in CRP and will enable once it's fixed
# log.info("*******Verifying the no version update scenario*******")
# self._prepare_rsm_update("1.3.1.0")
# self._verify_guest_agent_update("1.3.1.0")

def _prepare_agent(self) -> None:
"""
This method is to ensure agent is ready for accepting rsm updates. As part of that we update following flags
1) Changing daemon version since daemon has a hard check on agent version in order to update agent. It doesn't allow versions which are less than daemon version.
2) Updating GAFamily type "Test" and GAUpdates flag to process agent updates on test versions.
"""
local_path = self._context.test_source_directory/"tests"/"scripts"/"agent-python"
remote_path = self._context.remote_working_directory/"agent-python"
self._ssh_client.copy(local_path, remote_path)
local_path = self._context.test_source_directory/"tests"/"scripts"/"agent-service"
remote_path = self._context.remote_working_directory/"agent-service"
self._ssh_client.copy(local_path, remote_path)
local_path = self._context.test_source_directory/"tests"/"scripts"/"agent-update-config"
remote_path = self._context.remote_working_directory/"agent-update-config"
self._ssh_client.copy(local_path, remote_path)
self._ssh_client.run_command(f"sudo {remote_path}")

@staticmethod
def _verify_agent_update_flag_enabled(vm: VmMachine) -> bool:
result: VirtualMachine = vm.get()
flag: bool = result.os_profile.linux_configuration.enable_vm_agent_platform_updates
if flag is None:
return False
return flag

def _enable_agent_update_flag(self, vm: VmMachine) -> None:
osprofile = {
"location": self._context.vm.location, # location is required field
"properties": {
"osProfile": {
"linuxConfiguration": {
"enableVMAgentPlatformUpdates": True
}
}
}
}
vm.create_or_update(osprofile)

def _mock_rsm_update(self, requested_version: str) -> None:
"""
This method is to simulate the rsm request.
First we ensure the PlatformUpdates enabled in the vm and then make a request using rest api
"""
vm: VmMachine = VmMachine(self._context.vm)
if not self._verify_agent_update_flag_enabled(vm):
# enable the flag
self._enable_agent_update_flag(vm)
log.info("Set the enableVMAgentPlatformUpdates flag to True")
else:
log.info("Already enableVMAgentPlatformUpdates flag set to True")

credential = DefaultAzureCredential()
token = credential.get_token("https://management.azure.com/.default")
headers = {'Authorization': 'Bearer ' + token.token, 'Content-Type': 'application/json'}
# Later this api call will be replaced by azure-python-sdk wrapper
# Todo: management endpoints are different for national clouds. we need to change this.
url = "https://management.azure.com/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.Compute/virtualMachines/{2}/" \
"UpgradeVMAgent?api-version=2022-08-01".format(self._context.vm.subscription, self._context.vm.resource_group, self._context.vm.name)
data = {
"target": "Microsoft.OSTCLinuxAgent.Test",
"targetVersion": requested_version
}

response = requests.post(url, data=json.dumps(data), headers=headers)
if response.status_code == 202:
log.info("RSM upgrade request accepted")
else:
raise Exception("Error occurred while RSM upgrade request. Status code : {0} and msg: {1}".format(response.status_code, response.content))

def _verify_guest_agent_update(self, requested_version: str) -> None:
"""
Verify current agent version running on rsm requested version
"""
def _check_agent_version(requested_version: str) -> bool:
stdout: str = self._ssh_client.run_command("sudo waagent --version")
expected_version = f"Goal state agent: {requested_version}"
if expected_version in stdout:
return True
else:
raise Exception("Guest agent didn't update to requested version {0} but found \n {1}. \n "
"To debug verify if CRP has upgrade operation around that time and also check if agent log has any errors ".format(requested_version, stdout))

log.info("Verifying agent updated to requested version")
retry_if_not_found(lambda: _check_agent_version(requested_version))
stdout: str = self._ssh_client.run_command("sudo waagent --version")
log.info(f"Verified agent updated to requested version. Current agent version running:\n {stdout}")

def _verify_agent_reported_supported_feature_flag(self):
"""
RSM update rely on supported flag that agent sends to CRP.So, checking if GA reports feature flag from the agent log
"""
def _check_agent_supports_versioning() -> bool:
found: str = self._ssh_client.run_command("grep -q 'Agent.*supports GA Versioning' /var/log/waagent.log && echo true || echo false").rstrip()
return True if found == "true" else False

log.info("Verifying agent reported supported feature flag")
found: bool = retry_if_not_found(_check_agent_supports_versioning)

if not found:
raise Exception("Agent failed to report supported feature flag, so skipping agent update validations")
else:
log.info("Successfully verified agent reported supported feature flag")


if __name__ == "__main__":
RsmUpdateBvt.run_from_command_line()
20 changes: 20 additions & 0 deletions tests_e2e/tests/lib/retry.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,23 @@ def retry_ssh_run(operation: Callable[[], Any]) -> Any:
raise
log.warning("The operation failed, retrying in 30 secs.\n%s", e)
time.sleep(30)


def retry_if_not_found(operation: Callable[[], bool], attempts: int = 5) -> bool:
"""
This method attempts the given operation retrying a few times
(after a short delay)
Note: Method used for operations which are return True or False
"""
found: bool = False
while attempts > 0 and not found:
attempts -= 1
try:
found = operation()
except Exception:
if attempts == 0:
raise
if not found:
log.info("Current execution didn't find it, retrying in 30 secs.")
time.sleep(30)
return found
7 changes: 7 additions & 0 deletions tests_e2e/tests/lib/ssh_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,10 @@ def _copy(self, source: Path, target: Path, remote_source: bool, remote_target:
command.extend([str(source), str(target)])

shell.run_command(command)

def copy(self, local_path: Path, remote_path: Path):
"""
Copy file from local to remote machine
"""
destination = f"{self._username}@{self._ip_address}:{remote_path}"
shell.run_command(["scp", "-o", "StrictHostKeyChecking=no", "-i", self._private_key_file, local_path, destination])
59 changes: 57 additions & 2 deletions tests_e2e/tests/lib/virtual_machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from azure.core.polling import LROPoller
from azure.identity import DefaultAzureCredential
from azure.mgmt.compute import ComputeManagementClient
from azure.mgmt.compute.models import VirtualMachineExtension, VirtualMachineScaleSetExtension, VirtualMachineInstanceView, VirtualMachineScaleSetInstanceView
from azure.mgmt.compute.models import VirtualMachineExtension, VirtualMachineScaleSetExtension, VirtualMachineInstanceView, VirtualMachineScaleSetInstanceView, VirtualMachine, VirtualMachineScaleSetVM
from azure.mgmt.resource import ResourceManagementClient
from msrestazure.azure_cloud import Cloud

Expand Down Expand Up @@ -91,11 +91,39 @@ def _begin_restart(self) -> LROPoller:
Derived classes must provide the implementation for this method using their corresponding begin_restart() implementation
"""

@abstractmethod
def get(self) -> Any:
"""
Retrieves the information about the virtual machine or scale set
"""

def create_or_update(self, parameters=None, timeout=5 * 60) -> None:
"""
Creates or updates the virtual machine or scale set with custom settings
"""
if parameters is None:
parameters = {}

log.info("Creating/Updating VM for %s", self._identifier)

poller: LROPoller = execute_with_retry(lambda: self._begin_create_or_update(parameters))

poller.wait(timeout=timeout)

if not poller.done():
raise TimeoutError(f"Failed to restart {self._identifier.name} after {timeout} seconds")

@abstractmethod
def _begin_create_or_update(self, parameters) -> Any:
"""
Derived classes must provide the implementation for this method using their corresponding begin_create_or_update() implementation
"""

def __str__(self):
return f"{self._identifier}"


class VirtualMachine(VirtualMachineBaseClass):
class VmMachine(VirtualMachineBaseClass):
def get_instance_view(self) -> VirtualMachineInstanceView:
log.info("Retrieving instance view for %s", self._identifier)
return execute_with_retry(lambda: self._compute_client.virtual_machines.get(
Expand All @@ -110,11 +138,20 @@ def get_extensions(self) -> List[VirtualMachineExtension]:
resource_group_name=self._identifier.resource_group,
vm_name=self._identifier.name))

def get(self) -> VirtualMachine:
log.info("Retrieving vm information for %s", self._identifier)
return execute_with_retry(lambda: self._compute_client.virtual_machines.get(
resource_group_name=self._identifier.resource_group,
vm_name=self._identifier.name))

def _begin_restart(self) -> LROPoller:
return self._compute_client.virtual_machines.begin_restart(
resource_group_name=self._identifier.resource_group,
vm_name=self._identifier.name)

def _begin_create_or_update(self, parameters) -> LROPoller:
return self._compute_client.virtual_machines.begin_create_or_update(self._identifier.resource_group, self._identifier.name, parameters)


class VmScaleSet(VirtualMachineBaseClass):
def get_instance_view(self) -> VirtualMachineScaleSetInstanceView:
Expand Down Expand Up @@ -149,7 +186,25 @@ def get_extensions(self) -> List[VirtualMachineScaleSetExtension]:
resource_group_name=self._identifier.resource_group,
vm_scale_set_name=self._identifier.name))

def get(self) -> List[VirtualMachineScaleSetVM]:
log.info("Retrieving vm information for %s", self._identifier)
vmss_vm_list: List[VirtualMachineScaleSetVM] = []
for vm in execute_with_retry(lambda: self._compute_client.virtual_machine_scale_set_vms.list(self._identifier.resource_group, self._identifier.name)):
try:
vmss_vm: VirtualMachineScaleSetVM = execute_with_retry(self._compute_client.virtual_machine_scale_set_vms.get(
resource_group_name=self._identifier.resource_group, vm_scale_set_name=self._identifier.name, instance_id=vm.instance_id))
vmss_vm_list.append(vmss_vm)

except Exception as e:
log.warning("Unable to retrieve vm information for scale set instance %s. Trying out other instances.\nError: %s", vm, e)

return vmss_vm_list

def _begin_restart(self) -> LROPoller:
return self._compute_client.virtual_machine_scale_sets.begin_restart(
resource_group_name=self._identifier.resource_group,
vm_scale_set_name=self._identifier.name)

def _begin_create_or_update(self, parameters) -> None:
# TODO: Revisit this implementation
return
Loading

0 comments on commit f9b9a7a

Please sign in to comment.