From c2cc2c6b8a3421588cf16bc9dc57a926aab48e3b Mon Sep 17 00:00:00 2001 From: mgunnala Date: Fri, 8 Nov 2024 16:22:59 -0500 Subject: [PATCH 01/32] Block disallowed extension processing --- azurelinuxagent/ga/exthandlers.py | 46 ++- azurelinuxagent/ga/policy/policy_engine.py | 21 +- tests/ga/test_extension.py | 144 ++++++++- tests/ga/test_multi_config_extension.py | 64 ++++ tests_e2e/test_suites/ext_policy.yml | 8 + .../ext_policy_with_dependencies.yml | 8 + tests_e2e/tests/ext_policy/ext_policy.py | 211 +++++++++++++ .../ext_policy_with_dependencies.py | 280 ++++++++++++++++++ .../ext_policy/policy_dependencies_cases.py | 243 +++++++++++++++ ...nt_ext_workflow-check_data_in_agent_log.py | 5 +- 10 files changed, 1026 insertions(+), 4 deletions(-) create mode 100644 tests_e2e/test_suites/ext_policy.yml create mode 100644 tests_e2e/test_suites/ext_policy_with_dependencies.yml create mode 100644 tests_e2e/tests/ext_policy/ext_policy.py create mode 100644 tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py create mode 100644 tests_e2e/tests/ext_policy/policy_dependencies_cases.py diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index b0ef37969..18b520357 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -39,6 +39,7 @@ from azurelinuxagent.common.agent_supported_feature import get_agent_supported_features_list_for_extensions, \ SupportedFeatureNames, get_supported_feature_by_name, get_agent_supported_features_list_for_crp from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator +from azurelinuxagent.ga.policy.policy_engine import ExtensionPolicyEngine, ExtensionPolicyError from azurelinuxagent.common.datacontract import get_properties, set_properties from azurelinuxagent.common.errorstate import ErrorState from azurelinuxagent.common.event import add_event, elapsed_milliseconds, WALAEventOperation, \ @@ -482,10 +483,49 @@ def handle_ext_handlers(self, goal_state_id): 
depends_on_err_msg = None extensions_enabled = conf.get_extensions_enabled() + # Instantiate policy engine, and use same engine to handle all extension handlers. + # If an error is thrown during policy engine initialization, we block all extensions and report the error via handler/extension status for + # each extension. + policy_error = None + try: + policy_engine = ExtensionPolicyEngine() + except Exception as ex: + policy_error = ex + for extension, ext_handler in all_extensions: handler_i = ExtHandlerInstance(ext_handler, self.protocol, extension=extension) + # Invoke policy engine to determine if extension is allowed. If not, block extension and report error on + # behalf of the extension. + policy_err_map = { + ExtensionRequestedState.Enabled: ('enable', ExtensionErrorCodes.PluginEnableProcessingFailed), + # TODO: CRP does not currently have a terminal error code for uninstall. Once CRP adds + # an error code for uninstall or for policy, use this code instead of PluginDisableProcessingFailed + # Note that currently, CRP waits for 90 minutes to time out for a failed uninstall operation, instead of + # failing fast. + ExtensionRequestedState.Uninstall: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed), + ExtensionRequestedState.Disabled: ('disable', ExtensionErrorCodes.PluginDisableProcessingFailed), + } + policy_op, policy_err_code = policy_err_map.get(ext_handler.state) + if policy_error is not None: + err = ExtensionPolicyError(msg="", inner=policy_error, code=policy_err_code) + self.__handle_and_report_ext_handler_errors(handler_i, err, + report_op=handler_i.operation, + message=ustr(err), extension=extension, report=True) + continue + + extension_allowed = policy_engine.should_allow_extension(ext_handler.name) + if not extension_allowed: + msg = "failed to {0} extension '{1}' because extension is not specified in allowlist. 
To {0}, " \ + "add extension to the allowed list in the policy file ('{2}').".format(policy_op, + ext_handler.name, + conf.get_policy_file_path()) + err = ExtensionPolicyError(msg, code=policy_err_code) + self.__handle_and_report_ext_handler_errors(handler_i, err, + report_op=handler_i.operation, + message=ustr(err), extension=extension, report=True) + # In case of extensions disabled, we skip processing extensions. But CRP is still waiting for some status # back for the skipped extensions. In order to propagate the status back to CRP, we will report status back # here with an error message. @@ -527,7 +567,11 @@ def handle_ext_handlers(self, goal_state_id): continue # Process extensions and get if it was successfully executed or not - extension_success = self.handle_ext_handler(handler_i, extension, goal_state_id) + # If extension was blocked by policy, treat the extension as failed and do not process the handler. + if not extension_allowed: + extension_success = False + else: + extension_success = self.handle_ext_handler(handler_i, extension, goal_state_id) dep_level = self.__get_dependency_level((extension, ext_handler)) if 0 <= dep_level < max_dep_level: diff --git a/azurelinuxagent/ga/policy/policy_engine.py b/azurelinuxagent/ga/policy/policy_engine.py index 065589f67..4e1983c09 100644 --- a/azurelinuxagent/ga/policy/policy_engine.py +++ b/azurelinuxagent/ga/policy/policy_engine.py @@ -22,7 +22,7 @@ from azurelinuxagent.common import logger from azurelinuxagent.common.event import WALAEventOperation, add_event from azurelinuxagent.common import conf -from azurelinuxagent.common.exception import AgentError +from azurelinuxagent.common.exception import AgentError, ExtensionError from azurelinuxagent.common.protocol.extensions_goal_state_from_vm_settings import _CaseFoldedDict from azurelinuxagent.common.utils.flexible_version import FlexibleVersion @@ -51,6 +51,25 @@ def __init__(self, msg, inner=None): super(InvalidPolicyError, self).__init__(msg, inner) +class 
ExtensionPolicyError(ExtensionError): + """ + Error raised during agent extension policy enforcement. + """ + # TODO: when CRP adds terminal error code for policy-related extension failures, set that as the default code. + def __init__(self, msg, inner=None, code=-1): + msg = "Extension is disallowed by agent policy and will not be processed: {0}".format(msg) + super(ExtensionPolicyError, self).__init__(msg, inner, code) + + +class InvalidPolicyError(AgentError): + """ + Error raised if user-provided policy is invalid. + """ + def __init__(self, msg, inner=None): + msg = "Customer-provided policy file ('{0}') is invalid, please correct the following error: {1}".format(conf.get_policy_file_path(), msg) + super(InvalidPolicyError, self).__init__(msg, inner) + + class _PolicyEngine(object): """ Implements base policy engine API. diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index 851a6d5c2..a4883add1 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -427,7 +427,7 @@ def test_migration_ignores_tree_remove_errors(self, shutil_mock): # pylint: dis class TestExtensionBase(AgentTestCase): def _assert_handler_status(self, report_vm_status, expected_status, expected_ext_count, version, - expected_handler_name="OSTCExtensions.ExampleHandlerLinux", expected_msg=None): + expected_handler_name="OSTCExtensions.ExampleHandlerLinux", expected_msg=None, expected_code=None): self.assertTrue(report_vm_status.called) args, kw = report_vm_status.call_args # pylint: disable=unused-variable vm_status = args[0] @@ -443,6 +443,9 @@ def _assert_handler_status(self, report_vm_status, expected_status, if expected_msg is not None: self.assertIn(expected_msg, handler_status.message) + if expected_code is not None: + self.assertEqual(expected_code, handler_status.code) + # Deprecated. 
New tests should be added to the TestExtension class @patch('time.sleep', side_effect=lambda _: mock_sleep(0.001)) @@ -3507,5 +3510,144 @@ def test_report_msg_if_handler_manifest_contains_invalid_values(self): self.assertIn("'supportsMultipleExtensions' has a non-boolean value", kw_messages[2]['message']) +class TestExtensionPolicy(TestExtensionBase): + def setUp(self): + AgentTestCase.setUp(self) + self.policy_path = os.path.join(self.tmp_dir, "waagent_policy.json") + + # Patch attributes to enable policy feature + self.patch_policy_path = patch('azurelinuxagent.common.conf.get_policy_file_path', + return_value=str(self.policy_path)) + self.patch_policy_path.start() + self.patch_conf_flag = patch('azurelinuxagent.ga.policy.policy_engine.conf.get_extension_policy_enabled', + return_value=True) + self.patch_conf_flag.start() + self.maxDiff = None # When long error messages don't match, display the entire diff. + + def tearDown(self): + patch.stopall() + AgentTestCase.tearDown(self) + + def _create_policy_file(self, policy): + with open(self.policy_path, mode='w') as policy_file: + if isinstance(policy, dict): + json.dump(policy, policy_file, indent=4) + else: + policy_file.write(policy) + policy_file.flush() + + def _test_policy_failure(self, policy, op, expected_status_code, expected_handler_status, + expected_status_msg=None): + + with mock_wire_protocol(wire_protocol_data.DATA_FILE) as protocol: + if op == ExtensionRequestedState.Uninstall: + protocol.mock_wire_data.set_incarnation(2) + protocol.mock_wire_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) + protocol.client.update_goal_state() + protocol.aggregate_status = None + protocol.report_vm_status = MagicMock() + exthandlers_handler = get_exthandlers_handler(protocol) + + self._create_policy_file(policy) + exthandlers_handler.run() + exthandlers_handler.report_ext_handlers_status() + + report_vm_status = protocol.report_vm_status + self.assertTrue(report_vm_status.called) + 
self._assert_handler_status(report_vm_status, expected_handler_status, 0, "1.0.0", 'OSTCExtensions.ExampleHandlerLinux', + expected_msg=expected_status_msg, expected_code=expected_status_code) + + def test_should_fail_enable_if_extension_disallowed(self): + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + } + } + expected_msg = "failed to enable extension 'OSTCExtensions.ExampleHandlerLinux' because extension is not specified in allowlist." + self._test_policy_failure(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, + expected_handler_status='NotReady', expected_status_msg=expected_msg) + + def test_should_fail_enable_for_invalid_policy(self): + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": "False" + } + } + expected_msg = "attribute 'extensionPolicies.allowListedExtensionsOnly'; must be 'boolean'" + self._test_policy_failure(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, + expected_handler_status='NotReady', expected_status_msg=expected_msg) + + def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self): + policy = \ + { + "policyVersion": "0.1.0" + } + with patch('azurelinuxagent.ga.policy.policy_engine.ExtensionPolicyEngine.__init__', + side_effect=Exception("mock exception")): + expected_msg = "Extension is disallowed by agent policy and will not be processed: \nInner error: mock exception" + self._test_policy_failure(policy=policy, op=ExtensionRequestedState.Enabled, + expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, + expected_handler_status='NotReady', expected_status_msg=expected_msg) + + def test_should_fail_uninstall_if_extension_disallowed(self): + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + 
"signatureRequired": False, + "extensions": {} + }, + } + expected_msg = "failed to uninstall extension 'OSTCExtensions.ExampleHandlerLinux' because extension is not specified in allowlist." + self._test_policy_failure(policy=policy, op=ExtensionRequestedState.Uninstall, expected_status_code=ExtensionErrorCodes.PluginDisableProcessingFailed, + expected_handler_status='NotReady', expected_status_msg=expected_msg) + + def test_should_fail_enable_if_dependent_extension_disallowed(self): + self._create_policy_file({ + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + "OSTCExtensions.ExampleHandlerLinux": {} + } + } + }) + with mock_wire_protocol(wire_protocol_data.DATA_FILE_EXT_SEQUENCING) as protocol: + protocol.aggregate_status = None + protocol.report_vm_status = MagicMock() + exthandlers_handler = get_exthandlers_handler(protocol) + dep_ext_level_2 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux") + dep_ext_level_1 = extension_emulator(name="OSTCExtensions.OtherExampleHandlerLinux") + + exthandlers_handler.run() + exthandlers_handler.report_ext_handlers_status() + + # OtherExampleHandlerLinux should be disallowed by policy, ExampleHandlerLinux should be skipped because + # dependent extension failed + self._assert_handler_status(protocol.report_vm_status, "NotReady", 0, "1.0.0", + expected_handler_name="OSTCExtensions.OtherExampleHandlerLinux", + expected_msg=("failed to enable extension 'OSTCExtensions.OtherExampleHandlerLinux' " + "because extension is not specified in allowlist.")) + + self._assert_handler_status(protocol.report_vm_status, "NotReady", 0, "1.0.0", + expected_handler_name="OSTCExtensions.ExampleHandlerLinux", + expected_msg="Skipping processing of extensions since execution of dependent " + "extension OSTCExtensions.OtherExampleHandlerLinux failed") + + # check handler list and dependency levels + self.assertTrue(exthandlers_handler.ext_handlers is not None) + 
self.assertTrue(exthandlers_handler.ext_handlers is not None) + self.assertEqual(len(exthandlers_handler.ext_handlers), 2) + self.assertEqual(1, next(handler for handler in exthandlers_handler.ext_handlers if + handler.name == dep_ext_level_1.name).settings[0].dependencyLevel) + self.assertEqual(2, next(handler for handler in exthandlers_handler.ext_handlers if + handler.name == dep_ext_level_2.name).settings[0].dependencyLevel) + + if __name__ == '__main__': unittest.main() diff --git a/tests/ga/test_multi_config_extension.py b/tests/ga/test_multi_config_extension.py index 482126be3..66f4c3ca9 100644 --- a/tests/ga/test_multi_config_extension.py +++ b/tests/ga/test_multi_config_extension.py @@ -630,6 +630,70 @@ def test_it_should_handle_and_report_enable_errors_properly(self): } self._assert_extension_status(sc_handler, expected_extensions) + def test_it_should_handle_and_report_disallowed_extensions_properly(self): + """If multiconfig extension is disallowed by policy, all instances should be blocked.""" + policy_path = os.path.join(self.tmp_dir, "waagent_policy.json") + patch('azurelinuxagent.common.conf.get_policy_file_path', return_value=str(policy_path)).start() + patch('azurelinuxagent.ga.policy.policy_engine.conf.get_extension_policy_enabled', + return_value=True).start() + policy = \ + { + "policyVersion": "0.0.1", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "signatureRequired": True, + "extensions": { + "Microsoft.Powershell.ExampleExtension": {} + } + } + } + with open(policy_path, mode='w') as policy_file: + json.dump(policy, policy_file, indent=4) + policy_file.flush() + self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, + "ext_conf_multi_config_no_dependencies.xml") + with self._setup_test_env() as (exthandlers_handler, protocol, no_of_extensions): + disallowed_mc_1 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux.firstExtension", + supports_multiple_extensions=True) + disallowed_mc_2 = 
extension_emulator(name="OSTCExtensions.ExampleHandlerLinux.secondExtension", + supports_multiple_extensions=True) + disallowed_mc_3 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux.thirdExtension", + supports_multiple_extensions=True) + allowed_ext = extension_emulator(name="Microsoft.Powershell.ExampleExtension") + with enable_invocations(disallowed_mc_1, disallowed_mc_2, disallowed_mc_3, + allowed_ext) as invocation_record: + exthandlers_handler.run() + exthandlers_handler.report_ext_handlers_status() + self.assertEqual(no_of_extensions, + len(protocol.aggregate_status['aggregateStatus']['handlerAggregateStatus']), + "incorrect extensions reported") + + # We should only enable the allowed extension, no instances of the multiconfig extension should be enabled + invocation_record.compare( + (allowed_ext, ExtensionCommandNames.INSTALL), + (allowed_ext, ExtensionCommandNames.ENABLE) + ) + + mc_handlers = self._assert_and_get_handler_status(aggregate_status=protocol.aggregate_status, + handler_name="OSTCExtensions.ExampleHandlerLinux", + expected_count=3, status="NotReady") + msg = "failed to enable extension 'OSTCExtensions.ExampleHandlerLinux' because extension is not specified in allowlist" + expected_extensions = { + "firstExtension": {"status": ExtensionStatusValue.error, "seq_no": 1, "message": msg}, + "secondExtension": {"status": ExtensionStatusValue.error, "seq_no": 2, "message": msg}, + "thirdExtension": {"status": ExtensionStatusValue.error, "seq_no": 3, "message": msg}, + } + self._assert_extension_status(mc_handlers, expected_extensions, multi_config=True) + + sc_handler = self._assert_and_get_handler_status(aggregate_status=protocol.aggregate_status, + handler_name="Microsoft.Powershell.ExampleExtension", + status="Ready", message=None) + expected_extensions = { + "Microsoft.Powershell.ExampleExtension": {"status": ExtensionStatusValue.success, "seq_no": 9, + "message": None} + } + self._assert_extension_status(sc_handler, 
expected_extensions) + def test_it_should_cleanup_extension_state_on_disable(self): def __assert_state_file(handler_name, handler_version, extensions, state, not_present=None): diff --git a/tests_e2e/test_suites/ext_policy.yml b/tests_e2e/test_suites/ext_policy.yml new file mode 100644 index 000000000..bdde59a02 --- /dev/null +++ b/tests_e2e/test_suites/ext_policy.yml @@ -0,0 +1,8 @@ +# +# The test suite verifies that disallowed extensions are not processed, but the agent should still report status. +# +name: "ExtensionPolicy" +tests: + - "ext_policy/ext_policy.py" +images: "random(endorsed)" +owns_vm: true \ No newline at end of file diff --git a/tests_e2e/test_suites/ext_policy_with_dependencies.yml b/tests_e2e/test_suites/ext_policy_with_dependencies.yml new file mode 100644 index 000000000..3a460c531 --- /dev/null +++ b/tests_e2e/test_suites/ext_policy_with_dependencies.yml @@ -0,0 +1,8 @@ +# +# The test suite verifies that disallowed extensions are not processed, but the agent should still report status. +# +name: "ExtPolicyWithDependencies" +tests: + - "ext_policy/ext_policy_with_dependencies.py" +images: "random(endorsed)" +executes_on_scale_set: true \ No newline at end of file diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py new file mode 100644 index 000000000..2d1c50917 --- /dev/null +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +import json +import uuid +from typing import List, Dict, Any +from assertpy import assert_that, fail + +from tests_e2e.tests.lib.agent_test import AgentVmTest +from tests_e2e.tests.lib.vm_extension_identifier import VmExtensionIds +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.ssh_client import SshClient +from tests_e2e.tests.lib.virtual_machine_extension_client import VirtualMachineExtensionClient +from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext + + +class ExtPolicy(AgentVmTest): + class TestCase: + def __init__(self, extension: VirtualMachineExtensionClient, settings: Any): + self.extension = extension + self.settings = settings + + def __init__(self, context: AgentVmTestContext): + super().__init__(context) + self._ssh_client: SshClient = self._context.create_ssh_client() + + def _create_policy_file(self, policy): + """ + Create policy json file and copy to /etc/waagent_policy.json on test machine. + """ + with open("waagent_policy.json", mode='w') as policy_file: + json.dump(policy, policy_file, indent=4) + policy_file.flush() + + remote_path = "/tmp/waagent_policy.json" + local_path = policy_file.name + self._ssh_client.copy_to_node(local_path=local_path, remote_path=remote_path) + policy_file_final_dest = "/etc/waagent_policy.json" + log.info("Copying policy file to test VM [%s]", self._context.vm.name) + self._ssh_client.run_command(f"mv {remote_path} {policy_file_final_dest}", use_sudo=True) + + def _operation_should_succeed(self, operation, extension_case): + log.info(f"Attempting to {operation} {extension_case.extension.__str__()}, expected to succeed ") + # Attempt operation. If enabling, assert that the extension is present in instance view. + # If deleting, assert that the extension is not present in instance view. 
+ try: + if operation == "enable": + extension_case.extension.enable(settings=extension_case.settings, force_update=True, timeout=15 * 60) + extension_case.extension.assert_instance_view() + elif operation == "delete": + extension_case.extension.delete(timeout=15 * 60) + instance_view_extensions = self._context.vm.get_instance_view().extensions + if instance_view_extensions is not None and any( + e.name == extension_case.extension._resource_name for e in instance_view_extensions): + raise Exception( + "extension {0} still in instance view after attempting to delete".format(extension_case.extension._resource_name)) + log.info(f"Operation '{operation}' for {extension_case.extension.__str__()} succeeded as expected.") + except Exception as error: + fail( + f"Unexpected error while trying to {operation} {extension_case.extension.__str__()}. " + f"Extension is allowed by policy so this operation should have completed successfully.\n" + f"Error: {error}") + + @staticmethod + def _operation_should_fail(operation, extension_case): + log.info(f"Attempting to {operation} {extension_case.extension.__str__()}, should fail fast.") + try: + timeout = (6 * 60) # Fail fast. 
+ if operation == "enable": + extension_case.extension.enable(settings=extension_case.settings, force_update=True, timeout=timeout) + elif operation == "delete": + extension_case.extension.delete(timeout=timeout) + fail(f"The agent should have reported an error trying to {operation} {extension_case.extension.__str__()} " + f"because the extension is disallowed by policy.") + except Exception as error: + assert_that("Extension is disallowed by agent policy and will not be processed" in str(error)) \ + .described_as( + f"Error message should communicate that extension is disallowed by policy, but actual error " + f"was: {error}").is_true() + log.info(f"{extension_case.extension.__str__()} {operation} failed as expected") + + def run(self): + + # Prepare extensions to test + unique = str(uuid.uuid4()) + test_file = f"waagent-test.{unique}" + custom_script = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, + resource_name="CustomScript"), + {'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"} + ) + run_command = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, + resource_name="RunCommandHandler"), + {'source': {'script': f"echo '{unique}' > /tmp/{test_file}"}} + ) + azure_monitor = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, + resource_name="AzureMonitorLinuxAgent"), + None + ) + unique2 = str(uuid.uuid4()) + test_file2 = f"waagent-test.{unique2}" + run_command_2 = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, + resource_name="RunCommandHandler2"), + {'source': {'script': f"echo '{unique2}' > /tmp/{test_file2}"}} + ) + + # Enable policy via conf + log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) + self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) + + # Test 
case 1: should only enable allowlisted extensions + # CustomScript should be enabled, RunCommand and AzureMonitor should fail. + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "signatureRequired": False, + "extensions": { + "Microsoft.Azure.Extensions.CustomScript": {} + } + } + } + self._create_policy_file(policy) + self._operation_should_succeed("enable", custom_script) + self._operation_should_fail("enable", run_command) + if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): + self._operation_should_fail("enable", azure_monitor) + + # Test case 2: turn allowlist off + # RunCommand should be successfully enabled and then deleted. + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": False, + "signatureRequired": False, + "extensions": {} + } + } + self._create_policy_file(policy) + self._operation_should_succeed("enable", run_command) + self._operation_should_succeed("delete", run_command) + if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): + self._operation_should_succeed("enable", azure_monitor) + self._operation_should_succeed("delete", azure_monitor) + + # Test case 3: uninstall should fail when disallowed + # Remove CustomScript from allowlist and try to uninstall, should fail. + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "signatureRequired": False, + "extensions": {} + } + } + self._create_policy_file(policy) + + # Known CRP issue - delete/uninstall operation times out instead of reporting an error. + # TODO: uncomment this test case after issue is resolved + # self._operation_should_fail("delete", custom_script) + + # Test case 4: both instances in a multiconfig extension should fail, if disallowed. 
+ # Disallow RunCommand and try to install two instances, both should fail fast. + self._operation_should_fail("enable", run_command) + self._operation_should_fail("enable", run_command_2) + + def get_ignore_error_rules(self) -> List[Dict[str, Any]]: + ignore_rules = [ + # + # 2024-10-23T17:50:38.107793Z WARNING ExtHandler ExtHandler Dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed or timed out, will skip processing the rest of the extensions + # We intentionally block extensions with policy and expect any dependent extensions to be skipped + { + 'message': r"Dependent extension .* failed or timed out, will skip processing the rest of the extensions" + }, + # 2024-10-23T18:01:32.247341Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=ExtensionProcessing, message=Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed, duration=0 + # We intentionally block extensions with policy and expect any dependent extensions to be skipped + { + 'message': r"Skipping processing of extensions since execution of dependent extension .* failed" + }, + # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. 
To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 + # We intentionally block extensions with policy and expect this failure message + { + 'message': r"Extension is disallowed by agent policy and will not be processed" + } + ] + return ignore_rules + + +if __name__ == "__main__": + ExtPolicy.run_from_command_line() diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py new file mode 100644 index 000000000..ae8cf1e96 --- /dev/null +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# This test adds extensions with multiple dependencies to a VMSS using the 'provisionAfterExtensions' property and +# validates they are enabled in order of dependencies. 
+# +import copy +import json +import random +import re +import uuid +from datetime import datetime +from typing import List, Dict, Any + +from assertpy import fail +from tests_e2e.tests.lib.agent_test import AgentVmssTest, TestSkipped +from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext +from tests_e2e.tests.lib.virtual_machine_scale_set_client import VmssInstanceIpAddress +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.resource_group_client import ResourceGroupClient +from tests_e2e.tests.lib.ssh_client import SshClient +from tests_e2e.tests.lib.vm_extension_identifier import VmExtensionIds +from tests_e2e.tests.ext_policy.policy_dependencies_cases import _should_fail_single_config_depends_on_disallowed_no_config, \ + _should_fail_single_config_depends_on_disallowed_single_config, \ + _should_succeed_single_config_depends_on_no_config, \ + _should_succeed_single_config_depends_on_single_config + # _should_fail_single_config_depends_on_disallowed_multi_config, + # _should_fail_multi_config_depends_on_disallowed_single_config, + # _should_fail_multi_config_depends_on_disallowed_no_config, + +class ExtPolicyWithDependencies(AgentVmssTest): + def __init__(self, context: AgentVmTestContext): + super().__init__(context) + self._scenario_start = datetime.min + + # Cases to test different dependency scenarios + _test_cases = [ + _should_fail_single_config_depends_on_disallowed_no_config, + _should_fail_single_config_depends_on_disallowed_single_config, + # TODO: RunCommand is unable to be installed properly, so these tests are currently disabled. Investigate the + # issue and enable these 3 tests. 
+ # _should_fail_single_config_depends_on_disallowed_multi_config, + # _should_fail_multi_config_depends_on_disallowed_single_config, + # _should_fail_multi_config_depends_on_disallowed_no_config, + _should_succeed_single_config_depends_on_no_config, + _should_succeed_single_config_depends_on_single_config + ] + + @staticmethod + def _create_policy_file(ssh_client, policy): + with open("waagent_policy.json", mode='w') as policy_file: + json.dump(policy, policy_file, indent=4) + policy_file.flush() + + remote_path = "/tmp/waagent_policy.json" + local_path = policy_file.name + ssh_client.copy_to_node(local_path=local_path, remote_path=remote_path) + policy_file_final_dest = "/etc/waagent_policy.json" + ssh_client.run_command(f"mv {remote_path} {policy_file_final_dest}", use_sudo=True) + + def run(self): + instances_ip_address: List[VmssInstanceIpAddress] = self._context.vmss.get_instances_ip_address() + ssh_clients: Dict[str, SshClient] = {} + for instance in instances_ip_address: + ssh_clients[instance.instance_name] = SshClient(ip_address=instance.ip_address, username=self._context.username, identity_file=self._context.identity_file) + + for ssh_client in ssh_clients.values(): + ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) + + if not VmExtensionIds.AzureMonitorLinuxAgent.supports_distro(next(iter(ssh_clients.values())).run_command("get_distro.py").rstrip()): + raise TestSkipped("Currently AzureMonitorLinuxAgent is not supported on this distro") + + # This is the base ARM template that's used for deploying extensions for this scenario + base_extension_template = { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json", + "contentVersion": "1.0.0.0", + "resources": [ + { + "type": "Microsoft.Compute/virtualMachineScaleSets", + "name": f"{self._context.vmss.name}", + "location": "[resourceGroup().location]", + "apiVersion": "2018-06-01", + "properties": { + "virtualMachineProfile": { 
+ "extensionProfile": { + "extensions": [] + } + } + } + } + ] + } + + for case in self._test_cases: + log.info("") + log.info("Test case: {0}".format(case.__name__.replace('_', ' '))) + test_case_start = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip() + if self._scenario_start == datetime.min: + self._scenario_start = test_case_start + log.info("Test case start time: {0}".format(test_case_start)) + + # Assign unique guid to forceUpdateTag for each extension to make sure they're always unique to force CRP + # to generate a new sequence number each time + test_guid = str(uuid.uuid4()) + policy, extensions, expected_errors, deletion_order = case() + + for ext in extensions: + ext["properties"].update({ + "forceUpdateTag": test_guid + }) + + # We update the extension template here with extensions that are specific to the scenario that we want to + # test out + ext_template = copy.deepcopy(base_extension_template) + ext_template['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][ + 'extensions'] = extensions + + # Log the dependencies for the extensions in this test case + for ext in extensions: + provisioned_after = ext['properties'].get('provisionAfterExtensions') + depends_on = provisioned_after if provisioned_after else [] + dependency_list = "-" if not depends_on else ' and '.join(depends_on) + log.info("{0} depends on {1}".format(ext['name'], dependency_list)) + + # Copy policy file to each instance + log.info("Updating policy file with new policy: {0}".format(policy)) + for instance_name, ssh_client in ssh_clients.items(): + self._create_policy_file(ssh_client, policy) + + # Deploy updated extension template to the scale set. 
+ log.info("Deploying extensions to the scale set...") + rg_client = ResourceGroupClient(self._context.vmss.cloud, self._context.vmss.subscription, + self._context.vmss.resource_group, self._context.vmss.location) + + try: + rg_client.deploy_template(template=ext_template) + if expected_errors is not None and len(expected_errors) != 0: + fail("Extension deployment was expected to fail with the following errors: {0}".format(expected_errors)) + log.info("Extension deployment succeeded as expected") + log.info("") + except Exception as e: + if expected_errors is None or len(expected_errors) == 0: + fail("Extension template deployment unexpectedly failed: {0}".format(e)) + else: + deployment_failure_pattern = r"[\s\S]*\"code\":\s*\"ResourceDeploymentFailure\"[\s\S]*\"details\":\s*\[\s*(?P<error>[\s\S]*)\]" + deployment_failure_match = re.match(deployment_failure_pattern, str(e)) + try: + if deployment_failure_match is None: + raise Exception("Unable to match a ResourceDeploymentFailure") + error_json = json.loads(deployment_failure_match.group("error")) + error_message = error_json['message'] + except Exception as parse_exc: + fail("Extension template deployment failed as expected, but there was an error in parsing the failure. Parsing failure: {0}\nDeployment Failure: {1}".format(parse_exc, e)) + + for phrase in expected_errors: + if phrase not in error_message: + fail("Extension template deployment failed as expected, but with an unexpected error. Error expected to contain message '{0}'. Actual error: {1}".format(phrase, e)) + log.info("Extensions failed as expected") + log.info("") + + # After each test, clean up failed extensions to leave VMSS in a good state for the next test. + # If there are leftover failed extensions, CRP will attempt to uninstall them in the next test, but they + # will be disallowed by policy. Since CRP waits for a 90 minute timeout for uninstall, the operation will + # timeout and fail, and subsequently, the whole test case will fail. 
+ # To clean up, we first update the policy to allow all, then remove the extensions. + log.info("Starting cleanup for test case...") + for ssh_client in ssh_clients.values(): + allow_all_policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": False + } + } + self._create_policy_file(ssh_client, allow_all_policy) + + for ext_to_delete in deletion_order: + ext_name_to_delete = ext_to_delete.type + try: + self._context.vmss.delete_extension(ext_name_to_delete) + except Exception as crp_err: + # Known issue - CRP returns stale status in cases of dependency failures. Even if the deletion succeeds, + # CRP may return a failure here. We swallow the error, and instead, check that the logs for uninstall + # are present in the agent log (after the start time of this test case). + log.info("CRP returned an error for deletion operation, may be a false error. Checking agent log to determine if operation succeeded. Exception: {0}".format(crp_err)) + try: + for ssh_client in ssh_clients.values(): + msg = ("Remove the extension slice: {0}".format(str(ext_to_delete))) + result = ssh_client.run_command(f"agent_ext_workflow-check_data_in_agent_log.py --data '{msg}' --after-timestamp '{test_case_start}'", use_sudo=True) + log.info(result) + except Exception as agent_err: + fail("Unable to successfully uninstall extension {0}. 
Exception: {1}".format(ext_name_to_delete, agent_err)) + log.info("Successfully uninstalled extension {0}".format(ext_name_to_delete)) + + log.info("Successfully removed all extensions from VMSS") + log.info("---------------------------------------------") + + def get_ignore_errors_before_timestamp(self) -> datetime: + # Ignore errors in the agent log before the first test case starts + if self._scenario_start == datetime.min: + return self._scenario_start + return datetime.strptime(self._scenario_start, u'%Y-%m-%d %H:%M:%S') + + def get_ignore_error_rules(self) -> List[Dict[str, Any]]: + ignore_rules = [ + # + # WARNING ExtHandler ExtHandler Missing dependsOnExtension on extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent + # This message appears when an extension doesn't depend on another extension + # + { + 'message': r"Missing dependsOnExtension on extension .*" + }, + # + # WARNING ExtHandler ExtHandler Extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent does not have any settings. Will ignore dependency (dependency level: 1) + # We currently ignore dependencies for extensions without settings + # + { + 'message': r"Extension .* does not have any settings\. Will ignore dependency \(dependency level: \d\)" + }, + # + # 2023-10-31T17:46:59.675959Z WARNING ExtHandler ExtHandler Dependent extension Microsoft.Azure.Extensions.CustomScript failed or timed out, will skip processing the rest of the extensions + # We intentionally disallow some extensions to test that dependent are skipped. We assert the specific expected failure message in the test. + # + { + 'message': r"Dependent extension .* failed or timed out, will skip processing the rest of the extensions" + }, + # + # 2023-10-31T17:48:13.349214Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Extensions.CustomScript, op=ExtensionProcessing, message=Dependent Extension Microsoft.Azure.Extensions.CustomScript did not succeed. 
Status was error, duration=0 + # We intentionally fail to test that dependent extensions are skipped + # + { + 'message': r"Event: name=Microsoft.Azure.Extensions.CustomScript, op=ExtensionProcessing, message=Dependent Extension .* did not succeed. Status was error, duration=0" + }, + # + # 2023-10-31T17:47:07.689083Z WARNING ExtHandler ExtHandler [PERIODIC] This status is being reported by the Guest Agent since no status file was reported by extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent: [ExtensionStatusError] Status file /var/lib/waagent/Microsoft.Azure.Monitor.AzureMonitorLinuxAgent-1.28.11/status/6.status does not exist + # We expect extensions that are dependent on a failing extension to not report status + # + { + 'message': r"\[PERIODIC\] This status is being reported by the Guest Agent since no status file was reported by extension .*: \[ExtensionStatusError\] Status file \/var\/lib\/waagent\/.*\/status\/\d.status does not exist" + }, + # + # 2023-10-31T17:48:11.306835Z WARNING ExtHandler ExtHandler A new goal state was received, but not all the extensions in the previous goal state have completed: [('Microsoft.Azure.Extensions.CustomScript', 'error'), ('Microsoft.Azure.Monitor.AzureMonitorLinuxAgent', 'transitioning'), ('Microsoft.CPlat.Core.RunCommandLinux', 'success')] + # This message appears when the previous test scenario had failing extensions due to extension dependencies + # + { + 'message': r"A new goal state was received, but not all the extensions in the previous goal state have completed: \[(\(u?'.*', u?'(error|transitioning|success)'\),?)+\]" + }, + # 2024-10-23T18:01:32.247341Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=ExtensionProcessing, message=Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed, duration=0 + # We intentionally block extensions with policy and expect any dependent extensions to be skipped + 
{ + 'message': r"Skipping processing of extensions since execution of dependent extension .* failed" + }, + # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 + # We intentionally block extensions with policy and expect this failure message + { + 'message': r"Extension is disallowed by agent policy and will not be processed" + } + ] + return ignore_rules + + +if __name__ == "__main__": + ExtPolicyWithDependencies.run_from_command_line() \ No newline at end of file diff --git a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py new file mode 100644 index 000000000..1abc3234a --- /dev/null +++ b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py @@ -0,0 +1,243 @@ + + +from tests_e2e.tests.lib.vm_extension_identifier import VmExtensionIds, VmExtensionIdentifier +import uuid + + +def __get_extension_template(extension_id: VmExtensionIdentifier, depends_on=None): + template = { + "name": extension_id.type, + "properties": { + "provisionAfterExtensions": depends_on, + "publisher": extension_id.publisher, + "type": extension_id.type, + "typeHandlerVersion": extension_id.version, + "autoUpgradeMinorVersion": True + } + } + + if depends_on is not None and len(depends_on) > 0: + template["properties"]["provisionAfterExtensions"] = depends_on + + # Update template properties for each extension type + if extension_id == VmExtensionIds.AzureMonitorLinuxAgent: + # For compliance with S360, enable automatic upgrade for AzureMonitorLinuxAgent + template["properties"]["enableAutomaticUpgrade"] = True + elif 
extension_id == VmExtensionIds.CustomScript: + template["properties"]["settings"] = {"commandToExecute": "date"} + elif extension_id == VmExtensionIds.RunCommandHandler: + # Each time, we generate a RunCommand template with different settings + unique = str(uuid.uuid4()) + test_file = f"waagent-test.{unique}" + unique_command = f"echo '{unique}' > /tmp/{test_file}" + template["properties"]["settings"] = {"commandToExecute": unique_command} + elif extension_id == VmExtensionIds.VmAccess: + template["properties"]["settings"] = {} + template["properties"]["protectedSettings"] = {"username": "testuser"} + else: + raise ValueError("invalid value '{0}' for 'extension_id'".format(extension_id)) + + return template + + +def _should_fail_single_config_depends_on_disallowed_single_config(): + template = [ + __get_extension_template(VmExtensionIds.VmAccess), + __get_extension_template(VmExtensionIds.CustomScript, depends_on=["VMAccessForLinux"]) + ] + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + "Microsoft.Azure.Extensions.CustomScript": {}, + } + } + } + expected_errors = [ + "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", + "'CustomScript' is marked as failed since it depends upon the VM Extension 'VMAccessForLinux' which has failed" + ] + deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.VmAccess] + return policy, template, expected_errors, deletion_order + + +def _should_fail_single_config_depends_on_disallowed_no_config(): + template = [ + __get_extension_template(VmExtensionIds.AzureMonitorLinuxAgent), + __get_extension_template(VmExtensionIds.CustomScript, depends_on=["AzureMonitorLinuxAgent"]) + + ] + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + 
"Microsoft.Azure.Extensions.CustomScript": {} + } + } + } + expected_errors = [ + "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", + "'CustomScript' is marked as failed since it depends upon the VM Extension 'AzureMonitorLinuxAgent' which has failed" + ] + deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.AzureMonitorLinuxAgent] + return policy, template, expected_errors, deletion_order + + +def _should_fail_single_config_depends_on_disallowed_multi_config(): + template = [ + __get_extension_template(VmExtensionIds.RunCommandHandler), + __get_extension_template(VmExtensionIds.CustomScript, depends_on=["RunCommandHandlerLinux"]) + ] + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + "Microsoft.Azure.Extensions.CustomScript": {} + } + } + } + expected_errors = [ + "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.CPlat.Core.RunCommandHandlerLinux' because extension is not specified in allowlist", + "'CustomScript' is marked as failed since it depends upon the VM Extension 'RunCommandHandlerLinux' which has failed" + ] + deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.RunCommandHandler] + return policy, template, expected_errors, deletion_order + + +def _should_fail_multi_config_depends_on_disallowed_single_config(): + template = [ + __get_extension_template(VmExtensionIds.CustomScript), + __get_extension_template(VmExtensionIds.RunCommandHandler, depends_on=["CustomScript"]) + ] + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + "Microsoft.CPlat.Core.RunCommandHandlerLinux": {} + } + } + } + expected_errors = [ + "Extension is disallowed by agent policy and will not be processed: failed to 
enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist", + "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Extensions.CustomScript failed'." + ] + deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.CustomScript] + return policy, template, expected_errors, deletion_order + + +def _should_fail_multi_config_depends_on_disallowed_no_config(): + template = [ + __get_extension_template(VmExtensionIds.AzureMonitorLinuxAgent), + __get_extension_template(VmExtensionIds.RunCommandHandler, depends_on=["AzureMonitorLinuxAgent"]) + ] + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + "Microsoft.CPlat.Core.RunCommandHandlerLinux": {} + } + } + } + expected_errors = [ + "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", + "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed'." 
+ ] + deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.AzureMonitorLinuxAgent] + return policy, template, expected_errors, deletion_order + + +def _should_succeed_single_config_depends_on_no_config(): + template = [ + __get_extension_template(VmExtensionIds.AzureMonitorLinuxAgent), + __get_extension_template(VmExtensionIds.CustomScript, depends_on=["AzureMonitorLinuxAgent"]) + + ] + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent": {}, + "Microsoft.Azure.Extensions.CustomScript": {} + } + } + } + expected_errors = [] + deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.AzureMonitorLinuxAgent] + return policy, template, expected_errors, deletion_order + + +def _should_succeed_single_config_depends_on_single_config(): + template = [ + __get_extension_template(VmExtensionIds.CustomScript), + __get_extension_template(VmExtensionIds.VmAccess, depends_on=["CustomScript"]) + ] + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + "Microsoft.Azure.Extensions.CustomScript": {}, + "Microsoft.OSTCExtensions.VMAccessForLinux": {} + } + } + } + expected_errors = [] + deletion_order = [VmExtensionIds.VmAccess, VmExtensionIds.CustomScript] + return policy, template, expected_errors, deletion_order + + +def _should_no_dependencies(): + template = \ + [{ + "name": "CustomScript", + "properties": { + "publisher": "Microsoft.Azure.Extensions", + "type": "CustomScript", + "typeHandlerVersion": "2.1", + "autoUpgradeMinorVersion": True, + "settings": { + "commandToExecute": "date" + } + } + }, + { + "name": "VMAccessForLinux", + "properties": { + "publisher": "Microsoft.OSTCExtensions", + "type": "VMAccessForLinux", + "typeHandlerVersion": "1.0", + "autoUpgradeMinorVersion": True, + "settings": {}, + "protectedSettings": { + "username": "testuser" + } + } + }] + 
policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + } + } + } + expected_errors = [ + "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", + "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist" + ] + deletion_order = [VmExtensionIds.VmAccess, VmExtensionIds.CustomScript] + return policy, template, expected_errors, deletion_order \ No newline at end of file diff --git a/tests_e2e/tests/scripts/agent_ext_workflow-check_data_in_agent_log.py b/tests_e2e/tests/scripts/agent_ext_workflow-check_data_in_agent_log.py index 867c9b67d..5bab40b8d 100755 --- a/tests_e2e/tests/scripts/agent_ext_workflow-check_data_in_agent_log.py +++ b/tests_e2e/tests/scripts/agent_ext_workflow-check_data_in_agent_log.py @@ -21,6 +21,7 @@ import argparse import sys +from datetime import datetime from pathlib import Path from tests_e2e.tests.lib.agent_log import AgentLog @@ -28,13 +29,15 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("--data", dest='data', required=True) + parser.add_argument("--after-timestamp", dest='after_timestamp', required=False) args, _ = parser.parse_known_args() print("Verifying data: {0} in waagent.log".format(args.data)) found = False try: - found = AgentLog(Path('/var/log/waagent.log')).agent_log_contains(args.data) + after_timestamp = (datetime.strptime(args.after_timestamp, '%Y-%m-%d %H:%M:%S') if args.after_timestamp else None) + found = AgentLog(Path('/var/log/waagent.log')).agent_log_contains(args.data, after_timestamp) if found: print("Found data: {0} in agent log".format(args.data)) else: From 151081d1d92def94292b9adc16a1b2b5a695a907 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Fri, 8 Nov 2024 17:00:46 
-0500 Subject: [PATCH 02/32] Enable policy e2e tests --- tests_e2e/orchestrator/runbook.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index b96cc5107..250392f90 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -55,6 +55,8 @@ variable: - recover_network_interface - cgroup_v2_disabled - log_collector + - ext_policy + - ext_policy_with_dependencies # # Additional arguments pass to the test suites From edec2aff7e30672819275612b2935485b1e60e98 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Fri, 8 Nov 2024 17:07:42 -0500 Subject: [PATCH 03/32] Pylint --- azurelinuxagent/ga/policy/policy_engine.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/azurelinuxagent/ga/policy/policy_engine.py b/azurelinuxagent/ga/policy/policy_engine.py index 4e1983c09..11f3d9c45 100644 --- a/azurelinuxagent/ga/policy/policy_engine.py +++ b/azurelinuxagent/ga/policy/policy_engine.py @@ -36,12 +36,6 @@ _MAX_SUPPORTED_POLICY_VERSION = "0.1.0" -class PolicyError(AgentError): - """ - Error raised during agent policy enforcement. - """ - - class InvalidPolicyError(AgentError): """ Error raised if user-provided policy is invalid. @@ -61,15 +55,6 @@ def __init__(self, msg, inner=None, code=-1): super(ExtensionPolicyError, self).__init__(msg, inner, code) -class InvalidPolicyError(AgentError): - """ - Error raised if user-provided policy is invalid. - """ - def __init__(self, msg, inner=None): - msg = "Customer-provided policy file ('{0}') is invalid, please correct the following error: {1}".format(conf.get_policy_file_path(), msg) - super(InvalidPolicyError, self).__init__(msg, inner) - - class _PolicyEngine(object): """ Implements base policy engine API. 
From a37508f79b6b9cf6b8314d48b1c18069b6b2ada0 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Mon, 11 Nov 2024 12:38:33 -0500 Subject: [PATCH 04/32] Fix e2e test failures --- .../ext_policy_with_dependencies.yml | 5 ++++- .../ext_policy/ext_policy_with_dependencies.py | 18 ++++++++++-------- ...ent_ext_workflow-check_data_in_agent_log.py | 7 +++++-- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/tests_e2e/test_suites/ext_policy_with_dependencies.yml b/tests_e2e/test_suites/ext_policy_with_dependencies.yml index 3a460c531..15522cd1d 100644 --- a/tests_e2e/test_suites/ext_policy_with_dependencies.yml +++ b/tests_e2e/test_suites/ext_policy_with_dependencies.yml @@ -5,4 +5,7 @@ name: "ExtPolicyWithDependencies" tests: - "ext_policy/ext_policy_with_dependencies.py" images: "random(endorsed)" -executes_on_scale_set: true \ No newline at end of file +executes_on_scale_set: true +# This test should run on its own VMSS, because other tests may leave behind extensions +# that are disallowed by policy and affect results. 
+owns_vm: true \ No newline at end of file diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py index ae8cf1e96..7882f31df 100644 --- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -76,18 +76,19 @@ def _create_policy_file(ssh_client, policy): ssh_client.run_command(f"mv {remote_path} {policy_file_final_dest}", use_sudo=True) def run(self): + + # Set up the test run instances_ip_address: List[VmssInstanceIpAddress] = self._context.vmss.get_instances_ip_address() ssh_clients: Dict[str, SshClient] = {} for instance in instances_ip_address: ssh_clients[instance.instance_name] = SshClient(ip_address=instance.ip_address, username=self._context.username, identity_file=self._context.identity_file) - for ssh_client in ssh_clients.values(): ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) if not VmExtensionIds.AzureMonitorLinuxAgent.supports_distro(next(iter(ssh_clients.values())).run_command("get_distro.py").rstrip()): raise TestSkipped("Currently AzureMonitorLinuxAgent is not supported on this distro") - # This is the base ARM template that's used for deploying extensions for this scenario + # This is the base ARM template that's used for deploying extensions for this scenario. 
base_extension_template = { "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json", "contentVersion": "1.0.0.0", @@ -139,16 +140,17 @@ def run(self): dependency_list = "-" if not depends_on else ' and '.join(depends_on) log.info("{0} depends on {1}".format(ext['name'], dependency_list)) - # Copy policy file to each instance + # Copy policy file to each VM instance log.info("Updating policy file with new policy: {0}".format(policy)) - for instance_name, ssh_client in ssh_clients.items(): + for ssh_client in ssh_clients.values(): self._create_policy_file(ssh_client, policy) - # Deploy updated extension template to the scale set. log.info("Deploying extensions to the scale set...") rg_client = ResourceGroupClient(self._context.vmss.cloud, self._context.vmss.subscription, self._context.vmss.resource_group, self._context.vmss.location) + # Deploy updated extension template to the scale set. + # If test case is supposed to fail, assert that the operation fails with the expected error messages. try: rg_client.deploy_template(template=ext_template) if expected_errors is not None and len(expected_errors) != 0: @@ -176,9 +178,9 @@ def run(self): log.info("") # After each test, clean up failed extensions to leave VMSS in a good state for the next test. - # If there are leftover failed extensions, CRP will attempt to uninstall them in the next test, but they + # If there are leftover failed extensions, CRP will attempt to uninstall them in the next test, but uninstall # will be disallowed by policy. Since CRP waits for a 90 minute timeout for uninstall, the operation will - # timeout and fail, and subsequently, the whole test case will fail. + # timeout and fail without an appropriate error message (known issue), and the whole test case will fail. # To clean up, we first update the policy to allow all, then remove the extensions. 
log.info("Starting cleanup for test case...") for ssh_client in ssh_clients.values(): @@ -246,7 +248,7 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]: # We intentionally fail to test that dependent extensions are skipped # { - 'message': r"Event: name=Microsoft.Azure.Extensions.CustomScript, op=ExtensionProcessing, message=Dependent Extension .* did not succeed. Status was error, duration=0" + 'message': r"message=Dependent Extension .* did not succeed. Status was error, duration=0" }, # # 2023-10-31T17:47:07.689083Z WARNING ExtHandler ExtHandler [PERIODIC] This status is being reported by the Guest Agent since no status file was reported by extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent: [ExtensionStatusError] Status file /var/lib/waagent/Microsoft.Azure.Monitor.AzureMonitorLinuxAgent-1.28.11/status/6.status does not exist diff --git a/tests_e2e/tests/scripts/agent_ext_workflow-check_data_in_agent_log.py b/tests_e2e/tests/scripts/agent_ext_workflow-check_data_in_agent_log.py index 5bab40b8d..03155cb44 100755 --- a/tests_e2e/tests/scripts/agent_ext_workflow-check_data_in_agent_log.py +++ b/tests_e2e/tests/scripts/agent_ext_workflow-check_data_in_agent_log.py @@ -36,8 +36,11 @@ def main(): found = False try: - after_timestamp = (datetime.strptime(args.after_timestamp, '%Y-%m-%d %H:%M:%S') if args.after_timestamp else None) - found = AgentLog(Path('/var/log/waagent.log')).agent_log_contains(args.data, after_timestamp) + if args.after_timestamp is not None: + after_datetime = datetime.strptime(args.after_timestamp, '%Y-%m-%d %H:%M:%S') + found = AgentLog(Path('/var/log/waagent.log')).agent_log_contains(args.data, after_datetime) + else: + found = AgentLog(Path('/var/log/waagent.log')).agent_log_contains(args.data) if found: print("Found data: {0} in agent log".format(args.data)) else: From b0da55496643f245485ac3c19bd2e3efef09b88c Mon Sep 17 00:00:00 2001 From: mgunnala Date: Mon, 18 Nov 2024 11:49:37 -0500 Subject: [PATCH 05/32] Address review 
comments --- azurelinuxagent/ga/exthandlers.py | 30 +++++++++--- azurelinuxagent/ga/policy/policy_engine.py | 5 +- tests/ga/test_extension.py | 2 +- tests_e2e/tests/ext_policy/ext_policy.py | 48 ++++++++++++------- .../ext_policy_with_dependencies.py | 4 +- .../ext_policy/policy_dependencies_cases.py | 14 +++--- 6 files changed, 67 insertions(+), 36 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 18b520357..e7378ad63 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -510,9 +510,8 @@ def handle_ext_handlers(self, goal_state_id): policy_op, policy_err_code = policy_err_map.get(ext_handler.state) if policy_error is not None: err = ExtensionPolicyError(msg="", inner=policy_error, code=policy_err_code) - self.__handle_and_report_ext_handler_errors(handler_i, err, - report_op=handler_i.operation, - message=ustr(err), extension=extension, report=True) + self.__handle_and_report_policy_error(handler_i, err, report_op=handler_i.operation, message=ustr(err), + extension=extension, report=True) continue extension_allowed = policy_engine.should_allow_extension(ext_handler.name) @@ -522,9 +521,8 @@ def handle_ext_handlers(self, goal_state_id): ext_handler.name, conf.get_policy_file_path()) err = ExtensionPolicyError(msg, code=policy_err_code) - self.__handle_and_report_ext_handler_errors(handler_i, err, - report_op=handler_i.operation, - message=ustr(err), extension=extension, report=True) + self.__handle_and_report_policy_error(handler_i, err, report_op=handler_i.operation, message=ustr(err), + extension=extension, report=True) # In case of extensions disabled, we skip processing extensions. But CRP is still waiting for some status # back for the skipped extensions. 
In order to propagate the status back to CRP, we will report status back @@ -736,6 +734,26 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True, message=message) + @staticmethod + def __handle_and_report_policy_error(ext_handler_i, error, report_op, message, report=True, extension=None): + # TODO: Consider merging this function with __handle_and_report_ext_handler_errors() above. + + # Set handler status for all extensions + ext_handler_i.set_handler_status(message=message, code=error.code) + + # Create status file for only extensions with settings. Since extensions are not processed in the case of + # policy-related failures, no extension status file is created. For CRP to report status, we need to create the + # file with failure on behalf of the extension. This should be done for both multi-config and single-config extensions. + if extension is not None: + ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code, + operation=report_op, message=message) + + if report: + name = ext_handler_i.get_extension_full_name(extension) + handler_version = ext_handler_i.ext_handler.version + add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True, + message=message) + def handle_enable(self, ext_handler_i, extension): """ 1- Ensure the handler is installed diff --git a/azurelinuxagent/ga/policy/policy_engine.py b/azurelinuxagent/ga/policy/policy_engine.py index 11f3d9c45..3a0add6f8 100644 --- a/azurelinuxagent/ga/policy/policy_engine.py +++ b/azurelinuxagent/ga/policy/policy_engine.py @@ -49,9 +49,8 @@ class ExtensionPolicyError(ExtensionError): """ Error raised during agent extension policy enforcement. """ - # TODO: when CRP adds terminal error code for policy-related extension failures, set that as the default code. 
- def __init__(self, msg, inner=None, code=-1): - msg = "Extension is disallowed by agent policy and will not be processed: {0}".format(msg) + def __init__(self, msg, code, inner=None): + msg = "Extension will not be processed: {0}".format(msg) super(ExtensionPolicyError, self).__init__(msg, inner, code) diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index a4883add1..0423fd585 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3588,7 +3588,7 @@ def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self): } with patch('azurelinuxagent.ga.policy.policy_engine.ExtensionPolicyEngine.__init__', side_effect=Exception("mock exception")): - expected_msg = "Extension is disallowed by agent policy and will not be processed: \nInner error: mock exception" + expected_msg = "Extension will not be processed: \nInner error: mock exception" self._test_policy_failure(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, expected_handler_status='NotReady', expected_status_msg=expected_msg) diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index 2d1c50917..cce68d85b 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -88,7 +88,7 @@ def _operation_should_fail(operation, extension_case): fail(f"The agent should have reported an error trying to {operation} {extension_case.extension.__str__()} " f"because the extension is disallowed by policy.") except Exception as error: - assert_that("Extension is disallowed by agent policy and will not be processed" in str(error)) \ + assert_that("Extension will not be processed" in str(error)) \ .described_as( f"Error message should communicate that extension is disallowed by policy, but actual error " f"was: {error}").is_true() @@ -109,11 +109,6 @@ def run(self): resource_name="RunCommandHandler"), {'source': {'script': 
f"echo '{unique}' > /tmp/{test_file}"}} ) - azure_monitor = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, - resource_name="AzureMonitorLinuxAgent"), - None - ) unique2 = str(uuid.uuid4()) test_file2 = f"waagent-test.{unique2}" run_command_2 = ExtPolicy.TestCase( @@ -121,13 +116,18 @@ def run(self): resource_name="RunCommandHandler2"), {'source': {'script': f"echo '{unique2}' > /tmp/{test_file2}"}} ) + azure_monitor = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, + resource_name="AzureMonitorLinuxAgent"), + None + ) # Enable policy via conf log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) - # Test case 1: should only enable allowlisted extensions - # CustomScript should be enabled, RunCommand and AzureMonitor should fail. + # Only allowlisted extensions should be processed. + # We only allowlist CustomScript: CustomScript should be enabled, RunCommand and AzureMonitor should fail. policy = \ { "policyVersion": "0.1.0", @@ -145,8 +145,8 @@ def run(self): if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): self._operation_should_fail("enable", azure_monitor) - # Test case 2: turn allowlist off - # RunCommand should be successfully enabled and then deleted. + # When allowlist is turned off, all extensions should be processed. + # RunCommand and AzureMonitorLinuxAgent should be successfully enabled and then deleted. policy = \ { "policyVersion": "0.1.0", @@ -163,8 +163,8 @@ def run(self): self._operation_should_succeed("enable", azure_monitor) self._operation_should_succeed("delete", azure_monitor) - # Test case 3: uninstall should fail when disallowed - # Remove CustomScript from allowlist and try to uninstall, should fail. 
+ # Should not uninstall disallowed extensions. + # CustomScript is removed from the allowlist: delete operation should fail. policy = \ { "policyVersion": "0.1.0", @@ -175,16 +175,30 @@ def run(self): } } self._create_policy_file(policy) - # Known CRP issue - delete/uninstall operation times out instead of reporting an error. # TODO: uncomment this test case after issue is resolved # self._operation_should_fail("delete", custom_script) - # Test case 4: both instances in a multiconfig extension should fail, if disallowed. - # Disallow RunCommand and try to install two instances, both should fail fast. + # If a multiconfig extension is disallowed, no instances should be processed. + # RunCommand is not allowed - if we try to enable two instances, both should fail fast. self._operation_should_fail("enable", run_command) self._operation_should_fail("enable", run_command_2) + # If single-config extension is initially blocked, and policy is updated to allow it, extension should be + # successfully enabled and report status correctly. + self._operation_should_fail("enable", custom_script) + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": False, + "signatureRequired": False, + "extensions": {} + } + } + self._create_policy_file(policy) + self._operation_should_succeed("enable", custom_script) + def get_ignore_error_rules(self) -> List[Dict[str, Any]]: ignore_rules = [ # @@ -198,10 +212,10 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]: { 'message': r"Skipping processing of extensions since execution of dependent extension .* failed" }, - # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. 
To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 + # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 # We intentionally block extensions with policy and expect this failure message { - 'message': r"Extension is disallowed by agent policy and will not be processed" + 'message': r"Extension will not be processed" } ] return ignore_rules diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py index 7882f31df..403734b23 100644 --- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -269,10 +269,10 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]: { 'message': r"Skipping processing of extensions since execution of dependent extension .* failed" }, - # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. 
To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 + # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 # We intentionally block extensions with policy and expect this failure message { - 'message': r"Extension is disallowed by agent policy and will not be processed" + 'message': r"Extension will not be processed" } ] return ignore_rules diff --git a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py index 1abc3234a..6eabd67a5 100644 --- a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py +++ b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py @@ -56,7 +56,7 @@ def _should_fail_single_config_depends_on_disallowed_single_config(): } } expected_errors = [ - "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'VMAccessForLinux' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.VmAccess] @@ -80,7 +80,7 @@ def _should_fail_single_config_depends_on_disallowed_no_config(): } } expected_errors = [ - "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in 
allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'AzureMonitorLinuxAgent' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.AzureMonitorLinuxAgent] @@ -103,7 +103,7 @@ def _should_fail_single_config_depends_on_disallowed_multi_config(): } } expected_errors = [ - "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.CPlat.Core.RunCommandHandlerLinux' because extension is not specified in allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.CPlat.Core.RunCommandHandlerLinux' because extension is not specified in allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'RunCommandHandlerLinux' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.RunCommandHandler] @@ -126,7 +126,7 @@ def _should_fail_multi_config_depends_on_disallowed_single_config(): } } expected_errors = [ - "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist", "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Extensions.CustomScript failed'." 
] deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.CustomScript] @@ -149,7 +149,7 @@ def _should_fail_multi_config_depends_on_disallowed_no_config(): } } expected_errors = [ - "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed'." ] deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.AzureMonitorLinuxAgent] @@ -236,8 +236,8 @@ def _should_no_dependencies(): } } expected_errors = [ - "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", - "Extension is disallowed by agent policy and will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist" + "Extension will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist" ] deletion_order = [VmExtensionIds.VmAccess, VmExtensionIds.CustomScript] return policy, template, expected_errors, deletion_order \ No newline at end of file From 699b9baf9a2f6c89849f901e8075bb8bad6ec991 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Wed, 20 Nov 2024 12:45:26 -0500 Subject: 
[PATCH 06/32] Address review comments --- azurelinuxagent/ga/exthandlers.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index e7378ad63..cb7726fa5 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -500,10 +500,13 @@ def handle_ext_handlers(self, goal_state_id): # behalf of the extension. policy_err_map = { ExtensionRequestedState.Enabled: ('enable', ExtensionErrorCodes.PluginEnableProcessingFailed), - # TODO: CRP does not currently have a terminal error code for uninstall. Once CRP adds - # an error code for uninstall or for policy, use this code instead of PluginDisableProcessingFailed - # Note that currently, CRP waits for 90 minutes to time out for a failed uninstall operation, instead of - # failing fast. + # Note: currently, when uninstall is requested for an extension, CRP polls until the agent does not + # report status for that extension, or until timeout is reached. In the case of a policy error, the + # agent reports failed status on behalf of the extension, which will cause CRP to for the full timeout, + # instead of failing fast. + # + # TODO: CRP does not currently have a terminal error code for uninstall. Once this code is added, use + # it instead of PluginDisableProcessingFailed below. ExtensionRequestedState.Uninstall: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed), ExtensionRequestedState.Disabled: ('disable', ExtensionErrorCodes.PluginDisableProcessingFailed), } @@ -736,14 +739,20 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess @staticmethod def __handle_and_report_policy_error(ext_handler_i, error, report_op, message, report=True, extension=None): - # TODO: Consider merging this function with __handle_and_report_ext_handler_errors() above. 
+ # TODO: Consider merging this function with __handle_and_report_ext_handler_errors() above, after investigating + # the impact of this change. + # + # CRP will poll for extension status for extensions with settings until timeout. In the case of policy errors, + # extensions are not processed so no extension status file is created. Extensions should still fail fast, so the + # agent should write a .status file on behalf of any extension with settings. + # __handle_and_report_ext_handler_errors() does not create the file for single-config extensions, but changing + # it will require additional testing/investigation. As a temporary workaround, this separate function was created + # to write a status file for single-config extensions. - # Set handler status for all extensions + # Set handler status for all extensions (with and without settings) ext_handler_i.set_handler_status(message=message, code=error.code) - # Create status file for only extensions with settings. Since extensions are not processed in the case of - # policy-related failures, no extension status file is created. For CRP to report status, we need to create the - # file with failure on behalf of the extension. This should be done for both multi-config and single-config extensions. + # Create status file for extensions with settings (single and multi config). 
if extension is not None: ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code, operation=report_op, message=message) From 86de0c522a1998f269ee9575ab857fa5f462660a Mon Sep 17 00:00:00 2001 From: mgunnala Date: Thu, 21 Nov 2024 16:05:17 -0500 Subject: [PATCH 07/32] Address test review comments --- tests/ga/test_extension.py | 25 ++++++++++++---- tests/ga/test_multi_config_extension.py | 30 ++++++++++++++++++- tests_e2e/tests/ext_policy/ext_policy.py | 12 ++++++-- .../ext_policy/policy_dependencies_cases.py | 14 ++++----- 4 files changed, 65 insertions(+), 16 deletions(-) diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index 0423fd585..c2f554239 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3536,7 +3536,7 @@ def _create_policy_file(self, policy): policy_file.write(policy) policy_file.flush() - def _test_policy_failure(self, policy, op, expected_status_code, expected_handler_status, + def _test_policy_case(self, policy, op, expected_status_code, expected_handler_status, expected_ext_count=0, expected_status_msg=None): with mock_wire_protocol(wire_protocol_data.DATA_FILE) as protocol: @@ -3554,7 +3554,8 @@ def _test_policy_failure(self, policy, op, expected_status_code, expected_handle report_vm_status = protocol.report_vm_status self.assertTrue(report_vm_status.called) - self._assert_handler_status(report_vm_status, expected_handler_status, 0, "1.0.0", 'OSTCExtensions.ExampleHandlerLinux', + self._assert_handler_status(report_vm_status, expected_handler_status, expected_ext_count=expected_ext_count, + version="1.0.0", expected_handler_name='OSTCExtensions.ExampleHandlerLinux', expected_msg=expected_status_msg, expected_code=expected_status_code) def test_should_fail_enable_if_extension_disallowed(self): @@ -3566,7 +3567,7 @@ def test_should_fail_enable_if_extension_disallowed(self): } } expected_msg = "failed to enable extension 
'OSTCExtensions.ExampleHandlerLinux' because extension is not specified in allowlist." - self._test_policy_failure(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, + self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, expected_handler_status='NotReady', expected_status_msg=expected_msg) def test_should_fail_enable_for_invalid_policy(self): @@ -3578,7 +3579,7 @@ def test_should_fail_enable_for_invalid_policy(self): } } expected_msg = "attribute 'extensionPolicies.allowListedExtensionsOnly'; must be 'boolean'" - self._test_policy_failure(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, + self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, expected_handler_status='NotReady', expected_status_msg=expected_msg) def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self): @@ -3589,7 +3590,7 @@ def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self): with patch('azurelinuxagent.ga.policy.policy_engine.ExtensionPolicyEngine.__init__', side_effect=Exception("mock exception")): expected_msg = "Extension will not be processed: \nInner error: mock exception" - self._test_policy_failure(policy=policy, op=ExtensionRequestedState.Enabled, + self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, expected_handler_status='NotReady', expected_status_msg=expected_msg) @@ -3604,7 +3605,7 @@ def test_should_fail_uninstall_if_extension_disallowed(self): }, } expected_msg = "failed to uninstall extension 'OSTCExtensions.ExampleHandlerLinux' because extension is not specified in allowlist." 
- self._test_policy_failure(policy=policy, op=ExtensionRequestedState.Uninstall, expected_status_code=ExtensionErrorCodes.PluginDisableProcessingFailed, + self._test_policy_case(policy=policy, op=ExtensionRequestedState.Uninstall, expected_status_code=ExtensionErrorCodes.PluginDisableProcessingFailed, expected_handler_status='NotReady', expected_status_msg=expected_msg) def test_should_fail_enable_if_dependent_extension_disallowed(self): @@ -3648,6 +3649,18 @@ def test_should_fail_enable_if_dependent_extension_disallowed(self): self.assertEqual(2, next(handler for handler in exthandlers_handler.ext_handlers if handler.name == dep_ext_level_2.name).settings[0].dependencyLevel) + def test_enable_should_succeed_if_extension_allowed(self): + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": False, + } + } + self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, + expected_status_code=0, + expected_handler_status='Ready', expected_ext_count=1) + if __name__ == '__main__': unittest.main() diff --git a/tests/ga/test_multi_config_extension.py b/tests/ga/test_multi_config_extension.py index 66f4c3ca9..93d47e919 100644 --- a/tests/ga/test_multi_config_extension.py +++ b/tests/ga/test_multi_config_extension.py @@ -630,7 +630,7 @@ def test_it_should_handle_and_report_enable_errors_properly(self): } self._assert_extension_status(sc_handler, expected_extensions) - def test_it_should_handle_and_report_disallowed_extensions_properly(self): + def test_it_should_handle_and_report_extensions_disallowed_by_policy_properly(self): """If multiconfig extension is disallowed by policy, all instances should be blocked.""" policy_path = os.path.join(self.tmp_dir, "waagent_policy.json") patch('azurelinuxagent.common.conf.get_policy_file_path', return_value=str(policy_path)).start() @@ -694,6 +694,34 @@ def test_it_should_handle_and_report_disallowed_extensions_properly(self): } self._assert_extension_status(sc_handler, 
expected_extensions) + + def test_it_should_handle_and_report_extensions_allowed_by_policy_properly(self): + """If multiconfig extension is allowed by policy, all instances should be allowed.""" + policy_path = os.path.join(self.tmp_dir, "waagent_policy.json") + patch('azurelinuxagent.common.conf.get_policy_file_path', return_value=str(policy_path)).start() + patch('azurelinuxagent.ga.policy.policy_engine.conf.get_extension_policy_enabled', + return_value=True).start() + policy = \ + { + "policyVersion": "0.0.1", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "signatureRequired": True, + "extensions": { + "OSTCExtensions.ExampleHandlerLinux": {}, + "Microsoft.Powershell.ExampleExtension": {} + } + } + } + with open(policy_path, mode='w') as policy_file: + json.dump(policy, policy_file, indent=4) + policy_file.flush() + + self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, + "ext_conf_multi_config_no_dependencies.xml") + with self._setup_test_env(mock_manifest=True) as (exthandlers_handler, protocol, no_of_extensions): + self.__run_and_assert_generic_case(exthandlers_handler, protocol, no_of_extensions) + def test_it_should_cleanup_extension_state_on_disable(self): def __assert_state_file(handler_name, handler_version, extensions, state, not_present=None): diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index cce68d85b..9a14df023 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -88,7 +88,7 @@ def _operation_should_fail(operation, extension_case): fail(f"The agent should have reported an error trying to {operation} {extension_case.extension.__str__()} " f"because the extension is disallowed by policy.") except Exception as error: - assert_that("Extension will not be processed" in str(error)) \ + assert_that("[ExtensionPolicyError] Extension will not be processed" in str(error)) \ .described_as( f"Error message should communicate 
that extension is disallowed by policy, but actual error " f"was: {error}").is_true() @@ -96,14 +96,19 @@ def _operation_should_fail(operation, extension_case): def run(self): - # Prepare extensions to test + # Prepare no-config, single-config, and multi-config extension to test. Extensions with settings and extensions + # without settings have different status reporting logic, so we should test all cases. unique = str(uuid.uuid4()) test_file = f"waagent-test.{unique}" + + # CustomScript is a single-config extension. custom_script = ExtPolicy.TestCase( VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, resource_name="CustomScript"), {'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"} ) + + # RunCommandHandler is a multi-config extension, so we set up two instances (configurations) here and test both. run_command = ExtPolicy.TestCase( VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, resource_name="RunCommandHandler"), @@ -116,6 +121,8 @@ def run(self): resource_name="RunCommandHandler2"), {'source': {'script': f"echo '{unique2}' > /tmp/{test_file2}"}} ) + + # AzureMonitorLinuxAgent is a no-config extension (extension without settings). azure_monitor = ExtPolicy.TestCase( VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, resource_name="AzureMonitorLinuxAgent"), @@ -128,6 +135,7 @@ def run(self): # Only allowlisted extensions should be processed. # We only allowlist CustomScript: CustomScript should be enabled, RunCommand and AzureMonitor should fail. + # (Note that CustomScript blocked by policy is tested in a later test case.) 
policy = \ { "policyVersion": "0.1.0", diff --git a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py index 6eabd67a5..4f11b4139 100644 --- a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py +++ b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py @@ -56,7 +56,7 @@ def _should_fail_single_config_depends_on_disallowed_single_config(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", + "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'VMAccessForLinux' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.VmAccess] @@ -80,7 +80,7 @@ def _should_fail_single_config_depends_on_disallowed_no_config(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", + "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'AzureMonitorLinuxAgent' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.AzureMonitorLinuxAgent] @@ -103,7 +103,7 @@ def _should_fail_single_config_depends_on_disallowed_multi_config(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.CPlat.Core.RunCommandHandlerLinux' because extension is not specified in allowlist", + "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 
'Microsoft.CPlat.Core.RunCommandHandlerLinux' because extension is not specified in allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'RunCommandHandlerLinux' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.RunCommandHandler] @@ -126,7 +126,7 @@ def _should_fail_multi_config_depends_on_disallowed_single_config(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist", + "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist", "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Extensions.CustomScript failed'." ] deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.CustomScript] @@ -149,7 +149,7 @@ def _should_fail_multi_config_depends_on_disallowed_no_config(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", + "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed'." 
] deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.AzureMonitorLinuxAgent] @@ -236,8 +236,8 @@ def _should_no_dependencies(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", - "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist" + "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", + "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist" ] deletion_order = [VmExtensionIds.VmAccess, VmExtensionIds.CustomScript] return policy, template, expected_errors, deletion_order \ No newline at end of file From c3e9b89df5977688c113096e2d0b9391c4824485 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Fri, 22 Nov 2024 13:08:52 -0500 Subject: [PATCH 08/32] Remove status file for single-config --- azurelinuxagent/ga/exthandlers.py | 34 ++++--------------------------- tests/ga/test_extension.py | 18 ++++++++++++---- 2 files changed, 18 insertions(+), 34 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index cb7726fa5..eb3479d01 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -502,8 +502,8 @@ def handle_ext_handlers(self, goal_state_id): ExtensionRequestedState.Enabled: ('enable', ExtensionErrorCodes.PluginEnableProcessingFailed), # Note: currently, when uninstall is requested for an extension, CRP polls until the agent does not # report status for that extension, or until timeout is reached. 
In the case of a policy error, the - # agent reports failed status on behalf of the extension, which will cause CRP to for the full timeout, - # instead of failing fast. + # agent reports failed status on behalf of the extension, which will cause CRP to poll for the full + # timeout, instead of failing fast. # # TODO: CRP does not currently have a terminal error code for uninstall. Once this code is added, use # it instead of PluginDisableProcessingFailed below. @@ -513,7 +513,7 @@ def handle_ext_handlers(self, goal_state_id): policy_op, policy_err_code = policy_err_map.get(ext_handler.state) if policy_error is not None: err = ExtensionPolicyError(msg="", inner=policy_error, code=policy_err_code) - self.__handle_and_report_policy_error(handler_i, err, report_op=handler_i.operation, message=ustr(err), + self.__handle_and_report_ext_handler_errors(handler_i, err, report_op=handler_i.operation, message=ustr(err), extension=extension, report=True) continue @@ -524,7 +524,7 @@ def handle_ext_handlers(self, goal_state_id): ext_handler.name, conf.get_policy_file_path()) err = ExtensionPolicyError(msg, code=policy_err_code) - self.__handle_and_report_policy_error(handler_i, err, report_op=handler_i.operation, message=ustr(err), + self.__handle_and_report_ext_handler_errors(handler_i, err, report_op=handler_i.operation, message=ustr(err), extension=extension, report=True) # In case of extensions disabled, we skip processing extensions. But CRP is still waiting for some status @@ -737,32 +737,6 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True, message=message) - @staticmethod - def __handle_and_report_policy_error(ext_handler_i, error, report_op, message, report=True, extension=None): - # TODO: Consider merging this function with __handle_and_report_ext_handler_errors() above, after investigating - # the impact of this change. 
- # - # CRP will poll for extension status for extensions with settings until timeout. In the case of policy errors, - # extensions are not processed so no extension status file is created. Extensions should still fail fast, so the - # agent should write a .status file on behalf of any extension with settings. - # __handle_and_report_ext_handler_errors() does not create the file for single-config extensions, but changing - # it will require additional testing/investigation. As a temporary workaround, this separate function was created - # to write a status file for single-config extensions. - - # Set handler status for all extensions (with and without settings) - ext_handler_i.set_handler_status(message=message, code=error.code) - - # Create status file for extensions with settings (single and multi config). - if extension is not None: - ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code, - operation=report_op, message=message) - - if report: - name = ext_handler_i.get_extension_full_name(extension) - handler_version = ext_handler_i.ext_handler.version - add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True, - message=message) - def handle_enable(self, ext_handler_i, extension): """ 1- Ensure the handler is installed diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index c2f554239..f6410d7bd 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3650,16 +3650,26 @@ def test_should_fail_enable_if_dependent_extension_disallowed(self): handler.name == dep_ext_level_2.name).settings[0].dependencyLevel) def test_enable_should_succeed_if_extension_allowed(self): - policy = \ + policy_cases = [ { "policyVersion": "0.1.0", "extensionPolicies": { "allowListedExtensionsOnly": False, } + }, + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + "OSTCExtensions.ExampleHandlerLinux": 
{} + } + } } - self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, - expected_status_code=0, - expected_handler_status='Ready', expected_ext_count=1) + ] + for policy in policy_cases: + self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=0, + expected_handler_status='Ready', expected_ext_count=1) if __name__ == '__main__': From 65d7034aa02ee5f853013a42bdfc8e9a252c4d76 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Fri, 22 Nov 2024 16:22:44 -0500 Subject: [PATCH 09/32] Add back status file for single-config --- azurelinuxagent/ga/exthandlers.py | 35 +++++++++++- azurelinuxagent/ga/policy/policy_engine.py | 4 +- tests/ga/test_extension.py | 4 +- tests_e2e/tests/ext_policy/ext_policy.py | 64 +++++++++++----------- 4 files changed, 68 insertions(+), 39 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index eb3479d01..1d4a7273c 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -513,7 +513,7 @@ def handle_ext_handlers(self, goal_state_id): policy_op, policy_err_code = policy_err_map.get(ext_handler.state) if policy_error is not None: err = ExtensionPolicyError(msg="", inner=policy_error, code=policy_err_code) - self.__handle_and_report_ext_handler_errors(handler_i, err, report_op=handler_i.operation, message=ustr(err), + self.__handle_and_report_policy_error(handler_i, err, report_op=handler_i.operation, message=ustr(err), extension=extension, report=True) continue @@ -524,7 +524,7 @@ def handle_ext_handlers(self, goal_state_id): ext_handler.name, conf.get_policy_file_path()) err = ExtensionPolicyError(msg, code=policy_err_code) - self.__handle_and_report_ext_handler_errors(handler_i, err, report_op=handler_i.operation, message=ustr(err), + self.__handle_and_report_policy_error(handler_i, err, report_op=handler_i.operation, message=ustr(err), extension=extension, report=True) # In case of extensions disabled, we skip 
processing extensions. But CRP is still waiting for some status @@ -737,6 +737,32 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True, message=message) + @staticmethod + def __handle_and_report_policy_error(ext_handler_i, error, report_op, message, report=True, extension=None): + # TODO: Consider merging this function with __handle_and_report_ext_handler_errors() above, after investigating + # the impact of this change. + # + # If extension status is present, CRP will ignore handler status and report extension status. In the case of policy errors, + # extensions are not processed, so collect_ext_status() reports transitioning status on behalf of the extension. + # However, extensions blocked by policy should fail fast, so agent should write a .status file for policy failures. + # Note that __handle_and_report_ext_handler_errors() does not create the file for single-config extensions, but changing + # it will require additional testing/investigation. As a temporary workaround, this separate function was created + # to write a status file for single-config extensions. + + # Set handler status for all extensions (with and without settings) + ext_handler_i.set_handler_status(message=message, code=error.code) + + # Create status file for extensions with settings (single and multi config). 
+ if extension is not None: + ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code, + operation=report_op, message=message) + + if report: + name = ext_handler_i.get_extension_full_name(extension) + handler_version = ext_handler_i.ext_handler.version + add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True, + message=message) + def handle_enable(self, ext_handler_i, extension): """ 1- Ensure the handler is installed @@ -1035,7 +1061,10 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed): # extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc) # We also need to report extension status for an uninstalled handler if extensions are disabled because CRP # waits for extension runtime status before failing the extension operation. - if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled(): + # In the case of policy failures, we want to report extension status with a terminal code so CRP fails fast. If + # extension status is not present, collect_ext_status() will set a default transitioning status, and CRP will + # wait for timeout. + if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled() or ExtensionPolicyEngine.get_policy_enforcement_enabled(): # Since we require reading the Manifest for reading the heartbeat, this would fail if HandlerManifest not found. # Only try to read heartbeat if HandlerState != NotInstalled. 
diff --git a/azurelinuxagent/ga/policy/policy_engine.py b/azurelinuxagent/ga/policy/policy_engine.py index 3a0add6f8..336882114 100644 --- a/azurelinuxagent/ga/policy/policy_engine.py +++ b/azurelinuxagent/ga/policy/policy_engine.py @@ -60,7 +60,7 @@ class _PolicyEngine(object): """ def __init__(self): # Set defaults for policy - self._policy_enforcement_enabled = self.__get_policy_enforcement_enabled() + self._policy_enforcement_enabled = self.get_policy_enforcement_enabled() if not self.policy_enforcement_enabled: return @@ -79,7 +79,7 @@ def _log_policy_event(msg, is_success=True, op=WALAEventOperation.Policy, send_e add_event(op=op, message=msg, is_success=is_success, log_event=False) @staticmethod - def __get_policy_enforcement_enabled(): + def get_policy_enforcement_enabled(): """ Policy will be enabled if (1) policy file exists at the expected location and (2) the conf flag "Debug.EnableExtensionPolicy" is true. """ diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index f6410d7bd..77cc9b091 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3536,7 +3536,7 @@ def _create_policy_file(self, policy): policy_file.write(policy) policy_file.flush() - def _test_policy_case(self, policy, op, expected_status_code, expected_handler_status, expected_ext_count=0, + def _test_policy_case(self, policy, op, expected_status_code, expected_handler_status, expected_ext_count=1, expected_status_msg=None): with mock_wire_protocol(wire_protocol_data.DATA_FILE) as protocol: @@ -3669,7 +3669,7 @@ def test_enable_should_succeed_if_extension_allowed(self): ] for policy in policy_cases: self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=0, - expected_handler_status='Ready', expected_ext_count=1) + expected_handler_status='Ready') if __name__ == '__main__': diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index 9a14df023..9de598e80 100644 --- 
a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -149,30 +149,30 @@ def run(self): } self._create_policy_file(policy) self._operation_should_succeed("enable", custom_script) - self._operation_should_fail("enable", run_command) - if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): - self._operation_should_fail("enable", azure_monitor) - - # When allowlist is turned off, all extensions should be processed. - # RunCommand and AzureMonitorLinuxAgent should be successfully enabled and then deleted. - policy = \ - { - "policyVersion": "0.1.0", - "extensionPolicies": { - "allowListedExtensionsOnly": False, - "signatureRequired": False, - "extensions": {} - } - } - self._create_policy_file(policy) - self._operation_should_succeed("enable", run_command) - self._operation_should_succeed("delete", run_command) - if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): - self._operation_should_succeed("enable", azure_monitor) - self._operation_should_succeed("delete", azure_monitor) - - # Should not uninstall disallowed extensions. - # CustomScript is removed from the allowlist: delete operation should fail. + # self._operation_should_fail("enable", run_command) + # if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): + # self._operation_should_fail("enable", azure_monitor) + # + # # When allowlist is turned off, all extensions should be processed. + # # RunCommand and AzureMonitorLinuxAgent should be successfully enabled and then deleted. 
+ # policy = \ + # { + # "policyVersion": "0.1.0", + # "extensionPolicies": { + # "allowListedExtensionsOnly": False, + # "signatureRequired": False, + # "extensions": {} + # } + # } + # self._create_policy_file(policy) + # self._operation_should_succeed("enable", run_command) + # self._operation_should_succeed("delete", run_command) + # if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): + # self._operation_should_succeed("enable", azure_monitor) + # self._operation_should_succeed("delete", azure_monitor) + # + # # Should not uninstall disallowed extensions. + # # CustomScript is removed from the allowlist: delete operation should fail. policy = \ { "policyVersion": "0.1.0", @@ -183,14 +183,14 @@ def run(self): } } self._create_policy_file(policy) - # Known CRP issue - delete/uninstall operation times out instead of reporting an error. - # TODO: uncomment this test case after issue is resolved - # self._operation_should_fail("delete", custom_script) - - # If a multiconfig extension is disallowed, no instances should be processed. - # RunCommand is not allowed - if we try to enable two instances, both should fail fast. - self._operation_should_fail("enable", run_command) - self._operation_should_fail("enable", run_command_2) + # # Known CRP issue - delete/uninstall operation times out instead of reporting an error. + # # TODO: uncomment this test case after issue is resolved + # # self._operation_should_fail("delete", custom_script) + # + # # If a multiconfig extension is disallowed, no instances should be processed. + # # RunCommand is not allowed - if we try to enable two instances, both should fail fast. + # self._operation_should_fail("enable", run_command) + # self._operation_should_fail("enable", run_command_2) # If single-config extension is initially blocked, and policy is updated to allow it, extension should be # successfully enabled and report status correctly. 
From 95f247a6444e09dc9dfc40254854d91fc1a4f1c6 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Fri, 22 Nov 2024 18:48:10 -0500 Subject: [PATCH 10/32] Run e2e tests on all endorsed --- tests_e2e/test_suites/ext_policy.yml | 2 +- tests_e2e/test_suites/ext_policy_with_dependencies.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests_e2e/test_suites/ext_policy.yml b/tests_e2e/test_suites/ext_policy.yml index bdde59a02..15a3dc73d 100644 --- a/tests_e2e/test_suites/ext_policy.yml +++ b/tests_e2e/test_suites/ext_policy.yml @@ -4,5 +4,5 @@ name: "ExtensionPolicy" tests: - "ext_policy/ext_policy.py" -images: "random(endorsed)" +images: "endorsed" owns_vm: true \ No newline at end of file diff --git a/tests_e2e/test_suites/ext_policy_with_dependencies.yml b/tests_e2e/test_suites/ext_policy_with_dependencies.yml index 15522cd1d..09e01fe63 100644 --- a/tests_e2e/test_suites/ext_policy_with_dependencies.yml +++ b/tests_e2e/test_suites/ext_policy_with_dependencies.yml @@ -4,7 +4,7 @@ name: "ExtPolicyWithDependencies" tests: - "ext_policy/ext_policy_with_dependencies.py" -images: "random(endorsed)" +images: "endorsed" executes_on_scale_set: true # This test should run on its own VMSS, because other tests may leave behind extensions # that are disallowed by policy and affect results. 
From 3b185191278d0bdafd140c5073328868acbb2f30 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Fri, 22 Nov 2024 19:05:11 -0500 Subject: [PATCH 11/32] Fix UT failures --- tests/ga/test_extension.py | 41 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index 77cc9b091..4a583cfce 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3630,12 +3630,12 @@ def test_should_fail_enable_if_dependent_extension_disallowed(self): # OtherExampleHandlerLinux should be disallowed by policy, ExampleHandlerLinux should be skipped because # dependent extension failed - self._assert_handler_status(protocol.report_vm_status, "NotReady", 0, "1.0.0", + self._assert_handler_status(protocol.report_vm_status, "NotReady", 1, "1.0.0", expected_handler_name="OSTCExtensions.OtherExampleHandlerLinux", expected_msg=("failed to enable extension 'OSTCExtensions.OtherExampleHandlerLinux' " "because extension is not specified in allowlist.")) - self._assert_handler_status(protocol.report_vm_status, "NotReady", 0, "1.0.0", + self._assert_handler_status(protocol.report_vm_status, "NotReady", 1, "1.0.0", expected_handler_name="OSTCExtensions.ExampleHandlerLinux", expected_msg="Skipping processing of extensions since execution of dependent " "extension OSTCExtensions.OtherExampleHandlerLinux failed") @@ -3671,6 +3671,43 @@ def test_enable_should_succeed_if_extension_allowed(self): self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=0, expected_handler_status='Ready') + def test_uninstall_should_succeed_if_extension_allowed(self): + policy_cases = [ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": False, + } + }, + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "extensions": { + "OSTCExtensions.ExampleHandlerLinux": {} + } + } + } + ] + for policy in 
policy_cases: + with mock_wire_protocol(wire_protocol_data.DATA_FILE) as protocol: + protocol.mock_wire_data.set_incarnation(2) + protocol.mock_wire_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) + protocol.client.update_goal_state() + protocol.aggregate_status = None + protocol.report_vm_status = MagicMock() + exthandlers_handler = get_exthandlers_handler(protocol) + + self._create_policy_file(policy) + exthandlers_handler.run() + exthandlers_handler.report_ext_handlers_status() + + report_vm_status = protocol.report_vm_status + self.assertTrue(report_vm_status.called) + args, kw = report_vm_status.call_args # pylint: disable=unused-variable + vm_status = args[0] + self.assertEqual(0, len(vm_status.vmAgent.extensionHandlers)) + if __name__ == '__main__': unittest.main() From 63da127a0c0f94db60c511e6e93cc5add4240278 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Tue, 26 Nov 2024 00:34:12 -0500 Subject: [PATCH 12/32] Pylint --- tests_e2e/tests/ext_policy/ext_policy.py | 58 ++++++++++++------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index 9de598e80..da7b752f9 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -149,30 +149,30 @@ def run(self): } self._create_policy_file(policy) self._operation_should_succeed("enable", custom_script) - # self._operation_should_fail("enable", run_command) - # if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): - # self._operation_should_fail("enable", azure_monitor) - # - # # When allowlist is turned off, all extensions should be processed. - # # RunCommand and AzureMonitorLinuxAgent should be successfully enabled and then deleted. 
- # policy = \ - # { - # "policyVersion": "0.1.0", - # "extensionPolicies": { - # "allowListedExtensionsOnly": False, - # "signatureRequired": False, - # "extensions": {} - # } - # } - # self._create_policy_file(policy) - # self._operation_should_succeed("enable", run_command) - # self._operation_should_succeed("delete", run_command) - # if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): - # self._operation_should_succeed("enable", azure_monitor) - # self._operation_should_succeed("delete", azure_monitor) - # - # # Should not uninstall disallowed extensions. - # # CustomScript is removed from the allowlist: delete operation should fail. + self._operation_should_fail("enable", run_command) + if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): + self._operation_should_fail("enable", azure_monitor) + + # When allowlist is turned off, all extensions should be processed. + # RunCommand and AzureMonitorLinuxAgent should be successfully enabled and then deleted. + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": False, + "signatureRequired": False, + "extensions": {} + } + } + self._create_policy_file(policy) + self._operation_should_succeed("enable", run_command) + self._operation_should_succeed("delete", run_command) + if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): + self._operation_should_succeed("enable", azure_monitor) + self._operation_should_succeed("delete", azure_monitor) + + # Should not uninstall disallowed extensions. + # CustomScript is removed from the allowlist: delete operation should fail. policy = \ { "policyVersion": "0.1.0", @@ -186,11 +186,11 @@ def run(self): # # Known CRP issue - delete/uninstall operation times out instead of reporting an error. 
# # TODO: uncomment this test case after issue is resolved # # self._operation_should_fail("delete", custom_script) - # - # # If a multiconfig extension is disallowed, no instances should be processed. - # # RunCommand is not allowed - if we try to enable two instances, both should fail fast. - # self._operation_should_fail("enable", run_command) - # self._operation_should_fail("enable", run_command_2) + + # If a multiconfig extension is disallowed, no instances should be processed. + # RunCommand is not allowed - if we try to enable two instances, both should fail fast. + self._operation_should_fail("enable", run_command) + self._operation_should_fail("enable", run_command_2) # If single-config extension is initially blocked, and policy is updated to allow it, extension should be # successfully enabled and report status correctly. From 8ea989b0ac2846b869b9096ec49af145836051ff Mon Sep 17 00:00:00 2001 From: mgunnala Date: Tue, 3 Dec 2024 11:47:56 -0500 Subject: [PATCH 13/32] Address review comments for agent code --- azurelinuxagent/ga/exthandlers.py | 93 +++++++++++----------- azurelinuxagent/ga/policy/policy_engine.py | 12 +-- tests/ga/test_extension.py | 10 +-- tests/ga/test_multi_config_extension.py | 2 +- 4 files changed, 55 insertions(+), 62 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 1d4a7273c..07792096a 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -39,7 +39,7 @@ from azurelinuxagent.common.agent_supported_feature import get_agent_supported_features_list_for_extensions, \ SupportedFeatureNames, get_supported_feature_by_name, get_agent_supported_features_list_for_crp from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator -from azurelinuxagent.ga.policy.policy_engine import ExtensionPolicyEngine, ExtensionPolicyError +from azurelinuxagent.ga.policy.policy_engine import ExtensionPolicyEngine from azurelinuxagent.common.datacontract import 
get_properties, set_properties from azurelinuxagent.common.errorstate import ErrorState from azurelinuxagent.common.event import add_event, elapsed_milliseconds, WALAEventOperation, \ @@ -87,6 +87,26 @@ # This is the default sequence number we use when there are no settings available for Handlers _DEFAULT_SEQ_NO = "0" +# For policy-related errors, this mapping is used to generate user-friendly error messages and determine the appropriate +# terminal error code based on the blocked operation. +# Format: {ExtensionRequestedState: (operation_name, ExtensionErrorCodes)} +# - The first element of the tuple is a user-friendly operation name included in error messages. +# - The second element of the tuple is the CRP terminal error code for the operation. +_POLICY_ERROR_MAP = \ + { + ExtensionRequestedState.Enabled: ('run', ExtensionErrorCodes.PluginEnableProcessingFailed), + # Note: currently, when uninstall is requested for an extension, CRP polls until the agent does not + # report status for that extension, or until timeout is reached. In the case of a policy error, the + # agent reports failed status on behalf of the extension, which will cause CRP to poll for the full + # timeout, instead of failing fast. + # + # TODO: CRP does not currently have a terminal error code for uninstall. Once this code is added, use + # it instead of PluginDisableProcessingFailed below. + ExtensionRequestedState.Uninstall: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed), + # "Disable" is an internal operation, users are unaware of it. We surface the term "uninstall" instead. + ExtensionRequestedState.Disabled: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed), + } + class ExtHandlerStatusValue(object): """ @@ -496,37 +516,6 @@ def handle_ext_handlers(self, goal_state_id): handler_i = ExtHandlerInstance(ext_handler, self.protocol, extension=extension) - # Invoke policy engine to determine if extension is allowed. If not, block extension and report error on - # behalf of the extension.
- policy_err_map = { - ExtensionRequestedState.Enabled: ('enable', ExtensionErrorCodes.PluginEnableProcessingFailed), - # Note: currently, when uninstall is requested for an extension, CRP polls until the agent does not - # report status for that extension, or until timeout is reached. In the case of a policy error, the - # agent reports failed status on behalf of the extension, which will cause CRP to poll for the full - # timeout, instead of failing fast. - # - # TODO: CRP does not currently have a terminal error code for uninstall. Once this code is added, use - # it instead of PluginDisableProcessingFailed below. - ExtensionRequestedState.Uninstall: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed), - ExtensionRequestedState.Disabled: ('disable', ExtensionErrorCodes.PluginDisableProcessingFailed), - } - policy_op, policy_err_code = policy_err_map.get(ext_handler.state) - if policy_error is not None: - err = ExtensionPolicyError(msg="", inner=policy_error, code=policy_err_code) - self.__handle_and_report_policy_error(handler_i, err, report_op=handler_i.operation, message=ustr(err), - extension=extension, report=True) - continue - - extension_allowed = policy_engine.should_allow_extension(ext_handler.name) - if not extension_allowed: - msg = "failed to {0} extension '{1}' because extension is not specified in allowlist. To {0}, " \ - "add extension to the allowed list in the policy file ('{2}').".format(policy_op, - ext_handler.name, - conf.get_policy_file_path()) - err = ExtensionPolicyError(msg, code=policy_err_code) - self.__handle_and_report_policy_error(handler_i, err, report_op=handler_i.operation, message=ustr(err), - extension=extension, report=True) - # In case of extensions disabled, we skip processing extensions. But CRP is still waiting for some status # back for the skipped extensions. In order to propagate the status back to CRP, we will report status back # here with an error message. 
@@ -545,6 +534,24 @@ def handle_ext_handlers(self, goal_state_id): operation=handler_i.operation, message=msg) continue + # Invoke policy engine to determine if extension is allowed. If not, block extension and report error on + # behalf of the extension. + policy_op, policy_err_code = _POLICY_ERROR_MAP.get(ext_handler.state) + if policy_error is not None: + msg = "Extension will not be processed: {0}".format(ustr(policy_error)) + self.__report_policy_error(ext_handler_i=handler_i, error_code=policy_err_code, + report_op=handler_i.operation, message=msg, + extension=extension) + continue + + extension_allowed = policy_engine.should_allow_extension(ext_handler.name) + if not extension_allowed: + msg = ( + "Extension will not be processed: failed to {0} extension '{1}' because it is not specified " + "in the allowlist. To {0}, add the extension to the allowed list in the policy file ('{2}')." + ).format(policy_op, ext_handler.name, conf.get_policy_file_path()) + self.__report_policy_error(handler_i, policy_err_code, report_op=handler_i.operation, + message=msg, extension=extension) # In case of depends-on errors, we skip processing extensions if there was an error processing dependent extensions. # But CRP is still waiting for some status back for the skipped extensions. In order to propagate the status back to CRP, @@ -738,7 +745,7 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess message=message) @staticmethod - def __handle_and_report_policy_error(ext_handler_i, error, report_op, message, report=True, extension=None): + def __report_policy_error(ext_handler_i, error_code, report_op, message, extension=None): # TODO: Consider merging this function with __handle_and_report_ext_handler_errors() above, after investigating # the impact of this change. # @@ -750,18 +757,17 @@ def __handle_and_report_policy_error(ext_handler_i, error, report_op, message, r # to write a status file for single-config extensions. 
# Set handler status for all extensions (with and without settings) - ext_handler_i.set_handler_status(message=message, code=error.code) + ext_handler_i.set_handler_status(message=message, code=error_code) # Create status file for extensions with settings (single and multi config). if extension is not None: - ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code, + ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error_code, operation=report_op, message=message) - if report: - name = ext_handler_i.get_extension_full_name(extension) - handler_version = ext_handler_i.ext_handler.version - add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True, - message=message) + name = ext_handler_i.get_extension_full_name(extension) + handler_version = ext_handler_i.ext_handler.version + add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True, + message=message) def handle_enable(self, ext_handler_i, extension): """ @@ -1059,11 +1065,8 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed): # For MultiConfig, we need to report status per extension even for Handler level failures. # If we have HandlerStatus for a MultiConfig handler and GS is requesting for it, we would report status per # extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc) - # We also need to report extension status for an uninstalled handler if extensions are disabled because CRP - # waits for extension runtime status before failing the extension operation. - # In the case of policy failures, we want to report extension status with a terminal code so CRP fails fast. If - # extension status is not present, collect_ext_status() will set a default transitioning status, and CRP will - # wait for timeout. 
+ # We also need to report extension status for an uninstalled handler if extensions are disabled, or if the extension + # failed due to policy, because CRP waits for extension runtime status before failing the extension operation. if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled() or ExtensionPolicyEngine.get_policy_enforcement_enabled(): # Since we require reading the Manifest for reading the heartbeat, this would fail if HandlerManifest not found. diff --git a/azurelinuxagent/ga/policy/policy_engine.py b/azurelinuxagent/ga/policy/policy_engine.py index 336882114..202eb4dbc 100644 --- a/azurelinuxagent/ga/policy/policy_engine.py +++ b/azurelinuxagent/ga/policy/policy_engine.py @@ -22,7 +22,7 @@ from azurelinuxagent.common import logger from azurelinuxagent.common.event import WALAEventOperation, add_event from azurelinuxagent.common import conf -from azurelinuxagent.common.exception import AgentError, ExtensionError +from azurelinuxagent.common.exception import AgentError, ExtensionError, ExtensionErrorCodes from azurelinuxagent.common.protocol.extensions_goal_state_from_vm_settings import _CaseFoldedDict from azurelinuxagent.common.utils.flexible_version import FlexibleVersion @@ -44,16 +44,6 @@ def __init__(self, msg, inner=None): msg = "Customer-provided policy file ('{0}') is invalid, please correct the following error: {1}".format(conf.get_policy_file_path(), msg) super(InvalidPolicyError, self).__init__(msg, inner) - -class ExtensionPolicyError(ExtensionError): - """ - Error raised during agent extension policy enforcement. - """ - def __init__(self, msg, code, inner=None): - msg = "Extension will not be processed: {0}".format(msg) - super(ExtensionPolicyError, self).__init__(msg, inner, code) - - class _PolicyEngine(object): """ Implements base policy engine API. 
diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index 4a583cfce..865441777 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3566,7 +3566,7 @@ def test_should_fail_enable_if_extension_disallowed(self): "allowListedExtensionsOnly": True, } } - expected_msg = "failed to enable extension 'OSTCExtensions.ExampleHandlerLinux' because extension is not specified in allowlist." + expected_msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified in the allowlist." self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, expected_handler_status='NotReady', expected_status_msg=expected_msg) @@ -3589,7 +3589,7 @@ def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self): } with patch('azurelinuxagent.ga.policy.policy_engine.ExtensionPolicyEngine.__init__', side_effect=Exception("mock exception")): - expected_msg = "Extension will not be processed: \nInner error: mock exception" + expected_msg = "Extension will not be processed: mock exception" self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, expected_handler_status='NotReady', expected_status_msg=expected_msg) @@ -3604,7 +3604,7 @@ def test_should_fail_uninstall_if_extension_disallowed(self): "extensions": {} }, } - expected_msg = "failed to uninstall extension 'OSTCExtensions.ExampleHandlerLinux' because extension is not specified in allowlist." + expected_msg = "failed to uninstall extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified in the allowlist." 
self._test_policy_case(policy=policy, op=ExtensionRequestedState.Uninstall, expected_status_code=ExtensionErrorCodes.PluginDisableProcessingFailed, expected_handler_status='NotReady', expected_status_msg=expected_msg) @@ -3632,8 +3632,8 @@ def test_should_fail_enable_if_dependent_extension_disallowed(self): # dependent extension failed self._assert_handler_status(protocol.report_vm_status, "NotReady", 1, "1.0.0", expected_handler_name="OSTCExtensions.OtherExampleHandlerLinux", - expected_msg=("failed to enable extension 'OSTCExtensions.OtherExampleHandlerLinux' " - "because extension is not specified in allowlist.")) + expected_msg=("failed to run extension 'OSTCExtensions.OtherExampleHandlerLinux' " + "because it is not specified in the allowlist.")) self._assert_handler_status(protocol.report_vm_status, "NotReady", 1, "1.0.0", expected_handler_name="OSTCExtensions.ExampleHandlerLinux", diff --git a/tests/ga/test_multi_config_extension.py b/tests/ga/test_multi_config_extension.py index 93d47e919..02fb8cdad 100644 --- a/tests/ga/test_multi_config_extension.py +++ b/tests/ga/test_multi_config_extension.py @@ -677,7 +677,7 @@ def test_it_should_handle_and_report_extensions_disallowed_by_policy_properly(se mc_handlers = self._assert_and_get_handler_status(aggregate_status=protocol.aggregate_status, handler_name="OSTCExtensions.ExampleHandlerLinux", expected_count=3, status="NotReady") - msg = "failed to enable extension 'OSTCExtensions.ExampleHandlerLinux' because extension is not specified in allowlist" + msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified in the allowlist" expected_extensions = { "firstExtension": {"status": ExtensionStatusValue.error, "seq_no": 1, "message": msg}, "secondExtension": {"status": ExtensionStatusValue.error, "seq_no": 2, "message": msg}, From 83f6ff00ff2ab32edca9e7c0284d45386fe544ee Mon Sep 17 00:00:00 2001 From: mgunnala Date: Tue, 3 Dec 2024 15:12:02 -0500 Subject: [PATCH 14/32] Tests --- 
tests_e2e/test_suites/ext_policy.yml | 2 +- tests_e2e/tests/ext_policy/ext_policy.py | 286 +++++++++++++++++------ 2 files changed, 215 insertions(+), 73 deletions(-) diff --git a/tests_e2e/test_suites/ext_policy.yml b/tests_e2e/test_suites/ext_policy.yml index 15a3dc73d..bb592f17d 100644 --- a/tests_e2e/test_suites/ext_policy.yml +++ b/tests_e2e/test_suites/ext_policy.yml @@ -4,5 +4,5 @@ name: "ExtensionPolicy" tests: - "ext_policy/ext_policy.py" -images: "endorsed" +images: "mariner_2" owns_vm: true \ No newline at end of file diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index da7b752f9..4977d9dc8 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -68,7 +68,8 @@ def _operation_should_succeed(self, operation, extension_case): if instance_view_extensions is not None and any( e.name == extension_case.extension._resource_name for e in instance_view_extensions): raise Exception( - "extension {0} still in instance view after attempting to delete".format(extension_case.extension._resource_nam)) + "extension {0} still in instance view after attempting to delete".format( + extension_case.extension._resource_nam)) log.info(f"Operation '{operation}' for {extension_case.extension.__str__()} succeeded as expected.") except Exception as error: fail( @@ -94,48 +95,49 @@ def _operation_should_fail(operation, extension_case): f"was: {error}").is_true() log.info(f"{extension_case.extension.__str__()} {operation} failed as expected") - def run(self): - - # Prepare no-config, single-config, and multi-config extension to test. Extensions with settings and extensions - # without settings have different status reporting logic, so we should test all cases. 
+ def _run_extension_test_case(self, extension_id): + # Set up test case for the specified extension ID unique = str(uuid.uuid4()) test_file = f"waagent-test.{unique}" + ext_case_instances = [] - # CustomScript is a single-config extension. - custom_script = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, - resource_name="CustomScript"), - {'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"} - ) - - # RunCommandHandler is a multi-config extension, so we set up two instances (configurations) here and test both. - run_command = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, - resource_name="RunCommandHandler"), - {'source': {'script': f"echo '{unique}' > /tmp/{test_file}"}} - ) - unique2 = str(uuid.uuid4()) - test_file2 = f"waagent-test.{unique2}" - run_command_2 = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, - resource_name="RunCommandHandler2"), - {'source': {'script': f"echo '{unique2}' > /tmp/{test_file2}"}} - ) - - # AzureMonitorLinuxAgent is a no-config extension (extension without settings). 
- azure_monitor = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, - resource_name="AzureMonitorLinuxAgent"), - None - ) - - # Enable policy via conf - log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) - self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) + if extension_id == VmExtensionIds.CustomScript: + ext_case = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, + resource_name="CustomScript"), + {'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"} + ) + ext_case_instances.append(ext_case) + elif extension_id == VmExtensionIds.RunCommandHandler: + # For multiconfig extension, we want to test all behavior for more than one instance, so we set up two + # test cases, and append to ext_case_instances. + ext_case = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, + resource_name="RunCommandHandler"), + {'source': {'script': f"echo '{unique}' > /tmp/{test_file}"}} + ) + ext_case_instances.append(ext_case) - # Only allowlisted extensions should be processed. - # We only allowlist CustomScript: CustomScript should be enabled, RunCommand and AzureMonitor should fail. - # (Note that CustomScript blocked by policy is tested in a later test case.) 
+ unique2 = str(uuid.uuid4()) + test_file2 = f"waagent-test.{unique}" + ext_case_2 = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, + resource_name="RunCommandHandler2"), + {'source': {'script': f"echo '{unique2}' > /tmp/{test_file2}"}} + ) + ext_case_instances.append(ext_case_2) + elif extension_id == VmExtensionIds.AzureMonitorLinuxAgent: + # Skip test case if distro does not support AzureMonitorLinuxAgent + if not VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): + return + ext_case = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, + resource_name="AzureMonitorLinuxAgent"), + None + ) + ext_case_instances.append(ext_case) + + # Extension enable operation succeeds when extension is in the allowlist policy = \ { "policyVersion": "0.1.0", @@ -143,36 +145,31 @@ def run(self): "allowListedExtensionsOnly": True, "signatureRequired": False, "extensions": { - "Microsoft.Azure.Extensions.CustomScript": {} + str(extension_id): {} } } } self._create_policy_file(policy) - self._operation_should_succeed("enable", custom_script) - self._operation_should_fail("enable", run_command) - if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): - self._operation_should_fail("enable", azure_monitor) + for ext_case in ext_case_instances: + self._operation_should_succeed("enable", ext_case) - # When allowlist is turned off, all extensions should be processed. - # RunCommand and AzureMonitorLinuxAgent should be successfully enabled and then deleted. 
+ # Extension uninstall operation succeeds when extension is in the allowlist policy = \ { "policyVersion": "0.1.0", "extensionPolicies": { - "allowListedExtensionsOnly": False, + "allowListedExtensionsOnly": True, "signatureRequired": False, - "extensions": {} + "extensions": { + str(extension_id): {} + } } } self._create_policy_file(policy) - self._operation_should_succeed("enable", run_command) - self._operation_should_succeed("delete", run_command) - if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): - self._operation_should_succeed("enable", azure_monitor) - self._operation_should_succeed("delete", azure_monitor) - - # Should not uninstall disallowed extensions. - # CustomScript is removed from the allowlist: delete operation should fail. + for ext_case in ext_case_instances: + self._operation_should_succeed("uninstall", ext_case) + + # Extension enable operation fails when extension is disallowed (NOT in the allowlist) policy = \ { "policyVersion": "0.1.0", @@ -183,29 +180,174 @@ def run(self): } } self._create_policy_file(policy) - # # Known CRP issue - delete/uninstall operation times out instead of reporting an error. - # # TODO: uncomment this test case after issue is resolved - # # self._operation_should_fail("delete", custom_script) - - # If a multiconfig extension is disallowed, no instances should be processed. - # RunCommand is not allowed - if we try to enable two instances, both should fail fast. - self._operation_should_fail("enable", run_command) - self._operation_should_fail("enable", run_command_2) - - # If single-config extension is initially blocked, and policy is updated to allow it, extension should be - # successfully enabled and report status correctly. 
- self._operation_should_fail("enable", custom_script) + for ext_case in ext_case_instances: + self._operation_should_fail("enable", ext_case) + + # Extension enable operation succeeds when allowlist is turned off policy = \ { "policyVersion": "0.1.0", "extensionPolicies": { - "allowListedExtensionsOnly": False, - "signatureRequired": False, - "extensions": {} + "allowListedExtensionsOnly": False } } self._create_policy_file(policy) - self._operation_should_succeed("enable", custom_script) + for ext_case in ext_case_instances: + self._operation_should_succeed("enable", ext_case) + + # # Extension uninstall operation fails when extension is disallowed (NOT in the allowlist) + # policy = \ + # { + # "policyVersion": "0.1.0", + # "extensionPolicies": { + # "allowListedExtensionsOnly": True, + # "signatureRequired": False, + # "extensions": {} + # } + # } + # self._create_policy_file(policy) + # for ext_case in ext_case_instances: + # self._operation_should_fail("uninstall", ext_case) + + # Clean up extensions for next test case + policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": False + } + } + self._create_policy_file(policy) + for ext_case in ext_case_instances: + self._operation_should_succeed("uninstall", ext_case) + + def run(self): + # Enable policy on VM via conf file + log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) + self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) + + # Run test cases + ext_to_test = [ + # VmExtensionIds.CustomScript, # single-config extension + # VmExtensionIds.RunCommandHandler, # multi-config extension + VmExtensionIds.AzureMonitorLinuxAgent # no-config extension + ] + for ext in ext_to_test: + self._run_extension_test_case(ext) + + # # Prepare no-config, single-config, and multi-config extension to test. 
Extensions with settings and extensions + # # without settings have different status reporting logic, so we should test all cases. + # + # unique = str(uuid.uuid4()) + # test_file = f"waagent-test.{unique}" + # + # # CustomScript is a single-config extension. + # custom_script = ExtPolicy.TestCase( + # VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, + # resource_name="CustomScript"), + # {'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"} + # ) + # + # # RunCommandHandler is a multi-config extension, so we set up two instances (configurations) here and test both. + # run_command = ExtPolicy.TestCase( + # VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, + # resource_name="RunCommandHandler"), + # {'source': {'script': f"echo '{unique}' > /tmp/{test_file}"}} + # ) + # unique2 = str(uuid.uuid4()) + # test_file2 = f"waagent-test.{unique2}" + # run_command_2 = ExtPolicy.TestCase( + # VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, + # resource_name="RunCommandHandler2"), + # {'source': {'script': f"echo '{unique2}' > /tmp/{test_file2}"}} + # ) + # + # # AzureMonitorLinuxAgent is a no-config extension (extension without settings). + # azure_monitor = ExtPolicy.TestCase( + # VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, + # resource_name="AzureMonitorLinuxAgent"), + # None + # ) + # + # # Enable policy via conf + # log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) + # self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) + # + # # Only allowlisted extensions should be processed. + # # We only allowlist CustomScript: CustomScript should be enabled, RunCommand and AzureMonitor should fail. + # # (Note that CustomScript blocked by policy is tested in a later test case.) 
+ # policy = \ + # { + # "policyVersion": "0.1.0", + # "extensionPolicies": { + # "allowListedExtensionsOnly": True, + # "signatureRequired": False, + # "extensions": { + # "Microsoft.Azure.Extensions.CustomScript": {} + # } + # } + # } + # self._create_policy_file(policy) + # self._operation_should_succeed("enable", custom_script) + # self._operation_should_fail("enable", run_command) + # if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro( + # (self._ssh_client.run_command("get_distro.py").rstrip())): + # self._operation_should_fail("enable", azure_monitor) + # + # # When allowlist is turned off, all extensions should be processed. + # # RunCommand and AzureMonitorLinuxAgent should be successfully enabled and then deleted. + # policy = \ + # { + # "policyVersion": "0.1.0", + # "extensionPolicies": { + # "allowListedExtensionsOnly": False, + # "signatureRequired": False, + # "extensions": {} + # } + # } + # self._create_policy_file(policy) + # self._operation_should_succeed("enable", run_command) + # self._operation_should_succeed("delete", run_command) + # if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro( + # (self._ssh_client.run_command("get_distro.py").rstrip())): + # self._operation_should_succeed("enable", azure_monitor) + # self._operation_should_succeed("delete", azure_monitor) + # + # # Should not uninstall disallowed extensions. + # # CustomScript is removed from the allowlist: delete operation should fail. + # policy = \ + # { + # "policyVersion": "0.1.0", + # "extensionPolicies": { + # "allowListedExtensionsOnly": True, + # "signatureRequired": False, + # "extensions": {} + # } + # } + # self._create_policy_file(policy) + # # # Known CRP issue - delete/uninstall operation times out instead of reporting an error. + # # # TODO: uncomment this test case after issue is resolved + # # # self._operation_should_fail("delete", custom_script) + # + # # If a multiconfig extension is disallowed, no instances should be processed. 
+ # # RunCommand is not allowed - if we try to enable two instances, both should fail fast. + # self._operation_should_fail("enable", run_command) + # self._operation_should_fail("enable", run_command_2) + # + # # If single-config extension is initially blocked, and policy is updated to allow it, extension should be + # # successfully enabled and report status correctly. + # self._operation_should_fail("enable", custom_script) + # policy = \ + # { + # "policyVersion": "0.1.0", + # "extensionPolicies": { + # "allowListedExtensionsOnly": False, + # "signatureRequired": False, + # "extensions": {} + # } + # } + # self._create_policy_file(policy) + # self._operation_should_succeed("enable", custom_script) def get_ignore_error_rules(self) -> List[Dict[str, Any]]: ignore_rules = [ From b037e41b720d7d996413b9586707598ad14e9762 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Tue, 3 Dec 2024 15:22:17 -0500 Subject: [PATCH 15/32] Revert "Tests" This reverts commit 83f6ff00ff2ab32edca9e7c0284d45386fe544ee. 
--- tests_e2e/test_suites/ext_policy.yml | 2 +- tests_e2e/tests/ext_policy/ext_policy.py | 286 ++++++----------------- 2 files changed, 73 insertions(+), 215 deletions(-) diff --git a/tests_e2e/test_suites/ext_policy.yml b/tests_e2e/test_suites/ext_policy.yml index bb592f17d..15a3dc73d 100644 --- a/tests_e2e/test_suites/ext_policy.yml +++ b/tests_e2e/test_suites/ext_policy.yml @@ -4,5 +4,5 @@ name: "ExtensionPolicy" tests: - "ext_policy/ext_policy.py" -images: "mariner_2" +images: "endorsed" owns_vm: true \ No newline at end of file diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index 4977d9dc8..da7b752f9 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -68,8 +68,7 @@ def _operation_should_succeed(self, operation, extension_case): if instance_view_extensions is not None and any( e.name == extension_case.extension._resource_name for e in instance_view_extensions): raise Exception( - "extension {0} still in instance view after attempting to delete".format( - extension_case.extension._resource_nam)) + "extension {0} still in instance view after attempting to delete".format(extension_case.extension._resource_nam)) log.info(f"Operation '{operation}' for {extension_case.extension.__str__()} succeeded as expected.") except Exception as error: fail( @@ -95,49 +94,48 @@ def _operation_should_fail(operation, extension_case): f"was: {error}").is_true() log.info(f"{extension_case.extension.__str__()} {operation} failed as expected") - def _run_extension_test_case(self, extension_id): - # Set up test case for the specified extension ID + def run(self): + + # Prepare no-config, single-config, and multi-config extension to test. Extensions with settings and extensions + # without settings have different status reporting logic, so we should test all cases. 
unique = str(uuid.uuid4()) test_file = f"waagent-test.{unique}" - ext_case_instances = [] - if extension_id == VmExtensionIds.CustomScript: - ext_case = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, - resource_name="CustomScript"), - {'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"} - ) - ext_case_instances.append(ext_case) - elif extension_id == VmExtensionIds.RunCommandHandler: - # For multiconfig extension, we want to test all behavior for more than one instance, so we set up two - # test cases, and append to ext_case_instances. - ext_case = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, - resource_name="RunCommandHandler"), - {'source': {'script': f"echo '{unique}' > /tmp/{test_file}"}} - ) - ext_case_instances.append(ext_case) - - unique2 = str(uuid.uuid4()) - test_file2 = f"waagent-test.{unique}" - ext_case_2 = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, - resource_name="RunCommandHandler2"), - {'source': {'script': f"echo '{unique2}' > /tmp/{test_file2}"}} - ) - ext_case_instances.append(ext_case_2) - elif extension_id == VmExtensionIds.AzureMonitorLinuxAgent: - # Skip test case if distro does not support AzureMonitorLinuxAgent - if not VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): - return - ext_case = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, - resource_name="AzureMonitorLinuxAgent"), - None - ) - ext_case_instances.append(ext_case) + # CustomScript is a single-config extension. 
+ custom_script = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, + resource_name="CustomScript"), + {'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"} + ) + + # RunCommandHandler is a multi-config extension, so we set up two instances (configurations) here and test both. + run_command = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, + resource_name="RunCommandHandler"), + {'source': {'script': f"echo '{unique}' > /tmp/{test_file}"}} + ) + unique2 = str(uuid.uuid4()) + test_file2 = f"waagent-test.{unique2}" + run_command_2 = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, + resource_name="RunCommandHandler2"), + {'source': {'script': f"echo '{unique2}' > /tmp/{test_file2}"}} + ) + + # AzureMonitorLinuxAgent is a no-config extension (extension without settings). + azure_monitor = ExtPolicy.TestCase( + VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, + resource_name="AzureMonitorLinuxAgent"), + None + ) + + # Enable policy via conf + log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) + self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) - # Extension enable operation succeeds when extension is in the allowlist + # Only allowlisted extensions should be processed. + # We only allowlist CustomScript: CustomScript should be enabled, RunCommand and AzureMonitor should fail. + # (Note that CustomScript blocked by policy is tested in a later test case.) 
policy = \ { "policyVersion": "0.1.0", @@ -145,31 +143,36 @@ def _run_extension_test_case(self, extension_id): "allowListedExtensionsOnly": True, "signatureRequired": False, "extensions": { - str(extension_id): {} + "Microsoft.Azure.Extensions.CustomScript": {} } } } self._create_policy_file(policy) - for ext_case in ext_case_instances: - self._operation_should_succeed("enable", ext_case) + self._operation_should_succeed("enable", custom_script) + self._operation_should_fail("enable", run_command) + if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): + self._operation_should_fail("enable", azure_monitor) - # Extension uninstall operation succeeds when extension is in the allowlist + # When allowlist is turned off, all extensions should be processed. + # RunCommand and AzureMonitorLinuxAgent should be successfully enabled and then deleted. policy = \ { "policyVersion": "0.1.0", "extensionPolicies": { - "allowListedExtensionsOnly": True, + "allowListedExtensionsOnly": False, "signatureRequired": False, - "extensions": { - str(extension_id): {} - } + "extensions": {} } } self._create_policy_file(policy) - for ext_case in ext_case_instances: - self._operation_should_succeed("uninstall", ext_case) - - # Extension enable operation fails when extension is disallowed (NOT in the allowlist) + self._operation_should_succeed("enable", run_command) + self._operation_should_succeed("delete", run_command) + if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): + self._operation_should_succeed("enable", azure_monitor) + self._operation_should_succeed("delete", azure_monitor) + + # Should not uninstall disallowed extensions. + # CustomScript is removed from the allowlist: delete operation should fail. 
policy = \ { "policyVersion": "0.1.0", @@ -180,174 +183,29 @@ def _run_extension_test_case(self, extension_id): } } self._create_policy_file(policy) - for ext_case in ext_case_instances: - self._operation_should_fail("enable", ext_case) - - # Extension enable operation succeeds when allowlist is turned off + # # Known CRP issue - delete/uninstall operation times out instead of reporting an error. + # # TODO: uncomment this test case after issue is resolved + # # self._operation_should_fail("delete", custom_script) + + # If a multiconfig extension is disallowed, no instances should be processed. + # RunCommand is not allowed - if we try to enable two instances, both should fail fast. + self._operation_should_fail("enable", run_command) + self._operation_should_fail("enable", run_command_2) + + # If single-config extension is initially blocked, and policy is updated to allow it, extension should be + # successfully enabled and report status correctly. + self._operation_should_fail("enable", custom_script) policy = \ { "policyVersion": "0.1.0", "extensionPolicies": { - "allowListedExtensionsOnly": False - } - } - self._create_policy_file(policy) - for ext_case in ext_case_instances: - self._operation_should_succeed("enable", ext_case) - - # # Extension uninstall operation fails when extension is disallowed (NOT in the allowlist) - # policy = \ - # { - # "policyVersion": "0.1.0", - # "extensionPolicies": { - # "allowListedExtensionsOnly": True, - # "signatureRequired": False, - # "extensions": {} - # } - # } - # self._create_policy_file(policy) - # for ext_case in ext_case_instances: - # self._operation_should_fail("uninstall", ext_case) - - # Clean up extensions for next test case - policy = \ - { - "policyVersion": "0.1.0", - "extensionPolicies": { - "allowListedExtensionsOnly": False + "allowListedExtensionsOnly": False, + "signatureRequired": False, + "extensions": {} } } self._create_policy_file(policy) - for ext_case in ext_case_instances: - 
self._operation_should_succeed("uninstall", ext_case) - - def run(self): - # Enable policy on VM via conf file - log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) - self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) - - # Run test cases - ext_to_test = [ - # VmExtensionIds.CustomScript, # single-config extension - # VmExtensionIds.RunCommandHandler, # multi-config extension - VmExtensionIds.AzureMonitorLinuxAgent # no-config extension - ] - for ext in ext_to_test: - self._run_extension_test_case(ext) - - # # Prepare no-config, single-config, and multi-config extension to test. Extensions with settings and extensions - # # without settings have different status reporting logic, so we should test all cases. - # - # unique = str(uuid.uuid4()) - # test_file = f"waagent-test.{unique}" - # - # # CustomScript is a single-config extension. - # custom_script = ExtPolicy.TestCase( - # VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, - # resource_name="CustomScript"), - # {'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"} - # ) - # - # # RunCommandHandler is a multi-config extension, so we set up two instances (configurations) here and test both. - # run_command = ExtPolicy.TestCase( - # VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, - # resource_name="RunCommandHandler"), - # {'source': {'script': f"echo '{unique}' > /tmp/{test_file}"}} - # ) - # unique2 = str(uuid.uuid4()) - # test_file2 = f"waagent-test.{unique2}" - # run_command_2 = ExtPolicy.TestCase( - # VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, - # resource_name="RunCommandHandler2"), - # {'source': {'script': f"echo '{unique2}' > /tmp/{test_file2}"}} - # ) - # - # # AzureMonitorLinuxAgent is a no-config extension (extension without settings). 
- # azure_monitor = ExtPolicy.TestCase( - # VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, - # resource_name="AzureMonitorLinuxAgent"), - # None - # ) - # - # # Enable policy via conf - # log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) - # self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) - # - # # Only allowlisted extensions should be processed. - # # We only allowlist CustomScript: CustomScript should be enabled, RunCommand and AzureMonitor should fail. - # # (Note that CustomScript blocked by policy is tested in a later test case.) - # policy = \ - # { - # "policyVersion": "0.1.0", - # "extensionPolicies": { - # "allowListedExtensionsOnly": True, - # "signatureRequired": False, - # "extensions": { - # "Microsoft.Azure.Extensions.CustomScript": {} - # } - # } - # } - # self._create_policy_file(policy) - # self._operation_should_succeed("enable", custom_script) - # self._operation_should_fail("enable", run_command) - # if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro( - # (self._ssh_client.run_command("get_distro.py").rstrip())): - # self._operation_should_fail("enable", azure_monitor) - # - # # When allowlist is turned off, all extensions should be processed. - # # RunCommand and AzureMonitorLinuxAgent should be successfully enabled and then deleted. 
- # policy = \ - # { - # "policyVersion": "0.1.0", - # "extensionPolicies": { - # "allowListedExtensionsOnly": False, - # "signatureRequired": False, - # "extensions": {} - # } - # } - # self._create_policy_file(policy) - # self._operation_should_succeed("enable", run_command) - # self._operation_should_succeed("delete", run_command) - # if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro( - # (self._ssh_client.run_command("get_distro.py").rstrip())): - # self._operation_should_succeed("enable", azure_monitor) - # self._operation_should_succeed("delete", azure_monitor) - # - # # Should not uninstall disallowed extensions. - # # CustomScript is removed from the allowlist: delete operation should fail. - # policy = \ - # { - # "policyVersion": "0.1.0", - # "extensionPolicies": { - # "allowListedExtensionsOnly": True, - # "signatureRequired": False, - # "extensions": {} - # } - # } - # self._create_policy_file(policy) - # # # Known CRP issue - delete/uninstall operation times out instead of reporting an error. - # # # TODO: uncomment this test case after issue is resolved - # # # self._operation_should_fail("delete", custom_script) - # - # # If a multiconfig extension is disallowed, no instances should be processed. - # # RunCommand is not allowed - if we try to enable two instances, both should fail fast. - # self._operation_should_fail("enable", run_command) - # self._operation_should_fail("enable", run_command_2) - # - # # If single-config extension is initially blocked, and policy is updated to allow it, extension should be - # # successfully enabled and report status correctly. 
- # self._operation_should_fail("enable", custom_script) - # policy = \ - # { - # "policyVersion": "0.1.0", - # "extensionPolicies": { - # "allowListedExtensionsOnly": False, - # "signatureRequired": False, - # "extensions": {} - # } - # } - # self._create_policy_file(policy) - # self._operation_should_succeed("enable", custom_script) + self._operation_should_succeed("enable", custom_script) def get_ignore_error_rules(self) -> List[Dict[str, Any]]: ignore_rules = [ From ba3869c426c8056a9f4934631efc2af798048ab7 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Fri, 6 Dec 2024 14:41:47 -0500 Subject: [PATCH 16/32] Address test comments --- azurelinuxagent/ga/exthandlers.py | 64 ++++--- tests/ga/test_extension.py | 12 +- tests/ga/test_multi_config_extension.py | 163 +++++++++--------- tests_e2e/tests/ext_policy/ext_policy.py | 143 +++++++++------ .../ext_policy_with_dependencies.py | 31 ++-- .../ext_policy/policy_dependencies_cases.py | 54 +----- 6 files changed, 236 insertions(+), 231 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 07792096a..12a85fe18 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -528,14 +528,16 @@ def handle_ext_handlers(self, goal_state_id): logger.info("{0}: {1}".format(ext_full_name, msg)) add_event(op=WALAEventOperation.ExtensionProcessing, message="{0}: {1}".format(ext_full_name, msg)) handler_i.set_handler_status(status=ExtHandlerStatusValue.not_ready, message=msg, code=-1) - handler_i.create_status_file_if_not_exist(extension, - status=ExtensionStatusValue.error, - code=-1, - operation=handler_i.operation, - message=msg) + handler_i.create_status_file(extension, + status=ExtensionStatusValue.error, + code=-1, + operation=handler_i.operation, + message=msg) continue - # Invoke policy engine to determine if extension is allowed. If not, block extension and report error on - # behalf of the extension. 
+ + # If an error was thrown during policy engine initialization, skip further processing of the extension. + # CRP is still waiting for status, so we report error status here. + # of the extension. policy_op, policy_err_code = _POLICY_ERROR_MAP.get(ext_handler.state) if policy_error is not None: msg = "Extension will not be processed: {0}".format(ustr(policy_error)) @@ -544,15 +546,6 @@ def handle_ext_handlers(self, goal_state_id): extension=extension) continue - extension_allowed = policy_engine.should_allow_extension(ext_handler.name) - if not extension_allowed: - msg = ( - "Extension will not be processed: failed to {0} extension '{1}' because it is not specified " - "in the allowlist. To {0}, add the extension to the allowed list in the policy file ('{2}')." - ).format(policy_op, ext_handler.name, conf.get_policy_file_path()) - self.__report_policy_error(handler_i, policy_err_code, report_op=handler_i.operation, - message=msg, extension=extension) - # In case of depends-on errors, we skip processing extensions if there was an error processing dependent extensions. # But CRP is still waiting for some status back for the skipped extensions. In order to propagate the status back to CRP, # we will report status back here with the relevant error message for each of the dependent extension. 
@@ -564,9 +557,9 @@ def handle_ext_handlers(self, goal_state_id): if handler_i.get_handler_status() is None: handler_i.set_handler_status(message=depends_on_err_msg, code=-1) - handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=-1, - operation=WALAEventOperation.ExtensionProcessing, - message=depends_on_err_msg) + handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=-1, + operation=WALAEventOperation.ExtensionProcessing, + message=depends_on_err_msg) # For SC extensions, overwrite the HandlerStatus with the relevant message else: @@ -574,6 +567,17 @@ def handle_ext_handlers(self, goal_state_id): continue + # Invoke policy engine to determine if extension is allowed. If disallowed, report an error on behalf of + # the extension and do not process the extension. Dependent extensions will also be blocked. + extension_allowed = policy_engine.should_allow_extension(ext_handler.name) + if not extension_allowed: + msg = ( + "Extension will not be processed: failed to {0} extension '{1}' because it is not specified " + "in the allowlist. To {0}, add the extension to the allowed list in the policy file ('{2}')." + ).format(policy_op, ext_handler.name, conf.get_policy_file_path()) + self.__report_policy_error(handler_i, policy_err_code, report_op=handler_i.operation, + message=msg, extension=extension) + # Process extensions and get if it was successfully executed or not # If extension was blocked by policy, treat the extension as failed and do not process the handler. if not extension_allowed: @@ -694,8 +698,8 @@ def handle_ext_handler(self, ext_handler_i, extension, goal_state_id): # This error is only thrown for enable operation on MultiConfig extension. # Since these are maintained by the extensions, the expectation here is that they would update their status files appropriately with their errors. 
# The extensions should already have a placeholder status file, but incase they dont, setting one here to fail fast. - ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code, - operation=ext_handler_i.operation, message=err_msg) + ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error.code, + operation=ext_handler_i.operation, message=err_msg) add_event(name=ext_name, version=ext_handler_i.ext_handler.version, op=ext_handler_i.operation, is_success=False, log_event=True, message=err_msg) except ExtensionsGoalStateError as error: @@ -735,8 +739,8 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess # file with failure since the extensions wont be called where they can create their status files. # This way we guarantee reporting back to CRP if ext_handler_i.should_perform_multi_config_op(extension): - ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code, - operation=report_op, message=message) + ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error.code, + operation=report_op, message=message) if report: name = ext_handler_i.get_extension_full_name(extension) @@ -760,9 +764,11 @@ def __report_policy_error(ext_handler_i, error_code, report_op, message, extensi ext_handler_i.set_handler_status(message=message, code=error_code) # Create status file for extensions with settings (single and multi config). + # If status file already exists, overwrite it. If an extension was previously reporting status and is now + # blocked by a policy error, we should report the policy error. 
if extension is not None: - ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error_code, - operation=report_op, message=message) + ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error_code, + operation=report_op, message=message, overwrite=True) name = ext_handler_i.get_extension_full_name(extension) handler_version = ext_handler_i.ext_handler.version @@ -1067,7 +1073,7 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed): # extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc) # We also need to report extension status for an uninstalled handler if extensions are disabled, or if the extension # failed due to policy, because CRP waits for extension runtime status before failing the extension operation. - if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled() or ExtensionPolicyEngine.get_policy_enforcement_enabled(): + if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled(): # Since we require reading the Manifest for reading the heartbeat, this would fail if HandlerManifest not found. # Only try to read heartbeat if HandlerState != NotInstalled. @@ -1420,9 +1426,11 @@ def set_extension_resource_limits(self): extension_name=extension_name, cpu_quota=resource_limits.get_extension_slice_cpu_quota()) CGroupConfigurator.get_instance().set_extension_services_cpu_memory_quota(resource_limits.get_service_list()) - def create_status_file_if_not_exist(self, extension, status, code, operation, message): + def create_status_file(self, extension, status, code, operation, message, overwrite=False): + # Create status file for specified extension. If overwrite is true, overwrite any existing status file. If + # false, create a status file only if it does not already exist. 
_, status_path = self.get_status_file_path(extension) - if status_path is not None and not os.path.exists(status_path): + if status_path is not None and (overwrite or not os.path.exists(status_path)): now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") status_contents = [ { diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index 865441777..902c069c7 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3536,7 +3536,7 @@ def _create_policy_file(self, policy): policy_file.write(policy) policy_file.flush() - def _test_policy_case(self, policy, op, expected_status_code, expected_handler_status, expected_ext_count=1, + def _test_policy_case(self, policy, op, expected_status_code, expected_handler_status, expected_ext_count=0, expected_status_msg=None): with mock_wire_protocol(wire_protocol_data.DATA_FILE) as protocol: @@ -3630,13 +3630,13 @@ def test_should_fail_enable_if_dependent_extension_disallowed(self): # OtherExampleHandlerLinux should be disallowed by policy, ExampleHandlerLinux should be skipped because # dependent extension failed - self._assert_handler_status(protocol.report_vm_status, "NotReady", 1, "1.0.0", - expected_handler_name="OSTCExtensions.OtherExampleHandlerLinux", + self._assert_handler_status(protocol.report_vm_status, expected_status="NotReady", expected_ext_count=0, + version="1.0.0", expected_handler_name="OSTCExtensions.OtherExampleHandlerLinux", expected_msg=("failed to run extension 'OSTCExtensions.OtherExampleHandlerLinux' " "because it is not specified in the allowlist.")) - self._assert_handler_status(protocol.report_vm_status, "NotReady", 1, "1.0.0", - expected_handler_name="OSTCExtensions.ExampleHandlerLinux", + self._assert_handler_status(protocol.report_vm_status, expected_status="NotReady", expected_ext_count=0, + version="1.0.0", expected_handler_name="OSTCExtensions.ExampleHandlerLinux", expected_msg="Skipping processing of extensions since execution of dependent " "extension 
OSTCExtensions.OtherExampleHandlerLinux failed") @@ -3669,7 +3669,7 @@ def test_enable_should_succeed_if_extension_allowed(self): ] for policy in policy_cases: self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=0, - expected_handler_status='Ready') + expected_handler_status='Ready', expected_ext_count=1) def test_uninstall_should_succeed_if_extension_allowed(self): policy_cases = [ diff --git a/tests/ga/test_multi_config_extension.py b/tests/ga/test_multi_config_extension.py index 02fb8cdad..8ce38441e 100644 --- a/tests/ga/test_multi_config_extension.py +++ b/tests/ga/test_multi_config_extension.py @@ -630,97 +630,94 @@ def test_it_should_handle_and_report_enable_errors_properly(self): } self._assert_extension_status(sc_handler, expected_extensions) - def test_it_should_handle_and_report_extensions_disallowed_by_policy_properly(self): + def test_it_should_report_failed_status_for_extensions_disallowed_by_policy(self): """If multiconfig extension is disallowed by policy, all instances should be blocked.""" policy_path = os.path.join(self.tmp_dir, "waagent_policy.json") - patch('azurelinuxagent.common.conf.get_policy_file_path', return_value=str(policy_path)).start() - patch('azurelinuxagent.ga.policy.policy_engine.conf.get_extension_policy_enabled', - return_value=True).start() - policy = \ - { - "policyVersion": "0.0.1", - "extensionPolicies": { - "allowListedExtensionsOnly": True, - "signatureRequired": True, - "extensions": { - "Microsoft.Powershell.ExampleExtension": {} + with patch('azurelinuxagent.common.conf.get_policy_file_path', return_value=str(policy_path)): + with patch('azurelinuxagent.ga.policy.policy_engine.conf.get_extension_policy_enabled', return_value=True): + policy = \ + { + "policyVersion": "0.0.1", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "signatureRequired": True, + "extensions": { + "Microsoft.Powershell.ExampleExtension": {} + } + } } - } - } - with open(policy_path, 
mode='w') as policy_file: - json.dump(policy, policy_file, indent=4) - policy_file.flush() - self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, - "ext_conf_multi_config_no_dependencies.xml") - with self._setup_test_env() as (exthandlers_handler, protocol, no_of_extensions): - disallowed_mc_1 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux.firstExtension", - supports_multiple_extensions=True) - disallowed_mc_2 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux.secondExtension", - supports_multiple_extensions=True) - disallowed_mc_3 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux.thirdExtension", - supports_multiple_extensions=True) - allowed_ext = extension_emulator(name="Microsoft.Powershell.ExampleExtension") - with enable_invocations(disallowed_mc_1, disallowed_mc_2, disallowed_mc_3, - allowed_ext) as invocation_record: - exthandlers_handler.run() - exthandlers_handler.report_ext_handlers_status() - self.assertEqual(no_of_extensions, - len(protocol.aggregate_status['aggregateStatus']['handlerAggregateStatus']), - "incorrect extensions reported") - - # We should only enable the allowed extension, no instances of the multiconfig extension should be enabled - invocation_record.compare( - (allowed_ext, ExtensionCommandNames.INSTALL), - (allowed_ext, ExtensionCommandNames.ENABLE) - ) - - mc_handlers = self._assert_and_get_handler_status(aggregate_status=protocol.aggregate_status, - handler_name="OSTCExtensions.ExampleHandlerLinux", - expected_count=3, status="NotReady") - msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified in the allowlist" - expected_extensions = { - "firstExtension": {"status": ExtensionStatusValue.error, "seq_no": 1, "message": msg}, - "secondExtension": {"status": ExtensionStatusValue.error, "seq_no": 2, "message": msg}, - "thirdExtension": {"status": ExtensionStatusValue.error, "seq_no": 3, "message": msg}, - } - 
self._assert_extension_status(mc_handlers, expected_extensions, multi_config=True) - - sc_handler = self._assert_and_get_handler_status(aggregate_status=protocol.aggregate_status, - handler_name="Microsoft.Powershell.ExampleExtension", - status="Ready", message=None) - expected_extensions = { - "Microsoft.Powershell.ExampleExtension": {"status": ExtensionStatusValue.success, "seq_no": 9, - "message": None} - } - self._assert_extension_status(sc_handler, expected_extensions) - + with open(policy_path, mode='w') as policy_file: + json.dump(policy, policy_file, indent=4) + policy_file.flush() + self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, + "ext_conf_multi_config_no_dependencies.xml") + with self._setup_test_env() as (exthandlers_handler, protocol, no_of_extensions): + disallowed_mc_1 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux.firstExtension", + supports_multiple_extensions=True) + disallowed_mc_2 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux.secondExtension", + supports_multiple_extensions=True) + disallowed_mc_3 = extension_emulator(name="OSTCExtensions.ExampleHandlerLinux.thirdExtension", + supports_multiple_extensions=True) + allowed_ext = extension_emulator(name="Microsoft.Powershell.ExampleExtension") + with enable_invocations(disallowed_mc_1, disallowed_mc_2, disallowed_mc_3, + allowed_ext) as invocation_record: + exthandlers_handler.run() + exthandlers_handler.report_ext_handlers_status() + self.assertEqual(no_of_extensions, + len(protocol.aggregate_status['aggregateStatus']['handlerAggregateStatus']), + "incorrect extensions reported") + + # We should only enable the allowed extension, no instances of the multiconfig extension should be enabled + invocation_record.compare( + (allowed_ext, ExtensionCommandNames.INSTALL), + (allowed_ext, ExtensionCommandNames.ENABLE) + ) + + mc_handlers = self._assert_and_get_handler_status(aggregate_status=protocol.aggregate_status, + 
handler_name="OSTCExtensions.ExampleHandlerLinux", + expected_count=3, status="NotReady") + msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified in the allowlist" + expected_extensions = { + "firstExtension": {"status": ExtensionStatusValue.error, "seq_no": 1, "message": msg}, + "secondExtension": {"status": ExtensionStatusValue.error, "seq_no": 2, "message": msg}, + "thirdExtension": {"status": ExtensionStatusValue.error, "seq_no": 3, "message": msg}, + } + self._assert_extension_status(mc_handlers, expected_extensions, multi_config=True) + + sc_handler = self._assert_and_get_handler_status(aggregate_status=protocol.aggregate_status, + handler_name="Microsoft.Powershell.ExampleExtension", + status="Ready", message=None) + expected_extensions = { + "Microsoft.Powershell.ExampleExtension": {"status": ExtensionStatusValue.success, "seq_no": 9, + "message": None} + } + self._assert_extension_status(sc_handler, expected_extensions) - def test_it_should_handle_and_report_extensions_allowed_by_policy_properly(self): + def test_it_should_report_successful_status_for_extensions_allowed_by_policy(self): """If multiconfig extension is allowed by policy, all instances should be allowed.""" policy_path = os.path.join(self.tmp_dir, "waagent_policy.json") - patch('azurelinuxagent.common.conf.get_policy_file_path', return_value=str(policy_path)).start() - patch('azurelinuxagent.ga.policy.policy_engine.conf.get_extension_policy_enabled', - return_value=True).start() - policy = \ - { - "policyVersion": "0.0.1", - "extensionPolicies": { - "allowListedExtensionsOnly": True, - "signatureRequired": True, - "extensions": { - "OSTCExtensions.ExampleHandlerLinux": {}, - "Microsoft.Powershell.ExampleExtension": {} + with patch('azurelinuxagent.common.conf.get_policy_file_path', return_value=str(policy_path)): + with patch('azurelinuxagent.ga.policy.policy_engine.conf.get_extension_policy_enabled', return_value=True): + policy = \ + { + 
"policyVersion": "0.0.1", + "extensionPolicies": { + "allowListedExtensionsOnly": True, + "signatureRequired": True, + "extensions": { + "OSTCExtensions.ExampleHandlerLinux": {}, + "Microsoft.Powershell.ExampleExtension": {} + } + } } - } - } - with open(policy_path, mode='w') as policy_file: - json.dump(policy, policy_file, indent=4) - policy_file.flush() + with open(policy_path, mode='w') as policy_file: + json.dump(policy, policy_file, indent=4) + policy_file.flush() - self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, - "ext_conf_multi_config_no_dependencies.xml") - with self._setup_test_env(mock_manifest=True) as (exthandlers_handler, protocol, no_of_extensions): - self.__run_and_assert_generic_case(exthandlers_handler, protocol, no_of_extensions) + self.test_data['ext_conf'] = os.path.join(self._MULTI_CONFIG_TEST_DATA, + "ext_conf_multi_config_no_dependencies.xml") + with self._setup_test_env(mock_manifest=True) as (exthandlers_handler, protocol, no_of_extensions): + self.__run_and_assert_generic_case(exthandlers_handler, protocol, no_of_extensions) def test_it_should_cleanup_extension_state_on_disable(self): diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index da7b752f9..c489c323b 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -26,12 +26,13 @@ from tests_e2e.tests.lib.logging import log from tests_e2e.tests.lib.ssh_client import SshClient from tests_e2e.tests.lib.virtual_machine_extension_client import VirtualMachineExtensionClient +from tests_e2e.tests.lib.virtual_machine_runcommand_client import VirtualMachineRunCommandClient from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext class ExtPolicy(AgentVmTest): class TestCase: - def __init__(self, extension: VirtualMachineExtensionClient, settings: Any): + def __init__(self, extension, settings: Any): self.extension = extension self.settings = settings @@ -55,12 
+56,17 @@ def _create_policy_file(self, policy): self._ssh_client.run_command(f"mv {remote_path} {policy_file_final_dest}", use_sudo=True) def _operation_should_succeed(self, operation, extension_case): + log.info("") log.info(f"Attempting to {operation} {extension_case.extension.__str__()}, expected to succeed ") # Attempt operation. If enabling, assert that the extension is present in instance view. # If deleting, assert that the extension is not present in instance view. try: if operation == "enable": - extension_case.extension.enable(settings=extension_case.settings, force_update=True, timeout=15 * 60) + # VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter. + if type(extension_case.extension) == VirtualMachineRunCommandClient: + extension_case.extension.enable(settings=extension_case.settings, timeout=15*60) + else: + extension_case.extension.enable(settings=extension_case.settings, force_update=True, timeout=15*60) extension_case.extension.assert_instance_view() elif operation == "delete": extension_case.extension.delete(timeout=15 * 60) @@ -68,7 +74,7 @@ def _operation_should_succeed(self, operation, extension_case): if instance_view_extensions is not None and any( e.name == extension_case.extension._resource_name for e in instance_view_extensions): raise Exception( - "extension {0} still in instance view after attempting to delete".format(extension_case.extension._resource_nam)) + "extension {0} still in instance view after attempting to delete".format(extension_case.extension)) log.info(f"Operation '{operation}' for {extension_case.extension.__str__()} succeeded as expected.") except Exception as error: fail( @@ -76,50 +82,75 @@ def _operation_should_succeed(self, operation, extension_case): f"Extension is allowed by policy so this operation should have completed successfully.\n" f"Error: {error}") - @staticmethod - def _operation_should_fail(operation, extension_case): - log.info(f"Attempting to 
{operation} {extension_case.extension.__str__()}, should fail fast.") - try: - timeout = (6 * 60) # Fail fast. - if operation == "enable": - extension_case.extension.enable(settings=extension_case.settings, force_update=True, timeout=timeout) - elif operation == "delete": - extension_case.extension.delete(timeout=timeout) - fail(f"The agent should have reported an error trying to {operation} {extension_case.extension.__str__()} " - f"because the extension is disallowed by policy.") - except Exception as error: - assert_that("[ExtensionPolicyError] Extension will not be processed" in str(error)) \ - .described_as( - f"Error message should communicate that extension is disallowed by policy, but actual error " - f"was: {error}").is_true() - log.info(f"{extension_case.extension.__str__()} {operation} failed as expected") + def _operation_should_fail(self, operation, extension_case): + log.info("") + log.info(f"Attempting to {operation} {extension_case.extension}, should fail fast.") + if operation == "enable": + try: + timeout = (6 * 60) # Fail fast. + # VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter. 
+ if type(extension_case.extension) == VirtualMachineRunCommandClient: + extension_case.extension.enable(settings=extension_case.settings, timeout=timeout) + else: + extension_case.extension.enable(settings=extension_case.settings, force_update=True, + timeout=timeout) + fail( + f"The agent should have reported an error trying to {operation} {extension_case.extension} " + f"because the extension is disallowed by policy.") + except Exception as error: + expected_msg = "Extension will not be processed: failed to run extension" + assert_that(expected_msg in str(error)) \ + .described_as( + f"Error message is expected to contain '{expected_msg}', but actual error message was '{error}'").is_true() + log.info(f"{extension_case.extension} {operation} failed as expected") + + elif operation == "delete": + # Delete is a best effort operation and should not fail, so CRP will wait for the full timeout instead + # of reporting an error for the operation. We set a short timeout limit, swallow the error, and + # assert that the extension is still in the instance view to confirm that the delete failed. + try: + delete_start_time = self._ssh_client.run_command("date '+%Y-%m-%d %T'").rstrip() + extension_case.extension.delete(timeout=(1 * 60)) + fail(f"The agent should have reported a timeout error when attempting to delete {extension_case.extension} " + f"because the extension is disallowed by policy.") + except TimeoutError: + log.info("Reported a timeout error when attempting to delete extension, as expected. 
Checking instance view " + "and agent log to confirm that delete operation failed.") + # Confirm that extension is still present in instance view + instance_view_extensions = self._context.vm.get_instance_view().extensions + if instance_view_extensions is not None and not any( + e.name == extension_case.extension._resource_name for e in instance_view_extensions): + fail(f"Delete operation is disallowed by policy and should have failed, but extension " + f"{extension_case.extension} is no longer present in the instance view.") + + # Confirm that expected error message is in the agent log + expected_msg = "Extension will not be processed: failed to uninstall extension" + result = self._ssh_client.run_command( + f"agent_ext_workflow-check_data_in_agent_log.py --data '{expected_msg}' --after-timestamp '{delete_start_time}'", + use_sudo=True) def run(self): # Prepare no-config, single-config, and multi-config extension to test. Extensions with settings and extensions # without settings have different status reporting logic, so we should test all cases. - unique = str(uuid.uuid4()) - test_file = f"waagent-test.{unique}" # CustomScript is a single-config extension. custom_script = ExtPolicy.TestCase( VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, resource_name="CustomScript"), - {'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"} + {'commandToExecute': f"echo '{str(uuid.uuid4())}'"} ) # RunCommandHandler is a multi-config extension, so we set up two instances (configurations) here and test both. 
run_command = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, + VirtualMachineRunCommandClient(self._context.vm, VmExtensionIds.RunCommandHandler, resource_name="RunCommandHandler"), - {'source': {'script': f"echo '{unique}' > /tmp/{test_file}"}} + {'source': f"echo '{str(uuid.uuid4())}'"} ) - unique2 = str(uuid.uuid4()) - test_file2 = f"waagent-test.{unique2}" run_command_2 = ExtPolicy.TestCase( - VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler, + VirtualMachineRunCommandClient(self._context.vm, VmExtensionIds.RunCommandHandler, resource_name="RunCommandHandler2"), - {'source': {'script': f"echo '{unique2}' > /tmp/{test_file2}"}} + {'source': f"echo '{str(uuid.uuid4())}'"} ) # AzureMonitorLinuxAgent is a no-config extension (extension without settings). @@ -129,13 +160,21 @@ def run(self): None ) + # Another e2e test may have left behind an extension we want to test here. Cleanup any leftovers so that they + # do not affect the test results. + ext_to_cleanup = [custom_script, run_command, run_command_2, azure_monitor] + for ext in ext_to_cleanup: + ext.extension.delete() + # Enable policy via conf log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) - # Only allowlisted extensions should be processed. - # We only allowlist CustomScript: CustomScript should be enabled, RunCommand and AzureMonitor should fail. - # (Note that CustomScript blocked by policy is tested in a later test case.) 
+ # This policy tests the following scenarios: + # - enable a single-config extension (CustomScript) that is allowed by policy -> should succeed + # - enable a no-config extension (AzureMonitorLinuxAgent) that is disallowed by policy -> should fail fast + # - enable two instances of a multi-config extension (RunCommandHandler) that is disallowed by policy -> both should fail fast + # (Note that CustomScript disallowed by policy is tested in a later test case.) policy = \ { "policyVersion": "0.1.0", @@ -150,11 +189,15 @@ def run(self): self._create_policy_file(policy) self._operation_should_succeed("enable", custom_script) self._operation_should_fail("enable", run_command) + self._operation_should_fail("enable", run_command_2) if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): self._operation_should_fail("enable", azure_monitor) - # When allowlist is turned off, all extensions should be processed. - # RunCommand and AzureMonitorLinuxAgent should be successfully enabled and then deleted. 
+ # This policy tests the following scenarios: + # - enable two instances of a multi-config extension (RunCommandHandler) when allowed by policy -> should succeed + # - delete two instances of a multi-config extension (RunCommandHandler) when allowed by policy -> should succeed + # - enable no-config extension (AzureMonitorLinuxAgent) when allowed by policy -> should succeed + # - delete no-config extension (AzureMonitorLinuxAgent) when allowed by policy -> should succeed policy = \ { "policyVersion": "0.1.0", @@ -165,14 +208,21 @@ def run(self): } } self._create_policy_file(policy) + # Update settings to force an update to the seq no + run_command.settings = {'source': f"echo '{str(uuid.uuid4())}'"} + run_command_2.settings = {'source': f"echo '{str(uuid.uuid4())}'"} self._operation_should_succeed("enable", run_command) + self._operation_should_succeed("enable", run_command_2) self._operation_should_succeed("delete", run_command) + self._operation_should_succeed("delete", run_command_2) if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): self._operation_should_succeed("enable", azure_monitor) self._operation_should_succeed("delete", azure_monitor) - # Should not uninstall disallowed extensions. - # CustomScript is removed from the allowlist: delete operation should fail. + # This policy tests the following scenarios: + # - disallow a previously-enabled single-config extension (CustomScript), then delete -> should fail fast + # - enable two instances of a multi-config extension (RunCommandHandler) when disallowed by policy -> should fail fast + # - enable single-config extension (CustomScript) when disallowed by policy -> should fail fast policy = \ { "policyVersion": "0.1.0", @@ -183,18 +233,16 @@ def run(self): } } self._create_policy_file(policy) - # # Known CRP issue - delete/uninstall operation times out instead of reporting an error. 
- # # TODO: uncomment this test case after issue is resolved - # # self._operation_should_fail("delete", custom_script) + self._operation_should_fail("delete", custom_script) # If a multiconfig extension is disallowed, no instances should be processed. # RunCommand is not allowed - if we try to enable two instances, both should fail fast. self._operation_should_fail("enable", run_command) self._operation_should_fail("enable", run_command_2) - - # If single-config extension is initially blocked, and policy is updated to allow it, extension should be - # successfully enabled and report status correctly. self._operation_should_fail("enable", custom_script) + + # This policy tests the following scenario: + # - allow a previously-disallowed single-config extension (CSE), then enable -> should succeed policy = \ { "policyVersion": "0.1.0", @@ -209,17 +257,6 @@ def run(self): def get_ignore_error_rules(self) -> List[Dict[str, Any]]: ignore_rules = [ - # - # 2024-10-23T17:50:38.107793Z WARNING ExtHandler ExtHandler Dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed or timed out, will skip processing the rest of the extensions - # We intentionally block extensions with policy and expect any dependent extensions to be skipped - { - 'message': r"Dependent extension .* failed or timed out, will skip processing the rest of the extensions" - }, - # 2024-10-23T18:01:32.247341Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=ExtensionProcessing, message=Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed, duration=0 - # We intentionally block extensions with policy and expect any dependent extensions to be skipped - { - 'message': r"Skipping processing of extensions since execution of dependent extension .* failed" - }, # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, 
op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 # We intentionally block extensions with policy and expect this failure message { diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py index 403734b23..739432fcd 100644 --- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -54,7 +54,7 @@ def __init__(self, context: AgentVmTestContext): _test_cases = [ _should_fail_single_config_depends_on_disallowed_no_config, _should_fail_single_config_depends_on_disallowed_single_config, - # TODO: RunCommand is unable to be installed properly, so these tests are currently disabled. Investigate the + # TODO: RunCommandHandler is unable to be uninstalled properly, so these tests are currently disabled. Investigate the # issue and enable these 3 tests. # _should_fail_single_config_depends_on_disallowed_multi_config, # _should_fail_multi_config_depends_on_disallowed_single_config, @@ -82,6 +82,14 @@ def run(self): ssh_clients: Dict[str, SshClient] = {} for instance in instances_ip_address: ssh_clients[instance.instance_name] = SshClient(ip_address=instance.ip_address, username=self._context.username, identity_file=self._context.identity_file) + + # Cleanup any extensions left behind by other tests, as they may be blocked by policy and erroneously cause failures. + instance_view_ext = self._context.vmss.get_instance_view().extensions + if instance_view_ext is not None and len(instance_view_ext) > 0: + for ex in instance_view_ext: + self._context.vmss.delete_extension(ex.name) + + # Enable policy via conf file. 
for ssh_client in ssh_clients.values(): ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) @@ -121,7 +129,6 @@ def run(self): # to generate a new sequence number each time test_guid = str(uuid.uuid4()) policy, extensions, expected_errors, deletion_order = case() - for ext in extensions: ext["properties"].update({ "forceUpdateTag": test_guid @@ -174,23 +181,23 @@ def run(self): for phrase in expected_errors: if phrase not in error_message: fail("Extension template deployment failed as expected, but with an unexpected error. Error expected to contain message '{0}'. Actual error: {1}".format(phrase, e)) - log.info("Extensions failed as expected") + log.info("Extensions failed as expected. Expected errors: '{0}'. Actual errors: '{1}'.".format(expected_errors, e)) log.info("") # After each test, clean up failed extensions to leave VMSS in a good state for the next test. # If there are leftover failed extensions, CRP will attempt to uninstall them in the next test, but uninstall # will be disallowed by policy. Since CRP waits for a 90 minute timeout for uninstall, the operation will - # timeout and fail without an appropriate error message (known issue), and the whole test case will fail. + # timeout and fail without an appropriate error message, and the whole test case will fail. # To clean up, we first update the policy to allow all, then remove the extensions. 
log.info("Starting cleanup for test case...") - for ssh_client in ssh_clients.values(): - allow_all_policy = \ - { - "policyVersion": "0.1.0", - "extensionPolicies": { - "allowListedExtensionsOnly": False - } + allow_all_policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": False } + } + for ssh_client in ssh_clients.values(): self._create_policy_file(ssh_client, allow_all_policy) for ext_to_delete in deletion_order: @@ -269,7 +276,7 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]: { 'message': r"Skipping processing of extensions since execution of dependent extension .* failed" }, - # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 + # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. 
To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 # We intentionally block extensions with policy and expect this failure message { 'message': r"Extension will not be processed" diff --git a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py index 4f11b4139..b9fb8157b 100644 --- a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py +++ b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py @@ -56,7 +56,7 @@ def _should_fail_single_config_depends_on_disallowed_single_config(): } } expected_errors = [ - "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'VMAccessForLinux' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.VmAccess] @@ -80,7 +80,7 @@ def _should_fail_single_config_depends_on_disallowed_no_config(): } } expected_errors = [ - "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'AzureMonitorLinuxAgent' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.AzureMonitorLinuxAgent] @@ -103,7 +103,7 @@ def _should_fail_single_config_depends_on_disallowed_multi_config(): } } expected_errors = [ - "[ExtensionPolicyError] Extension will not be processed: failed to 
enable extension 'Microsoft.CPlat.Core.RunCommandHandlerLinux' because extension is not specified in allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.CPlat.Core.RunCommandHandlerLinux' because extension is not specified in allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'RunCommandHandlerLinux' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.RunCommandHandler] @@ -126,7 +126,7 @@ def _should_fail_multi_config_depends_on_disallowed_single_config(): } } expected_errors = [ - "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist", "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Extensions.CustomScript failed'." ] deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.CustomScript] @@ -149,7 +149,7 @@ def _should_fail_multi_config_depends_on_disallowed_no_config(): } } expected_errors = [ - "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", + "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). 
Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed'." ] deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.AzureMonitorLinuxAgent] @@ -197,47 +197,3 @@ def _should_succeed_single_config_depends_on_single_config(): expected_errors = [] deletion_order = [VmExtensionIds.VmAccess, VmExtensionIds.CustomScript] return policy, template, expected_errors, deletion_order - - -def _should_no_dependencies(): - template = \ - [{ - "name": "CustomScript", - "properties": { - "publisher": "Microsoft.Azure.Extensions", - "type": "CustomScript", - "typeHandlerVersion": "2.1", - "autoUpgradeMinorVersion": True, - "settings": { - "commandToExecute": "date" - } - } - }, - { - "name": "VMAccessForLinux", - "properties": { - "publisher": "Microsoft.OSTCExtensions", - "type": "VMAccessForLinux", - "typeHandlerVersion": "1.0", - "autoUpgradeMinorVersion": True, - "settings": {}, - "protectedSettings": { - "username": "testuser" - } - } - }] - policy = \ - { - "policyVersion": "0.1.0", - "extensionPolicies": { - "allowListedExtensionsOnly": True, - "extensions": { - } - } - } - expected_errors = [ - "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", - "[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist" - ] - deletion_order = [VmExtensionIds.VmAccess, VmExtensionIds.CustomScript] - return policy, template, expected_errors, deletion_order \ No newline at end of file From dfcc1580f97a7515d8373a24b10250ed26c048ca Mon Sep 17 00:00:00 2001 From: mgunnala Date: Mon, 9 Dec 2024 09:33:47 -0500 Subject: [PATCH 17/32] Address test comments --- azurelinuxagent/ga/policy/policy_engine.py | 2 +- tests_e2e/test_suites/ext_policy.yml | 2 +- 
.../ext_policy_with_dependencies.yml | 4 +- tests_e2e/tests/ext_policy/ext_policy.py | 24 ++++---- .../ext_policy_with_dependencies.py | 8 +-- .../ext_policy/policy_dependencies_cases.py | 10 ++-- ...ent_ext_policy-verify_uninstall_success.py | 59 +++++++++++++++++++ 7 files changed, 83 insertions(+), 26 deletions(-) create mode 100755 tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py diff --git a/azurelinuxagent/ga/policy/policy_engine.py b/azurelinuxagent/ga/policy/policy_engine.py index 202eb4dbc..2b3a34415 100644 --- a/azurelinuxagent/ga/policy/policy_engine.py +++ b/azurelinuxagent/ga/policy/policy_engine.py @@ -22,7 +22,7 @@ from azurelinuxagent.common import logger from azurelinuxagent.common.event import WALAEventOperation, add_event from azurelinuxagent.common import conf -from azurelinuxagent.common.exception import AgentError, ExtensionError, ExtensionErrorCodes +from azurelinuxagent.common.exception import AgentError from azurelinuxagent.common.protocol.extensions_goal_state_from_vm_settings import _CaseFoldedDict from azurelinuxagent.common.utils.flexible_version import FlexibleVersion diff --git a/tests_e2e/test_suites/ext_policy.yml b/tests_e2e/test_suites/ext_policy.yml index 15a3dc73d..a394f471d 100644 --- a/tests_e2e/test_suites/ext_policy.yml +++ b/tests_e2e/test_suites/ext_policy.yml @@ -5,4 +5,4 @@ name: "ExtensionPolicy" tests: - "ext_policy/ext_policy.py" images: "endorsed" -owns_vm: true \ No newline at end of file +owns_vm: false \ No newline at end of file diff --git a/tests_e2e/test_suites/ext_policy_with_dependencies.yml b/tests_e2e/test_suites/ext_policy_with_dependencies.yml index 09e01fe63..0c9fe8b18 100644 --- a/tests_e2e/test_suites/ext_policy_with_dependencies.yml +++ b/tests_e2e/test_suites/ext_policy_with_dependencies.yml @@ -6,6 +6,4 @@ tests: - "ext_policy/ext_policy_with_dependencies.py" images: "endorsed" executes_on_scale_set: true -# This test should run on its own VMSS, because other tests may leave 
behind extensions -# that are disallowed by policy and affect results. -owns_vm: true \ No newline at end of file +owns_vm: false \ No newline at end of file diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index c489c323b..86f1a8db9 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -105,12 +105,12 @@ def _operation_should_fail(self, operation, extension_case): log.info(f"{extension_case.extension} {operation} failed as expected") elif operation == "delete": - # Delete is a best effort operation and should not fail, so CRP will wait for the full timeout instead - # instead of reporting an error for the operation. We set a short timeout limit, swallow the error, and - # assert that the extension is still in the instance view to confirm that the delete failed. + # Delete is a best effort operation and should not fail, so CRP will timeout instead of reporting the + # appropriate error. We swallow the timeout error, and instead, assert that the extension is still in the + # instance view and that the expected error is in the agent log to confirm that deletion failed. 
+ delete_start_time = self._ssh_client.run_command("date '+%Y-%m-%d %T'").rstrip() try: - delete_start_time = self._ssh_client.run_command("date '+%Y-%m-%d %T'").rstrip() - extension_case.extension.delete(timeout=(1 * 60)) + extension_case.extension.delete(timeout=(15 * 60)) fail(f"The agent should have reported a timeout error when attempting to delete {extension_case.extension} " f"because the extension is disallowed by policy.") except TimeoutError: @@ -125,15 +125,20 @@ def _operation_should_fail(self, operation, extension_case): # Confirm that expected error message is in the agent log expected_msg = "Extension will not be processed: failed to uninstall extension" - result = self._ssh_client.run_command( + self._ssh_client.run_command( f"agent_ext_workflow-check_data_in_agent_log.py --data '{expected_msg}' --after-timestamp '{delete_start_time}'", use_sudo=True) def run(self): + # The full CRP timeout period for extension operation failure is 90 minutes. For efficiency, we reduce the + # timeout limit to 15 minutes here. We expect "delete" operations on disallowed VMs to reach timeout instead of + # failing fast, because delete is a best effort operation by-design and should not fail. + self._context.vm.update({"extensionsTimeBudget": "PT15M"}) + + # Prepare no-config, single-config, and multi-config extension to test. Extensions with settings and extensions # without settings have different status reporting logic, so we should test all cases. - # CustomScript is a single-config extension. 
custom_script = ExtPolicy.TestCase( VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, @@ -166,7 +171,7 @@ def run(self): for ext in ext_to_cleanup: ext.extension.delete() - # Enable policy via conf + # Enable policy via conf file log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) @@ -234,9 +239,6 @@ def run(self): } self._create_policy_file(policy) self._operation_should_fail("delete", custom_script) - - # If a multiconfig extension is disallowed, no instances should be processed. - # RunCommand is not allowed - if we try to enable two instances, both should fail fast. self._operation_should_fail("enable", run_command) self._operation_should_fail("enable", run_command_2) self._operation_should_fail("enable", custom_script) diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py index 739432fcd..a726101fb 100644 --- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -206,14 +206,12 @@ def run(self): self._context.vmss.delete_extension(ext_name_to_delete) except Exception as crp_err: # Known issue - CRP returns stale status in cases of dependency failures. Even if the deletion succeeds, - # CRP may return a failure here. We swallow the error, and instead, check that the logs for uninstall - # are present in the agent log (after the start time of this test case). + # CRP may return a failure. We swallow the error and instead, verify that the agent does not report + # status for the uninstalled extension. log.info("CRP returned an error for deletion operation, may be a false error. Checking agent log to determine if operation succeeded. 
Exception: {0}".format(crp_err)) try: for ssh_client in ssh_clients.values(): - msg = ("Remove the extension slice: {0}".format(str(ext_to_delete))) - result = ssh_client.run_command(f"agent_ext_workflow-check_data_in_agent_log.py --data '{msg}' --after-timestamp '{test_case_start}'", use_sudo=True) - log.info(result) + ssh_client.run_command(f"agent_ext_policy-verify_uninstall_success.py --extension-name '{ext_to_delete}'") except Exception as agent_err: fail("Unable to successfully uninstall extension {0}. Exception: {1}".format(ext_name_to_delete, agent_err)) log.info("Successfully uninstalled extension {0}".format(ext_name_to_delete)) diff --git a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py index b9fb8157b..f645e9d96 100644 --- a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py +++ b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py @@ -56,7 +56,7 @@ def _should_fail_single_config_depends_on_disallowed_single_config(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because extension is not specified in allowlist", + "Extension will not be processed: failed to run extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because it is not specified in the allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'VMAccessForLinux' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.VmAccess] @@ -80,7 +80,7 @@ def _should_fail_single_config_depends_on_disallowed_no_config(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", + "Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified in the allowlist", "'CustomScript' is marked as failed 
since it depends upon the VM Extension 'AzureMonitorLinuxAgent' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.AzureMonitorLinuxAgent] @@ -103,7 +103,7 @@ def _should_fail_single_config_depends_on_disallowed_multi_config(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.CPlat.Core.RunCommandHandlerLinux' because extension is not specified in allowlist", + "Extension will not be processed: failed to run extension 'Microsoft.CPlat.Core.RunCommandHandlerLinux' because it is not specified in the allowlist", "'CustomScript' is marked as failed since it depends upon the VM Extension 'RunCommandHandlerLinux' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.RunCommandHandler] @@ -126,7 +126,7 @@ def _should_fail_multi_config_depends_on_disallowed_single_config(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Extensions.CustomScript' because extension is not specified in allowlist", + "Extension will not be processed: failed to run extension 'Microsoft.Azure.Extensions.CustomScript' because it is not specified in the allowlist", "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Extensions.CustomScript failed'." 
] deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.CustomScript] @@ -149,7 +149,7 @@ def _should_fail_multi_config_depends_on_disallowed_no_config(): } } expected_errors = [ - "Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist", + "Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified in the allowlist", "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed'." ] deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.AzureMonitorLinuxAgent] diff --git a/tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py b/tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py new file mode 100755 index 000000000..a9e01fcb2 --- /dev/null +++ b/tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py @@ -0,0 +1,59 @@ +#!/usr/bin/env pypy3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Verify if the agent reported update status to CRP via status file +# +import argparse +import glob +import json + +from assertpy import fail + +from tests_e2e.tests.lib.logging import log + + +def extension_found_in_agent_status_file(ext_name: str) -> bool: + # Check if the provided extension name is present in the agent status file, under handlerAggregateStatus. + # If the name is not present, the uninstall operation was successful. + agent_status_file = "/var/lib/waagent/history/*/waagent_status.json" + file_paths = glob.glob(agent_status_file, recursive=True) + for file in file_paths: + with open(file, 'r') as f: + data = json.load(f) + log.info("Agent status file (%s): %s", file, data) + handler_status = data["aggregateStatus"]["handlerAggregateStatus"] + if any(handler["handlerName"].lower() == ext_name.lower() for handler in handler_status): + return True + return False + + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('--extension-name', dest='name', required=True) + args = parser.parse_args() + + log.info("Checking agent status file to verify that the uninstalled extension is not present in reported status") + if extension_found_in_agent_status_file(args.name): + fail("Handler status was found in the status file for extension {0}, uninstall failed.".format(args.name)) + else: + log.info("Handler status was not found in the status file for extension {0}, uninstall succeeded.".format(args.name)) + + +if __name__ == "__main__": + main() From a31bdcf4c7b540e4af410df2ebf5af37e6519b13 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Tue, 10 Dec 2024 10:30:32 -0500 Subject: [PATCH 18/32] Address test comments --- tests/ga/test_extension.py | 11 ++++++++ tests_e2e/orchestrator/runbook.yml | 4 +-- tests_e2e/tests/ext_policy/ext_policy.py | 28 ++++++++++++------- .../ext_policy_with_dependencies.py | 2 +- .../ext_policy/policy_dependencies_cases.py | 26 +++++++++++++---- 5 files changed, 52 insertions(+), 19 deletions(-) diff --git 
a/tests/ga/test_extension.py b/tests/ga/test_extension.py index 902c069c7..b8541401e 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3539,6 +3539,9 @@ def _create_policy_file(self, policy): def _test_policy_case(self, policy, op, expected_status_code, expected_handler_status, expected_ext_count=0, expected_status_msg=None): + # Set up a mock protocol instance. In the case of uninstall, we need to update the goal state to test uninstall. + # update_goal_state() only updates the goal state if incarnation has changed, so we increment the incarnation + # number. with mock_wire_protocol(wire_protocol_data.DATA_FILE) as protocol: if op == ExtensionRequestedState.Uninstall: protocol.mock_wire_data.set_incarnation(2) @@ -3548,10 +3551,12 @@ def _test_policy_case(self, policy, op, expected_status_code, expected_handler_s protocol.report_vm_status = MagicMock() exthandlers_handler = get_exthandlers_handler(protocol) + # Create policy file and process extensions. self._create_policy_file(policy) exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() + # Assert that agent is reporting the expected handler status report_vm_status = protocol.report_vm_status self.assertTrue(report_vm_status.called) self._assert_handler_status(report_vm_status, expected_handler_status, expected_ext_count=expected_ext_count, @@ -3691,6 +3696,10 @@ def test_uninstall_should_succeed_if_extension_allowed(self): ] for policy in policy_cases: with mock_wire_protocol(wire_protocol_data.DATA_FILE) as protocol: + # Set up a mock protocol instance, which instantiates a goal state with incarnation number 1. + # We then change the goal state to test uninstall, and need to update the goal state with this change. + # update_goal_state() only updates the goal state if the incarnation has changed, so we increment the + # incarnation number to 2. 
protocol.mock_wire_data.set_incarnation(2) protocol.mock_wire_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) protocol.client.update_goal_state() @@ -3698,10 +3707,12 @@ def test_uninstall_should_succeed_if_extension_allowed(self): protocol.report_vm_status = MagicMock() exthandlers_handler = get_exthandlers_handler(protocol) + # Create policy file and process extensions. self._create_policy_file(policy) exthandlers_handler.run() exthandlers_handler.report_ext_handlers_status() + # Assert that no status is being reported for the extension, to confirm that uninstall was successful. report_vm_status = protocol.report_vm_status self.assertTrue(report_vm_status.called) args, kw = report_vm_status.call_args # pylint: disable=unused-variable diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index 250392f90..c7bcaed61 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -45,6 +45,8 @@ variable: - agent_update - ext_cgroups - extensions_disabled + - ext_policy + - ext_policy_with_dependencies - ext_sequencing - ext_telemetry_pipeline - fips @@ -55,8 +57,6 @@ variable: - recover_network_interface - cgroup_v2_disabled - log_collector - - ext_policy - - ext_policy_with_dependencies # # Additional arguments pass to the test suites diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index 86f1a8db9..e111ed18f 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -110,7 +110,8 @@ def _operation_should_fail(self, operation, extension_case): # instance view and that the expected error is in the agent log to confirm that deletion failed. delete_start_time = self._ssh_client.run_command("date '+%Y-%m-%d %T'").rstrip() try: - extension_case.extension.delete(timeout=(15 * 60)) + timeout = (16 * 60) # We want to wait for CRP timeout, which is 15 minutes. 
+ extension_case.extension.delete(timeout) fail(f"The agent should have reported a timeout error when attempting to delete {extension_case.extension} " f"because the extension is disallowed by policy.") except TimeoutError: @@ -167,6 +168,7 @@ def run(self): # Another e2e test may have left behind an extension we want to test here. Cleanup any leftovers so that they # do not affect the test results. + log.info("Cleaning up existing extensions on the test VM [%s]", self._context.vm.name) ext_to_cleanup = [custom_script, run_command, run_command_2, azure_monitor] for ext in ext_to_cleanup: ext.extension.delete() @@ -187,7 +189,9 @@ def run(self): "allowListedExtensionsOnly": True, "signatureRequired": False, "extensions": { - "Microsoft.Azure.Extensions.CustomScript": {} + "Microsoft.Azure.Extensions.CustomScript": {}, + # GuestConfiguration is added to all VMs for security requirements, so we always allow it. + "Microsoft.GuestConfiguration.ConfigurationforLinux": {} } } } @@ -225,26 +229,27 @@ def run(self): self._operation_should_succeed("delete", azure_monitor) # This policy tests the following scenarios: - # - disallow a previously-enabled single-config extension (CustomScript), then delete -> should fail fast - # - enable two instances of a multi-config extension (RunCommandHandler) when disallowed by policy -> should fail fast - # - enable single-config extension (CustomScript) when disallowed by policy -> should fail fast + # - disallow a previously-enabled single-config extension (CustomScript, then try to enable again -> should fail fast + # - disallow a previously-enabled single-config extension (CustomScript), then try to delete -> should fail fast policy = \ { "policyVersion": "0.1.0", "extensionPolicies": { "allowListedExtensionsOnly": True, "signatureRequired": False, - "extensions": {} + "extensions": { + # GuestConfiguration is added to all VMs for security requirements, so we always allow it. 
+ "Microsoft.GuestConfiguration.ConfigurationforLinux": {} + } } } self._create_policy_file(policy) - self._operation_should_fail("delete", custom_script) - self._operation_should_fail("enable", run_command) - self._operation_should_fail("enable", run_command_2) self._operation_should_fail("enable", custom_script) + self._operation_should_fail("delete", custom_script) # This policy tests the following scenario: - # - allow a previously-disallowed single-config extension (CSE), then enable -> should succeed + # - allow a previously-disallowed single-config extension (CustomScript), then delete -> should succeed + # - allow a previously-disallowed single-config extension (CustomScript), then enable -> should succeed policy = \ { "policyVersion": "0.1.0", @@ -255,6 +260,9 @@ def run(self): } } self._create_policy_file(policy) + # Since CustomScript is marked for deletion by previous test case, we can only retry the delete operation (enable + # is not allowed by CRP). So we first delete successfully, and then re-install/enable CustomScript. + self._operation_should_succeed("delete", custom_script) self._operation_should_succeed("enable", custom_script) def get_ignore_error_rules(self) -> List[Dict[str, Any]]: diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py index a726101fb..0114f3a88 100644 --- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -208,7 +208,7 @@ def run(self): # Known issue - CRP returns stale status in cases of dependency failures. Even if the deletion succeeds, # CRP may return a failure. We swallow the error and instead, verify that the agent does not report # status for the uninstalled extension. - log.info("CRP returned an error for deletion operation, may be a false error. Checking agent log to determine if operation succeeded. 
Exception: {0}".format(crp_err)) + log.info("CRP returned an error for deletion operation, may be a false error. Checking agent status file to determine if operation succeeded. Exception: {0}".format(crp_err)) try: for ssh_client in ssh_clients.values(): ssh_client.run_command(f"agent_ext_policy-verify_uninstall_success.py --extension-name '{ext_to_delete}'") diff --git a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py index f645e9d96..5de3136d4 100644 --- a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py +++ b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py @@ -52,6 +52,8 @@ def _should_fail_single_config_depends_on_disallowed_single_config(): "allowListedExtensionsOnly": True, "extensions": { "Microsoft.Azure.Extensions.CustomScript": {}, + # GuestConfiguration is added to all VMs for security requirements, so we always allow it. + "Microsoft.GuestConfiguration.ConfigurationforLinux": {} } } } @@ -75,7 +77,9 @@ def _should_fail_single_config_depends_on_disallowed_no_config(): "extensionPolicies": { "allowListedExtensionsOnly": True, "extensions": { - "Microsoft.Azure.Extensions.CustomScript": {} + "Microsoft.Azure.Extensions.CustomScript": {}, + # GuestConfiguration is added to all VMs for security requirements, so we always allow it. + "Microsoft.GuestConfiguration.ConfigurationforLinux": {} } } } @@ -98,7 +102,9 @@ def _should_fail_single_config_depends_on_disallowed_multi_config(): "extensionPolicies": { "allowListedExtensionsOnly": True, "extensions": { - "Microsoft.Azure.Extensions.CustomScript": {} + "Microsoft.Azure.Extensions.CustomScript": {}, + # GuestConfiguration is added to all VMs for security requirements, so we always allow it. 
+ "Microsoft.GuestConfiguration.ConfigurationforLinux": {} } } } @@ -121,7 +127,9 @@ def _should_fail_multi_config_depends_on_disallowed_single_config(): "extensionPolicies": { "allowListedExtensionsOnly": True, "extensions": { - "Microsoft.CPlat.Core.RunCommandHandlerLinux": {} + "Microsoft.CPlat.Core.RunCommandHandlerLinux": {}, + # GuestConfiguration is added to all VMs for security requirements, so we always allow it. + "Microsoft.GuestConfiguration.ConfigurationforLinux": {} } } } @@ -144,7 +152,9 @@ def _should_fail_multi_config_depends_on_disallowed_no_config(): "extensionPolicies": { "allowListedExtensionsOnly": True, "extensions": { - "Microsoft.CPlat.Core.RunCommandHandlerLinux": {} + "Microsoft.CPlat.Core.RunCommandHandlerLinux": {}, + # GuestConfiguration is added to all VMs for security requirements, so we always allow it. + "Microsoft.GuestConfiguration.ConfigurationforLinux": {} } } } @@ -169,7 +179,9 @@ def _should_succeed_single_config_depends_on_no_config(): "allowListedExtensionsOnly": True, "extensions": { "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent": {}, - "Microsoft.Azure.Extensions.CustomScript": {} + "Microsoft.Azure.Extensions.CustomScript": {}, + # GuestConfiguration is added to all VMs for security requirements, so we always allow it. + "Microsoft.GuestConfiguration.ConfigurationforLinux": {} } } } @@ -190,7 +202,9 @@ def _should_succeed_single_config_depends_on_single_config(): "allowListedExtensionsOnly": True, "extensions": { "Microsoft.Azure.Extensions.CustomScript": {}, - "Microsoft.OSTCExtensions.VMAccessForLinux": {} + "Microsoft.OSTCExtensions.VMAccessForLinux": {}, + # GuestConfiguration is added to all VMs for security requirements, so we always allow it. 
+ "Microsoft.GuestConfiguration.ConfigurationforLinux": {} } } } From 5198cf8eecef964814ccfb1b93279764260cdd96 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Thu, 12 Dec 2024 11:18:08 -0500 Subject: [PATCH 19/32] Cleanup existing extensions on test VMs --- tests_e2e/tests/ext_policy/ext_policy.py | 17 ++++++++++------- .../ext_policy/ext_policy_with_dependencies.py | 5 +++++ tests_e2e/tests/lib/virtual_machine_client.py | 16 ++++++++++++++++ 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index e111ed18f..b6be22d06 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -143,35 +143,33 @@ def run(self): # CustomScript is a single-config extension. custom_script = ExtPolicy.TestCase( VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, - resource_name="CustomScript"), + resource_name="CustomScriptPolicy"), {'commandToExecute': f"echo '{str(uuid.uuid4())}'"} ) # RunCommandHandler is a multi-config extension, so we set up two instances (configurations) here and test both. run_command = ExtPolicy.TestCase( VirtualMachineRunCommandClient(self._context.vm, VmExtensionIds.RunCommandHandler, - resource_name="RunCommandHandler"), + resource_name="RunCommandHandlerPolicy"), {'source': f"echo '{str(uuid.uuid4())}'"} ) run_command_2 = ExtPolicy.TestCase( VirtualMachineRunCommandClient(self._context.vm, VmExtensionIds.RunCommandHandler, - resource_name="RunCommandHandler2"), + resource_name="RunCommandHandlerPolicy2"), {'source': f"echo '{str(uuid.uuid4())}'"} ) # AzureMonitorLinuxAgent is a no-config extension (extension without settings). 
azure_monitor = ExtPolicy.TestCase( VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, - resource_name="AzureMonitorLinuxAgent"), + resource_name="AzureMonitorLinuxAgentPolicy"), None ) # Another e2e test may have left behind an extension we want to test here. Cleanup any leftovers so that they # do not affect the test results. log.info("Cleaning up existing extensions on the test VM [%s]", self._context.vm.name) - ext_to_cleanup = [custom_script, run_command, run_command_2, azure_monitor] - for ext in ext_to_cleanup: - ext.extension.delete() + self._context.vm.delete_all_extensions() # Enable policy via conf file log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) @@ -265,6 +263,11 @@ def run(self): self._operation_should_succeed("delete", custom_script) self._operation_should_succeed("enable", custom_script) + # Disable policy via conf file + log.info("Disabling policy via conf file on the test VM [%s]", self._context.vm.name) + self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) + + def get_ignore_error_rules(self) -> List[Dict[str, Any]]: ignore_rules = [ # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. 
To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py index 0114f3a88..c518f23e0 100644 --- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -219,6 +219,11 @@ def run(self): log.info("Successfully removed all extensions from VMSS") log.info("---------------------------------------------") + # Disable policy via conf file. + for ssh_client in ssh_clients.values(): + ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) + + def get_ignore_errors_before_timestamp(self) -> datetime: # Ignore errors in the agent log before the first test case starts if self._scenario_start == datetime.min: diff --git a/tests_e2e/tests/lib/virtual_machine_client.py b/tests_e2e/tests/lib/virtual_machine_client.py index c4181be5a..42f3f33c2 100644 --- a/tests_e2e/tests/lib/virtual_machine_client.py +++ b/tests_e2e/tests/lib/virtual_machine_client.py @@ -108,6 +108,22 @@ def get_extensions(self) -> List[VirtualMachineExtension]: resource_group_name=self.resource_group, vm_name=self.name)) + def delete_all_extensions(self, timeout: int = AzureSdkClient._DEFAULT_TIMEOUT) -> None: + """ + Delete all extensions installed on the virtual machine + """ + extensions_to_delete = self.get_extensions().value + for ext in extensions_to_delete: + ext_name = ext.name + log.info(f"Deleting extension {ext_name} from {self.name}") + self._execute_async_operation( + lambda: self._compute_client.virtual_machine_extensions.begin_delete( + self.resource_group, + self.name, + ext_name), + operation_name=f"Delete extension {ext_name}", + timeout=timeout) + def update(self, properties: Dict[str, Any], timeout: int = AzureSdkClient._DEFAULT_TIMEOUT) -> None: """ Updates a set of properties on the virtual machine From 
4a0a4efab6adbed7fd0e134a783931e1db6f3053 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Mon, 16 Dec 2024 17:20:20 -0500 Subject: [PATCH 20/32] Address comments and disable dependencies e2e tests --- azurelinuxagent/ga/exthandlers.py | 37 ++++++++--------- azurelinuxagent/ga/policy/policy_engine.py | 4 +- tests/ga/test_extension.py | 20 ++++----- tests_e2e/orchestrator/runbook.yml | 3 +- tests_e2e/tests/ext_policy/ext_policy.py | 41 +++++++++++-------- .../ext_policy_with_dependencies.py | 9 +++- ...ent_ext_policy-verify_uninstall_success.py | 2 + 7 files changed, 62 insertions(+), 54 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 12a85fe18..d42ef7ba3 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -503,9 +503,8 @@ def handle_ext_handlers(self, goal_state_id): depends_on_err_msg = None extensions_enabled = conf.get_extensions_enabled() - # Instantiate policy engine, and use same engine to handle all extension handlers. - # If an error is thrown during policy engine initialization, we block all extensions and report the error via handler/extension status for - # each extension. + # Instantiate policy engine, and use same engine to handle all extension handlers. If an error is thrown during + # policy engine initialization, we block all extensions and report the error via handler status for each extension. policy_error = None try: policy_engine = ExtensionPolicyEngine() @@ -537,11 +536,10 @@ def handle_ext_handlers(self, goal_state_id): # If an error was thrown during policy engine initialization, skip further processing of the extension. # CRP is still waiting for status, so we report error status here. - # of the extension. 
- policy_op, policy_err_code = _POLICY_ERROR_MAP.get(ext_handler.state) + operation, error_code = _POLICY_ERROR_MAP.get(ext_handler.state) if policy_error is not None: msg = "Extension will not be processed: {0}".format(ustr(policy_error)) - self.__report_policy_error(ext_handler_i=handler_i, error_code=policy_err_code, + self.__report_policy_error(ext_handler_i=handler_i, error_code=error_code, report_op=handler_i.operation, message=msg, extension=extension) continue @@ -567,20 +565,18 @@ def handle_ext_handlers(self, goal_state_id): continue - # Invoke policy engine to determine if extension is allowed. If disallowed, report an error on behalf of - # the extension and do not process the extension. Dependent extensions will also be blocked. + # Invoke policy engine to determine if extension is allowed. + # - if allowed: process the extension and get if it was successfully executed or not + # - if disallowed: do not process the handler and report an error on behalf of the extension, dependent + # extensions will also be blocked. extension_allowed = policy_engine.should_allow_extension(ext_handler.name) if not extension_allowed: msg = ( "Extension will not be processed: failed to {0} extension '{1}' because it is not specified " - "in the allowlist. To {0}, add the extension to the allowed list in the policy file ('{2}')." - ).format(policy_op, ext_handler.name, conf.get_policy_file_path()) - self.__report_policy_error(handler_i, policy_err_code, report_op=handler_i.operation, + "in the allowlist. To {0}, add the extension to the list of allowed extensions in the policy file ('{2}')." + ).format(operation, ext_handler.name, conf.get_policy_file_path()) + self.__report_policy_error(handler_i, error_code, report_op=handler_i.operation, message=msg, extension=extension) - - # Process extensions and get if it was successfully executed or not - # If extension was blocked by policy, treat the extension as failed and do not process the handler. 
- if not extension_allowed: extension_success = False else: extension_success = self.handle_ext_handler(handler_i, extension, goal_state_id) @@ -749,7 +745,7 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess message=message) @staticmethod - def __report_policy_error(ext_handler_i, error_code, report_op, message, extension=None): + def __report_policy_error(ext_handler_i, error_code, report_op, message, extension): # TODO: Consider merging this function with __handle_and_report_ext_handler_errors() above, after investigating # the impact of this change. # @@ -760,13 +756,16 @@ def __report_policy_error(ext_handler_i, error_code, report_op, message, extensi # it will require additional testing/investigation. As a temporary workaround, this separate function was created # to write a status file for single-config extensions. - # Set handler status for all extensions (with and without settings) + # Set handler status for all extensions (with and without settings). We report the same error at both the + # handler and extension status level. ext_handler_i.set_handler_status(message=message, code=error_code) # Create status file for extensions with settings (single and multi config). # If status file already exists, overwrite it. If an extension was previously reporting status and is now # blocked by a policy error, we should report the policy error. if extension is not None: + # TODO: if extension is reporting a heartbeat, it overwrites status. Consider overwriting heartbeat, if + # it exists. ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error_code, operation=report_op, message=message, overwrite=True) @@ -1071,8 +1070,8 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed): # For MultiConfig, we need to report status per extension even for Handler level failures. 
# If we have HandlerStatus for a MultiConfig handler and GS is requesting for it, we would report status per # extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc) - # We also need to report extension status for an uninstalled handler if extensions are disabled, or if the extension - # failed due to policy, because CRP waits for extension runtime status before failing the extension operation. + # We also need to report extension status for an uninstalled handler if extensions are disabled, because CRP + # waits for extension runtime status before failing the extension operation. if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled(): # Since we require reading the Manifest for reading the heartbeat, this would fail if HandlerManifest not found. diff --git a/azurelinuxagent/ga/policy/policy_engine.py b/azurelinuxagent/ga/policy/policy_engine.py index 2b3a34415..a2125d134 100644 --- a/azurelinuxagent/ga/policy/policy_engine.py +++ b/azurelinuxagent/ga/policy/policy_engine.py @@ -50,7 +50,7 @@ class _PolicyEngine(object): """ def __init__(self): # Set defaults for policy - self._policy_enforcement_enabled = self.get_policy_enforcement_enabled() + self._policy_enforcement_enabled = self.__get_policy_enforcement_enabled() if not self.policy_enforcement_enabled: return @@ -69,7 +69,7 @@ def _log_policy_event(msg, is_success=True, op=WALAEventOperation.Policy, send_e add_event(op=op, message=msg, is_success=is_success, log_event=False) @staticmethod - def get_policy_enforcement_enabled(): + def __get_policy_enforcement_enabled(): """ Policy will be enabled if (1) policy file exists at the expected location and (2) the conf flag "Debug.EnableExtensionPolicy" is true. 
""" diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index b8541401e..d2cc69dc2 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3536,14 +3536,13 @@ def _create_policy_file(self, policy): policy_file.write(policy) policy_file.flush() - def _test_policy_case(self, policy, op, expected_status_code, expected_handler_status, expected_ext_count=0, + def _test_policy_case(self, policy, op, expected_status_code, expected_handler_status, expected_ext_count, expected_status_msg=None): - # Set up a mock protocol instance. In the case of uninstall, we need to update the goal state to test uninstall. - # update_goal_state() only updates the goal state if incarnation has changed, so we increment the incarnation - # number. + # Set up a mock protocol instance. with mock_wire_protocol(wire_protocol_data.DATA_FILE) as protocol: if op == ExtensionRequestedState.Uninstall: + # Generate a new mock goal state to uninstall the extension - increment the incarnation protocol.mock_wire_data.set_incarnation(2) protocol.mock_wire_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) protocol.client.update_goal_state() @@ -3573,7 +3572,7 @@ def test_should_fail_enable_if_extension_disallowed(self): } expected_msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified in the allowlist." 
self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, - expected_handler_status='NotReady', expected_status_msg=expected_msg) + expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg) def test_should_fail_enable_for_invalid_policy(self): policy = \ @@ -3585,7 +3584,7 @@ def test_should_fail_enable_for_invalid_policy(self): } expected_msg = "attribute 'extensionPolicies.allowListedExtensionsOnly'; must be 'boolean'" self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, - expected_handler_status='NotReady', expected_status_msg=expected_msg) + expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg) def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self): policy = \ @@ -3597,7 +3596,7 @@ def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self): expected_msg = "Extension will not be processed: mock exception" self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, - expected_handler_status='NotReady', expected_status_msg=expected_msg) + expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg) def test_should_fail_uninstall_if_extension_disallowed(self): policy = \ @@ -3611,7 +3610,7 @@ def test_should_fail_uninstall_if_extension_disallowed(self): } expected_msg = "failed to uninstall extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified in the allowlist." 
self._test_policy_case(policy=policy, op=ExtensionRequestedState.Uninstall, expected_status_code=ExtensionErrorCodes.PluginDisableProcessingFailed, - expected_handler_status='NotReady', expected_status_msg=expected_msg) + expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg) def test_should_fail_enable_if_dependent_extension_disallowed(self): self._create_policy_file({ @@ -3696,10 +3695,7 @@ def test_uninstall_should_succeed_if_extension_allowed(self): ] for policy in policy_cases: with mock_wire_protocol(wire_protocol_data.DATA_FILE) as protocol: - # Set up a mock protocol instance, which instantiates a goal state with incarnation number 1. - # We then change the goal state to test uninstall, and need to update the goal state with this change. - # update_goal_state() only updates the goal state if the incarnation has changed, so we increment the - # incarnation number to 2. + # Generate a new mock goal state to uninstall the extension - increment the incarnation protocol.mock_wire_data.set_incarnation(2) protocol.mock_wire_data.set_extensions_config_state(ExtensionRequestedState.Uninstall) protocol.client.update_goal_state() diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index c7bcaed61..b64c66d82 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -46,7 +46,8 @@ variable: - ext_cgroups - extensions_disabled - ext_policy - - ext_policy_with_dependencies +# TODO: re-enable ext_policy_with_dependencies after investigating status reporting failures +# - ext_policy_with_dependencies - ext_sequencing - ext_telemetry_pipeline - fips diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index b6be22d06..f63bfabb5 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -63,13 +63,13 @@ def _operation_should_succeed(self, operation, extension_case): try: if operation == 
"enable": # VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter. - if type(extension_case.extension) == VirtualMachineRunCommandClient: - extension_case.extension.enable(settings=extension_case.settings, timeout=15*60) + if isinstance(extension_case.extension, VirtualMachineRunCommandClient): + extension_case.extension.enable(settings=extension_case.settings) else: - extension_case.extension.enable(settings=extension_case.settings, force_update=True, timeout=15*60) + extension_case.extension.enable(settings=extension_case.settings, force_update=True) extension_case.extension.assert_instance_view() elif operation == "delete": - extension_case.extension.delete(timeout=15 * 60) + extension_case.extension.delete() instance_view_extensions = self._context.vm.get_instance_view().extensions if instance_view_extensions is not None and any( e.name == extension_case.extension._resource_name for e in instance_view_extensions): @@ -110,10 +110,11 @@ def _operation_should_fail(self, operation, extension_case): # instance view and that the expected error is in the agent log to confirm that deletion failed. delete_start_time = self._ssh_client.run_command("date '+%Y-%m-%d %T'").rstrip() try: - timeout = (16 * 60) # We want to wait for CRP timeout, which is 15 minutes. - extension_case.extension.delete(timeout) - fail(f"The agent should have reported a timeout error when attempting to delete {extension_case.extension} " - f"because the extension is disallowed by policy.") + # TODO: consider checking the agent's log asynchronously to confirm that the uninstall failed instead of + # waiting for the full CRP timeout. 
+ extension_case.extension.delete() + fail(f"CRP should have reported a timeout error when attempting to delete {extension_case.extension} " + f"because the extension is disallowed by policy and agent should have reported a policy failure.") except TimeoutError: log.info("Reported a timeout error when attempting to delete extension, as expected. Checking instance view " "and agent log to confirm that delete operation failed.") @@ -137,39 +138,40 @@ def run(self): # failing fast, because delete is a best effort operation by-design and should not fail. self._context.vm.update({"extensionsTimeBudget": "PT15M"}) - # Prepare no-config, single-config, and multi-config extension to test. Extensions with settings and extensions # without settings have different status reporting logic, so we should test all cases. # CustomScript is a single-config extension. custom_script = ExtPolicy.TestCase( VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, - resource_name="CustomScriptPolicy"), + resource_name="CustomScript"), {'commandToExecute': f"echo '{str(uuid.uuid4())}'"} ) # RunCommandHandler is a multi-config extension, so we set up two instances (configurations) here and test both. run_command = ExtPolicy.TestCase( VirtualMachineRunCommandClient(self._context.vm, VmExtensionIds.RunCommandHandler, - resource_name="RunCommandHandlerPolicy"), + resource_name="RunCommandHandler"), {'source': f"echo '{str(uuid.uuid4())}'"} ) run_command_2 = ExtPolicy.TestCase( VirtualMachineRunCommandClient(self._context.vm, VmExtensionIds.RunCommandHandler, - resource_name="RunCommandHandlerPolicy2"), + resource_name="RunCommandHandler2"), {'source': f"echo '{str(uuid.uuid4())}'"} ) # AzureMonitorLinuxAgent is a no-config extension (extension without settings). 
azure_monitor = ExtPolicy.TestCase( VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, - resource_name="AzureMonitorLinuxAgentPolicy"), + resource_name="AzureMonitorLinuxAgent"), None ) # Another e2e test may have left behind an extension we want to test here. Cleanup any leftovers so that they # do not affect the test results. + extensions_to_test = [custom_script, run_command, run_command_2, azure_monitor] log.info("Cleaning up existing extensions on the test VM [%s]", self._context.vm.name) - self._context.vm.delete_all_extensions() + for ext_case in extensions_to_test: + ext_case.extension.delete() # Enable policy via conf file log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) @@ -243,7 +245,7 @@ def run(self): } self._create_policy_file(policy) self._operation_should_fail("enable", custom_script) - self._operation_should_fail("delete", custom_script) + # self._operation_should_fail("delete", custom_script) # This policy tests the following scenario: # - allow a previously-disallowed single-config extension (CustomScript), then delete -> should succeed @@ -263,17 +265,20 @@ def run(self): self._operation_should_succeed("delete", custom_script) self._operation_should_succeed("enable", custom_script) - # Disable policy via conf file + # Cleanup after test: disable policy enforcement in conf file, and delete any leftover extensions. log.info("Disabling policy via conf file on the test VM [%s]", self._context.vm.name) self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) + log.info("Test completed. 
Cleaning up leftover extensions on the test VM [%s]", self._context.vm.name) + for ext_case in extensions_to_test: + ext_case.extension.delete() def get_ignore_error_rules(self) -> List[Dict[str, Any]]: ignore_rules = [ - # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to enable extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because extension is not specified in allowlist. To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 + # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified in the allowlist. To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 # We intentionally block extensions with policy and expect this failure message { - 'message': r"Extension will not be processed" + 'message': r"Extension will not be processed: failed to .* extension .* because it is not specified in the allowlist" } ] return ignore_rules diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py index c518f23e0..f00982745 100644 --- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -18,9 +18,12 @@ # # -# This test adds extensions with multiple dependencies to a VMSS using the 'provisionAfterExtensions' property and -# validates they are enabled in order of dependencies. +# This test adds extensions with multiple dependencies to a VMSS and checks that extensions fail and report status +# as expected when blocked by extension policy. 
+# Note that this test is currently disabled in daily automation due to status reporting errors that need further investigation. +# TODO: re-enable this test in daily automation after investigating test failures # + import copy import json import random @@ -41,6 +44,8 @@ _should_fail_single_config_depends_on_disallowed_single_config, \ _should_succeed_single_config_depends_on_no_config, \ _should_succeed_single_config_depends_on_single_config + # TODO: RunCommandHandler is unable to be uninstalled properly, so these tests are currently disabled. Uncomment + # the below imports after re-enabling the test. # _should_fail_single_config_depends_on_disallowed_multi_config, # _should_fail_multi_config_depends_on_disallowed_single_config, # _should_fail_multi_config_depends_on_disallowed_no_config, diff --git a/tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py b/tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py index a9e01fcb2..8bcf725ba 100755 --- a/tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py +++ b/tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py @@ -30,6 +30,8 @@ def extension_found_in_agent_status_file(ext_name: str) -> bool: # Check if the provided extension name is present in the agent status file, under handlerAggregateStatus. # If the name is not present, the uninstall operation was successful. + # TODO: a new status file is created for each goal state. To make this a reliable check, make sure to correlate + # the status file to the goal state being checked. Alternatively, consider checking waagent.log. 
agent_status_file = "/var/lib/waagent/history/*/waagent_status.json" file_paths = glob.glob(agent_status_file, recursive=True) for file in file_paths: From bacc425c4daf4c2b00d736ed89c34d49d81d57b3 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Tue, 17 Dec 2024 01:33:41 -0500 Subject: [PATCH 21/32] Add fixes for e2e tests --- tests_e2e/tests/ext_policy/ext_policy.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index f63bfabb5..5034937f9 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -150,12 +150,12 @@ def run(self): # RunCommandHandler is a multi-config extension, so we set up two instances (configurations) here and test both. run_command = ExtPolicy.TestCase( VirtualMachineRunCommandClient(self._context.vm, VmExtensionIds.RunCommandHandler, - resource_name="RunCommandHandler"), + resource_name="RunCommandHandlerPolicy"), {'source': f"echo '{str(uuid.uuid4())}'"} ) run_command_2 = ExtPolicy.TestCase( VirtualMachineRunCommandClient(self._context.vm, VmExtensionIds.RunCommandHandler, - resource_name="RunCommandHandler2"), + resource_name="RunCommandHandlerPolicy2"), {'source': f"echo '{str(uuid.uuid4())}'"} ) @@ -168,10 +168,8 @@ def run(self): # Another e2e test may have left behind an extension we want to test here. Cleanup any leftovers so that they # do not affect the test results. 
- extensions_to_test = [custom_script, run_command, run_command_2, azure_monitor] log.info("Cleaning up existing extensions on the test VM [%s]", self._context.vm.name) - for ext_case in extensions_to_test: - ext_case.extension.delete() + self._context.vm.delete_all_extensions() # Enable policy via conf file log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) @@ -265,12 +263,11 @@ def run(self): self._operation_should_succeed("delete", custom_script) self._operation_should_succeed("enable", custom_script) - # Cleanup after test: disable policy enforcement in conf file, and delete any leftover extensions. + # Cleanup after test: delete leftover extensions and disable policy enforcement in conf file. + self._operation_should_succeed("delete", custom_script) log.info("Disabling policy via conf file on the test VM [%s]", self._context.vm.name) self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) - log.info("Test completed. Cleaning up leftover extensions on the test VM [%s]", self._context.vm.name) - for ext_case in extensions_to_test: - ext_case.extension.delete() + def get_ignore_error_rules(self) -> List[Dict[str, Any]]: From 3319916c5a7bab1adf00a92c76610eb4eea29627 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Tue, 17 Dec 2024 09:10:46 -0500 Subject: [PATCH 22/32] Add back delete failure test case --- tests_e2e/tests/ext_policy/ext_policy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index 5034937f9..4d2b7f018 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -148,6 +148,8 @@ def run(self): ) # RunCommandHandler is a multi-config extension, so we set up two instances (configurations) here and test both. 
+ # We append the resource name with "Policy" because agent_bvt/run_command.py leaves behind a "RunCommandHandler" + # that cannot be deleted via extensions API. run_command = ExtPolicy.TestCase( VirtualMachineRunCommandClient(self._context.vm, VmExtensionIds.RunCommandHandler, resource_name="RunCommandHandlerPolicy"), @@ -243,7 +245,7 @@ def run(self): } self._create_policy_file(policy) self._operation_should_fail("enable", custom_script) - # self._operation_should_fail("delete", custom_script) + self._operation_should_fail("delete", custom_script) # This policy tests the following scenario: # - allow a previously-disallowed single-config extension (CustomScript), then delete -> should succeed @@ -264,9 +266,9 @@ def run(self): self._operation_should_succeed("enable", custom_script) # Cleanup after test: delete leftover extensions and disable policy enforcement in conf file. - self._operation_should_succeed("delete", custom_script) log.info("Disabling policy via conf file on the test VM [%s]", self._context.vm.name) self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) + self._context.vm.delete_all_extensions() From 8c317982abb93b7ff14dac15e1dfed2a886546fd Mon Sep 17 00:00:00 2001 From: mgunnala Date: Tue, 17 Dec 2024 11:40:29 -0500 Subject: [PATCH 23/32] Address comments round 3 --- tests/data/test_waagent.conf | 2 +- tests/test_agent.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/test_waagent.conf b/tests/data/test_waagent.conf index 8abbd55e5..a0147b01a 100644 --- a/tests/data/test_waagent.conf +++ b/tests/data/test_waagent.conf @@ -140,4 +140,4 @@ OS.SshDir=/notareal/path # - The default is false to protect the state of existing VMs OS.EnableFirewall=n -Debug.EnableExtensionPolicy=y +Debug.EnableExtensionPolicy=n diff --git a/tests/test_agent.py b/tests/test_agent.py index abd8f661a..b8b837af8 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -47,7 +47,7 @@ 
Debug.CgroupLogMetrics = False Debug.EnableAgentMemoryUsageCheck = False Debug.EnableCgroupV2ResourceLimiting = False -Debug.EnableExtensionPolicy = True +Debug.EnableExtensionPolicy = False Debug.EnableFastTrack = True Debug.EnableGAVersioning = True Debug.EtpCollectionPeriod = 300 From 32ef5c1c41499611701b119802ef99192bfc15b9 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Tue, 17 Dec 2024 13:45:40 -0500 Subject: [PATCH 24/32] Address comments --- azurelinuxagent/ga/exthandlers.py | 17 +++++++++-------- azurelinuxagent/ga/policy/policy_engine.py | 5 ++++- tests/ga/test_extension.py | 6 +++--- tests/ga/test_multi_config_extension.py | 2 +- tests_e2e/tests/ext_policy/ext_policy.py | 10 ++++++---- 5 files changed, 23 insertions(+), 17 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index d42ef7ba3..ad9eda757 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -531,7 +531,8 @@ def handle_ext_handlers(self, goal_state_id): status=ExtensionStatusValue.error, code=-1, operation=handler_i.operation, - message=msg) + message=msg, + overwrite=False) continue # If an error was thrown during policy engine initialization, skip further processing of the extension. 
@@ -540,7 +541,7 @@ def handle_ext_handlers(self, goal_state_id): if policy_error is not None: msg = "Extension will not be processed: {0}".format(ustr(policy_error)) self.__report_policy_error(ext_handler_i=handler_i, error_code=error_code, - report_op=handler_i.operation, message=msg, + report_op=WALAEventOperation.ExtensionPolicy, message=msg, extension=extension) continue @@ -557,7 +558,7 @@ def handle_ext_handlers(self, goal_state_id): handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=-1, operation=WALAEventOperation.ExtensionProcessing, - message=depends_on_err_msg) + message=depends_on_err_msg, overwrite=False) # For SC extensions, overwrite the HandlerStatus with the relevant message else: @@ -573,9 +574,9 @@ def handle_ext_handlers(self, goal_state_id): if not extension_allowed: msg = ( "Extension will not be processed: failed to {0} extension '{1}' because it is not specified " - "in the allowlist. To {0}, add the extension to the list of allowed extensions in the policy file ('{2}')." + "as an allowed extension. To {0}, add the extension to the list of allowed extensions in the policy file ('{2}')." ).format(operation, ext_handler.name, conf.get_policy_file_path()) - self.__report_policy_error(handler_i, error_code, report_op=handler_i.operation, + self.__report_policy_error(handler_i, error_code, report_op=WALAEventOperation.ExtensionPolicy, message=msg, extension=extension) extension_success = False else: @@ -695,7 +696,7 @@ def handle_ext_handler(self, ext_handler_i, extension, goal_state_id): # Since these are maintained by the extensions, the expectation here is that they would update their status files appropriately with their errors. # The extensions should already have a placeholder status file, but incase they dont, setting one here to fail fast. 
ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error.code, - operation=ext_handler_i.operation, message=err_msg) + operation=ext_handler_i.operation, message=err_msg, overwrite=False) add_event(name=ext_name, version=ext_handler_i.ext_handler.version, op=ext_handler_i.operation, is_success=False, log_event=True, message=err_msg) except ExtensionsGoalStateError as error: @@ -736,7 +737,7 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess # This way we guarantee reporting back to CRP if ext_handler_i.should_perform_multi_config_op(extension): ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error.code, - operation=report_op, message=message) + operation=report_op, message=message, overwrite=False) if report: name = ext_handler_i.get_extension_full_name(extension) @@ -1425,7 +1426,7 @@ def set_extension_resource_limits(self): extension_name=extension_name, cpu_quota=resource_limits.get_extension_slice_cpu_quota()) CGroupConfigurator.get_instance().set_extension_services_cpu_memory_quota(resource_limits.get_service_list()) - def create_status_file(self, extension, status, code, operation, message, overwrite=False): + def create_status_file(self, extension, status, code, operation, message, overwrite): # Create status file for specified extension. If overwrite is true, overwrite any existing status file. If # false, create a status file only if it does not already exist. 
_, status_path = self.get_status_file_path(extension) diff --git a/azurelinuxagent/ga/policy/policy_engine.py b/azurelinuxagent/ga/policy/policy_engine.py index a2125d134..347e7887e 100644 --- a/azurelinuxagent/ga/policy/policy_engine.py +++ b/azurelinuxagent/ga/policy/policy_engine.py @@ -54,6 +54,7 @@ def __init__(self): if not self.policy_enforcement_enabled: return + _PolicyEngine._log_policy_event("Policy enforcement is enabled.") self._policy = self._parse_policy(self.__read_policy()) @staticmethod @@ -91,8 +92,10 @@ def __read_policy(): with open(conf.get_policy_file_path(), 'r') as f: try: contents = f.read() + # TODO: Consider copying the policy file contents to the history folder, and only log the policy locally + # in the case of policy-related failure. _PolicyEngine._log_policy_event( - "Policy enforcement is enabled. Enforcing policy using policy file found at '{0}'. File contents:\n{1}" + "Enforcing policy using policy file found at '{0}'. File contents:\n{1}" .format(conf.get_policy_file_path(), contents)) # json.loads will raise error if file contents are not a valid json (including empty file). custom_policy = json.loads(contents) diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index d2cc69dc2..299a0c3a8 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3570,7 +3570,7 @@ def test_should_fail_enable_if_extension_disallowed(self): "allowListedExtensionsOnly": True, } } - expected_msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified in the allowlist." + expected_msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified as an allowed extension." 
self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg) @@ -3608,7 +3608,7 @@ def test_should_fail_uninstall_if_extension_disallowed(self): "extensions": {} }, } - expected_msg = "failed to uninstall extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified in the allowlist." + expected_msg = "failed to uninstall extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified as an allowed extension." self._test_policy_case(policy=policy, op=ExtensionRequestedState.Uninstall, expected_status_code=ExtensionErrorCodes.PluginDisableProcessingFailed, expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg) @@ -3637,7 +3637,7 @@ def test_should_fail_enable_if_dependent_extension_disallowed(self): self._assert_handler_status(protocol.report_vm_status, expected_status="NotReady", expected_ext_count=0, version="1.0.0", expected_handler_name="OSTCExtensions.OtherExampleHandlerLinux", expected_msg=("failed to run extension 'OSTCExtensions.OtherExampleHandlerLinux' " - "because it is not specified in the allowlist.")) + "because it is not specified as an allowed extension.")) self._assert_handler_status(protocol.report_vm_status, expected_status="NotReady", expected_ext_count=0, version="1.0.0", expected_handler_name="OSTCExtensions.ExampleHandlerLinux", diff --git a/tests/ga/test_multi_config_extension.py b/tests/ga/test_multi_config_extension.py index 8ce38441e..c7ce20c73 100644 --- a/tests/ga/test_multi_config_extension.py +++ b/tests/ga/test_multi_config_extension.py @@ -676,7 +676,7 @@ def test_it_should_report_failed_status_for_extensions_disallowed_by_policy(self mc_handlers = self._assert_and_get_handler_status(aggregate_status=protocol.aggregate_status, handler_name="OSTCExtensions.ExampleHandlerLinux", expected_count=3, 
status="NotReady") - msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified in the allowlist" + msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified as an allowed extension" expected_extensions = { "firstExtension": {"status": ExtensionStatusValue.error, "seq_no": 1, "message": msg}, "secondExtension": {"status": ExtensionStatusValue.error, "seq_no": 2, "message": msg}, diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index 4d2b7f018..a4e36dc68 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -84,9 +84,9 @@ def _operation_should_succeed(self, operation, extension_case): def _operation_should_fail(self, operation, extension_case): log.info("") - log.info(f"Attempting to {operation} {extension_case.extension}, should fail fast.") if operation == "enable": try: + log.info(f"Attempting to enable {extension_case.extension}, should fail fast.") timeout = (6 * 60) # Fail fast. # VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter. if type(extension_case.extension) == VirtualMachineRunCommandClient: @@ -108,6 +108,7 @@ def _operation_should_fail(self, operation, extension_case): # Delete is a best effort operation and should not fail, so CRP will timeout instead of reporting the # appropriate error. We swallow the timeout error, and instead, assert that the extension is still in the # instance view and that the expected error is in the agent log to confirm that deletion failed. 
+ log.info(f"Attempting to delete {extension_case.extension}, should reach timeout.") delete_start_time = self._ssh_client.run_command("date '+%Y-%m-%d %T'").rstrip() try: # TODO: consider checking the agent's log asynchronously to confirm that the uninstall failed instead of @@ -171,6 +172,7 @@ def run(self): # Another e2e test may have left behind an extension we want to test here. Cleanup any leftovers so that they # do not affect the test results. log.info("Cleaning up existing extensions on the test VM [%s]", self._context.vm.name) + # TODO: Consider deleting only extensions used by this test instead of all extensions. self._context.vm.delete_all_extensions() # Enable policy via conf file @@ -199,8 +201,7 @@ def run(self): self._operation_should_succeed("enable", custom_script) self._operation_should_fail("enable", run_command) self._operation_should_fail("enable", run_command_2) - if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): - self._operation_should_fail("enable", azure_monitor) + self._operation_should_fail("enable", azure_monitor) # This policy tests the following scenarios: # - enable two instances of a multi-config extension (RunCommandHandler) when allowed by policy -> should succeed @@ -268,6 +269,7 @@ def run(self): # Cleanup after test: delete leftover extensions and disable policy enforcement in conf file. log.info("Disabling policy via conf file on the test VM [%s]", self._context.vm.name) self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) + # TODO: Consider deleting only extensions used by this test instead of all extensions. 
self._context.vm.delete_all_extensions() @@ -277,7 +279,7 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]: # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified in the allowlist. To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 # We intentionally block extensions with policy and expect this failure message { - 'message': r"Extension will not be processed: failed to .* extension .* because it is not specified in the allowlist" + 'message': r"Extension will not be processed: failed to .* extension .* because it is not specified as an allowed extension" } ] return ignore_rules From 0c9f1c735192734d65659696bf4a0e8098eb4c6f Mon Sep 17 00:00:00 2001 From: mgunnala Date: Wed, 18 Dec 2024 10:35:20 -0500 Subject: [PATCH 25/32] Pylint --- tests_e2e/tests/ext_policy/ext_policy.py | 30 +++++++++++++++++++ tests_e2e/tests/lib/virtual_machine_client.py | 4 +-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index a4e36dc68..555ec6853 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -137,6 +137,8 @@ def run(self): # The full CRP timeout period for extension operation failure is 90 minutes. For efficiency, we reduce the # timeout limit to 15 minutes here. We expect "delete" operations on disallowed VMs to reach timeout instead of # failing fast, because delete is a best effort operation by-design and should not fail. + log.info("*** Begin test setup") + log.info("Set CRP timeout to 15 minutes") self._context.vm.update({"extensionsTimeBudget": "PT15M"}) # Prepare no-config, single-config, and multi-config extension to test. 
Extensions with settings and extensions @@ -178,12 +180,19 @@ def run(self): # Enable policy via conf file log.info("Enabling policy via conf file on the test VM [%s]", self._context.vm.name) self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) + log.info("Test setup complete.") # This policy tests the following scenarios: # - enable a single-config extension (CustomScript) that is allowed by policy -> should succeed # - enable a no-config extension (AzureMonitorLinuxAgent) that is disallowed by policy -> should fail fast # - enable two instances of a multi-config extension (RunCommandHandler) that is disallowed by policy -> both should fail fast # (Note that CustomScript disallowed by policy is tested in a later test case.) + log.info("") + log.info("*** Begin test case 1") + log.info("This policy tests the following scenarios:") + log.info(" - enable a single-config extension (CustomScript) that is allowed by policy -> should succeed") + log.info(" - enable a no-config extension (AzureMonitorLinuxAgent) that is disallowed by policy -> should fail fast") + log.info(" - enable two instances of a multi-config extension (RunCommandHandler) that is disallowed by policy -> both should fail fast") policy = \ { "policyVersion": "0.1.0", @@ -208,6 +217,13 @@ def run(self): # - delete two instances of a multi-config extension (RunCommandHandler) when allowed by policy -> should succeed # - enable no-config extension (AzureMonitorLinuxAgent) when allowed by policy -> should succeed # - delete no-config extension (AzureMonitorLinuxAgent) when allowed by policy -> should succeed + log.info("") + log.info("*** Begin test case 2") + log.info("This policy tests the following scenarios:") + log.info(" - enable two instances of a multi-config extension (RunCommandHandler) when allowed by policy -> should succeed") + log.info(" - delete two instances of a multi-config extension (RunCommandHandler) when allowed by policy -> should 
succeed") + log.info(" - enable no-config extension (AzureMonitorLinuxAgent) when allowed by policy -> should succeed") + log.info(" - delete no-config extension (AzureMonitorLinuxAgent) when allowed by policy -> should succeed") policy = \ { "policyVersion": "0.1.0", @@ -232,6 +248,11 @@ def run(self): # This policy tests the following scenarios: # - disallow a previously-enabled single-config extension (CustomScript, then try to enable again -> should fail fast # - disallow a previously-enabled single-config extension (CustomScript), then try to delete -> should fail fast + log.info("") + log.info("*** Begin test case 3") + log.info("This policy tests the following scenarios:") + log.info(" - disallow a previously-enabled single-config extension (CustomScript, then try to enable again -> should fail fast") + log.info(" - disallow a previously-enabled single-config extension (CustomScript), then try to delete -> should reach timeout") policy = \ { "policyVersion": "0.1.0", @@ -251,6 +272,11 @@ def run(self): # This policy tests the following scenario: # - allow a previously-disallowed single-config extension (CustomScript), then delete -> should succeed # - allow a previously-disallowed single-config extension (CustomScript), then enable -> should succeed + log.info("") + log.info("*** Begin test case 4") + log.info("This policy tests the following scenario:") + log.info(" - allow a previously-disallowed single-config extension (CustomScript), then delete -> should succeed") + log.info(" - allow a previously-disallowed single-config extension (CustomScript), then enable -> should succeed") policy = \ { "policyVersion": "0.1.0", @@ -267,10 +293,14 @@ def run(self): self._operation_should_succeed("enable", custom_script) # Cleanup after test: delete leftover extensions and disable policy enforcement in conf file. 
+ log.info("") + log.info("*** Begin test cleanup") log.info("Disabling policy via conf file on the test VM [%s]", self._context.vm.name) self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) # TODO: Consider deleting only extensions used by this test instead of all extensions. self._context.vm.delete_all_extensions() + log.info("*** Test cleanup complete.") + diff --git a/tests_e2e/tests/lib/virtual_machine_client.py b/tests_e2e/tests/lib/virtual_machine_client.py index 42f3f33c2..68a71c3e9 100644 --- a/tests_e2e/tests/lib/virtual_machine_client.py +++ b/tests_e2e/tests/lib/virtual_machine_client.py @@ -117,10 +117,10 @@ def delete_all_extensions(self, timeout: int = AzureSdkClient._DEFAULT_TIMEOUT) ext_name = ext.name log.info(f"Deleting extension {ext_name} from {self.name}") self._execute_async_operation( - lambda: self._compute_client.virtual_machine_extensions.begin_delete( + lambda extension_name=ext_name: self._compute_client.virtual_machine_extensions.begin_delete( self.resource_group, self.name, - ext_name), + extension_name), operation_name=f"Delete extension {ext_name}", timeout=timeout) From c3aac0f9e95e7c88cf885c870dc87a64c69751b1 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Mon, 13 Jan 2025 12:33:51 -0500 Subject: [PATCH 26/32] Report status for single-config ext --- azurelinuxagent/ga/exthandlers.py | 47 +++---- tests/ga/test_extension.py | 10 +- tests_e2e/orchestrator/scripts/collect-logs | 3 +- .../ext_policy_with_dependencies.yml | 4 +- tests_e2e/tests/ext_policy/ext_policy.py | 10 +- .../ext_policy_with_dependencies.py | 68 ++++++---- .../ext_policy/policy_dependencies_cases.py | 13 +- ...ent_ext_policy-verify_operation_success.py | 117 ++++++++++++++++++ ...ent_ext_policy-verify_uninstall_success.py | 61 --------- 9 files changed, 210 insertions(+), 123 deletions(-) create mode 100755 tests_e2e/tests/scripts/agent_ext_policy-verify_operation_success.py delete mode 100755 
tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 2644df863..49eddcd85 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -297,6 +297,8 @@ class ExtHandlersHandler(object): def __init__(self, protocol): self.protocol = protocol self.ext_handlers = None + # Maintain a list of extensions that are disallowed, and always report extension status for disallowed extensions. + self.disallowed_ext_handlers = [] # The GoalState Aggregate status needs to report the last status of the GoalState. Since we only process # extensions on goal state change, we need to maintain its state. # Setting the status to None here. This would be overridden as soon as the first GoalState is processed @@ -519,6 +521,7 @@ def handle_ext_handlers(self, goal_state_id): # back for the skipped extensions. In order to propagate the status back to CRP, we will report status back # here with an error message. if not extensions_enabled: + self.disallowed_ext_handlers.append(ext_handler) agent_conf_file_path = get_osutil().agent_conf_file_path msg = "Extension will not be processed since extension processing is disabled. To enable extension " \ "processing, set Extensions.Enabled=y in '{0}'".format(agent_conf_file_path) @@ -745,28 +748,25 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True, message=message) - @staticmethod - def __report_policy_error(ext_handler_i, error_code, report_op, message, extension): - # TODO: Consider merging this function with __handle_and_report_ext_handler_errors() above, after investigating - # the impact of this change. - # - # If extension status is present, CRP will ignore handler status and report extension status. 
In the case of policy errors, - # extensions are not processed, so collect_ext_status() reports transitioning status on behalf of the extension. - # However, extensions blocked by policy should fail fast, so agent should write a .status file for policy failures. - # Note that __handle_and_report_ext_handler_errors() does not create the file for single-config extensions, but changing - # it will require additional testing/investigation. As a temporary workaround, this separate function was created - # to write a status file for single-config extensions. - - # Set handler status for all extensions (with and without settings). We report the same error at both the - # handler and extension status level. + def __report_policy_error(self, ext_handler_i, error_code, report_op, message, extension): + # TODO: __handle_and_report_ext_handler_errors() does not create a status file for single-config extensions, this + # function was created as a temporary workaround. Consider merging the two functions function after assessing its impact. + + # If extension status exists, CRP ignores handler status and reports extension status. In the case of policy errors, + # we write a .status file to force CRP to fail fast - the agent will otherwise report a transitioning status. + # - For extensions without settings or uninstall errors: report at the handler level. + # - For extensions with settings (install/enable errors): report at both handler and extension levels. + + # Keep a list of disallowed extensions so that report_ext_handler_status() can report status for them. + self.disallowed_ext_handlers.append(ext_handler_i.ext_handler) + + # Set handler status for all extensions (with and without settings). ext_handler_i.set_handler_status(message=message, code=error_code) - # Create status file for extensions with settings (single and multi config). - # If status file already exists, overwrite it. 
If an extension was previously reporting status and is now - # blocked by a policy error, we should report the policy error. - if extension is not None: - # TODO: if extension is reporting a heartbeat, it overwrites status. Consider overwriting heartbeat, if - # it exists. + # For extensions with settings (install/enable errors), also update extension-level status. + # Overwrite any existing status file to reflect policy failures accurately. + if extension is not None and ext_handler_i.ext_handler.state == ExtensionRequestedState.Enabled: + # TODO: if extension is reporting heartbeat, it overwrites status. Consider overwriting heartbeat here ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error_code, operation=report_op, message=message, overwrite=True) @@ -1068,12 +1068,13 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed): handler_state = ext_handler_i.get_handler_state() ext_handler_statuses = [] + ext_disallowed = ext_handler in self.disallowed_ext_handlers # For MultiConfig, we need to report status per extension even for Handler level failures. # If we have HandlerStatus for a MultiConfig handler and GS is requesting for it, we would report status per # extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc) - # We also need to report extension status for an uninstalled handler if extensions are disabled, because CRP - # waits for extension runtime status before failing the extension operation. - if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled(): + # We also need to report extension status for an uninstalled handler if the extension is disallowed (due to + # policy failure, extensions disabled, etc.) because CRP waits for extension runtime status before failing the operation. 
+ if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or ext_disallowed: # Since we require reading the Manifest for reading the heartbeat, this would fail if HandlerManifest not found. # Only try to read heartbeat if HandlerState != NotInstalled. diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index 299a0c3a8..b7d830ccf 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3572,7 +3572,7 @@ def test_should_fail_enable_if_extension_disallowed(self): } expected_msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified as an allowed extension." self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, - expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg) + expected_handler_status='NotReady', expected_ext_count=1, expected_status_msg=expected_msg) def test_should_fail_enable_for_invalid_policy(self): policy = \ @@ -3584,7 +3584,7 @@ def test_should_fail_enable_for_invalid_policy(self): } expected_msg = "attribute 'extensionPolicies.allowListedExtensionsOnly'; must be 'boolean'" self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, - expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg) + expected_handler_status='NotReady', expected_ext_count=1, expected_status_msg=expected_msg) def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self): policy = \ @@ -3596,7 +3596,7 @@ def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self): expected_msg = "Extension will not be processed: mock exception" self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed, - 
expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg) + expected_handler_status='NotReady', expected_ext_count=1, expected_status_msg=expected_msg) def test_should_fail_uninstall_if_extension_disallowed(self): policy = \ @@ -3610,7 +3610,7 @@ def test_should_fail_uninstall_if_extension_disallowed(self): } expected_msg = "failed to uninstall extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified as an allowed extension." self._test_policy_case(policy=policy, op=ExtensionRequestedState.Uninstall, expected_status_code=ExtensionErrorCodes.PluginDisableProcessingFailed, - expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg) + expected_handler_status='NotReady', expected_ext_count=1, expected_status_msg=expected_msg) def test_should_fail_enable_if_dependent_extension_disallowed(self): self._create_policy_file({ @@ -3634,7 +3634,7 @@ def test_should_fail_enable_if_dependent_extension_disallowed(self): # OtherExampleHandlerLinux should be disallowed by policy, ExampleHandlerLinux should be skipped because # dependent extension failed - self._assert_handler_status(protocol.report_vm_status, expected_status="NotReady", expected_ext_count=0, + self._assert_handler_status(protocol.report_vm_status, expected_status="NotReady", expected_ext_count=1, version="1.0.0", expected_handler_name="OSTCExtensions.OtherExampleHandlerLinux", expected_msg=("failed to run extension 'OSTCExtensions.OtherExampleHandlerLinux' " "because it is not specified as an allowed extension.")) diff --git a/tests_e2e/orchestrator/scripts/collect-logs b/tests_e2e/orchestrator/scripts/collect-logs index c221288a1..85ef29ab4 100755 --- a/tests_e2e/orchestrator/scripts/collect-logs +++ b/tests_e2e/orchestrator/scripts/collect-logs @@ -19,7 +19,8 @@ tar --exclude='journal/*' --exclude='omsbundle' --exclude='omsagent' --exclude=' -czf "$logs_file_name" \ /var/log \ /var/lib/waagent/ \ - $waagent_conf + $waagent_conf 
\ + /etc/waagent_policy.json set -euxo pipefail diff --git a/tests_e2e/test_suites/ext_policy_with_dependencies.yml b/tests_e2e/test_suites/ext_policy_with_dependencies.yml index 0c9fe8b18..a44453b52 100644 --- a/tests_e2e/test_suites/ext_policy_with_dependencies.yml +++ b/tests_e2e/test_suites/ext_policy_with_dependencies.yml @@ -6,4 +6,6 @@ tests: - "ext_policy/ext_policy_with_dependencies.py" images: "endorsed" executes_on_scale_set: true -owns_vm: false \ No newline at end of file +owns_vm: false +skip_on_images: + - "alma_9" # TODO: Currently AlmaLinux is not available for scale sets; enable this image when it is available. \ No newline at end of file diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index 555ec6853..35345d21c 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -18,6 +18,7 @@ # import json import uuid +import os from typing import List, Dict, Any from assertpy import assert_that, fail @@ -44,9 +45,12 @@ def _create_policy_file(self, policy): """ Create policy json file and copy to /etc/waagent_policy.json on test machine. 
""" - with open("waagent_policy.json", mode='w') as policy_file: + unique_id = uuid.uuid4() + file_path = "/tmp/waagent_policy_{0}.json".format(unique_id) + with open(file_path, mode='w') as policy_file: json.dump(policy, policy_file, indent=4) policy_file.flush() + log.info("Policy file contents: {0}".format(json.dumps(policy, indent=4))) remote_path = "/tmp/waagent_policy.json" local_path = policy_file.name @@ -54,6 +58,7 @@ def _create_policy_file(self, policy): policy_file_final_dest = "/etc/waagent_policy.json" log.info("Copying policy file to test VM [%s]", self._context.vm.name) self._ssh_client.run_command(f"mv {remote_path} {policy_file_final_dest}", use_sudo=True) + os.remove(file_path) def _operation_should_succeed(self, operation, extension_case): log.info("") @@ -234,6 +239,7 @@ def run(self): } } self._create_policy_file(policy) + self._operation_should_succeed("enable", custom_script) # Update settings to force an update to the seq no run_command.settings = {'source': f"echo '{str(uuid.uuid4())}'"} run_command_2.settings = {'source': f"echo '{str(uuid.uuid4())}'"} @@ -316,4 +322,4 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]: if __name__ == "__main__": - ExtPolicy.run_from_command_line() + ExtPolicy.run_from_command_line() \ No newline at end of file diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py index f00982745..368e85fe8 100644 --- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -20,15 +20,14 @@ # # This test adds extensions with multiple dependencies to a VMSS and checks that extensions fail and report status # as expected when blocked by extension policy. -# Note that this test is currently disabled in daily automation due to status reporting errors that need further investigation. 
-# TODO: re-enable this test in daily automation after investigating test failures -# import copy import json import random import re +import os import uuid +import time from datetime import datetime from typing import List, Dict, Any @@ -57,20 +56,23 @@ def __init__(self, context: AgentVmTestContext): # Cases to test different dependency scenarios _test_cases = [ - _should_fail_single_config_depends_on_disallowed_no_config, _should_fail_single_config_depends_on_disallowed_single_config, + _should_fail_single_config_depends_on_disallowed_no_config, # TODO: RunCommandHandler is unable to be uninstalled properly, so these tests are currently disabled. Investigate the # issue and enable these 3 tests. # _should_fail_single_config_depends_on_disallowed_multi_config, # _should_fail_multi_config_depends_on_disallowed_single_config, # _should_fail_multi_config_depends_on_disallowed_no_config, - _should_succeed_single_config_depends_on_no_config, - _should_succeed_single_config_depends_on_single_config + _should_succeed_single_config_depends_on_single_config, + _should_succeed_single_config_depends_on_no_config ] @staticmethod def _create_policy_file(ssh_client, policy): - with open("waagent_policy.json", mode='w') as policy_file: + # Generate a unique file name to avoid conflicts with any other tests running in parallel. 
+ unique_id = uuid.uuid4() + file_path = "/tmp/waagent_policy_{0}.json".format(unique_id) + with open(file_path, mode='w') as policy_file: json.dump(policy, policy_file, indent=4) policy_file.flush() @@ -79,6 +81,7 @@ def _create_policy_file(ssh_client, policy): ssh_client.copy_to_node(local_path=local_path, remote_path=remote_path) policy_file_final_dest = "/etc/waagent_policy.json" ssh_client.run_command(f"mv {remote_path} {policy_file_final_dest}", use_sudo=True) + os.remove(file_path) def run(self): @@ -124,7 +127,7 @@ def run(self): for case in self._test_cases: log.info("") - log.info("Test case: {0}".format(case.__name__.replace('_', ' '))) + log.info("*** Test case: {0}".format(case.__name__.replace('_', ' '))) test_case_start = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip() if self._scenario_start == datetime.min: self._scenario_start = test_case_start @@ -189,11 +192,10 @@ def run(self): log.info("Extensions failed as expected. Expected errors: '{0}'. Actual errors: '{1}'.".format(expected_errors, e)) log.info("") - # After each test, clean up failed extensions to leave VMSS in a good state for the next test. - # If there are leftover failed extensions, CRP will attempt to uninstall them in the next test, but uninstall - # will be disallowed by policy. Since CRP waits for a 90 minute timeout for uninstall, the operation will - # timeout and fail without an appropriate error message, and the whole test case will fail. - # To clean up, we first update the policy to allow all, then remove the extensions. + # Clean up failed extensions to leave VMSS in a good state for the next test. CRP will attempt to uninstall + # leftover extensions in the next test, but uninstall will be disallowed and reach timeout unexpectedly. + # CRP also won't allow deletion of an extension that is dependent on another failed extension, so we first + # update policy to allow all, re-enable all extensions, and then delete them. 
log.info("Starting cleanup for test case...") allow_all_policy = \ { @@ -205,20 +207,41 @@ def run(self): for ssh_client in ssh_clients.values(): self._create_policy_file(ssh_client, allow_all_policy) + log.info("Trying to re-enable before deleting extensions...") + for ext in extensions: + ext["properties"].update({ + "forceUpdateTag": str(uuid.uuid4()) + }) + ext_template['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][ + 'extensions'] = extensions + enable_start_time = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip() + try: + rg_client.deploy_template(template=ext_template) + except Exception as err: + # Known issue - CRP returns a stale status for no-config extensions, because it does not wait for a new + # sequence number. AzureMonitorLinuxAgent is the only no-config extension we test. For this extension + # only, swallow the CRP error and check agent log instead to confirm that extensions were enabled + # successfully. + if VmExtensionIds.AzureMonitorLinuxAgent in deletion_order: + log.info("CRP returned error when re-enabling extensions after allowing. Checking agent log to see if enable succeeded. " + "Error: {0}".format(err)) + time.sleep(60) # Give extensions some time to finish processing. + extension_list = ' '.join([str(e) for e in deletion_order]) + command = (f"agent_ext_policy-verify_operation_success.py --after-timestamp '{enable_start_time}' " + f"--operation 'enable' --extension-list {extension_list}") + for ssh_client in ssh_clients.values(): + ssh_client.run_command(command, use_sudo=True) + log.info("Agent reported successful status for all extensions, enable succeeded.") + else: + fail("Failed to re-enable extensions after allowing with policy.") + + # Delete all extensions in dependency order. 
for ext_to_delete in deletion_order: ext_name_to_delete = ext_to_delete.type try: self._context.vmss.delete_extension(ext_name_to_delete) except Exception as crp_err: - # Known issue - CRP returns stale status in cases of dependency failures. Even if the deletion succeeds, - # CRP may return a failure. We swallow the error and instead, verify that the agent does not report - # status for the uninstalled extension. - log.info("CRP returned an error for deletion operation, may be a false error. Checking agent status file to determine if operation succeeded. Exception: {0}".format(crp_err)) - try: - for ssh_client in ssh_clients.values(): - ssh_client.run_command(f"agent_ext_policy-verify_uninstall_success.py --extension-name '{ext_to_delete}'") - except Exception as agent_err: - fail("Unable to successfully uninstall extension {0}. Exception: {1}".format(ext_name_to_delete, agent_err)) + fail("Failed to uninstall extension {0}. Exception: {1}".format(ext_name_to_delete, crp_err)) log.info("Successfully uninstalled extension {0}".format(ext_name_to_delete)) log.info("Successfully removed all extensions from VMSS") @@ -228,7 +251,6 @@ def run(self): for ssh_client in ssh_clients.values(): ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) - def get_ignore_errors_before_timestamp(self) -> datetime: # Ignore errors in the agent log before the first test case starts if self._scenario_start == datetime.min: diff --git a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py index 5de3136d4..55c7daf95 100644 --- a/tests_e2e/tests/ext_policy/policy_dependencies_cases.py +++ b/tests_e2e/tests/ext_policy/policy_dependencies_cases.py @@ -58,7 +58,7 @@ def _should_fail_single_config_depends_on_disallowed_single_config(): } } expected_errors = [ - "Extension will not be processed: failed to run extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because it is not specified in the 
allowlist", + "Extension will not be processed: failed to run extension 'Microsoft.OSTCExtensions.VMAccessForLinux' because it is not specified as an allowed extension", "'CustomScript' is marked as failed since it depends upon the VM Extension 'VMAccessForLinux' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.VmAccess] @@ -69,7 +69,6 @@ def _should_fail_single_config_depends_on_disallowed_no_config(): template = [ __get_extension_template(VmExtensionIds.AzureMonitorLinuxAgent), __get_extension_template(VmExtensionIds.CustomScript, depends_on=["AzureMonitorLinuxAgent"]) - ] policy = \ { @@ -84,7 +83,7 @@ def _should_fail_single_config_depends_on_disallowed_no_config(): } } expected_errors = [ - "Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified in the allowlist", + "Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified as an allowed extension", "'CustomScript' is marked as failed since it depends upon the VM Extension 'AzureMonitorLinuxAgent' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.AzureMonitorLinuxAgent] @@ -109,7 +108,7 @@ def _should_fail_single_config_depends_on_disallowed_multi_config(): } } expected_errors = [ - "Extension will not be processed: failed to run extension 'Microsoft.CPlat.Core.RunCommandHandlerLinux' because it is not specified in the allowlist", + "Extension will not be processed: failed to run extension 'Microsoft.CPlat.Core.RunCommandHandlerLinux' because it is not specified as an allowed extension", "'CustomScript' is marked as failed since it depends upon the VM Extension 'RunCommandHandlerLinux' which has failed" ] deletion_order = [VmExtensionIds.CustomScript, VmExtensionIds.RunCommandHandler] @@ -134,7 +133,7 @@ def _should_fail_multi_config_depends_on_disallowed_single_config(): } } expected_errors = 
[ - "Extension will not be processed: failed to run extension 'Microsoft.Azure.Extensions.CustomScript' because it is not specified in the allowlist", + "Extension will not be processed: failed to run extension 'Microsoft.Azure.Extensions.CustomScript' because it is not specified as an allowed extension", "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Extensions.CustomScript failed'." ] deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.CustomScript] @@ -159,7 +158,7 @@ def _should_fail_multi_config_depends_on_disallowed_no_config(): } } expected_errors = [ - "Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified in the allowlist", + "Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified as an allowed extension", "VM has reported a failure when processing extension 'RunCommandHandlerLinux' (publisher 'Microsoft.CPlat.Core' and type 'RunCommandHandlerLinux'). Error message: 'Skipping processing of extensions since execution of dependent extension Microsoft.Azure.Monitor.AzureMonitorLinuxAgent failed'." 
] deletion_order = [VmExtensionIds.RunCommandHandler, VmExtensionIds.AzureMonitorLinuxAgent] @@ -210,4 +209,4 @@ def _should_succeed_single_config_depends_on_single_config(): } expected_errors = [] deletion_order = [VmExtensionIds.VmAccess, VmExtensionIds.CustomScript] - return policy, template, expected_errors, deletion_order + return policy, template, expected_errors, deletion_order \ No newline at end of file diff --git a/tests_e2e/tests/scripts/agent_ext_policy-verify_operation_success.py b/tests_e2e/tests/scripts/agent_ext_policy-verify_operation_success.py new file mode 100755 index 000000000..2e382cebe --- /dev/null +++ b/tests_e2e/tests/scripts/agent_ext_policy-verify_operation_success.py @@ -0,0 +1,117 @@ +#!/usr/bin/env pypy3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +import argparse + +from assertpy import fail +from pathlib import Path +from datetime import datetime +import time +import re + +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.agent_log import AgentLog + +# This script verifies the success of an operation using the agent log. 
+# Enable: check that the agent has reported a successful status for the specified list of extensions +# Uninstall: check that the agent has not reported any status for the specified list of extensions +# +# Usage: +# agent_ext_policy-verify_operation_success.py --extension-list "A" "B" --operation "enable" --after-timestamp "2025-01-13 11:21:40" + + +def __get_last_reported_status(after_timestamp): + # Get last reported status from the agent log file. If after_timestamp is specified, return only the status reported + # after that timestamp, and raise error if not found after 2 tries. + agent_log = AgentLog(Path('/var/log/waagent.log')) + + retries = 2 + for attempt in range(retries): + phrase = "All extensions in the goal state have reached a terminal state" + latest_status = None + for record in agent_log.read(): + if record.timestamp < after_timestamp: + continue + + if phrase in record.message: + if latest_status is None: + latest_status = record + else: + if latest_status.timestamp < record.timestamp: + latest_status = record + + if latest_status is not None: + log.info("Latest status: {0}".format(latest_status.message)) + return latest_status + + log.info("Unable to find handler status in agent log on attempt {0}. Retrying...".format(attempt + 1)) + time.sleep(30) + + return None + + +def check_extension_reported_successful_status(status_message, ext_name: str): + # Extract extension statuses from the agent record + pattern = r"\(u?'(" + re.escape(ext_name) + r")', u?'([^']+)'\)" + match = re.search(pattern, status_message) + if match is None: + fail("Agent did not report any status for extension {0}, enable failed.".format(ext_name)) + else: + status_code = match.group(2).lower() + log.info("Status code: {0}".format(status_code)) + if status_code not in ["success", "ready"]: + fail("Agent did not report a successful status for extension {0}, enable failed. 
Status: {1}".format(ext_name, status_code)) + else: + log.info("Agent reported a successful status for extension {0}, enable succeeded.".format(ext_name)) + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('--extension-list', dest='extension_list', required=True, nargs='+', + help='Extension name(s) to process. Provide a single name or a space-separated list of names.') + + parser.add_argument('--operation', dest='operation', required=True, choices=['enable', 'uninstall']) + parser.add_argument("--after-timestamp", dest='after_timestamp', required=False) + args = parser.parse_args() + + if args.after_timestamp is not None: + after_datetime = datetime.strptime(args.after_timestamp, '%Y-%m-%d %H:%M:%S') + else: + after_datetime = datetime.min + + status = __get_last_reported_status(after_datetime) + if status is None: + fail("Unable to find extension status in agent log.") + + if args.operation == "enable": + log.info("Checking agent status file to verify that extensions were enabled successfully.") + for extension in args.extension_list: + check_extension_reported_successful_status(status.message, extension) + + elif args.operation == "uninstall": + log.info("Checking agent log to verify that status is not reported for uninstalled extensions.") + for extension in args.extension_list: + if extension in status.message: + fail("Agent reported status for extension {0}, uninstall failed.".format(extension)) + else: + log.info("Agent did not report status for extension {0}, uninstall succeeded.".format(extension)) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py b/tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py deleted file mode 100755 index 8bcf725ba..000000000 --- a/tests_e2e/tests/scripts/agent_ext_policy-verify_uninstall_success.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env pypy3 - -# Microsoft Azure Linux Agent -# -# 
Copyright 2018 Microsoft Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Verify if the agent reported update status to CRP via status file -# -import argparse -import glob -import json - -from assertpy import fail - -from tests_e2e.tests.lib.logging import log - - -def extension_found_in_agent_status_file(ext_name: str) -> bool: - # Check if the provided extension name is present in the agent status file, under handlerAggregateStatus. - # If the name is not present, the uninstall operation was successful. - # TODO: a new status file is created for each goal state. To make this a reliable check, make sure to correlate - # the status file to the goal state being checked. Alternatively, consider checking waagent.log. 
- agent_status_file = "/var/lib/waagent/history/*/waagent_status.json" - file_paths = glob.glob(agent_status_file, recursive=True) - for file in file_paths: - with open(file, 'r') as f: - data = json.load(f) - log.info("Agent status file (%s): %s", file, data) - handler_status = data["aggregateStatus"]["handlerAggregateStatus"] - if any(handler["handlerName"].lower() == ext_name.lower() for handler in handler_status): - return True - return False - - -def main(): - - parser = argparse.ArgumentParser() - parser.add_argument('--extension-name', dest='name', required=True) - args = parser.parse_args() - - log.info("Checking agent status file to verify that the uninstalled extension is not present in reported status") - if extension_found_in_agent_status_file(args.name): - fail("Handler status was found in the status file for extension {0}, uninstall failed.".format(args.name)) - else: - log.info("Handler status was not found in the status file for extension {0}, uninstall succeeded.".format(args.name)) - - -if __name__ == "__main__": - main() From be4264051758897762137315756d88c83bfb5d5d Mon Sep 17 00:00:00 2001 From: mgunnala Date: Mon, 13 Jan 2025 15:01:36 -0500 Subject: [PATCH 27/32] Small e2e test cleanups --- tests_e2e/orchestrator/runbook.yml | 3 +-- tests_e2e/tests/ext_policy/ext_policy.py | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index b64c66d82..c7bcaed61 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -46,8 +46,7 @@ variable: - ext_cgroups - extensions_disabled - ext_policy -# TODO: re-enable ext_policy_with_dependencies after investigating status reporting failures -# - ext_policy_with_dependencies + - ext_policy_with_dependencies - ext_sequencing - ext_telemetry_pipeline - fips diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index 35345d21c..f78f6706b 100644 --- 
a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -215,7 +215,10 @@ def run(self): self._operation_should_succeed("enable", custom_script) self._operation_should_fail("enable", run_command) self._operation_should_fail("enable", run_command_2) - self._operation_should_fail("enable", azure_monitor) + # Only call enable on AMA if supported. The agent will try to re-enable AMA as a part of the next goal state, when + # policy is changed to allow all. This will cause errors on an unsupported distro. + if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): + self._operation_should_fail("enable", azure_monitor) # This policy tests the following scenarios: # - enable two instances of a multi-config extension (RunCommandHandler) when allowed by policy -> should succeed @@ -251,6 +254,7 @@ def run(self): self._operation_should_succeed("enable", azure_monitor) self._operation_should_succeed("delete", azure_monitor) + # This policy tests the following scenarios: # - disallow a previously-enabled single-config extension (CustomScript, then try to enable again -> should fail fast # - disallow a previously-enabled single-config extension (CustomScript), then try to delete -> should fail fast From 7069a0b6f5449dad9c28c57d64413779a184a8b2 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Wed, 15 Jan 2025 15:35:21 -0500 Subject: [PATCH 28/32] Address agent code comments --- azurelinuxagent/ga/exthandlers.py | 64 +++++++++---------- tests/ga/test_extension.py | 8 +-- .../extensions_disabled.py | 2 +- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 49eddcd85..941ecc1d0 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -87,12 +87,12 @@ # This is the default sequence number we use when there are no settings available for Handlers _DEFAULT_SEQ_NO = "0" -# For 
policy-related errors, this mapping is used to generate user-friendly error messages and determine the appropriate -# terminal error code based on the blocked operation. +# For extension disallowed errors (e.g. blocked by policy, extensions disabled), this mapping is used to generate +# user-friendly error messages and determine the appropriate terminal error code based on the blocked operation. # Format: {: (, )} # - The first element of the tuple is a user-friendly operation name included in error messages. # - The second element of the tuple is the CRP terminal error code for the operation. -_POLICY_ERROR_MAP = \ +_EXT_DISALLOWED_ERROR_MAP = \ { ExtensionRequestedState.Enabled: ('run', ExtensionErrorCodes.PluginEnableProcessingFailed), # Note: currently, when uninstall is requested for an extension, CRP polls until the agent does not @@ -297,8 +297,9 @@ class ExtHandlersHandler(object): def __init__(self, protocol): self.protocol = protocol self.ext_handlers = None - # Maintain a list of extensions that are disallowed, and always report extension status for disallowed extensions. - self.disallowed_ext_handlers = [] + # Maintain a list of extension handler objects that are disallowed (e.g. blocked by policy, extensions disabled, etc.). + # Extension status is always reported for the extensions in this list. List is reset for each goal state. + self.__disallowed_ext_handlers = [] # The GoalState Aggregate status needs to report the last status of the GoalState. Since we only process # extensions on goal state change, we need to maintain its state. # Setting the status to None here. 
This would be overridden as soon as the first GoalState is processed @@ -513,39 +514,35 @@ def handle_ext_handlers(self, goal_state_id): except Exception as ex: policy_error = ex + self.__disallowed_ext_handlers = [] + for extension, ext_handler in all_extensions: handler_i = ExtHandlerInstance(ext_handler, self.protocol, extension=extension) + # Get user-friendly operation name and terminal error code to use in status messages if extension is disallowed + operation, error_code = _EXT_DISALLOWED_ERROR_MAP.get(ext_handler.state) + # In case of extensions disabled, we skip processing extensions. But CRP is still waiting for some status # back for the skipped extensions. In order to propagate the status back to CRP, we will report status back # here with an error message. if not extensions_enabled: - self.disallowed_ext_handlers.append(ext_handler) - agent_conf_file_path = get_osutil().agent_conf_file_path - msg = "Extension will not be processed since extension processing is disabled. To enable extension " \ - "processing, set Extensions.Enabled=y in '{0}'".format(agent_conf_file_path) ext_full_name = handler_i.get_extension_full_name(extension) - logger.info('') - logger.info("{0}: {1}".format(ext_full_name, msg)) - add_event(op=WALAEventOperation.ExtensionProcessing, message="{0}: {1}".format(ext_full_name, msg)) - handler_i.set_handler_status(status=ExtHandlerStatusValue.not_ready, message=msg, code=-1) - handler_i.create_status_file(extension, - status=ExtensionStatusValue.error, - code=-1, - operation=handler_i.operation, - message=msg, - overwrite=False) + agent_conf_file_path = get_osutil().agent_conf_file_path + msg = "Extension '{0}' will not be processed since extension processing is disabled. 
To enable extension " \ + "processing, set Extensions.Enabled=y in '{1}'".format(ext_full_name, agent_conf_file_path) + # logger.info(msg) + self.__handle_ext_disallowed_error(handler_i, error_code, report_op=WALAEventOperation.ExtensionProcessing, + message=msg, extension=extension) continue # If an error was thrown during policy engine initialization, skip further processing of the extension. # CRP is still waiting for status, so we report error status here. - operation, error_code = _POLICY_ERROR_MAP.get(ext_handler.state) if policy_error is not None: msg = "Extension will not be processed: {0}".format(ustr(policy_error)) - self.__report_policy_error(ext_handler_i=handler_i, error_code=error_code, - report_op=WALAEventOperation.ExtensionPolicy, message=msg, - extension=extension) + self.__handle_ext_disallowed_error(ext_handler_i=handler_i, error_code=error_code, + report_op=WALAEventOperation.ExtensionPolicy, message=msg, + extension=extension) continue # In case of depends-on errors, we skip processing extensions if there was an error processing dependent extensions. @@ -579,8 +576,8 @@ def handle_ext_handlers(self, goal_state_id): "Extension will not be processed: failed to {0} extension '{1}' because it is not specified " "as an allowed extension. To {0}, add the extension to the list of allowed extensions in the policy file ('{2}')." 
).format(operation, ext_handler.name, conf.get_policy_file_path()) - self.__report_policy_error(handler_i, error_code, report_op=WALAEventOperation.ExtensionPolicy, - message=msg, extension=extension) + self.__handle_ext_disallowed_error(handler_i, error_code, report_op=WALAEventOperation.ExtensionPolicy, + message=msg, extension=extension) extension_success = False else: extension_success = self.handle_ext_handler(handler_i, extension, goal_state_id) @@ -748,27 +745,28 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True, message=message) - def __report_policy_error(self, ext_handler_i, error_code, report_op, message, extension): - # TODO: __handle_and_report_ext_handler_errors() does not create a status file for single-config extensions, this - # function was created as a temporary workaround. Consider merging the two functions function after assessing its impact. - + def __handle_ext_disallowed_error(self, ext_handler_i, error_code, report_op, message, extension): + # Report error for disallowed extensions (extensions blocked by policy or extensions disabled via config). # If extension status exists, CRP ignores handler status and reports extension status. In the case of policy errors, # we write a .status file to force CRP to fail fast - the agent will otherwise report a transitioning status. # - For extensions without settings or uninstall errors: report at the handler level. # - For extensions with settings (install/enable errors): report at both handler and extension levels. + # + # TODO: __handle_and_report_ext_handler_errors() does not create a status file for single-config extensions, this + # function was created as a temporary workaround. Consider merging the two functions function after assessing the impact. # Keep a list of disallowed extensions so that report_ext_handler_status() can report status for them. 
- self.disallowed_ext_handlers.append(ext_handler_i.ext_handler) + self.__disallowed_ext_handlers.append(ext_handler_i.ext_handler) # Set handler status for all extensions (with and without settings). - ext_handler_i.set_handler_status(message=message, code=error_code) + ext_handler_i.set_handler_status(status=ExtHandlerStatusValue.not_ready, message=message, code=error_code) # For extensions with settings (install/enable errors), also update extension-level status. # Overwrite any existing status file to reflect policy failures accurately. if extension is not None and ext_handler_i.ext_handler.state == ExtensionRequestedState.Enabled: # TODO: if extension is reporting heartbeat, it overwrites status. Consider overwriting heartbeat here ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error_code, - operation=report_op, message=message, overwrite=True) + operation=ext_handler_i.operation, message=message, overwrite=True) name = ext_handler_i.get_extension_full_name(extension) handler_version = ext_handler_i.ext_handler.version @@ -1068,7 +1066,7 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed): handler_state = ext_handler_i.get_handler_state() ext_handler_statuses = [] - ext_disallowed = ext_handler in self.disallowed_ext_handlers + ext_disallowed = ext_handler in self.__disallowed_ext_handlers # For MultiConfig, we need to report status per extension even for Handler level failures. 
# If we have HandlerStatus for a MultiConfig handler and GS is requesting for it, we would report status per # extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc) diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index b7d830ccf..c1afd6bcf 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -1652,13 +1652,13 @@ def test_extensions_disabled(self, _, *args): vm_status = args[0] self.assertEqual(1, len(vm_status.vmAgent.extensionHandlers)) exthandler = vm_status.vmAgent.extensionHandlers[0] - self.assertEqual(-1, exthandler.code) + self.assertEqual(ExtensionErrorCodes.PluginEnableProcessingFailed, exthandler.code) self.assertEqual('NotReady', exthandler.status) - self.assertEqual("Extension will not be processed since extension processing is disabled. To enable extension processing, set Extensions.Enabled=y in '/etc/waagent.conf'", exthandler.message) + self.assertEqual("Extension 'OSTCExtensions.ExampleHandlerLinux' will not be processed since extension processing is disabled. To enable extension processing, set Extensions.Enabled=y in '/etc/waagent.conf'", exthandler.message) ext_status = exthandler.extension_status - self.assertEqual(-1, ext_status.code) + self.assertEqual(ExtensionErrorCodes.PluginEnableProcessingFailed, ext_status.code) self.assertEqual('error', ext_status.status) - self.assertEqual("Extension will not be processed since extension processing is disabled. To enable extension processing, set Extensions.Enabled=y in '/etc/waagent.conf'", ext_status.message) + self.assertEqual("Extension 'OSTCExtensions.ExampleHandlerLinux' will not be processed since extension processing is disabled. 
To enable extension processing, set Extensions.Enabled=y in '/etc/waagent.conf'", ext_status.message) def test_extensions_deleted(self, *args): # Ensure initial enable is successful diff --git a/tests_e2e/tests/extensions_disabled/extensions_disabled.py b/tests_e2e/tests/extensions_disabled/extensions_disabled.py index 002d83357..1111f7e6a 100755 --- a/tests_e2e/tests/extensions_disabled/extensions_disabled.py +++ b/tests_e2e/tests/extensions_disabled/extensions_disabled.py @@ -88,7 +88,7 @@ def run(self): assert_that("VMExtensionProvisioningError" in str(error)) \ .described_as(f"Expected a VMExtensionProvisioningError error, but actual error was: {error}") \ .is_true() - assert_that("Extension will not be processed since extension processing is disabled" in str(error)) \ + assert_that("will not be processed since extension processing is disabled" in str(error)) \ .described_as( f"Error message should communicate that extension will not be processed, but actual error " f"was: {error}").is_true() From ce5cf20845535f673dc27076a0108829d3a82a12 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Fri, 17 Jan 2025 14:16:51 -0500 Subject: [PATCH 29/32] Address test comments --- azurelinuxagent/ga/exthandlers.py | 13 ++-- .../ext_policy_with_dependencies.yml | 4 +- tests_e2e/tests/ext_policy/ext_policy.py | 68 +++++++++---------- .../ext_policy_with_dependencies.py | 31 ++++++--- .../extensions_disabled.py | 11 ++- ..._ext_policy-verify_operation_disallowed.py | 64 +++++++++++++++++ 6 files changed, 140 insertions(+), 51 deletions(-) create mode 100755 tests_e2e/tests/scripts/agent_ext_policy-verify_operation_disallowed.py diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 941ecc1d0..7e9b74633 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -531,7 +531,6 @@ def handle_ext_handlers(self, goal_state_id): agent_conf_file_path = get_osutil().agent_conf_file_path msg = "Extension '{0}' will not be 
processed since extension processing is disabled. To enable extension " \ "processing, set Extensions.Enabled=y in '{1}'".format(ext_full_name, agent_conf_file_path) - # logger.info(msg) self.__handle_ext_disallowed_error(handler_i, error_code, report_op=WALAEventOperation.ExtensionProcessing, message=msg, extension=extension) continue @@ -746,19 +745,23 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess message=message) def __handle_ext_disallowed_error(self, ext_handler_i, error_code, report_op, message, extension): - # Report error for disallowed extensions (extensions blocked by policy or extensions disabled via config). - # If extension status exists, CRP ignores handler status and reports extension status. In the case of policy errors, - # we write a .status file to force CRP to fail fast - the agent will otherwise report a transitioning status. + # Handle and report error for disallowed extensions (extensions blocked by policy or disabled via config). + # Note: CRP may pick up stale statuses when not polling for sequence number. If extension status exists, CRP + # prioritizes it over handler status and polls for seq no. To work around the issue, we report extension status + # for extensions with settings: # - For extensions without settings or uninstall errors: report at the handler level. # - For extensions with settings (install/enable errors): report at both handler and extension levels. # # TODO: __handle_and_report_ext_handler_errors() does not create a status file for single-config extensions, this - # function was created as a temporary workaround. Consider merging the two functions function after assessing the impact. + # function is a temporary workaround. Consider merging the two functions function after assessing the impact. # Keep a list of disallowed extensions so that report_ext_handler_status() can report status for them. 
self.__disallowed_ext_handlers.append(ext_handler_i.ext_handler) # Set handler status for all extensions (with and without settings). + # Install errors should always be reported at the handler level. While install errors for any extension should + # ideally be reported ONLY at the handler level, we also report at the ext status level for extensions with settings + # as a workaround for the stale status issue. ext_handler_i.set_handler_status(status=ExtHandlerStatusValue.not_ready, message=message, code=error_code) # For extensions with settings (install/enable errors), also update extension-level status. diff --git a/tests_e2e/test_suites/ext_policy_with_dependencies.yml b/tests_e2e/test_suites/ext_policy_with_dependencies.yml index a44453b52..51ba959be 100644 --- a/tests_e2e/test_suites/ext_policy_with_dependencies.yml +++ b/tests_e2e/test_suites/ext_policy_with_dependencies.yml @@ -1,6 +1,6 @@ # -# The test suite verifies that disallowed extensions are not processed, but the agent should still report status. -# +# The test suite verifies that disallowed extensions and any extensions dependent on disallowed extensions are not processed, +# but the agent should still report status. name: "ExtPolicyWithDependencies" tests: - "ext_policy/ext_policy_with_dependencies.py" diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index f78f6706b..b2ef7bc8d 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -17,6 +17,7 @@ # limitations under the License. 
# import json +import re import uuid import os from typing import List, Dict, Any @@ -62,7 +63,7 @@ def _create_policy_file(self, policy): def _operation_should_succeed(self, operation, extension_case): log.info("") - log.info(f"Attempting to {operation} {extension_case.extension.__str__()}, expected to succeed ") + log.info(f"Attempting to {operation} {extension_case.extension.__str__()}, expected to succeed") # Attempt operation. If enabling, assert that the extension is present in instance view. # If deleting, assert that the extension is not present in instance view. try: @@ -91,10 +92,10 @@ def _operation_should_fail(self, operation, extension_case): log.info("") if operation == "enable": try: - log.info(f"Attempting to enable {extension_case.extension}, should fail fast.") + log.info(f"Attempting to enable {extension_case.extension}, should fail fast due to policy.") timeout = (6 * 60) # Fail fast. # VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter. 
- if type(extension_case.extension) == VirtualMachineRunCommandClient: + if isinstance(extension_case.extension, VirtualMachineRunCommandClient): extension_case.extension.enable(settings=extension_case.settings, timeout=timeout) else: extension_case.extension.enable(settings=extension_case.settings, force_update=True, @@ -103,27 +104,32 @@ def _operation_should_fail(self, operation, extension_case): f"The agent should have reported an error trying to {operation} {extension_case.extension} " f"because the extension is disallowed by policy.") except Exception as error: - expected_msg = "Extension will not be processed: failed to run extension" - assert_that(expected_msg in str(error)) \ + # We exclude the extension name from regex because CRP sometimes installs test extensions with different + # names (ex: Microsoft.Azure.Extensions.Edp.RunCommandHandlerLinuxTest instead of Microsoft.CPlat.Core.RunCommandHandlerLinux) + pattern = r".*Extension will not be processed: failed to run extension .* because it is not specified as an allowed extension.*" + assert_that(re.search(pattern, str(error))) \ .described_as( - f"Error message is expected to contain '{expected_msg}', but actual error message was '{error}'").is_true() - log.info(f"{extension_case.extension} {operation} failed as expected") + f"Error message is expected to contain '{pattern}', but actual error message was '{error}'").is_not_none() + log.info(f"{extension_case.extension} {operation} failed as expected due to policy") elif operation == "delete": - # Delete is a best effort operation and should not fail, so CRP will timeout instead of reporting the - # appropriate error. We swallow the timeout error, and instead, assert that the extension is still in the - # instance view and that the expected error is in the agent log to confirm that deletion failed. 
- log.info(f"Attempting to delete {extension_case.extension}, should reach timeout.") + # For delete operations, CRP polls until the agent stops reporting status for the extension, or until timeout is + # reached, because delete is a best-effort operation and is not expected to fail. However, when delete is called + # on a disallowed extension, the agent reports failure status, so CRP will continue to poll until timeout. + # For efficiency, we asynchronously check the instance view and agent log to confirm that deletion failed, + # and do not wait for a response from CRP. + # + # Note: CRP will only allow another 'delete' call during this waiting period, 'enable' will fail. Make sure + log.info(f"Attempting to delete {extension_case.extension}, should fail due to policy.") delete_start_time = self._ssh_client.run_command("date '+%Y-%m-%d %T'").rstrip() try: - # TODO: consider checking the agent's log asynchronously to confirm that the uninstall failed instead of - # waiting for the full CRP timeout. - extension_case.extension.delete() - fail(f"CRP should have reported a timeout error when attempting to delete {extension_case.extension} " + timeout = (3 * 60) # Allow agent some time to process goal state, but do not wait for full timeout. + extension_case.extension.delete(timeout=timeout) + fail(f"CRP should not have successfully completed the delete operation for {extension_case.extension} " f"because the extension is disallowed by policy and agent should have reported a policy failure.") except TimeoutError: - log.info("Reported a timeout error when attempting to delete extension, as expected. Checking instance view " - "and agent log to confirm that delete operation failed.") + log.info("Delete operation did not complete, as expected. 
Checking instance view " + "and agent log to confirm that delete operation failed due to policy.") # Confirm that extension is still present in instance view instance_view_extensions = self._context.vm.get_instance_view().extensions if instance_view_extensions is not None and not any( @@ -131,20 +137,16 @@ def _operation_should_fail(self, operation, extension_case): fail(f"Delete operation is disallowed by policy and should have failed, but extension " f"{extension_case.extension} is no longer present in the instance view.") - # Confirm that expected error message is in the agent log - expected_msg = "Extension will not be processed: failed to uninstall extension" + # Confirm that agent log contains error message that uninstall was blocked due to policy + # The script will check for a log message such as "Extension will not be processed: failed to uninstall + # extension 'Microsoft.Azure.Extensions.CustomScript' because it is not specified as an allowed extension" self._ssh_client.run_command( - f"agent_ext_workflow-check_data_in_agent_log.py --data '{expected_msg}' --after-timestamp '{delete_start_time}'", - use_sudo=True) + f"agent_ext_policy-verify_operation_disallowed.py --extension-name '{extension_case.extension._identifier}' " + f"--after-timestamp '{delete_start_time}' --operation 'uninstall'", use_sudo = True) def run(self): - # The full CRP timeout period for extension operation failure is 90 minutes. For efficiency, we reduce the - # timeout limit to 15 minutes here. We expect "delete" operations on disallowed VMs to reach timeout instead of - # failing fast, because delete is a best effort operation by-design and should not fail. log.info("*** Begin test setup") - log.info("Set CRP timeout to 15 minutes") - self._context.vm.update({"extensionsTimeBudget": "PT15M"}) # Prepare no-config, single-config, and multi-config extension to test. 
Extensions with settings and extensions # without settings have different status reporting logic, so we should test all cases. @@ -215,8 +217,10 @@ def run(self): self._operation_should_succeed("enable", custom_script) self._operation_should_fail("enable", run_command) self._operation_should_fail("enable", run_command_2) - # Only call enable on AMA if supported. The agent will try to re-enable AMA as a part of the next goal state, when - # policy is changed to allow all. This will cause errors on an unsupported distro. + # We only enable AMA on supported distros. + # If we were to enable AMA on an unsupported distro, the operation would initially be blocked by policy as + # expected. However, after changing the policy to allow all with the next goal state, the agent would attempt to + # re-enable AMA on an unsupported distro, causing errors. if VmExtensionIds.AzureMonitorLinuxAgent.supports_distro((self._ssh_client.run_command("get_distro.py").rstrip())): self._operation_should_fail("enable", azure_monitor) @@ -302,21 +306,17 @@ def run(self): self._operation_should_succeed("delete", custom_script) self._operation_should_succeed("enable", custom_script) - # Cleanup after test: delete leftover extensions and disable policy enforcement in conf file. + # Cleanup after test: disable policy enforcement in conf file. log.info("") log.info("*** Begin test cleanup") log.info("Disabling policy via conf file on the test VM [%s]", self._context.vm.name) self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) - # TODO: Consider deleting only extensions used by this test instead of all extensions. 
- self._context.vm.delete_all_extensions() log.info("*** Test cleanup complete.") - - def get_ignore_error_rules(self) -> List[Dict[str, Any]]: ignore_rules = [ - # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified in the allowlist. To enable, add extension to the allowed list in the policy file ('/etc/waagent_policy.json')., duration=0 + # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified as an allowed extension. To enable, add the extension to the list of allowed extensions in the policy file ('/etc/waagent_policy.json')., duration=0 # We intentionally block extensions with policy and expect this failure message { 'message': r"Extension will not be processed: failed to .* extension .* because it is not specified as an allowed extension" diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py index 368e85fe8..c4f798848 100644 --- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -152,8 +152,11 @@ def run(self): for ext in extensions: provisioned_after = ext['properties'].get('provisionAfterExtensions') depends_on = provisioned_after if provisioned_after else [] - dependency_list = "-" if not depends_on else ' and '.join(depends_on) - log.info("{0} depends on {1}".format(ext['name'], dependency_list)) + if depends_on: + dependency_list = ' and '.join(depends_on) + log.info("{0} depends on {1}".format(ext['name'], dependency_list)) + else: + 
log.info("{0} does not depend on any extension".format(ext['name'])) # Copy policy file to each VM instance log.info("Updating policy file with new policy: {0}".format(policy)) @@ -189,13 +192,21 @@ def run(self): for phrase in expected_errors: if phrase not in error_message: fail("Extension template deployment failed as expected, but with an unexpected error. Error expected to contain message '{0}'. Actual error: {1}".format(phrase, e)) - log.info("Extensions failed as expected. Expected errors: '{0}'. Actual errors: '{1}'.".format(expected_errors, e)) + + log.info("Extensions failed as expected.") + log.info("") + log.info("Expected errors:") + for expected_error in expected_errors: + log.info(" - {0}".format(expected_error)) + log.info("") log.info("") + log.info("Actual errors:") + log.info(str(e)) # Clean up failed extensions to leave VMSS in a good state for the next test. CRP will attempt to uninstall # leftover extensions in the next test, but uninstall will be disallowed and reach timeout unexpectedly. # CRP also won't allow deletion of an extension that is dependent on another failed extension, so we first - # update policy to allow all, re-enable all extensions, and then delete them. + # update policy to allow all, re-enable all extensions, and then delete them in dependency order. log.info("Starting cleanup for test case...") allow_all_policy = \ { @@ -219,13 +230,15 @@ def run(self): rg_client.deploy_template(template=ext_template) except Exception as err: # Known issue - CRP returns a stale status for no-config extensions, because it does not wait for a new - # sequence number. AzureMonitorLinuxAgent is the only no-config extension we test. For this extension - # only, swallow the CRP error and check agent log instead to confirm that extensions were enabled - # successfully. - if VmExtensionIds.AzureMonitorLinuxAgent in deletion_order: + # sequence number. 
Only for cases testing no-config extension dependencies, swallow the CRP error and + # check agent log instead to confirm that extensions were enabled successfully. + test_cases_to_work_around = [ + _should_fail_single_config_depends_on_disallowed_no_config + ] + if case in test_cases_to_work_around: log.info("CRP returned error when re-enabling extensions after allowing. Checking agent log to see if enable succeeded. " "Error: {0}".format(err)) - time.sleep(60) # Give extensions some time to finish processing. + time.sleep(2 * 60) # Give extensions some time to finish processing. extension_list = ' '.join([str(e) for e in deletion_order]) command = (f"agent_ext_policy-verify_operation_success.py --after-timestamp '{enable_start_time}' " f"--operation 'enable' --extension-list {extension_list}") diff --git a/tests_e2e/tests/extensions_disabled/extensions_disabled.py b/tests_e2e/tests/extensions_disabled/extensions_disabled.py index 1111f7e6a..5a166e695 100755 --- a/tests_e2e/tests/extensions_disabled/extensions_disabled.py +++ b/tests_e2e/tests/extensions_disabled/extensions_disabled.py @@ -27,7 +27,7 @@ import uuid from assertpy import assert_that, fail -from typing import Any +from typing import List, Dict, Any from azure.mgmt.compute.models import VirtualMachineInstanceView @@ -137,6 +137,15 @@ def run(self): fail(f"Unexpected error while processing {t.extension.__str__()} after re-enabling extension " f"processing: {error}") + def get_ignore_error_rules(self) -> List[Dict[str, Any]]: + ignore_rules = [ + # 2025-01-15T20:45:13.359784Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Extensions.CustomScript, op=ExtensionProcessing, message=Extension 'Microsoft.Azure.Extensions.CustomScript' will not be processed since extension processing is disabled. 
To enable extension processing, set Extensions.Enabled=y in '/etc/waagent.conf', duration=0 + # We expect this error message when we disable extension processing via conf + { + 'message': r"Extension .* will not be processed since extension processing is disabled" + } + ] + return ignore_rules if __name__ == "__main__": ExtensionsDisabled.run_from_command_line() diff --git a/tests_e2e/tests/scripts/agent_ext_policy-verify_operation_disallowed.py b/tests_e2e/tests/scripts/agent_ext_policy-verify_operation_disallowed.py new file mode 100755 index 000000000..32e84e8b4 --- /dev/null +++ b/tests_e2e/tests/scripts/agent_ext_policy-verify_operation_disallowed.py @@ -0,0 +1,64 @@ +#!/usr/bin/env pypy3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Checks that the agent log shows that the given operation on the given extension was disallowed by policy +# +import argparse +import sys +import re + +from datetime import datetime +from pathlib import Path +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.agent_log import AgentLog + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--extension-name', dest='extension_name', required=True) + parser.add_argument('--operation', dest='operation', required=True, choices=['run', 'uninstall']) + parser.add_argument("--after-timestamp", dest='after_timestamp', required=False) + + args, _ = parser.parse_known_args() + + log.info("Verifying that agent log shows {0} failure due to policy".format(args.operation)) + pattern = (r".*Extension will not be processed: failed to {0} extension '{1}' because it is not specified as an allowed extension.*" + .format(args.operation, re.escape(args.extension_name))) + agent_log = AgentLog(Path('/var/log/waagent.log')) + + if args.after_timestamp is None: + after_datetime = datetime.min + else: + after_datetime = datetime.strptime(args.after_timestamp, '%Y-%m-%d %H:%M:%S') + + try: + for record in agent_log.read(): + if record.timestamp > after_datetime: + if re.search(pattern, record.message): + log.info("Found expected error in agent log: {0}".format(record.message)) + sys.exit(0) + + except Exception as e: + log.info("Error thrown when searching for test data in agent log: {0}".format(str(e))) + + log.info("Did not find expected error in agent log.
Expected to find pattern: {0}".format(pattern)) + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file From 5abbfc6445fdc02da0f77843ca834166dee69e35 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Fri, 17 Jan 2025 15:13:45 -0500 Subject: [PATCH 30/32] Small formatting fix --- tests_e2e/tests/ext_policy/ext_policy.py | 3 ++- tests_e2e/tests/extensions_disabled/extensions_disabled.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index b2ef7bc8d..97b78df62 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -119,7 +119,8 @@ def _operation_should_fail(self, operation, extension_case): # For efficiency, we asynchronously check the instance view and agent log to confirm that deletion failed, # and do not wait for a response from CRP. # - # Note: CRP will only allow another 'delete' call during this waiting period, 'enable' will fail. Make sure + # Note: CRP will not allow an 'enable' request until deletion succeeds or times out. The next call must be + # a delete operation allowed by policy. log.info(f"Attempting to delete {extension_case.extension}, should fail due to policy.") delete_start_time = self._ssh_client.run_command("date '+%Y-%m-%d %T'").rstrip() try: diff --git a/tests_e2e/tests/extensions_disabled/extensions_disabled.py b/tests_e2e/tests/extensions_disabled/extensions_disabled.py index 5a166e695..a365c6dd6 100755 --- a/tests_e2e/tests/extensions_disabled/extensions_disabled.py +++ b/tests_e2e/tests/extensions_disabled/extensions_disabled.py @@ -142,7 +142,7 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]: # 2025-01-15T20:45:13.359784Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Extensions.CustomScript, op=ExtensionProcessing, message=Extension 'Microsoft.Azure.Extensions.CustomScript' will not be processed since extension processing is disabled. 
To enable extension processing, set Extensions.Enabled=y in '/etc/waagent.conf', duration=0 # We expect this error message when we disable extension processing via conf { - 'message': r"Extension .* will not be processed since extension processing is disabled" + 'message': r"Extension .* will not be processed since extension processing is disabled" } ] return ignore_rules From 9fa38ea575653fd2a3efd621fb2bfcb8d0fb51de Mon Sep 17 00:00:00 2001 From: mgunnala Date: Tue, 21 Jan 2025 15:14:24 -0500 Subject: [PATCH 31/32] Fix comments --- azurelinuxagent/ga/exthandlers.py | 46 +++++++++++++-------- tests_e2e/orchestrator/scripts/collect-logs | 2 + 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 7e9b74633..618cd78fe 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -95,13 +95,13 @@ _EXT_DISALLOWED_ERROR_MAP = \ { ExtensionRequestedState.Enabled: ('run', ExtensionErrorCodes.PluginEnableProcessingFailed), - # Note: currently, when uninstall is requested for an extension, CRP polls until the agent does not - # report status for that extension, or until timeout is reached. In the case of a policy error, the - # agent reports failed status on behalf of the extension, which will cause CRP to poll for the full - # timeout, instead of failing fast. - # # TODO: CRP does not currently have a terminal error code for uninstall. Once this code is added, use - # it instead of PluginDisableProcessingFailed below. + # it instead of PluginDisableProcessingFailed below. + # + # Note: currently, when uninstall is requested for an extension, CRP polls until the agent does not + # report status for that extension, or until timeout is reached. In the case of a policy error, the + # agent reports failed status on behalf of the extension, which will cause CRP to poll for the full + # timeout, instead of failing fast. 
ExtensionRequestedState.Uninstall: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed), # "Disable" is an internal operation, users are unaware of it. We surface the term "uninstall" instead. ExtensionRequestedState.Disabled: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed), @@ -745,15 +745,28 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess message=message) def __handle_ext_disallowed_error(self, ext_handler_i, error_code, report_op, message, extension): - # Handle and report error for disallowed extensions (extensions blocked by policy or disabled via config). - # Note: CRP may pick up stale statuses when not polling for sequence number. If extension status exists, CRP - # prioritizes it over handler status and polls for seq no. To work around the issue, we report extension status - # for extensions with settings: - # - For extensions without settings or uninstall errors: report at the handler level. - # - For extensions with settings (install/enable errors): report at both handler and extension levels. # - # TODO: __handle_and_report_ext_handler_errors() does not create a status file for single-config extensions, this - # function is a temporary workaround. Consider merging the two functions function after assessing the impact. + # Handle and report errors for disallowed extensions (e.g. extensions blocked by policy or disabled via config). + # + # TODO: __handle_and_report_ext_handler_errors() is also used to report extension errors, but it does not create + # a status file for single-config extensions (see below as to why this is important). This function, + # __handle_ext_disallowed_error, implements what we believe is the correct behavior, but at this point we + # use it only for disallowed extensions scenarios. In a future release, consider merging the two functions + # after assessing any impact. 
+ # + # Note: When CRP polls for extension status, it first looks at handler status and then looks for any extension + # status. If extension status is present, CRP uses it instead of the handler status, ensuring that the + # sequence number for the extension settings matches the sequence number in the reported status. CRP polls + # asynchronously to the Agent and, on a new goal state, it can check the status blob before the Agent has + # reported status for that goal state, effectively checking the status of the previous goal state. This is + # not an issue when the extension reports status at the extension level, since CRP will wait for the status + # for the correct sequence number. However, when the extension reports status *only* at the handler level + # (e.g. if the extension has no settings, during install errors, if extension is disallowed, etc.) CRP can + # end up picking up a stale status. There is not a good solution for extensions with no settings, and CRP + # can report an error from a previous goal state. For install errors of extensions with settings, though, + # we work around this issue by reporting the error *both* at the handler level and at the extension level + # (although reporting at the handler level *should* be sufficient). By reporting at the extension level, + # CRP will enforce a match on the sequence number for the settings, and skip stale status blobs. # Keep a list of disallowed extensions so that report_ext_handler_status() can report status for them. self.__disallowed_ext_handlers.append(ext_handler_i.ext_handler) @@ -764,10 +777,9 @@ def __handle_ext_disallowed_error(self, ext_handler_i, error_code, report_op, me # as a workaround for the stale status issue. ext_handler_i.set_handler_status(status=ExtHandlerStatusValue.not_ready, message=message, code=error_code) - # For extensions with settings (install/enable errors), also update extension-level status. - # Overwrite any existing status file to reflect policy failures accurately.
if extension is not None and ext_handler_i.ext_handler.state == ExtensionRequestedState.Enabled: - # TODO: if extension is reporting heartbeat, it overwrites status. Consider overwriting heartbeat here + # TODO: if extension is reporting heartbeat, it overwrites status. Consider overwriting heartbeat here. + # Overwrite any existing status file to reflect the failure accurately. ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error_code, operation=ext_handler_i.operation, message=message, overwrite=True) diff --git a/tests_e2e/orchestrator/scripts/collect-logs b/tests_e2e/orchestrator/scripts/collect-logs index 85ef29ab4..94cb44edd 100755 --- a/tests_e2e/orchestrator/scripts/collect-logs +++ b/tests_e2e/orchestrator/scripts/collect-logs @@ -13,6 +13,8 @@ echo "Collecting logs to $logs_file_name ..." PYTHON=$(get-agent-python) waagent_conf=$($PYTHON -c 'from azurelinuxagent.common.osutil import get_osutil; print(get_osutil().agent_conf_file_path)') +# TODO: instead of collecting /etc/waagent_policy.json here, consider adding it to goal state history, since it can +# change per goal state. tar --exclude='journal/*' --exclude='omsbundle' --exclude='omsagent' --exclude='mdsd' --exclude='scx*' \ --exclude='*.so' --exclude='*__LinuxDiagnostic__*' --exclude='*.zip' --exclude='*.deb' --exclude='*.rpm' \ --warning=no-file-changed \ From 8cd64d4d35f4370bc46e3468ac1e7c07527e00b3 Mon Sep 17 00:00:00 2001 From: mgunnala Date: Wed, 22 Jan 2025 10:35:14 -0500 Subject: [PATCH 32/32] Address review comments --- azurelinuxagent/ga/exthandlers.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 618cd78fe..b1789605f 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -99,8 +99,8 @@ # it instead of PluginDisableProcessingFailed below. 
# # Note: currently, when uninstall is requested for an extension, CRP polls until the agent does not - # report status for that extension, or until timeout is reached. In the case of a policy error, the - # agent reports failed status on behalf of the extension, which will cause CRP to poll for the full + # report status for that extension, or until timeout is reached. In the case of an extension disallowed + # error, agent reports failed status on behalf of the extension, which will cause CRP to poll for the full # timeout, instead of failing fast. ExtensionRequestedState.Uninstall: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed), # "Disable" is an internal operation, users are unaware of it. We surface the term "uninstall" instead. @@ -298,7 +298,7 @@ def __init__(self, protocol): self.protocol = protocol self.ext_handlers = None # Maintain a list of extension handler objects that are disallowed (e.g. blocked by policy, extensions disabled, etc.). - # Extension status is always reported for the extensions in this list. List is reset for each goal state. + # Extension status, if it exists, is always reported for the extensions in this list. List is reset for each goal state. self.__disallowed_ext_handlers = [] # The GoalState Aggregate status needs to report the last status of the GoalState. Since we only process # extensions on goal state change, we need to maintain its state. @@ -528,7 +528,7 @@ def handle_ext_handlers(self, goal_state_id): # here with an error message. if not extensions_enabled: ext_full_name = handler_i.get_extension_full_name(extension) - agent_conf_file_path = get_osutil().agent_conf_file_path + agent_conf_file_path = get_osutil().get_agent_conf_file_path() msg = "Extension '{0}' will not be processed since extension processing is disabled. 
To enable extension " \ "processing, set Extensions.Enabled=y in '{1}'".format(ext_full_name, agent_conf_file_path) self.__handle_ext_disallowed_error(handler_i, error_code, report_op=WALAEventOperation.ExtensionProcessing, @@ -771,12 +771,10 @@ def __handle_ext_disallowed_error(self, ext_handler_i, error_code, report_op, me # Keep a list of disallowed extensions so that report_ext_handler_status() can report status for them. self.__disallowed_ext_handlers.append(ext_handler_i.ext_handler) - # Set handler status for all extensions (with and without settings). - # Install errors should always be reported at the handler level. While install errors for any extension should - # ideally be reported ONLY at the handler level, we also report at the ext status level for extensions with settings - # as a workaround for the stale status issue. ext_handler_i.set_handler_status(status=ExtHandlerStatusValue.not_ready, message=message, code=error_code) + # Only report extension status for install errors of extensions with settings. Disable/uninstall errors are + # reported at the handler status level only. if extension is not None and ext_handler_i.ext_handler.state == ExtensionRequestedState.Enabled: # TODO: if extension is reporting heartbeat, it overwrites status. Consider overwriting heartbeat here. # Overwrite any existing status file to reflect the failure accurately.