Azure · narrieta · Jan 22, 2025 · Nov 8, 2024 · Nov 8, 2024 · Nov 8, 2024
@@ -39,6 +39,7 @@
 from azurelinuxagent.common.agent_supported_feature import get_agent_supported_features_list_for_extensions, \
     SupportedFeatureNames, get_supported_feature_by_name, get_agent_supported_features_list_for_crp
 from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator
+from azurelinuxagent.ga.policy.policy_engine import ExtensionPolicyEngine
 from azurelinuxagent.common.datacontract import get_properties, set_properties
 from azurelinuxagent.common.errorstate import ErrorState
 from azurelinuxagent.common.event import add_event, elapsed_milliseconds, WALAEventOperation, \
@@ -86,6 +87,26 @@
 # This is the default sequence number we use when there are no settings available for Handlers
 _DEFAULT_SEQ_NO = "0"
 
+# For policy-related errors, this mapping is used to generate user-friendly error messages and determine the appropriate
+# terminal error code based on the blocked operation.
+# Format: {<ExtensionRequestedState>: (<str>, <ExtensionErrorCodes>)}
+# - The first element of the tuple is a user-friendly operation name included in error messages.
+# - The second element of the tuple is the CRP terminal error code for the operation.
+_POLICY_ERROR_MAP = \
+    {
+        ExtensionRequestedState.Enabled: ('run', ExtensionErrorCodes.PluginEnableProcessingFailed),
+        # Note: currently, when uninstall is requested for an extension, CRP polls until the agent does not
+        # report status for that extension, or until timeout is reached. In the case of a policy error, the
+        # agent reports failed status on behalf of the extension, which will cause CRP to poll for the full
+        # timeout, instead of failing fast.
+        #
+        # TODO: CRP does not currently have a terminal error code for uninstall. Once this code is added, use
+        # it instead of PluginDisableProcessingFailed below.
+        ExtensionRequestedState.Uninstall: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed),
+        # "Disable" is an internal operation, users are unaware of it. We surface the term "uninstall" instead.
+        ExtensionRequestedState.Disabled: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed),
+    }
+
 
 class ExtHandlerStatusValue(object):
     """
@@ -276,6 +297,8 @@ class ExtHandlersHandler(object):
     def __init__(self, protocol):
         self.protocol = protocol
         self.ext_handlers = None
+        # Maintain a list of extensions that are disallowed, and always report extension status for disallowed extensions.
+        self.disallowed_ext_handlers = []
         # The GoalState Aggregate status needs to report the last status of the GoalState. Since we only process
         # extensions on goal state change, we need to maintain its state.
         # Setting the status to None here. This would be overridden as soon as the first GoalState is processed
@@ -482,6 +505,14 @@ def handle_ext_handlers(self, goal_state_id):
         depends_on_err_msg = None
         extensions_enabled = conf.get_extensions_enabled()
 
+        # Instantiate policy engine, and use same engine to handle all extension handlers. If an error is thrown during
+        # policy engine initialization, we block all extensions and report the error via handler status for each extension.
+        policy_error = None
+        try:
+            policy_engine = ExtensionPolicyEngine()
+        except Exception as ex:
+            policy_error = ex
+
         for extension, ext_handler in all_extensions:
 
             handler_i = ExtHandlerInstance(ext_handler, self.protocol, extension=extension)
@@ -490,6 +521,7 @@ def handle_ext_handlers(self, goal_state_id):
             # back for the skipped extensions. In order to propagate the status back to CRP, we will report status back
             # here with an error message.
             if not extensions_enabled:
+                self.disallowed_ext_handlers.append(ext_handler)
                 agent_conf_file_path = get_osutil().agent_conf_file_path
                 msg = "Extension will not be processed since extension processing is disabled. To enable extension " \
                       "processing, set Extensions.Enabled=y in '{0}'".format(agent_conf_file_path)
@@ -498,11 +530,22 @@ def handle_ext_handlers(self, goal_state_id):
                 logger.info("{0}: {1}".format(ext_full_name, msg))
                 add_event(op=WALAEventOperation.ExtensionProcessing, message="{0}: {1}".format(ext_full_name, msg))
                 handler_i.set_handler_status(status=ExtHandlerStatusValue.not_ready, message=msg, code=-1)
-                handler_i.create_status_file_if_not_exist(extension,
-                                                          status=ExtensionStatusValue.error,
-                                                          code=-1,
-                                                          operation=handler_i.operation,
-                                                          message=msg)
+                handler_i.create_status_file(extension,
+                                             status=ExtensionStatusValue.error,
+                                             code=-1,
+                                             operation=handler_i.operation,
+                                             message=msg,
+                                             overwrite=False)
+                continue
+
+            # Look up the user-facing operation name and CRP error code for the requested state. If an error was thrown
+            # during policy engine initialization, skip the extension; CRP is still waiting for status, so report the error here.
+            operation, error_code = _POLICY_ERROR_MAP.get(ext_handler.state)
+            if policy_error is not None:
+                msg = "Extension will not be processed: {0}".format(ustr(policy_error))
+                self.__report_policy_error(ext_handler_i=handler_i, error_code=error_code,
+                                           report_op=WALAEventOperation.ExtensionPolicy, message=msg,
+                                           extension=extension)
                 continue
 
             # In case of depends-on errors, we skip processing extensions if there was an error processing dependent extensions.
@@ -516,18 +559,31 @@ def handle_ext_handlers(self, goal_state_id):
                     if handler_i.get_handler_status() is None:
                         handler_i.set_handler_status(message=depends_on_err_msg, code=-1)
 
-                    handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=-1,
-                                                              operation=WALAEventOperation.ExtensionProcessing,
-                                                              message=depends_on_err_msg)
+                    handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=-1,
+                                                 operation=WALAEventOperation.ExtensionProcessing,
+                                                 message=depends_on_err_msg, overwrite=False)
 
                 # For SC extensions, overwrite the HandlerStatus with the relevant message
                 else:
                     handler_i.set_handler_status(message=depends_on_err_msg, code=-1)
 
                 continue
 
-            # Process extensions and get if it was successfully executed or not
-            extension_success = self.handle_ext_handler(handler_i, extension, goal_state_id)
+            # Invoke policy engine to determine if extension is allowed.
+            # - if allowed: process the extension and record whether it executed successfully
+            # - if disallowed: do not process the handler and report an error on behalf of the extension, dependent
+            #                  extensions will also be blocked.
+            extension_allowed = policy_engine.should_allow_extension(ext_handler.name)
+            if not extension_allowed:
+                msg = (
+                    "Extension will not be processed: failed to {0} extension '{1}' because it is not specified "
+                    "as an allowed extension. To {0}, add the extension to the list of allowed extensions in the policy file ('{2}')."
+                ).format(operation, ext_handler.name, conf.get_policy_file_path())
+                self.__report_policy_error(handler_i, error_code, report_op=WALAEventOperation.ExtensionPolicy,
+                                           message=msg, extension=extension)
+                extension_success = False
+            else:
+                extension_success = self.handle_ext_handler(handler_i, extension, goal_state_id)
 
             dep_level = self.__get_dependency_level((extension, ext_handler))
             if 0 <= dep_level < max_dep_level:
@@ -642,8 +698,8 @@ def handle_ext_handler(self, ext_handler_i, extension, goal_state_id):
             # This error is only thrown for enable operation on MultiConfig extension.
             # Since these are maintained by the extensions, the expectation here is that they would update their status files appropriately with their errors.
             # The extensions should already have a placeholder status file, but incase they dont, setting one here to fail fast.
-            ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code,
-                                                          operation=ext_handler_i.operation, message=err_msg)
+            ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error.code,
+                                             operation=ext_handler_i.operation, message=err_msg, overwrite=False)
             add_event(name=ext_name, version=ext_handler_i.ext_handler.version, op=ext_handler_i.operation,
                       is_success=False, log_event=True, message=err_msg)
         except ExtensionsGoalStateError as error:
@@ -683,15 +739,42 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess
         # file with failure since the extensions wont be called where they can create their status files.
         # This way we guarantee reporting back to CRP
         if ext_handler_i.should_perform_multi_config_op(extension):
-            ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code,
-                                                          operation=report_op, message=message)
+            ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error.code,
+                                             operation=report_op, message=message, overwrite=False)
 
         if report:
             name = ext_handler_i.get_extension_full_name(extension)
             handler_version = ext_handler_i.ext_handler.version
             add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True,
                       message=message)
 
+    def __report_policy_error(self, ext_handler_i, error_code, report_op, message, extension):
+        # TODO: __handle_and_report_ext_handler_errors() does not create a status file for single-config extensions; this
+        # function was created as a temporary workaround. Consider merging the two functions after assessing the impact.
+
+        # If extension status exists, CRP ignores handler status and reports extension status. In the case of policy errors,
+        # we write a .status file to force CRP to fail fast - the agent will otherwise report a transitioning status.
+        # - For extensions without settings, or for uninstall/disable errors: report at the handler level only.
+        # - For extensions with settings (install/enable errors): report at both handler and extension levels.
+
+        # Keep a list of disallowed extensions so that report_ext_handler_status() can report status for them.
+        self.disallowed_ext_handlers.append(ext_handler_i.ext_handler)
+
+        # Set handler status for all extensions (with and without settings).
+        ext_handler_i.set_handler_status(message=message, code=error_code)
+
+        # For extensions with settings (install/enable errors), also update extension-level status.
+        # Overwrite any existing status file to reflect policy failures accurately.
+        if extension is not None and ext_handler_i.ext_handler.state == ExtensionRequestedState.Enabled:
+            # TODO: if extension is reporting heartbeat, it overwrites status. Consider overwriting heartbeat here
+            ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error_code,
+                                             operation=report_op, message=message, overwrite=True)
+
+        name = ext_handler_i.get_extension_full_name(extension)
+        handler_version = ext_handler_i.ext_handler.version
+        add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True,
+                  message=message)
+
     def handle_enable(self, ext_handler_i, extension):
         """
              1- Ensure the handler is installed
@@ -985,12 +1068,13 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed):
 
         handler_state = ext_handler_i.get_handler_state()
         ext_handler_statuses = []
+        ext_disallowed = ext_handler in self.disallowed_ext_handlers
         # For MultiConfig, we need to report status per extension even for Handler level failures.
         # If we have HandlerStatus for a MultiConfig handler and GS is requesting for it, we would report status per
         # extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc)
-        # We also need to report extension status for an uninstalled handler if extensions are disabled because CRP
-        # waits for extension runtime status before failing the extension operation.
-        if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled():
+        # We also need to report extension status for an uninstalled handler if the extension is disallowed (due to
+        # policy failure, extensions disabled, etc.) because CRP waits for extension runtime status before failing the operation.
+        if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or ext_disallowed:
 
             # Since we require reading the Manifest for reading the heartbeat, this would fail if HandlerManifest not found.
             # Only try to read heartbeat if HandlerState != NotInstalled.
@@ -1342,9 +1426,11 @@ def set_extension_resource_limits(self):
             extension_name=extension_name, cpu_quota=resource_limits.get_extension_slice_cpu_quota())
         CGroupConfigurator.get_instance().set_extension_services_cpu_memory_quota(resource_limits.get_service_list())
 
-    def create_status_file_if_not_exist(self, extension, status, code, operation, message):
+    def create_status_file(self, extension, status, code, operation, message, overwrite):
+        # Create status file for specified extension. If overwrite is true, overwrite any existing status file. If
+        # false, create a status file only if it does not already exist.
         _, status_path = self.get_status_file_path(extension)
-        if status_path is not None and not os.path.exists(status_path):
+        if status_path is not None and (overwrite or not os.path.exists(status_path)):
             now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
             status_contents = [
                 {

@@ -36,12 +36,6 @@
 _MAX_SUPPORTED_POLICY_VERSION = "0.1.0"
 
 
-class PolicyError(AgentError):
-    """
-    Error raised during agent policy enforcement.
-    """
-
-
 class InvalidPolicyError(AgentError):
     """
     Error raised if user-provided policy is invalid.
@@ -50,7 +44,6 @@ def __init__(self, msg, inner=None):
         msg = "Customer-provided policy file ('{0}') is invalid, please correct the following error: {1}".format(conf.get_policy_file_path(), msg)
         super(InvalidPolicyError, self).__init__(msg, inner)
 
-
 class _PolicyEngine(object):
     """
     Implements base policy engine API.
@@ -61,6 +54,7 @@ def __init__(self):
         if not self.policy_enforcement_enabled:
             return
 
+        _PolicyEngine._log_policy_event("Policy enforcement is enabled.")
         self._policy = self._parse_policy(self.__read_policy())
 
     @staticmethod
@@ -98,8 +92,10 @@ def __read_policy():
         with open(conf.get_policy_file_path(), 'r') as f:
             try:
                 contents = f.read()
+                # TODO: Consider copying the policy file contents to the history folder, and only log the policy locally
+                # in the case of policy-related failure.
                 _PolicyEngine._log_policy_event(
-                    "Policy enforcement is enabled. Enforcing policy using policy file found at '{0}'. File contents:\n{1}"
+                    "Enforcing policy using policy file found at '{0}'. File contents:\n{1}"
                     .format(conf.get_policy_file_path(), contents))
                 # json.loads will raise error if file contents are not a valid json (including empty file).
                 custom_policy = json.loads(contents)

@@ -140,4 +140,4 @@ OS.SshDir=/notareal/path
 # - The default is false to protect the state of existing VMs
 OS.EnableFirewall=n
 
-Debug.EnableExtensionPolicy=y
+Debug.EnableExtensionPolicy=n