Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Block extensions disallowed by policy #3259

Merged
merged 43 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
c2cc2c6
Block disallowed extension processing
mgunnala Nov 8, 2024
151081d
Enable policy e2e tests
mgunnala Nov 8, 2024
edec2af
Pylint
mgunnala Nov 8, 2024
a37508f
Fix e2e test failures
mgunnala Nov 11, 2024
b0da554
Address review comments
mgunnala Nov 18, 2024
a4f5cab
Merge branch 'develop' into allowlist_2
mgunnala Nov 18, 2024
699b9ba
Address review comments
mgunnala Nov 20, 2024
86de0c5
Address test review comments
mgunnala Nov 21, 2024
c3e9b89
Remove status file for single-config
mgunnala Nov 22, 2024
65d7034
Add back status file for single-config
mgunnala Nov 22, 2024
95f247a
Run e2e tests on all endorsed
mgunnala Nov 22, 2024
3b18519
Fix UT failures
mgunnala Nov 23, 2024
63da127
Pylint
mgunnala Nov 26, 2024
471cd59
Merge branch 'develop' into allowlist_2
narrieta Nov 26, 2024
8ea989b
Address review comments for agent code
mgunnala Dec 3, 2024
83f6ff0
Tests
mgunnala Dec 3, 2024
b037e41
Revert "Tests"
mgunnala Dec 3, 2024
ba3869c
Address test comments
mgunnala Dec 6, 2024
dfcc158
Address test comments
mgunnala Dec 9, 2024
fe07ffa
Merge branch 'develop' into allowlist_2
mgunnala Dec 9, 2024
a31bdcf
Address test comments
mgunnala Dec 10, 2024
5198cf8
Cleanup existing extensions on test VMs
mgunnala Dec 12, 2024
4a0a4ef
Address comments and disable dependencies e2e tests
mgunnala Dec 16, 2024
daa8017
Merge branch 'develop' into allowlist_2
mgunnala Dec 16, 2024
bacc425
Add fixes for e2e tests
mgunnala Dec 17, 2024
3319916
Add back delete failure test case
mgunnala Dec 17, 2024
8c31798
Address comments round 3
mgunnala Dec 17, 2024
32ef5c1
Address comments
mgunnala Dec 17, 2024
f0895b7
Merge branch 'develop' into allowlist_2
mgunnala Dec 17, 2024
0c9f1c7
Pylint
mgunnala Dec 18, 2024
fc2de23
Merge branch 'develop' into allowlist_2
mgunnala Jan 13, 2025
c3aac0f
Report status for single-config ext
mgunnala Jan 13, 2025
be42640
Small e2e test cleanups
mgunnala Jan 13, 2025
7069a0b
Address agent code comments
mgunnala Jan 15, 2025
8231234
Merge branch 'develop' into allowlist_2
mgunnala Jan 15, 2025
ce5cf20
Address test comments
mgunnala Jan 17, 2025
e222aa7
Merge branch 'develop' into allowlist_2
mgunnala Jan 17, 2025
5abbfc6
Small formatting fix
mgunnala Jan 17, 2025
6abd3f8
Merge branch 'develop' into allowlist_2
narrieta Jan 18, 2025
9fa38ea
Fix comments
mgunnala Jan 21, 2025
7ebbd2b
Merge branch 'develop' into allowlist_2
mgunnala Jan 21, 2025
1e6a82d
Merge branch 'develop' into allowlist_2
mgunnala Jan 21, 2025
8cd64d4
Address review comments
mgunnala Jan 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 105 additions & 19 deletions azurelinuxagent/ga/exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from azurelinuxagent.common.agent_supported_feature import get_agent_supported_features_list_for_extensions, \
SupportedFeatureNames, get_supported_feature_by_name, get_agent_supported_features_list_for_crp
from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator
from azurelinuxagent.ga.policy.policy_engine import ExtensionPolicyEngine
from azurelinuxagent.common.datacontract import get_properties, set_properties
from azurelinuxagent.common.errorstate import ErrorState
from azurelinuxagent.common.event import add_event, elapsed_milliseconds, WALAEventOperation, \
Expand Down Expand Up @@ -86,6 +87,26 @@
# This is the default sequence number we use when there are no settings available for Handlers
_DEFAULT_SEQ_NO = "0"

# For policy-related errors, this mapping is used to generate user-friendly error messages and determine the appropriate
# terminal error code based on the blocked operation.
# Format: {<ExtensionRequestedState>: (<str>, <ExtensionErrorCodes>)}
# - The first element of the tuple is a user-friendly operation name included in error messages.
# - The second element of the tuple is the CRP terminal error code for the operation.
_POLICY_ERROR_MAP = \
{
ExtensionRequestedState.Enabled: ('run', ExtensionErrorCodes.PluginEnableProcessingFailed),
# Note: currently, when uninstall is requested for an extension, CRP polls until the agent does not
# report status for that extension, or until timeout is reached. In the case of a policy error, the
# agent reports failed status on behalf of the extension, which will cause CRP to poll for the full
# timeout, instead of failing fast.
#
# TODO: CRP does not currently have a terminal error code for uninstall. Once this code is added, use
# it instead of PluginDisableProcessingFailed below.
ExtensionRequestedState.Uninstall: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed),
# "Disable" is an internal operation, users are unaware of it. We surface the term "uninstall" instead.
ExtensionRequestedState.Disabled: ('uninstall', ExtensionErrorCodes.PluginDisableProcessingFailed),
}


class ExtHandlerStatusValue(object):
"""
Expand Down Expand Up @@ -276,6 +297,8 @@ class ExtHandlersHandler(object):
def __init__(self, protocol):
self.protocol = protocol
self.ext_handlers = None
# Maintain a list of extensions that are disallowed, and always report extension status for disallowed extensions.
mgunnala marked this conversation as resolved.
Show resolved Hide resolved
self.disallowed_ext_handlers = []
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be private

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated

# The GoalState Aggregate status needs to report the last status of the GoalState. Since we only process
# extensions on goal state change, we need to maintain its state.
# Setting the status to None here. This would be overridden as soon as the first GoalState is processed
Expand Down Expand Up @@ -482,6 +505,14 @@ def handle_ext_handlers(self, goal_state_id):
depends_on_err_msg = None
extensions_enabled = conf.get_extensions_enabled()

# Instantiate policy engine, and use same engine to handle all extension handlers. If an error is thrown during
# policy engine initialization, we block all extensions and report the error via handler status for each extension.
policy_error = None
try:
policy_engine = ExtensionPolicyEngine()
except Exception as ex:
policy_error = ex

narrieta marked this conversation as resolved.
Show resolved Hide resolved
for extension, ext_handler in all_extensions:

handler_i = ExtHandlerInstance(ext_handler, self.protocol, extension=extension)
Expand All @@ -490,6 +521,7 @@ def handle_ext_handlers(self, goal_state_id):
# back for the skipped extensions. In order to propagate the status back to CRP, we will report status back
# here with an error message.
if not extensions_enabled:
self.disallowed_ext_handlers.append(ext_handler)
agent_conf_file_path = get_osutil().agent_conf_file_path
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: should probably use get_agent_conf_file_path() instead

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated

msg = "Extension will not be processed since extension processing is disabled. To enable extension " \
"processing, set Extensions.Enabled=y in '{0}'".format(agent_conf_file_path)
Expand All @@ -498,11 +530,22 @@ def handle_ext_handlers(self, goal_state_id):
logger.info("{0}: {1}".format(ext_full_name, msg))
add_event(op=WALAEventOperation.ExtensionProcessing, message="{0}: {1}".format(ext_full_name, msg))
handler_i.set_handler_status(status=ExtHandlerStatusValue.not_ready, message=msg, code=-1)
Copy link
Author

@mgunnala mgunnala Jan 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we want to change this to use __report_policy_error() or __handle_and_report_ext_handler_error() instead?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we want to handle extensions_disabled scenario in this PR, I think we should remove the logic to create handler status and extension status here and call __report_policy_error instead.

Although, if we do that, I think we should rename __report_policy_error to something like '__report_ext_disallowed_error' and update the comments in that function to indicate it can be called for either extension disallowed by policy OR extension processing disabled via config.

We should also rename the policy_error_map to be generic to extensions disabled too

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated this to call __handle_ext_disallowed_error( ), and set a terminal error code from _EXT_DISALLOWED_ERROR_MAP

handler_i.create_status_file_if_not_exist(extension,
status=ExtensionStatusValue.error,
code=-1,
operation=handler_i.operation,
message=msg)
handler_i.create_status_file(extension,
narrieta marked this conversation as resolved.
Show resolved Hide resolved
status=ExtensionStatusValue.error,
code=-1,
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this code be set to something terminal?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we're handling extensions_disabled in this PR, then we should use the policy_error_map to determine error code, but rename the map to be generic to any disallowed ext

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually, extensions disabled and extensions not allowed by policy should use the same method to report status, so report_policy_error will need a little refactoring to allow for this

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated

operation=handler_i.operation,
message=msg,
overwrite=False)
continue

# If an error was thrown during policy engine initialization, skip further processing of the extension.
# CRP is still waiting for status, so we report error status here.
operation, error_code = _POLICY_ERROR_MAP.get(ext_handler.state)
if policy_error is not None:
msg = "Extension will not be processed: {0}".format(ustr(policy_error))
self.__report_policy_error(ext_handler_i=handler_i, error_code=error_code,
report_op=WALAEventOperation.ExtensionPolicy, message=msg,
extension=extension)
continue

# In case of depends-on errors, we skip processing extensions if there was an error processing dependent extensions.
Expand All @@ -516,18 +559,31 @@ def handle_ext_handlers(self, goal_state_id):
if handler_i.get_handler_status() is None:
handler_i.set_handler_status(message=depends_on_err_msg, code=-1)

handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=-1,
operation=WALAEventOperation.ExtensionProcessing,
message=depends_on_err_msg)
handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=-1,
operation=WALAEventOperation.ExtensionProcessing,
message=depends_on_err_msg, overwrite=False)

# For SC extensions, overwrite the HandlerStatus with the relevant message
else:
handler_i.set_handler_status(message=depends_on_err_msg, code=-1)

continue

# Process extensions and get if it was successfully executed or not
extension_success = self.handle_ext_handler(handler_i, extension, goal_state_id)
# Invoke policy engine to determine if extension is allowed.
# - if allowed: process the extension and determine whether it was successfully executed
# - if disallowed: do not process the handler and report an error on behalf of the extension, dependent
# extensions will also be blocked.
extension_allowed = policy_engine.should_allow_extension(ext_handler.name)
if not extension_allowed:
msg = (
"Extension will not be processed: failed to {0} extension '{1}' because it is not specified "
"as an allowed extension. To {0}, add the extension to the list of allowed extensions in the policy file ('{2}')."
).format(operation, ext_handler.name, conf.get_policy_file_path())
self.__report_policy_error(handler_i, error_code, report_op=WALAEventOperation.ExtensionPolicy,
message=msg, extension=extension)
extension_success = False
else:
extension_success = self.handle_ext_handler(handler_i, extension, goal_state_id)

dep_level = self.__get_dependency_level((extension, ext_handler))
if 0 <= dep_level < max_dep_level:
Expand Down Expand Up @@ -642,8 +698,8 @@ def handle_ext_handler(self, ext_handler_i, extension, goal_state_id):
# This error is only thrown for enable operation on MultiConfig extension.
# Since these are maintained by the extensions, the expectation here is that they would update their status files appropriately with their errors.
# The extensions should already have a placeholder status file, but in case they don't, we set one here to fail fast.
ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code,
operation=ext_handler_i.operation, message=err_msg)
ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error.code,
operation=ext_handler_i.operation, message=err_msg, overwrite=False)
add_event(name=ext_name, version=ext_handler_i.ext_handler.version, op=ext_handler_i.operation,
is_success=False, log_event=True, message=err_msg)
except ExtensionsGoalStateError as error:
Expand Down Expand Up @@ -683,15 +739,42 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess
# file with failure since the extensions won't be called where they can create their status files.
# This way we guarantee reporting back to CRP
if ext_handler_i.should_perform_multi_config_op(extension):
ext_handler_i.create_status_file_if_not_exist(extension, status=ExtensionStatusValue.error, code=error.code,
operation=report_op, message=message)
ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error.code,
operation=report_op, message=message, overwrite=False)

if report:
name = ext_handler_i.get_extension_full_name(extension)
handler_version = ext_handler_i.ext_handler.version
add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True,
message=message)

def __report_policy_error(self, ext_handler_i, error_code, report_op, message, extension):
# TODO: __handle_and_report_ext_handler_errors() does not create a status file for single-config extensions, this
# function was created as a temporary workaround. Consider merging the two functions after assessing the impact.

# If extension status exists, CRP ignores handler status and reports extension status. In the case of policy errors,
# we write a .status file to force CRP to fail fast - the agent will otherwise report a transitioning status.
# - For extensions without settings or uninstall errors: report at the handler level.
# - For extensions with settings (install/enable errors): report at both handler and extension levels.

# Keep a list of disallowed extensions so that report_ext_handler_status() can report status for them.
self.disallowed_ext_handlers.append(ext_handler_i.ext_handler)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When does an extension get removed from disallowed_ext_handlers? The ExtHandlersHandler is instantiated only once on agent init

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

appending to the list in a method named "report_policy_error" does not feel right (reporting should not change the state of the object). rename to handle_policy_error?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can change this to "handle_ext_disallowed_error"

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed


# Set handler status for all extensions (with and without settings).
ext_handler_i.set_handler_status(message=message, code=error_code)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add comment pointing out that we are intentionally reporting the error at the handler and status level

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean something like "We report the same error at both the handler status and extension status level." ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, what I was trying to point out is that reporting the error at both the handler and status level is not needed (or should not be needed). For example, install errors are reported at the handler level, while single-config errors are reported at the status level.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this comment still needs addressing. we need to explain why

# Set handler status for all extensions (with and without settings).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added some comments, but might be missing this piece of the explanation - is this what you were thinking of?

# Set handler status for all extensions (with and without settings).
# Install errors should always be reported at the handler level. While install errors for any extension should
# ideally be reported ONLY at the handler level, we also report at the ext status level for extensions with settings
# as a workaround for the stale status issue.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the above comment


# For extensions with settings (install/enable errors), also update extension-level status.
# Overwrite any existing status file to reflect policy failures accurately.
if extension is not None and ext_handler_i.ext_handler.state == ExtensionRequestedState.Enabled:
maddieford marked this conversation as resolved.
Show resolved Hide resolved
# TODO: if extension is reporting heartbeat, it overwrites status. Consider overwriting heartbeat here
ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error_code,
operation=report_op, message=message, overwrite=True)
narrieta marked this conversation as resolved.
Show resolved Hide resolved

name = ext_handler_i.get_extension_full_name(extension)
handler_version = ext_handler_i.ext_handler.version
add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True,
message=message)

def handle_enable(self, ext_handler_i, extension):
"""
1- Ensure the handler is installed
Expand Down Expand Up @@ -985,12 +1068,13 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed):

handler_state = ext_handler_i.get_handler_state()
ext_handler_statuses = []
ext_disallowed = ext_handler in self.disallowed_ext_handlers
# For MultiConfig, we need to report status per extension even for Handler level failures.
# If we have HandlerStatus for a MultiConfig handler and GS is requesting for it, we would report status per
# extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc)
# We also need to report extension status for an uninstalled handler if extensions are disabled because CRP
# waits for extension runtime status before failing the extension operation.
if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled():
# We also need to report extension status for an uninstalled handler if the extension is disallowed (due to
# policy failure, extensions disabled, etc.) because CRP waits for extension runtime status before failing the operation.
if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or ext_disallowed:
maddieford marked this conversation as resolved.
Show resolved Hide resolved

# Since we require reading the Manifest for reading the heartbeat, this would fail if HandlerManifest not found.
# Only try to read heartbeat if HandlerState != NotInstalled.
Expand Down Expand Up @@ -1342,9 +1426,11 @@ def set_extension_resource_limits(self):
extension_name=extension_name, cpu_quota=resource_limits.get_extension_slice_cpu_quota())
CGroupConfigurator.get_instance().set_extension_services_cpu_memory_quota(resource_limits.get_service_list())

def create_status_file_if_not_exist(self, extension, status, code, operation, message):
def create_status_file(self, extension, status, code, operation, message, overwrite):
# Create status file for specified extension. If overwrite is true, overwrite any existing status file. If
# false, create a status file only if it does not already exist.
_, status_path = self.get_status_file_path(extension)
if status_path is not None and not os.path.exists(status_path):
if status_path is not None and (overwrite or not os.path.exists(status_path)):
now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
status_contents = [
{
Expand Down
12 changes: 4 additions & 8 deletions azurelinuxagent/ga/policy/policy_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,6 @@
_MAX_SUPPORTED_POLICY_VERSION = "0.1.0"


class PolicyError(AgentError):
"""
Error raised during agent policy enforcement.
"""


class InvalidPolicyError(AgentError):
"""
Error raised if user-provided policy is invalid.
Expand All @@ -50,7 +44,6 @@ def __init__(self, msg, inner=None):
msg = "Customer-provided policy file ('{0}') is invalid, please correct the following error: {1}".format(conf.get_policy_file_path(), msg)
super(InvalidPolicyError, self).__init__(msg, inner)


narrieta marked this conversation as resolved.
Show resolved Hide resolved
class _PolicyEngine(object):
"""
Implements base policy engine API.
Expand All @@ -61,6 +54,7 @@ def __init__(self):
if not self.policy_enforcement_enabled:
return

_PolicyEngine._log_policy_event("Policy enforcement is enabled.")
self._policy = self._parse_policy(self.__read_policy())

@staticmethod
Expand Down Expand Up @@ -98,8 +92,10 @@ def __read_policy():
with open(conf.get_policy_file_path(), 'r') as f:
try:
contents = f.read()
# TODO: Consider copying the policy file contents to the history folder, and only log the policy locally
# in the case of policy-related failure.
_PolicyEngine._log_policy_event(
"Policy enforcement is enabled. Enforcing policy using policy file found at '{0}'. File contents:\n{1}"
"Enforcing policy using policy file found at '{0}'. File contents:\n{1}"
.format(conf.get_policy_file_path(), contents))
# json.loads will raise error if file contents are not a valid json (including empty file).
custom_policy = json.loads(contents)
Expand Down
2 changes: 1 addition & 1 deletion tests/data/test_waagent.conf
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,4 @@ OS.SshDir=/notareal/path
# - The default is false to protect the state of existing VMs
OS.EnableFirewall=n

Debug.EnableExtensionPolicy=y
Debug.EnableExtensionPolicy=n
maddieford marked this conversation as resolved.
Show resolved Hide resolved
Loading
Loading