From 9501707c8ac1d6710ee878e797783199c30acb09 Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Fri, 15 Mar 2024 16:49:41 -0700 Subject: [PATCH 01/12] Cgroups api refactor (#6) * Initial changes for log collector cgroups v2 support * Fix pylint issues * Fix pylint issues * Fix pylint issues * Check that both controllers are mounted in the chosen cgroups version for log collector * Fix regex * Update test_agent unit tests * Fix unit tests * Update format strings * Fix broken cgroupconfigurator unit tests * pyling * Fix cgroups api unit tests * Ignore unused args * Ignore unused args * Add cgroup configurator tests * v2 required check in parent cgroup * unit tests is_controller_enabled * Fix test failure and pylint: * pylint * Update agent checks * Fix controller enable logic and unit tests * Remove changes to collect logs * Fix pylint * Add e2e test for v2 --- azurelinuxagent/agent.py | 28 +- azurelinuxagent/ga/cgroupapi.py | 403 ++++++++++----- azurelinuxagent/ga/cgroupconfigurator.py | 173 +++---- azurelinuxagent/ga/cgroupstelemetry.py | 13 + .../sys_fs_cgroup_unified_cgroup.controllers | 7 - tests/data/cgroups/{ => v1}/proc_pid_cgroup | 0 tests/data/cgroups/{ => v1}/proc_self_cgroup | 0 ...s_fs_cgroup_unified_cgroup.subtree_control | 0 tests/data/cgroups/v1_and_v2/proc_pid_cgroup | 12 + tests/data/cgroups/v1_and_v2/proc_self_cgroup | 12 + .../sys_fs_cgroup_cgroup.subtree_control | 1 + tests/data/cgroups/v2/proc_pid_cgroup | 1 + tests/data/cgroups/v2/proc_self_cgroup | 1 + .../v2/sys_fs_cgroup_cgroup.subtree_control | 1 + ...sys_fs_cgroup_cgroup.subtree_control_empty | 0 tests/ga/test_cgroupapi.py | 462 +++++++++++++++--- tests/ga/test_cgroupconfigurator.py | 195 ++++++-- tests/ga/test_cgroupconfigurator_sudo.py | 4 +- tests/lib/cgroups_tools.py | 14 - tests/lib/mock_cgroup_environment.py | 153 ++++-- tests/test_agent.py | 97 +++- tests_e2e/test_suites/cgroups_v2_disabled.yml | 10 + .../cgroups_v2_disabled.py | 84 ++++ 
23 files changed, 1278 insertions(+), 393 deletions(-) delete mode 100644 tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers rename tests/data/cgroups/{ => v1}/proc_pid_cgroup (100%) rename tests/data/cgroups/{ => v1}/proc_self_cgroup (100%) create mode 100644 tests/data/cgroups/v1/sys_fs_cgroup_unified_cgroup.subtree_control create mode 100644 tests/data/cgroups/v1_and_v2/proc_pid_cgroup create mode 100644 tests/data/cgroups/v1_and_v2/proc_self_cgroup create mode 100644 tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control create mode 100644 tests/data/cgroups/v2/proc_pid_cgroup create mode 100644 tests/data/cgroups/v2/proc_self_cgroup create mode 100644 tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control create mode 100644 tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control_empty create mode 100644 tests_e2e/test_suites/cgroups_v2_disabled.yml create mode 100644 tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py index ee68bd678..b0ce5a19f 100644 --- a/azurelinuxagent/agent.py +++ b/azurelinuxagent/agent.py @@ -30,7 +30,8 @@ import threading from azurelinuxagent.ga import logcollector, cgroupconfigurator from azurelinuxagent.ga.cgroup import AGENT_LOG_COLLECTOR, CpuCgroup, MemoryCgroup -from azurelinuxagent.ga.cgroupapi import SystemdCgroupsApi +from azurelinuxagent.ga.cgroupapi import get_cgroup_api +from azurelinuxagent.ga.cgroupstelemetry import log_cgroup_warning import azurelinuxagent.common.conf as conf import azurelinuxagent.common.event as event @@ -206,18 +207,29 @@ def collect_logs(self, is_full_mode): # Check the cgroups unit log_collector_monitor = None - cgroups_api = SystemdCgroupsApi() - cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self") + cgroups_api = get_cgroup_api() + cpu_cgroup_path = None + memory_cgroup_path = None if CollectLogsHandler.is_enabled_monitor_cgroups_check(): - cpu_slice_matches = 
(cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path) - memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path) + if cgroups_api is None: + log_cgroup_warning("Unable to determine what version of cgroups to use for log collector resource " + "monitoring and enforcement.") + sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) + + cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self") + cpu_slice_matches = False + memory_slice_matches = False + if cpu_cgroup_path is not None: + cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path) + if memory_cgroup_path is not None: + memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path) if not cpu_slice_matches or not memory_slice_matches: - logger.info("The Log Collector process is not in the proper cgroups:") + log_cgroup_warning("The Log Collector process is not in the proper cgroups:", send_event=False) if not cpu_slice_matches: - logger.info("\tunexpected cpu slice") + log_cgroup_warning("\tunexpected cpu slice", send_event=False) if not memory_slice_matches: - logger.info("\tunexpected memory slice") + log_cgroup_warning("\tunexpected memory slice", send_event=False) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index 6f4bf4ab3..40f66ed74 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -24,7 +24,7 @@ from azurelinuxagent.common import logger from azurelinuxagent.ga.cgroup import CpuCgroup, MemoryCgroup -from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry +from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry, log_cgroup_info, log_cgroup_warning from azurelinuxagent.common.conf import get_agent_pid_file_path from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \ ExtensionOperationError @@ -37,10 +37,32 @@ from 
azurelinuxagent.common.version import get_distro CGROUPS_FILE_SYSTEM_ROOT = '/sys/fs/cgroup' -CGROUP_CONTROLLERS = ["cpu", "memory"] EXTENSION_SLICE_PREFIX = "azure-vmextensions" +def get_cgroup_api(): + """ + Determines which version of Cgroups should be used for resource enforcement and monitoring by the Agent are returns + the corresponding Api. If the required controllers are not mounted in v1 or v2, return None. + """ + v1 = SystemdCgroupsApiv1() + v2 = SystemdCgroupsApiv2() + + log_cgroup_info("Controllers mounted in v1: {0}. Controllers mounted in v2: {1}".format(v1.get_mounted_controllers(), v2.get_mounted_controllers())) + + # It is possible for different controllers to be simultaneously mounted under v1 and v2. If any are mounted under + # v1, use v1. + if v1.is_cpu_or_memory_mounted(): + log_cgroup_info("Using cgroups v1 for resource enforcement and monitoring") + return v1 + elif v2.is_cpu_or_memory_mounted(): + log_cgroup_info("Using cgroups v2 for resource enforcement and monitoring") + return v2 + else: + log_cgroup_warning("CPU and Memory controllers are not mounted in cgroups v1 or v2") + return None + + class SystemdRunError(CGroupsException): """ Raised when systemd-run fails @@ -68,7 +90,7 @@ def track_cgroups(extension_cgroups): for cgroup in extension_cgroups: CGroupsTelemetry.track_cgroup(cgroup) except Exception as exception: - logger.warn("Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. " + logger.warn("[CGW] Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. 
" "Error: {1}".format(cgroup.path, ustr(exception))) @staticmethod @@ -94,7 +116,7 @@ def _foreach_legacy_cgroup(operation): for controller in ['cpu', 'memory']: cgroup = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, "WALinuxAgent", "WALinuxAgent") if os.path.exists(cgroup): - logger.info('Found legacy cgroup {0}', cgroup) + log_cgroup_info('Found legacy cgroup {0}'.format(cgroup), send_event=False) legacy_cgroups.append((controller, cgroup)) try: @@ -109,7 +131,7 @@ def _foreach_legacy_cgroup(operation): operation(controller, daemon_pid) finally: for _, cgroup in legacy_cgroups: - logger.info('Removing {0}', cgroup) + log_cgroup_info('Removing {0}'.format(cgroup), send_event=False) shutil.rmtree(cgroup, ignore_errors=True) return len(legacy_cgroups) @@ -120,11 +142,11 @@ def get_daemon_pid(): class SystemdCgroupsApi(CGroupsApi): """ - Cgroups interface via systemd + Cgroups interface via systemd. Contains common api implementations between cgroups v1 and v2. """ def __init__(self): - self._cgroup_mountpoints = None + self._cgroup_mountpoints = {} self._agent_unit_name = None self._systemd_run_commands = [] self._systemd_run_commands_lock = threading.RLock() @@ -136,23 +158,106 @@ def get_systemd_run_commands(self): with self._systemd_run_commands_lock: return self._systemd_run_commands[:] + def is_cpu_or_memory_mounted(self): + """ + Returns True if either cpu or memory controllers are mounted and enabled at the root cgroup. + """ + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() + return cpu_mount_point is not None or memory_mount_point is not None + + def get_mounted_controllers(self): + """ + Returns a list of the controllers mounted and enabled at the root cgroup. Currently, the only controllers the + agent checks for is cpu and memory. 
+ """ + self.get_cgroup_mount_points() # Updates self._cgroup_mountpoints if empty + return [controller for controller, mount_point in self._cgroup_mountpoints.items() if mount_point is not None] + + def cleanup_legacy_cgroups(self): + """ + Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; + starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If + we find that any of the legacy groups include the PID of the daemon then we need to disable data collection for this + instance (under systemd, moving PIDs across the cgroup file system can produce unpredictable results) + """ + return CGroupsApi._foreach_legacy_cgroup(lambda *_: None) + + @staticmethod + def get_extension_slice_name(extension_name, old_slice=False): + # The old slice makes it difficult for user to override the limits because they need to place drop-in files on every upgrade if extension slice is different for each version. + # old slice includes .- + # new slice without version . + if not old_slice: + extension_name = extension_name.rsplit("-", 1)[0] + # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects. + return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice" + + @staticmethod + def _is_systemd_failure(scope_name, stderr): + stderr.seek(0) + stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace') + unit_not_found = "Unit {0} not found.".format(scope_name) + return unit_not_found in stderr or scope_name not in stderr + def get_cgroup_mount_points(self): """ - Returns a tuple with the mount points for the cpu and memory controllers; the values can be None - if the corresponding controller is not mounted + Cgroup version specific. 
Returns a tuple with the mount points for the cpu and memory controllers; the values + can be None if the corresponding controller is not mounted or enabled at the root cgroup. Updates + self._cgroup_mountpoints if empty. + """ + return None, None + + def get_unit_cgroup_paths(self, unit_name): + """ + Cgroup version specific. Returns a tuple with the path of the cpu and memory cgroups for the given unit. + The values returned can be None if the controller is not mounted or enabled. + """ + pass # pylint: disable=W0107 + + def get_process_cgroup_paths(self, process_id): + """ + Cgroup version specific. Returns a tuple with the path of the cpu and memory cgroups for the given process. + The 'process_id' can be a numeric PID or the string "self" for the current process. + The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is + not mounted or enabled). + """ + pass # pylint: disable=W0107 + + def get_process_cgroup_relative_paths(self, process_id): # pylint: disable=W0613 + """ + Cgroup version specific. Returns a tuple with the path of the cpu and memory cgroups for the given process + (relative to the mount point of the corresponding controller). + The 'process_id' can be a numeric PID or the string "self" for the current process. + The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is + not mounted). """ + pass # pylint: disable=W0107 + + def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, + error_code=ExtensionErrorCodes.PluginUnknownFailure): + """ + Cgroup version specific. Starts extension command. 
+ """ + pass # pylint: disable=W0107 + + +class SystemdCgroupsApiv1(SystemdCgroupsApi): + """ + Cgroups v1 interface via systemd + """ + def get_cgroup_mount_points(self): # the output of mount is similar to - # $ mount -t cgroup - # cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) - # cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct) - # cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory) + # $ findmnt -t cgroup --noheadings + # /sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd + # /sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory + # /sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct # etc # - if self._cgroup_mountpoints is None: + if not self._cgroup_mountpoints: cpu = None memory = None - for line in shellutil.run_command(['mount', '-t', 'cgroup']).splitlines(): - match = re.search(r'on\s+(?P/\S+(memory|cpuacct))\s', line) + for line in shellutil.run_command(['findmnt', '-t', 'cgroup', '--noheadings']).splitlines(): + match = re.search(r'(?P/\S+(memory|cpuacct))\s', line) if match is not None: path = match.group('path') if 'cpuacct' in path: @@ -163,14 +268,34 @@ def get_cgroup_mount_points(self): return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory'] - @staticmethod - def get_process_cgroup_relative_paths(process_id): - """ - Returns a tuple with the path of the cpu and memory cgroups for the given process (relative to the mount point of the corresponding - controller). - The 'process_id' can be a numeric PID or the string "self" for the current process. - The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted). 
- """ + def get_unit_cgroup_paths(self, unit_name): + # Ex: ControlGroup=/azure.slice/walinuxagent.service + # controlgroup_path[1:] = azure.slice/walinuxagent.service + controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() + + cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \ + if cpu_mount_point is not None else None + + memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \ + if memory_mount_point is not None else None + + return cpu_cgroup_path, memory_cgroup_path + + def get_process_cgroup_paths(self, process_id): + cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) + + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() + + cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \ + if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None + + memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \ + if memory_mount_point is not None and memory_cgroup_relative_path is not None else None + + return cpu_cgroup_path, memory_cgroup_path + + def get_process_cgroup_relative_paths(self, process_id): # The contents of the file are similar to # # cat /proc/1218/cgroup # 10:memory:/system.slice/walinuxagent.service @@ -190,79 +315,6 @@ def get_process_cgroup_relative_paths(process_id): return cpu_path, memory_path - def get_process_cgroup_paths(self, process_id): - """ - Returns a tuple with the path of the cpu and memory cgroups for the given process. The 'process_id' can be a numeric PID or the string "self" for the current process. - The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted). 
- """ - cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) - - cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() - - cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \ - if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None - - memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \ - if memory_mount_point is not None and memory_cgroup_relative_path is not None else None - - return cpu_cgroup_path, memory_cgroup_path - - def get_unit_cgroup_paths(self, unit_name): - """ - Returns a tuple with the path of the cpu and memory cgroups for the given unit. - The values returned can be None if the controller is not mounted. - Ex: ControlGroup=/azure.slice/walinuxagent.service - controlgroup_path[1:] = azure.slice/walinuxagent.service - """ - controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") - cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() - - cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \ - if cpu_mount_point is not None else None - - memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \ - if memory_mount_point is not None else None - - return cpu_cgroup_path, memory_cgroup_path - - @staticmethod - def get_cgroup2_controllers(): - """ - Returns a tuple with the mount point for the cgroups v2 controllers, and the currently mounted controllers; - either value can be None if cgroups v2 or its controllers are not mounted - """ - # the output of mount is similar to - # $ mount -t cgroup2 - # cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate) - # - for line in shellutil.run_command(['mount', '-t', 'cgroup2']).splitlines(): - match = re.search(r'on\s+(?P/\S+)\s', line) - if match is not None: - mount_point = match.group('path') - controllers = None - controllers_file = 
os.path.join(mount_point, 'cgroup.controllers') - if os.path.exists(controllers_file): - controllers = fileutil.read_file(controllers_file) - return mount_point, controllers - return None, None - - @staticmethod - def _is_systemd_failure(scope_name, stderr): - stderr.seek(0) - stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace') - unit_not_found = "Unit {0} not found.".format(scope_name) - return unit_not_found in stderr or scope_name not in stderr - - @staticmethod - def get_extension_slice_name(extension_name, old_slice=False): - # The old slice makes it difficult for user to override the limits because they need to place drop-in files on every upgrade if extension slice is different for each version. - # old slice includes .- - # new slice without version . - if not old_slice: - extension_name = extension_name.rsplit("-", 1)[0] - # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects. - return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice" - def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): scope = "{0}_{1}".format(cmd_name, uuid.uuid4()) @@ -272,7 +324,8 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh # Some distros like ubuntu20 by default cpu and memory accounting enabled. Thus create nested cgroups under the extension slice # So disabling CPU and Memory accounting prevents from creating nested cgroups, so that all the counters will be present in extension Cgroup # since slice unit file configured with accounting enabled. 
- "systemd-run --property=CPUAccounting=no --property=MemoryAccounting=no --unit={0} --scope --slice={1} {2}".format(scope, extension_slice_name, command), + "systemd-run --property=CPUAccounting=no --property=MemoryAccounting=no --unit={0} --scope --slice={1} {2}".format( + scope, extension_slice_name, command), shell=shell, cwd=cwd, stdout=stdout, @@ -285,7 +338,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh scope_name = scope + '.scope' - logger.info("Started extension in unit '{0}'", scope_name) + log_cgroup_info("Started extension in unit '{0}'".format(scope_name), send_event=False) cpu_cgroup = None try: @@ -294,14 +347,14 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_cgroup_mount_points() if cpu_cgroup_mountpoint is None: - logger.info("The CPU controller is not mounted; will not track resource usage") + log_cgroup_info("The CPU controller is not mounted; will not track resource usage", send_event=False) else: cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path) cpu_cgroup = CpuCgroup(extension_name, cpu_cgroup_path) CGroupsTelemetry.track_cgroup(cpu_cgroup) if memory_cgroup_mountpoint is None: - logger.info("The Memory controller is not mounted; will not track resource usage") + log_cgroup_info("The Memory controller is not mounted; will not track resource usage", send_event=False) else: memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path) memory_cgroup = MemoryCgroup(extension_name, memory_cgroup_path) @@ -309,10 +362,12 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh except IOError as e: if e.errno == 2: # 'No such file or directory' - logger.info("The extension command already completed; will not track resource usage") - logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e)) + log_cgroup_info("The 
extension command already completed; will not track resource usage", + send_event=False) + log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(e)), + send_event=False) except Exception as e: - logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e)) + log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(e)), send_event=False) # Wait for process completion or timeout try: @@ -342,11 +397,139 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh with self._systemd_run_commands_lock: self._systemd_run_commands.remove(process.pid) - def cleanup_legacy_cgroups(self): + +class SystemdCgroupsApiv2(SystemdCgroupsApi): + """ + Cgroups v2 interface via systemd + """ + + def is_controller_enabled(self, controller, cgroup_path): """ - Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; - starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If - we find that any of the legacy groups include the PID of the daemon then we need to disable data collection for this - instance (under systemd, moving PIDs across the cgroup file system can produce unpredictable results) + Returns True if the provided controller is enabled at the provided cgroup. + + There are two ways to determine if a controller is enabled at the provided cgroup: + + 1. For non-leaf cgroups, the cgroup.subtree_control shows space separated list of the controllers which are + enabled to control resource distribution from the cgroup to its children. All non-root "cgroup.subtree_control" + files can only contain controllers which are enabled in the parent's "cgroup.subtree_control" file. + $ cat /sys/fs/cgroup/cgroup.subtree_control + cpuset cpu io memory hugetlb pids rdma misc + + 2. 
For leaf cgroups, the cgroup.subtree_control file will be empty and the presence of "." + prefixed interface files at the path indicate the controller is enabled. + $ ls /sys/fs/cgroup/azure.slice/walinuxagent.service/ + cgroup.controllers cgroup.max.descendants cgroup.threads cpu.pressure cpu.weight.nice memory.high memory.oom.group memory.swap.current memory.zswap.current pids.peak + cgroup.events cgroup.pressure cgroup.type cpu.stat io.pressure memory.low memory.peak memory.swap.events memory.zswap.max + cgroup.freeze cgroup.procs cpu.idle cpu.uclamp.max memory.current memory.max memory.pressure memory.swap.high pids.current + cgroup.kill cgroup.stat cpu.max cpu.uclamp.min memory.events memory.min memory.reclaim memory.swap.max pids.events + cgroup.max.depth cgroup.subtree_control cpu.max.burst cpu.weight memory.events.local memory.numa_stat memory.stat memory.swap.peak pids.max + + If either check is True, the controller is enabled at the cgroup. Check 1 is necessary because no controller + interface files exist at the root cgroup, even if the controller is enabled. 
""" - return CGroupsApi._foreach_legacy_cgroup(lambda *_: None) + if cgroup_path is not None and controller is not None: + # Check that the controller is enabled in the cgroup.subtree_control file + enabled_controllers_file = os.path.join(cgroup_path, 'cgroup.subtree_control') + if os.path.exists(enabled_controllers_file): + enabled_controllers = fileutil.read_file(enabled_controllers_file).rstrip().split(" ") + if controller in enabled_controllers: + return True + + # Check that the controller interface files exist in the cgroup + if os.path.exists(cgroup_path): + for item in os.listdir(cgroup_path): + if item.startswith(controller + '.'): + return True + + return False + + def get_cgroup_mount_points(self): + # The output of mount is similar to + # $ findmnt -t cgroup2 --noheadings + # /sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot + # + # Since v2 is a unified hierarchy, this method checks if each controller is enabled at the root cgroup. This + # check is necessary because all non-root "cgroup.subtree_control" files can only contain controllers which are + # enabled in the parent's "cgroup.subtree_control" file. 
+ + if not self._cgroup_mountpoints: + cpu = None + memory = None + for line in shellutil.run_command(['findmnt', '-t', 'cgroup2', '--noheadings']).splitlines(): + match = re.search(r'(?P/\S+)\s+cgroup2', line) + if match is not None: + mount_point = match.group('path') + if self.is_controller_enabled('cpu', mount_point): + cpu = mount_point + if self.is_controller_enabled('memory', mount_point): + memory = mount_point + self._cgroup_mountpoints = {'cpu': cpu, 'memory': memory} + + return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory'] + + def get_unit_cgroup_paths(self, unit_name): + # Ex: ControlGroup=/azure.slice/walinuxagent.service + # controlgroup_path[1:] = azure.slice/walinuxagent.service + controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() + + # Since v2 is a unified hierarchy, we need to check if each controller is enabled for the cgroup. If a + # controller is not enabled, then its controller interface files won't exist at the cgroup path + cpu_cgroup_path = None + if cpu_mount_point is not None: + cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) + if self.is_controller_enabled('cpu', cgroup_path): + cpu_cgroup_path = cgroup_path + + memory_cgroup_path = None + if memory_mount_point is not None: + cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) + if self.is_controller_enabled('memory', cgroup_path): + memory_cgroup_path = cgroup_path + + return cpu_cgroup_path, memory_cgroup_path + + def get_process_cgroup_paths(self, process_id): + cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() + + # Since v2 is a unified hierarchy, we need to check if each controller is enabled for the cgroup. 
If a + # controller is not enabled, then its controller interface files won't exist at the cgroup path + cpu_cgroup_path = None + if cpu_mount_point is not None and cpu_cgroup_relative_path is not None: + cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) + if self.is_controller_enabled('cpu', cgroup_path): + cpu_cgroup_path = cgroup_path + + memory_cgroup_path = None + if memory_mount_point is not None and memory_cgroup_relative_path is not None: + cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) + if self.is_controller_enabled('memory', cgroup_path): + memory_cgroup_path = cgroup_path + + return cpu_cgroup_path, memory_cgroup_path + + def get_process_cgroup_relative_paths(self, process_id): + # The contents of the file are similar to + # # cat /proc/1218/cgroup + # 0::/azure.slice/walinuxagent.service + cpu_path = None + memory_path = None + for line in fileutil.read_file("/proc/{0}/cgroup".format(process_id)).splitlines(): + match = re.match(r'\d+::(?P\S+)', line) + if match is not None: + path = match.group('path').lstrip('/') if match.group('path') != '/' else None + memory_path = path + cpu_path = path + + return cpu_path, memory_path + + def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): # pylint: disable=W0613 + """ + Currently, the agent will not enable cgroups v2 or use SystemdCgroupv2Api() to start extension commands. Raising + an exception here for CGroupConfigurator to catch in case v2 is improperly enabled. 
+ """ + error_msg = "The agent does not currently support running extensions in cgroups v2" + log_cgroup_warning(error_msg) + raise CGroupsException(msg=error_msg) + diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index 09eb8b55a..7b415d99f 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -24,8 +24,9 @@ from azurelinuxagent.common import conf from azurelinuxagent.common import logger from azurelinuxagent.ga.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup -from azurelinuxagent.ga.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX -from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry +from azurelinuxagent.ga.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX, \ + get_cgroup_api, SystemdCgroupsApiv2 +from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry, log_cgroup_info, log_cgroup_warning from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import systemd @@ -114,18 +115,6 @@ class DisableCgroups(object): EXTENSIONS = "extensions" -def _log_cgroup_info(format_string, *args): - message = format_string.format(*args) - logger.info("[CGI] " + message) - add_event(op=WALAEventOperation.CGroupsInfo, message=message) - - -def _log_cgroup_warning(format_string, *args): - message = format_string.format(*args) - logger.info("[CGW] " + message) # log as INFO for now, in the future it should be logged as WARNING - add_event(op=WALAEventOperation.CGroupsInfo, message=message, is_success=False, log_event=False) - - class CGroupConfigurator(object): """ This class implements the high-level operations on CGroups (e.g. 
initialization, creation, etc) @@ -166,23 +155,28 @@ def initialize(self): agent_drop_in_file_memory_accounting, agent_drop_in_file_cpu_quota]) self.__cleanup_all_files(files_to_cleanup) self.__reload_systemd_config() - logger.info("Agent reset the quotas if distro: {0} goes from supported to unsupported list", get_distro()) + log_cgroup_info("Agent reset the quotas if distro: {0} goes from supported to unsupported list".format(get_distro()), send_event=False) except Exception as err: - logger.warn("Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err)) + logger.warn("[CGW] Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err)) # check whether cgroup monitoring is supported on the current distro self._cgroups_supported = CGroupsApi.cgroups_supported() if not self._cgroups_supported: - logger.info("Cgroup monitoring is not supported on {0}", get_distro()) + log_cgroup_info("Cgroup monitoring is not supported on {0}".format(get_distro()), send_event=False) + return + + # Determine which version of the Cgroup API should be used. If the correct version can't be determined, + # do not enable resource monitoring/enforcement. 
+ self._cgroups_api = get_cgroup_api() + if self._cgroups_api is None: return # check that systemd is detected correctly - self._cgroups_api = SystemdCgroupsApi() if not systemd.is_systemd(): - _log_cgroup_warning("systemd was not detected on {0}", get_distro()) + log_cgroup_warning("systemd was not detected on {0}".format(get_distro())) return - _log_cgroup_info("systemd version: {0}", systemd.get_version()) + log_cgroup_info("systemd version: {0}".format(systemd.get_version())) if not self.__check_no_legacy_cgroups(): return @@ -190,34 +184,38 @@ def initialize(self): agent_unit_name = systemd.get_agent_unit_name() agent_slice = systemd.get_unit_property(agent_unit_name, "Slice") if agent_slice not in (AZURE_SLICE, "system.slice"): - _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice) + log_cgroup_warning("The agent is within an unexpected slice: {0}".format(agent_slice)) return self.__setup_azure_slice() - cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers() - self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(agent_slice, + cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers_mount_points() + self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroup_paths(agent_slice, cpu_controller_root, memory_controller_root) + if self.cgroup_v2_enabled(): + log_cgroup_info("Agent and extensions resource monitoring is not currently supported on cgroups v2") + return + if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None: self.enable() if self._agent_cpu_cgroup_path is not None: - _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path) + log_cgroup_info("Agent CPU cgroup: {0}".format(self._agent_cpu_cgroup_path)) self.__set_cpu_quota(conf.get_agent_cpu_quota()) CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path)) if self._agent_memory_cgroup_path is 
not None: - _log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path) + log_cgroup_info("Agent Memory cgroup: {0}".format(self._agent_memory_cgroup_path)) self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path) CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup) - _log_cgroup_info('Agent cgroups enabled: {0}', self._agent_cgroups_enabled) except Exception as exception: - _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception)) + log_cgroup_warning("Error initializing cgroups: {0}".format(ustr(exception))) finally: + log_cgroup_info('Agent cgroups enabled: {0}'.format(self._agent_cgroups_enabled)) self._initialized = True def __check_no_legacy_cgroups(self): @@ -227,33 +225,22 @@ def __check_no_legacy_cgroups(self): """ legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups() if legacy_cgroups > 0: - _log_cgroup_warning("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.") + log_cgroup_warning("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.") return False return True - def __get_cgroup_controllers(self): - # - # check v1 controllers - # + def __get_cgroup_controllers_mount_points(self): cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points() if cpu_controller_root is not None: - logger.info("The CPU cgroup controller is mounted at {0}", cpu_controller_root) + log_cgroup_info("The CPU cgroup controller is mounted at {0}".format(cpu_controller_root), send_event=False) else: - _log_cgroup_warning("The CPU cgroup controller is not mounted") + log_cgroup_warning("The CPU cgroup controller is not mounted") if memory_controller_root is not None: - logger.info("The memory cgroup controller is mounted at {0}", memory_controller_root) + log_cgroup_info("The memory cgroup controller is mounted at {0}".format(memory_controller_root), send_event=False) else: - _log_cgroup_warning("The memory cgroup 
controller is not mounted") - - # - # check v2 controllers - # - cgroup2_mount_point, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers() - if cgroup2_mount_point is not None: - _log_cgroup_info("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mount_point, - cgroup2_controllers) + log_cgroup_warning("The memory cgroup controller is not mounted") return cpu_controller_root, memory_controller_root @@ -334,7 +321,7 @@ def __setup_azure_slice(): for path, contents in files_to_create: CGroupConfigurator._Impl.__create_unit_file(path, contents) except Exception as exception: - _log_cgroup_warning("Failed to create unit files for the azure slice: {0}", ustr(exception)) + log_cgroup_warning("Failed to create unit files for the azure slice: {0}".format(ustr(exception))) for unit_file in files_to_create: CGroupConfigurator._Impl.__cleanup_unit_file(unit_file) return @@ -345,10 +332,10 @@ def __setup_azure_slice(): def __reload_systemd_config(): # reload the systemd configuration; the new slices will be used once the agent's service restarts try: - logger.info("Executing systemctl daemon-reload...") + log_cgroup_info("Executing systemctl daemon-reload...", send_event=False) shellutil.run_command(["systemctl", "daemon-reload"]) except Exception as exception: - _log_cgroup_warning("daemon-reload failed (create azure slice): {0}", ustr(exception)) + log_cgroup_warning("daemon-reload failed (create azure slice): {0}".format(ustr(exception))) # W0238: Unused private member `_Impl.__create_unit_file(path, contents)` (unused-private-member) @staticmethod @@ -358,7 +345,7 @@ def __create_unit_file(path, contents): # pylint: disable=unused-private-member fileutil.mkdir(parent, mode=0o755) exists = os.path.exists(path) fileutil.write_file(path, contents) - _log_cgroup_info("{0} {1}", "Updated" if exists else "Created", path) + log_cgroup_info("{0} {1}".format("Updated" if exists else "Created", path)) # W0238: Unused private member 
`_Impl.__cleanup_unit_file(path)` (unused-private-member) @staticmethod @@ -366,9 +353,9 @@ def __cleanup_unit_file(path): # pylint: disable=unused-private-member if os.path.exists(path): try: os.remove(path) - _log_cgroup_info("Removed {0}", path) + log_cgroup_info("Removed {0}".format(path)) except Exception as exception: - _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception)) + log_cgroup_warning("Failed to remove {0}: {1}".format(path, ustr(exception))) @staticmethod def __cleanup_all_files(files_to_cleanup): @@ -376,9 +363,9 @@ def __cleanup_all_files(files_to_cleanup): if os.path.exists(path): try: os.remove(path) - _log_cgroup_info("Removed {0}", path) + log_cgroup_info("Removed {0}".format(path)) except Exception as exception: - _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception)) + log_cgroup_warning("Failed to remove {0}: {1}".format(path, ustr(exception))) @staticmethod def __create_all_files(files_to_create): @@ -387,7 +374,7 @@ def __create_all_files(files_to_create): for path, contents in files_to_create: CGroupConfigurator._Impl.__create_unit_file(path, contents) except Exception as exception: - _log_cgroup_warning("Failed to create unit files : {0}", ustr(exception)) + log_cgroup_warning("Failed to create unit files : {0}".format(ustr(exception))) for unit_file in files_to_create: CGroupConfigurator._Impl.__cleanup_unit_file(unit_file) return @@ -411,7 +398,7 @@ def is_extension_resource_limits_setup_completed(self, extension_name, cpu_quota return True return False - def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controller_root): + def __get_agent_cgroup_paths(self, agent_slice, cpu_controller_root, memory_controller_root): agent_unit_name = systemd.get_agent_unit_name() expected_relative_path = os.path.join(agent_slice, agent_unit_name) @@ -419,29 +406,25 @@ def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controlle "self") if cpu_cgroup_relative_path is None: - 
_log_cgroup_warning("The agent's process is not within a CPU cgroup") + log_cgroup_warning("The agent's process is not within a CPU cgroup") else: if cpu_cgroup_relative_path == expected_relative_path: - _log_cgroup_info('CPUAccounting: {0}', systemd.get_unit_property(agent_unit_name, "CPUAccounting")) - _log_cgroup_info('CPUQuota: {0}', systemd.get_unit_property(agent_unit_name, "CPUQuotaPerSecUSec")) + log_cgroup_info('CPUAccounting: {0}'.format(systemd.get_unit_property(agent_unit_name, "CPUAccounting"))) + log_cgroup_info('CPUQuota: {0}'.format(systemd.get_unit_property(agent_unit_name, "CPUQuotaPerSecUSec"))) else: - _log_cgroup_warning( - "The Agent is not in the expected CPU cgroup; will not enable monitoring. Cgroup:[{0}] Expected:[{1}]", - cpu_cgroup_relative_path, - expected_relative_path) + log_cgroup_warning( + "The Agent is not in the expected CPU cgroup; will not enable monitoring. Cgroup:[{0}] Expected:[{1}]".format(cpu_cgroup_relative_path, expected_relative_path)) cpu_cgroup_relative_path = None # Set the path to None to prevent monitoring if memory_cgroup_relative_path is None: - _log_cgroup_warning("The agent's process is not within a memory cgroup") + log_cgroup_warning("The agent's process is not within a memory cgroup") else: if memory_cgroup_relative_path == expected_relative_path: memory_accounting = systemd.get_unit_property(agent_unit_name, "MemoryAccounting") - _log_cgroup_info('MemoryAccounting: {0}', memory_accounting) + log_cgroup_info('MemoryAccounting: {0}'.format(memory_accounting)) else: - _log_cgroup_info( - "The Agent is not in the expected memory cgroup; will not enable monitoring. CGroup:[{0}] Expected:[{1}]", - memory_cgroup_relative_path, - expected_relative_path) + log_cgroup_warning( + "The Agent is not in the expected memory cgroup; will not enable monitoring. 
CGroup:[{0}] Expected:[{1}]".format(memory_cgroup_relative_path, expected_relative_path)) memory_cgroup_relative_path = None # Set the path to None to prevent monitoring if cpu_controller_root is not None and cpu_cgroup_relative_path is not None: @@ -468,6 +451,9 @@ def agent_enabled(self): def extensions_enabled(self): return self._extensions_cgroups_enabled + def cgroup_v2_enabled(self): + return isinstance(self._cgroups_api, SystemdCgroupsApiv2) + def enable(self): if not self.supported(): raise CGroupsException( @@ -481,7 +467,7 @@ def disable(self, reason, disable_cgroups): self.__reset_agent_cpu_quota() extension_services = self.get_extension_services_list() for extension in extension_services: - logger.info("Resetting extension : {0} and it's services: {1} CPUQuota".format(extension, extension_services[extension])) + log_cgroup_info("Resetting extension : {0} and it's services: {1} CPUQuota".format(extension, extension_services[extension]), send_event=False) self.__reset_extension_cpu_quota(extension_name=extension) self.__reset_extension_services_cpu_quota(extension_services[extension]) self.__reload_systemd_config() @@ -494,9 +480,7 @@ def disable(self, reason, disable_cgroups): self.__reset_agent_cpu_quota() CGroupsTelemetry.stop_tracking(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path)) - message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason) - logger.info(message) # log as INFO for now, in the future it should be logged as WARNING - add_event(op=WALAEventOperation.CGroupsDisabled, message=message, is_success=False, log_event=False) + log_cgroup_warning("Disabling resource usage monitoring. Reason: {0}".format(reason), op=WALAEventOperation.CGroupsDisabled) @staticmethod def __set_cpu_quota(quota): @@ -507,7 +491,7 @@ def __set_cpu_quota(quota): over this setting. 
""" quota_percentage = "{0}%".format(quota) - _log_cgroup_info("Ensuring the agent's CPUQuota is {0}", quota_percentage) + log_cgroup_info("Ensuring the agent's CPUQuota is {0}".format(quota_percentage)) if CGroupConfigurator._Impl.__try_set_cpu_quota(quota_percentage): CGroupsTelemetry.set_track_throttled_time(True) @@ -519,10 +503,9 @@ def __reset_agent_cpu_quota(): NOTE: This resets the quota on the agent's default dropin file; any local overrides on the VM will take precedence over this setting. """ - logger.info("Resetting agent's CPUQuota") + log_cgroup_info("Resetting agent's CPUQuota", send_event=False) if CGroupConfigurator._Impl.__try_set_cpu_quota(''): # setting an empty value resets to the default (infinity) - _log_cgroup_info('CPUQuota: {0}', - systemd.get_unit_property(systemd.get_agent_unit_name(), "CPUQuotaPerSecUSec")) + log_cgroup_info('CPUQuota: {0}'.format(systemd.get_unit_property(systemd.get_agent_unit_name(), "CPUQuotaPerSecUSec"))) # W0238: Unused private member `_Impl.__try_set_cpu_quota(quota)` (unused-private-member) @staticmethod @@ -536,13 +519,13 @@ def __try_set_cpu_quota(quota): # pylint: disable=unused-private-member return True # no need to update the file; return here to avoid doing a daemon-reload CGroupConfigurator._Impl.__create_unit_file(drop_in_file, contents) except Exception as exception: - _log_cgroup_warning('Failed to set CPUQuota: {0}', ustr(exception)) + log_cgroup_warning('Failed to set CPUQuota: {0}', ustr(exception)) return False try: - logger.info("Executing systemctl daemon-reload...") + log_cgroup_info("Executing systemctl daemon-reload...", send_event=False) shellutil.run_command(["systemctl", "daemon-reload"]) except Exception as exception: - _log_cgroup_warning("daemon-reload failed (set quota): {0}", ustr(exception)) + log_cgroup_warning("daemon-reload failed (set quota): {0}", ustr(exception)) return False return True @@ -626,7 +609,7 @@ def _check_processes_in_agent_cgroup(self): if len(unexpected) >= 5: # 
collect just a small sample break except Exception as exception: - _log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception))) + log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception))) if len(unexpected) > 0: self._report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected) @@ -761,17 +744,17 @@ def start_tracking_unit_cgroups(self, unit_name): cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name) if cpu_cgroup_path is None: - logger.info("The CPU controller is not mounted; will not track resource usage") + log_cgroup_info("The CPU controller is not mounted; will not track resource usage", send_event=False) else: CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, cpu_cgroup_path)) if memory_cgroup_path is None: - logger.info("The Memory controller is not mounted; will not track resource usage") + log_cgroup_info("The Memory controller is not mounted; will not track resource usage", send_event=False) else: CGroupsTelemetry.track_cgroup(MemoryCgroup(unit_name, memory_cgroup_path)) except Exception as exception: - logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(exception)) + log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(exception)), send_event=False) def stop_tracking_unit_cgroups(self, unit_name): """ @@ -787,7 +770,7 @@ def stop_tracking_unit_cgroups(self, unit_name): CGroupsTelemetry.stop_tracking(MemoryCgroup(unit_name, memory_cgroup_path)) except Exception as exception: - logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception)) + log_cgroup_info("Failed to stop tracking resource usage for the extension service: {0}".format(ustr(exception)), send_event=False) def stop_tracking_extension_cgroups(self, extension_name): """ @@ -809,7 +792,7 @@ def stop_tracking_extension_cgroups(self, extension_name): 
CGroupsTelemetry.stop_tracking(MemoryCgroup(extension_name, memory_cgroup_path)) except Exception as exception: - logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception)) + log_cgroup_info("Failed to stop tracking resource usage for the extension service: {0}".format(ustr(exception)), send_event=False) def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): @@ -836,6 +819,12 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh extension_name, ustr(exception)) self.disable(reason, DisableCgroups.ALL) # fall-through and re-invoke the extension + except CGroupsException as exception: + reason = 'Failed to start {0} using cgroups, will try invoking the extension directly. Error: {1}'.format( + extension_name, ustr(exception)) + self.disable(reason, DisableCgroups.ALL) + # fall-through and re-invoke the extension + # subprocess-popen-preexec-fn Disabled: code is not multi-threaded process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout, stderr=stderr, preexec_fn=os.setsid) # pylint: disable=W1509 @@ -867,14 +856,14 @@ def setup_extension_slice(self, extension_name, cpu_quota): try: cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity) if cpu_quota == "": - _log_cgroup_info("CPUQuota not set for {0}", extension_name) + log_cgroup_info("CPUQuota not set for {0}".format(extension_name)) else: - _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", extension_name, cpu_quota) + log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}".format(extension_name, cpu_quota)) slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name, cpu_quota=cpu_quota) CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents) except Exception as exception: - 
_log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}", extension_name, + log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}", extension_name, ustr(exception)) CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path) @@ -916,7 +905,7 @@ def set_extension_services_cpu_memory_quota(self, services_list): cpu_quota = service.get('cpuQuotaPercentage', None) if cpu_quota is not None: cpu_quota = str(cpu_quota) + "%" - _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", service_name, cpu_quota) + log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}".format(service_name, cpu_quota)) drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA) cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota) files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents)) @@ -950,7 +939,7 @@ def __reset_extension_services_cpu_quota(self, services_list): files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents)) self.__create_all_files(files_to_create) except Exception as exception: - _log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}', service_name, ustr(exception)) + log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}', service_name, ustr(exception)) def remove_extension_services_drop_in_files(self, services_list): """ @@ -975,7 +964,7 @@ def remove_extension_services_drop_in_files(self, services_list): files_to_cleanup.append(drop_in_file_cpu_quota) CGroupConfigurator._Impl.__cleanup_all_files(files_to_cleanup) - _log_cgroup_info("Drop in files removed for {0}".format(service_name)) + log_cgroup_info("Drop in files removed for {0}".format(service_name)) def stop_tracking_extension_services_cgroups(self, services_list): """ @@ -1016,10 +1005,10 @@ def get_extension_services_list(): services = resource_limits.get('services') if resource_limits else None extensions_services[extensions_name] = services except (IOError, OSError) as e: - _log_cgroup_warning( + 
log_cgroup_warning( 'Failed to load manifest file ({0}): {1}'.format(manifest_path, e.strerror)) except ValueError: - _log_cgroup_warning('Malformed manifest file ({0}).'.format(manifest_path)) + log_cgroup_warning('Malformed manifest file ({0}).'.format(manifest_path)) return extensions_services # unique instance for the singleton diff --git a/azurelinuxagent/ga/cgroupstelemetry.py b/azurelinuxagent/ga/cgroupstelemetry.py index 5943b45ad..5a564de63 100644 --- a/azurelinuxagent/ga/cgroupstelemetry.py +++ b/azurelinuxagent/ga/cgroupstelemetry.py @@ -17,10 +17,23 @@ import threading from azurelinuxagent.common import logger +from azurelinuxagent.common.event import WALAEventOperation, add_event from azurelinuxagent.ga.cgroup import CpuCgroup from azurelinuxagent.common.future import ustr +def log_cgroup_info(formatted_string, op=WALAEventOperation.CGroupsInfo, send_event=True): + logger.info("[CGI] " + formatted_string) + if send_event: + add_event(op=op, message=formatted_string) + + +def log_cgroup_warning(formatted_string, op=WALAEventOperation.CGroupsInfo, send_event=True): + logger.info("[CGW] " + formatted_string) # log as INFO for now, in the future it should be logged as WARNING + if send_event: + add_event(op=op, message=formatted_string, is_success=False, log_event=False) + + class CGroupsTelemetry(object): """ """ diff --git a/tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers b/tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers deleted file mode 100644 index 2a03d239d..000000000 --- a/tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers +++ /dev/null @@ -1,7 +0,0 @@ -io -memory -pids -perf_event -rdma -cpu -freezer \ No newline at end of file diff --git a/tests/data/cgroups/proc_pid_cgroup b/tests/data/cgroups/v1/proc_pid_cgroup similarity index 100% rename from tests/data/cgroups/proc_pid_cgroup rename to tests/data/cgroups/v1/proc_pid_cgroup diff --git a/tests/data/cgroups/proc_self_cgroup 
b/tests/data/cgroups/v1/proc_self_cgroup similarity index 100% rename from tests/data/cgroups/proc_self_cgroup rename to tests/data/cgroups/v1/proc_self_cgroup diff --git a/tests/data/cgroups/v1/sys_fs_cgroup_unified_cgroup.subtree_control b/tests/data/cgroups/v1/sys_fs_cgroup_unified_cgroup.subtree_control new file mode 100644 index 000000000..e69de29bb diff --git a/tests/data/cgroups/v1_and_v2/proc_pid_cgroup b/tests/data/cgroups/v1_and_v2/proc_pid_cgroup new file mode 100644 index 000000000..179c59daa --- /dev/null +++ b/tests/data/cgroups/v1_and_v2/proc_pid_cgroup @@ -0,0 +1,12 @@ +12:devices:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope +11:perf_event:/ +10:rdma:/ +9:blkio:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope +8:net_cls,net_prio:/ +7:freezer:/ +6:hugetlb:/ +4:cpuset:/ +3:cpu,cpuacct:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope +2:pids:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope +1:name=systemd:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope +0::/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope diff --git a/tests/data/cgroups/v1_and_v2/proc_self_cgroup b/tests/data/cgroups/v1_and_v2/proc_self_cgroup new file mode 100644 index 000000000..40e7dd5b1 --- /dev/null +++ b/tests/data/cgroups/v1_and_v2/proc_self_cgroup @@ -0,0 +1,12 @@ +12:blkio:/system.slice/walinuxagent.service +11:cpu,cpuacct:/system.slice/walinuxagent.service +10:devices:/system.slice/walinuxagent.service +9:pids:/system.slice/walinuxagent.service +7:freezer:/ +6:hugetlb:/ +5:perf_event:/ +4:net_cls,net_prio:/ +3:cpuset:/ +2:rdma:/ +1:name=systemd:/system.slice/walinuxagent.service +0::/system.slice/walinuxagent.service diff --git a/tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control 
b/tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control new file mode 100644 index 000000000..2142c3ad3 --- /dev/null +++ b/tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control @@ -0,0 +1 @@ +memory diff --git a/tests/data/cgroups/v2/proc_pid_cgroup b/tests/data/cgroups/v2/proc_pid_cgroup new file mode 100644 index 000000000..8a1f8d0be --- /dev/null +++ b/tests/data/cgroups/v2/proc_pid_cgroup @@ -0,0 +1 @@ +0::/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope diff --git a/tests/data/cgroups/v2/proc_self_cgroup b/tests/data/cgroups/v2/proc_self_cgroup new file mode 100644 index 000000000..0027b4040 --- /dev/null +++ b/tests/data/cgroups/v2/proc_self_cgroup @@ -0,0 +1 @@ +0::/system.slice/walinuxagent.service diff --git a/tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control b/tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control new file mode 100644 index 000000000..c94e05c42 --- /dev/null +++ b/tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control @@ -0,0 +1 @@ +cpuset cpu io memory pids diff --git a/tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control_empty b/tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control_empty new file mode 100644 index 000000000..e69de29bb diff --git a/tests/ga/test_cgroupapi.py b/tests/ga/test_cgroupapi.py index ad8ef80c2..7064ea51f 100644 --- a/tests/ga/test_cgroupapi.py +++ b/tests/ga/test_cgroupapi.py @@ -22,11 +22,15 @@ import subprocess import tempfile -from azurelinuxagent.ga.cgroupapi import CGroupsApi, SystemdCgroupsApi +from azurelinuxagent.common.exception import CGroupsException +from azurelinuxagent.common.utils.fileutil import read_file +from azurelinuxagent.ga import cgroupapi +from azurelinuxagent.ga.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdCgroupsApiv1, SystemdCgroupsApiv2 from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.osutil import systemd from 
azurelinuxagent.common.utils import fileutil -from tests.lib.mock_cgroup_environment import mock_cgroup_environment +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, mock_cgroup_v2_environment, \ + mock_cgroup_v1_and_v2_environment from tests.lib.tools import AgentTestCase, patch, mock_sleep from tests.lib.cgroups_tools import CGroupsTools @@ -47,7 +51,24 @@ def tearDown(self): AgentTestCase.tearDown(self) -class CGroupsApiTestCase(_MockedFileSystemTestCase): +class CGroupsApiTestCase(AgentTestCase): + def test_get_cgroup_api_is_v1_when_v1_controllers_mounted(self): + with mock_cgroup_v1_environment(self.tmp_dir): + self.assertIsInstance(cgroupapi.get_cgroup_api(), SystemdCgroupsApiv1) + + def test_get_cgroup_api_is_v2_when_v2_controllers_mounted(self): + with mock_cgroup_v2_environment(self.tmp_dir): + self.assertIsInstance(cgroupapi.get_cgroup_api(), SystemdCgroupsApiv2) + + def test_get_cgroup_api_is_v1_when_v1_and_v2_controllers_mounted(self): + with mock_cgroup_v1_and_v2_environment(self.tmp_dir): + self.assertIsInstance(cgroupapi.get_cgroup_api(), SystemdCgroupsApiv1) + + def test_get_cgroup_api_is_none_when_no_controllers_mounted(self): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points", return_value=(None,None)): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points", return_value=(None,None)): + self.assertIsNone(cgroupapi.get_cgroup_api()) + def test_cgroups_should_be_supported_only_on_ubuntu16_centos7dot4_redhat7dot4_and_later_versions(self): test_cases = [ (['ubuntu', '16.04', 'xenial'], True), @@ -81,79 +102,144 @@ def test_cgroups_should_be_supported_only_on_ubuntu16_centos7dot4_redhat7dot4_an class SystemdCgroupsApiTestCase(AgentTestCase): def test_get_systemd_version_should_return_a_version_number(self): - with mock_cgroup_environment(self.tmp_dir): - version_info = systemd.get_version() - found = re.search(r"systemd \d+", version_info) is not None - 
self.assertTrue(found, "Could not determine the systemd version: {0}".format(version_info)) - - def test_get_cpu_and_memory_mount_points_should_return_the_cgroup_mount_points(self): - with mock_cgroup_environment(self.tmp_dir): - cpu, memory = SystemdCgroupsApi().get_cgroup_mount_points() - self.assertEqual(cpu, '/sys/fs/cgroup/cpu,cpuacct', "The mount point for the CPU controller is incorrect") - self.assertEqual(memory, '/sys/fs/cgroup/memory', "The mount point for the memory controller is incorrect") - - def test_get_service_cgroup_paths_should_return_the_cgroup_mount_points(self): - with mock_cgroup_environment(self.tmp_dir): - cpu, memory = SystemdCgroupsApi().get_unit_cgroup_paths("extension.service") - self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service', - "The mount point for the CPU controller is incorrect") - self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/extension.service', - "The mount point for the memory controller is incorrect") - - def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_relative_paths(self): - with mock_cgroup_environment(self.tmp_dir): - cpu, memory = SystemdCgroupsApi.get_process_cgroup_relative_paths('self') - self.assertEqual(cpu, "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") - self.assertEqual(memory, "system.slice/walinuxagent.service", "The relative memory for the CPU cgroup is incorrect") - - def test_get_cgroup2_controllers_should_return_the_v2_cgroup_controllers(self): - with mock_cgroup_environment(self.tmp_dir): - mount_point, controllers = SystemdCgroupsApi.get_cgroup2_controllers() - - self.assertEqual(mount_point, "/sys/fs/cgroup/unified", "Invalid mount point for V2 cgroups") - self.assertIn("cpu", controllers, "The CPU controller is not in the list of V2 controllers") - self.assertIn("memory", controllers, "The memory controller is not in the list of V2 controllers") + # We expect same behavior for v1 and 
v2 + mock_envs = [mock_cgroup_v1_environment(self.tmp_dir), mock_cgroup_v2_environment(self.tmp_dir)] + for env in mock_envs: + with env: + version_info = systemd.get_version() + found = re.search(r"systemd \d+", version_info) is not None + self.assertTrue(found, "Could not determine the systemd version: {0}".format(version_info)) + + def test_is_cpu_or_memory_mounted_true_if_only_memory_mounted(self): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=(None, '/sys/fs/cgroup/memory')): + self.assertTrue(SystemdCgroupsApi().is_cpu_or_memory_mounted()) + + def test_is_cpu_or_memory_mounted_true_if_only_cpu_mounted(self): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): + self.assertTrue(SystemdCgroupsApi().is_cpu_or_memory_mounted()) + + def test_is_cpu_or_memory_mounted_true_if_cpu_and_memory_mounted(self): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=('/sys/fs/cgroup/cpu,cpuacct', '/sys/fs/cgroup/memory')): + self.assertTrue(SystemdCgroupsApi().is_cpu_or_memory_mounted()) + + def test_is_cpu_or_memory_mounted_false_if_cpu_and_memory_not_mounted(self): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=(None, None)): + self.assertFalse(SystemdCgroupsApi().is_cpu_or_memory_mounted()) + + def test_get_mounted_controllers_has_cpu_and_memory_controllers(self): + with mock_cgroup_v1_environment(self.tmp_dir): + mounted_controllers = cgroupapi.get_cgroup_api().get_mounted_controllers() + self.assertTrue("cpu" in mounted_controllers) + self.assertTrue("memory" in mounted_controllers) + + with mock_cgroup_v2_environment(self.tmp_dir): + mounted_controllers = cgroupapi.get_cgroup_api().get_mounted_controllers() + self.assertTrue("cpu" in mounted_controllers) + self.assertTrue("memory" in mounted_controllers) + + with 
mock_cgroup_v1_and_v2_environment(self.tmp_dir): + mounted_controllers = cgroupapi.get_cgroup_api().get_mounted_controllers() # API will be v1 since this environment as CPU mounted in v1 + self.assertTrue("cpu" in mounted_controllers) + self.assertFalse("memory" in mounted_controllers) # This environment has memory mounted in v2 def test_get_unit_property_should_return_the_value_of_the_given_property(self): - with mock_cgroup_environment(self.tmp_dir): - cpu_accounting = systemd.get_unit_property("walinuxagent.service", "CPUAccounting") - - self.assertEqual(cpu_accounting, "no", "Property {0} of {1} is incorrect".format("CPUAccounting", "walinuxagent.service")) + # We expect same behavior for v1 and v2 + mock_envs = [mock_cgroup_v1_environment(self.tmp_dir), mock_cgroup_v2_environment(self.tmp_dir)] + for env in mock_envs: + with env: + cpu_accounting = systemd.get_unit_property("walinuxagent.service", "CPUAccounting") - def assert_cgroups_created(self, extension_cgroups): - self.assertEqual(len(extension_cgroups), 2, - 'start_extension_command did not return the expected number of cgroups') + self.assertEqual(cpu_accounting, "no", "Property {0} of {1} is incorrect".format("CPUAccounting", "walinuxagent.service")) - cpu_found = memory_found = False - for cgroup in extension_cgroups: - match = re.match( - r'^/sys/fs/cgroup/(cpu|memory)/system.slice/Microsoft.Compute.TestExtension_1\.2\.3\_([a-f0-9-]+)\.scope$', - cgroup.path) +class SystemdCgroupsApiv1TestCase(AgentTestCase): + def test_get_unit_cgroup_paths_should_return_the_cgroup_v1_mount_points(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service', + "The mount point for the CPU controller is incorrect") + self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/extension.service', + "The mount point for the memory controller is incorrect") - 
self.assertTrue(match is not None, "Unexpected path for cgroup: {0}".format(cgroup.path)) + def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v1_controller_not_mounted(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The mount point for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup/memory')): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIsNone(cpu, "The mount point for the cpu controller is None so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/extension.service', + "The mount point for the memory controller is incorrect") + + def test_get_process_cgroup_paths_should_return_the_cgroup_v1_mount_points(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") - if match.group(1) == 'cpu': - cpu_found = True - if match.group(1) == 'memory': - memory_found = True + def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v1_controller_not_mounted(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with 
patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The mount point for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup/memory')): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, "The mount point for the CPU controller is None so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") + + def test_get_process_cgroup_v1_path_should_return_None_if_either_relative_path_is_None(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_relative_paths', return_value=('system.slice/walinuxagent.service', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The relative cgroup path for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_relative_paths', return_value=(None, 'system.slice/walinuxagent.service')): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, "The relative cgroup path for the cpu controller is None so unit cgroup should be None") + self.assertIn(memory, 
'/sys/fs/cgroup/memory/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") + + def test_get_cpu_and_memory_mount_points_should_return_the_cgroup_v1_mount_points(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_cgroup_mount_points() + self.assertEqual(cpu, '/sys/fs/cgroup/cpu,cpuacct', "The mount point for the CPU controller is incorrect") + self.assertEqual(memory, '/sys/fs/cgroup/memory', "The mount point for the memory controller is incorrect") - self.assertTrue(cpu_found, 'start_extension_command did not return a cpu cgroup') - self.assertTrue(memory_found, 'start_extension_command did not return a memory cgroup') + def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v1_relative_paths(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths('self') + self.assertEqual(cpu, "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") + self.assertEqual(memory, "system.slice/walinuxagent.service", "The relative memory for the CPU cgroup is incorrect") @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_start_extension_command_should_return_the_command_output(self, _): - original_popen = subprocess.Popen + def test_start_extension_cgroups_v1_command_should_return_the_command_output(self, _): + with mock_cgroup_v1_environment(self.tmp_dir): + original_popen = subprocess.Popen - def mock_popen(command, *args, **kwargs): - if command.startswith('systemd-run --property'): - command = "echo TEST_OUTPUT" - return original_popen(command, *args, **kwargs) + def mock_popen(command, *args, **kwargs): + if isinstance(command, str) and command.startswith('systemd-run --property'): + command = "echo TEST_OUTPUT" + return original_popen(command, *args, **kwargs) - with mock_cgroup_environment(self.tmp_dir): with 
tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file: - with patch("subprocess.Popen", side_effect=mock_popen) as popen_patch: # pylint: disable=unused-variable - command_output = SystemdCgroupsApi().start_extension_command( + with patch("subprocess.Popen", + side_effect=mock_popen) as popen_patch: # pylint: disable=unused-variable + command_output = cgroupapi.get_cgroup_api().start_extension_command( extension_name="Microsoft.Compute.TestExtension-1.2.3", command="A_TEST_COMMAND", cmd_name="test", @@ -167,9 +253,9 @@ def mock_popen(command, *args, **kwargs): self.assertIn("[stdout]\nTEST_OUTPUT\n", command_output, "The test output was not captured") @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_start_extension_command_should_execute_the_command_in_a_cgroup(self, _): - with mock_cgroup_environment(self.tmp_dir): - SystemdCgroupsApi().start_extension_command( + def test_start_extension_cgroups_v1_command_should_execute_the_command_in_a_cgroup(self, _): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroupapi.get_cgroup_api().start_extension_command( extension_name="Microsoft.Compute.TestExtension-1.2.3", command="test command", cmd_name="test", @@ -183,18 +269,20 @@ def test_start_extension_command_should_execute_the_command_in_a_cgroup(self, _) tracked = CGroupsTelemetry._tracked self.assertTrue( - any(cg for cg in tracked.values() if cg.name == 'Microsoft.Compute.TestExtension-1.2.3' and 'cpu' in cg.path), + any(cg for cg in tracked.values() if + cg.name == 'Microsoft.Compute.TestExtension-1.2.3' and 'cpu' in cg.path), "The extension's CPU is not being tracked") self.assertTrue( - any(cg for cg in tracked.values() if cg.name == 'Microsoft.Compute.TestExtension-1.2.3' and 'memory' in cg.path), + any(cg for cg in tracked.values() if + cg.name == 'Microsoft.Compute.TestExtension-1.2.3' and 'memory' in cg.path), "The extension's Memory is not being tracked") @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def 
test_start_extension_command_should_use_systemd_to_execute_the_command(self, _): - with mock_cgroup_environment(self.tmp_dir): + def test_start_extension_cgroups_v1_command_should_use_systemd_to_execute_the_command(self, _): + with mock_cgroup_v1_environment(self.tmp_dir): with patch("subprocess.Popen", wraps=subprocess.Popen) as popen_patch: - SystemdCgroupsApi().start_extension_command( + cgroupapi.get_cgroup_api().start_extension_command( extension_name="Microsoft.Compute.TestExtension-1.2.3", command="the-test-extension-command", cmd_name="test", @@ -205,12 +293,242 @@ def test_start_extension_command_should_use_systemd_to_execute_the_command(self, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if "the-test-extension-command" in args[0]] + extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if + "the-test-extension-command" in args[0]] self.assertEqual(1, len(extension_calls), "The extension should have been invoked exactly once") self.assertIn("systemd-run", extension_calls[0], "The extension should have been invoked using systemd") +class SystemdCgroupsApiv2TestCase(AgentTestCase): + def test_is_controller_enabled_should_return_False_if_cgroup_is_None(self): + with mock_cgroup_v2_environment(self.tmp_dir): + self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled('cpu', None)) + + def test_is_controller_enabled_should_return_False_if_controller_is_None(self): + with mock_cgroup_v2_environment(self.tmp_dir): + self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled(None, '/sys/fs/cgroup')) + + def test_is_controller_enabled_should_return_False_if_cgroup_path_does_not_exist(self): + with mock_cgroup_v2_environment(self.tmp_dir): + self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled('cpu', '/path/that/does/not/exist')) + + def 
test_is_controller_enabled_should_return_False_if_controller_is_not_in_subtree_control_file_and_controller_interface_files_do_not_exist(self): + with mock_cgroup_v2_environment(self.tmp_dir): + self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled('cpu', '/sys/fs/cgroup/azure.slice/walinuxagent.service')) + + def test_is_controller_enabled_should_return_True_if_controller_is_in_subtree_control_file(self): + with mock_cgroup_v2_environment(self.tmp_dir): + # Mock the cgroup.subtree_control to include memory controller + def mock_read_file(path): + if "/sys/fs/cgroup/azure.slice/walinuxagent.service/cgroup.subtree_control" in path: + return 'io memory pids\n' + return read_file(path) + + with patch('azurelinuxagent.common.utils.fileutil.read_file', side_effect=mock_read_file): + self.assertTrue(cgroupapi.get_cgroup_api().is_controller_enabled('memory', '/sys/fs/cgroup/azure.slice/walinuxagent.service')) + + def test_is_controller_enabled_should_return_True_if_controller_interface_file_exists(self): + original_list_dir = os.listdir + + # Mock the walinuxagent.service directory to include memory controller interface files + def mock_os_list_dir(path): + if "/sys/fs/cgroup/azure.slice/walinuxagent.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat'] + return original_list_dir(path) + + with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: + # Mock service directory + mock_env._mock_mkdir('/sys/fs/cgroup/azure.slice/walinuxagent.service') + + with patch('os.listdir', side_effect=mock_os_list_dir): + self.assertTrue(cgroupapi.get_cgroup_api().is_controller_enabled('memory', '/sys/fs/cgroup/azure.slice/walinuxagent.service')) + + def test_get_unit_cgroup_paths_should_return_the_cgroup_v2_mount_points(self): + original_list_dir = os.listdir + + # Mock the extension.service directory to include controller interface files + def mock_os_list_dir(path): + if "/sys/fs/cgroup/system.slice/extension.service" in path: + return 
['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat', 'cpu.stat'] + return original_list_dir(path) + + with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: + # Mock service directory + mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/extension.service') + + with patch('os.listdir', side_effect=mock_os_list_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertEqual(cpu, '/sys/fs/cgroup/system.slice/extension.service', + "The mount point for the CPU controller is incorrect") + self.assertEqual(memory, '/sys/fs/cgroup/system.slice/extension.service', + "The mount point for the memory controller is incorrect") + + def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_mounted(self): + with mock_cgroup_v2_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.is_controller_enabled', return_value=True): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=('/sys/fs/cgroup', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIn(cpu, '/sys/fs/cgroup/system.slice/extension.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The mount point for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup')): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIsNone(cpu, "The mount point for the cpu controller is None so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/system.slice/extension.service', + "The mount point for the memory controller is incorrect") + + def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_enabled(self): + original_list_dir = os.listdir + + # Mock 
the extension.service directory to include only cpu controller interface files + def mock_os_list_dir_cpu(path): + if "/sys/fs/cgroup/system.slice/extension.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'cpu.stat'] + return original_list_dir(path) + + # Mock the extension.service directory to include only cpu controller interface files + def mock_os_list_dir_memory(path): + if "/sys/fs/cgroup/system.slice/extension.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat'] + return original_list_dir(path) + + with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: + # Mock service directory + mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/extension.service') + + with patch('os.listdir', side_effect=mock_os_list_dir_cpu): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIn(cpu, '/sys/fs/cgroup/system.slice/extension.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The memory controller is not enabled so unit cgroup should be None") + + with patch('os.listdir', side_effect=mock_os_list_dir_memory): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIsNone(cpu, "The cpu controller is not enabled so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/system.slice/extension.service', + "The mount point for the memory controller is incorrect") + + def test_get_process_cgroup_paths_should_return_the_cgroup_v2_mount_points(self): + original_list_dir = os.listdir + + # Mock the extension.service directory to include controller interface files + def mock_os_list_dir(path): + if "/sys/fs/cgroup/system.slice/walinuxagent.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat', 'cpu.stat'] + return original_list_dir(path) + + with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: + # Mock service directory + 
mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/walinuxagent.service') + + with patch('os.listdir', side_effect=mock_os_list_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") + + def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_mounted(self): + with mock_cgroup_v2_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.is_controller_enabled', return_value=True): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=('/sys/fs/cgroup', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The mount point for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup')): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, "The mount point for the CPU controller is None so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") + + def test_get_process_cgroup_v2_path_should_return_None_if_either_relative_path_is_None(self): + with mock_cgroup_v2_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.is_controller_enabled', return_value=True): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_process_cgroup_relative_paths', 
return_value=('system.slice/walinuxagent.service', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The relative cgroup path for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_process_cgroup_relative_paths', return_value=(None, 'system.slice/walinuxagent.service')): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, "The relative cgroup path for the cpu controller is None so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") + + def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_enabled(self): + original_list_dir = os.listdir + + # Mock the walinuxagent.service directory to include memory controller interface files + def mock_os_list_dir_memory(path): + if "/sys/fs/cgroup/system.slice/walinuxagent.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat'] + return original_list_dir(path) + + # Mock the walinuxagent.service directory to include cpu controller interface files + def mock_os_list_dir_cpu(path): + if "/sys/fs/cgroup/system.slice/walinuxagent.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'cpu.stat'] + return original_list_dir(path) + + with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: + # Mock service directory + mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/walinuxagent.service') + + with patch('os.listdir', side_effect=mock_os_list_dir_cpu): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point 
for the CPU controller is incorrect") + self.assertIsNone(memory, + "The memory controller is not enabled so unit cgroup should be None") + + with patch('os.listdir', side_effect=mock_os_list_dir_memory): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, "The cpu controller is not enabled so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") + + def test_get_cpu_and_memory_mount_points_should_return_the_cgroup_v2_mount_points(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_cgroup_mount_points() + self.assertEqual(cpu, '/sys/fs/cgroup', "The mount point for the CPU controller is incorrect") + self.assertEqual(memory, '/sys/fs/cgroup', "The mount point for the memory controller is incorrect") + + def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v2_relative_paths(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths('self') + self.assertEqual(cpu, "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") + self.assertEqual(memory, "system.slice/walinuxagent.service", "The relative memory for the CPU cgroup is incorrect") + + @patch('time.sleep', side_effect=lambda _: mock_sleep()) + def test_start_extension_cgroups_v2_command_should_raise_exception(self, _): + with mock_cgroup_v2_environment(self.tmp_dir): + with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file: + cgroups_exception_raised = False + try: + cgroupapi.get_cgroup_api().start_extension_command( + extension_name="Microsoft.Compute.TestExtension-1.2.3", + command="A_TEST_COMMAND", + cmd_name="test", + shell=True, + timeout=300, + cwd=self.tmp_dir, + env={}, + stdout=output_file, + stderr=output_file) + except CGroupsException: 
+ cgroups_exception_raised = True + self.assertTrue(cgroups_exception_raised) + + class SystemdCgroupsApiMockedFileSystemTestCase(_MockedFileSystemTestCase): def test_cleanup_legacy_cgroups_should_remove_legacy_cgroups(self): # Set up a mock /var/run/waagent.pid file diff --git a/tests/ga/test_cgroupconfigurator.py b/tests/ga/test_cgroupconfigurator.py index 82c86c956..b097a2602 100644 --- a/tests/ga/test_cgroupconfigurator.py +++ b/tests/ga/test_cgroupconfigurator.py @@ -35,7 +35,8 @@ from azurelinuxagent.common.future import ustr from azurelinuxagent.common.utils import shellutil, fileutil from tests.lib.mock_environment import MockCommand -from tests.lib.mock_cgroup_environment import mock_cgroup_environment, UnitFilePaths +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, UnitFilePaths, mock_cgroup_v2_environment, \ + mock_cgroup_v1_and_v2_environment from tests.lib.tools import AgentTestCase, patch, mock_sleep, data_dir, is_python_version_26_or_34, skip_if_predicate_true from tests.lib.miscellaneous_tools import format_processes, wait_for @@ -51,7 +52,7 @@ def _get_cgroup_configurator(self, initialize=True, enable=True, mock_commands=N CGroupConfigurator._instance = None configurator = CGroupConfigurator.get_instance() CGroupsTelemetry.reset() - with mock_cgroup_environment(self.tmp_dir) as mock_environment: + with mock_cgroup_v1_environment(self.tmp_dir) as mock_environment: if mock_commands is not None: for command in mock_commands: mock_environment.add_command(command) @@ -64,10 +65,55 @@ def _get_cgroup_configurator(self, initialize=True, enable=True, mock_commands=N configurator.initialize() yield configurator - def test_initialize_should_enable_cgroups(self): + @contextlib.contextmanager + def _get_cgroup_configurator_v2(self, initialize=True, enable=True, mock_commands=None): + CGroupConfigurator._instance = None + configurator = CGroupConfigurator.get_instance() + CGroupsTelemetry.reset() + with 
mock_cgroup_v2_environment(self.tmp_dir) as mock_environment: + if mock_commands is not None: + for command in mock_commands: + mock_environment.add_command(command) + configurator.mocks = mock_environment + if initialize: + if not enable: + with patch.object(configurator, "enable"): + configurator.initialize() + else: + configurator.initialize() + yield configurator + + @contextlib.contextmanager + def _get_cgroup_configurator_v1_and_v2(self, initialize=True, enable=True, mock_commands=None): + CGroupConfigurator._instance = None + configurator = CGroupConfigurator.get_instance() + CGroupsTelemetry.reset() + with mock_cgroup_v1_and_v2_environment(self.tmp_dir) as mock_environment: + if mock_commands is not None: + for command in mock_commands: + mock_environment.add_command(command) + configurator.mocks = mock_environment + if initialize: + if not enable: + with patch.object(configurator, "enable"): + configurator.initialize() + else: + configurator.initialize() + yield configurator + + def test_initialize_should_enable_cgroups_v1(self): with self._get_cgroup_configurator() as configurator: self.assertTrue(configurator.enabled(), "cgroups were not enabled") + def test_initialize_should_not_enable_cgroups_v2(self): + with self._get_cgroup_configurator_v2() as configurator: + self.assertFalse(configurator.enabled(), "cgroups were enabled") + + def test_initialize_should_not_enable_when_cgroup_api_is_none(self): + with patch('azurelinuxagent.ga.cgroupconfigurator.get_cgroup_api', return_value=None): + with self._get_cgroup_configurator() as configurator: + self.assertFalse(configurator.enabled(), "cgroups were enabled") + def test_initialize_should_start_tracking_the_agent_cgroups(self): with self._get_cgroup_configurator() as configurator: tracked = CGroupsTelemetry._tracked @@ -79,18 +125,18 @@ def test_initialize_should_start_tracking_the_agent_cgroups(self): "The Agent's Memory is not being tracked. 
Tracked: {0}".format(tracked)) def test_initialize_should_start_tracking_other_controllers_when_one_is_not_present(self): - command_mocks = [MockCommand(r"^mount -t cgroup$", -'''cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) -cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma) -cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset) -cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio) -cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event) -cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb) -cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer) -cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids) -cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices) -cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct) -cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio) + command_mocks = [MockCommand(r"^findmnt -t cgroup --noheadings$", +'''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd +/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +/sys/fs/cgroup/rdma cgroup cgroup rw,nosuid,nodev,noexec,relatime,rdma +/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_cls,net_prio +/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset +/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct +/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer +/sys/fs/cgroup/hugetlb 
cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids ''')] with self._get_cgroup_configurator(mock_commands=command_mocks) as configurator: tracked = CGroupsTelemetry._tracked @@ -99,18 +145,28 @@ def test_initialize_should_start_tracking_other_controllers_when_one_is_not_pres self.assertFalse(any(cg for cg in tracked.values() if cg.name == 'walinuxagent.service' and 'memory' in cg.path), "The Agent's memory should not be tracked. Tracked: {0}".format(tracked)) + def test_initialize_should_start_tracking_any_controllers_in_v1_if_others_in_v2(self): + # This mock environment has cpu controller in v1 and memory controller in v2 + with self._get_cgroup_configurator_v1_and_v2() as configurator: + tracked = CGroupsTelemetry._tracked + + self.assertTrue(configurator.enabled(), "Cgroups should be enabled") + self.assertFalse( + any(cg for cg in tracked.values() if cg.name == 'walinuxagent.service' and 'memory' in cg.path), + "The Agent's memory should not be tracked. 
Tracked: {0}".format(tracked)) + def test_initialize_should_not_enable_cgroups_when_the_cpu_and_memory_controllers_are_not_present(self): - command_mocks = [MockCommand(r"^mount -t cgroup$", -'''cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) -cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma) -cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset) -cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio) -cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event) -cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb) -cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer) -cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids) -cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices) -cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio) + command_mocks = [MockCommand(r"^findmnt -t cgroup --noheadings$", +'''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd +/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +/sys/fs/cgroup/rdma cgroup cgroup rw,nosuid,nodev,noexec,relatime,rdma +/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_cls,net_prio +/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset +/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer +/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids ''')] with 
self._get_cgroup_configurator(mock_commands=command_mocks) as configurator: tracked = CGroupsTelemetry._tracked @@ -119,17 +175,17 @@ def test_initialize_should_not_enable_cgroups_when_the_cpu_and_memory_controller self.assertEqual(len(tracked), 0, "No cgroups should be tracked. Tracked: {0}".format(tracked)) def test_initialize_should_not_enable_cgroups_when_the_agent_is_not_in_the_system_slice(self): - command_mocks = [MockCommand(r"^mount -t cgroup$", -'''cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) -cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma) -cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset) -cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio) -cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event) -cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb) -cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer) -cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids) -cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices) -cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio) + command_mocks = [MockCommand(r"^findmnt -t cgroup --noheadings$", +'''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd* +/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +/sys/fs/cgroup/rdma cgroup cgroup rw,nosuid,nodev,noexec,relatime,rdma +/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_cls,net_prio +/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset 
+/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer +/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids ''')] with self._get_cgroup_configurator(mock_commands=command_mocks) as configurator: @@ -286,6 +342,17 @@ def test_enable_should_not_track_throttled_time_when_setting_the_cpu_quota_fails self.assertFalse(CGroupsTelemetry.get_track_throttled_time(), "Throttle time should not be tracked") + def test_enable_should_not_track_throttled_time_when_cgroups_v2_enabled(self): + with self._get_cgroup_configurator_v2(initialize=False) as configurator: + if CGroupsTelemetry.get_track_throttled_time(): + raise Exception("Test setup should not start tracking Throttle Time") + + configurator.mocks.add_file(UnitFilePaths.cpu_quota, Exception("A TEST EXCEPTION")) + + configurator.initialize() + + self.assertFalse(CGroupsTelemetry.get_track_throttled_time(), "Throttle time should not be tracked when using cgroups v2") + def test_disable_should_reset_cpu_quota(self): with self._get_cgroup_configurator() as configurator: if len(CGroupsTelemetry._tracked) == 0: @@ -376,7 +443,7 @@ def test_start_extension_command_should_not_use_systemd_when_cgroups_are_not_ena self.assertEqual(command_calls[0], "date", "The command line should not have been modified") @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_start_extension_command_should_use_systemd_run_when_cgroups_are_enabled(self, _): + def test_start_extension_command_should_use_systemd_run_when_cgroups_v1_are_enabled(self, _): with self._get_cgroup_configurator() as configurator: with patch("azurelinuxagent.ga.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as popen_patch: configurator.start_extension_command( @@ -444,6 +511,54 @@ def mock_popen(command_arg, *args, **kwargs): self.assertIn("A TEST EXCEPTION", str(context_manager.exception)) + @patch('time.sleep', side_effect=lambda _: 
mock_sleep()) + def test_start_extension_command_should_disable_cgroups_and_invoke_the_command_directly_if_v2_is_used(self, _): + with self._get_cgroup_configurator_v2() as configurator: + configurator.enable() # NOTE: Cgroups should not currently be enabled if v2 is detected. Adding this test to guarantee extensions are run correctly if cgroups v2 api is incorrectly called. + + with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file: + with patch("azurelinuxagent.ga.cgroupstelemetry.add_event") as mock_add_event: + with patch("subprocess.Popen", wraps=subprocess.Popen) as popen_patch: + CGroupsTelemetry.reset() + + command = "echo TEST_OUTPUT" + + command_output = configurator.start_extension_command( + extension_name="Microsoft.Compute.TestExtension-1.2.3", + command=command, + cmd_name="test", + timeout=300, + shell=True, + cwd=self.tmp_dir, + env={}, + stdout=output_file, + stderr=output_file) + + self.assertFalse(configurator.enabled(), "Cgroups should have been disabled") + + disabled_events = [kwargs for _, kwargs in mock_add_event.call_args_list if + kwargs['op'] == WALAEventOperation.CGroupsDisabled] + + self.assertTrue(len(disabled_events) == 1, + "Exactly one CGroupsDisabled telemetry event should have been issued. 
Found: {0}".format( + disabled_events)) + self.assertIn("Failed to start Microsoft.Compute.TestExtension-1.2.3 using cgroups", + disabled_events[0]['message'], + "The cgroups failure was not included in the telemetry message") + self.assertEqual(False, disabled_events[0]['is_success'], + "The telemetry event should indicate a failure") + + extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if command in args[0]] + + self.assertEqual(1, len(extension_calls), + "The extension should have been invoked exactly twice") + self.assertEqual(command, extension_calls[0], + "The second call to the extension should not have used systemd") + + self.assertEqual(len(CGroupsTelemetry._tracked), 0, "No cgroups should have been created") + + self.assertIn("TEST_OUTPUT\n", command_output, "The test output was not captured") + @patch('time.sleep', side_effect=lambda _: mock_sleep()) def test_start_extension_command_should_disable_cgroups_and_invoke_the_command_directly_if_systemd_fails(self, _): with self._get_cgroup_configurator() as configurator: @@ -451,7 +566,7 @@ def test_start_extension_command_should_disable_cgroups_and_invoke_the_command_d configurator.mocks.add_command(MockCommand("systemd-run", return_value=1, stdout='', stderr='Failed to start transient scope unit: syntax error')) with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file: - with patch("azurelinuxagent.ga.cgroupconfigurator.add_event") as mock_add_event: + with patch("azurelinuxagent.ga.cgroupstelemetry.add_event") as mock_add_event: with patch("subprocess.Popen", wraps=subprocess.Popen) as popen_patch: CGroupsTelemetry.reset() @@ -881,7 +996,7 @@ def test_check_cgroups_should_disable_cgroups_when_a_check_fails(self): patchers.append(p) p.start() - with patch("azurelinuxagent.ga.cgroupconfigurator.add_event") as add_event: + with patch("azurelinuxagent.ga.cgroupstelemetry.add_event") as add_event: configurator.enable() tracked_metrics = [ diff --git 
a/tests/ga/test_cgroupconfigurator_sudo.py b/tests/ga/test_cgroupconfigurator_sudo.py index 30db19408..6ff314496 100644 --- a/tests/ga/test_cgroupconfigurator_sudo.py +++ b/tests/ga/test_cgroupconfigurator_sudo.py @@ -25,7 +25,7 @@ from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.exception import ExtensionError, ExtensionErrorCodes from azurelinuxagent.common.future import ustr -from tests.lib.mock_cgroup_environment import mock_cgroup_environment +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment from tests.lib.tools import AgentTestCase, patch, mock_sleep, i_am_root, is_python_version_26_or_34, skip_if_predicate_true @@ -40,7 +40,7 @@ def _get_cgroup_configurator(self, initialize=True, enable=True, mock_commands=N CGroupConfigurator._instance = None configurator = CGroupConfigurator.get_instance() CGroupsTelemetry.reset() - with mock_cgroup_environment(self.tmp_dir) as mock_environment: + with mock_cgroup_v1_environment(self.tmp_dir) as mock_environment: if mock_commands is not None: for command in mock_commands: mock_environment.add_command(command) diff --git a/tests/lib/cgroups_tools.py b/tests/lib/cgroups_tools.py index 45b817447..cb29ee9bf 100644 --- a/tests/lib/cgroups_tools.py +++ b/tests/lib/cgroups_tools.py @@ -33,17 +33,3 @@ def create_legacy_agent_cgroup(cgroups_file_system_root, controller, daemon_pid) fileutil.append_file(os.path.join(legacy_cgroup, "cgroup.procs"), daemon_pid + "\n") return legacy_cgroup - @staticmethod - def create_agent_cgroup(cgroups_file_system_root, controller, extension_handler_pid): - """ - Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; - starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. - - This method creates a mock cgroup using the newer path and adds the given PID to it. 
- """ - new_cgroup = os.path.join(cgroups_file_system_root, controller, "walinuxagent.service") - if not os.path.exists(new_cgroup): - os.makedirs(new_cgroup) - fileutil.append_file(os.path.join(new_cgroup, "cgroup.procs"), extension_handler_pid + "\n") - return new_cgroup - diff --git a/tests/lib/mock_cgroup_environment.py b/tests/lib/mock_cgroup_environment.py index 3b51dce8f..4b3e1534e 100644 --- a/tests/lib/mock_cgroup_environment.py +++ b/tests/lib/mock_cgroup_environment.py @@ -20,29 +20,11 @@ from tests.lib.tools import patch, data_dir from tests.lib.mock_environment import MockEnvironment, MockCommand -_MOCKED_COMMANDS = [ +# Mocked commands which are common between v1 and v2 +_MOCKED_COMMANDS_COMMON = [ MockCommand(r"^systemctl --version$", '''systemd 237 +PAM +AUDIT +SELINUX +IMA +APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP +GCRYPT +GNUTLS +ACL +XZ +LZ4 +SECCOMP +BLKID +ELFUTILS +KMOD -IDN2 +IDN -PCRE2 default-hierarchy=hybrid -'''), - - MockCommand(r"^mount -t cgroup$", -'''cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) -cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma) -cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset) -cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio) -cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event) -cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb) -cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer) -cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory) -cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids) -cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices) -cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct) 
-cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio) -'''), - - MockCommand(r"^mount -t cgroup2$", -'''cgroup on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime) '''), MockCommand(r"^systemctl show walinuxagent\.service --property Slice", @@ -77,10 +59,80 @@ ] -_MOCKED_FILES = [ - ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'proc_self_cgroup')), - (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'proc_pid_cgroup')), - ("/sys/fs/cgroup/unified/cgroup.controllers", os.path.join(data_dir, 'cgroups', 'sys_fs_cgroup_unified_cgroup.controllers')) +_MOCKED_COMMANDS_V1 = [ + MockCommand(r"^findmnt -t cgroup --noheadings$", +'''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd +/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +/sys/fs/cgroup/rdma cgroup cgroup rw,nosuid,nodev,noexec,relatime,rdma +/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_cls,net_prio +/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset +/sys/fs/cgroup/misc cgroup cgroup rw,nosuid,nodev,noexec,relatime,misc +/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct +/sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory +/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer +/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids +'''), + + MockCommand(r"^findmnt -t cgroup2 --noheadings$", +'''/sys/fs/cgroup/unified cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate +'''), + +] + +_MOCKED_COMMANDS_V2 = [ + MockCommand(r"^findmnt -t cgroup2 --noheadings$", +'''/sys/fs/cgroup cgroup2 
cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot +'''), + + MockCommand(r"^findmnt -t cgroup --noheadings$", ''), + +] + +# Mocked commands when memory controller is in v2, but all other controllers are in v1 +_MOCKED_COMMANDS_V1_AND_V2 = [ + MockCommand(r"^findmnt -t cgroup --noheadings$", +'''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd +/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +/sys/fs/cgroup/rdma cgroup cgroup rw,nosuid,nodev,noexec,relatime,rdma +/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_cls,net_prio +/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset +/sys/fs/cgroup/misc cgroup cgroup rw,nosuid,nodev,noexec,relatime,misc +/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct +/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer +/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids +'''), + + MockCommand(r"^findmnt -t cgroup2 --noheadings$", +'''/sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot +'''), + +] + +_MOCKED_FILES_V1 = [ + ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_self_cgroup')), + (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_pid_cgroup')), + ("/sys/fs/cgroup/unified/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v1', 'sys_fs_cgroup_cgroup.subtree_control')) +] + +_MOCKED_FILES_V2 = [ + ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v2', 'proc_self_cgroup')), + (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v2', 'proc_pid_cgroup')), + 
("/sys/fs/cgroup/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control')), + ("/sys/fs/cgroup/azure.slice/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control')), + ("/sys/fs/cgroup/azure.slice/walinuxagent.service/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control_empty')) +] + +# Mocked files when memory controller is in v2, but all other controllers are in v1 +_MOCKED_FILES_V1_AND_V2 = [ + ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v1_and_v2', 'proc_self_cgroup')), + (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v1_and_v2', 'proc_pid_cgroup')), + ("/sys/fs/cgroup/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v1_and_v2', 'sys_fs_cgroup_cgroup.subtree_control')) ] _MOCKED_PATHS = [ @@ -88,6 +140,12 @@ r"^(/etc/systemd/system)" ] +_MOCKED_PATHS_V2 = [ + r"^(/sys/fs/cgroup/azure.slice/walinuxagent.service)", + r"^(/sys/fs/cgroup/system.slice/walinuxagent.service)", + r"^(/sys/fs/cgroup/system.slice/extension.service)" +] + class UnitFilePaths: walinuxagent = "/lib/systemd/system/walinuxagent.service" @@ -106,11 +164,48 @@ class UnitFilePaths: @contextlib.contextmanager -def mock_cgroup_environment(tmp_dir): +def mock_cgroup_v1_environment(tmp_dir): + """ + Creates a mock environment for cgroups v1 hierarchy used by the tests related to cgroups (currently it only + provides support for systemd platforms). + The command output used in __MOCKED_COMMANDS comes from an Ubuntu 20 system. 
+ """ + data_files = [ + (os.path.join(data_dir, 'init', 'walinuxagent.service'), UnitFilePaths.walinuxagent), + (os.path.join(data_dir, 'init', 'azure.slice'), UnitFilePaths.azure), + (os.path.join(data_dir, 'init', 'azure-vmextensions.slice'), UnitFilePaths.vmextensions) + ] + + with patch('azurelinuxagent.ga.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): + with patch('azurelinuxagent.common.osutil.systemd.is_systemd', return_value=True): + with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V1, paths=_MOCKED_PATHS, files=_MOCKED_FILES_V1, data_files=data_files) as mock: + yield mock + +@contextlib.contextmanager +def mock_cgroup_v2_environment(tmp_dir): + """ + Creates a mock environment for cgroups v2 hierarchy used by the tests related to cgroups (currently it only + provides support for systemd platforms). + The command output used in __MOCKED_COMMANDS comes from an Ubuntu 22 system. + """ + data_files = [ + (os.path.join(data_dir, 'init', 'walinuxagent.service'), UnitFilePaths.walinuxagent), + (os.path.join(data_dir, 'init', 'azure.slice'), UnitFilePaths.azure), + (os.path.join(data_dir, 'init', 'azure-vmextensions.slice'), UnitFilePaths.vmextensions) + ] + + with patch('azurelinuxagent.ga.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): + with patch('azurelinuxagent.common.osutil.systemd.is_systemd', return_value=True): + with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V2, paths=_MOCKED_PATHS + _MOCKED_PATHS_V2, files=_MOCKED_FILES_V2, data_files=data_files) as mock: + yield mock + +@contextlib.contextmanager +def mock_cgroup_v1_and_v2_environment(tmp_dir): + """ + Creates a mock environment for machine which has controllers in cgroups v1 and v2 hierarchies used by the tests + related to cgroups (currently it only provides support for systemd platforms). The agent does not currently support + this scenario. 
""" - Creates a mocks environment used by the tests related to cgroups (currently it only provides support for systemd platforms). - The command output used in __MOCKED_COMMANDS comes from an Ubuntu 18 system. - """ data_files = [ (os.path.join(data_dir, 'init', 'walinuxagent.service'), UnitFilePaths.walinuxagent), (os.path.join(data_dir, 'init', 'azure.slice'), UnitFilePaths.azure), @@ -119,5 +214,5 @@ def mock_cgroup_environment(tmp_dir): with patch('azurelinuxagent.ga.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): with patch('azurelinuxagent.common.osutil.systemd.is_systemd', return_value=True): - with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS, paths=_MOCKED_PATHS, files=_MOCKED_FILES, data_files=data_files) as mock: + with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V1_AND_V2, paths=_MOCKED_PATHS, files=_MOCKED_FILES_V1_AND_V2, data_files=data_files) as mock: yield mock diff --git a/tests/test_agent.py b/tests/test_agent.py index cbf223aa5..906392b61 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -21,10 +21,10 @@ from azurelinuxagent.agent import parse_args, Agent, usage, AgentCommands from azurelinuxagent.common import conf -from azurelinuxagent.ga import logcollector, cgroupconfigurator -from azurelinuxagent.ga.cgroupapi import SystemdCgroupsApi +from azurelinuxagent.ga import logcollector, cgroupconfigurator, cgroupapi from azurelinuxagent.common.utils import fileutil from azurelinuxagent.ga.collect_logs import CollectLogsHandler +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, mock_cgroup_v1_and_v2_environment from tests.lib.tools import AgentTestCase, data_dir, Mock, patch EXPECTED_CONFIGURATION = \ @@ -240,46 +240,105 @@ def test_calls_collect_logs_with_proper_mode(self, mock_log_collector, *args): self.assertFalse(full_mode) @patch("azurelinuxagent.agent.LogCollector") - def test_calls_collect_logs_on_valid_cgroups(self, mock_log_collector): + def 
test_calls_collect_logs_on_valid_cgroups_v1(self, mock_log_collector): try: CollectLogsHandler.enable_monitor_cgroups_check() mock_log_collector.run = Mock() + # Mock cgroup paths so process is in the log collector slice def mock_cgroup_paths(*args, **kwargs): if args and args[0] == "self": relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) - return (cgroupconfigurator.LOGCOLLECTOR_SLICE, relative_path) - return SystemdCgroupsApi.get_process_cgroup_relative_paths(*args, **kwargs) + return (relative_path, relative_path) + return cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + + with mock_cgroup_v1_environment(self.tmp_dir): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_paths", + side_effect=mock_cgroup_paths): + agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + agent.collect_logs(is_full_mode=True) + + mock_log_collector.assert_called_once() - with patch("azurelinuxagent.agent.SystemdCgroupsApi.get_process_cgroup_paths", side_effect=mock_cgroup_paths): - agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) - agent.collect_logs(is_full_mode=True) - - mock_log_collector.assert_called_once() finally: CollectLogsHandler.disable_monitor_cgroups_check() @patch("azurelinuxagent.agent.LogCollector") - def test_doesnt_call_collect_logs_on_invalid_cgroups(self, mock_log_collector): + def test_doesnt_call_collect_logs_when_cgroup_api_cannot_be_determined(self, mock_log_collector): try: CollectLogsHandler.enable_monitor_cgroups_check() mock_log_collector.run = Mock() - def mock_cgroup_paths(*args, **kwargs): - if args and args[0] == "self": - return ("NOT_THE_CORRECT_PATH", "NOT_THE_CORRECT_PATH") - return SystemdCgroupsApi.get_process_cgroup_relative_paths(*args, **kwargs) + def raise_on_sys_exit(*args): + raise RuntimeError(args[0] if args else "Exiting") - with 
patch("azurelinuxagent.agent.SystemdCgroupsApi.get_process_cgroup_paths", side_effect=mock_cgroup_paths): + with patch("azurelinuxagent.agent.get_cgroup_api", return_value=None): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) - exit_error = RuntimeError("Exiting") - with patch("sys.exit", return_value=exit_error) as mock_exit: + with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: try: agent.collect_logs(is_full_mode=True) except RuntimeError as re: + self.assertEqual(logcollector.INVALID_CGROUPS_ERRCODE, re.args[0]) + mock_exit.assert_called_once_with(logcollector.INVALID_CGROUPS_ERRCODE) + finally: + CollectLogsHandler.disable_monitor_cgroups_check() + + @patch("azurelinuxagent.agent.LogCollector") + def test_doesnt_call_collect_logs_on_invalid_cgroups_v1(self, mock_log_collector): + try: + CollectLogsHandler.enable_monitor_cgroups_check() + mock_log_collector.run = Mock() + + # Mock cgroup paths so process is in incorrect slice + def mock_cgroup_paths(*args, **kwargs): + if args and args[0] == "self": + return ("NOT_THE_CORRECT_PATH", "NOT_THE_CORRECT_PATH") + return cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + + def raise_on_sys_exit(*args): + raise RuntimeError(args[0] if args else "Exiting") + + with mock_cgroup_v1_environment(self.tmp_dir): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_paths", side_effect=mock_cgroup_paths): + agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + + with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: + try: + agent.collect_logs(is_full_mode=True) + except RuntimeError as re: + self.assertEqual(logcollector.INVALID_CGROUPS_ERRCODE, re.args[0]) + mock_exit.assert_called_once_with(logcollector.INVALID_CGROUPS_ERRCODE) + finally: + CollectLogsHandler.disable_monitor_cgroups_check() + + @patch("azurelinuxagent.agent.LogCollector") + def 
test_doesnt_call_collect_logs_when_controllers_mounted_in_different_hierarchies(self, mock_log_collector): + try: + CollectLogsHandler.enable_monitor_cgroups_check() + mock_log_collector.run = Mock() + + # Mock cgroup paths so process is in the log collector slice and cpu is not mounted + def mock_cgroup_paths(*args, **kwargs): + if args and args[0] == "self": + relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) + return (None, relative_path) + return cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + + def raise_on_sys_exit(*args): + raise RuntimeError(args[0] if args else "Exiting") + + with mock_cgroup_v1_and_v2_environment(self.tmp_dir): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_paths", + side_effect=mock_cgroup_paths): + agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + + with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: + try: + agent.collect_logs(is_full_mode=True) + except RuntimeError as re: + self.assertEqual(logcollector.INVALID_CGROUPS_ERRCODE, re.args[0]) mock_exit.assert_called_once_with(logcollector.INVALID_CGROUPS_ERRCODE) - self.assertEqual(exit_error, re) finally: CollectLogsHandler.disable_monitor_cgroups_check() diff --git a/tests_e2e/test_suites/cgroups_v2_disabled.yml b/tests_e2e/test_suites/cgroups_v2_disabled.yml new file mode 100644 index 000000000..5a075a2a2 --- /dev/null +++ b/tests_e2e/test_suites/cgroups_v2_disabled.yml @@ -0,0 +1,10 @@ +# +# The test suite verifies that the agent does not enable resource enforcement and monitoring on machines which are +# using cgroups v2. This suite will be removed once cgroups v2 is supported. 
+# +name: "Cgroupsv2Disabled" +tests: + - "cgroups_v2_disabled/cgroups_v2_disabled.py" +images: + - "ubuntu_2204" + - "ubuntu_2404" \ No newline at end of file diff --git a/tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py b/tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py new file mode 100644 index 000000000..9f6e117e6 --- /dev/null +++ b/tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import time + +from assertpy import fail + +from tests_e2e.tests.lib.agent_test import AgentVmTest +from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.shell import CommandError +from tests_e2e.tests.lib.ssh_client import SshClient + + +class Cgroupsv2Disabled(AgentVmTest): + """ + The test verifies that the agent does not enable resource enforcement and monitoring on machines which are using + cgroups v2. It also checks that the agent correctly determined the controller mount points. This test will be + removed once cgroups v2 is supported. 
+ """ + + def __init__(self, context: AgentVmTestContext): + super().__init__(context) + self._ssh_client: SshClient = self._context.create_ssh_client() + + def check_agent_log_contains(self, data, assertion): + try: + self._ssh_client.run_command("grep \"{0}\" /var/log/waagent.log".format(data)) + except CommandError: + fail("{0}".format(assertion)) + + def run(self): + # Cgroup configurator is initialized when agent is started, and before the goal state processing period is + # logged. Wait until the agent logs the goal state period before checking for cgroup initialization logs. + log.info("Wait for cgroup configurator to be initialized...") + for _ in range(15): + try: + self._ssh_client.run_command("grep 'Goal State Period:' /var/log/waagent.log") + break + except CommandError: + log.info("The Agent has not initialized cgroups yet, will check again after a short delay") + time.sleep(60) + else: + raise Exception("Timeout while waiting for the Agent to initialize cgroups") + + # Verify that the agent chose v2 for resource enforcement and monitoring + log.info("") + log.info("Checking that the agent chose cgroups v2 api for resource enforcement and monitoring...") + self.check_agent_log_contains('Using cgroups v2 for resource enforcement and monitoring', 'The agent should choose v2 for api resource enforcement and monitoring') + + # Verify that the agent determined the correct mount point for each controller + log.info("") + log.info("Checking that the agent determined the correct mount point for each controller...") + self.check_agent_log_contains('The CPU cgroup controller is mounted at /sys/fs/cgroup', + 'The agent should identify the cpu controller to be mounted at /sys/fs/cgroup') + self.check_agent_log_contains('The memory cgroup controller is mounted at /sys/fs/cgroup', + 'The agent should identify the memory controller to be mounted at /sys/fs/cgroup') + + # Verify that the agent does not support cgroups v2 + log.info("") + log.info("Checking that the 
agent does not use cgroups v2 for resource enforcement and monitoring...") + self.check_agent_log_contains('Agent and extensions resource monitoring is not currently supported on cgroups v2', + 'The agent should not attempt to use cgroups v2 for resource enforcement and monitoring') + self.check_agent_log_contains('Agent cgroups enabled: False', + 'The agent should not enable cgroups when system is using v2') + + +if __name__ == "__main__": + Cgroupsv2Disabled.run_from_command_line() From bd9caff61ad99492cac4eb7c7f8e2c797a5e4708 Mon Sep 17 00:00:00 2001 From: Maddie Ford Date: Mon, 18 Mar 2024 12:50:21 -0700 Subject: [PATCH 02/12] Fix log warnings --- azurelinuxagent/ga/cgroupconfigurator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index 7b415d99f..34229c533 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -519,13 +519,13 @@ def __try_set_cpu_quota(quota): # pylint: disable=unused-private-member return True # no need to update the file; return here to avoid doing a daemon-reload CGroupConfigurator._Impl.__create_unit_file(drop_in_file, contents) except Exception as exception: - log_cgroup_warning('Failed to set CPUQuota: {0}', ustr(exception)) + log_cgroup_warning('Failed to set CPUQuota: {0}'.format(ustr(exception))) return False try: log_cgroup_info("Executing systemctl daemon-reload...", send_event=False) shellutil.run_command(["systemctl", "daemon-reload"]) except Exception as exception: - log_cgroup_warning("daemon-reload failed (set quota): {0}", ustr(exception)) + log_cgroup_warning("daemon-reload failed (set quota): {0}".format(ustr(exception))) return False return True @@ -863,8 +863,8 @@ def setup_extension_slice(self, extension_name, cpu_quota): cpu_quota=cpu_quota) CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents) except Exception as exception: - 
log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}", extension_name, - ustr(exception)) + log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}".format(extension_name, + ustr(exception))) CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path) def remove_extension_slice(self, extension_name): @@ -939,7 +939,7 @@ def __reset_extension_services_cpu_quota(self, services_list): files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents)) self.__create_all_files(files_to_create) except Exception as exception: - log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}', service_name, ustr(exception)) + log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}'.format(service_name, ustr(exception))) def remove_extension_services_drop_in_files(self, services_list): """ From 689aacc5c5c87c71d7bfbd1f4de3825501470fa8 Mon Sep 17 00:00:00 2001 From: Maddie Ford Date: Mon, 18 Mar 2024 12:58:28 -0700 Subject: [PATCH 03/12] Add cgroups v2 disabled scenario to daily runbook --- tests_e2e/orchestrator/runbook.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index 722ceba61..97d44a73d 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -53,6 +53,7 @@ variable: - no_outbound_connections - publish_hostname - recover_network_interface + - cgroups_v2_disabled # # Parameters used to create test VMs From 0802d25318eff288d83fda8ab6bb72cc4c676c2e Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Tue, 26 Mar 2024 10:22:21 -0700 Subject: [PATCH 04/12] Address PR comments (#7) * get_cgroup_api should raise exception when controllers not mounted * Combine cgroups_supported() check * Combine SystemdCgroupsApi and CGroupApi classes * fix pylint and tests with sudo * Rename SystemdCgroupsApi to SystemdCgroupApi * Cgroup should be singular when referring to the APi * 
Unimpleneted methods should raise NotImplementederror * Check for cpu,cpuacct * v2 start extension command should not be implemented * log_cgorup_info and log_cgroup_warning should be in cgroupapi * Systemd check should come before api * Explicitly check for empty dict * Only check if controllers are enabled at root for v2 * Remove unnecessary mocked paths in mock cgroup env * V2 does not have concept of mounting controllers * Fix super call for python 2 * get_cgroup_api should be function * Move logging functions up * Use stat -f to get cgroup mode * Mock hybrid path * Fix unit tests: * Debug tests * Debug tests * Debug unit tests * Fix unit tests * Fix pylint * Fix e2e test for v2 * Fix e2e test * Fix e2e test * Fix e2e test * Combine common implementations --- azurelinuxagent/agent.py | 15 +- azurelinuxagent/ga/cgroupapi.py | 402 ++++++++---------- azurelinuxagent/ga/cgroupconfigurator.py | 82 ++-- azurelinuxagent/ga/cgroupstelemetry.py | 13 - .../hybrid/sys_fs_cgroup_cgroup.controllers | 0 tests/data/cgroups/v1_and_v2/proc_pid_cgroup | 12 - tests/data/cgroups/v1_and_v2/proc_self_cgroup | 12 - .../sys_fs_cgroup_cgroup.subtree_control | 1 - tests/ga/test_cgroupapi.py | 384 +++++------------ tests/ga/test_cgroupconfigurator.py | 111 ++--- tests/ga/test_cgroupconfigurator_sudo.py | 2 +- tests/ga/test_update.py | 2 +- tests/lib/mock_cgroup_environment.py | 51 ++- tests/test_agent.py | 28 +- tests_e2e/orchestrator/runbook.yml | 2 +- ...v2_disabled.yml => cgroup_v2_disabled.yml} | 4 +- .../cgroup_v2_disabled.py} | 30 +- 17 files changed, 423 insertions(+), 728 deletions(-) create mode 100644 tests/data/cgroups/hybrid/sys_fs_cgroup_cgroup.controllers delete mode 100644 tests/data/cgroups/v1_and_v2/proc_pid_cgroup delete mode 100644 tests/data/cgroups/v1_and_v2/proc_self_cgroup delete mode 100644 tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control rename tests_e2e/test_suites/{cgroups_v2_disabled.yml => cgroup_v2_disabled.yml} (76%) rename 
tests_e2e/tests/{cgroups_v2_disabled/cgroups_v2_disabled.py => cgroup_v2_disabled/cgroup_v2_disabled.py} (75%) diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py index b0ce5a19f..d794432b9 100644 --- a/azurelinuxagent/agent.py +++ b/azurelinuxagent/agent.py @@ -28,10 +28,11 @@ import subprocess import sys import threading + +from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.ga import logcollector, cgroupconfigurator from azurelinuxagent.ga.cgroup import AGENT_LOG_COLLECTOR, CpuCgroup, MemoryCgroup -from azurelinuxagent.ga.cgroupapi import get_cgroup_api -from azurelinuxagent.ga.cgroupstelemetry import log_cgroup_warning +from azurelinuxagent.ga.cgroupapi import get_cgroup_api, log_cgroup_warning import azurelinuxagent.common.conf as conf import azurelinuxagent.common.event as event @@ -207,16 +208,16 @@ def collect_logs(self, is_full_mode): # Check the cgroups unit log_collector_monitor = None - cgroups_api = get_cgroup_api() cpu_cgroup_path = None memory_cgroup_path = None if CollectLogsHandler.is_enabled_monitor_cgroups_check(): - if cgroups_api is None: - log_cgroup_warning("Unable to determine what version of cgroups to use for log collector resource " - "monitoring and enforcement.") + try: + cgroup_api = get_cgroup_api() + except CGroupsException as e: + log_cgroup_warning("Unable to determine which cgroup version to use: {0}".format(ustr(e)), send_event=True) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) - cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self") + cpu_cgroup_path, memory_cgroup_path = cgroup_api.get_process_cgroup_paths("self") cpu_slice_matches = False memory_slice_matches = False if cpu_cgroup_path is not None: diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index 40f66ed74..34caefa67 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -23,8 +23,9 @@ import uuid from azurelinuxagent.common import logger 
+from azurelinuxagent.common.event import WALAEventOperation, add_event from azurelinuxagent.ga.cgroup import CpuCgroup, MemoryCgroup -from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry, log_cgroup_info, log_cgroup_warning +from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.conf import get_agent_pid_file_path from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \ ExtensionOperationError @@ -36,43 +37,26 @@ from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.version import get_distro -CGROUPS_FILE_SYSTEM_ROOT = '/sys/fs/cgroup' +CGROUP_FILE_SYSTEM_ROOT = '/sys/fs/cgroup' EXTENSION_SLICE_PREFIX = "azure-vmextensions" -def get_cgroup_api(): - """ - Determines which version of Cgroups should be used for resource enforcement and monitoring by the Agent are returns - the corresponding Api. If the required controllers are not mounted in v1 or v2, return None. - """ - v1 = SystemdCgroupsApiv1() - v2 = SystemdCgroupsApiv2() +def log_cgroup_info(formatted_string, op=WALAEventOperation.CGroupsInfo, send_event=True): + logger.info("[CGI] " + formatted_string) + if send_event: + add_event(op=op, message=formatted_string) - log_cgroup_info("Controllers mounted in v1: {0}. Controllers mounted in v2: {1}".format(v1.get_mounted_controllers(), v2.get_mounted_controllers())) - # It is possible for different controllers to be simultaneously mounted under v1 and v2. If any are mounted under - # v1, use v1. 
- if v1.is_cpu_or_memory_mounted(): - log_cgroup_info("Using cgroups v1 for resource enforcement and monitoring") - return v1 - elif v2.is_cpu_or_memory_mounted(): - log_cgroup_info("Using cgroups v2 for resource enforcement and monitoring") - return v2 - else: - log_cgroup_warning("CPU and Memory controllers are not mounted in cgroups v1 or v2") - return None +def log_cgroup_warning(formatted_string, op=WALAEventOperation.CGroupsInfo, send_event=True): + logger.info("[CGW] " + formatted_string) # log as INFO for now, in the future it should be logged as WARNING + if send_event: + add_event(op=op, message=formatted_string, is_success=False, log_event=False) -class SystemdRunError(CGroupsException): +class CGroupUtil(object): """ - Raised when systemd-run fails + Cgroup utility methods which are independent of systemd cgroup api. """ - - def __init__(self, msg=None): - super(SystemdRunError, self).__init__(msg) - - -class CGroupsApi(object): @staticmethod def cgroups_supported(): distro_info = get_distro() @@ -85,18 +69,18 @@ def cgroups_supported(): (distro_name.lower() in ('centos', 'redhat') and 8 <= distro_version.major < 9) @staticmethod - def track_cgroups(extension_cgroups): - try: - for cgroup in extension_cgroups: - CGroupsTelemetry.track_cgroup(cgroup) - except Exception as exception: - logger.warn("[CGW] Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. " - "Error: {1}".format(cgroup.path, ustr(exception))) + def get_extension_slice_name(extension_name, old_slice=False): + # The old slice makes it difficult for user to override the limits because they need to place drop-in files on every upgrade if extension slice is different for each version. + # old slice includes .- + # new slice without version . + if not old_slice: + extension_name = extension_name.rsplit("-", 1)[0] + # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects. 
+ return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice" @staticmethod - def get_processes_in_cgroup(cgroup_path): - with open(os.path.join(cgroup_path, "cgroup.procs"), "r") as cgroup_procs: - return [int(pid) for pid in cgroup_procs.read().split()] + def get_daemon_pid(): + return int(fileutil.read_file(get_agent_pid_file_path()).strip()) @staticmethod def _foreach_legacy_cgroup(operation): @@ -114,7 +98,7 @@ def _foreach_legacy_cgroup(operation): """ legacy_cgroups = [] for controller in ['cpu', 'memory']: - cgroup = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, "WALinuxAgent", "WALinuxAgent") + cgroup = os.path.join(CGROUP_FILE_SYSTEM_ROOT, controller, "WALinuxAgent", "WALinuxAgent") if os.path.exists(cgroup): log_cgroup_info('Found legacy cgroup {0}'.format(cgroup), send_event=False) legacy_cgroups.append((controller, cgroup)) @@ -125,7 +109,7 @@ def _foreach_legacy_cgroup(operation): if os.path.exists(procs_file): procs_file_contents = fileutil.read_file(procs_file).strip() - daemon_pid = CGroupsApi.get_daemon_pid() + daemon_pid = CGroupUtil.get_daemon_pid() if ustr(daemon_pid) in procs_file_contents: operation(controller, daemon_pid) @@ -136,17 +120,67 @@ def _foreach_legacy_cgroup(operation): return len(legacy_cgroups) @staticmethod - def get_daemon_pid(): - return int(fileutil.read_file(get_agent_pid_file_path()).strip()) + def cleanup_legacy_cgroups(): + """ + Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; + starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. 
If + we find that any of the legacy groups include the PID of the daemon then we need to disable data collection for this + instance (under systemd, moving PIDs across the cgroup file system can produce unpredictable results) + """ + return CGroupUtil._foreach_legacy_cgroup(lambda *_: None) -class SystemdCgroupsApi(CGroupsApi): +class SystemdRunError(CGroupsException): """ - Cgroups interface via systemd. Contains common api implementations between cgroups v1 and v2. + Raised when systemd-run fails + """ + + def __init__(self, msg=None): + super(SystemdRunError, self).__init__(msg) + + +def get_cgroup_api(): """ + Determines which version of Cgroup should be used for resource enforcement and monitoring by the Agent and returns + the corresponding Api. + Uses 'stat -f --format=%T /sys/fs/cgroups' to get the cgroup hierarchy in use. + If the result is 'cgroup2fs', cgroup v2 is being used. + If the result is 'tmpfs', cgroup v1 or a hybrid mode is being used. + If the result of 'stat -f --format=%T /sys/fs/cgroup/unified' is 'cgroup2fs', then hybrid mode is being used. + + Raises exception if an unknown mode is detected. Also raises exception if hybrid mode is detected and there are + controllers available to be enabled in the unified hierarchy (the agent does not support cgroups if there are + controllers simultaneously attached to v1 and v2 hierarchies). + """ + root_hierarchy_mode = shellutil.run_command(["stat", "-f", "--format=%T", CGROUP_FILE_SYSTEM_ROOT]).rstrip() + + if root_hierarchy_mode == "cgroup2fs": + log_cgroup_info("Using cgroup v2 for resource enforcement and monitoring") + return SystemdCgroupApiv2() + + elif root_hierarchy_mode == "tmpfs": + # Check if a hybrid mode is being used + unified_hierarchy_path = os.path.join(CGROUP_FILE_SYSTEM_ROOT, "unified") + if os.path.exists(unified_hierarchy_path) and shellutil.run_command(["stat", "-f", "--format=%T", unified_hierarchy_path]).rstrip() == "cgroup2fs": + # Hybrid mode is being used. 
Check if any controllers are available to be enabled in the unified hierarchy. + available_unified_controllers_file = os.path.join(unified_hierarchy_path, "cgroup.controllers") + if os.path.exists(available_unified_controllers_file): + available_unified_controllers = fileutil.read_file(available_unified_controllers_file).rstrip() + if available_unified_controllers != "": + raise CGroupsException("Detected hybrid cgroup mode, but there are controllers available to be enabled in unified hierarchy: {0}".format(available_unified_controllers)) + + log_cgroup_info("Using cgroup v1 for resource enforcement and monitoring") + return SystemdCgroupApiv1() + + raise CGroupsException("Detected unknown cgroup mode: {0}".format(root_hierarchy_mode)) + + +class _SystemdCgroupApi(object): + """ + Cgroup interface via systemd. Contains common api implementations between cgroup v1 and v2. + """ def __init__(self): - self._cgroup_mountpoints = {} self._agent_unit_name = None self._systemd_run_commands = [] self._systemd_run_commands_lock = threading.RLock() @@ -158,70 +192,48 @@ def get_systemd_run_commands(self): with self._systemd_run_commands_lock: return self._systemd_run_commands[:] - def is_cpu_or_memory_mounted(self): + def get_controller_root_paths(self): """ - Returns True if either cpu or memory controllers are mounted and enabled at the root cgroup. + Cgroup version specific. Returns a tuple with the root paths for the cpu and memory controllers; the values can + be None if the corresponding controller is not mounted or enabled at the root cgroup. """ - cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() - return cpu_mount_point is not None or memory_mount_point is not None + raise NotImplementedError() - def get_mounted_controllers(self): - """ - Returns a list of the controllers mounted and enabled at the root cgroup. Currently, the only controllers the - agent checks for is cpu and memory. 
- """ - self.get_cgroup_mount_points() # Updates self._cgroup_mountpoints if empty - return [controller for controller, mount_point in self._cgroup_mountpoints.items() if mount_point is not None] - - def cleanup_legacy_cgroups(self): + def get_unit_cgroup_paths(self, unit_name): """ - Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; - starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If - we find that any of the legacy groups include the PID of the daemon then we need to disable data collection for this - instance (under systemd, moving PIDs across the cgroup file system can produce unpredictable results) + Returns a tuple with the path of the cpu and memory cgroups for the given unit. + The values returned can be None if the controller is not mounted or enabled. """ - return CGroupsApi._foreach_legacy_cgroup(lambda *_: None) - - @staticmethod - def get_extension_slice_name(extension_name, old_slice=False): - # The old slice makes it difficult for user to override the limits because they need to place drop-in files on every upgrade if extension slice is different for each version. - # old slice includes .- - # new slice without version . - if not old_slice: - extension_name = extension_name.rsplit("-", 1)[0] - # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects. 
- return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice" + # Ex: ControlGroup=/azure.slice/walinuxagent.service + # controlgroup_path[1:] = azure.slice/walinuxagent.service + controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") + cpu_mount_point, memory_mount_point = self.get_controller_root_paths() - @staticmethod - def _is_systemd_failure(scope_name, stderr): - stderr.seek(0) - stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace') - unit_not_found = "Unit {0} not found.".format(scope_name) - return unit_not_found in stderr or scope_name not in stderr + cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \ + if cpu_mount_point is not None else None - def get_cgroup_mount_points(self): - """ - Cgroup version specific. Returns a tuple with the mount points for the cpu and memory controllers; the values - can be None if the corresponding controller is not mounted or enabled at the root cgroup. Updates - self._cgroup_mountpoints if empty. - """ - return None, None + memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \ + if memory_mount_point is not None else None - def get_unit_cgroup_paths(self, unit_name): - """ - Cgroup version specific. Returns a tuple with the path of the cpu and memory cgroups for the given unit. - The values returned can be None if the controller is not mounted or enabled. - """ - pass # pylint: disable=W0107 + return cpu_cgroup_path, memory_cgroup_path def get_process_cgroup_paths(self, process_id): """ - Cgroup version specific. Returns a tuple with the path of the cpu and memory cgroups for the given process. + Returns a tuple with the path of the cpu and memory cgroups for the given process. The 'process_id' can be a numeric PID or the string "self" for the current process. - The values returned can be None if the process is not in a cgroup for that controller (e.g. 
the controller is - not mounted or enabled). + The values returned can be None if the controller is not mounted or enabled. """ - pass # pylint: disable=W0107 + cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) + + cpu_mount_point, memory_mount_point = self.get_controller_root_paths() + + cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \ + if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None + + memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \ + if memory_mount_point is not None and memory_cgroup_relative_path is not None else None + + return cpu_cgroup_path, memory_cgroup_path def get_process_cgroup_relative_paths(self, process_id): # pylint: disable=W0613 """ @@ -231,36 +243,55 @@ def get_process_cgroup_relative_paths(self, process_id): # pylint: disable=W061 The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted). """ - pass # pylint: disable=W0107 + raise NotImplementedError() def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): """ Cgroup version specific. Starts extension command. 
""" - pass # pylint: disable=W0107 + raise NotImplementedError() + + @staticmethod + def _is_systemd_failure(scope_name, stderr): + stderr.seek(0) + stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace') + unit_not_found = "Unit {0} not found.".format(scope_name) + return unit_not_found in stderr or scope_name not in stderr + + @staticmethod + def get_processes_in_cgroup(cgroup_path): + with open(os.path.join(cgroup_path, "cgroup.procs"), "r") as cgroup_procs: + return [int(pid) for pid in cgroup_procs.read().split()] -class SystemdCgroupsApiv1(SystemdCgroupsApi): +class SystemdCgroupApiv1(_SystemdCgroupApi): """ - Cgroups v1 interface via systemd + Cgroup v1 interface via systemd """ - def get_cgroup_mount_points(self): - # the output of mount is similar to + def __init__(self): + super(SystemdCgroupApiv1, self).__init__() + self._cgroup_mountpoints = {} + + def get_controller_root_paths(self): + # In v1, each controller is mounted at a different path. Use findmnt to get each path and return cpu and memory + # mount points as a tuple. 
+ # + # the output of findmnt is similar to # $ findmnt -t cgroup --noheadings # /sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd # /sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory # /sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct # etc # - if not self._cgroup_mountpoints: + if len(self._cgroup_mountpoints) == 0: cpu = None memory = None for line in shellutil.run_command(['findmnt', '-t', 'cgroup', '--noheadings']).splitlines(): match = re.search(r'(?P/\S+(memory|cpuacct))\s', line) if match is not None: path = match.group('path') - if 'cpuacct' in path: + if 'cpu,cpuacct' in path: cpu = path else: memory = path @@ -268,33 +299,6 @@ def get_cgroup_mount_points(self): return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory'] - def get_unit_cgroup_paths(self, unit_name): - # Ex: ControlGroup=/azure.slice/walinuxagent.service - # controlgroup_path[1:] = azure.slice/walinuxagent.service - controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") - cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() - - cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \ - if cpu_mount_point is not None else None - - memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \ - if memory_mount_point is not None else None - - return cpu_cgroup_path, memory_cgroup_path - - def get_process_cgroup_paths(self, process_id): - cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) - - cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() - - cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \ - if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None - - memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \ - if memory_mount_point is not None and 
memory_cgroup_relative_path is not None else None - - return cpu_cgroup_path, memory_cgroup_path - def get_process_cgroup_relative_paths(self, process_id): # The contents of the file are similar to # # cat /proc/1218/cgroup @@ -318,7 +322,7 @@ def get_process_cgroup_relative_paths(self, process_id): def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): scope = "{0}_{1}".format(cmd_name, uuid.uuid4()) - extension_slice_name = self.get_extension_slice_name(extension_name) + extension_slice_name = CGroupUtil.get_extension_slice_name(extension_name) with self._systemd_run_commands_lock: process = subprocess.Popen( # pylint: disable=W1509 # Some distros like ubuntu20 by default cpu and memory accounting enabled. Thus create nested cgroups under the extension slice @@ -344,7 +348,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh try: cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name) - cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_cgroup_mount_points() + cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_controller_root_paths() if cpu_cgroup_mountpoint is None: log_cgroup_info("The CPU controller is not mounted; will not track resource usage", send_event=False) @@ -398,116 +402,61 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh self._systemd_run_commands.remove(process.pid) -class SystemdCgroupsApiv2(SystemdCgroupsApi): +class SystemdCgroupApiv2(_SystemdCgroupApi): """ - Cgroups v2 interface via systemd + Cgroup v2 interface via systemd """ + def __init__(self): + super(SystemdCgroupApiv2, self).__init__() + self._root_cgroup_path = None + self._controllers_enabled_at_root = [] - def is_controller_enabled(self, controller, cgroup_path): + def is_controller_enabled_at_root(self, controller): """ - Returns True if the provided 
controller is enabled at the provided cgroup. - - There are two ways to determine if a controller is enabled at the provided cgroup: + Returns True if the provided controller is enabled at the root cgroup. The cgroup.subtree_control file at the + root shows a space separated list of the controllers which are enabled to control resource distribution from + the root cgroup to its children. If a controller is listed here, then that controller is available to enable in + children cgroups. - 1. For non-leaf cgroups, the cgroup.subtree_control shows space separated list of the controllers which are - enabled to control resource distribution from the cgroup to its children. All non-root "cgroup.subtree_control" - files can only contain controllers which are enabled in the parent's "cgroup.subtree_control" file. $ cat /sys/fs/cgroup/cgroup.subtree_control cpuset cpu io memory hugetlb pids rdma misc - - 2. For leaf cgroups, the cgroup.subtree_control file will be empty and the presence of "." - prefixed interface files at the path indicate the controller is enabled. - $ ls /sys/fs/cgroup/azure.slice/walinuxagent.service/ - cgroup.controllers cgroup.max.descendants cgroup.threads cpu.pressure cpu.weight.nice memory.high memory.oom.group memory.swap.current memory.zswap.current pids.peak - cgroup.events cgroup.pressure cgroup.type cpu.stat io.pressure memory.low memory.peak memory.swap.events memory.zswap.max - cgroup.freeze cgroup.procs cpu.idle cpu.uclamp.max memory.current memory.max memory.pressure memory.swap.high pids.current - cgroup.kill cgroup.stat cpu.max cpu.uclamp.min memory.events memory.min memory.reclaim memory.swap.max pids.events - cgroup.max.depth cgroup.subtree_control cpu.max.burst cpu.weight memory.events.local memory.numa_stat memory.stat memory.swap.peak pids.max - - If either check is True, the controller is enabled at the cgroup. Check 1 is necessary because no controller - interface files exist at the root cgroup, even if the controller is enabled. 
""" - if cgroup_path is not None and controller is not None: - # Check that the controller is enabled in the cgroup.subtree_control file - enabled_controllers_file = os.path.join(cgroup_path, 'cgroup.subtree_control') + if self._root_cgroup_path is not None: + enabled_controllers_file = os.path.join(self._root_cgroup_path, 'cgroup.subtree_control') if os.path.exists(enabled_controllers_file): enabled_controllers = fileutil.read_file(enabled_controllers_file).rstrip().split(" ") if controller in enabled_controllers: return True - # Check that the controller interface files exist in the cgroup - if os.path.exists(cgroup_path): - for item in os.listdir(cgroup_path): - if item.startswith(controller + '.'): - return True - return False - def get_cgroup_mount_points(self): - # The output of mount is similar to + def get_controller_root_paths(self): + # In v2, there is a unified mount point shared by all controllers. Use findmnt to get the unified mount point, + # and check if cpu and memory are enabled at the root. Return a tuple representing the root cgroups for cpu and + # memory. Either should be None if the corresponding controller is not enabled at the root. + # + # The output of findmnt is similar to # $ findmnt -t cgroup2 --noheadings # /sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot # - # Since v2 is a unified hierarchy, this method checks if each controller is enabled at the root cgroup. This - # check is necessary because all non-root "cgroup.subtree_control" files can only contain controllers which are - # enabled in the parent's "cgroup.subtree_control" file. + # This check is necessary because all non-root "cgroup.subtree_control" files can only contain controllers + # which are enabled in the parent's "cgroup.subtree_control" file. 
- if not self._cgroup_mountpoints: - cpu = None - memory = None + if self._root_cgroup_path is None: for line in shellutil.run_command(['findmnt', '-t', 'cgroup2', '--noheadings']).splitlines(): match = re.search(r'(?P/\S+)\s+cgroup2', line) if match is not None: - mount_point = match.group('path') - if self.is_controller_enabled('cpu', mount_point): - cpu = mount_point - if self.is_controller_enabled('memory', mount_point): - memory = mount_point - self._cgroup_mountpoints = {'cpu': cpu, 'memory': memory} - - return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory'] - - def get_unit_cgroup_paths(self, unit_name): - # Ex: ControlGroup=/azure.slice/walinuxagent.service - # controlgroup_path[1:] = azure.slice/walinuxagent.service - controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") - cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() - - # Since v2 is a unified hierarchy, we need to check if each controller is enabled for the cgroup. If a - # controller is not enabled, then its controller interface files won't exist at the cgroup path - cpu_cgroup_path = None - if cpu_mount_point is not None: - cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) - if self.is_controller_enabled('cpu', cgroup_path): - cpu_cgroup_path = cgroup_path - - memory_cgroup_path = None - if memory_mount_point is not None: - cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) - if self.is_controller_enabled('memory', cgroup_path): - memory_cgroup_path = cgroup_path - - return cpu_cgroup_path, memory_cgroup_path - - def get_process_cgroup_paths(self, process_id): - cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) - cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() - - # Since v2 is a unified hierarchy, we need to check if each controller is enabled for the cgroup. 
If a - # controller is not enabled, then its controller interface files won't exist at the cgroup path - cpu_cgroup_path = None - if cpu_mount_point is not None and cpu_cgroup_relative_path is not None: - cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) - if self.is_controller_enabled('cpu', cgroup_path): - cpu_cgroup_path = cgroup_path - - memory_cgroup_path = None - if memory_mount_point is not None and memory_cgroup_relative_path is not None: - cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) - if self.is_controller_enabled('memory', cgroup_path): - memory_cgroup_path = cgroup_path - - return cpu_cgroup_path, memory_cgroup_path + root_cgroup_path = match.group('path') + if root_cgroup_path is not None: + self._root_cgroup_path = root_cgroup_path + if self.is_controller_enabled_at_root('cpu'): + self._controllers_enabled_at_root.append('cpu') + if self.is_controller_enabled_at_root('memory'): + self._controllers_enabled_at_root.append('memory') + + root_cpu_path = self._root_cgroup_path if 'cpu' in self._controllers_enabled_at_root else None + root_memory_path = self._root_cgroup_path if 'memory' in self._controllers_enabled_at_root else None + return root_cpu_path, root_memory_path def get_process_cgroup_relative_paths(self, process_id): # The contents of the file are similar to @@ -525,11 +474,4 @@ def get_process_cgroup_relative_paths(self, process_id): return cpu_path, memory_path def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): # pylint: disable=W0613 - """ - Currently, the agent will not enable cgroups v2 or use SystemdCgroupv2Api() to start extension commands. Raising - an exception here for CGroupConfigurator to catch in case v2 is improperly enabled. 
- """ - error_msg = "The agent does not currently support running extensions in cgroups v2" - log_cgroup_warning(error_msg) - raise CGroupsException(msg=error_msg) - + raise NotImplementedError() diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index 34229c533..4540b546b 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -24,9 +24,8 @@ from azurelinuxagent.common import conf from azurelinuxagent.common import logger from azurelinuxagent.ga.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup -from azurelinuxagent.ga.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX, \ - get_cgroup_api, SystemdCgroupsApiv2 -from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry, log_cgroup_info, log_cgroup_warning +from azurelinuxagent.ga.cgroupapi import SystemdRunError, EXTENSION_SLICE_PREFIX, CGroupUtil, SystemdCgroupApiv2, log_cgroup_info, log_cgroup_warning, get_cgroup_api +from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import systemd @@ -139,8 +138,10 @@ def initialize(self): try: if self._initialized: return - # This check is to reset the quotas if agent goes from cgroup supported to unsupported distros later in time. - if not CGroupsApi.cgroups_supported(): + # check whether cgroup monitoring is supported on the current distro + self._cgroups_supported = CGroupUtil.cgroups_supported() + if not self._cgroups_supported: + # This check is to reset the quotas if agent goes from cgroup supported to unsupported distros later in time. 
agent_drop_in_path = systemd.get_agent_drop_in_path() try: if os.path.exists(agent_drop_in_path) and os.path.isdir(agent_drop_in_path): @@ -155,20 +156,11 @@ def initialize(self): agent_drop_in_file_memory_accounting, agent_drop_in_file_cpu_quota]) self.__cleanup_all_files(files_to_cleanup) self.__reload_systemd_config() - log_cgroup_info("Agent reset the quotas if distro: {0} goes from supported to unsupported list".format(get_distro()), send_event=False) + log_cgroup_info("Agent reset the quotas if distro: {0} goes from supported to unsupported list".format(get_distro()), send_event=True) except Exception as err: - logger.warn("[CGW] Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err)) + logger.warn("Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err)) - # check whether cgroup monitoring is supported on the current distro - self._cgroups_supported = CGroupsApi.cgroups_supported() - if not self._cgroups_supported: - log_cgroup_info("Cgroup monitoring is not supported on {0}".format(get_distro()), send_event=False) - return - - # Determine which version of the Cgroup API should be used. If the correct version can't be determined, - # do not enable resource monitoring/enforcement. - self._cgroups_api = get_cgroup_api() - if self._cgroups_api is None: + log_cgroup_info("Cgroup monitoring is not supported on {0}".format(get_distro()), send_event=True) return # check that systemd is detected correctly @@ -178,6 +170,14 @@ def initialize(self): log_cgroup_info("systemd version: {0}".format(systemd.get_version())) + # Determine which version of the Cgroup API should be used. If the correct version can't be determined, + # do not enable resource monitoring/enforcement. 
+ try: + self._cgroups_api = get_cgroup_api() + except CGroupsException as e: + log_cgroup_warning("Unable to determine which cgroup version to use: {0}".format(ustr(e)), send_event=True) + return + if not self.__check_no_legacy_cgroups(): return @@ -189,13 +189,13 @@ def initialize(self): self.__setup_azure_slice() - cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers_mount_points() + cpu_controller_root, memory_controller_root = self.__get_cgroup_controller_roots() self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroup_paths(agent_slice, cpu_controller_root, memory_controller_root) if self.cgroup_v2_enabled(): - log_cgroup_info("Agent and extensions resource monitoring is not currently supported on cgroups v2") + log_cgroup_info("Agent and extensions resource monitoring is not currently supported on cgroup v2") return if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None: @@ -223,24 +223,24 @@ def __check_no_legacy_cgroups(self): Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent. When running under systemd this could produce invalid resource usage data. Cgroups should not be enabled under this condition. 
""" - legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups() + legacy_cgroups = CGroupUtil.cleanup_legacy_cgroups() if legacy_cgroups > 0: log_cgroup_warning("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.") return False return True - def __get_cgroup_controllers_mount_points(self): - cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points() + def __get_cgroup_controller_roots(self): + cpu_controller_root, memory_controller_root = self._cgroups_api.get_controller_root_paths() if cpu_controller_root is not None: - log_cgroup_info("The CPU cgroup controller is mounted at {0}".format(cpu_controller_root), send_event=False) + log_cgroup_info("The CPU cgroup controller root path is {0}".format(cpu_controller_root), send_event=False) else: - log_cgroup_warning("The CPU cgroup controller is not mounted") + log_cgroup_warning("The CPU cgroup controller is not mounted or enabled") if memory_controller_root is not None: - log_cgroup_info("The memory cgroup controller is mounted at {0}".format(memory_controller_root), send_event=False) + log_cgroup_info("The memory cgroup controller root path is {0}".format(memory_controller_root), send_event=False) else: - log_cgroup_warning("The memory cgroup controller is not mounted") + log_cgroup_warning("The memory cgroup controller is not mounted or enabled") return cpu_controller_root, memory_controller_root @@ -381,13 +381,13 @@ def __create_all_files(files_to_create): def is_extension_resource_limits_setup_completed(self, extension_name, cpu_quota=None): unit_file_install_path = systemd.get_unit_file_install_path() - old_extension_slice_path = os.path.join(unit_file_install_path, SystemdCgroupsApi.get_extension_slice_name(extension_name, old_slice=True)) + old_extension_slice_path = os.path.join(unit_file_install_path, CGroupUtil.get_extension_slice_name(extension_name, old_slice=True)) # clean up the old slice from the disk if 
os.path.exists(old_extension_slice_path): CGroupConfigurator._Impl.__cleanup_unit_file(old_extension_slice_path) extension_slice_path = os.path.join(unit_file_install_path, - SystemdCgroupsApi.get_extension_slice_name(extension_name)) + CGroupUtil.get_extension_slice_name(extension_name)) cpu_quota = str( cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity) slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name, @@ -452,7 +452,7 @@ def extensions_enabled(self): return self._extensions_cgroups_enabled def cgroup_v2_enabled(self): - return isinstance(self._cgroups_api, SystemdCgroupsApiv2) + return isinstance(self._cgroups_api, SystemdCgroupApiv2) def enable(self): if not self.supported(): @@ -581,7 +581,7 @@ def _check_processes_in_agent_cgroup(self): agent_commands.update(shellutil.get_running_commands()) systemd_run_commands = set() systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands()) - agent_cgroup = CGroupsApi.get_processes_in_cgroup(self._agent_cpu_cgroup_path) + agent_cgroup = self._cgroups_api.get_processes_in_cgroup(self._agent_cpu_cgroup_path) # get the running commands again in case new commands started or completed while we were fetching the processes in the cgroup; agent_commands.update(shellutil.get_running_commands()) systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands()) @@ -744,12 +744,12 @@ def start_tracking_unit_cgroups(self, unit_name): cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name) if cpu_cgroup_path is None: - log_cgroup_info("The CPU controller is not mounted; will not track resource usage", send_event=False) + log_cgroup_info("The CPU controller is not mounted or enabled; will not track resource usage", send_event=False) else: CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, cpu_cgroup_path)) if memory_cgroup_path is None: - log_cgroup_info("The Memory controller is not mounted; 
will not track resource usage", send_event=False) + log_cgroup_info("The Memory controller is not mounted or enabled; will not track resource usage", send_event=False) else: CGroupsTelemetry.track_cgroup(MemoryCgroup(unit_name, memory_cgroup_path)) @@ -777,13 +777,13 @@ def stop_tracking_extension_cgroups(self, extension_name): TODO: remove extension Memory cgroups from tracked list """ try: - extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name) + extension_slice_name = CGroupUtil.get_extension_slice_name(extension_name) cgroup_relative_path = os.path.join(_AZURE_VMEXTENSIONS_SLICE, extension_slice_name) - cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self._cgroups_api.get_cgroup_mount_points() - cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path) - memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path) + cpu_root_path, memory_root_path = self._cgroups_api.get_controller_root_paths() + cpu_cgroup_path = os.path.join(cpu_root_path, cgroup_relative_path) + memory_cgroup_path = os.path.join(memory_root_path, cgroup_relative_path) if cpu_cgroup_path is not None: CGroupsTelemetry.stop_tracking(CpuCgroup(extension_name, cpu_cgroup_path)) @@ -819,12 +819,6 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh extension_name, ustr(exception)) self.disable(reason, DisableCgroups.ALL) # fall-through and re-invoke the extension - except CGroupsException as exception: - reason = 'Failed to start {0} using cgroups, will try invoking the extension directly. 
Error: {1}'.format( - extension_name, ustr(exception)) - self.disable(reason, DisableCgroups.ALL) - # fall-through and re-invoke the extension - # subprocess-popen-preexec-fn Disabled: code is not multi-threaded process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout, stderr=stderr, preexec_fn=os.setsid) # pylint: disable=W1509 @@ -852,7 +846,7 @@ def setup_extension_slice(self, extension_name, cpu_quota): if self.enabled(): unit_file_install_path = systemd.get_unit_file_install_path() extension_slice_path = os.path.join(unit_file_install_path, - SystemdCgroupsApi.get_extension_slice_name(extension_name)) + CGroupUtil.get_extension_slice_name(extension_name)) try: cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity) if cpu_quota == "": @@ -874,7 +868,7 @@ def remove_extension_slice(self, extension_name): """ if self.enabled(): unit_file_install_path = systemd.get_unit_file_install_path() - extension_slice_name = SystemdCgroupsApi.get_extension_slice_name(extension_name) + extension_slice_name = CGroupUtil.get_extension_slice_name(extension_name) extension_slice_path = os.path.join(unit_file_install_path, extension_slice_name) if os.path.exists(extension_slice_path): self.stop_tracking_extension_cgroups(extension_name) diff --git a/azurelinuxagent/ga/cgroupstelemetry.py b/azurelinuxagent/ga/cgroupstelemetry.py index 5a564de63..5943b45ad 100644 --- a/azurelinuxagent/ga/cgroupstelemetry.py +++ b/azurelinuxagent/ga/cgroupstelemetry.py @@ -17,23 +17,10 @@ import threading from azurelinuxagent.common import logger -from azurelinuxagent.common.event import WALAEventOperation, add_event from azurelinuxagent.ga.cgroup import CpuCgroup from azurelinuxagent.common.future import ustr -def log_cgroup_info(formatted_string, op=WALAEventOperation.CGroupsInfo, send_event=True): - logger.info("[CGI] " + formatted_string) - if send_event: - add_event(op=op, message=formatted_string) - - 
-def log_cgroup_warning(formatted_string, op=WALAEventOperation.CGroupsInfo, send_event=True): - logger.info("[CGW] " + formatted_string) # log as INFO for now, in the future it should be logged as WARNING - if send_event: - add_event(op=op, message=formatted_string, is_success=False, log_event=False) - - class CGroupsTelemetry(object): """ """ diff --git a/tests/data/cgroups/hybrid/sys_fs_cgroup_cgroup.controllers b/tests/data/cgroups/hybrid/sys_fs_cgroup_cgroup.controllers new file mode 100644 index 000000000..e69de29bb diff --git a/tests/data/cgroups/v1_and_v2/proc_pid_cgroup b/tests/data/cgroups/v1_and_v2/proc_pid_cgroup deleted file mode 100644 index 179c59daa..000000000 --- a/tests/data/cgroups/v1_and_v2/proc_pid_cgroup +++ /dev/null @@ -1,12 +0,0 @@ -12:devices:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope -11:perf_event:/ -10:rdma:/ -9:blkio:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope -8:net_cls,net_prio:/ -7:freezer:/ -6:hugetlb:/ -4:cpuset:/ -3:cpu,cpuacct:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope -2:pids:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope -1:name=systemd:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope -0::/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope diff --git a/tests/data/cgroups/v1_and_v2/proc_self_cgroup b/tests/data/cgroups/v1_and_v2/proc_self_cgroup deleted file mode 100644 index 40e7dd5b1..000000000 --- a/tests/data/cgroups/v1_and_v2/proc_self_cgroup +++ /dev/null @@ -1,12 +0,0 @@ -12:blkio:/system.slice/walinuxagent.service -11:cpu,cpuacct:/system.slice/walinuxagent.service -10:devices:/system.slice/walinuxagent.service -9:pids:/system.slice/walinuxagent.service -7:freezer:/ -6:hugetlb:/ -5:perf_event:/ -4:net_cls,net_prio:/ -3:cpuset:/ -2:rdma:/ 
-1:name=systemd:/system.slice/walinuxagent.service -0::/system.slice/walinuxagent.service diff --git a/tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control b/tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control deleted file mode 100644 index 2142c3ad3..000000000 --- a/tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control +++ /dev/null @@ -1 +0,0 @@ -memory diff --git a/tests/ga/test_cgroupapi.py b/tests/ga/test_cgroupapi.py index 7064ea51f..0259e93d5 100644 --- a/tests/ga/test_cgroupapi.py +++ b/tests/ga/test_cgroupapi.py @@ -24,16 +24,16 @@ from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.common.utils.fileutil import read_file -from azurelinuxagent.ga import cgroupapi -from azurelinuxagent.ga.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdCgroupsApiv1, SystemdCgroupsApiv2 +from azurelinuxagent.ga.cgroupapi import SystemdCgroupApiv1, SystemdCgroupApiv2, CGroupUtil, get_cgroup_api from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.osutil import systemd from azurelinuxagent.common.utils import fileutil from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, mock_cgroup_v2_environment, \ - mock_cgroup_v1_and_v2_environment + mock_cgroup_hybrid_environment from tests.lib.tools import AgentTestCase, patch, mock_sleep from tests.lib.cgroups_tools import CGroupsTools + class _MockedFileSystemTestCase(AgentTestCase): def setUp(self): AgentTestCase.setUp(self) @@ -43,7 +43,7 @@ def setUp(self): os.mkdir(os.path.join(self.cgroups_file_system_root, "cpu")) os.mkdir(os.path.join(self.cgroups_file_system_root, "memory")) - self.mock_cgroups_file_system_root = patch("azurelinuxagent.ga.cgroupapi.CGROUPS_FILE_SYSTEM_ROOT", self.cgroups_file_system_root) + self.mock_cgroups_file_system_root = patch("azurelinuxagent.ga.cgroupapi.CGROUP_FILE_SYSTEM_ROOT", self.cgroups_file_system_root) self.mock_cgroups_file_system_root.start() def 
tearDown(self): @@ -51,24 +51,7 @@ def tearDown(self): AgentTestCase.tearDown(self) -class CGroupsApiTestCase(AgentTestCase): - def test_get_cgroup_api_is_v1_when_v1_controllers_mounted(self): - with mock_cgroup_v1_environment(self.tmp_dir): - self.assertIsInstance(cgroupapi.get_cgroup_api(), SystemdCgroupsApiv1) - - def test_get_cgroup_api_is_v2_when_v2_controllers_mounted(self): - with mock_cgroup_v2_environment(self.tmp_dir): - self.assertIsInstance(cgroupapi.get_cgroup_api(), SystemdCgroupsApiv2) - - def test_get_cgroup_api_is_v1_when_v1_and_v2_controllers_mounted(self): - with mock_cgroup_v1_and_v2_environment(self.tmp_dir): - self.assertIsInstance(cgroupapi.get_cgroup_api(), SystemdCgroupsApiv1) - - def test_get_cgroup_api_is_none_when_no_controllers_mounted(self): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points", return_value=(None,None)): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points", return_value=(None,None)): - self.assertIsNone(cgroupapi.get_cgroup_api()) - +class CGroupUtilTestCase(AgentTestCase): def test_cgroups_should_be_supported_only_on_ubuntu16_centos7dot4_redhat7dot4_and_later_versions(self): test_cases = [ (['ubuntu', '16.04', 'xenial'], True), @@ -97,10 +80,38 @@ def test_cgroups_should_be_supported_only_on_ubuntu16_centos7dot4_redhat7dot4_an for (distro, supported) in test_cases: with patch("azurelinuxagent.ga.cgroupapi.get_distro", return_value=distro): - self.assertEqual(CGroupsApi.cgroups_supported(), supported, "cgroups_supported() failed on {0}".format(distro)) + self.assertEqual(CGroupUtil.cgroups_supported(), supported, "cgroups_supported() failed on {0}".format(distro)) class SystemdCgroupsApiTestCase(AgentTestCase): + def test_get_cgroup_api_is_v1_when_v1_in_use(self): + with mock_cgroup_v1_environment(self.tmp_dir): + self.assertIsInstance(get_cgroup_api(), SystemdCgroupApiv1) + + def test_get_cgroup_api_is_v2_when_v2_in_use(self): + with 
mock_cgroup_v2_environment(self.tmp_dir): + self.assertIsInstance(get_cgroup_api(), SystemdCgroupApiv2) + + def test_get_cgroup_api_is_v1_when_hybrid_in_use(self): + with mock_cgroup_hybrid_environment(self.tmp_dir): + with patch("os.path.exists", return_value=True): + self.assertIsInstance(get_cgroup_api(), SystemdCgroupApiv1) + + def test_get_cgroup_api_raises_exception_when_hybrid_in_use_and_controllers_available_in_unified_hierarchy(self): + with mock_cgroup_hybrid_environment(self.tmp_dir): + with patch("os.path.exists", return_value=True): + with patch('azurelinuxagent.common.utils.fileutil.read_file', return_value="cpu memory"): + with self.assertRaises(CGroupsException) as context: + get_cgroup_api() + self.assertTrue("Detected hybrid cgroup mode, but there are controllers available to be enabled in unified hierarchy: cpu memory" in str(context.exception)) + + def test_get_cgroup_api_raises_exception_when_cgroup_mode_cannot_be_determined(self): + unknown_cgroup_type = "unknown_cgroup_type" + with patch('azurelinuxagent.common.utils.shellutil.run_command', return_value=unknown_cgroup_type): + with self.assertRaises(CGroupsException) as context: + get_cgroup_api() + self.assertTrue("Detected unknown cgroup mode: {0}".format(unknown_cgroup_type) in str(context.exception)) + def test_get_systemd_version_should_return_a_version_number(self): # We expect same behavior for v1 and v2 mock_envs = [mock_cgroup_v1_environment(self.tmp_dir), mock_cgroup_v2_environment(self.tmp_dir)] @@ -110,38 +121,6 @@ def test_get_systemd_version_should_return_a_version_number(self): found = re.search(r"systemd \d+", version_info) is not None self.assertTrue(found, "Could not determine the systemd version: {0}".format(version_info)) - def test_is_cpu_or_memory_mounted_true_if_only_memory_mounted(self): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=(None, '/sys/fs/cgroup/memory')): - 
self.assertTrue(SystemdCgroupsApi().is_cpu_or_memory_mounted()) - - def test_is_cpu_or_memory_mounted_true_if_only_cpu_mounted(self): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): - self.assertTrue(SystemdCgroupsApi().is_cpu_or_memory_mounted()) - - def test_is_cpu_or_memory_mounted_true_if_cpu_and_memory_mounted(self): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=('/sys/fs/cgroup/cpu,cpuacct', '/sys/fs/cgroup/memory')): - self.assertTrue(SystemdCgroupsApi().is_cpu_or_memory_mounted()) - - def test_is_cpu_or_memory_mounted_false_if_cpu_and_memory_not_mounted(self): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=(None, None)): - self.assertFalse(SystemdCgroupsApi().is_cpu_or_memory_mounted()) - - def test_get_mounted_controllers_has_cpu_and_memory_controllers(self): - with mock_cgroup_v1_environment(self.tmp_dir): - mounted_controllers = cgroupapi.get_cgroup_api().get_mounted_controllers() - self.assertTrue("cpu" in mounted_controllers) - self.assertTrue("memory" in mounted_controllers) - - with mock_cgroup_v2_environment(self.tmp_dir): - mounted_controllers = cgroupapi.get_cgroup_api().get_mounted_controllers() - self.assertTrue("cpu" in mounted_controllers) - self.assertTrue("memory" in mounted_controllers) - - with mock_cgroup_v1_and_v2_environment(self.tmp_dir): - mounted_controllers = cgroupapi.get_cgroup_api().get_mounted_controllers() # API will be v1 since this environment as CPU mounted in v1 - self.assertTrue("cpu" in mounted_controllers) - self.assertFalse("memory" in mounted_controllers) # This environment has memory mounted in v2 - def test_get_unit_property_should_return_the_value_of_the_given_property(self): # We expect same behavior for v1 and v2 mock_envs = [mock_cgroup_v1_environment(self.tmp_dir), mock_cgroup_v2_environment(self.tmp_dir)] @@ -155,7 
+134,7 @@ def test_get_unit_property_should_return_the_value_of_the_given_property(self): class SystemdCgroupsApiv1TestCase(AgentTestCase): def test_get_unit_cgroup_paths_should_return_the_cgroup_v1_mount_points(self): with mock_cgroup_v1_environment(self.tmp_dir): - cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service', "The mount point for the CPU controller is incorrect") self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/extension.service', @@ -163,22 +142,22 @@ def test_get_unit_cgroup_paths_should_return_the_cgroup_v1_mount_points(self): def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v1_controller_not_mounted(self): with mock_cgroup_v1_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): - cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_controller_root_paths', return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): + cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service', "The mount point for the CPU controller is incorrect") self.assertIsNone(memory, "The mount point for the memory controller is None so unit cgroup should be None") - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup/memory')): - cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_controller_root_paths', return_value=(None, '/sys/fs/cgroup/memory')): + cpu, memory = 
get_cgroup_api().get_unit_cgroup_paths("extension.service") self.assertIsNone(cpu, "The mount point for the cpu controller is None so unit cgroup should be None") self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/extension.service', "The mount point for the memory controller is incorrect") def test_get_process_cgroup_paths_should_return_the_cgroup_v1_mount_points(self): with mock_cgroup_v1_environment(self.tmp_dir): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', "The mount point for the CPU controller is incorrect") self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', @@ -186,43 +165,43 @@ def test_get_process_cgroup_paths_should_return_the_cgroup_v1_mount_points(self) def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v1_controller_not_mounted(self): with mock_cgroup_v1_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_controller_root_paths', return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): + cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', "The mount point for the CPU controller is incorrect") self.assertIsNone(memory, "The mount point for the memory controller is None so unit cgroup should be None") - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup/memory')): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + with 
patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_controller_root_paths', return_value=(None, '/sys/fs/cgroup/memory')): + cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") self.assertIsNone(cpu, "The mount point for the CPU controller is None so unit cgroup should be None") self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', "The mount point for the memory controller is incorrect") def test_get_process_cgroup_v1_path_should_return_None_if_either_relative_path_is_None(self): with mock_cgroup_v1_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_relative_paths', return_value=('system.slice/walinuxagent.service', None)): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup_relative_paths', return_value=('system.slice/walinuxagent.service', None)): + cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', "The mount point for the CPU controller is incorrect") self.assertIsNone(memory, "The relative cgroup path for the memory controller is None so unit cgroup should be None") - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_relative_paths', return_value=(None, 'system.slice/walinuxagent.service')): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup_relative_paths', return_value=(None, 'system.slice/walinuxagent.service')): + cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") self.assertIsNone(cpu, "The relative cgroup path for the cpu controller is None so unit cgroup should be None") self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', "The mount point for the memory controller is incorrect") 
- def test_get_cpu_and_memory_mount_points_should_return_the_cgroup_v1_mount_points(self): + def test_get_controller_root_paths_should_return_the_cgroup_v1_controller_mount_points(self): with mock_cgroup_v1_environment(self.tmp_dir): - cpu, memory = cgroupapi.get_cgroup_api().get_cgroup_mount_points() - self.assertEqual(cpu, '/sys/fs/cgroup/cpu,cpuacct', "The mount point for the CPU controller is incorrect") - self.assertEqual(memory, '/sys/fs/cgroup/memory', "The mount point for the memory controller is incorrect") + cpu, memory = get_cgroup_api().get_controller_root_paths() + self.assertEqual(cpu, '/sys/fs/cgroup/cpu,cpuacct', "The root cgroup for the CPU controller is incorrect") + self.assertEqual(memory, '/sys/fs/cgroup/memory', "The root cgroup for the memory controller is incorrect") def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v1_relative_paths(self): with mock_cgroup_v1_environment(self.tmp_dir): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths('self') + cpu, memory = get_cgroup_api().get_process_cgroup_relative_paths('self') self.assertEqual(cpu, "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") self.assertEqual(memory, "system.slice/walinuxagent.service", "The relative memory for the CPU cgroup is incorrect") @@ -239,7 +218,7 @@ def mock_popen(command, *args, **kwargs): with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file: with patch("subprocess.Popen", side_effect=mock_popen) as popen_patch: # pylint: disable=unused-variable - command_output = cgroupapi.get_cgroup_api().start_extension_command( + command_output = get_cgroup_api().start_extension_command( extension_name="Microsoft.Compute.TestExtension-1.2.3", command="A_TEST_COMMAND", cmd_name="test", @@ -255,7 +234,7 @@ def mock_popen(command, *args, **kwargs): @patch('time.sleep', side_effect=lambda _: mock_sleep()) def 
test_start_extension_cgroups_v1_command_should_execute_the_command_in_a_cgroup(self, _): with mock_cgroup_v1_environment(self.tmp_dir): - cgroupapi.get_cgroup_api().start_extension_command( + get_cgroup_api().start_extension_command( extension_name="Microsoft.Compute.TestExtension-1.2.3", command="test command", cmd_name="test", @@ -282,7 +261,7 @@ def test_start_extension_cgroups_v1_command_should_execute_the_command_in_a_cgro def test_start_extension_cgroups_v1_command_should_use_systemd_to_execute_the_command(self, _): with mock_cgroup_v1_environment(self.tmp_dir): with patch("subprocess.Popen", wraps=subprocess.Popen) as popen_patch: - cgroupapi.get_cgroup_api().start_extension_command( + get_cgroup_api().start_extension_command( extension_name="Microsoft.Compute.TestExtension-1.2.3", command="the-test-extension-command", cmd_name="test", @@ -301,233 +280,90 @@ def test_start_extension_cgroups_v1_command_should_use_systemd_to_execute_the_co class SystemdCgroupsApiv2TestCase(AgentTestCase): - def test_is_controller_enabled_should_return_False_if_cgroup_is_None(self): + def test_is_controller_enabled_at_root_should_return_False_if_controller_is_not_in_subtree_control_file(self): with mock_cgroup_v2_environment(self.tmp_dir): - self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled('cpu', None)) - - def test_is_controller_enabled_should_return_False_if_controller_is_None(self): - with mock_cgroup_v2_environment(self.tmp_dir): - self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled(None, '/sys/fs/cgroup')) - - def test_is_controller_enabled_should_return_False_if_cgroup_path_does_not_exist(self): - with mock_cgroup_v2_environment(self.tmp_dir): - self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled('cpu', '/path/that/does/not/exist')) - - def test_is_controller_enabled_should_return_False_if_controller_is_not_in_subtree_control_file_and_controller_interface_files_do_not_exist(self): - with 
mock_cgroup_v2_environment(self.tmp_dir): - self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled('cpu', '/sys/fs/cgroup/azure.slice/walinuxagent.service')) - - def test_is_controller_enabled_should_return_True_if_controller_is_in_subtree_control_file(self): - with mock_cgroup_v2_environment(self.tmp_dir): - # Mock the cgroup.subtree_control to include memory controller + # Mock the cgroup.subtree_control to not have cpu controller def mock_read_file(path): - if "/sys/fs/cgroup/azure.slice/walinuxagent.service/cgroup.subtree_control" in path: + if "/sys/fs/cgroup/cgroup.subtree_control" in path: return 'io memory pids\n' return read_file(path) with patch('azurelinuxagent.common.utils.fileutil.read_file', side_effect=mock_read_file): - self.assertTrue(cgroupapi.get_cgroup_api().is_controller_enabled('memory', '/sys/fs/cgroup/azure.slice/walinuxagent.service')) - - def test_is_controller_enabled_should_return_True_if_controller_interface_file_exists(self): - original_list_dir = os.listdir - - # Mock the walinuxagent.service directory to include memory controller interface files - def mock_os_list_dir(path): - if "/sys/fs/cgroup/azure.slice/walinuxagent.service" in path: - return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat'] - return original_list_dir(path) - - with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: - # Mock service directory - mock_env._mock_mkdir('/sys/fs/cgroup/azure.slice/walinuxagent.service') - - with patch('os.listdir', side_effect=mock_os_list_dir): - self.assertTrue(cgroupapi.get_cgroup_api().is_controller_enabled('memory', '/sys/fs/cgroup/azure.slice/walinuxagent.service')) - - def test_get_unit_cgroup_paths_should_return_the_cgroup_v2_mount_points(self): - original_list_dir = os.listdir - - # Mock the extension.service directory to include controller interface files - def mock_os_list_dir(path): - if "/sys/fs/cgroup/system.slice/extension.service" in path: - return ['cgroup.controllers', 
'cgroup.subtree_control', 'memory.stat', 'cpu.stat'] - return original_list_dir(path) - - with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: - # Mock service directory - mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/extension.service') + self.assertFalse(get_cgroup_api().is_controller_enabled_at_root('cpu')) - with patch('os.listdir', side_effect=mock_os_list_dir): - cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") - self.assertEqual(cpu, '/sys/fs/cgroup/system.slice/extension.service', - "The mount point for the CPU controller is incorrect") - self.assertEqual(memory, '/sys/fs/cgroup/system.slice/extension.service', - "The mount point for the memory controller is incorrect") + def test_is_controller_enabled_at_root_should_return_True_if_controller_is_in_subtree_control_file(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cgroup_api = get_cgroup_api() + cgroup_api.get_controller_root_paths() + self.assertTrue(cgroup_api.is_controller_enabled_at_root('memory')) + self.assertTrue(cgroup_api.is_controller_enabled_at_root('cpu')) - def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_mounted(self): + def test_get_unit_cgroup_paths_should_return_the_cgroup_v2_cgroup_paths(self): with mock_cgroup_v2_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.is_controller_enabled', return_value=True): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=('/sys/fs/cgroup', None)): - cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") - self.assertIn(cpu, '/sys/fs/cgroup/system.slice/extension.service', - "The mount point for the CPU controller is incorrect") - self.assertIsNone(memory, - "The mount point for the memory controller is None so unit cgroup should be None") - - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', 
return_value=(None, '/sys/fs/cgroup')): - cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") - self.assertIsNone(cpu, "The mount point for the cpu controller is None so unit cgroup should be None") - self.assertIn(memory, '/sys/fs/cgroup/system.slice/extension.service', - "The mount point for the memory controller is incorrect") + cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertEqual(cpu, '/sys/fs/cgroup/system.slice/extension.service', + "The cgroup path for the CPU controller is incorrect") + self.assertEqual(memory, '/sys/fs/cgroup/system.slice/extension.service', + "The cgroup path for the memory controller is incorrect") def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_enabled(self): - original_list_dir = os.listdir - - # Mock the extension.service directory to include only cpu controller interface files - def mock_os_list_dir_cpu(path): - if "/sys/fs/cgroup/system.slice/extension.service" in path: - return ['cgroup.controllers', 'cgroup.subtree_control', 'cpu.stat'] - return original_list_dir(path) - - # Mock the extension.service directory to include only cpu controller interface files - def mock_os_list_dir_memory(path): - if "/sys/fs/cgroup/system.slice/extension.service" in path: - return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat'] - return original_list_dir(path) - - with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: - # Mock service directory - mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/extension.service') - - with patch('os.listdir', side_effect=mock_os_list_dir_cpu): - cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + with mock_cgroup_v2_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_controller_root_paths', return_value=('/sys/fs/cgroup', None)): + cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") 
self.assertIn(cpu, '/sys/fs/cgroup/system.slice/extension.service', - "The mount point for the CPU controller is incorrect") + "The cgroup path for the CPU controller is incorrect") self.assertIsNone(memory, - "The memory controller is not enabled so unit cgroup should be None") + "The cgroup path for the memory controller is None so unit cgroup should be None") - with patch('os.listdir', side_effect=mock_os_list_dir_memory): - cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") - self.assertIsNone(cpu, "The cpu controller is not enabled so unit cgroup should be None") + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_controller_root_paths', return_value=(None, '/sys/fs/cgroup')): + cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIsNone(cpu, "The cgroup path for the cpu controller is None so unit cgroup should be None") self.assertIn(memory, '/sys/fs/cgroup/system.slice/extension.service', - "The mount point for the memory controller is incorrect") - - def test_get_process_cgroup_paths_should_return_the_cgroup_v2_mount_points(self): - original_list_dir = os.listdir - - # Mock the extension.service directory to include controller interface files - def mock_os_list_dir(path): - if "/sys/fs/cgroup/system.slice/walinuxagent.service" in path: - return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat', 'cpu.stat'] - return original_list_dir(path) - - with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: - # Mock service directory - mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/walinuxagent.service') - - with patch('os.listdir', side_effect=mock_os_list_dir): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") - self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The mount point for the CPU controller is incorrect") - self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The mount point for the 
memory controller is incorrect") + "The cgroup path for the memory controller is incorrect") - def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_mounted(self): - with mock_cgroup_v2_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.is_controller_enabled', return_value=True): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=('/sys/fs/cgroup', None)): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") - self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The mount point for the CPU controller is incorrect") - self.assertIsNone(memory, - "The mount point for the memory controller is None so unit cgroup should be None") - - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup')): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") - self.assertIsNone(cpu, "The mount point for the CPU controller is None so unit cgroup should be None") - self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The mount point for the memory controller is incorrect") - - def test_get_process_cgroup_v2_path_should_return_None_if_either_relative_path_is_None(self): + def test_get_process_cgroup_paths_should_return_the_cgroup_v2_cgroup_paths(self): with mock_cgroup_v2_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.is_controller_enabled', return_value=True): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_process_cgroup_relative_paths', return_value=('system.slice/walinuxagent.service', None)): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") - self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The mount point for the CPU controller is incorrect") - self.assertIsNone(memory, - "The relative cgroup 
path for the memory controller is None so unit cgroup should be None") - - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_process_cgroup_relative_paths', return_value=(None, 'system.slice/walinuxagent.service')): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") - self.assertIsNone(cpu, "The relative cgroup path for the cpu controller is None so unit cgroup should be None") - self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The mount point for the memory controller is incorrect") + cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The cgroup path for the CPU controller is incorrect") + self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The cgroup path for the memory controller is incorrect") def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_enabled(self): - original_list_dir = os.listdir - - # Mock the walinuxagent.service directory to include memory controller interface files - def mock_os_list_dir_memory(path): - if "/sys/fs/cgroup/system.slice/walinuxagent.service" in path: - return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat'] - return original_list_dir(path) - - # Mock the walinuxagent.service directory to include cpu controller interface files - def mock_os_list_dir_cpu(path): - if "/sys/fs/cgroup/system.slice/walinuxagent.service" in path: - return ['cgroup.controllers', 'cgroup.subtree_control', 'cpu.stat'] - return original_list_dir(path) - - with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: - # Mock service directory - mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/walinuxagent.service') - - with patch('os.listdir', side_effect=mock_os_list_dir_cpu): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + with mock_cgroup_v2_environment(self.tmp_dir): + with 
patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_controller_root_paths', return_value=('/sys/fs/cgroup', None)): + cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The mount point for the CPU controller is incorrect") + "The cgroup path for the CPU controller is incorrect") self.assertIsNone(memory, - "The memory controller is not enabled so unit cgroup should be None") + "The cgroup path for the memory controller is None so unit cgroup should be None") - with patch('os.listdir', side_effect=mock_os_list_dir_memory): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") - self.assertIsNone(cpu, "The cpu controller is not enabled so unit cgroup should be None") + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_controller_root_paths', return_value=(None, '/sys/fs/cgroup')): + cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, "The cgroup path for the CPU controller is None so unit cgroup should be None") self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The mount point for the memory controller is incorrect") + "The cgroup path for the memory controller is incorrect") - def test_get_cpu_and_memory_mount_points_should_return_the_cgroup_v2_mount_points(self): + def test_get_process_cgroup_v2_path_should_return_None_if_relative_path_is_None(self): with mock_cgroup_v2_environment(self.tmp_dir): - cpu, memory = cgroupapi.get_cgroup_api().get_cgroup_mount_points() - self.assertEqual(cpu, '/sys/fs/cgroup', "The mount point for the CPU controller is incorrect") - self.assertEqual(memory, '/sys/fs/cgroup', "The mount point for the memory controller is incorrect") + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_process_cgroup_relative_paths', return_value=(None, None)): + cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, 
"The relative cgroup path for the cpu controller is None so unit cgroup should be None") + self.assertIsNone(memory, + "The relative cgroup path for the memory controller is None so unit cgroup should be None") + + def test_get_controller_root_paths_should_return_the_cgroup_v2_root_cgroup_path(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cpu, memory = get_cgroup_api().get_controller_root_paths() + self.assertEqual(cpu, '/sys/fs/cgroup', "The root cgroup for the CPU controller is incorrect") + self.assertEqual(memory, '/sys/fs/cgroup', "The root cgroup for the memory controller is incorrect") def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v2_relative_paths(self): with mock_cgroup_v2_environment(self.tmp_dir): - cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths('self') + cpu, memory = get_cgroup_api().get_process_cgroup_relative_paths('self') self.assertEqual(cpu, "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") self.assertEqual(memory, "system.slice/walinuxagent.service", "The relative memory for the CPU cgroup is incorrect") - @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_start_extension_cgroups_v2_command_should_raise_exception(self, _): - with mock_cgroup_v2_environment(self.tmp_dir): - with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file: - cgroups_exception_raised = False - try: - cgroupapi.get_cgroup_api().start_extension_command( - extension_name="Microsoft.Compute.TestExtension-1.2.3", - command="A_TEST_COMMAND", - cmd_name="test", - shell=True, - timeout=300, - cwd=self.tmp_dir, - env={}, - stdout=output_file, - stderr=output_file) - except CGroupsException: - cgroups_exception_raised = True - self.assertTrue(cgroups_exception_raised) - class SystemdCgroupsApiMockedFileSystemTestCase(_MockedFileSystemTestCase): def test_cleanup_legacy_cgroups_should_remove_legacy_cgroups(self): @@ -540,7 +376,7 @@ 
def test_cleanup_legacy_cgroups_should_remove_legacy_cgroups(self): legacy_memory_cgroup = CGroupsTools.create_legacy_agent_cgroup(self.cgroups_file_system_root, "memory", '') with patch("azurelinuxagent.ga.cgroupapi.get_agent_pid_file_path", return_value=daemon_pid_file): - legacy_cgroups = SystemdCgroupsApi().cleanup_legacy_cgroups() + legacy_cgroups = CGroupUtil.cleanup_legacy_cgroups() self.assertEqual(legacy_cgroups, 2, "cleanup_legacy_cgroups() did not find all the expected cgroups") self.assertFalse(os.path.exists(legacy_cpu_cgroup), "cleanup_legacy_cgroups() did not remove the CPU legacy cgroup") diff --git a/tests/ga/test_cgroupconfigurator.py b/tests/ga/test_cgroupconfigurator.py index b097a2602..b29c4db82 100644 --- a/tests/ga/test_cgroupconfigurator.py +++ b/tests/ga/test_cgroupconfigurator.py @@ -35,8 +35,7 @@ from azurelinuxagent.common.future import ustr from azurelinuxagent.common.utils import shellutil, fileutil from tests.lib.mock_environment import MockCommand -from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, UnitFilePaths, mock_cgroup_v2_environment, \ - mock_cgroup_v1_and_v2_environment +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, UnitFilePaths, mock_cgroup_v2_environment from tests.lib.tools import AgentTestCase, patch, mock_sleep, data_dir, is_python_version_26_or_34, skip_if_predicate_true from tests.lib.miscellaneous_tools import format_processes, wait_for @@ -83,24 +82,6 @@ def _get_cgroup_configurator_v2(self, initialize=True, enable=True, mock_command configurator.initialize() yield configurator - @contextlib.contextmanager - def _get_cgroup_configurator_v1_and_v2(self, initialize=True, enable=True, mock_commands=None): - CGroupConfigurator._instance = None - configurator = CGroupConfigurator.get_instance() - CGroupsTelemetry.reset() - with mock_cgroup_v1_and_v2_environment(self.tmp_dir) as mock_environment: - if mock_commands is not None: - for command in mock_commands: - 
mock_environment.add_command(command) - configurator.mocks = mock_environment - if initialize: - if not enable: - with patch.object(configurator, "enable"): - configurator.initialize() - else: - configurator.initialize() - yield configurator - def test_initialize_should_enable_cgroups_v1(self): with self._get_cgroup_configurator() as configurator: self.assertTrue(configurator.enabled(), "cgroups were not enabled") @@ -109,8 +90,12 @@ def test_initialize_should_not_enable_cgroups_v2(self): with self._get_cgroup_configurator_v2() as configurator: self.assertFalse(configurator.enabled(), "cgroups were enabled") - def test_initialize_should_not_enable_when_cgroup_api_is_none(self): - with patch('azurelinuxagent.ga.cgroupconfigurator.get_cgroup_api', return_value=None): + def test_initialize_should_not_enable_when_cgroup_api_cannot_be_determined(self): + # Mock cgroup api to raise CGroupsException + def mock_get_cgroup_api(): + raise CGroupsException("") + + with patch('azurelinuxagent.ga.cgroupconfigurator.get_cgroup_api', side_effect=mock_get_cgroup_api): with self._get_cgroup_configurator() as configurator: self.assertFalse(configurator.enabled(), "cgroups were enabled") @@ -145,16 +130,6 @@ def test_initialize_should_start_tracking_other_controllers_when_one_is_not_pres self.assertFalse(any(cg for cg in tracked.values() if cg.name == 'walinuxagent.service' and 'memory' in cg.path), "The Agent's memory should not be tracked. Tracked: {0}".format(tracked)) - def test_initialize_should_start_tracking_any_controllers_in_v1_if_others_in_v2(self): - # This mock environment has cpu controller in v1 and memory controller in v2 - with self._get_cgroup_configurator_v1_and_v2() as configurator: - tracked = CGroupsTelemetry._tracked - - self.assertTrue(configurator.enabled(), "Cgroups should be enabled") - self.assertFalse( - any(cg for cg in tracked.values() if cg.name == 'walinuxagent.service' and 'memory' in cg.path), - "The Agent's memory should not be tracked. 
Tracked: {0}".format(tracked)) - def test_initialize_should_not_enable_cgroups_when_the_cpu_and_memory_controllers_are_not_present(self): command_mocks = [MockCommand(r"^findmnt -t cgroup --noheadings$", '''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd @@ -512,52 +487,32 @@ def mock_popen(command_arg, *args, **kwargs): self.assertIn("A TEST EXCEPTION", str(context_manager.exception)) @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_start_extension_command_should_disable_cgroups_and_invoke_the_command_directly_if_v2_is_used(self, _): + def test_start_extension_command_should_not_use_systemd_when_using_cgroup_v2(self, _): with self._get_cgroup_configurator_v2() as configurator: - configurator.enable() # NOTE: Cgroups should not currently be enabled if v2 is detected. Adding this test to guarantee extensions are run correctly if cgroups v2 api is incorrectly called. - - with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file: - with patch("azurelinuxagent.ga.cgroupstelemetry.add_event") as mock_add_event: - with patch("subprocess.Popen", wraps=subprocess.Popen) as popen_patch: - CGroupsTelemetry.reset() - - command = "echo TEST_OUTPUT" - - command_output = configurator.start_extension_command( - extension_name="Microsoft.Compute.TestExtension-1.2.3", - command=command, - cmd_name="test", - timeout=300, - shell=True, - cwd=self.tmp_dir, - env={}, - stdout=output_file, - stderr=output_file) - - self.assertFalse(configurator.enabled(), "Cgroups should have been disabled") - - disabled_events = [kwargs for _, kwargs in mock_add_event.call_args_list if - kwargs['op'] == WALAEventOperation.CGroupsDisabled] - - self.assertTrue(len(disabled_events) == 1, - "Exactly one CGroupsDisabled telemetry event should have been issued. 
Found: {0}".format( - disabled_events)) - self.assertIn("Failed to start Microsoft.Compute.TestExtension-1.2.3 using cgroups", - disabled_events[0]['message'], - "The cgroups failure was not included in the telemetry message") - self.assertEqual(False, disabled_events[0]['is_success'], - "The telemetry event should indicate a failure") + self.assertFalse(configurator.enabled()) - extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if command in args[0]] - - self.assertEqual(1, len(extension_calls), - "The extension should have been invoked exactly twice") - self.assertEqual(command, extension_calls[0], - "The second call to the extension should not have used systemd") - - self.assertEqual(len(CGroupsTelemetry._tracked), 0, "No cgroups should have been created") + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.start_extension_command") as v2_extension_start_command: + with patch("azurelinuxagent.ga.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as patcher: + configurator.start_extension_command( + extension_name="Microsoft.Compute.TestExtension-1.2.3", + command="date", + cmd_name="test", + timeout=300, + shell=False, + cwd=self.tmp_dir, + env={}, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) - self.assertIn("TEST_OUTPUT\n", command_output, "The test output was not captured") + command_calls = [args[0] for args, _ in patcher.call_args_list if + len(args) > 0 and "date" in args[0]] + self.assertFalse(v2_extension_start_command.called) + self.assertEqual(len(command_calls), 1, + "The test command should have been called exactly once [{0}]".format( + command_calls)) + self.assertNotIn("systemd-run", command_calls[0], + "The command should not have been invoked using systemd") + self.assertEqual(command_calls[0], "date", "The command line should not have been modified") @patch('time.sleep', side_effect=lambda _: mock_sleep()) def 
test_start_extension_command_should_disable_cgroups_and_invoke_the_command_directly_if_systemd_fails(self, _): @@ -566,7 +521,7 @@ def test_start_extension_command_should_disable_cgroups_and_invoke_the_command_d configurator.mocks.add_command(MockCommand("systemd-run", return_value=1, stdout='', stderr='Failed to start transient scope unit: syntax error')) with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file: - with patch("azurelinuxagent.ga.cgroupstelemetry.add_event") as mock_add_event: + with patch("azurelinuxagent.ga.cgroupapi.add_event") as mock_add_event: with patch("subprocess.Popen", wraps=subprocess.Popen) as popen_patch: CGroupsTelemetry.reset() @@ -952,7 +907,7 @@ def get_completed_process(): agent_processes = [os.getppid(), os.getpid()] + agent_command_processes + [start_extension.systemd_run_pid] other_processes = [1, get_completed_process()] + extension_processes - with patch("azurelinuxagent.ga.cgroupconfigurator.CGroupsApi.get_processes_in_cgroup", return_value=agent_processes + other_processes): + with patch("azurelinuxagent.ga.cgroupapi._SystemdCgroupApi.get_processes_in_cgroup", return_value=agent_processes + other_processes): with self.assertRaises(CGroupsException) as context_manager: configurator._check_processes_in_agent_cgroup() @@ -996,7 +951,7 @@ def test_check_cgroups_should_disable_cgroups_when_a_check_fails(self): patchers.append(p) p.start() - with patch("azurelinuxagent.ga.cgroupstelemetry.add_event") as add_event: + with patch("azurelinuxagent.ga.cgroupapi.add_event") as add_event: configurator.enable() tracked_metrics = [ diff --git a/tests/ga/test_cgroupconfigurator_sudo.py b/tests/ga/test_cgroupconfigurator_sudo.py index 6ff314496..14b544f5b 100644 --- a/tests/ga/test_cgroupconfigurator_sudo.py +++ b/tests/ga/test_cgroupconfigurator_sudo.py @@ -139,7 +139,7 @@ def test_start_extension_command_should_not_use_fallback_option_if_extension_tim with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr: 
with patch("azurelinuxagent.ga.extensionprocessutil.wait_for_process_completion_or_timeout", return_value=[True, None, 0]): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi._is_systemd_failure", + with patch("azurelinuxagent.ga.cgroupapi._SystemdCgroupApi._is_systemd_failure", return_value=False): with self.assertRaises(ExtensionError) as context_manager: configurator.start_extension_command( diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index c257cefed..58d58505b 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1955,7 +1955,7 @@ def iterator(*_, **__): with patch('azurelinuxagent.ga.remoteaccess.get_remote_access_handler'): with patch('azurelinuxagent.ga.agent_update_handler.get_agent_update_handler'): with patch('azurelinuxagent.ga.update.initialize_event_logger_vminfo_common_parameters'): - with patch('azurelinuxagent.ga.cgroupapi.CGroupsApi.cgroups_supported', return_value=False): # skip all cgroup stuff + with patch('azurelinuxagent.ga.cgroupapi.CGroupUtil.cgroups_supported', return_value=False): # skip all cgroup stuff with patch('azurelinuxagent.ga.update.is_log_collection_allowed', return_value=True): with patch('time.sleep'): with patch('sys.exit'): diff --git a/tests/lib/mock_cgroup_environment.py b/tests/lib/mock_cgroup_environment.py index 4b3e1534e..4c3d19c5a 100644 --- a/tests/lib/mock_cgroup_environment.py +++ b/tests/lib/mock_cgroup_environment.py @@ -76,9 +76,9 @@ /sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids '''), - MockCommand(r"^findmnt -t cgroup2 --noheadings$", -'''/sys/fs/cgroup/unified cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate -'''), + MockCommand(r"^findmnt -t cgroup2 --noheadings$", ''), + + MockCommand(r"^stat -f --format=%T /sys/fs/cgroup$", 'tmpfs'), ] @@ -89,10 +89,12 @@ MockCommand(r"^findmnt -t cgroup --noheadings$", ''), + MockCommand(r"^stat -f --format=%T /sys/fs/cgroup$", 'cgroup2fs'), + ] # Mocked commands when memory controller is in 
v2, but all other controllers are in v1 -_MOCKED_COMMANDS_V1_AND_V2 = [ +_MOCKED_COMMANDS_HYBRID = [ MockCommand(r"^findmnt -t cgroup --noheadings$", '''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd /sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices @@ -103,21 +105,25 @@ /sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset /sys/fs/cgroup/misc cgroup cgroup rw,nosuid,nodev,noexec,relatime,misc /sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct +/sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory /sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer /sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb /sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids '''), MockCommand(r"^findmnt -t cgroup2 --noheadings$", -'''/sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot +'''/sys/fs/cgroup/unified cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate '''), + MockCommand(r"^stat -f --format=%T /sys/fs/cgroup$", 'tmpfs'), + + MockCommand(r"^stat -f --format=%T /sys/fs/cgroup/unified$", 'cgroup2fs'), + ] _MOCKED_FILES_V1 = [ ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_self_cgroup')), - (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_pid_cgroup')), - ("/sys/fs/cgroup/unified/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v1', 'sys_fs_cgroup_cgroup.subtree_control')) + (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_pid_cgroup')) ] _MOCKED_FILES_V2 = [ @@ -129,10 +135,10 @@ ] # Mocked files when memory controller is in v2, but all other controllers are in v1 -_MOCKED_FILES_V1_AND_V2 = [ - ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v1_and_v2', 'proc_self_cgroup')), - (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 
'v1_and_v2', 'proc_pid_cgroup')), - ("/sys/fs/cgroup/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v1_and_v2', 'sys_fs_cgroup_cgroup.subtree_control')) +_MOCKED_FILES_HYBRID = [ + ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_self_cgroup')), + (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_pid_cgroup')), + ("/sys/fs/cgroup/unified/cgroup.controllers", os.path.join(data_dir, 'cgroups', 'hybrid', 'sys_fs_cgroup_cgroup.controllers')) ] _MOCKED_PATHS = [ @@ -140,12 +146,6 @@ r"^(/etc/systemd/system)" ] -_MOCKED_PATHS_V2 = [ - r"^(/sys/fs/cgroup/azure.slice/walinuxagent.service)", - r"^(/sys/fs/cgroup/system.slice/walinuxagent.service)", - r"^(/sys/fs/cgroup/system.slice/extension.service)" -] - class UnitFilePaths: walinuxagent = "/lib/systemd/system/walinuxagent.service" @@ -176,7 +176,7 @@ def mock_cgroup_v1_environment(tmp_dir): (os.path.join(data_dir, 'init', 'azure-vmextensions.slice'), UnitFilePaths.vmextensions) ] - with patch('azurelinuxagent.ga.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): + with patch('azurelinuxagent.ga.cgroupapi.CGroupUtil.cgroups_supported', return_value=True): with patch('azurelinuxagent.common.osutil.systemd.is_systemd', return_value=True): with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V1, paths=_MOCKED_PATHS, files=_MOCKED_FILES_V1, data_files=data_files) as mock: yield mock @@ -194,17 +194,16 @@ def mock_cgroup_v2_environment(tmp_dir): (os.path.join(data_dir, 'init', 'azure-vmextensions.slice'), UnitFilePaths.vmextensions) ] - with patch('azurelinuxagent.ga.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): + with patch('azurelinuxagent.ga.cgroupapi.CGroupUtil.cgroups_supported', return_value=True): with patch('azurelinuxagent.common.osutil.systemd.is_systemd', return_value=True): - with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V2, paths=_MOCKED_PATHS + _MOCKED_PATHS_V2, 
files=_MOCKED_FILES_V2, data_files=data_files) as mock: + with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V2, paths=_MOCKED_PATHS, files=_MOCKED_FILES_V2, data_files=data_files) as mock: yield mock @contextlib.contextmanager -def mock_cgroup_v1_and_v2_environment(tmp_dir): +def mock_cgroup_hybrid_environment(tmp_dir): """ - Creates a mock environment for machine which has controllers in cgroups v1 and v2 hierarchies used by the tests - related to cgroups (currently it only provides support for systemd platforms). The agent does not currently support - this scenario. + Creates a mock environment for machine which uses cgroup hybrid mode used by the tests related to cgroups (currently + it only provides support for systemd platforms). """ data_files = [ (os.path.join(data_dir, 'init', 'walinuxagent.service'), UnitFilePaths.walinuxagent), @@ -212,7 +211,7 @@ def mock_cgroup_v1_and_v2_environment(tmp_dir): (os.path.join(data_dir, 'init', 'azure-vmextensions.slice'), UnitFilePaths.vmextensions) ] - with patch('azurelinuxagent.ga.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): + with patch('azurelinuxagent.ga.cgroupapi.CGroupUtil.cgroups_supported', return_value=True): with patch('azurelinuxagent.common.osutil.systemd.is_systemd', return_value=True): - with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V1_AND_V2, paths=_MOCKED_PATHS, files=_MOCKED_FILES_V1_AND_V2, data_files=data_files) as mock: + with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_HYBRID, paths=_MOCKED_PATHS, files=_MOCKED_FILES_HYBRID, data_files=data_files) as mock: yield mock diff --git a/tests/test_agent.py b/tests/test_agent.py index 906392b61..16d98aeca 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -21,10 +21,12 @@ from azurelinuxagent.agent import parse_args, Agent, usage, AgentCommands from azurelinuxagent.common import conf -from azurelinuxagent.ga import logcollector, 
cgroupconfigurator, cgroupapi +from azurelinuxagent.common.exception import CGroupsException +from azurelinuxagent.ga import logcollector, cgroupconfigurator from azurelinuxagent.common.utils import fileutil +from azurelinuxagent.ga.cgroupapi import get_cgroup_api from azurelinuxagent.ga.collect_logs import CollectLogsHandler -from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, mock_cgroup_v1_and_v2_environment +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment from tests.lib.tools import AgentTestCase, data_dir, Mock, patch EXPECTED_CONFIGURATION = \ @@ -250,10 +252,10 @@ def mock_cgroup_paths(*args, **kwargs): if args and args[0] == "self": relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) return (relative_path, relative_path) - return cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + return get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) with mock_cgroup_v1_environment(self.tmp_dir): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_paths", + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup_paths", side_effect=mock_cgroup_paths): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) agent.collect_logs(is_full_mode=True) @@ -269,10 +271,14 @@ def test_doesnt_call_collect_logs_when_cgroup_api_cannot_be_determined(self, moc CollectLogsHandler.enable_monitor_cgroups_check() mock_log_collector.run = Mock() + # Mock cgroup api to raise CGroupsException + def mock_get_cgroup_api(): + raise CGroupsException("") + def raise_on_sys_exit(*args): raise RuntimeError(args[0] if args else "Exiting") - with patch("azurelinuxagent.agent.get_cgroup_api", return_value=None): + with patch("azurelinuxagent.agent.get_cgroup_api", side_effect=mock_get_cgroup_api): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) with 
patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: @@ -294,13 +300,13 @@ def test_doesnt_call_collect_logs_on_invalid_cgroups_v1(self, mock_log_collector def mock_cgroup_paths(*args, **kwargs): if args and args[0] == "self": return ("NOT_THE_CORRECT_PATH", "NOT_THE_CORRECT_PATH") - return cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + return get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) def raise_on_sys_exit(*args): raise RuntimeError(args[0] if args else "Exiting") with mock_cgroup_v1_environment(self.tmp_dir): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_paths", side_effect=mock_cgroup_paths): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup_paths", side_effect=mock_cgroup_paths): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: @@ -313,7 +319,7 @@ def raise_on_sys_exit(*args): CollectLogsHandler.disable_monitor_cgroups_check() @patch("azurelinuxagent.agent.LogCollector") - def test_doesnt_call_collect_logs_when_controllers_mounted_in_different_hierarchies(self, mock_log_collector): + def test_doesnt_call_collect_logs_if_either_controller_not_mounted(self, mock_log_collector): try: CollectLogsHandler.enable_monitor_cgroups_check() mock_log_collector.run = Mock() @@ -323,13 +329,13 @@ def mock_cgroup_paths(*args, **kwargs): if args and args[0] == "self": relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) return (None, relative_path) - return cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + return get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) def raise_on_sys_exit(*args): raise RuntimeError(args[0] if args else "Exiting") - with mock_cgroup_v1_and_v2_environment(self.tmp_dir): - with 
patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_paths", + with mock_cgroup_v1_environment(self.tmp_dir): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup_paths", side_effect=mock_cgroup_paths): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index 97d44a73d..782a901b6 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -53,7 +53,7 @@ variable: - no_outbound_connections - publish_hostname - recover_network_interface - - cgroups_v2_disabled + - cgroup_v2_disabled # # Parameters used to create test VMs diff --git a/tests_e2e/test_suites/cgroups_v2_disabled.yml b/tests_e2e/test_suites/cgroup_v2_disabled.yml similarity index 76% rename from tests_e2e/test_suites/cgroups_v2_disabled.yml rename to tests_e2e/test_suites/cgroup_v2_disabled.yml index 5a075a2a2..bad067e6d 100644 --- a/tests_e2e/test_suites/cgroups_v2_disabled.yml +++ b/tests_e2e/test_suites/cgroup_v2_disabled.yml @@ -2,9 +2,9 @@ # The test suite verifies that the agent does not enable resource enforcement and monitoring on machines which are # using cgroups v2. This suite will be removed once cgroups v2 is supported. 
# -name: "Cgroupsv2Disabled" +name: "Cgroupv2Disabled" tests: - - "cgroups_v2_disabled/cgroups_v2_disabled.py" + - "cgroup_v2_disabled/cgroup_v2_disabled.py" images: - "ubuntu_2204" - "ubuntu_2404" \ No newline at end of file diff --git a/tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py b/tests_e2e/tests/cgroup_v2_disabled/cgroup_v2_disabled.py similarity index 75% rename from tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py rename to tests_e2e/tests/cgroup_v2_disabled/cgroup_v2_disabled.py index 9f6e117e6..8f88bf880 100644 --- a/tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py +++ b/tests_e2e/tests/cgroup_v2_disabled/cgroup_v2_disabled.py @@ -27,11 +27,11 @@ from tests_e2e.tests.lib.ssh_client import SshClient -class Cgroupsv2Disabled(AgentVmTest): +class Cgroupv2Disabled(AgentVmTest): """ The test verifies that the agent does not enable resource enforcement and monitoring on machines which are using - cgroups v2. It also checks that the agent correctly determined the controller mount points. This test will be - removed once cgroups v2 is supported. + cgroup v2. It also checks that the agent correctly determined the controller mount points. This test will be + removed once cgroup v2 is supported. 
""" def __init__(self, context: AgentVmTestContext): @@ -60,25 +60,25 @@ def run(self): # Verify that the agent chose v2 for resource enforcement and monitoring log.info("") - log.info("Checking that the agent chose cgroups v2 api for resource enforcement and monitoring...") - self.check_agent_log_contains('Using cgroups v2 for resource enforcement and monitoring', 'The agent should choose v2 for api resource enforcement and monitoring') + log.info("Checking that the agent chose cgroup v2 api for resource enforcement and monitoring...") + self.check_agent_log_contains('Using cgroup v2 for resource enforcement and monitoring', 'The agent should choose v2 for api resource enforcement and monitoring') # Verify that the agent determined the correct mount point for each controller log.info("") - log.info("Checking that the agent determined the correct mount point for each controller...") - self.check_agent_log_contains('The CPU cgroup controller is mounted at /sys/fs/cgroup', - 'The agent should identify the cpu controller to be mounted at /sys/fs/cgroup') - self.check_agent_log_contains('The memory cgroup controller is mounted at /sys/fs/cgroup', - 'The agent should identify the memory controller to be mounted at /sys/fs/cgroup') + log.info("Checking that the agent determined the correct root paths for each controller...") + self.check_agent_log_contains('The CPU cgroup controller root path is /sys/fs/cgroup', + 'The agent should identify the cpu controller to be at /sys/fs/cgroup') + self.check_agent_log_contains('The memory cgroup controller root path is /sys/fs/cgroup', + 'The agent should identify the memory controller to be at /sys/fs/cgroup') - # Verify that the agent does not support cgroups v2 + # Verify that the agent does not support cgroup v2 log.info("") - log.info("Checking that the agent does not use cgroups v2 for resource enforcement and monitoring...") - self.check_agent_log_contains('Agent and extensions resource monitoring is not currently supported 
on cgroups v2', - 'The agent should not attempt to use cgroups v2 for resource enforcement and monitoring') + log.info("Checking that the agent does not use cgroup v2 for resource enforcement and monitoring...") + self.check_agent_log_contains('Agent and extensions resource monitoring is not currently supported on cgroup v2', + 'The agent should not attempt to use cgroup v2 for resource enforcement and monitoring') self.check_agent_log_contains('Agent cgroups enabled: False', 'The agent should not enable cgroups when system is using v2') if __name__ == "__main__": - Cgroupsv2Disabled.run_from_command_line() + Cgroupv2Disabled.run_from_command_line() From cabfc70849d0ef8bd6f8cacfaae85fc4f44b0fd5 Mon Sep 17 00:00:00 2001 From: Maddie Ford Date: Tue, 26 Mar 2024 13:50:27 -0700 Subject: [PATCH 05/12] Improve comments --- azurelinuxagent/ga/cgroupapi.py | 37 ++++++++----------- ...s_fs_cgroup_unified_cgroup.subtree_control | 0 tests/ga/test_cgroupapi.py | 1 + tests/ga/test_cgroupconfigurator.py | 2 +- tests/lib/mock_cgroup_environment.py | 14 +++---- tests_e2e/test_suites/cgroup_v2_disabled.yml | 2 +- 6 files changed, 25 insertions(+), 31 deletions(-) delete mode 100644 tests/data/cgroups/v1/sys_fs_cgroup_unified_cgroup.subtree_control diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index 34caefa67..512f03496 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -207,13 +207,13 @@ def get_unit_cgroup_paths(self, unit_name): # Ex: ControlGroup=/azure.slice/walinuxagent.service # controlgroup_path[1:] = azure.slice/walinuxagent.service controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") - cpu_mount_point, memory_mount_point = self.get_controller_root_paths() + cpu_root_path, memory_root_path = self.get_controller_root_paths() - cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \ - if cpu_mount_point is not None else None + cpu_cgroup_path = os.path.join(cpu_root_path, 
controlgroup_path[1:]) \ + if cpu_root_path is not None else None - memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \ - if memory_mount_point is not None else None + memory_cgroup_path = os.path.join(memory_root_path, controlgroup_path[1:]) \ + if memory_root_path is not None else None return cpu_cgroup_path, memory_cgroup_path @@ -225,23 +225,22 @@ def get_process_cgroup_paths(self, process_id): """ cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) - cpu_mount_point, memory_mount_point = self.get_controller_root_paths() + cpu_root_path, memory_root_path = self.get_controller_root_paths() - cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \ - if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None + cpu_cgroup_path = os.path.join(cpu_root_path, cpu_cgroup_relative_path) \ + if cpu_root_path is not None and cpu_cgroup_relative_path is not None else None - memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \ - if memory_mount_point is not None and memory_cgroup_relative_path is not None else None + memory_cgroup_path = os.path.join(memory_root_path, memory_cgroup_relative_path) \ + if memory_root_path is not None and memory_cgroup_relative_path is not None else None return cpu_cgroup_path, memory_cgroup_path def get_process_cgroup_relative_paths(self, process_id): # pylint: disable=W0613 """ Cgroup version specific. Returns a tuple with the path of the cpu and memory cgroups for the given process - (relative to the mount point of the corresponding controller). + (relative to the root path of the corresponding controller). The 'process_id' can be a numeric PID or the string "self" for the current process. - The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is - not mounted). 
+ The values returned can be None if the controller is not mounted or enabled. """ raise NotImplementedError() @@ -328,8 +327,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh # Some distros like ubuntu20 by default cpu and memory accounting enabled. Thus create nested cgroups under the extension slice # So disabling CPU and Memory accounting prevents from creating nested cgroups, so that all the counters will be present in extension Cgroup # since slice unit file configured with accounting enabled. - "systemd-run --property=CPUAccounting=no --property=MemoryAccounting=no --unit={0} --scope --slice={1} {2}".format( - scope, extension_slice_name, command), + "systemd-run --property=CPUAccounting=no --property=MemoryAccounting=no --unit={0} --scope --slice={1} {2}".format(scope, extension_slice_name, command), shell=shell, cwd=cwd, stdout=stdout, @@ -366,10 +364,8 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh except IOError as e: if e.errno == 2: # 'No such file or directory' - log_cgroup_info("The extension command already completed; will not track resource usage", - send_event=False) - log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(e)), - send_event=False) + log_cgroup_info("The extension command already completed; will not track resource usage", send_event=False) + log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(e)), send_event=False) except Exception as e: log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(e)), send_event=False) @@ -472,6 +468,3 @@ def get_process_cgroup_relative_paths(self, process_id): cpu_path = path return cpu_path, memory_path - - def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): # pylint: disable=W0613 - raise 
NotImplementedError() diff --git a/tests/data/cgroups/v1/sys_fs_cgroup_unified_cgroup.subtree_control b/tests/data/cgroups/v1/sys_fs_cgroup_unified_cgroup.subtree_control deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/ga/test_cgroupapi.py b/tests/ga/test_cgroupapi.py index 0259e93d5..76ad73dd7 100644 --- a/tests/ga/test_cgroupapi.py +++ b/tests/ga/test_cgroupapi.py @@ -99,6 +99,7 @@ def test_get_cgroup_api_is_v1_when_hybrid_in_use(self): def test_get_cgroup_api_raises_exception_when_hybrid_in_use_and_controllers_available_in_unified_hierarchy(self): with mock_cgroup_hybrid_environment(self.tmp_dir): + # Mock /sys/fs/cgroup/unified/cgroup.controllers file to have available controllers with patch("os.path.exists", return_value=True): with patch('azurelinuxagent.common.utils.fileutil.read_file', return_value="cpu memory"): with self.assertRaises(CGroupsException) as context: diff --git a/tests/ga/test_cgroupconfigurator.py b/tests/ga/test_cgroupconfigurator.py index b29c4db82..0ddf829e1 100644 --- a/tests/ga/test_cgroupconfigurator.py +++ b/tests/ga/test_cgroupconfigurator.py @@ -487,7 +487,7 @@ def mock_popen(command_arg, *args, **kwargs): self.assertIn("A TEST EXCEPTION", str(context_manager.exception)) @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_start_extension_command_should_not_use_systemd_when_using_cgroup_v2(self, _): + def test_start_extension_command_should_not_use_systemd_when_cgroup_v2_enabled(self, _): with self._get_cgroup_configurator_v2() as configurator: self.assertFalse(configurator.enabled()) diff --git a/tests/lib/mock_cgroup_environment.py b/tests/lib/mock_cgroup_environment.py index 4c3d19c5a..d9f79cb6a 100644 --- a/tests/lib/mock_cgroup_environment.py +++ b/tests/lib/mock_cgroup_environment.py @@ -20,7 +20,7 @@ from tests.lib.tools import patch, data_dir from tests.lib.mock_environment import MockEnvironment, MockCommand -# Mocked commands which are common between v1 and v2 +# Mocked commands which 
are common between v1, v2, and hybrid cgroup environments _MOCKED_COMMANDS_COMMON = [ MockCommand(r"^systemctl --version$", '''systemd 237 @@ -93,7 +93,6 @@ ] -# Mocked commands when memory controller is in v2, but all other controllers are in v1 _MOCKED_COMMANDS_HYBRID = [ MockCommand(r"^findmnt -t cgroup --noheadings$", '''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd @@ -134,7 +133,6 @@ ("/sys/fs/cgroup/azure.slice/walinuxagent.service/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control_empty')) ] -# Mocked files when memory controller is in v2, but all other controllers are in v1 _MOCKED_FILES_HYBRID = [ ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_self_cgroup')), (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_pid_cgroup')), @@ -166,7 +164,7 @@ class UnitFilePaths: @contextlib.contextmanager def mock_cgroup_v1_environment(tmp_dir): """ - Creates a mock environment for cgroups v1 hierarchy used by the tests related to cgroups (currently it only + Creates a mock environment for cgroup v1 hierarchy used by the tests related to cgroups (currently it only provides support for systemd platforms). The command output used in __MOCKED_COMMANDS comes from an Ubuntu 20 system. """ @@ -181,10 +179,11 @@ def mock_cgroup_v1_environment(tmp_dir): with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V1, paths=_MOCKED_PATHS, files=_MOCKED_FILES_V1, data_files=data_files) as mock: yield mock + @contextlib.contextmanager def mock_cgroup_v2_environment(tmp_dir): """ - Creates a mock environment for cgroups v2 hierarchy used by the tests related to cgroups (currently it only + Creates a mock environment for cgroup v2 hierarchy used by the tests related to cgroups (currently it only provides support for systemd platforms). The command output used in __MOCKED_COMMANDS comes from an Ubuntu 22 system. 
""" @@ -199,11 +198,12 @@ def mock_cgroup_v2_environment(tmp_dir): with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V2, paths=_MOCKED_PATHS, files=_MOCKED_FILES_V2, data_files=data_files) as mock: yield mock + @contextlib.contextmanager def mock_cgroup_hybrid_environment(tmp_dir): """ - Creates a mock environment for machine which uses cgroup hybrid mode used by the tests related to cgroups (currently - it only provides support for systemd platforms). + Creates a mock environment for cgroup hybrid hierarchy used by the tests related to cgroups (currently it only + provides support for systemd platforms). """ data_files = [ (os.path.join(data_dir, 'init', 'walinuxagent.service'), UnitFilePaths.walinuxagent), diff --git a/tests_e2e/test_suites/cgroup_v2_disabled.yml b/tests_e2e/test_suites/cgroup_v2_disabled.yml index bad067e6d..cf25ecdcf 100644 --- a/tests_e2e/test_suites/cgroup_v2_disabled.yml +++ b/tests_e2e/test_suites/cgroup_v2_disabled.yml @@ -1,6 +1,6 @@ # # The test suite verifies that the agent does not enable resource enforcement and monitoring on machines which are -# using cgroups v2. This suite will be removed once cgroups v2 is supported. +# using cgroup v2. This suite will be removed once cgroup v2 is supported. 
# name: "Cgroupv2Disabled" tests: From 00b9c3e4b461867926764c5051f47c2c4056f0f4 Mon Sep 17 00:00:00 2001 From: Maddie Ford Date: Tue, 26 Mar 2024 13:54:30 -0700 Subject: [PATCH 06/12] Pylint --- azurelinuxagent/ga/cgroupapi.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index 512f03496..c2ff299cd 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -468,3 +468,7 @@ def get_process_cgroup_relative_paths(self, process_id): cpu_path = path return cpu_path, memory_path + + def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, + error_code=ExtensionErrorCodes.PluginUnknownFailure): + raise NotImplementedError() From 530ed56ecbb56ef1ee4c6f294b7f2e11d8c3f672 Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Tue, 2 Apr 2024 12:06:07 -0700 Subject: [PATCH 07/12] Address PR comments (#8) * Run unit tests * Clean up drop in files if cgroups are disabled * Init values for cgroup apis * Rever test change --- azurelinuxagent/ga/cgroupapi.py | 145 ++++++++++-------- azurelinuxagent/ga/cgroupconfigurator.py | 51 +++--- tests/ga/test_cgroupapi.py | 74 +++++++-- .../cgroup_v2_disabled/cgroup_v2_disabled.py | 8 - 4 files changed, 166 insertions(+), 112 deletions(-) diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index c2ff299cd..cae93c88b 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -144,15 +144,21 @@ def get_cgroup_api(): Determines which version of Cgroup should be used for resource enforcement and monitoring by the Agent and returns the corresponding Api. - Uses 'stat -f --format=%T /sys/fs/cgroups' to get the cgroup hierarchy in use. + Uses 'stat -f --format=%T /sys/fs/cgroup' to get the cgroup hierarchy in use. If the result is 'cgroup2fs', cgroup v2 is being used. 
If the result is 'tmpfs', cgroup v1 or a hybrid mode is being used. If the result of 'stat -f --format=%T /sys/fs/cgroup/unified' is 'cgroup2fs', then hybrid mode is being used. - Raises exception if an unknown mode is detected. Also raises exception if hybrid mode is detected and there are - controllers available to be enabled in the unified hierarchy (the agent does not support cgroups if there are - controllers simultaneously attached to v1 and v2 hierarchies). + Raises exception if cgroup filesystem mountpoint is not '/sys/fs/cgroup', or an unknown mode is detected. Also + raises exception if hybrid mode is detected and there are controllers available to be enabled in the unified + hierarchy (the agent does not support cgroups if there are controllers simultaneously attached to v1 and v2 + hierarchies). """ + if not os.path.exists(CGROUP_FILE_SYSTEM_ROOT): + v1_mount_point = shellutil.run_command(['findmnt', '-t', 'cgroup', '--noheadings']) + v2_mount_point = shellutil.run_command(['findmnt', '-t', 'cgroup2', '--noheadings']) + raise CGroupsException("Expected cgroup filesystem to be mounted at '/sys/fs/cgroup', but it is not.\n v1 mount point: {0}\n v2 mount point: {1}".format(v1_mount_point, v2_mount_point)) + root_hierarchy_mode = shellutil.run_command(["stat", "-f", "--format=%T", CGROUP_FILE_SYSTEM_ROOT]).rstrip() if root_hierarchy_mode == "cgroup2fs": @@ -270,33 +276,35 @@ class SystemdCgroupApiv1(_SystemdCgroupApi): """ def __init__(self): super(SystemdCgroupApiv1, self).__init__() - self._cgroup_mountpoints = {} + self._cgroup_mountpoints = self._get_controller_mountpoints() + + def _get_controller_mountpoints(self): + """ + In v1, each controller is mounted at a different path. Use findmnt to get each path. 
+ + the output of findmnt is similar to + $ findmnt -t cgroup --noheadings + /sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd + /sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory + /sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct + etc + + Returns a dictionary of the controller-path mappings. + """ + mount_points = {} + for line in shellutil.run_command(['findmnt', '-t', 'cgroup', '--noheadings']).splitlines(): + match = re.search(r'(?P\S+\/(?P\S+))\s+cgroup', line) + if match is not None: + path = match.group('path') + controller = match.group('controller') + if controller is not None and path is not None: + mount_points[controller] = path + return mount_points def get_controller_root_paths(self): - # In v1, each controller is mounted at a different path. Use findmnt to get each path and return cpu and memory - # mount points as a tuple. - # - # the output of findmnt is similar to - # $ findmnt -t cgroup --noheadings - # /sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd - # /sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory - # /sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct - # etc - # - if len(self._cgroup_mountpoints) == 0: - cpu = None - memory = None - for line in shellutil.run_command(['findmnt', '-t', 'cgroup', '--noheadings']).splitlines(): - match = re.search(r'(?P/\S+(memory|cpuacct))\s', line) - if match is not None: - path = match.group('path') - if 'cpu,cpuacct' in path: - cpu = path - else: - memory = path - self._cgroup_mountpoints = {'cpu': cpu, 'memory': memory} - - return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory'] + # Return a tuple representing the mountpoints for cpu and memory. Either should be None if the corresponding + # controller is not mounted. 
+ return self._cgroup_mountpoints.get('cpu,cpuacct'), self._cgroup_mountpoints.get('memory') def get_process_cgroup_relative_paths(self, process_id): # The contents of the file are similar to @@ -404,54 +412,59 @@ class SystemdCgroupApiv2(_SystemdCgroupApi): """ def __init__(self): super(SystemdCgroupApiv2, self).__init__() - self._root_cgroup_path = None - self._controllers_enabled_at_root = [] + self._root_cgroup_path = self._get_root_cgroup_path() + self._controllers_enabled_at_root = self._get_controllers_enabled_at_root() + + @staticmethod + def _get_root_cgroup_path(): + """ + In v2, there is a unified mount point shared by all controllers. Use findmnt to get the unified mount point. + + The output of findmnt is similar to + $ findmnt -t cgroup2 --noheadings + /sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot + + Returns None if the root cgroup cannot be determined from the output above. + """ + # + for line in shellutil.run_command(['findmnt', '-t', 'cgroup2', '--noheadings']).splitlines(): + match = re.search(r'(?P/\S+)\s+cgroup2', line) + if match is not None: + root_cgroup_path = match.group('path') + if root_cgroup_path is not None: + return root_cgroup_path + return None - def is_controller_enabled_at_root(self, controller): + def _get_controllers_enabled_at_root(self): """ - Returns True if the provided controller is enabled at the root cgroup. The cgroup.subtree_control file at the - root shows a space separated list of the controllers which are enabled to control resource distribution from - the root cgroup to its children. If a controller is listed here, then that controller is available to enable in - children cgroups. + Returns a list of the controllers enabled at the root cgroup. The cgroup.subtree_control file at the root shows + a space separated list of the controllers which are enabled to control resource distribution from the root + cgroup to its children. 
If a controller is listed here, then that controller is available to enable in children + cgroups. $ cat /sys/fs/cgroup/cgroup.subtree_control cpuset cpu io memory hugetlb pids rdma misc """ + controllers_enabled_at_root = [] if self._root_cgroup_path is not None: enabled_controllers_file = os.path.join(self._root_cgroup_path, 'cgroup.subtree_control') if os.path.exists(enabled_controllers_file): - enabled_controllers = fileutil.read_file(enabled_controllers_file).rstrip().split(" ") - if controller in enabled_controllers: - return True - - return False + controllers_enabled_at_root = fileutil.read_file(enabled_controllers_file).rstrip().split(" ") + return controllers_enabled_at_root def get_controller_root_paths(self): - # In v2, there is a unified mount point shared by all controllers. Use findmnt to get the unified mount point, - # and check if cpu and memory are enabled at the root. Return a tuple representing the root cgroups for cpu and - # memory. Either should be None if the corresponding controller is not enabled at the root. - # - # The output of findmnt is similar to - # $ findmnt -t cgroup2 --noheadings - # /sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot - # - # This check is necessary because all non-root "cgroup.subtree_control" files can only contain controllers - # which are enabled in the parent's "cgroup.subtree_control" file. 
- - if self._root_cgroup_path is None: - for line in shellutil.run_command(['findmnt', '-t', 'cgroup2', '--noheadings']).splitlines(): - match = re.search(r'(?P/\S+)\s+cgroup2', line) - if match is not None: - root_cgroup_path = match.group('path') - if root_cgroup_path is not None: - self._root_cgroup_path = root_cgroup_path - if self.is_controller_enabled_at_root('cpu'): - self._controllers_enabled_at_root.append('cpu') - if self.is_controller_enabled_at_root('memory'): - self._controllers_enabled_at_root.append('memory') - - root_cpu_path = self._root_cgroup_path if 'cpu' in self._controllers_enabled_at_root else None - root_memory_path = self._root_cgroup_path if 'memory' in self._controllers_enabled_at_root else None + # Return a tuple representing the root cgroups for cpu and memory. Either should be None if the corresponding + # controller is not enabled at the root. This check is necessary because all non-root "cgroup.subtree_control" + # files can only contain controllers which are enabled in the parent's "cgroup.subtree_control" file. + + root_cpu_path = None + root_memory_path = None + if self._root_cgroup_path is not None: + if 'cpu' in self._controllers_enabled_at_root: + root_cpu_path = self._root_cgroup_path + if 'memory' in self._controllers_enabled_at_root: + root_memory_path = self._root_cgroup_path + return root_cpu_path, root_memory_path def get_process_cgroup_relative_paths(self, process_id): diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index 4540b546b..90f778342 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -141,25 +141,6 @@ def initialize(self): # check whether cgroup monitoring is supported on the current distro self._cgroups_supported = CGroupUtil.cgroups_supported() if not self._cgroups_supported: - # This check is to reset the quotas if agent goes from cgroup supported to unsupported distros later in time. 
- agent_drop_in_path = systemd.get_agent_drop_in_path() - try: - if os.path.exists(agent_drop_in_path) and os.path.isdir(agent_drop_in_path): - files_to_cleanup = [] - agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE) - agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path, - _DROP_IN_FILE_CPU_ACCOUNTING) - agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path, - _DROP_IN_FILE_MEMORY_ACCOUNTING) - agent_drop_in_file_cpu_quota = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_QUOTA) - files_to_cleanup.extend([agent_drop_in_file_slice, agent_drop_in_file_cpu_accounting, - agent_drop_in_file_memory_accounting, agent_drop_in_file_cpu_quota]) - self.__cleanup_all_files(files_to_cleanup) - self.__reload_systemd_config() - log_cgroup_info("Agent reset the quotas if distro: {0} goes from supported to unsupported list".format(get_distro()), send_event=True) - except Exception as err: - logger.warn("Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err)) - log_cgroup_info("Cgroup monitoring is not supported on {0}".format(get_distro()), send_event=True) return @@ -170,7 +151,7 @@ def initialize(self): log_cgroup_info("systemd version: {0}".format(systemd.get_version())) - # Determine which version of the Cgroup API should be used. If the correct version can't be determined, + # Determine which version of the Cgroup Api should be used. If the correct version can't be determined, # do not enable resource monitoring/enforcement. 
try: self._cgroups_api = get_cgroup_api() @@ -189,15 +170,15 @@ def initialize(self): self.__setup_azure_slice() + if self.cgroup_v2_enabled(): + log_cgroup_info("Agent and extensions resource monitoring is not currently supported on cgroup v2") + return + cpu_controller_root, memory_controller_root = self.__get_cgroup_controller_roots() self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroup_paths(agent_slice, cpu_controller_root, memory_controller_root) - if self.cgroup_v2_enabled(): - log_cgroup_info("Agent and extensions resource monitoring is not currently supported on cgroup v2") - return - if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None: self.enable() @@ -211,11 +192,13 @@ def initialize(self): self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path) CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup) - except Exception as exception: log_cgroup_warning("Error initializing cgroups: {0}".format(ustr(exception))) finally: log_cgroup_info('Agent cgroups enabled: {0}'.format(self._agent_cgroups_enabled)) + if not self._agent_cgroups_enabled: + log_cgroup_info("Agent will reset the quotas in case cgroups went from enabled to disabled") + self._reset_agent_cgroup_setup() self._initialized = True def __check_no_legacy_cgroups(self): @@ -328,6 +311,24 @@ def __setup_azure_slice(): CGroupConfigurator._Impl.__reload_systemd_config() + def _reset_agent_cgroup_setup(self): + try: + agent_drop_in_path = systemd.get_agent_drop_in_path() + if os.path.exists(agent_drop_in_path) and os.path.isdir(agent_drop_in_path): + files_to_cleanup = [] + agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE) + agent_drop_in_file_cpu_accounting = os.path.join(agent_drop_in_path, + _DROP_IN_FILE_CPU_ACCOUNTING) + agent_drop_in_file_memory_accounting = os.path.join(agent_drop_in_path, + _DROP_IN_FILE_MEMORY_ACCOUNTING) + 
agent_drop_in_file_cpu_quota = os.path.join(agent_drop_in_path, _DROP_IN_FILE_CPU_QUOTA) + files_to_cleanup.extend([agent_drop_in_file_slice, agent_drop_in_file_cpu_accounting, + agent_drop_in_file_memory_accounting, agent_drop_in_file_cpu_quota]) + self.__cleanup_all_files(files_to_cleanup) + self.__reload_systemd_config() + except Exception as err: + logger.warn("Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err)) + @staticmethod def __reload_systemd_config(): # reload the systemd configuration; the new slices will be used once the agent's service restarts diff --git a/tests/ga/test_cgroupapi.py b/tests/ga/test_cgroupapi.py index 76ad73dd7..b65861cb8 100644 --- a/tests/ga/test_cgroupapi.py +++ b/tests/ga/test_cgroupapi.py @@ -23,7 +23,6 @@ import tempfile from azurelinuxagent.common.exception import CGroupsException -from azurelinuxagent.common.utils.fileutil import read_file from azurelinuxagent.ga.cgroupapi import SystemdCgroupApiv1, SystemdCgroupApiv2, CGroupUtil, get_cgroup_api from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.osutil import systemd @@ -200,6 +199,38 @@ def test_get_controller_root_paths_should_return_the_cgroup_v1_controller_mount_ self.assertEqual(cpu, '/sys/fs/cgroup/cpu,cpuacct', "The root cgroup for the CPU controller is incorrect") self.assertEqual(memory, '/sys/fs/cgroup/memory', "The root cgroup for the memory controller is incorrect") + def test_get_controller_root_paths_should_return_None_if_either_controller_not_mounted(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'memory': '/sys/fs/cgroup/memory', 'io': '/sys/fs/cgroup/io'}): + cpu, memory = get_cgroup_api().get_controller_root_paths() + self.assertIsNone(cpu, "The CPU controller is mot mounted, so the cpu controller path should be None") + self.assertEqual(memory, '/sys/fs/cgroup/memory', "The 
root cgroup for the memory controller is incorrect") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct', 'io': '/sys/fs/cgroup/io'}): + cpu, memory = get_cgroup_api().get_controller_root_paths() + self.assertIsNone(memory, "The memory controller is mot mounted, so the memory controller path should be None") + self.assertEqual(cpu, '/sys/fs/cgroup/cpu,cpuacct', "The root cgroup for the cpu controller is incorrect") + + def test_get_controller_mountpoints_should_return_all_controller_mount_points(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroup_api = get_cgroup_api() + # Expected value comes from findmnt output in the mocked environment + self.assertEqual(cgroup_api._get_controller_mountpoints(), { + 'systemd': '/sys/fs/cgroup/systemd', + 'devices': '/sys/fs/cgroup/devices', + 'rdma': '/sys/fs/cgroup/rdma', + 'perf_event': '/sys/fs/cgroup/perf_event', + 'net_cls,net_prio': '/sys/fs/cgroup/net_cls,net_prio', + 'blkio': '/sys/fs/cgroup/blkio', + 'cpuset': '/sys/fs/cgroup/cpuset', + 'misc': '/sys/fs/cgroup/misc', + 'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct', + 'memory': '/sys/fs/cgroup/memory', + 'freezer': '/sys/fs/cgroup/freezer', + 'hugetlb': '/sys/fs/cgroup/hugetlb', + 'pids': '/sys/fs/cgroup/pids', + }, "The controller mountpoints are not correct") + def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v1_relative_paths(self): with mock_cgroup_v1_environment(self.tmp_dir): cpu, memory = get_cgroup_api().get_process_cgroup_relative_paths('self') @@ -281,23 +312,21 @@ def test_start_extension_cgroups_v1_command_should_use_systemd_to_execute_the_co class SystemdCgroupsApiv2TestCase(AgentTestCase): - def test_is_controller_enabled_at_root_should_return_False_if_controller_is_not_in_subtree_control_file(self): + def test_get_controllers_enabled_at_root_should_return_list_of_enabled_controllers(self): with 
mock_cgroup_v2_environment(self.tmp_dir): - # Mock the cgroup.subtree_control to not have cpu controller - def mock_read_file(path): - if "/sys/fs/cgroup/cgroup.subtree_control" in path: - return 'io memory pids\n' - return read_file(path) + cgroup_api = get_cgroup_api() + self.assertEqual(cgroup_api._get_controllers_enabled_at_root(), ['cpuset', 'cpu', 'io', 'memory', 'pids']) - with patch('azurelinuxagent.common.utils.fileutil.read_file', side_effect=mock_read_file): - self.assertFalse(get_cgroup_api().is_controller_enabled_at_root('cpu')) + def test_get_controllers_enabled_at_root_should_return_empty_list_if_root_cgroup_path_is_None(self): + with mock_cgroup_v2_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=None): + cgroup_api = get_cgroup_api() + self.assertEqual(cgroup_api._get_controllers_enabled_at_root(), []) - def test_is_controller_enabled_at_root_should_return_True_if_controller_is_in_subtree_control_file(self): + def test_get_root_cgroup_path_should_return_v2_cgroup_root(self): with mock_cgroup_v2_environment(self.tmp_dir): cgroup_api = get_cgroup_api() - cgroup_api.get_controller_root_paths() - self.assertTrue(cgroup_api.is_controller_enabled_at_root('memory')) - self.assertTrue(cgroup_api.is_controller_enabled_at_root('cpu')) + self.assertEqual(cgroup_api._get_root_cgroup_path(), '/sys/fs/cgroup') def test_get_unit_cgroup_paths_should_return_the_cgroup_v2_cgroup_paths(self): with mock_cgroup_v2_environment(self.tmp_dir): @@ -359,6 +388,25 @@ def test_get_controller_root_paths_should_return_the_cgroup_v2_root_cgroup_path( self.assertEqual(cpu, '/sys/fs/cgroup', "The root cgroup for the CPU controller is incorrect") self.assertEqual(memory, '/sys/fs/cgroup', "The root cgroup for the memory controller is incorrect") + def test_get_controller_root_paths_should_return_None_if_root_cgroup_path_is_None(self): + with mock_cgroup_v2_environment(self.tmp_dir): + with 
patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=None): + cpu, memory = get_cgroup_api().get_controller_root_paths() + self.assertIsNone(cpu, "The root cgroup path is None, so the CPU controller path should be None") + self.assertIsNone(memory, "The root cgroup path is None, so the memory controller path should be None") + + def test_get_controller_root_paths_should_return_None_if_either_controller_not_enabled(self): + with mock_cgroup_v2_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_controllers_enabled_at_root', return_value=['io', 'memory']): + cpu, memory = get_cgroup_api().get_controller_root_paths() + self.assertIsNone(cpu, "The CPU controller is not enabled, so the CPU controller path should be None") + self.assertEqual(memory, '/sys/fs/cgroup', "The root cgroup for the memory controller is incorrect") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_controllers_enabled_at_root', return_value=['cpu', 'io']): + cpu, memory = get_cgroup_api().get_controller_root_paths() + self.assertEqual(cpu, '/sys/fs/cgroup', "The root cgroup for the CPU controller is incorrect") + self.assertIsNone(memory, "The memory controller is not enabled, so the memory controller path should be None") + def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v2_relative_paths(self): with mock_cgroup_v2_environment(self.tmp_dir): cpu, memory = get_cgroup_api().get_process_cgroup_relative_paths('self') diff --git a/tests_e2e/tests/cgroup_v2_disabled/cgroup_v2_disabled.py b/tests_e2e/tests/cgroup_v2_disabled/cgroup_v2_disabled.py index 8f88bf880..7ab0ca0ff 100644 --- a/tests_e2e/tests/cgroup_v2_disabled/cgroup_v2_disabled.py +++ b/tests_e2e/tests/cgroup_v2_disabled/cgroup_v2_disabled.py @@ -63,14 +63,6 @@ def run(self): log.info("Checking that the agent chose cgroup v2 api for resource enforcement and monitoring...") 
self.check_agent_log_contains('Using cgroup v2 for resource enforcement and monitoring', 'The agent should choose v2 for api resource enforcement and monitoring') - # Verify that the agent determined the correct mount point for each controller - log.info("") - log.info("Checking that the agent determined the correct root paths for each controller...") - self.check_agent_log_contains('The CPU cgroup controller root path is /sys/fs/cgroup', - 'The agent should identify the cpu controller to be at /sys/fs/cgroup') - self.check_agent_log_contains('The memory cgroup controller root path is /sys/fs/cgroup', - 'The agent should identify the memory controller to be at /sys/fs/cgroup') - # Verify that the agent does not support cgroup v2 log.info("") log.info("Checking that the agent does not use cgroup v2 for resource enforcement and monitoring...") From 3cc51db7060911fcea59827ee08305c2d45fdaa8 Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Thu, 4 Apr 2024 10:31:14 -0700 Subject: [PATCH 08/12] get_cgroup_api should check if mountpoints are correct (#9) --- azurelinuxagent/ga/cgroupapi.py | 45 ++++++++++++++-- azurelinuxagent/ga/cgroupconfigurator.py | 22 ++++++-- tests/ga/test_cgroupapi.py | 69 +++++++++++++++++++++--- 3 files changed, 119 insertions(+), 17 deletions(-) diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index cae93c88b..df4c629f1 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -14,7 +14,7 @@ # limitations under the License. # # Requires Python 2.6+ and Openssl 1.0+ - +import json import os import re import shutil @@ -139,6 +139,15 @@ def __init__(self, msg=None): super(SystemdRunError, self).__init__(msg) +class InvalidCgroupMountpointException(CGroupsException): + """ + Raised when the cgroup mountpoint is invalid. 
+ """ + + def __init__(self, msg=None): + super(InvalidCgroupMountpointException, self).__init__(msg) + + def get_cgroup_api(): """ Determines which version of Cgroup should be used for resource enforcement and monitoring by the Agent and returns @@ -157,7 +166,7 @@ def get_cgroup_api(): if not os.path.exists(CGROUP_FILE_SYSTEM_ROOT): v1_mount_point = shellutil.run_command(['findmnt', '-t', 'cgroup', '--noheadings']) v2_mount_point = shellutil.run_command(['findmnt', '-t', 'cgroup2', '--noheadings']) - raise CGroupsException("Expected cgroup filesystem to be mounted at '/sys/fs/cgroup', but it is not.\n v1 mount point: {0}\n v2 mount point: {1}".format(v1_mount_point, v2_mount_point)) + raise InvalidCgroupMountpointException("Expected cgroup filesystem to be mounted at '{0}', but it is not.\n v1 mount point: \n{1}\n v2 mount point: \n{2}".format(CGROUP_FILE_SYSTEM_ROOT, v1_mount_point, v2_mount_point)) root_hierarchy_mode = shellutil.run_command(["stat", "-f", "--format=%T", CGROUP_FILE_SYSTEM_ROOT]).rstrip() @@ -176,8 +185,14 @@ def get_cgroup_api(): if available_unified_controllers != "": raise CGroupsException("Detected hybrid cgroup mode, but there are controllers available to be enabled in unified hierarchy: {0}".format(available_unified_controllers)) + cgroup_api = SystemdCgroupApiv1() + # Previously the agent supported users mounting cgroup v1 controllers in locations other than the systemd + # default ('/sys/fs/cgroup'). The agent no longer supports this scenario. If either the cpu or memory + # controller is mounted in a location other than the systemd default, raise Exception. + if not cgroup_api.are_mountpoints_systemd_created(): + raise InvalidCgroupMountpointException("Expected cgroup controllers to be mounted at '{0}', but at least one is not. 
v1 mount points: \n{1}".format(CGROUP_FILE_SYSTEM_ROOT, json.dumps(cgroup_api.get_controller_root_paths()))) log_cgroup_info("Using cgroup v1 for resource enforcement and monitoring") - return SystemdCgroupApiv1() + return cgroup_api raise CGroupsException("Detected unknown cgroup mode: {0}".format(root_hierarchy_mode)) @@ -293,6 +308,9 @@ def _get_controller_mountpoints(self): """ mount_points = {} for line in shellutil.run_command(['findmnt', '-t', 'cgroup', '--noheadings']).splitlines(): + # In v2, we match only the systemd default mountpoint ('/sys/fs/cgroup'). In v1, we match any path. This + # is because the agent previously supported users mounting controllers at locations other than the systemd + # default in v1. match = re.search(r'(?P\S+\/(?P\S+))\s+cgroup', line) if match is not None: path = match.group('path') @@ -301,6 +319,23 @@ def _get_controller_mountpoints(self): mount_points[controller] = path return mount_points + def are_mountpoints_systemd_created(self): + """ + Systemd mounts each controller at '/sys/fs/cgroup/'. Returns True if both cpu and memory + mountpoints match this pattern, False otherwise. + + The agent does not support cgroup usage if the default root systemd mountpoint (/sys/fs/cgroup) is not used. + This method is used to check if any users are using non-systemd mountpoints. If they are, the agent drop-in + files will be cleaned up in cgroupconfigurator. + """ + cpu_mountpoint = self._cgroup_mountpoints.get('cpu,cpuacct') + memory_mountpoint = self._cgroup_mountpoints.get('memory') + if cpu_mountpoint is not None and cpu_mountpoint != '/sys/fs/cgroup/cpu,cpuacct': + return False + if memory_mountpoint is not None and memory_mountpoint != '/sys/fs/cgroup/memory': + return False + return True + def get_controller_root_paths(self): # Return a tuple representing the mountpoints for cpu and memory. Either should be None if the corresponding # controller is not mounted. 
@@ -428,7 +463,9 @@ def _get_root_cgroup_path(): """ # for line in shellutil.run_command(['findmnt', '-t', 'cgroup2', '--noheadings']).splitlines(): - match = re.search(r'(?P/\S+)\s+cgroup2', line) + # Systemd mounts the cgroup filesystem at '/sys/fs/cgroup'. The agent does not support cgroups if the + # filesystem is mounted elsewhere, so search specifically for '/sys/fs/cgroup' in the findmnt output. + match = re.search(r'(?P\/sys\/fs\/cgroup)\s+cgroup2', line) if match is not None: root_cgroup_path = match.group('path') if root_cgroup_path is not None: diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index 90f778342..fefd2e905 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -24,7 +24,8 @@ from azurelinuxagent.common import conf from azurelinuxagent.common import logger from azurelinuxagent.ga.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup -from azurelinuxagent.ga.cgroupapi import SystemdRunError, EXTENSION_SLICE_PREFIX, CGroupUtil, SystemdCgroupApiv2, log_cgroup_info, log_cgroup_warning, get_cgroup_api +from azurelinuxagent.ga.cgroupapi import SystemdRunError, EXTENSION_SLICE_PREFIX, CGroupUtil, SystemdCgroupApiv2, \ + log_cgroup_info, log_cgroup_warning, get_cgroup_api, InvalidCgroupMountpointException from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException from azurelinuxagent.common.future import ustr @@ -142,6 +143,11 @@ def initialize(self): self._cgroups_supported = CGroupUtil.cgroups_supported() if not self._cgroups_supported: log_cgroup_info("Cgroup monitoring is not supported on {0}".format(get_distro()), send_event=True) + # If a distro is not supported, attempt to clean up any existing drop in files in case it was + # previously supported. 
It is necessary to cleanup in this scenario in case the OS hits any bugs on + # the kernel related to cgroups. + log_cgroup_info("Agent will reset the quotas in case distro: {0} went from supported to unsupported".format(get_distro()), send_event=False) + self._cleanup_agent_cgroup_drop_in_files() return # check that systemd is detected correctly @@ -155,6 +161,15 @@ def initialize(self): # do not enable resource monitoring/enforcement. try: self._cgroups_api = get_cgroup_api() + except InvalidCgroupMountpointException as e: + # Systemd mounts the cgroup file system at '/sys/fs/cgroup'. Previously, the agent supported cgroup + # usage if a user mounted the cgroup filesystem elsewhere. The agent no longer supports that + # scenario. Cleanup any existing drop in files in case the agent previously supported cgroups on + # this machine. + log_cgroup_warning("The agent does not support cgroups if the default systemd mountpoint is not being used: {0}".format(ustr(e)), send_event=True) + log_cgroup_info("Agent will reset the quotas in case cgroup usage went from enabled to disabled") + self._cleanup_agent_cgroup_drop_in_files() + return except CGroupsException as e: log_cgroup_warning("Unable to determine which cgroup version to use: {0}".format(ustr(e)), send_event=True) return @@ -196,9 +211,6 @@ def initialize(self): log_cgroup_warning("Error initializing cgroups: {0}".format(ustr(exception))) finally: log_cgroup_info('Agent cgroups enabled: {0}'.format(self._agent_cgroups_enabled)) - if not self._agent_cgroups_enabled: - log_cgroup_info("Agent will reset the quotas in case cgroups went from enabled to disabled") - self._reset_agent_cgroup_setup() self._initialized = True def __check_no_legacy_cgroups(self): @@ -311,7 +323,7 @@ def __setup_azure_slice(): CGroupConfigurator._Impl.__reload_systemd_config() - def _reset_agent_cgroup_setup(self): + def _cleanup_agent_cgroup_drop_in_files(self): try: agent_drop_in_path = systemd.get_agent_drop_in_path() if 
os.path.exists(agent_drop_in_path) and os.path.isdir(agent_drop_in_path): diff --git a/tests/ga/test_cgroupapi.py b/tests/ga/test_cgroupapi.py index b65861cb8..9ac3337d1 100644 --- a/tests/ga/test_cgroupapi.py +++ b/tests/ga/test_cgroupapi.py @@ -23,12 +23,14 @@ import tempfile from azurelinuxagent.common.exception import CGroupsException -from azurelinuxagent.ga.cgroupapi import SystemdCgroupApiv1, SystemdCgroupApiv2, CGroupUtil, get_cgroup_api +from azurelinuxagent.ga.cgroupapi import SystemdCgroupApiv1, SystemdCgroupApiv2, CGroupUtil, get_cgroup_api, \ + InvalidCgroupMountpointException from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.osutil import systemd from azurelinuxagent.common.utils import fileutil from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, mock_cgroup_v2_environment, \ mock_cgroup_hybrid_environment +from tests.lib.mock_environment import MockCommand from tests.lib.tools import AgentTestCase, patch, mock_sleep from tests.lib.cgroups_tools import CGroupsTools @@ -83,19 +85,18 @@ def test_cgroups_should_be_supported_only_on_ubuntu16_centos7dot4_redhat7dot4_an class SystemdCgroupsApiTestCase(AgentTestCase): - def test_get_cgroup_api_is_v1_when_v1_in_use(self): + def test_get_cgroup_api_raises_exception_when_systemd_mount_point_does_not_exist(self): with mock_cgroup_v1_environment(self.tmp_dir): - self.assertIsInstance(get_cgroup_api(), SystemdCgroupApiv1) + # Mock os.path.exists to return False for the os.path.exists(CGROUP_FILE_SYSTEM_ROOT) check + with patch("os.path.exists", return_value=False): + with self.assertRaises(InvalidCgroupMountpointException) as context: + get_cgroup_api() + self.assertTrue("Expected cgroup filesystem to be mounted at '/sys/fs/cgroup', but it is not" in str(context.exception)) def test_get_cgroup_api_is_v2_when_v2_in_use(self): with mock_cgroup_v2_environment(self.tmp_dir): self.assertIsInstance(get_cgroup_api(), SystemdCgroupApiv2) - def 
test_get_cgroup_api_is_v1_when_hybrid_in_use(self): - with mock_cgroup_hybrid_environment(self.tmp_dir): - with patch("os.path.exists", return_value=True): - self.assertIsInstance(get_cgroup_api(), SystemdCgroupApiv1) - def test_get_cgroup_api_raises_exception_when_hybrid_in_use_and_controllers_available_in_unified_hierarchy(self): with mock_cgroup_hybrid_environment(self.tmp_dir): # Mock /sys/fs/cgroup/unified/cgroup.controllers file to have available controllers @@ -105,6 +106,24 @@ def test_get_cgroup_api_raises_exception_when_hybrid_in_use_and_controllers_avai get_cgroup_api() self.assertTrue("Detected hybrid cgroup mode, but there are controllers available to be enabled in unified hierarchy: cpu memory" in str(context.exception)) + def test_get_cgroup_api_raises_exception_when_v1_in_use_and_controllers_have_non_sytemd_mountpoints(self): + with mock_cgroup_v1_environment(self.tmp_dir): + # Mock /sys/fs/cgroup/unified/cgroup.controllers file to have available controllers + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.are_mountpoints_systemd_created', return_value=False): + with self.assertRaises(InvalidCgroupMountpointException) as context: + get_cgroup_api() + self.assertTrue("Expected cgroup controllers to be mounted at '/sys/fs/cgroup', but at least one is not." 
in str(context.exception)) + + def test_get_cgroup_api_is_v1_when_v1_in_use(self): + with mock_cgroup_v1_environment(self.tmp_dir): + self.assertIsInstance(get_cgroup_api(), SystemdCgroupApiv1) + + def test_get_cgroup_api_is_v1_when_hybrid_in_use(self): + with mock_cgroup_hybrid_environment(self.tmp_dir): + # Mock os.path.exists to return True for the os.path.exists('/sys/fs/cgroup/cgroup.controllers') check + with patch("os.path.exists", return_value=True): + self.assertIsInstance(get_cgroup_api(), SystemdCgroupApiv1) + def test_get_cgroup_api_raises_exception_when_cgroup_mode_cannot_be_determined(self): unknown_cgroup_type = "unknown_cgroup_type" with patch('azurelinuxagent.common.utils.shellutil.run_command', return_value=unknown_cgroup_type): @@ -231,6 +250,29 @@ def test_get_controller_mountpoints_should_return_all_controller_mount_points(se 'pids': '/sys/fs/cgroup/pids', }, "The controller mountpoints are not correct") + def test_are_mountpoints_systemd_created_should_return_False_if_cpu_or_memory_are_not_systemd_mountpoints(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/custom/mountpoint/path', 'memory': '/custom/mountpoint/path'}): + self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/custom/mountpoint/path'}): + self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'memory': '/custom/mountpoint/path'}): + self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + + def test_are_mountpoints_systemd_created_should_return_True_if_cpu_and_memory_are_systemd_mountpoints(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with 
patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup', 'memory': '/sys/fs/cgroup'}): + self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + + # are_mountpoints_systemd_created should only check controllers which are mounted + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup'}): + self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'memory': '/sys/fs/cgroup'}): + self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v1_relative_paths(self): with mock_cgroup_v1_environment(self.tmp_dir): cpu, memory = get_cgroup_api().get_process_cgroup_relative_paths('self') @@ -328,6 +370,17 @@ def test_get_root_cgroup_path_should_return_v2_cgroup_root(self): cgroup_api = get_cgroup_api() self.assertEqual(cgroup_api._get_root_cgroup_path(), '/sys/fs/cgroup') + def test_get_root_cgroup_path_should_only_match_systemd_mountpoint(self): + with mock_cgroup_v2_environment(self.tmp_dir) as env: + # Mock an environment which has multiple v2 mountpoints + env.add_command(MockCommand(r"^findmnt -t cgroup2 --noheadings$", +'''/custom/mountpoint/path1 cgroup2 cgroup2 rw,relatime +/sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime +/custom/mountpoint/path2 none cgroup2 rw,relatime +''')) + cgroup_api = get_cgroup_api() + self.assertEqual(cgroup_api._get_root_cgroup_path(), '/sys/fs/cgroup') + def test_get_unit_cgroup_paths_should_return_the_cgroup_v2_cgroup_paths(self): with mock_cgroup_v2_environment(self.tmp_dir): cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") From 7bc072eca77bbe964736b4144f70166b58ba1010 Mon Sep 17 00:00:00 2001 From: 
Maddie Ford Date: Thu, 4 Apr 2024 10:43:45 -0700 Subject: [PATCH 09/12] Fix conflict after merge --- azurelinuxagent/ga/cgroupconfigurator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index c5bfe3912..1815132ae 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -559,10 +559,10 @@ def _check_fails_if_processes_found_in_agent_cgroup_before_enable(self, agent_sl if agent_slice != AZURE_SLICE: return False try: - _log_cgroup_info("Checking for unexpected processes in the agent's cgroup before enabling cgroups") + log_cgroup_info("Checking for unexpected processes in the agent's cgroup before enabling cgroups") self._check_processes_in_agent_cgroup() except CGroupsException as exception: - _log_cgroup_warning(ustr(exception)) + log_cgroup_warning(ustr(exception)) return True return False From 6c3da64470646acfd363263ca5f50194cf29f90e Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Thu, 4 Apr 2024 11:21:22 -0700 Subject: [PATCH 10/12] Merge issues (#10) * Fix merge issues * Fix unit tests --- tests/ga/test_cgroupconfigurator.py | 2 +- tests_e2e/tests/lib/cgroup_helpers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/ga/test_cgroupconfigurator.py b/tests/ga/test_cgroupconfigurator.py index fb3df7663..5b4b0976e 100644 --- a/tests/ga/test_cgroupconfigurator.py +++ b/tests/ga/test_cgroupconfigurator.py @@ -980,7 +980,7 @@ def test_check_cgroups_should_disable_cgroups_when_a_check_fails(self): p.stop() @patch('azurelinuxagent.ga.cgroupconfigurator.CGroupConfigurator._Impl._check_processes_in_agent_cgroup', side_effect=CGroupsException("Test")) - @patch('azurelinuxagent.ga.cgroupconfigurator.add_event') + @patch('azurelinuxagent.ga.cgroupapi.add_event') def 
test_agent_should_not_enable_cgroups_if_unexpected_process_already_in_agent_cgroups(self, add_event, _): command_mocks = [MockCommand(r"^systemctl show walinuxagent\.service --property Slice", '''Slice=azure.slice diff --git a/tests_e2e/tests/lib/cgroup_helpers.py b/tests_e2e/tests/lib/cgroup_helpers.py index ef49284e1..1fe21c329 100644 --- a/tests_e2e/tests/lib/cgroup_helpers.py +++ b/tests_e2e/tests/lib/cgroup_helpers.py @@ -7,7 +7,7 @@ from azurelinuxagent.common.osutil import systemd from azurelinuxagent.common.utils import shellutil from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION -from azurelinuxagent.ga.cgroupapi import SystemdCgroupsApi +from azurelinuxagent.ga.cgroupapi import get_cgroup_api from tests_e2e.tests.lib.agent_log import AgentLog from tests_e2e.tests.lib.logging import log from tests_e2e.tests.lib.retry import retry_if_false @@ -168,5 +168,5 @@ def get_unit_cgroup_paths(unit_name): """ Returns the cgroup paths for the given unit """ - cgroups_api = SystemdCgroupsApi() + cgroups_api = get_cgroup_api() return cgroups_api.get_unit_cgroup_paths(unit_name) From 6b309a6954c162d9dd4d5a4b9dcddd9eee064df7 Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Thu, 4 Apr 2024 13:32:24 -0700 Subject: [PATCH 11/12] get_cgroup_api raises InvalidCroupMountpointException (#11) * get_cgroup_api can raise InvalidCgroupMountpointException * Add unit test for agent --- azurelinuxagent/agent.py | 5 ++++- tests/test_agent.py | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py index d794432b9..50735b54e 100644 --- a/azurelinuxagent/agent.py +++ b/azurelinuxagent/agent.py @@ -32,7 +32,7 @@ from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.ga import logcollector, cgroupconfigurator from azurelinuxagent.ga.cgroup import AGENT_LOG_COLLECTOR, CpuCgroup, MemoryCgroup -from 
azurelinuxagent.ga.cgroupapi import get_cgroup_api, log_cgroup_warning +from azurelinuxagent.ga.cgroupapi import get_cgroup_api, log_cgroup_warning, InvalidCgroupMountpointException import azurelinuxagent.common.conf as conf import azurelinuxagent.common.event as event @@ -213,6 +213,9 @@ def collect_logs(self, is_full_mode): if CollectLogsHandler.is_enabled_monitor_cgroups_check(): try: cgroup_api = get_cgroup_api() + except InvalidCgroupMountpointException as e: + log_cgroup_warning("The agent does not support cgroups if the default systemd mountpoint is not being used: {0}".format(ustr(e)), send_event=True) + sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) except CGroupsException as e: log_cgroup_warning("Unable to determine which cgroup version to use: {0}".format(ustr(e)), send_event=True) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) diff --git a/tests/test_agent.py b/tests/test_agent.py index 16d98aeca..4b643ca36 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -24,7 +24,7 @@ from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.ga import logcollector, cgroupconfigurator from azurelinuxagent.common.utils import fileutil -from azurelinuxagent.ga.cgroupapi import get_cgroup_api +from azurelinuxagent.ga.cgroupapi import get_cgroup_api, InvalidCgroupMountpointException from azurelinuxagent.ga.collect_logs import CollectLogsHandler from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment from tests.lib.tools import AgentTestCase, data_dir, Mock, patch @@ -318,6 +318,28 @@ def raise_on_sys_exit(*args): finally: CollectLogsHandler.disable_monitor_cgroups_check() + @patch('azurelinuxagent.agent.get_cgroup_api', side_effect=InvalidCgroupMountpointException("Test")) + @patch("azurelinuxagent.agent.LogCollector") + def test_doesnt_call_collect_logs_on_non_systemd_cgroups_v1_mountpoints(self, mock_log_collector, _): + try: + CollectLogsHandler.enable_monitor_cgroups_check() + mock_log_collector.run = Mock() + + 
def raise_on_sys_exit(*args): + raise RuntimeError(args[0] if args else "Exiting") + + with mock_cgroup_v1_environment(self.tmp_dir): + agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + + with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: + try: + agent.collect_logs(is_full_mode=True) + except RuntimeError as re: + self.assertEqual(logcollector.INVALID_CGROUPS_ERRCODE, re.args[0]) + mock_exit.assert_called_once_with(logcollector.INVALID_CGROUPS_ERRCODE) + finally: + CollectLogsHandler.disable_monitor_cgroups_check() + @patch("azurelinuxagent.agent.LogCollector") def test_doesnt_call_collect_logs_if_either_controller_not_mounted(self, mock_log_collector): try: From 35ca335000a76dde11726274f6493fb04592365b Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:14:43 -0700 Subject: [PATCH 12/12] Address PR comments (#12) --- azurelinuxagent/ga/cgroupapi.py | 22 +++++++++++----------- azurelinuxagent/ga/cgroupconfigurator.py | 14 +++++++------- tests/ga/test_cgroupapi.py | 6 +++--- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index df4c629f1..3bce05350 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -194,7 +194,7 @@ def get_cgroup_api(): log_cgroup_info("Using cgroup v1 for resource enforcement and monitoring") return cgroup_api - raise CGroupsException("Detected unknown cgroup mode: {0}".format(root_hierarchy_mode)) + raise CGroupsException("{0} has an unexpected file type: {1}".format(CGROUP_FILE_SYSTEM_ROOT, root_hierarchy_mode)) class _SystemdCgroupApi(object): @@ -256,7 +256,7 @@ def get_process_cgroup_paths(self, process_id): return cpu_cgroup_path, memory_cgroup_path - def get_process_cgroup_relative_paths(self, process_id): # pylint: disable=W0613 + def get_process_cgroup_relative_paths(self, process_id): """ Cgroup version specific. 
Returns a tuple with the path of the cpu and memory cgroups for the given process (relative to the root path of the corresponding controller). @@ -330,9 +330,9 @@ def are_mountpoints_systemd_created(self): """ cpu_mountpoint = self._cgroup_mountpoints.get('cpu,cpuacct') memory_mountpoint = self._cgroup_mountpoints.get('memory') - if cpu_mountpoint is not None and cpu_mountpoint != '/sys/fs/cgroup/cpu,cpuacct': + if cpu_mountpoint is not None and cpu_mountpoint != os.path.join(CGROUP_FILE_SYSTEM_ROOT, 'cpu,cpuacct'): return False - if memory_mountpoint is not None and memory_mountpoint != '/sys/fs/cgroup/memory': + if memory_mountpoint is not None and memory_mountpoint != os.path.join(CGROUP_FILE_SYSTEM_ROOT, 'memory'): return False return True @@ -448,7 +448,7 @@ class SystemdCgroupApiv2(_SystemdCgroupApi): def __init__(self): super(SystemdCgroupApiv2, self).__init__() self._root_cgroup_path = self._get_root_cgroup_path() - self._controllers_enabled_at_root = self._get_controllers_enabled_at_root() + self._controllers_enabled_at_root = self._get_controllers_enabled_at_root(self._root_cgroup_path) if self._root_cgroup_path is not None else [] @staticmethod def _get_root_cgroup_path(): @@ -472,7 +472,8 @@ def _get_root_cgroup_path(): return root_cgroup_path return None - def _get_controllers_enabled_at_root(self): + @staticmethod + def _get_controllers_enabled_at_root(root_cgroup_path): """ Returns a list of the controllers enabled at the root cgroup. 
The cgroup.subtree_control file at the root shows a space separated list of the controllers which are enabled to control resource distribution from the root @@ -483,10 +484,9 @@ cpuset cpu io memory hugetlb pids rdma misc """ controllers_enabled_at_root = [] - if self._root_cgroup_path is not None: - enabled_controllers_file = os.path.join(self._root_cgroup_path, 'cgroup.subtree_control') - if os.path.exists(enabled_controllers_file): - controllers_enabled_at_root = fileutil.read_file(enabled_controllers_file).rstrip().split(" ") + enabled_controllers_file = os.path.join(root_cgroup_path, 'cgroup.subtree_control') + if os.path.exists(enabled_controllers_file): + controllers_enabled_at_root = fileutil.read_file(enabled_controllers_file).rstrip().split() return controllers_enabled_at_root def get_controller_root_paths(self): @@ -511,7 +511,7 @@ def get_process_cgroup_relative_paths(self, process_id): cpu_path = None memory_path = None for line in fileutil.read_file("/proc/{0}/cgroup".format(process_id)).splitlines(): - match = re.match(r'\d+::(?P<path>\S+)', line) + match = re.match(r'0::(?P<path>\S+)', line) if match is not None: path = match.group('path').lstrip('/') if match.group('path') != '/' else None memory_path = path diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index 1815132ae..72d5329f9 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -157,6 +157,9 @@ def initialize(self): log_cgroup_info("systemd version: {0}".format(systemd.get_version())) + if not self.__check_no_legacy_cgroups(): + return + # Determine which version of the Cgroup Api should be used. If the correct version can't be determined, # do not enable resource monitoring/enforcement.
try: @@ -174,7 +177,8 @@ def initialize(self): log_cgroup_warning("Unable to determine which cgroup version to use: {0}".format(ustr(e)), send_event=True) return - if not self.__check_no_legacy_cgroups(): + if self.using_cgroup_v2(): + log_cgroup_info("Agent and extensions resource monitoring is not currently supported on cgroup v2") return agent_unit_name = systemd.get_agent_unit_name() @@ -185,10 +189,6 @@ def initialize(self): self.__setup_azure_slice() - if self.cgroup_v2_enabled(): - log_cgroup_info("Agent and extensions resource monitoring is not currently supported on cgroup v2") - return - cpu_controller_root, memory_controller_root = self.__get_cgroup_controller_roots() self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroup_paths(agent_slice, cpu_controller_root, @@ -469,7 +469,7 @@ def agent_enabled(self): def extensions_enabled(self): return self._extensions_cgroups_enabled - def cgroup_v2_enabled(self): + def using_cgroup_v2(self): return isinstance(self._cgroups_api, SystemdCgroupApiv2) def enable(self): @@ -624,7 +624,7 @@ def _check_processes_in_agent_cgroup(self): agent_commands.update(shellutil.get_running_commands()) systemd_run_commands = set() systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands()) - agent_cgroup = self._cgroups_api.get_processes_in_cgroup(self._agent_cpu_cgroup_path) + agent_cgroup = self._cgroups_api.get_processes_in_cgroup(cgroup_path) # get the running commands again in case new commands started or completed while we were fetching the processes in the cgroup; agent_commands.update(shellutil.get_running_commands()) systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands()) diff --git a/tests/ga/test_cgroupapi.py b/tests/ga/test_cgroupapi.py index 9ac3337d1..ec077c90a 100644 --- a/tests/ga/test_cgroupapi.py +++ b/tests/ga/test_cgroupapi.py @@ -129,7 +129,7 @@ def test_get_cgroup_api_raises_exception_when_cgroup_mode_cannot_be_determined(s with 
patch('azurelinuxagent.common.utils.shellutil.run_command', return_value=unknown_cgroup_type): with self.assertRaises(CGroupsException) as context: get_cgroup_api() - self.assertTrue("Detected unknown cgroup mode: {0}".format(unknown_cgroup_type) in str(context.exception)) + self.assertTrue("/sys/fs/cgroup has an unexpected file type: {0}".format(unknown_cgroup_type) in str(context.exception)) def test_get_systemd_version_should_return_a_version_number(self): # We expect same behavior for v1 and v2 @@ -357,13 +357,13 @@ class SystemdCgroupsApiv2TestCase(AgentTestCase): def test_get_controllers_enabled_at_root_should_return_list_of_enabled_controllers(self): with mock_cgroup_v2_environment(self.tmp_dir): cgroup_api = get_cgroup_api() - self.assertEqual(cgroup_api._get_controllers_enabled_at_root(), ['cpuset', 'cpu', 'io', 'memory', 'pids']) + self.assertEqual(cgroup_api._get_controllers_enabled_at_root('/sys/fs/cgroup'), ['cpuset', 'cpu', 'io', 'memory', 'pids']) def test_get_controllers_enabled_at_root_should_return_empty_list_if_root_cgroup_path_is_None(self): with mock_cgroup_v2_environment(self.tmp_dir): with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=None): cgroup_api = get_cgroup_api() - self.assertEqual(cgroup_api._get_controllers_enabled_at_root(), []) + self.assertEqual(cgroup_api._controllers_enabled_at_root, []) def test_get_root_cgroup_path_should_return_v2_cgroup_root(self): with mock_cgroup_v2_environment(self.tmp_dir):