From 9501707c8ac1d6710ee878e797783199c30acb09 Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Fri, 15 Mar 2024 16:49:41 -0700 Subject: [PATCH] Cgroups api refactor (#6) * Initial changes for log collector cgroups v2 support * Fix pylint issues * Fix pylint issues * Fix pylint issues * Check that both controllers are mounted in the chosen cgroups version for log collector * Fix regex * Update test_agent unit tests * Fix unit tests * Update format strings * Fix broken cgroupconfigurator unit tests * pyling * Fix cgroups api unit tests * Ignore unused args * Ignore unused args * Add cgroup configurator tests * v2 required check in parent cgroup * unit tests is_controller_enabled * Fix test failure and pylint: * pylint * Update agent checks * Fix controller enable logic and unit tests * Remove changes to collect logs * Fix pylint * Add e2e test for v2 --- azurelinuxagent/agent.py | 28 +- azurelinuxagent/ga/cgroupapi.py | 403 ++++++++++----- azurelinuxagent/ga/cgroupconfigurator.py | 173 +++---- azurelinuxagent/ga/cgroupstelemetry.py | 13 + .../sys_fs_cgroup_unified_cgroup.controllers | 7 - tests/data/cgroups/{ => v1}/proc_pid_cgroup | 0 tests/data/cgroups/{ => v1}/proc_self_cgroup | 0 ...s_fs_cgroup_unified_cgroup.subtree_control | 0 tests/data/cgroups/v1_and_v2/proc_pid_cgroup | 12 + tests/data/cgroups/v1_and_v2/proc_self_cgroup | 12 + .../sys_fs_cgroup_cgroup.subtree_control | 1 + tests/data/cgroups/v2/proc_pid_cgroup | 1 + tests/data/cgroups/v2/proc_self_cgroup | 1 + .../v2/sys_fs_cgroup_cgroup.subtree_control | 1 + ...sys_fs_cgroup_cgroup.subtree_control_empty | 0 tests/ga/test_cgroupapi.py | 462 +++++++++++++++--- tests/ga/test_cgroupconfigurator.py | 195 ++++++-- tests/ga/test_cgroupconfigurator_sudo.py | 4 +- tests/lib/cgroups_tools.py | 14 - tests/lib/mock_cgroup_environment.py | 153 ++++-- tests/test_agent.py | 97 +++- tests_e2e/test_suites/cgroups_v2_disabled.yml | 10 + .../cgroups_v2_disabled.py | 84 ++++ 23 files changed, 1278 insertions(+), 393 deletions(-) delete mode 100644 tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers rename tests/data/cgroups/{ => v1}/proc_pid_cgroup (100%) rename tests/data/cgroups/{ => v1}/proc_self_cgroup (100%) create mode 100644 tests/data/cgroups/v1/sys_fs_cgroup_unified_cgroup.subtree_control create mode 100644 tests/data/cgroups/v1_and_v2/proc_pid_cgroup create mode 100644 tests/data/cgroups/v1_and_v2/proc_self_cgroup create mode 100644 tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control create mode 100644 tests/data/cgroups/v2/proc_pid_cgroup create mode 100644 tests/data/cgroups/v2/proc_self_cgroup create mode 100644 tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control create mode 100644 tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control_empty create mode 100644 tests_e2e/test_suites/cgroups_v2_disabled.yml create mode 100644 tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py index ee68bd678..b0ce5a19f 100644 --- a/azurelinuxagent/agent.py +++ b/azurelinuxagent/agent.py @@ -30,7 +30,8 @@ import threading from azurelinuxagent.ga import logcollector, cgroupconfigurator from azurelinuxagent.ga.cgroup import AGENT_LOG_COLLECTOR, CpuCgroup, MemoryCgroup -from azurelinuxagent.ga.cgroupapi import SystemdCgroupsApi +from azurelinuxagent.ga.cgroupapi import get_cgroup_api +from azurelinuxagent.ga.cgroupstelemetry import log_cgroup_warning import azurelinuxagent.common.conf as conf import 
azurelinuxagent.common.event as event @@ -206,18 +207,29 @@ def collect_logs(self, is_full_mode): # Check the cgroups unit log_collector_monitor = None - cgroups_api = SystemdCgroupsApi() - cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self") + cgroups_api = get_cgroup_api() + cpu_cgroup_path = None + memory_cgroup_path = None if CollectLogsHandler.is_enabled_monitor_cgroups_check(): - cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path) - memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path) + if cgroups_api is None: + log_cgroup_warning("Unable to determine what version of cgroups to use for log collector resource " + "monitoring and enforcement.") + sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) + + cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self") + cpu_slice_matches = False + memory_slice_matches = False + if cpu_cgroup_path is not None: + cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path) + if memory_cgroup_path is not None: + memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path) if not cpu_slice_matches or not memory_slice_matches: - logger.info("The Log Collector process is not in the proper cgroups:") + log_cgroup_warning("The Log Collector process is not in the proper cgroups:", send_event=False) if not cpu_slice_matches: - logger.info("\tunexpected cpu slice") + log_cgroup_warning("\tunexpected cpu slice", send_event=False) if not memory_slice_matches: - logger.info("\tunexpected memory slice") + log_cgroup_warning("\tunexpected memory slice", send_event=False) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index 6f4bf4ab3..40f66ed74 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -24,7 +24,7 @@ from azurelinuxagent.common import logger from azurelinuxagent.ga.cgroup import CpuCgroup, MemoryCgroup -from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry +from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry, log_cgroup_info, log_cgroup_warning from azurelinuxagent.common.conf import get_agent_pid_file_path from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \ ExtensionOperationError @@ -37,10 +37,32 @@ from azurelinuxagent.common.version import get_distro CGROUPS_FILE_SYSTEM_ROOT = '/sys/fs/cgroup' -CGROUP_CONTROLLERS = ["cpu", "memory"] EXTENSION_SLICE_PREFIX = "azure-vmextensions" +def get_cgroup_api(): + """ + Determines which version of Cgroups should be used for resource enforcement and monitoring by the Agent are returns + the corresponding Api. If the required controllers are not mounted in v1 or v2, return None. + """ + v1 = SystemdCgroupsApiv1() + v2 = SystemdCgroupsApiv2() + + log_cgroup_info("Controllers mounted in v1: {0}. Controllers mounted in v2: {1}".format(v1.get_mounted_controllers(), v2.get_mounted_controllers())) + + # It is possible for different controllers to be simultaneously mounted under v1 and v2. If any are mounted under + # v1, use v1. 
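For orientation, the dispatch described above gives callers a three-way contract: a v1 API, a v2 API, or None when neither controller is usable (which is what the collect_logs change above handles). A minimal sketch of that contract, assuming only the names defined in this patch — the helper function itself is hypothetical:

```python
# Hypothetical helper; get_cgroup_api() and the two API classes come from this patch.
from azurelinuxagent.ga.cgroupapi import get_cgroup_api, SystemdCgroupsApiv1, SystemdCgroupsApiv2

def describe_cgroup_support():
    cgroups_api = get_cgroup_api()
    if cgroups_api is None:
        # Neither cpu nor memory is mounted/enabled in v1 or v2; callers skip monitoring/enforcement.
        return "no usable cgroup controllers"
    if isinstance(cgroups_api, SystemdCgroupsApiv2):
        # v2 is detected, but the agent does not yet run extensions under v2 (see cgroupapi changes below).
        return "cgroups v2"
    if isinstance(cgroups_api, SystemdCgroupsApiv1):
        return "cgroups v1"
    return "unknown"
```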
+ if v1.is_cpu_or_memory_mounted(): + log_cgroup_info("Using cgroups v1 for resource enforcement and monitoring") + return v1 + elif v2.is_cpu_or_memory_mounted(): + log_cgroup_info("Using cgroups v2 for resource enforcement and monitoring") + return v2 + else: + log_cgroup_warning("CPU and Memory controllers are not mounted in cgroups v1 or v2") + return None + + class SystemdRunError(CGroupsException): """ Raised when systemd-run fails @@ -68,7 +90,7 @@ def track_cgroups(extension_cgroups): for cgroup in extension_cgroups: CGroupsTelemetry.track_cgroup(cgroup) except Exception as exception: - logger.warn("Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. " + logger.warn("[CGW] Cannot add cgroup '{0}' to tracking list; resource usage will not be tracked. " "Error: {1}".format(cgroup.path, ustr(exception))) @staticmethod @@ -94,7 +116,7 @@ def _foreach_legacy_cgroup(operation): for controller in ['cpu', 'memory']: cgroup = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, "WALinuxAgent", "WALinuxAgent") if os.path.exists(cgroup): - logger.info('Found legacy cgroup {0}', cgroup) + log_cgroup_info('Found legacy cgroup {0}'.format(cgroup), send_event=False) legacy_cgroups.append((controller, cgroup)) try: @@ -109,7 +131,7 @@ def _foreach_legacy_cgroup(operation): operation(controller, daemon_pid) finally: for _, cgroup in legacy_cgroups: - logger.info('Removing {0}', cgroup) + log_cgroup_info('Removing {0}'.format(cgroup), send_event=False) shutil.rmtree(cgroup, ignore_errors=True) return len(legacy_cgroups) @@ -120,11 +142,11 @@ def get_daemon_pid(): class SystemdCgroupsApi(CGroupsApi): """ - Cgroups interface via systemd + Cgroups interface via systemd. Contains common api implementations between cgroups v1 and v2. """ def __init__(self): - self._cgroup_mountpoints = None + self._cgroup_mountpoints = {} self._agent_unit_name = None self._systemd_run_commands = [] self._systemd_run_commands_lock = threading.RLock() @@ -136,23 +158,106 @@ def get_systemd_run_commands(self): with self._systemd_run_commands_lock: return self._systemd_run_commands[:] + def is_cpu_or_memory_mounted(self): + """ + Returns True if either cpu or memory controllers are mounted and enabled at the root cgroup. + """ + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() + return cpu_mount_point is not None or memory_mount_point is not None + + def get_mounted_controllers(self): + """ + Returns a list of the controllers mounted and enabled at the root cgroup. Currently, the only controllers the + agent checks for is cpu and memory. + """ + self.get_cgroup_mount_points() # Updates self._cgroup_mountpoints if empty + return [controller for controller, mount_point in self._cgroup_mountpoints.items() if mount_point is not None] + + def cleanup_legacy_cgroups(self): + """ + Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; + starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. 
If + we find that any of the legacy groups include the PID of the daemon then we need to disable data collection for this + instance (under systemd, moving PIDs across the cgroup file system can produce unpredictable results) + """ + return CGroupsApi._foreach_legacy_cgroup(lambda *_: None) + + @staticmethod + def get_extension_slice_name(extension_name, old_slice=False): + # The old slice makes it difficult for user to override the limits because they need to place drop-in files on every upgrade if extension slice is different for each version. + # old slice includes .- + # new slice without version . + if not old_slice: + extension_name = extension_name.rsplit("-", 1)[0] + # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects. + return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice" + + @staticmethod + def _is_systemd_failure(scope_name, stderr): + stderr.seek(0) + stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace') + unit_not_found = "Unit {0} not found.".format(scope_name) + return unit_not_found in stderr or scope_name not in stderr + def get_cgroup_mount_points(self): """ - Returns a tuple with the mount points for the cpu and memory controllers; the values can be None - if the corresponding controller is not mounted + Cgroup version specific. Returns a tuple with the mount points for the cpu and memory controllers; the values + can be None if the corresponding controller is not mounted or enabled at the root cgroup. Updates + self._cgroup_mountpoints if empty. + """ + return None, None + + def get_unit_cgroup_paths(self, unit_name): + """ + Cgroup version specific. Returns a tuple with the path of the cpu and memory cgroups for the given unit. + The values returned can be None if the controller is not mounted or enabled. + """ + pass # pylint: disable=W0107 + + def get_process_cgroup_paths(self, process_id): + """ + Cgroup version specific. Returns a tuple with the path of the cpu and memory cgroups for the given process. + The 'process_id' can be a numeric PID or the string "self" for the current process. + The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is + not mounted or enabled). + """ + pass # pylint: disable=W0107 + + def get_process_cgroup_relative_paths(self, process_id): # pylint: disable=W0613 + """ + Cgroup version specific. Returns a tuple with the path of the cpu and memory cgroups for the given process + (relative to the mount point of the corresponding controller). + The 'process_id' can be a numeric PID or the string "self" for the current process. + The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is + not mounted). """ + pass # pylint: disable=W0107 + + def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, + error_code=ExtensionErrorCodes.PluginUnknownFailure): + """ + Cgroup version specific. Starts extension command. 
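As a worked example of the extension slice naming restated earlier in this hunk (the handler name below is hypothetical), get_extension_slice_name produces:

```python
# Hypothetical handler name, in the "<name>-<version>" form the agent passes in.
name = "Sample.Publisher.SampleExtension-1.0.1"

# New-style slice: version suffix dropped, any remaining '-' replaced with '_'
new_slice = "azure-vmextensions-" + name.rsplit("-", 1)[0].replace('-', '_') + ".slice"
# -> "azure-vmextensions-Sample.Publisher.SampleExtension.slice"

# Old-style slice (old_slice=True): version kept, so the slice name changes on every upgrade
old_slice = "azure-vmextensions-" + name.replace('-', '_') + ".slice"
# -> "azure-vmextensions-Sample.Publisher.SampleExtension_1.0.1.slice"
```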
+ """ + pass # pylint: disable=W0107 + + +class SystemdCgroupsApiv1(SystemdCgroupsApi): + """ + Cgroups v1 interface via systemd + """ + def get_cgroup_mount_points(self): # the output of mount is similar to - # $ mount -t cgroup - # cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) - # cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct) - # cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory) + # $ findmnt -t cgroup --noheadings + # /sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd + # /sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory + # /sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct # etc # - if self._cgroup_mountpoints is None: + if not self._cgroup_mountpoints: cpu = None memory = None - for line in shellutil.run_command(['mount', '-t', 'cgroup']).splitlines(): - match = re.search(r'on\s+(?P/\S+(memory|cpuacct))\s', line) + for line in shellutil.run_command(['findmnt', '-t', 'cgroup', '--noheadings']).splitlines(): + match = re.search(r'(?P/\S+(memory|cpuacct))\s', line) if match is not None: path = match.group('path') if 'cpuacct' in path: @@ -163,14 +268,34 @@ def get_cgroup_mount_points(self): return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory'] - @staticmethod - def get_process_cgroup_relative_paths(process_id): - """ - Returns a tuple with the path of the cpu and memory cgroups for the given process (relative to the mount point of the corresponding - controller). - The 'process_id' can be a numeric PID or the string "self" for the current process. - The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted). - """ + def get_unit_cgroup_paths(self, unit_name): + # Ex: ControlGroup=/azure.slice/walinuxagent.service + # controlgroup_path[1:] = azure.slice/walinuxagent.service + controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() + + cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \ + if cpu_mount_point is not None else None + + memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \ + if memory_mount_point is not None else None + + return cpu_cgroup_path, memory_cgroup_path + + def get_process_cgroup_paths(self, process_id): + cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) + + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() + + cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \ + if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None + + memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \ + if memory_mount_point is not None and memory_cgroup_relative_path is not None else None + + return cpu_cgroup_path, memory_cgroup_path + + def get_process_cgroup_relative_paths(self, process_id): # The contents of the file are similar to # # cat /proc/1218/cgroup # 10:memory:/system.slice/walinuxagent.service @@ -190,79 +315,6 @@ def get_process_cgroup_relative_paths(process_id): return cpu_path, memory_path - def get_process_cgroup_paths(self, process_id): - """ - Returns a tuple with the path of the cpu and memory cgroups for the given process. 
The 'process_id' can be a numeric PID or the string "self" for the current process. - The values returned can be None if the process is not in a cgroup for that controller (e.g. the controller is not mounted). - """ - cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) - - cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() - - cpu_cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) \ - if cpu_mount_point is not None and cpu_cgroup_relative_path is not None else None - - memory_cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) \ - if memory_mount_point is not None and memory_cgroup_relative_path is not None else None - - return cpu_cgroup_path, memory_cgroup_path - - def get_unit_cgroup_paths(self, unit_name): - """ - Returns a tuple with the path of the cpu and memory cgroups for the given unit. - The values returned can be None if the controller is not mounted. - Ex: ControlGroup=/azure.slice/walinuxagent.service - controlgroup_path[1:] = azure.slice/walinuxagent.service - """ - controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") - cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() - - cpu_cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) \ - if cpu_mount_point is not None else None - - memory_cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) \ - if memory_mount_point is not None else None - - return cpu_cgroup_path, memory_cgroup_path - - @staticmethod - def get_cgroup2_controllers(): - """ - Returns a tuple with the mount point for the cgroups v2 controllers, and the currently mounted controllers; - either value can be None if cgroups v2 or its controllers are not mounted - """ - # the output of mount is similar to - # $ mount -t cgroup2 - # cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate) - # - for line in shellutil.run_command(['mount', '-t', 'cgroup2']).splitlines(): - match = re.search(r'on\s+(?P/\S+)\s', line) - if match is not None: - mount_point = match.group('path') - controllers = None - controllers_file = os.path.join(mount_point, 'cgroup.controllers') - if os.path.exists(controllers_file): - controllers = fileutil.read_file(controllers_file) - return mount_point, controllers - return None, None - - @staticmethod - def _is_systemd_failure(scope_name, stderr): - stderr.seek(0) - stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace') - unit_not_found = "Unit {0} not found.".format(scope_name) - return unit_not_found in stderr or scope_name not in stderr - - @staticmethod - def get_extension_slice_name(extension_name, old_slice=False): - # The old slice makes it difficult for user to override the limits because they need to place drop-in files on every upgrade if extension slice is different for each version. - # old slice includes .- - # new slice without version . - if not old_slice: - extension_name = extension_name.rsplit("-", 1)[0] - # Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects. 
- return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice" - def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): scope = "{0}_{1}".format(cmd_name, uuid.uuid4()) @@ -272,7 +324,8 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh # Some distros like ubuntu20 by default cpu and memory accounting enabled. Thus create nested cgroups under the extension slice # So disabling CPU and Memory accounting prevents from creating nested cgroups, so that all the counters will be present in extension Cgroup # since slice unit file configured with accounting enabled. - "systemd-run --property=CPUAccounting=no --property=MemoryAccounting=no --unit={0} --scope --slice={1} {2}".format(scope, extension_slice_name, command), + "systemd-run --property=CPUAccounting=no --property=MemoryAccounting=no --unit={0} --scope --slice={1} {2}".format( + scope, extension_slice_name, command), shell=shell, cwd=cwd, stdout=stdout, @@ -285,7 +338,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh scope_name = scope + '.scope' - logger.info("Started extension in unit '{0}'", scope_name) + log_cgroup_info("Started extension in unit '{0}'".format(scope_name), send_event=False) cpu_cgroup = None try: @@ -294,14 +347,14 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_cgroup_mount_points() if cpu_cgroup_mountpoint is None: - logger.info("The CPU controller is not mounted; will not track resource usage") + log_cgroup_info("The CPU controller is not mounted; will not track resource usage", send_event=False) else: cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path) cpu_cgroup = CpuCgroup(extension_name, cpu_cgroup_path) CGroupsTelemetry.track_cgroup(cpu_cgroup) if memory_cgroup_mountpoint is None: - logger.info("The Memory controller is not mounted; will not track resource usage") + log_cgroup_info("The Memory controller is not mounted; will not track resource usage", send_event=False) else: memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path) memory_cgroup = MemoryCgroup(extension_name, memory_cgroup_path) @@ -309,10 +362,12 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh except IOError as e: if e.errno == 2: # 'No such file or directory' - logger.info("The extension command already completed; will not track resource usage") - logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e)) + log_cgroup_info("The extension command already completed; will not track resource usage", + send_event=False) + log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(e)), + send_event=False) except Exception as e: - logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(e)) + log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(e)), send_event=False) # Wait for process completion or timeout try: @@ -342,11 +397,139 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh with self._systemd_run_commands_lock: self._systemd_run_commands.remove(process.pid) - def cleanup_legacy_cgroups(self): + +class SystemdCgroupsApiv2(SystemdCgroupsApi): + """ + Cgroups v2 interface via systemd + """ + + def 
is_controller_enabled(self, controller, cgroup_path): """ - Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; - starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If - we find that any of the legacy groups include the PID of the daemon then we need to disable data collection for this - instance (under systemd, moving PIDs across the cgroup file system can produce unpredictable results) + Returns True if the provided controller is enabled at the provided cgroup. + + There are two ways to determine if a controller is enabled at the provided cgroup: + + 1. For non-leaf cgroups, the cgroup.subtree_control shows space separated list of the controllers which are + enabled to control resource distribution from the cgroup to its children. All non-root "cgroup.subtree_control" + files can only contain controllers which are enabled in the parent's "cgroup.subtree_control" file. + $ cat /sys/fs/cgroup/cgroup.subtree_control + cpuset cpu io memory hugetlb pids rdma misc + + 2. For leaf cgroups, the cgroup.subtree_control file will be empty and the presence of "." + prefixed interface files at the path indicate the controller is enabled. + $ ls /sys/fs/cgroup/azure.slice/walinuxagent.service/ + cgroup.controllers cgroup.max.descendants cgroup.threads cpu.pressure cpu.weight.nice memory.high memory.oom.group memory.swap.current memory.zswap.current pids.peak + cgroup.events cgroup.pressure cgroup.type cpu.stat io.pressure memory.low memory.peak memory.swap.events memory.zswap.max + cgroup.freeze cgroup.procs cpu.idle cpu.uclamp.max memory.current memory.max memory.pressure memory.swap.high pids.current + cgroup.kill cgroup.stat cpu.max cpu.uclamp.min memory.events memory.min memory.reclaim memory.swap.max pids.events + cgroup.max.depth cgroup.subtree_control cpu.max.burst cpu.weight memory.events.local memory.numa_stat memory.stat memory.swap.peak pids.max + + If either check is True, the controller is enabled at the cgroup. Check 1 is necessary because no controller + interface files exist at the root cgroup, even if the controller is enabled. """ - return CGroupsApi._foreach_legacy_cgroup(lambda *_: None) + if cgroup_path is not None and controller is not None: + # Check that the controller is enabled in the cgroup.subtree_control file + enabled_controllers_file = os.path.join(cgroup_path, 'cgroup.subtree_control') + if os.path.exists(enabled_controllers_file): + enabled_controllers = fileutil.read_file(enabled_controllers_file).rstrip().split(" ") + if controller in enabled_controllers: + return True + + # Check that the controller interface files exist in the cgroup + if os.path.exists(cgroup_path): + for item in os.listdir(cgroup_path): + if item.startswith(controller + '.'): + return True + + return False + + def get_cgroup_mount_points(self): + # The output of mount is similar to + # $ findmnt -t cgroup2 --noheadings + # /sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot + # + # Since v2 is a unified hierarchy, this method checks if each controller is enabled at the root cgroup. This + # check is necessary because all non-root "cgroup.subtree_control" files can only contain controllers which are + # enabled in the parent's "cgroup.subtree_control" file. 
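A condensed, standalone restatement of the two checks described above (simplified from is_controller_enabled; error handling omitted, paths are examples only):

```python
import os

def controller_enabled(cgroup_path, controller):
    # Check 1 (non-leaf cgroups): controller listed in cgroup.subtree_control
    subtree_control = os.path.join(cgroup_path, "cgroup.subtree_control")
    if os.path.exists(subtree_control):
        with open(subtree_control) as subtree_file:
            if controller in subtree_file.read().split():
                return True
    # Check 2 (leaf cgroups): "<controller>." prefixed interface files exist at the path
    if os.path.exists(cgroup_path):
        return any(name.startswith(controller + ".") for name in os.listdir(cgroup_path))
    return False

# e.g. controller_enabled("/sys/fs/cgroup/azure.slice/walinuxagent.service", "memory")
```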
+ + if not self._cgroup_mountpoints: + cpu = None + memory = None + for line in shellutil.run_command(['findmnt', '-t', 'cgroup2', '--noheadings']).splitlines(): + match = re.search(r'(?P/\S+)\s+cgroup2', line) + if match is not None: + mount_point = match.group('path') + if self.is_controller_enabled('cpu', mount_point): + cpu = mount_point + if self.is_controller_enabled('memory', mount_point): + memory = mount_point + self._cgroup_mountpoints = {'cpu': cpu, 'memory': memory} + + return self._cgroup_mountpoints['cpu'], self._cgroup_mountpoints['memory'] + + def get_unit_cgroup_paths(self, unit_name): + # Ex: ControlGroup=/azure.slice/walinuxagent.service + # controlgroup_path[1:] = azure.slice/walinuxagent.service + controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() + + # Since v2 is a unified hierarchy, we need to check if each controller is enabled for the cgroup. If a + # controller is not enabled, then its controller interface files won't exist at the cgroup path + cpu_cgroup_path = None + if cpu_mount_point is not None: + cgroup_path = os.path.join(cpu_mount_point, controlgroup_path[1:]) + if self.is_controller_enabled('cpu', cgroup_path): + cpu_cgroup_path = cgroup_path + + memory_cgroup_path = None + if memory_mount_point is not None: + cgroup_path = os.path.join(memory_mount_point, controlgroup_path[1:]) + if self.is_controller_enabled('memory', cgroup_path): + memory_cgroup_path = cgroup_path + + return cpu_cgroup_path, memory_cgroup_path + + def get_process_cgroup_paths(self, process_id): + cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) + cpu_mount_point, memory_mount_point = self.get_cgroup_mount_points() + + # Since v2 is a unified hierarchy, we need to check if each controller is enabled for the cgroup. If a + # controller is not enabled, then its controller interface files won't exist at the cgroup path + cpu_cgroup_path = None + if cpu_mount_point is not None and cpu_cgroup_relative_path is not None: + cgroup_path = os.path.join(cpu_mount_point, cpu_cgroup_relative_path) + if self.is_controller_enabled('cpu', cgroup_path): + cpu_cgroup_path = cgroup_path + + memory_cgroup_path = None + if memory_mount_point is not None and memory_cgroup_relative_path is not None: + cgroup_path = os.path.join(memory_mount_point, memory_cgroup_relative_path) + if self.is_controller_enabled('memory', cgroup_path): + memory_cgroup_path = cgroup_path + + return cpu_cgroup_path, memory_cgroup_path + + def get_process_cgroup_relative_paths(self, process_id): + # The contents of the file are similar to + # # cat /proc/1218/cgroup + # 0::/azure.slice/walinuxagent.service + cpu_path = None + memory_path = None + for line in fileutil.read_file("/proc/{0}/cgroup".format(process_id)).splitlines(): + match = re.match(r'\d+::(?P\S+)', line) + if match is not None: + path = match.group('path').lstrip('/') if match.group('path') != '/' else None + memory_path = path + cpu_path = path + + return cpu_path, memory_path + + def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): # pylint: disable=W0613 + """ + Currently, the agent will not enable cgroups v2 or use SystemdCgroupv2Api() to start extension commands. Raising + an exception here for CGroupConfigurator to catch in case v2 is improperly enabled. 
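The exception raised here is consumed by CGroupConfigurator.start_extension_command (changed later in this patch) to fall back to running the extension without cgroups. A rough sketch of that contract, where the wrapper callable and its arguments are illustrative rather than the agent's actual code:

```python
import subprocess
from azurelinuxagent.common.exception import CGroupsException

def start_with_fallback(start_in_cgroup, command):
    """start_in_cgroup stands in for a call into SystemdCgroupsApiv2.start_extension_command."""
    try:
        return start_in_cgroup(command)
    except CGroupsException as exception:
        # Mirrors CGroupConfigurator: disable cgroup enforcement, then re-invoke the command directly.
        print("Running without cgroups: {0}".format(exception))
        return subprocess.Popen(command, shell=False)
```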
+ """ + error_msg = "The agent does not currently support running extensions in cgroups v2" + log_cgroup_warning(error_msg) + raise CGroupsException(msg=error_msg) + diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index 09eb8b55a..7b415d99f 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -24,8 +24,9 @@ from azurelinuxagent.common import conf from azurelinuxagent.common import logger from azurelinuxagent.ga.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup -from azurelinuxagent.ga.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX -from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry +from azurelinuxagent.ga.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX, \ + get_cgroup_api, SystemdCgroupsApiv2 +from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry, log_cgroup_info, log_cgroup_warning from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import systemd @@ -114,18 +115,6 @@ class DisableCgroups(object): EXTENSIONS = "extensions" -def _log_cgroup_info(format_string, *args): - message = format_string.format(*args) - logger.info("[CGI] " + message) - add_event(op=WALAEventOperation.CGroupsInfo, message=message) - - -def _log_cgroup_warning(format_string, *args): - message = format_string.format(*args) - logger.info("[CGW] " + message) # log as INFO for now, in the future it should be logged as WARNING - add_event(op=WALAEventOperation.CGroupsInfo, message=message, is_success=False, log_event=False) - - class CGroupConfigurator(object): """ This class implements the high-level operations on CGroups (e.g. initialization, creation, etc) @@ -166,23 +155,28 @@ def initialize(self): agent_drop_in_file_memory_accounting, agent_drop_in_file_cpu_quota]) self.__cleanup_all_files(files_to_cleanup) self.__reload_systemd_config() - logger.info("Agent reset the quotas if distro: {0} goes from supported to unsupported list", get_distro()) + log_cgroup_info("Agent reset the quotas if distro: {0} goes from supported to unsupported list".format(get_distro()), send_event=False) except Exception as err: - logger.warn("Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err)) + logger.warn("[CGW] Unable to delete Agent drop-in files while resetting the quotas: {0}".format(err)) # check whether cgroup monitoring is supported on the current distro self._cgroups_supported = CGroupsApi.cgroups_supported() if not self._cgroups_supported: - logger.info("Cgroup monitoring is not supported on {0}", get_distro()) + log_cgroup_info("Cgroup monitoring is not supported on {0}".format(get_distro()), send_event=False) + return + + # Determine which version of the Cgroup API should be used. If the correct version can't be determined, + # do not enable resource monitoring/enforcement. 
+ self._cgroups_api = get_cgroup_api() + if self._cgroups_api is None: return # check that systemd is detected correctly - self._cgroups_api = SystemdCgroupsApi() if not systemd.is_systemd(): - _log_cgroup_warning("systemd was not detected on {0}", get_distro()) + log_cgroup_warning("systemd was not detected on {0}".format(get_distro())) return - _log_cgroup_info("systemd version: {0}", systemd.get_version()) + log_cgroup_info("systemd version: {0}".format(systemd.get_version())) if not self.__check_no_legacy_cgroups(): return @@ -190,34 +184,38 @@ def initialize(self): agent_unit_name = systemd.get_agent_unit_name() agent_slice = systemd.get_unit_property(agent_unit_name, "Slice") if agent_slice not in (AZURE_SLICE, "system.slice"): - _log_cgroup_warning("The agent is within an unexpected slice: {0}", agent_slice) + log_cgroup_warning("The agent is within an unexpected slice: {0}".format(agent_slice)) return self.__setup_azure_slice() - cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers() - self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroups(agent_slice, + cpu_controller_root, memory_controller_root = self.__get_cgroup_controllers_mount_points() + self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroup_paths(agent_slice, cpu_controller_root, memory_controller_root) + if self.cgroup_v2_enabled(): + log_cgroup_info("Agent and extensions resource monitoring is not currently supported on cgroups v2") + return + if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None: self.enable() if self._agent_cpu_cgroup_path is not None: - _log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path) + log_cgroup_info("Agent CPU cgroup: {0}".format(self._agent_cpu_cgroup_path)) self.__set_cpu_quota(conf.get_agent_cpu_quota()) CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path)) if self._agent_memory_cgroup_path is not None: - _log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path) + log_cgroup_info("Agent Memory cgroup: {0}".format(self._agent_memory_cgroup_path)) self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path) CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup) - _log_cgroup_info('Agent cgroups enabled: {0}', self._agent_cgroups_enabled) except Exception as exception: - _log_cgroup_warning("Error initializing cgroups: {0}", ustr(exception)) + log_cgroup_warning("Error initializing cgroups: {0}".format(ustr(exception))) finally: + log_cgroup_info('Agent cgroups enabled: {0}'.format(self._agent_cgroups_enabled)) self._initialized = True def __check_no_legacy_cgroups(self): @@ -227,33 +225,22 @@ def __check_no_legacy_cgroups(self): """ legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups() if legacy_cgroups > 0: - _log_cgroup_warning("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.") + log_cgroup_warning("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.") return False return True - def __get_cgroup_controllers(self): - # - # check v1 controllers - # + def __get_cgroup_controllers_mount_points(self): cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points() if cpu_controller_root is not None: - logger.info("The CPU cgroup controller is mounted at {0}", cpu_controller_root) + log_cgroup_info("The CPU cgroup controller is mounted at {0}".format(cpu_controller_root), 
send_event=False) else: - _log_cgroup_warning("The CPU cgroup controller is not mounted") + log_cgroup_warning("The CPU cgroup controller is not mounted") if memory_controller_root is not None: - logger.info("The memory cgroup controller is mounted at {0}", memory_controller_root) + log_cgroup_info("The memory cgroup controller is mounted at {0}".format(memory_controller_root), send_event=False) else: - _log_cgroup_warning("The memory cgroup controller is not mounted") - - # - # check v2 controllers - # - cgroup2_mount_point, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers() - if cgroup2_mount_point is not None: - _log_cgroup_info("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mount_point, - cgroup2_controllers) + log_cgroup_warning("The memory cgroup controller is not mounted") return cpu_controller_root, memory_controller_root @@ -334,7 +321,7 @@ def __setup_azure_slice(): for path, contents in files_to_create: CGroupConfigurator._Impl.__create_unit_file(path, contents) except Exception as exception: - _log_cgroup_warning("Failed to create unit files for the azure slice: {0}", ustr(exception)) + log_cgroup_warning("Failed to create unit files for the azure slice: {0}".format(ustr(exception))) for unit_file in files_to_create: CGroupConfigurator._Impl.__cleanup_unit_file(unit_file) return @@ -345,10 +332,10 @@ def __setup_azure_slice(): def __reload_systemd_config(): # reload the systemd configuration; the new slices will be used once the agent's service restarts try: - logger.info("Executing systemctl daemon-reload...") + log_cgroup_info("Executing systemctl daemon-reload...", send_event=False) shellutil.run_command(["systemctl", "daemon-reload"]) except Exception as exception: - _log_cgroup_warning("daemon-reload failed (create azure slice): {0}", ustr(exception)) + log_cgroup_warning("daemon-reload failed (create azure slice): {0}".format(ustr(exception))) # W0238: Unused private member `_Impl.__create_unit_file(path, contents)` (unused-private-member) @staticmethod @@ -358,7 +345,7 @@ def __create_unit_file(path, contents): # pylint: disable=unused-private-member fileutil.mkdir(parent, mode=0o755) exists = os.path.exists(path) fileutil.write_file(path, contents) - _log_cgroup_info("{0} {1}", "Updated" if exists else "Created", path) + log_cgroup_info("{0} {1}".format("Updated" if exists else "Created", path)) # W0238: Unused private member `_Impl.__cleanup_unit_file(path)` (unused-private-member) @staticmethod @@ -366,9 +353,9 @@ def __cleanup_unit_file(path): # pylint: disable=unused-private-member if os.path.exists(path): try: os.remove(path) - _log_cgroup_info("Removed {0}", path) + log_cgroup_info("Removed {0}".format(path)) except Exception as exception: - _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception)) + log_cgroup_warning("Failed to remove {0}: {1}".format(path, ustr(exception))) @staticmethod def __cleanup_all_files(files_to_cleanup): @@ -376,9 +363,9 @@ def __cleanup_all_files(files_to_cleanup): if os.path.exists(path): try: os.remove(path) - _log_cgroup_info("Removed {0}", path) + log_cgroup_info("Removed {0}".format(path)) except Exception as exception: - _log_cgroup_warning("Failed to remove {0}: {1}", path, ustr(exception)) + log_cgroup_warning("Failed to remove {0}: {1}".format(path, ustr(exception))) @staticmethod def __create_all_files(files_to_create): @@ -387,7 +374,7 @@ def __create_all_files(files_to_create): for path, contents in files_to_create: CGroupConfigurator._Impl.__create_unit_file(path, contents) except 
Exception as exception: - _log_cgroup_warning("Failed to create unit files : {0}", ustr(exception)) + log_cgroup_warning("Failed to create unit files : {0}".format(ustr(exception))) for unit_file in files_to_create: CGroupConfigurator._Impl.__cleanup_unit_file(unit_file) return @@ -411,7 +398,7 @@ def is_extension_resource_limits_setup_completed(self, extension_name, cpu_quota return True return False - def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controller_root): + def __get_agent_cgroup_paths(self, agent_slice, cpu_controller_root, memory_controller_root): agent_unit_name = systemd.get_agent_unit_name() expected_relative_path = os.path.join(agent_slice, agent_unit_name) @@ -419,29 +406,25 @@ def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controlle "self") if cpu_cgroup_relative_path is None: - _log_cgroup_warning("The agent's process is not within a CPU cgroup") + log_cgroup_warning("The agent's process is not within a CPU cgroup") else: if cpu_cgroup_relative_path == expected_relative_path: - _log_cgroup_info('CPUAccounting: {0}', systemd.get_unit_property(agent_unit_name, "CPUAccounting")) - _log_cgroup_info('CPUQuota: {0}', systemd.get_unit_property(agent_unit_name, "CPUQuotaPerSecUSec")) + log_cgroup_info('CPUAccounting: {0}'.format(systemd.get_unit_property(agent_unit_name, "CPUAccounting"))) + log_cgroup_info('CPUQuota: {0}'.format(systemd.get_unit_property(agent_unit_name, "CPUQuotaPerSecUSec"))) else: - _log_cgroup_warning( - "The Agent is not in the expected CPU cgroup; will not enable monitoring. Cgroup:[{0}] Expected:[{1}]", - cpu_cgroup_relative_path, - expected_relative_path) + log_cgroup_warning( + "The Agent is not in the expected CPU cgroup; will not enable monitoring. Cgroup:[{0}] Expected:[{1}]".format(cpu_cgroup_relative_path, expected_relative_path)) cpu_cgroup_relative_path = None # Set the path to None to prevent monitoring if memory_cgroup_relative_path is None: - _log_cgroup_warning("The agent's process is not within a memory cgroup") + log_cgroup_warning("The agent's process is not within a memory cgroup") else: if memory_cgroup_relative_path == expected_relative_path: memory_accounting = systemd.get_unit_property(agent_unit_name, "MemoryAccounting") - _log_cgroup_info('MemoryAccounting: {0}', memory_accounting) + log_cgroup_info('MemoryAccounting: {0}'.format(memory_accounting)) else: - _log_cgroup_info( - "The Agent is not in the expected memory cgroup; will not enable monitoring. CGroup:[{0}] Expected:[{1}]", - memory_cgroup_relative_path, - expected_relative_path) + log_cgroup_warning( + "The Agent is not in the expected memory cgroup; will not enable monitoring. 
CGroup:[{0}] Expected:[{1}]".format(memory_cgroup_relative_path, expected_relative_path)) memory_cgroup_relative_path = None # Set the path to None to prevent monitoring if cpu_controller_root is not None and cpu_cgroup_relative_path is not None: @@ -468,6 +451,9 @@ def agent_enabled(self): def extensions_enabled(self): return self._extensions_cgroups_enabled + def cgroup_v2_enabled(self): + return isinstance(self._cgroups_api, SystemdCgroupsApiv2) + def enable(self): if not self.supported(): raise CGroupsException( @@ -481,7 +467,7 @@ def disable(self, reason, disable_cgroups): self.__reset_agent_cpu_quota() extension_services = self.get_extension_services_list() for extension in extension_services: - logger.info("Resetting extension : {0} and it's services: {1} CPUQuota".format(extension, extension_services[extension])) + log_cgroup_info("Resetting extension : {0} and it's services: {1} CPUQuota".format(extension, extension_services[extension]), send_event=False) self.__reset_extension_cpu_quota(extension_name=extension) self.__reset_extension_services_cpu_quota(extension_services[extension]) self.__reload_systemd_config() @@ -494,9 +480,7 @@ def disable(self, reason, disable_cgroups): self.__reset_agent_cpu_quota() CGroupsTelemetry.stop_tracking(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path)) - message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason) - logger.info(message) # log as INFO for now, in the future it should be logged as WARNING - add_event(op=WALAEventOperation.CGroupsDisabled, message=message, is_success=False, log_event=False) + log_cgroup_warning("Disabling resource usage monitoring. Reason: {0}".format(reason), op=WALAEventOperation.CGroupsDisabled) @staticmethod def __set_cpu_quota(quota): @@ -507,7 +491,7 @@ def __set_cpu_quota(quota): over this setting. """ quota_percentage = "{0}%".format(quota) - _log_cgroup_info("Ensuring the agent's CPUQuota is {0}", quota_percentage) + log_cgroup_info("Ensuring the agent's CPUQuota is {0}".format(quota_percentage)) if CGroupConfigurator._Impl.__try_set_cpu_quota(quota_percentage): CGroupsTelemetry.set_track_throttled_time(True) @@ -519,10 +503,9 @@ def __reset_agent_cpu_quota(): NOTE: This resets the quota on the agent's default dropin file; any local overrides on the VM will take precedence over this setting. 
""" - logger.info("Resetting agent's CPUQuota") + log_cgroup_info("Resetting agent's CPUQuota", send_event=False) if CGroupConfigurator._Impl.__try_set_cpu_quota(''): # setting an empty value resets to the default (infinity) - _log_cgroup_info('CPUQuota: {0}', - systemd.get_unit_property(systemd.get_agent_unit_name(), "CPUQuotaPerSecUSec")) + log_cgroup_info('CPUQuota: {0}'.format(systemd.get_unit_property(systemd.get_agent_unit_name(), "CPUQuotaPerSecUSec"))) # W0238: Unused private member `_Impl.__try_set_cpu_quota(quota)` (unused-private-member) @staticmethod @@ -536,13 +519,13 @@ def __try_set_cpu_quota(quota): # pylint: disable=unused-private-member return True # no need to update the file; return here to avoid doing a daemon-reload CGroupConfigurator._Impl.__create_unit_file(drop_in_file, contents) except Exception as exception: - _log_cgroup_warning('Failed to set CPUQuota: {0}', ustr(exception)) + log_cgroup_warning('Failed to set CPUQuota: {0}', ustr(exception)) return False try: - logger.info("Executing systemctl daemon-reload...") + log_cgroup_info("Executing systemctl daemon-reload...", send_event=False) shellutil.run_command(["systemctl", "daemon-reload"]) except Exception as exception: - _log_cgroup_warning("daemon-reload failed (set quota): {0}", ustr(exception)) + log_cgroup_warning("daemon-reload failed (set quota): {0}", ustr(exception)) return False return True @@ -626,7 +609,7 @@ def _check_processes_in_agent_cgroup(self): if len(unexpected) >= 5: # collect just a small sample break except Exception as exception: - _log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception))) + log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception))) if len(unexpected) > 0: self._report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected) @@ -761,17 +744,17 @@ def start_tracking_unit_cgroups(self, unit_name): cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name) if cpu_cgroup_path is None: - logger.info("The CPU controller is not mounted; will not track resource usage") + log_cgroup_info("The CPU controller is not mounted; will not track resource usage", send_event=False) else: CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, cpu_cgroup_path)) if memory_cgroup_path is None: - logger.info("The Memory controller is not mounted; will not track resource usage") + log_cgroup_info("The Memory controller is not mounted; will not track resource usage", send_event=False) else: CGroupsTelemetry.track_cgroup(MemoryCgroup(unit_name, memory_cgroup_path)) except Exception as exception: - logger.info("Failed to start tracking resource usage for the extension: {0}", ustr(exception)) + log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(exception)), send_event=False) def stop_tracking_unit_cgroups(self, unit_name): """ @@ -787,7 +770,7 @@ def stop_tracking_unit_cgroups(self, unit_name): CGroupsTelemetry.stop_tracking(MemoryCgroup(unit_name, memory_cgroup_path)) except Exception as exception: - logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception)) + log_cgroup_info("Failed to stop tracking resource usage for the extension service: {0}".format(ustr(exception)), send_event=False) def stop_tracking_extension_cgroups(self, extension_name): """ @@ -809,7 +792,7 @@ def stop_tracking_extension_cgroups(self, extension_name): CGroupsTelemetry.stop_tracking(MemoryCgroup(extension_name, 
memory_cgroup_path)) except Exception as exception: - logger.info("Failed to stop tracking resource usage for the extension service: {0}", ustr(exception)) + log_cgroup_info("Failed to stop tracking resource usage for the extension service: {0}".format(ustr(exception)), send_event=False) def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): @@ -836,6 +819,12 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh extension_name, ustr(exception)) self.disable(reason, DisableCgroups.ALL) # fall-through and re-invoke the extension + except CGroupsException as exception: + reason = 'Failed to start {0} using cgroups, will try invoking the extension directly. Error: {1}'.format( + extension_name, ustr(exception)) + self.disable(reason, DisableCgroups.ALL) + # fall-through and re-invoke the extension + # subprocess-popen-preexec-fn Disabled: code is not multi-threaded process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout, stderr=stderr, preexec_fn=os.setsid) # pylint: disable=W1509 @@ -867,14 +856,14 @@ def setup_extension_slice(self, extension_name, cpu_quota): try: cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity) if cpu_quota == "": - _log_cgroup_info("CPUQuota not set for {0}", extension_name) + log_cgroup_info("CPUQuota not set for {0}".format(extension_name)) else: - _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", extension_name, cpu_quota) + log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}".format(extension_name, cpu_quota)) slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name, cpu_quota=cpu_quota) CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents) except Exception as exception: - _log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}", extension_name, + log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}", extension_name, ustr(exception)) CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path) @@ -916,7 +905,7 @@ def set_extension_services_cpu_memory_quota(self, services_list): cpu_quota = service.get('cpuQuotaPercentage', None) if cpu_quota is not None: cpu_quota = str(cpu_quota) + "%" - _log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", service_name, cpu_quota) + log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}".format(service_name, cpu_quota)) drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA) cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota) files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents)) @@ -950,7 +939,7 @@ def __reset_extension_services_cpu_quota(self, services_list): files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents)) self.__create_all_files(files_to_create) except Exception as exception: - _log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}', service_name, ustr(exception)) + log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}', service_name, ustr(exception)) def remove_extension_services_drop_in_files(self, services_list): """ @@ -975,7 +964,7 @@ def remove_extension_services_drop_in_files(self, services_list): files_to_cleanup.append(drop_in_file_cpu_quota) CGroupConfigurator._Impl.__cleanup_all_files(files_to_cleanup) - _log_cgroup_info("Drop in files removed for {0}".format(service_name)) + 
log_cgroup_info("Drop in files removed for {0}".format(service_name)) def stop_tracking_extension_services_cgroups(self, services_list): """ @@ -1016,10 +1005,10 @@ def get_extension_services_list(): services = resource_limits.get('services') if resource_limits else None extensions_services[extensions_name] = services except (IOError, OSError) as e: - _log_cgroup_warning( + log_cgroup_warning( 'Failed to load manifest file ({0}): {1}'.format(manifest_path, e.strerror)) except ValueError: - _log_cgroup_warning('Malformed manifest file ({0}).'.format(manifest_path)) + log_cgroup_warning('Malformed manifest file ({0}).'.format(manifest_path)) return extensions_services # unique instance for the singleton diff --git a/azurelinuxagent/ga/cgroupstelemetry.py b/azurelinuxagent/ga/cgroupstelemetry.py index 5943b45ad..5a564de63 100644 --- a/azurelinuxagent/ga/cgroupstelemetry.py +++ b/azurelinuxagent/ga/cgroupstelemetry.py @@ -17,10 +17,23 @@ import threading from azurelinuxagent.common import logger +from azurelinuxagent.common.event import WALAEventOperation, add_event from azurelinuxagent.ga.cgroup import CpuCgroup from azurelinuxagent.common.future import ustr +def log_cgroup_info(formatted_string, op=WALAEventOperation.CGroupsInfo, send_event=True): + logger.info("[CGI] " + formatted_string) + if send_event: + add_event(op=op, message=formatted_string) + + +def log_cgroup_warning(formatted_string, op=WALAEventOperation.CGroupsInfo, send_event=True): + logger.info("[CGW] " + formatted_string) # log as INFO for now, in the future it should be logged as WARNING + if send_event: + add_event(op=op, message=formatted_string, is_success=False, log_event=False) + + class CGroupsTelemetry(object): """ """ diff --git a/tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers b/tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers deleted file mode 100644 index 2a03d239d..000000000 --- a/tests/data/cgroups/sys_fs_cgroup_unified_cgroup.controllers +++ /dev/null @@ -1,7 +0,0 @@ -io -memory -pids -perf_event -rdma -cpu -freezer \ No newline at end of file diff --git a/tests/data/cgroups/proc_pid_cgroup b/tests/data/cgroups/v1/proc_pid_cgroup similarity index 100% rename from tests/data/cgroups/proc_pid_cgroup rename to tests/data/cgroups/v1/proc_pid_cgroup diff --git a/tests/data/cgroups/proc_self_cgroup b/tests/data/cgroups/v1/proc_self_cgroup similarity index 100% rename from tests/data/cgroups/proc_self_cgroup rename to tests/data/cgroups/v1/proc_self_cgroup diff --git a/tests/data/cgroups/v1/sys_fs_cgroup_unified_cgroup.subtree_control b/tests/data/cgroups/v1/sys_fs_cgroup_unified_cgroup.subtree_control new file mode 100644 index 000000000..e69de29bb diff --git a/tests/data/cgroups/v1_and_v2/proc_pid_cgroup b/tests/data/cgroups/v1_and_v2/proc_pid_cgroup new file mode 100644 index 000000000..179c59daa --- /dev/null +++ b/tests/data/cgroups/v1_and_v2/proc_pid_cgroup @@ -0,0 +1,12 @@ +12:devices:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope +11:perf_event:/ +10:rdma:/ +9:blkio:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope +8:net_cls,net_prio:/ +7:freezer:/ +6:hugetlb:/ +4:cpuset:/ +3:cpu,cpuacct:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope +2:pids:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope +1:name=systemd:/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope 
+0::/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope diff --git a/tests/data/cgroups/v1_and_v2/proc_self_cgroup b/tests/data/cgroups/v1_and_v2/proc_self_cgroup new file mode 100644 index 000000000..40e7dd5b1 --- /dev/null +++ b/tests/data/cgroups/v1_and_v2/proc_self_cgroup @@ -0,0 +1,12 @@ +12:blkio:/system.slice/walinuxagent.service +11:cpu,cpuacct:/system.slice/walinuxagent.service +10:devices:/system.slice/walinuxagent.service +9:pids:/system.slice/walinuxagent.service +7:freezer:/ +6:hugetlb:/ +5:perf_event:/ +4:net_cls,net_prio:/ +3:cpuset:/ +2:rdma:/ +1:name=systemd:/system.slice/walinuxagent.service +0::/system.slice/walinuxagent.service diff --git a/tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control b/tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control new file mode 100644 index 000000000..2142c3ad3 --- /dev/null +++ b/tests/data/cgroups/v1_and_v2/sys_fs_cgroup_cgroup.subtree_control @@ -0,0 +1 @@ +memory diff --git a/tests/data/cgroups/v2/proc_pid_cgroup b/tests/data/cgroups/v2/proc_pid_cgroup new file mode 100644 index 000000000..8a1f8d0be --- /dev/null +++ b/tests/data/cgroups/v2/proc_pid_cgroup @@ -0,0 +1 @@ +0::/system.slice/Microsoft.A.Sample.Extension_1.0.1_aeac05dc-8c24-4542-95f2-a0d6be1c5ba7.scope diff --git a/tests/data/cgroups/v2/proc_self_cgroup b/tests/data/cgroups/v2/proc_self_cgroup new file mode 100644 index 000000000..0027b4040 --- /dev/null +++ b/tests/data/cgroups/v2/proc_self_cgroup @@ -0,0 +1 @@ +0::/system.slice/walinuxagent.service diff --git a/tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control b/tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control new file mode 100644 index 000000000..c94e05c42 --- /dev/null +++ b/tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control @@ -0,0 +1 @@ +cpuset cpu io memory pids diff --git a/tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control_empty b/tests/data/cgroups/v2/sys_fs_cgroup_cgroup.subtree_control_empty new file mode 100644 index 000000000..e69de29bb diff --git a/tests/ga/test_cgroupapi.py b/tests/ga/test_cgroupapi.py index ad8ef80c2..7064ea51f 100644 --- a/tests/ga/test_cgroupapi.py +++ b/tests/ga/test_cgroupapi.py @@ -22,11 +22,15 @@ import subprocess import tempfile -from azurelinuxagent.ga.cgroupapi import CGroupsApi, SystemdCgroupsApi +from azurelinuxagent.common.exception import CGroupsException +from azurelinuxagent.common.utils.fileutil import read_file +from azurelinuxagent.ga import cgroupapi +from azurelinuxagent.ga.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdCgroupsApiv1, SystemdCgroupsApiv2 from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.osutil import systemd from azurelinuxagent.common.utils import fileutil -from tests.lib.mock_cgroup_environment import mock_cgroup_environment +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, mock_cgroup_v2_environment, \ + mock_cgroup_v1_and_v2_environment from tests.lib.tools import AgentTestCase, patch, mock_sleep from tests.lib.cgroups_tools import CGroupsTools @@ -47,7 +51,24 @@ def tearDown(self): AgentTestCase.tearDown(self) -class CGroupsApiTestCase(_MockedFileSystemTestCase): +class CGroupsApiTestCase(AgentTestCase): + def test_get_cgroup_api_is_v1_when_v1_controllers_mounted(self): + with mock_cgroup_v1_environment(self.tmp_dir): + self.assertIsInstance(cgroupapi.get_cgroup_api(), SystemdCgroupsApiv1) + + def test_get_cgroup_api_is_v2_when_v2_controllers_mounted(self): + with 
mock_cgroup_v2_environment(self.tmp_dir): + self.assertIsInstance(cgroupapi.get_cgroup_api(), SystemdCgroupsApiv2) + + def test_get_cgroup_api_is_v1_when_v1_and_v2_controllers_mounted(self): + with mock_cgroup_v1_and_v2_environment(self.tmp_dir): + self.assertIsInstance(cgroupapi.get_cgroup_api(), SystemdCgroupsApiv1) + + def test_get_cgroup_api_is_none_when_no_controllers_mounted(self): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points", return_value=(None,None)): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points", return_value=(None,None)): + self.assertIsNone(cgroupapi.get_cgroup_api()) + def test_cgroups_should_be_supported_only_on_ubuntu16_centos7dot4_redhat7dot4_and_later_versions(self): test_cases = [ (['ubuntu', '16.04', 'xenial'], True), @@ -81,79 +102,144 @@ def test_cgroups_should_be_supported_only_on_ubuntu16_centos7dot4_redhat7dot4_an class SystemdCgroupsApiTestCase(AgentTestCase): def test_get_systemd_version_should_return_a_version_number(self): - with mock_cgroup_environment(self.tmp_dir): - version_info = systemd.get_version() - found = re.search(r"systemd \d+", version_info) is not None - self.assertTrue(found, "Could not determine the systemd version: {0}".format(version_info)) - - def test_get_cpu_and_memory_mount_points_should_return_the_cgroup_mount_points(self): - with mock_cgroup_environment(self.tmp_dir): - cpu, memory = SystemdCgroupsApi().get_cgroup_mount_points() - self.assertEqual(cpu, '/sys/fs/cgroup/cpu,cpuacct', "The mount point for the CPU controller is incorrect") - self.assertEqual(memory, '/sys/fs/cgroup/memory', "The mount point for the memory controller is incorrect") - - def test_get_service_cgroup_paths_should_return_the_cgroup_mount_points(self): - with mock_cgroup_environment(self.tmp_dir): - cpu, memory = SystemdCgroupsApi().get_unit_cgroup_paths("extension.service") - self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service', - "The mount point for the CPU controller is incorrect") - self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/extension.service', - "The mount point for the memory controller is incorrect") - - def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_relative_paths(self): - with mock_cgroup_environment(self.tmp_dir): - cpu, memory = SystemdCgroupsApi.get_process_cgroup_relative_paths('self') - self.assertEqual(cpu, "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") - self.assertEqual(memory, "system.slice/walinuxagent.service", "The relative memory for the CPU cgroup is incorrect") - - def test_get_cgroup2_controllers_should_return_the_v2_cgroup_controllers(self): - with mock_cgroup_environment(self.tmp_dir): - mount_point, controllers = SystemdCgroupsApi.get_cgroup2_controllers() - - self.assertEqual(mount_point, "/sys/fs/cgroup/unified", "Invalid mount point for V2 cgroups") - self.assertIn("cpu", controllers, "The CPU controller is not in the list of V2 controllers") - self.assertIn("memory", controllers, "The memory controller is not in the list of V2 controllers") + # We expect same behavior for v1 and v2 + mock_envs = [mock_cgroup_v1_environment(self.tmp_dir), mock_cgroup_v2_environment(self.tmp_dir)] + for env in mock_envs: + with env: + version_info = systemd.get_version() + found = re.search(r"systemd \d+", version_info) is not None + self.assertTrue(found, "Could not determine the systemd version: {0}".format(version_info)) + + def 
test_is_cpu_or_memory_mounted_true_if_only_memory_mounted(self):
+ with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=(None, '/sys/fs/cgroup/memory')):
+ self.assertTrue(SystemdCgroupsApi().is_cpu_or_memory_mounted())
+
+ def test_is_cpu_or_memory_mounted_true_if_only_cpu_mounted(self):
+ with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=('/sys/fs/cgroup/cpu,cpuacct', None)):
+ self.assertTrue(SystemdCgroupsApi().is_cpu_or_memory_mounted())
+
+ def test_is_cpu_or_memory_mounted_true_if_cpu_and_memory_mounted(self):
+ with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=('/sys/fs/cgroup/cpu,cpuacct', '/sys/fs/cgroup/memory')):
+ self.assertTrue(SystemdCgroupsApi().is_cpu_or_memory_mounted())
+
+ def test_is_cpu_or_memory_mounted_false_if_cpu_and_memory_not_mounted(self):
+ with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApi.get_cgroup_mount_points", return_value=(None, None)):
+ self.assertFalse(SystemdCgroupsApi().is_cpu_or_memory_mounted())
+
+ def test_get_mounted_controllers_has_cpu_and_memory_controllers(self):
+ with mock_cgroup_v1_environment(self.tmp_dir):
+ mounted_controllers = cgroupapi.get_cgroup_api().get_mounted_controllers()
+ self.assertTrue("cpu" in mounted_controllers)
+ self.assertTrue("memory" in mounted_controllers)
+
+ with mock_cgroup_v2_environment(self.tmp_dir):
+ mounted_controllers = cgroupapi.get_cgroup_api().get_mounted_controllers()
+ self.assertTrue("cpu" in mounted_controllers)
+ self.assertTrue("memory" in mounted_controllers)
+
+ with mock_cgroup_v1_and_v2_environment(self.tmp_dir):
+ mounted_controllers = cgroupapi.get_cgroup_api().get_mounted_controllers() # API will be v1 since this environment has CPU mounted in v1
+ self.assertTrue("cpu" in mounted_controllers)
+ self.assertFalse("memory" in mounted_controllers) # This environment has memory mounted in v2
def test_get_unit_property_should_return_the_value_of_the_given_property(self):
- with mock_cgroup_environment(self.tmp_dir):
- cpu_accounting = systemd.get_unit_property("walinuxagent.service", "CPUAccounting")
-
- self.assertEqual(cpu_accounting, "no", "Property {0} of {1} is incorrect".format("CPUAccounting", "walinuxagent.service"))
+ # We expect same behavior for v1 and v2
+ mock_envs = [mock_cgroup_v1_environment(self.tmp_dir), mock_cgroup_v2_environment(self.tmp_dir)]
+ for env in mock_envs:
+ with env:
+ cpu_accounting = systemd.get_unit_property("walinuxagent.service", "CPUAccounting")
- def assert_cgroups_created(self, extension_cgroups):
- self.assertEqual(len(extension_cgroups), 2,
- 'start_extension_command did not return the expected number of cgroups')
+ self.assertEqual(cpu_accounting, "no", "Property {0} of {1} is incorrect".format("CPUAccounting", "walinuxagent.service"))
- cpu_found = memory_found = False
- for cgroup in extension_cgroups:
- match = re.match(
- r'^/sys/fs/cgroup/(cpu|memory)/system.slice/Microsoft.Compute.TestExtension_1\.2\.3\_([a-f0-9-]+)\.scope$',
- cgroup.path)
+class SystemdCgroupsApiv1TestCase(AgentTestCase):
+ def test_get_unit_cgroup_paths_should_return_the_cgroup_v1_mount_points(self):
+ with mock_cgroup_v1_environment(self.tmp_dir):
+ cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service")
+ self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service',
+ "The mount point for the CPU controller is incorrect")
+ self.assertIn(memory, 
'/sys/fs/cgroup/memory/system.slice/extension.service', + "The mount point for the memory controller is incorrect") - self.assertTrue(match is not None, "Unexpected path for cgroup: {0}".format(cgroup.path)) + def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v1_controller_not_mounted(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The mount point for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup/memory')): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIsNone(cpu, "The mount point for the cpu controller is None so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/extension.service', + "The mount point for the memory controller is incorrect") + + def test_get_process_cgroup_paths_should_return_the_cgroup_v1_mount_points(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") - if match.group(1) == 'cpu': - cpu_found = True - if match.group(1) == 'memory': - memory_found = True + def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v1_controller_not_mounted(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The mount point for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup/memory')): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, "The mount point for the CPU controller is None so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") + + def test_get_process_cgroup_v1_path_should_return_None_if_either_relative_path_is_None(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_relative_paths', return_value=('system.slice/walinuxagent.service', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The 
relative cgroup path for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_relative_paths', return_value=(None, 'system.slice/walinuxagent.service')): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, "The relative cgroup path for the cpu controller is None so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") + + def test_get_cpu_and_memory_mount_points_should_return_the_cgroup_v1_mount_points(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_cgroup_mount_points() + self.assertEqual(cpu, '/sys/fs/cgroup/cpu,cpuacct', "The mount point for the CPU controller is incorrect") + self.assertEqual(memory, '/sys/fs/cgroup/memory', "The mount point for the memory controller is incorrect") - self.assertTrue(cpu_found, 'start_extension_command did not return a cpu cgroup') - self.assertTrue(memory_found, 'start_extension_command did not return a memory cgroup') + def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v1_relative_paths(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths('self') + self.assertEqual(cpu, "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") + self.assertEqual(memory, "system.slice/walinuxagent.service", "The relative memory for the CPU cgroup is incorrect") @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_start_extension_command_should_return_the_command_output(self, _): - original_popen = subprocess.Popen + def test_start_extension_cgroups_v1_command_should_return_the_command_output(self, _): + with mock_cgroup_v1_environment(self.tmp_dir): + original_popen = subprocess.Popen - def mock_popen(command, *args, **kwargs): - if command.startswith('systemd-run --property'): - command = "echo TEST_OUTPUT" - return original_popen(command, *args, **kwargs) + def mock_popen(command, *args, **kwargs): + if isinstance(command, str) and command.startswith('systemd-run --property'): + command = "echo TEST_OUTPUT" + return original_popen(command, *args, **kwargs) - with mock_cgroup_environment(self.tmp_dir): with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file: - with patch("subprocess.Popen", side_effect=mock_popen) as popen_patch: # pylint: disable=unused-variable - command_output = SystemdCgroupsApi().start_extension_command( + with patch("subprocess.Popen", + side_effect=mock_popen) as popen_patch: # pylint: disable=unused-variable + command_output = cgroupapi.get_cgroup_api().start_extension_command( extension_name="Microsoft.Compute.TestExtension-1.2.3", command="A_TEST_COMMAND", cmd_name="test", @@ -167,9 +253,9 @@ def mock_popen(command, *args, **kwargs): self.assertIn("[stdout]\nTEST_OUTPUT\n", command_output, "The test output was not captured") @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_start_extension_command_should_execute_the_command_in_a_cgroup(self, _): - with mock_cgroup_environment(self.tmp_dir): - SystemdCgroupsApi().start_extension_command( + def test_start_extension_cgroups_v1_command_should_execute_the_command_in_a_cgroup(self, _): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroupapi.get_cgroup_api().start_extension_command( 
extension_name="Microsoft.Compute.TestExtension-1.2.3", command="test command", cmd_name="test", @@ -183,18 +269,20 @@ def test_start_extension_command_should_execute_the_command_in_a_cgroup(self, _) tracked = CGroupsTelemetry._tracked self.assertTrue( - any(cg for cg in tracked.values() if cg.name == 'Microsoft.Compute.TestExtension-1.2.3' and 'cpu' in cg.path), + any(cg for cg in tracked.values() if + cg.name == 'Microsoft.Compute.TestExtension-1.2.3' and 'cpu' in cg.path), "The extension's CPU is not being tracked") self.assertTrue( - any(cg for cg in tracked.values() if cg.name == 'Microsoft.Compute.TestExtension-1.2.3' and 'memory' in cg.path), + any(cg for cg in tracked.values() if + cg.name == 'Microsoft.Compute.TestExtension-1.2.3' and 'memory' in cg.path), "The extension's Memory is not being tracked") @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_start_extension_command_should_use_systemd_to_execute_the_command(self, _): - with mock_cgroup_environment(self.tmp_dir): + def test_start_extension_cgroups_v1_command_should_use_systemd_to_execute_the_command(self, _): + with mock_cgroup_v1_environment(self.tmp_dir): with patch("subprocess.Popen", wraps=subprocess.Popen) as popen_patch: - SystemdCgroupsApi().start_extension_command( + cgroupapi.get_cgroup_api().start_extension_command( extension_name="Microsoft.Compute.TestExtension-1.2.3", command="the-test-extension-command", cmd_name="test", @@ -205,12 +293,242 @@ def test_start_extension_command_should_use_systemd_to_execute_the_command(self, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if "the-test-extension-command" in args[0]] + extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if + "the-test-extension-command" in args[0]] self.assertEqual(1, len(extension_calls), "The extension should have been invoked exactly once") self.assertIn("systemd-run", extension_calls[0], "The extension should have been invoked using systemd") +class SystemdCgroupsApiv2TestCase(AgentTestCase): + def test_is_controller_enabled_should_return_False_if_cgroup_is_None(self): + with mock_cgroup_v2_environment(self.tmp_dir): + self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled('cpu', None)) + + def test_is_controller_enabled_should_return_False_if_controller_is_None(self): + with mock_cgroup_v2_environment(self.tmp_dir): + self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled(None, '/sys/fs/cgroup')) + + def test_is_controller_enabled_should_return_False_if_cgroup_path_does_not_exist(self): + with mock_cgroup_v2_environment(self.tmp_dir): + self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled('cpu', '/path/that/does/not/exist')) + + def test_is_controller_enabled_should_return_False_if_controller_is_not_in_subtree_control_file_and_controller_interface_files_do_not_exist(self): + with mock_cgroup_v2_environment(self.tmp_dir): + self.assertFalse(cgroupapi.get_cgroup_api().is_controller_enabled('cpu', '/sys/fs/cgroup/azure.slice/walinuxagent.service')) + + def test_is_controller_enabled_should_return_True_if_controller_is_in_subtree_control_file(self): + with mock_cgroup_v2_environment(self.tmp_dir): + # Mock the cgroup.subtree_control to include memory controller + def mock_read_file(path): + if "/sys/fs/cgroup/azure.slice/walinuxagent.service/cgroup.subtree_control" in path: + return 'io memory pids\n' + return read_file(path) + + with patch('azurelinuxagent.common.utils.fileutil.read_file', 
side_effect=mock_read_file): + self.assertTrue(cgroupapi.get_cgroup_api().is_controller_enabled('memory', '/sys/fs/cgroup/azure.slice/walinuxagent.service')) + + def test_is_controller_enabled_should_return_True_if_controller_interface_file_exists(self): + original_list_dir = os.listdir + + # Mock the walinuxagent.service directory to include memory controller interface files + def mock_os_list_dir(path): + if "/sys/fs/cgroup/azure.slice/walinuxagent.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat'] + return original_list_dir(path) + + with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: + # Mock service directory + mock_env._mock_mkdir('/sys/fs/cgroup/azure.slice/walinuxagent.service') + + with patch('os.listdir', side_effect=mock_os_list_dir): + self.assertTrue(cgroupapi.get_cgroup_api().is_controller_enabled('memory', '/sys/fs/cgroup/azure.slice/walinuxagent.service')) + + def test_get_unit_cgroup_paths_should_return_the_cgroup_v2_mount_points(self): + original_list_dir = os.listdir + + # Mock the extension.service directory to include controller interface files + def mock_os_list_dir(path): + if "/sys/fs/cgroup/system.slice/extension.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat', 'cpu.stat'] + return original_list_dir(path) + + with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: + # Mock service directory + mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/extension.service') + + with patch('os.listdir', side_effect=mock_os_list_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertEqual(cpu, '/sys/fs/cgroup/system.slice/extension.service', + "The mount point for the CPU controller is incorrect") + self.assertEqual(memory, '/sys/fs/cgroup/system.slice/extension.service', + "The mount point for the memory controller is incorrect") + + def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_mounted(self): + with mock_cgroup_v2_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.is_controller_enabled', return_value=True): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=('/sys/fs/cgroup', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIn(cpu, '/sys/fs/cgroup/system.slice/extension.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The mount point for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup')): + cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service") + self.assertIsNone(cpu, "The mount point for the cpu controller is None so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/system.slice/extension.service', + "The mount point for the memory controller is incorrect") + + def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_enabled(self): + original_list_dir = os.listdir + + # Mock the extension.service directory to include only cpu controller interface files + def mock_os_list_dir_cpu(path): + if "/sys/fs/cgroup/system.slice/extension.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'cpu.stat'] + return original_list_dir(path) + + # Mock the extension.service directory to 
include only memory controller interface files
+ def mock_os_list_dir_memory(path):
+ if "/sys/fs/cgroup/system.slice/extension.service" in path:
+ return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat']
+ return original_list_dir(path)
+
+ with mock_cgroup_v2_environment(self.tmp_dir) as mock_env:
+ # Mock service directory
+ mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/extension.service')
+
+ with patch('os.listdir', side_effect=mock_os_list_dir_cpu):
+ cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service")
+ self.assertIn(cpu, '/sys/fs/cgroup/system.slice/extension.service',
+ "The mount point for the CPU controller is incorrect")
+ self.assertIsNone(memory,
+ "The memory controller is not enabled so unit cgroup should be None")
+
+ with patch('os.listdir', side_effect=mock_os_list_dir_memory):
+ cpu, memory = cgroupapi.get_cgroup_api().get_unit_cgroup_paths("extension.service")
+ self.assertIsNone(cpu, "The cpu controller is not enabled so unit cgroup should be None")
+ self.assertIn(memory, '/sys/fs/cgroup/system.slice/extension.service',
+ "The mount point for the memory controller is incorrect")
+
+ def test_get_process_cgroup_paths_should_return_the_cgroup_v2_mount_points(self):
+ original_list_dir = os.listdir
+
+ # Mock the walinuxagent.service directory to include controller interface files
+ def mock_os_list_dir(path):
+ if "/sys/fs/cgroup/system.slice/walinuxagent.service" in path:
+ return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat', 'cpu.stat']
+ return original_list_dir(path)
+
+ with mock_cgroup_v2_environment(self.tmp_dir) as mock_env:
+ # Mock service directory
+ mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/walinuxagent.service')
+
+ with patch('os.listdir', side_effect=mock_os_list_dir):
+ cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self")
+ self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service',
+ "The mount point for the CPU controller is incorrect")
+ self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service',
+ "The mount point for the memory controller is incorrect")
+
+ def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_mounted(self):
+ with mock_cgroup_v2_environment(self.tmp_dir):
+ with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.is_controller_enabled', return_value=True):
+ with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=('/sys/fs/cgroup', None)):
+ cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self")
+ self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service',
+ "The mount point for the CPU controller is incorrect")
+ self.assertIsNone(memory,
+ "The mount point for the memory controller is None so unit cgroup should be None")
+
+ with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_cgroup_mount_points', return_value=(None, '/sys/fs/cgroup')):
+ cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self")
+ self.assertIsNone(cpu, "The mount point for the CPU controller is None so unit cgroup should be None")
+ self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service',
+ "The mount point for the memory controller is incorrect")
+
+ def test_get_process_cgroup_v2_path_should_return_None_if_either_relative_path_is_None(self):
+ with mock_cgroup_v2_environment(self.tmp_dir):
+ with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.is_controller_enabled', 
return_value=True): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_process_cgroup_relative_paths', return_value=('system.slice/walinuxagent.service', None)): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The relative cgroup path for the memory controller is None so unit cgroup should be None") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv2.get_process_cgroup_relative_paths', return_value=(None, 'system.slice/walinuxagent.service')): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, "The relative cgroup path for the cpu controller is None so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") + + def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_enabled(self): + original_list_dir = os.listdir + + # Mock the walinuxagent.service directory to include memory controller interface files + def mock_os_list_dir_memory(path): + if "/sys/fs/cgroup/system.slice/walinuxagent.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'memory.stat'] + return original_list_dir(path) + + # Mock the walinuxagent.service directory to include cpu controller interface files + def mock_os_list_dir_cpu(path): + if "/sys/fs/cgroup/system.slice/walinuxagent.service" in path: + return ['cgroup.controllers', 'cgroup.subtree_control', 'cpu.stat'] + return original_list_dir(path) + + with mock_cgroup_v2_environment(self.tmp_dir) as mock_env: + # Mock service directory + mock_env._mock_mkdir('/sys/fs/cgroup/system.slice/walinuxagent.service') + + with patch('os.listdir', side_effect=mock_os_list_dir_cpu): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the CPU controller is incorrect") + self.assertIsNone(memory, + "The memory controller is not enabled so unit cgroup should be None") + + with patch('os.listdir', side_effect=mock_os_list_dir_memory): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_paths("self") + self.assertIsNone(cpu, "The cpu controller is not enabled so unit cgroup should be None") + self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', + "The mount point for the memory controller is incorrect") + + def test_get_cpu_and_memory_mount_points_should_return_the_cgroup_v2_mount_points(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_cgroup_mount_points() + self.assertEqual(cpu, '/sys/fs/cgroup', "The mount point for the CPU controller is incorrect") + self.assertEqual(memory, '/sys/fs/cgroup', "The mount point for the memory controller is incorrect") + + def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v2_relative_paths(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cpu, memory = cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths('self') + self.assertEqual(cpu, "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") + self.assertEqual(memory, "system.slice/walinuxagent.service", "The relative memory for the CPU cgroup is incorrect") + + @patch('time.sleep', 
side_effect=lambda _: mock_sleep()) + def test_start_extension_cgroups_v2_command_should_raise_exception(self, _): + with mock_cgroup_v2_environment(self.tmp_dir): + with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file: + cgroups_exception_raised = False + try: + cgroupapi.get_cgroup_api().start_extension_command( + extension_name="Microsoft.Compute.TestExtension-1.2.3", + command="A_TEST_COMMAND", + cmd_name="test", + shell=True, + timeout=300, + cwd=self.tmp_dir, + env={}, + stdout=output_file, + stderr=output_file) + except CGroupsException: + cgroups_exception_raised = True + self.assertTrue(cgroups_exception_raised) + + class SystemdCgroupsApiMockedFileSystemTestCase(_MockedFileSystemTestCase): def test_cleanup_legacy_cgroups_should_remove_legacy_cgroups(self): # Set up a mock /var/run/waagent.pid file diff --git a/tests/ga/test_cgroupconfigurator.py b/tests/ga/test_cgroupconfigurator.py index 82c86c956..b097a2602 100644 --- a/tests/ga/test_cgroupconfigurator.py +++ b/tests/ga/test_cgroupconfigurator.py @@ -35,7 +35,8 @@ from azurelinuxagent.common.future import ustr from azurelinuxagent.common.utils import shellutil, fileutil from tests.lib.mock_environment import MockCommand -from tests.lib.mock_cgroup_environment import mock_cgroup_environment, UnitFilePaths +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, UnitFilePaths, mock_cgroup_v2_environment, \ + mock_cgroup_v1_and_v2_environment from tests.lib.tools import AgentTestCase, patch, mock_sleep, data_dir, is_python_version_26_or_34, skip_if_predicate_true from tests.lib.miscellaneous_tools import format_processes, wait_for @@ -51,7 +52,7 @@ def _get_cgroup_configurator(self, initialize=True, enable=True, mock_commands=N CGroupConfigurator._instance = None configurator = CGroupConfigurator.get_instance() CGroupsTelemetry.reset() - with mock_cgroup_environment(self.tmp_dir) as mock_environment: + with mock_cgroup_v1_environment(self.tmp_dir) as mock_environment: if mock_commands is not None: for command in mock_commands: mock_environment.add_command(command) @@ -64,10 +65,55 @@ def _get_cgroup_configurator(self, initialize=True, enable=True, mock_commands=N configurator.initialize() yield configurator - def test_initialize_should_enable_cgroups(self): + @contextlib.contextmanager + def _get_cgroup_configurator_v2(self, initialize=True, enable=True, mock_commands=None): + CGroupConfigurator._instance = None + configurator = CGroupConfigurator.get_instance() + CGroupsTelemetry.reset() + with mock_cgroup_v2_environment(self.tmp_dir) as mock_environment: + if mock_commands is not None: + for command in mock_commands: + mock_environment.add_command(command) + configurator.mocks = mock_environment + if initialize: + if not enable: + with patch.object(configurator, "enable"): + configurator.initialize() + else: + configurator.initialize() + yield configurator + + @contextlib.contextmanager + def _get_cgroup_configurator_v1_and_v2(self, initialize=True, enable=True, mock_commands=None): + CGroupConfigurator._instance = None + configurator = CGroupConfigurator.get_instance() + CGroupsTelemetry.reset() + with mock_cgroup_v1_and_v2_environment(self.tmp_dir) as mock_environment: + if mock_commands is not None: + for command in mock_commands: + mock_environment.add_command(command) + configurator.mocks = mock_environment + if initialize: + if not enable: + with patch.object(configurator, "enable"): + configurator.initialize() + else: + configurator.initialize() + yield configurator + + def 
test_initialize_should_enable_cgroups_v1(self): with self._get_cgroup_configurator() as configurator: self.assertTrue(configurator.enabled(), "cgroups were not enabled") + def test_initialize_should_not_enable_cgroups_v2(self): + with self._get_cgroup_configurator_v2() as configurator: + self.assertFalse(configurator.enabled(), "cgroups were enabled") + + def test_initialize_should_not_enable_when_cgroup_api_is_none(self): + with patch('azurelinuxagent.ga.cgroupconfigurator.get_cgroup_api', return_value=None): + with self._get_cgroup_configurator() as configurator: + self.assertFalse(configurator.enabled(), "cgroups were enabled") + def test_initialize_should_start_tracking_the_agent_cgroups(self): with self._get_cgroup_configurator() as configurator: tracked = CGroupsTelemetry._tracked @@ -79,18 +125,18 @@ def test_initialize_should_start_tracking_the_agent_cgroups(self): "The Agent's Memory is not being tracked. Tracked: {0}".format(tracked)) def test_initialize_should_start_tracking_other_controllers_when_one_is_not_present(self): - command_mocks = [MockCommand(r"^mount -t cgroup$", -'''cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) -cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma) -cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset) -cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio) -cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event) -cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb) -cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer) -cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids) -cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices) -cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct) -cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio) + command_mocks = [MockCommand(r"^findmnt -t cgroup --noheadings$", +'''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd +/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +/sys/fs/cgroup/rdma cgroup cgroup rw,nosuid,nodev,noexec,relatime,rdma +/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_cls,net_prio +/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset +/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct +/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer +/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids ''')] with self._get_cgroup_configurator(mock_commands=command_mocks) as configurator: tracked = CGroupsTelemetry._tracked @@ -99,18 +145,28 @@ def test_initialize_should_start_tracking_other_controllers_when_one_is_not_pres self.assertFalse(any(cg for cg in tracked.values() if cg.name == 'walinuxagent.service' and 'memory' in cg.path), "The Agent's memory should not be tracked. 
Tracked: {0}".format(tracked)) + def test_initialize_should_start_tracking_any_controllers_in_v1_if_others_in_v2(self): + # This mock environment has cpu controller in v1 and memory controller in v2 + with self._get_cgroup_configurator_v1_and_v2() as configurator: + tracked = CGroupsTelemetry._tracked + + self.assertTrue(configurator.enabled(), "Cgroups should be enabled") + self.assertFalse( + any(cg for cg in tracked.values() if cg.name == 'walinuxagent.service' and 'memory' in cg.path), + "The Agent's memory should not be tracked. Tracked: {0}".format(tracked)) + def test_initialize_should_not_enable_cgroups_when_the_cpu_and_memory_controllers_are_not_present(self): - command_mocks = [MockCommand(r"^mount -t cgroup$", -'''cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) -cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma) -cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset) -cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio) -cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event) -cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb) -cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer) -cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids) -cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices) -cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio) + command_mocks = [MockCommand(r"^findmnt -t cgroup --noheadings$", +'''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd +/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +/sys/fs/cgroup/rdma cgroup cgroup rw,nosuid,nodev,noexec,relatime,rdma +/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_cls,net_prio +/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset +/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer +/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids ''')] with self._get_cgroup_configurator(mock_commands=command_mocks) as configurator: tracked = CGroupsTelemetry._tracked @@ -119,17 +175,17 @@ def test_initialize_should_not_enable_cgroups_when_the_cpu_and_memory_controller self.assertEqual(len(tracked), 0, "No cgroups should be tracked. 
Tracked: {0}".format(tracked)) def test_initialize_should_not_enable_cgroups_when_the_agent_is_not_in_the_system_slice(self): - command_mocks = [MockCommand(r"^mount -t cgroup$", -'''cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) -cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma) -cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset) -cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio) -cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event) -cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb) -cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer) -cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids) -cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices) -cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio) + command_mocks = [MockCommand(r"^findmnt -t cgroup --noheadings$", +'''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd* +/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +/sys/fs/cgroup/rdma cgroup cgroup rw,nosuid,nodev,noexec,relatime,rdma +/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_cls,net_prio +/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset +/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer +/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids ''')] with self._get_cgroup_configurator(mock_commands=command_mocks) as configurator: @@ -286,6 +342,17 @@ def test_enable_should_not_track_throttled_time_when_setting_the_cpu_quota_fails self.assertFalse(CGroupsTelemetry.get_track_throttled_time(), "Throttle time should not be tracked") + def test_enable_should_not_track_throttled_time_when_cgroups_v2_enabled(self): + with self._get_cgroup_configurator_v2(initialize=False) as configurator: + if CGroupsTelemetry.get_track_throttled_time(): + raise Exception("Test setup should not start tracking Throttle Time") + + configurator.mocks.add_file(UnitFilePaths.cpu_quota, Exception("A TEST EXCEPTION")) + + configurator.initialize() + + self.assertFalse(CGroupsTelemetry.get_track_throttled_time(), "Throttle time should not be tracked when using cgroups v2") + def test_disable_should_reset_cpu_quota(self): with self._get_cgroup_configurator() as configurator: if len(CGroupsTelemetry._tracked) == 0: @@ -376,7 +443,7 @@ def test_start_extension_command_should_not_use_systemd_when_cgroups_are_not_ena self.assertEqual(command_calls[0], "date", "The command line should not have been modified") @patch('time.sleep', side_effect=lambda _: mock_sleep()) - def test_start_extension_command_should_use_systemd_run_when_cgroups_are_enabled(self, _): + def test_start_extension_command_should_use_systemd_run_when_cgroups_v1_are_enabled(self, _): with self._get_cgroup_configurator() as configurator: with patch("azurelinuxagent.ga.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as popen_patch: configurator.start_extension_command( @@ -444,6 +511,54 
@@ def mock_popen(command_arg, *args, **kwargs):
self.assertIn("A TEST EXCEPTION", str(context_manager.exception))
+ @patch('time.sleep', side_effect=lambda _: mock_sleep())
+ def test_start_extension_command_should_disable_cgroups_and_invoke_the_command_directly_if_v2_is_used(self, _):
+ with self._get_cgroup_configurator_v2() as configurator:
+ configurator.enable() # NOTE: Cgroups should not currently be enabled if v2 is detected. Adding this test to guarantee extensions are run correctly if the cgroups v2 api is incorrectly called.
+
+ with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file:
+ with patch("azurelinuxagent.ga.cgroupstelemetry.add_event") as mock_add_event:
+ with patch("subprocess.Popen", wraps=subprocess.Popen) as popen_patch:
+ CGroupsTelemetry.reset()
+
+ command = "echo TEST_OUTPUT"
+
+ command_output = configurator.start_extension_command(
+ extension_name="Microsoft.Compute.TestExtension-1.2.3",
+ command=command,
+ cmd_name="test",
+ timeout=300,
+ shell=True,
+ cwd=self.tmp_dir,
+ env={},
+ stdout=output_file,
+ stderr=output_file)
+
+ self.assertFalse(configurator.enabled(), "Cgroups should have been disabled")
+
+ disabled_events = [kwargs for _, kwargs in mock_add_event.call_args_list if
+ kwargs['op'] == WALAEventOperation.CGroupsDisabled]
+
+ self.assertTrue(len(disabled_events) == 1,
+ "Exactly one CGroupsDisabled telemetry event should have been issued. Found: {0}".format(
+ disabled_events))
+ self.assertIn("Failed to start Microsoft.Compute.TestExtension-1.2.3 using cgroups",
+ disabled_events[0]['message'],
+ "The cgroups failure was not included in the telemetry message")
+ self.assertEqual(False, disabled_events[0]['is_success'],
+ "The telemetry event should indicate a failure")
+
+ extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if command in args[0]]
+
+ self.assertEqual(1, len(extension_calls),
+ "The extension should have been invoked exactly once")
+ self.assertEqual(command, extension_calls[0],
+ "The call to the extension should not have used systemd")
+
+ self.assertEqual(len(CGroupsTelemetry._tracked), 0, "No cgroups should have been created")
+
+ self.assertIn("TEST_OUTPUT\n", command_output, "The test output was not captured")
+
@patch('time.sleep', side_effect=lambda _: mock_sleep())
def test_start_extension_command_should_disable_cgroups_and_invoke_the_command_directly_if_systemd_fails(self, _):
with self._get_cgroup_configurator() as configurator:
@@ -451,7 +566,7 @@ def test_start_extension_command_should_disable_cgroups_and_invoke_the_command_d
configurator.mocks.add_command(MockCommand("systemd-run", return_value=1, stdout='', stderr='Failed to start transient scope unit: syntax error'))
with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as output_file:
- with patch("azurelinuxagent.ga.cgroupconfigurator.add_event") as mock_add_event:
+ with patch("azurelinuxagent.ga.cgroupstelemetry.add_event") as mock_add_event:
with patch("subprocess.Popen", wraps=subprocess.Popen) as popen_patch:
CGroupsTelemetry.reset()
@@ -881,7 +996,7 @@ def test_check_cgroups_should_disable_cgroups_when_a_check_fails(self):
patchers.append(p)
p.start()
- with patch("azurelinuxagent.ga.cgroupconfigurator.add_event") as add_event:
+ with patch("azurelinuxagent.ga.cgroupstelemetry.add_event") as add_event:
configurator.enable()
tracked_metrics = [
diff --git a/tests/ga/test_cgroupconfigurator_sudo.py b/tests/ga/test_cgroupconfigurator_sudo.py
index 30db19408..6ff314496 100644
--- 
a/tests/ga/test_cgroupconfigurator_sudo.py +++ b/tests/ga/test_cgroupconfigurator_sudo.py @@ -25,7 +25,7 @@ from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.exception import ExtensionError, ExtensionErrorCodes from azurelinuxagent.common.future import ustr -from tests.lib.mock_cgroup_environment import mock_cgroup_environment +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment from tests.lib.tools import AgentTestCase, patch, mock_sleep, i_am_root, is_python_version_26_or_34, skip_if_predicate_true @@ -40,7 +40,7 @@ def _get_cgroup_configurator(self, initialize=True, enable=True, mock_commands=N CGroupConfigurator._instance = None configurator = CGroupConfigurator.get_instance() CGroupsTelemetry.reset() - with mock_cgroup_environment(self.tmp_dir) as mock_environment: + with mock_cgroup_v1_environment(self.tmp_dir) as mock_environment: if mock_commands is not None: for command in mock_commands: mock_environment.add_command(command) diff --git a/tests/lib/cgroups_tools.py b/tests/lib/cgroups_tools.py index 45b817447..cb29ee9bf 100644 --- a/tests/lib/cgroups_tools.py +++ b/tests/lib/cgroups_tools.py @@ -33,17 +33,3 @@ def create_legacy_agent_cgroup(cgroups_file_system_root, controller, daemon_pid) fileutil.append_file(os.path.join(legacy_cgroup, "cgroup.procs"), daemon_pid + "\n") return legacy_cgroup - @staticmethod - def create_agent_cgroup(cgroups_file_system_root, controller, extension_handler_pid): - """ - Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; - starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. - - This method creates a mock cgroup using the newer path and adds the given PID to it. 
- """ - new_cgroup = os.path.join(cgroups_file_system_root, controller, "walinuxagent.service") - if not os.path.exists(new_cgroup): - os.makedirs(new_cgroup) - fileutil.append_file(os.path.join(new_cgroup, "cgroup.procs"), extension_handler_pid + "\n") - return new_cgroup - diff --git a/tests/lib/mock_cgroup_environment.py b/tests/lib/mock_cgroup_environment.py index 3b51dce8f..4b3e1534e 100644 --- a/tests/lib/mock_cgroup_environment.py +++ b/tests/lib/mock_cgroup_environment.py @@ -20,29 +20,11 @@ from tests.lib.tools import patch, data_dir from tests.lib.mock_environment import MockEnvironment, MockCommand -_MOCKED_COMMANDS = [ +# Mocked commands which are common between v1 and v2 +_MOCKED_COMMANDS_COMMON = [ MockCommand(r"^systemctl --version$", '''systemd 237 +PAM +AUDIT +SELINUX +IMA +APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP +GCRYPT +GNUTLS +ACL +XZ +LZ4 +SECCOMP +BLKID +ELFUTILS +KMOD -IDN2 +IDN -PCRE2 default-hierarchy=hybrid -'''), - - MockCommand(r"^mount -t cgroup$", -'''cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) -cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma) -cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset) -cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio) -cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event) -cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb) -cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer) -cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory) -cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids) -cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices) -cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct) -cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio) -'''), - - MockCommand(r"^mount -t cgroup2$", -'''cgroup on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime) '''), MockCommand(r"^systemctl show walinuxagent\.service --property Slice", @@ -77,10 +59,80 @@ ] -_MOCKED_FILES = [ - ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'proc_self_cgroup')), - (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'proc_pid_cgroup')), - ("/sys/fs/cgroup/unified/cgroup.controllers", os.path.join(data_dir, 'cgroups', 'sys_fs_cgroup_unified_cgroup.controllers')) +_MOCKED_COMMANDS_V1 = [ + MockCommand(r"^findmnt -t cgroup --noheadings$", +'''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd +/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +/sys/fs/cgroup/rdma cgroup cgroup rw,nosuid,nodev,noexec,relatime,rdma +/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_cls,net_prio +/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset +/sys/fs/cgroup/misc cgroup cgroup rw,nosuid,nodev,noexec,relatime,misc +/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct +/sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory +/sys/fs/cgroup/freezer cgroup cgroup 
rw,nosuid,nodev,noexec,relatime,freezer +/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids +'''), + + MockCommand(r"^findmnt -t cgroup2 --noheadings$", +'''/sys/fs/cgroup/unified cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate +'''), + +] + +_MOCKED_COMMANDS_V2 = [ + MockCommand(r"^findmnt -t cgroup2 --noheadings$", +'''/sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot +'''), + + MockCommand(r"^findmnt -t cgroup --noheadings$", ''), + +] + +# Mocked commands when memory controller is in v2, but all other controllers are in v1 +_MOCKED_COMMANDS_V1_AND_V2 = [ + MockCommand(r"^findmnt -t cgroup --noheadings$", +'''/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd +/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +/sys/fs/cgroup/rdma cgroup cgroup rw,nosuid,nodev,noexec,relatime,rdma +/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_cls,net_prio +/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset +/sys/fs/cgroup/misc cgroup cgroup rw,nosuid,nodev,noexec,relatime,misc +/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct +/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer +/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids +'''), + + MockCommand(r"^findmnt -t cgroup2 --noheadings$", +'''/sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot +'''), + +] + +_MOCKED_FILES_V1 = [ + ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_self_cgroup')), + (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_pid_cgroup')), + ("/sys/fs/cgroup/unified/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v1', 'sys_fs_cgroup_cgroup.subtree_control')) +] + +_MOCKED_FILES_V2 = [ + ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v2', 'proc_self_cgroup')), + (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v2', 'proc_pid_cgroup')), + ("/sys/fs/cgroup/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control')), + ("/sys/fs/cgroup/azure.slice/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control')), + ("/sys/fs/cgroup/azure.slice/walinuxagent.service/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control_empty')) +] + +# Mocked files when memory controller is in v2, but all other controllers are in v1 +_MOCKED_FILES_V1_AND_V2 = [ + ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v1_and_v2', 'proc_self_cgroup')), + (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v1_and_v2', 'proc_pid_cgroup')), + ("/sys/fs/cgroup/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v1_and_v2', 'sys_fs_cgroup_cgroup.subtree_control')) ] _MOCKED_PATHS = [ @@ -88,6 +140,12 @@ r"^(/etc/systemd/system)" ] +_MOCKED_PATHS_V2 = [ + r"^(/sys/fs/cgroup/azure.slice/walinuxagent.service)", + r"^(/sys/fs/cgroup/system.slice/walinuxagent.service)", + 
r"^(/sys/fs/cgroup/system.slice/extension.service)" +] + class UnitFilePaths: walinuxagent = "/lib/systemd/system/walinuxagent.service" @@ -106,11 +164,48 @@ class UnitFilePaths: @contextlib.contextmanager -def mock_cgroup_environment(tmp_dir): +def mock_cgroup_v1_environment(tmp_dir): + """ + Creates a mock environment for cgroups v1 hierarchy used by the tests related to cgroups (currently it only + provides support for systemd platforms). + The command output used in __MOCKED_COMMANDS comes from an Ubuntu 20 system. + """ + data_files = [ + (os.path.join(data_dir, 'init', 'walinuxagent.service'), UnitFilePaths.walinuxagent), + (os.path.join(data_dir, 'init', 'azure.slice'), UnitFilePaths.azure), + (os.path.join(data_dir, 'init', 'azure-vmextensions.slice'), UnitFilePaths.vmextensions) + ] + + with patch('azurelinuxagent.ga.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): + with patch('azurelinuxagent.common.osutil.systemd.is_systemd', return_value=True): + with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V1, paths=_MOCKED_PATHS, files=_MOCKED_FILES_V1, data_files=data_files) as mock: + yield mock + +@contextlib.contextmanager +def mock_cgroup_v2_environment(tmp_dir): + """ + Creates a mock environment for cgroups v2 hierarchy used by the tests related to cgroups (currently it only + provides support for systemd platforms). + The command output used in __MOCKED_COMMANDS comes from an Ubuntu 22 system. + """ + data_files = [ + (os.path.join(data_dir, 'init', 'walinuxagent.service'), UnitFilePaths.walinuxagent), + (os.path.join(data_dir, 'init', 'azure.slice'), UnitFilePaths.azure), + (os.path.join(data_dir, 'init', 'azure-vmextensions.slice'), UnitFilePaths.vmextensions) + ] + + with patch('azurelinuxagent.ga.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): + with patch('azurelinuxagent.common.osutil.systemd.is_systemd', return_value=True): + with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V2, paths=_MOCKED_PATHS + _MOCKED_PATHS_V2, files=_MOCKED_FILES_V2, data_files=data_files) as mock: + yield mock + +@contextlib.contextmanager +def mock_cgroup_v1_and_v2_environment(tmp_dir): + """ + Creates a mock environment for machine which has controllers in cgroups v1 and v2 hierarchies used by the tests + related to cgroups (currently it only provides support for systemd platforms). The agent does not currently support + this scenario. """ - Creates a mocks environment used by the tests related to cgroups (currently it only provides support for systemd platforms). - The command output used in __MOCKED_COMMANDS comes from an Ubuntu 18 system. 
- """ data_files = [ (os.path.join(data_dir, 'init', 'walinuxagent.service'), UnitFilePaths.walinuxagent), (os.path.join(data_dir, 'init', 'azure.slice'), UnitFilePaths.azure), @@ -119,5 +214,5 @@ def mock_cgroup_environment(tmp_dir): with patch('azurelinuxagent.ga.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): with patch('azurelinuxagent.common.osutil.systemd.is_systemd', return_value=True): - with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS, paths=_MOCKED_PATHS, files=_MOCKED_FILES, data_files=data_files) as mock: + with MockEnvironment(tmp_dir, commands=_MOCKED_COMMANDS_COMMON + _MOCKED_COMMANDS_V1_AND_V2, paths=_MOCKED_PATHS, files=_MOCKED_FILES_V1_AND_V2, data_files=data_files) as mock: yield mock diff --git a/tests/test_agent.py b/tests/test_agent.py index cbf223aa5..906392b61 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -21,10 +21,10 @@ from azurelinuxagent.agent import parse_args, Agent, usage, AgentCommands from azurelinuxagent.common import conf -from azurelinuxagent.ga import logcollector, cgroupconfigurator -from azurelinuxagent.ga.cgroupapi import SystemdCgroupsApi +from azurelinuxagent.ga import logcollector, cgroupconfigurator, cgroupapi from azurelinuxagent.common.utils import fileutil from azurelinuxagent.ga.collect_logs import CollectLogsHandler +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, mock_cgroup_v1_and_v2_environment from tests.lib.tools import AgentTestCase, data_dir, Mock, patch EXPECTED_CONFIGURATION = \ @@ -240,46 +240,105 @@ def test_calls_collect_logs_with_proper_mode(self, mock_log_collector, *args): self.assertFalse(full_mode) @patch("azurelinuxagent.agent.LogCollector") - def test_calls_collect_logs_on_valid_cgroups(self, mock_log_collector): + def test_calls_collect_logs_on_valid_cgroups_v1(self, mock_log_collector): try: CollectLogsHandler.enable_monitor_cgroups_check() mock_log_collector.run = Mock() + # Mock cgroup paths so process is in the log collector slice def mock_cgroup_paths(*args, **kwargs): if args and args[0] == "self": relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) - return (cgroupconfigurator.LOGCOLLECTOR_SLICE, relative_path) - return SystemdCgroupsApi.get_process_cgroup_relative_paths(*args, **kwargs) + return (relative_path, relative_path) + return cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + + with mock_cgroup_v1_environment(self.tmp_dir): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_paths", + side_effect=mock_cgroup_paths): + agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + agent.collect_logs(is_full_mode=True) + + mock_log_collector.assert_called_once() - with patch("azurelinuxagent.agent.SystemdCgroupsApi.get_process_cgroup_paths", side_effect=mock_cgroup_paths): - agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) - agent.collect_logs(is_full_mode=True) - - mock_log_collector.assert_called_once() finally: CollectLogsHandler.disable_monitor_cgroups_check() @patch("azurelinuxagent.agent.LogCollector") - def test_doesnt_call_collect_logs_on_invalid_cgroups(self, mock_log_collector): + def test_doesnt_call_collect_logs_when_cgroup_api_cannot_be_determined(self, mock_log_collector): try: CollectLogsHandler.enable_monitor_cgroups_check() mock_log_collector.run = Mock() - def mock_cgroup_paths(*args, **kwargs): - if args and args[0] == "self": - return ("NOT_THE_CORRECT_PATH", 
"NOT_THE_CORRECT_PATH") - return SystemdCgroupsApi.get_process_cgroup_relative_paths(*args, **kwargs) + def raise_on_sys_exit(*args): + raise RuntimeError(args[0] if args else "Exiting") - with patch("azurelinuxagent.agent.SystemdCgroupsApi.get_process_cgroup_paths", side_effect=mock_cgroup_paths): + with patch("azurelinuxagent.agent.get_cgroup_api", return_value=None): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) - exit_error = RuntimeError("Exiting") - with patch("sys.exit", return_value=exit_error) as mock_exit: + with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: try: agent.collect_logs(is_full_mode=True) except RuntimeError as re: + self.assertEqual(logcollector.INVALID_CGROUPS_ERRCODE, re.args[0]) + mock_exit.assert_called_once_with(logcollector.INVALID_CGROUPS_ERRCODE) + finally: + CollectLogsHandler.disable_monitor_cgroups_check() + + @patch("azurelinuxagent.agent.LogCollector") + def test_doesnt_call_collect_logs_on_invalid_cgroups_v1(self, mock_log_collector): + try: + CollectLogsHandler.enable_monitor_cgroups_check() + mock_log_collector.run = Mock() + + # Mock cgroup paths so process is in incorrect slice + def mock_cgroup_paths(*args, **kwargs): + if args and args[0] == "self": + return ("NOT_THE_CORRECT_PATH", "NOT_THE_CORRECT_PATH") + return cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + + def raise_on_sys_exit(*args): + raise RuntimeError(args[0] if args else "Exiting") + + with mock_cgroup_v1_environment(self.tmp_dir): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_paths", side_effect=mock_cgroup_paths): + agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + + with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: + try: + agent.collect_logs(is_full_mode=True) + except RuntimeError as re: + self.assertEqual(logcollector.INVALID_CGROUPS_ERRCODE, re.args[0]) + mock_exit.assert_called_once_with(logcollector.INVALID_CGROUPS_ERRCODE) + finally: + CollectLogsHandler.disable_monitor_cgroups_check() + + @patch("azurelinuxagent.agent.LogCollector") + def test_doesnt_call_collect_logs_when_controllers_mounted_in_different_hierarchies(self, mock_log_collector): + try: + CollectLogsHandler.enable_monitor_cgroups_check() + mock_log_collector.run = Mock() + + # Mock cgroup paths so process is in the log collector slice and cpu is not mounted + def mock_cgroup_paths(*args, **kwargs): + if args and args[0] == "self": + relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) + return (None, relative_path) + return cgroupapi.get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + + def raise_on_sys_exit(*args): + raise RuntimeError(args[0] if args else "Exiting") + + with mock_cgroup_v1_and_v2_environment(self.tmp_dir): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupsApiv1.get_process_cgroup_paths", + side_effect=mock_cgroup_paths): + agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + + with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: + try: + agent.collect_logs(is_full_mode=True) + except RuntimeError as re: + self.assertEqual(logcollector.INVALID_CGROUPS_ERRCODE, re.args[0]) mock_exit.assert_called_once_with(logcollector.INVALID_CGROUPS_ERRCODE) - self.assertEqual(exit_error, re) finally: CollectLogsHandler.disable_monitor_cgroups_check() diff --git a/tests_e2e/test_suites/cgroups_v2_disabled.yml 
b/tests_e2e/test_suites/cgroups_v2_disabled.yml new file mode 100644 index 000000000..5a075a2a2 --- /dev/null +++ b/tests_e2e/test_suites/cgroups_v2_disabled.yml @@ -0,0 +1,10 @@ +# +# The test suite verifies that the agent does not enable resource enforcement and monitoring on machines which are +# using cgroups v2. This suite will be removed once cgroups v2 is supported. +# +name: "Cgroupsv2Disabled" +tests: + - "cgroups_v2_disabled/cgroups_v2_disabled.py" +images: + - "ubuntu_2204" + - "ubuntu_2404" \ No newline at end of file diff --git a/tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py b/tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py new file mode 100644 index 000000000..9f6e117e6 --- /dev/null +++ b/tests_e2e/tests/cgroups_v2_disabled/cgroups_v2_disabled.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import time + +from assertpy import fail + +from tests_e2e.tests.lib.agent_test import AgentVmTest +from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.shell import CommandError +from tests_e2e.tests.lib.ssh_client import SshClient + + +class Cgroupsv2Disabled(AgentVmTest): + """ + The test verifies that the agent does not enable resource enforcement and monitoring on machines which are using + cgroups v2. It also checks that the agent correctly determined the controller mount points. This test will be + removed once cgroups v2 is supported. + """ + + def __init__(self, context: AgentVmTestContext): + super().__init__(context) + self._ssh_client: SshClient = self._context.create_ssh_client() + + def check_agent_log_contains(self, data, assertion): + try: + self._ssh_client.run_command("grep \"{0}\" /var/log/waagent.log".format(data)) + except CommandError: + fail("{0}".format(assertion)) + + def run(self): + # Cgroup configurator is initialized when agent is started, and before the goal state processing period is + # logged. Wait until the agent logs the goal state period before checking for cgroup initialization logs. 
+ log.info("Wait for cgroup configurator to be initialized...") + for _ in range(15): + try: + self._ssh_client.run_command("grep 'Goal State Period:' /var/log/waagent.log") + break + except CommandError: + log.info("The Agent has not initialized cgroups yet, will check again after a short delay") + time.sleep(60) + else: + raise Exception("Timeout while waiting for the Agent to initialize cgroups") + + # Verify that the agent chose v2 for resource enforcement and monitoring + log.info("") + log.info("Checking that the agent chose cgroups v2 api for resource enforcement and monitoring...") + self.check_agent_log_contains('Using cgroups v2 for resource enforcement and monitoring', 'The agent should choose v2 for api resource enforcement and monitoring') + + # Verify that the agent determined the correct mount point for each controller + log.info("") + log.info("Checking that the agent determined the correct mount point for each controller...") + self.check_agent_log_contains('The CPU cgroup controller is mounted at /sys/fs/cgroup', + 'The agent should identify the cpu controller to be mounted at /sys/fs/cgroup') + self.check_agent_log_contains('The memory cgroup controller is mounted at /sys/fs/cgroup', + 'The agent should identify the memory controller to be mounted at /sys/fs/cgroup') + + # Verify that the agent does not support cgroups v2 + log.info("") + log.info("Checking that the agent does not use cgroups v2 for resource enforcement and monitoring...") + self.check_agent_log_contains('Agent and extensions resource monitoring is not currently supported on cgroups v2', + 'The agent should not attempt to use cgroups v2 for resource enforcement and monitoring') + self.check_agent_log_contains('Agent cgroups enabled: False', + 'The agent should not enable cgroups when system is using v2') + + +if __name__ == "__main__": + Cgroupsv2Disabled.run_from_command_line()