diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py index babd67e1e..bfb795c6b 100644 --- a/azurelinuxagent/agent.py +++ b/azurelinuxagent/agent.py @@ -23,6 +23,7 @@ from __future__ import print_function +import json import os import re import subprocess @@ -31,7 +32,8 @@ from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.ga import logcollector, cgroupconfigurator -from azurelinuxagent.ga.controllermetrics import AGENT_LOG_COLLECTOR, CpuMetrics +from azurelinuxagent.ga.cgroupcontroller import AGENT_LOG_COLLECTOR +from azurelinuxagent.ga.cpucontroller import _CpuController from azurelinuxagent.ga.cgroupapi import get_cgroup_api, log_cgroup_warning, InvalidCgroupMountpointException import azurelinuxagent.common.conf as conf @@ -208,7 +210,7 @@ def collect_logs(self, is_full_mode): # Check the cgroups unit log_collector_monitor = None - tracked_metrics = [] + tracked_controllers = [] if CollectLogsHandler.is_enabled_monitor_cgroups_check(): try: cgroup_api = get_cgroup_api() @@ -220,10 +222,10 @@ def collect_logs(self, is_full_mode): sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) log_collector_cgroup = cgroup_api.get_process_cgroup(process_id="self", cgroup_name=AGENT_LOG_COLLECTOR) - tracked_metrics = log_collector_cgroup.get_controller_metrics() + tracked_controllers = log_collector_cgroup.get_controllers() - if len(tracked_metrics) != len(log_collector_cgroup.get_supported_controllers()): - log_cgroup_warning("At least one required controller is missing. The following controllers are required for the log collector to run: {0}".format(log_collector_cgroup.get_supported_controllers())) + if len(tracked_controllers) != len(log_collector_cgroup.get_supported_controller_names()): + log_cgroup_warning("At least one required controller is missing. The following controllers are required for the log collector to run: {0}".format(log_collector_cgroup.get_supported_controller_names())) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) if not log_collector_cgroup.check_in_expected_slice(cgroupconfigurator.LOGCOLLECTOR_SLICE): @@ -235,15 +237,30 @@ def collect_logs(self, is_full_mode): # Running log collector resource monitoring only if agent starts the log collector. # If Log collector start by any other means, then it will not be monitored. if CollectLogsHandler.is_enabled_monitor_cgroups_check(): - for metric in tracked_metrics: - if isinstance(metric, CpuMetrics): - metric.initialize_cpu_usage() + for controller in tracked_controllers: + if isinstance(controller, _CpuController): + controller.initialize_cpu_usage() break - log_collector_monitor = get_log_collector_monitor_handler(tracked_metrics) + log_collector_monitor = get_log_collector_monitor_handler(tracked_controllers) log_collector_monitor.run() - archive = log_collector.collect_logs_and_get_archive() + + archive, total_uncompressed_size = log_collector.collect_logs_and_get_archive() logger.info("Log collection successfully completed. 
Archive can be found at {0} " "and detailed log output can be found at {1}".format(archive, OUTPUT_RESULTS_FILE_PATH)) + + if log_collector_monitor is not None: + log_collector_monitor.stop() + try: + metrics_summary = log_collector_monitor.get_max_recorded_metrics() + metrics_summary['Total Uncompressed File Size (B)'] = total_uncompressed_size + msg = json.dumps(metrics_summary) + logger.info(msg) + event.add_event(op=event.WALAEventOperation.LogCollection, message=msg, log_event=False) + except Exception as e: + msg = "An error occurred while reporting log collector resource usage summary: {0}".format(ustr(e)) + logger.warn(msg) + event.add_event(op=event.WALAEventOperation.LogCollection, is_success=False, message=msg, log_event=False) + except Exception as e: logger.error("Log collection completed unsuccessfully. Error: {0}".format(ustr(e))) logger.info("Detailed log output can be found at {0}".format(OUTPUT_RESULTS_FILE_PATH)) diff --git a/azurelinuxagent/common/conf.py b/azurelinuxagent/common/conf.py index 85a7bc2f7..b5eec73ce 100644 --- a/azurelinuxagent/common/conf.py +++ b/azurelinuxagent/common/conf.py @@ -146,7 +146,8 @@ def load_conf_from_file(conf_file_path, conf=__conf__): "Debug.CgroupDisableOnQuotaCheckFailure": True, "Debug.EnableAgentMemoryUsageCheck": False, "Debug.EnableFastTrack": True, - "Debug.EnableGAVersioning": True + "Debug.EnableGAVersioning": True, + "Debug.EnableCgroupV2ResourceLimiting": False } @@ -200,7 +201,8 @@ def load_conf_from_file(conf_file_path, conf=__conf__): "Debug.EtpCollectionPeriod": 300, "Debug.AutoUpdateHotfixFrequency": 14400, "Debug.AutoUpdateNormalFrequency": 86400, - "Debug.FirewallRulesLogPeriod": 86400 + "Debug.FirewallRulesLogPeriod": 86400, + "Debug.LogCollectorInitialDelay": 5 * 60 } @@ -680,3 +682,20 @@ def get_firewall_rules_log_period(conf=__conf__): NOTE: This option is experimental and may be removed in later versions of the Agent. """ return conf.get_int("Debug.FirewallRulesLogPeriod", 86400) + + +def get_enable_cgroup_v2_resource_limiting(conf=__conf__): + """ + If True, the agent will enable resource monitoring and enforcement for the log collector on machines using cgroup v2. + NOTE: This option is experimental and may be removed in later versions of the Agent. + """ + return conf.get_switch("Debug.EnableCgroupV2ResourceLimiting", False) + + +def get_log_collector_initial_delay(conf=__conf__): + """ + Determine the initial delay at service start before the first periodic log collection. + + NOTE: This option is experimental and may be removed in later versions of the Agent. 
+ """ + return conf.get_int("Debug.LogCollectorInitialDelay", 5 * 60) diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index 348352762..72b41ec77 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -24,8 +24,9 @@ from azurelinuxagent.common import logger from azurelinuxagent.common.event import WALAEventOperation, add_event -from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry +from azurelinuxagent.ga.cpucontroller import _CpuController, CpuControllerV1, CpuControllerV2 +from azurelinuxagent.ga.memorycontroller import MemoryControllerV1, MemoryControllerV2 from azurelinuxagent.common.conf import get_agent_pid_file_path from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \ ExtensionOperationError @@ -292,7 +293,7 @@ def _get_controller_mountpoints(): if match is not None: path = match.group('path') controller = match.group('controller') - if controller is not None and path is not None and controller in CgroupV1.get_supported_controllers(): + if controller is not None and path is not None and controller in CgroupV1.get_supported_controller_names(): mount_points[controller] = path return mount_points @@ -335,7 +336,7 @@ def _get_process_relative_controller_paths(process_id): if match is not None: controller = match.group('controller') path = match.group('path').lstrip('/') if match.group('path') != '/' else None - if path is not None and controller in CgroupV1.get_supported_controllers(): + if path is not None and controller in CgroupV1.get_supported_controller_names(): conroller_relative_paths[controller] = path return conroller_relative_paths @@ -371,7 +372,7 @@ def get_process_cgroup(self, process_id, cgroup_name): controller_paths=process_controller_paths) def log_root_paths(self): - for controller in CgroupV1.get_supported_controllers(): + for controller in CgroupV1.get_supported_controller_names(): mount_point = self._cgroup_mountpoints.get(controller) if mount_point is None: log_cgroup_info("The {0} controller is not mounted".format(controller), send_event=False) @@ -402,14 +403,14 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh log_cgroup_info("Started extension in unit '{0}'".format(scope_name), send_event=False) - cpu_metrics = None + cpu_controller = None try: cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name) cgroup = self.get_cgroup_from_relative_path(cgroup_relative_path, extension_name) - for metrics in cgroup.get_controller_metrics(): - if isinstance(metrics, CpuMetrics): - cpu_metrics = metrics - CGroupsTelemetry.track_cgroup(metrics) + for controller in cgroup.get_controllers(): + if isinstance(controller, _CpuController): + cpu_controller = controller + CGroupsTelemetry.track_cgroup_controller(controller) except IOError as e: if e.errno == 2: # 'No such file or directory' @@ -421,7 +422,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh # Wait for process completion or timeout try: return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, - stderr=stderr, error_code=error_code, cpu_metrics=cpu_metrics) + stderr=stderr, error_code=error_code, cpu_controller=cpu_controller) except ExtensionError as e: # The extension didn't terminate successfully. Determine whether it was due to systemd errors or # extension errors. 
@@ -498,7 +499,7 @@ def _get_controllers_enabled_at_root(root_cgroup_path): enabled_controllers_file = os.path.join(root_cgroup_path, 'cgroup.subtree_control') if os.path.exists(enabled_controllers_file): controllers_enabled_at_root = fileutil.read_file(enabled_controllers_file).rstrip().split() - return list(set(controllers_enabled_at_root) & set(CgroupV2.get_supported_controllers())) + return list(set(controllers_enabled_at_root) & set(CgroupV2.get_supported_controller_names())) return [] @staticmethod @@ -546,7 +547,7 @@ def get_process_cgroup(self, process_id, cgroup_name): def log_root_paths(self): log_cgroup_info("The root cgroup path is {0}".format(self._root_cgroup_path), send_event=False) - for controller in CgroupV2.get_supported_controllers(): + for controller in CgroupV2.get_supported_controller_names(): if controller in self._controllers_enabled_at_root: log_cgroup_info("The {0} controller is enabled at the root cgroup".format(controller), send_event=False) else: @@ -564,9 +565,9 @@ def __init__(self, cgroup_name): self._cgroup_name = cgroup_name @staticmethod - def get_supported_controllers(): + def get_supported_controller_names(): """ - Cgroup version specific. Returns a list of the controllers which the agent supports. + Cgroup version specific. Returns a list of the controllers which the agent supports as strings. """ raise NotImplementedError() @@ -578,12 +579,12 @@ def check_in_expected_slice(self, expected_slice): """ raise NotImplementedError() - def get_controller_metrics(self, expected_relative_path=None): + def get_controllers(self, expected_relative_path=None): """ - Cgroup version specific. Returns a list of the metrics for the agent supported controllers which are - mounted/enabled for the cgroup. + Cgroup version specific. Returns a list of the agent supported controllers which are mounted/enabled for the cgroup. - :param expected_relative_path: The expected relative path of the cgroup. If provided, only metrics for controllers at this expected path will be returned. + :param expected_relative_path: The expected relative path of the cgroup. If provided, only controllers mounted + at this expected path will be returned. 
""" raise NotImplementedError() @@ -608,7 +609,7 @@ def __init__(self, cgroup_name, controller_mountpoints, controller_paths): self._controller_paths = controller_paths @staticmethod - def get_supported_controllers(): + def get_supported_controller_names(): return [CgroupV1.CPU_CONTROLLER, CgroupV1.MEMORY_CONTROLLER] def check_in_expected_slice(self, expected_slice): @@ -620,39 +621,39 @@ def check_in_expected_slice(self, expected_slice): return in_expected_slice - def get_controller_metrics(self, expected_relative_path=None): - metrics = [] + def get_controllers(self, expected_relative_path=None): + controllers = [] - for controller in self.get_supported_controllers(): - controller_metrics = None - controller_path = self._controller_paths.get(controller) - controller_mountpoint = self._controller_mountpoints.get(controller) + for supported_controller_name in self.get_supported_controller_names(): + controller = None + controller_path = self._controller_paths.get(supported_controller_name) + controller_mountpoint = self._controller_mountpoints.get(supported_controller_name) if controller_mountpoint is None: - log_cgroup_warning("{0} controller is not mounted; will not track metrics".format(controller), send_event=False) + log_cgroup_warning("{0} controller is not mounted; will not track".format(supported_controller_name), send_event=False) continue if controller_path is None: - log_cgroup_warning("{0} is not mounted for the {1} cgroup; will not track metrics".format(controller, self._cgroup_name), send_event=False) + log_cgroup_warning("{0} is not mounted for the {1} cgroup; will not track".format(supported_controller_name, self._cgroup_name), send_event=False) continue if expected_relative_path is not None: expected_path = os.path.join(controller_mountpoint, expected_relative_path) if controller_path != expected_path: - log_cgroup_warning("The {0} controller is not mounted at the expected path for the {1} cgroup; will not track metrics. Actual cgroup path:[{2}] Expected:[{3}]".format(controller, self._cgroup_name, controller_path, expected_path), send_event=False) + log_cgroup_warning("The {0} controller is not mounted at the expected path for the {1} cgroup; will not track. 
Actual cgroup path:[{2}] Expected:[{3}]".format(supported_controller_name, self._cgroup_name, controller_path, expected_path), send_event=False) continue - if controller == self.CPU_CONTROLLER: - controller_metrics = CpuMetrics(self._cgroup_name, controller_path) - elif controller == self.MEMORY_CONTROLLER: - controller_metrics = MemoryMetrics(self._cgroup_name, controller_path) + if supported_controller_name == self.CPU_CONTROLLER: + controller = CpuControllerV1(self._cgroup_name, controller_path) + elif supported_controller_name == self.MEMORY_CONTROLLER: + controller = MemoryControllerV1(self._cgroup_name, controller_path) - if controller_metrics is not None: - msg = "{0} metrics for cgroup: {1}".format(controller, controller_metrics) + if controller is not None: + msg = "{0} controller for cgroup: {1}".format(supported_controller_name, controller) log_cgroup_info(msg, send_event=False) - metrics.append(controller_metrics) + controllers.append(controller) - return metrics + return controllers def get_controller_procs_path(self, controller): controller_path = self._controller_paths.get(controller) @@ -687,7 +688,7 @@ def __init__(self, cgroup_name, root_cgroup_path, cgroup_path, enabled_controlle self._enabled_controllers = enabled_controllers @staticmethod - def get_supported_controllers(): + def get_supported_controller_names(): return [CgroupV2.CPU_CONTROLLER, CgroupV2.MEMORY_CONTROLLER] def check_in_expected_slice(self, expected_slice): @@ -697,9 +698,41 @@ def check_in_expected_slice(self, expected_slice): return True - def get_controller_metrics(self, expected_relative_path=None): - # TODO - Implement controller metrics for cgroup v2 - raise NotImplementedError() + def get_controllers(self, expected_relative_path=None): + controllers = [] + + for supported_controller_name in self.get_supported_controller_names(): + controller = None + + if supported_controller_name not in self._enabled_controllers: + log_cgroup_warning("{0} controller is not enabled; will not track".format(supported_controller_name), + send_event=False) + continue + + if self._cgroup_path == "": + log_cgroup_warning("Cgroup path for {0} cannot be determined; will not track".format(self._cgroup_name), + send_event=False) + continue + + if expected_relative_path is not None: + expected_path = os.path.join(self._root_cgroup_path, expected_relative_path) + if self._cgroup_path != expected_path: + log_cgroup_warning( + "The {0} cgroup is not mounted at the expected path; will not track. 
Actual cgroup path:[{1}] Expected:[{2}]".format( + self._cgroup_name, self._cgroup_path, expected_path), send_event=False) + continue + + if supported_controller_name == self.CPU_CONTROLLER: + controller = CpuControllerV2(self._cgroup_name, self._cgroup_path) + elif supported_controller_name == self.MEMORY_CONTROLLER: + controller = MemoryControllerV2(self._cgroup_name, self._cgroup_path) + + if controller is not None: + msg = "{0} controller for cgroup: {1}".format(supported_controller_name, controller) + log_cgroup_info(msg, send_event=False) + controllers.append(controller) + + return controllers def get_procs_path(self): if self._cgroup_path != "": diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index f18c23e3c..2a75344b4 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -23,10 +23,12 @@ from azurelinuxagent.common import conf from azurelinuxagent.common import logger -from azurelinuxagent.ga.controllermetrics import CpuMetrics, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryMetrics +from azurelinuxagent.ga.cgroupcontroller import AGENT_NAME_TELEMETRY, MetricsCounter from azurelinuxagent.ga.cgroupapi import SystemdRunError, EXTENSION_SLICE_PREFIX, CGroupUtil, SystemdCgroupApiv2, \ log_cgroup_info, log_cgroup_warning, get_cgroup_api, InvalidCgroupMountpointException from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry +from azurelinuxagent.ga.cpucontroller import _CpuController +from azurelinuxagent.ga.memorycontroller import _MemoryController from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import systemd @@ -76,8 +78,11 @@ CPUQuota={cpu_quota} MemoryAccounting=yes """ -_LOGCOLLECTOR_CPU_QUOTA = "5%" -LOGCOLLECTOR_MEMORY_LIMIT = 30 * 1024 ** 2 # 30Mb +LOGCOLLECTOR_CPU_QUOTA_FOR_V1_AND_V2 = "5%" +LOGCOLLECTOR_MEMORY_THROTTLE_LIMIT_FOR_V2 = "170M" +LOGCOLLECTOR_MAX_THROTTLED_EVENTS_FOR_V2 = 10 +LOGCOLLECTOR_ANON_MEMORY_LIMIT_FOR_V1_AND_V2 = 25 * 1024 ** 2 # 25Mb +LOGCOLLECTOR_CACHE_MEMORY_LIMIT_FOR_V1_AND_V2 = 155 * 1024 ** 2 # 155Mb _AGENT_DROP_IN_FILE_SLICE = "10-Slice.conf" _AGENT_DROP_IN_FILE_SLICE_CONTENTS = """ @@ -176,6 +181,11 @@ def initialize(self): log_cgroup_warning("Unable to determine which cgroup version to use: {0}".format(ustr(e)), send_event=True) return + # Setup the slices before v2 check. Cgroup v2 usage is disabled for agent and extensions, but can be + # enabled for log collector in waagent.conf. The log collector slice should be created in case v2 + # usage is enabled for log collector. 
+ self.__setup_azure_slice() + if self.using_cgroup_v2(): log_cgroup_info("Agent and extensions resource monitoring is not currently supported on cgroup v2") return @@ -186,8 +196,6 @@ def initialize(self): log_cgroup_warning("The agent is within an unexpected slice: {0}".format(agent_slice)) return - self.__setup_azure_slice() - # Log mount points/root paths for cgroup controllers self._cgroups_api.log_root_paths() @@ -199,19 +207,19 @@ def initialize(self): self.disable(reason, DisableCgroups.ALL) return - # Get metrics to track - metrics = self._agent_cgroup.get_controller_metrics(expected_relative_path=os.path.join(agent_slice, systemd.get_agent_unit_name())) - if len(metrics) > 0: + # Get controllers to track + agent_controllers = self._agent_cgroup.get_controllers(expected_relative_path=os.path.join(agent_slice, systemd.get_agent_unit_name())) + if len(agent_controllers) > 0: self.enable() - for metric in metrics: - for prop in metric.get_unit_properties(): + for controller in agent_controllers: + for prop in controller.get_unit_properties(): log_cgroup_info('Agent {0} unit property value: {1}'.format(prop, systemd.get_unit_property(systemd.get_agent_unit_name(), prop))) - if isinstance(metric, CpuMetrics): + if isinstance(controller, _CpuController): self.__set_cpu_quota(conf.get_agent_cpu_quota()) - elif isinstance(metric, MemoryMetrics): - self._agent_memory_metrics = metric - CGroupsTelemetry.track_cgroup(metric) + elif isinstance(controller, _MemoryController): + self._agent_memory_metrics = controller + CGroupsTelemetry.track_cgroup_controller(controller) except Exception as exception: log_cgroup_warning("Error initializing cgroups: {0}".format(ustr(exception))) @@ -279,7 +287,7 @@ def __setup_azure_slice(): files_to_create.append((vmextensions_slice, _VMEXTENSIONS_SLICE_CONTENTS)) # Update log collector slice contents - slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA) + slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=LOGCOLLECTOR_CPU_QUOTA_FOR_V1_AND_V2) files_to_create.append((logcollector_slice, slice_contents)) if fileutil.findre_in_file(agent_unit_file, r"Slice=") is not None: @@ -441,10 +449,10 @@ def disable(self, reason, disable_cgroups): elif disable_cgroups == DisableCgroups.AGENT: # disable agent self._agent_cgroups_enabled = False self.__reset_agent_cpu_quota() - agent_metrics = self._agent_cgroup.get_controller_metrics() - for metric in agent_metrics: - if isinstance(metric, CpuMetrics): - CGroupsTelemetry.stop_tracking(metric) + agent_controllers = self._agent_cgroup.get_controllers() + for controller in agent_controllers: + if isinstance(controller, _CpuController): + CGroupsTelemetry.stop_tracking(controller) break log_cgroup_warning("Disabling resource usage monitoring. Reason: {0}".format(reason), op=WALAEventOperation.CGroupsDisabled) @@ -603,6 +611,22 @@ def _check_processes_in_agent_cgroup(self): self._report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected) raise CGroupsException("The agent's cgroup includes unexpected processes: {0}".format(unexpected)) + def get_logcollector_unit_properties(self): + """ + Returns the systemd unit properties for the log collector process. + + Each property should be explicitly set (even if already included in the log collector slice) for the log + collector process to run in the transient scope directory with the expected accounting and limits. 
+ """ + logcollector_properties = ["--property=CPUAccounting=yes", "--property=MemoryAccounting=yes", "--property=CPUQuota={0}".format(LOGCOLLECTOR_CPU_QUOTA_FOR_V1_AND_V2)] + if not self.using_cgroup_v2(): + return logcollector_properties + # Memory throttling limit is used when running log collector on v2 machines using the 'MemoryHigh' property. + # We do not use a systemd property to enforce memory on V1 because it invokes the OOM killer if the limit + # is exceeded. + logcollector_properties.append("--property=MemoryHigh={0}".format(LOGCOLLECTOR_MEMORY_THROTTLE_LIMIT_FOR_V2)) + return logcollector_properties + @staticmethod def _get_command(pid): try: @@ -727,10 +751,10 @@ def _get_parent(pid): def start_tracking_unit_cgroups(self, unit_name): try: cgroup = self._cgroups_api.get_unit_cgroup(unit_name, unit_name) - metrics = cgroup.get_controller_metrics() + controllers = cgroup.get_controllers() - for metric in metrics: - CGroupsTelemetry.track_cgroup(metric) + for controller in controllers: + CGroupsTelemetry.track_cgroup_controller(controller) except Exception as exception: log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(exception)), send_event=False) @@ -738,10 +762,10 @@ def start_tracking_unit_cgroups(self, unit_name): def stop_tracking_unit_cgroups(self, unit_name): try: cgroup = self._cgroups_api.get_unit_cgroup(unit_name, unit_name) - metrics = cgroup.get_controller_metrics() + controllers = cgroup.get_controllers() - for metric in metrics: - CGroupsTelemetry.stop_tracking(metric) + for controller in controllers: + CGroupsTelemetry.stop_tracking(controller) except Exception as exception: log_cgroup_info("Failed to stop tracking resource usage for the extension service: {0}".format(ustr(exception)), send_event=False) @@ -753,9 +777,9 @@ def stop_tracking_extension_cgroups(self, extension_name): cgroup = self._cgroups_api.get_cgroup_from_relative_path(relative_path=cgroup_relative_path, cgroup_name=extension_name) - metrics = cgroup.get_controller_metrics() - for metric in metrics: - CGroupsTelemetry.stop_tracking(metric) + controllers = cgroup.get_controllers() + for controller in controllers: + CGroupsTelemetry.stop_tracking(controller) except Exception as exception: log_cgroup_info("Failed to stop tracking resource usage for the extension service: {0}".format(ustr(exception)), send_event=False) diff --git a/azurelinuxagent/ga/cgroupcontroller.py b/azurelinuxagent/ga/cgroupcontroller.py new file mode 100644 index 000000000..a530553b2 --- /dev/null +++ b/azurelinuxagent/ga/cgroupcontroller.py @@ -0,0 +1,175 @@ +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Requires Python 2.6+ and Openssl 1.0+ + +import errno +import os +from datetime import timedelta + +from azurelinuxagent.common import logger, conf +from azurelinuxagent.common.exception import CGroupsException +from azurelinuxagent.common.future import ustr +from azurelinuxagent.common.utils import fileutil + +_REPORT_EVERY_HOUR = timedelta(hours=1) +_DEFAULT_REPORT_PERIOD = timedelta(seconds=conf.get_cgroup_check_period()) + +AGENT_NAME_TELEMETRY = "walinuxagent.service" # Name used for telemetry; it needs to be consistent even if the name of the service changes +AGENT_LOG_COLLECTOR = "azure-walinuxagent-logcollector" + + +class CounterNotFound(Exception): + pass + + +class MetricValue(object): + """ + Class for defining all the required metric fields to send telemetry. + """ + + def __init__(self, category, counter, instance, value, report_period=_DEFAULT_REPORT_PERIOD): + self._category = category + self._counter = counter + self._instance = instance + self._value = value + self._report_period = report_period + + @property + def category(self): + return self._category + + @property + def counter(self): + return self._counter + + @property + def instance(self): + return self._instance + + @property + def value(self): + return self._value + + @property + def report_period(self): + return self._report_period + + +class MetricsCategory(object): + MEMORY_CATEGORY = "Memory" + CPU_CATEGORY = "CPU" + + +class MetricsCounter(object): + PROCESSOR_PERCENT_TIME = "% Processor Time" + THROTTLED_TIME = "Throttled Time (s)" + TOTAL_MEM_USAGE = "Total Memory Usage (B)" + ANON_MEM_USAGE = "Anon Memory Usage (B)" + CACHE_MEM_USAGE = "Cache Memory Usage (B)" + MAX_MEM_USAGE = "Max Memory Usage (B)" + SWAP_MEM_USAGE = "Swap Memory Usage (B)" + MEM_THROTTLED = "Total Memory Throttled Events" + AVAILABLE_MEM = "Available Memory (MB)" + USED_MEM = "Used Memory (MB)" + + +class _CgroupController(object): + def __init__(self, name, cgroup_path): + """ + Initialize _data collection for the controller + :param: name: Name of the CGroup + :param: cgroup_path: Path of the controller + :return: + """ + self.name = name + self.path = cgroup_path + + def __str__(self): + return "{0} [{1}]".format(self.name, self.path) + + def _get_cgroup_file(self, file_name): + return os.path.join(self.path, file_name) + + def _get_file_contents(self, file_name): + """ + Retrieve the contents of file. + + :param str file_name: Name of file within that metric controller + :return: Entire contents of the file + :rtype: str + """ + parameter_file = self._get_cgroup_file(file_name) + + return fileutil.read_file(parameter_file) + + def _get_parameters(self, parameter_name, first_line_only=False): + """ + Retrieve the values of a parameter from a controller. + Returns a list of values in the file. + + :param first_line_only: return only the first line. 
+ :param str parameter_name: Name of file within that metric controller + :return: The first line of the file, without line terminator + :rtype: [str] + """ + result = [] + try: + values = self._get_file_contents(parameter_name).splitlines() + result = values[0] if first_line_only else values + except IndexError: + parameter_filename = self._get_cgroup_file(parameter_name) + logger.error("File {0} is empty but should not be".format(parameter_filename)) + raise CGroupsException("File {0} is empty but should not be".format(parameter_filename)) + except Exception as e: + if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101 + raise e + parameter_filename = self._get_cgroup_file(parameter_name) + raise CGroupsException("Exception while attempting to read {0}".format(parameter_filename), e) + return result + + def is_active(self): + """ + Returns True if any processes belong to the cgroup. In v1, cgroup.procs returns a list of the thread group IDs + belonging to the cgroup. In v2, cgroup.procs returns a list of the process IDs belonging to the cgroup. + """ + try: + procs = self._get_parameters("cgroup.procs") + if procs: + return len(procs) != 0 + except (IOError, OSError) as e: + if e.errno == errno.ENOENT: + # only suppressing file not found exceptions. + pass + else: + logger.periodic_warn(logger.EVERY_HALF_HOUR, + 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.' + ' Internal error: {1}'.format(self.path, ustr(e))) + except CGroupsException as e: + logger.periodic_warn(logger.EVERY_HALF_HOUR, + 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.' + ' Internal error: {1}'.format(self.path, ustr(e))) + return False + + def get_tracked_metrics(self, **_): + """ + Retrieves the current value of the metrics tracked for this controller/cgroup and returns them as an array. + """ + raise NotImplementedError() + + def get_unit_properties(self): + """ + Returns a list of the unit properties to collect for the controller.
+ """ + raise NotImplementedError() diff --git a/azurelinuxagent/ga/cgroupstelemetry.py b/azurelinuxagent/ga/cgroupstelemetry.py index e8efad038..412f75f4f 100644 --- a/azurelinuxagent/ga/cgroupstelemetry.py +++ b/azurelinuxagent/ga/cgroupstelemetry.py @@ -17,7 +17,7 @@ import threading from azurelinuxagent.common import logger -from azurelinuxagent.ga.controllermetrics import CpuMetrics +from azurelinuxagent.ga.cpucontroller import _CpuController from azurelinuxagent.common.future import ustr @@ -37,18 +37,18 @@ def get_track_throttled_time(): return CGroupsTelemetry._track_throttled_time @staticmethod - def track_cgroup(cgroup): + def track_cgroup_controller(cgroup_controller): """ - Adds the given item to the dictionary of tracked cgroups + Adds the given item to the dictionary of tracked cgroup controllers """ - if isinstance(cgroup, CpuMetrics): + if isinstance(cgroup_controller, _CpuController): # set the current cpu usage - cgroup.initialize_cpu_usage() + cgroup_controller.initialize_cpu_usage() with CGroupsTelemetry._rlock: - if not CGroupsTelemetry.is_tracked(cgroup.path): - CGroupsTelemetry._tracked[cgroup.path] = cgroup - logger.info("Started tracking cgroup {0}", cgroup) + if not CGroupsTelemetry.is_tracked(cgroup_controller.path): + CGroupsTelemetry._tracked[cgroup_controller.path] = cgroup_controller + logger.info("Started tracking cgroup {0}", cgroup_controller) @staticmethod def is_tracked(path): @@ -75,11 +75,11 @@ def stop_tracking(cgroup): @staticmethod def poll_all_tracked(): metrics = [] - inactive_cgroups = [] + inactive_controllers = [] with CGroupsTelemetry._rlock: - for cgroup in CGroupsTelemetry._tracked.values(): + for controller in CGroupsTelemetry._tracked.values(): try: - metrics.extend(cgroup.get_tracked_metrics(track_throttled_time=CGroupsTelemetry._track_throttled_time)) + metrics.extend(controller.get_tracked_metrics(track_throttled_time=CGroupsTelemetry._track_throttled_time)) except Exception as e: # There can be scenarios when the CGroup has been deleted by the time we are fetching the values # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log @@ -87,11 +87,11 @@ def poll_all_tracked(): # exceptions which could occur, which is why we do a periodic log for all the other errors. if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101 logger.periodic_warn(logger.EVERY_HOUR, '[PERIODIC] Could not collect metrics for cgroup ' - '{0}. Error : {1}'.format(cgroup.name, ustr(e))) - if not cgroup.is_active(): - inactive_cgroups.append(cgroup) - for inactive_cgroup in inactive_cgroups: - CGroupsTelemetry.stop_tracking(inactive_cgroup) + '{0}. 
Error : {1}'.format(controller.name, ustr(e))) + if not controller.is_active(): + inactive_controllers.append(controller) + for inactive_controller in inactive_controllers: + CGroupsTelemetry.stop_tracking(inactive_controller) return metrics diff --git a/azurelinuxagent/ga/collect_logs.py b/azurelinuxagent/ga/collect_logs.py index d8ea3dba3..488691a5a 100644 --- a/azurelinuxagent/ga/collect_logs.py +++ b/azurelinuxagent/ga/collect_logs.py @@ -25,19 +25,17 @@ import azurelinuxagent.common.conf as conf from azurelinuxagent.common import logger -from azurelinuxagent.ga.controllermetrics import MetricsCounter -from azurelinuxagent.common.event import elapsed_milliseconds, add_event, WALAEventOperation, report_metric +from azurelinuxagent.ga.cgroupcontroller import MetricsCounter +from azurelinuxagent.common.event import elapsed_milliseconds, add_event, WALAEventOperation from azurelinuxagent.common.future import ustr from azurelinuxagent.ga.interfaces import ThreadHandlerInterface from azurelinuxagent.ga.logcollector import COMPRESSED_ARCHIVE_PATH, GRACEFUL_KILL_ERRCODE -from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator, LOGCOLLECTOR_MEMORY_LIMIT +from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator, LOGCOLLECTOR_ANON_MEMORY_LIMIT_FOR_V1_AND_V2, LOGCOLLECTOR_CACHE_MEMORY_LIMIT_FOR_V1_AND_V2, LOGCOLLECTOR_MAX_THROTTLED_EVENTS_FOR_V2 from azurelinuxagent.common.protocol.util import get_protocol_util from azurelinuxagent.common.utils import shellutil from azurelinuxagent.common.utils.shellutil import CommandError from azurelinuxagent.common.version import PY_VERSION_MAJOR, PY_VERSION_MINOR, AGENT_NAME, CURRENT_VERSION -_INITIAL_LOG_COLLECTION_DELAY = 5 * 60 # Five minutes of delay - def get_collect_logs_handler(): return CollectLogsHandler() @@ -46,18 +44,27 @@ def get_collect_logs_handler(): def is_log_collection_allowed(): # There are three conditions that need to be met in order to allow periodic log collection: # 1) It should be enabled in the configuration. - # 2) The system must be using cgroups to manage services. Needed for resource limiting of the log collection. + # 2) The system must be using cgroups to manage services - needed for resource limiting of the log collection. The + # agent currently fully supports resource limiting for v1, but only supports log collector resource limiting for v2 + # if enabled via configuration. + # This condition is True if either: + # a. cgroup usage in the agent is enabled; OR + # b. the machine is using cgroup v2 and v2 resource limiting is enabled in the configuration. # 3) The python version must be greater than 2.6 in order to support the ZipFile library used when collecting. conf_enabled = conf.get_collect_logs() cgroups_enabled = CGroupConfigurator.get_instance().enabled() + cgroup_v2_resource_limiting_enabled = CGroupConfigurator.get_instance().using_cgroup_v2() and conf.get_enable_cgroup_v2_resource_limiting() supported_python = PY_VERSION_MINOR >= 6 if PY_VERSION_MAJOR == 2 else PY_VERSION_MAJOR == 3 - is_allowed = conf_enabled and cgroups_enabled and supported_python + is_allowed = conf_enabled and (cgroups_enabled or cgroup_v2_resource_limiting_enabled) and supported_python msg = "Checking if log collection is allowed at this time [{0}]. All three conditions must be met: " \ - "configuration enabled [{1}], cgroups enabled [{2}], python supported: [{3}]".format(is_allowed, - conf_enabled, - cgroups_enabled, - supported_python) + "1. configuration enabled [{1}], " \ + "2. 
cgroups v1 enabled [{2}] OR cgroups v2 is in use and v2 resource limiting configuration enabled [{3}], " \ + "3. python supported: [{4}]".format(is_allowed, + conf_enabled, + cgroups_enabled, + cgroup_v2_resource_limiting_enabled, + supported_python) logger.info(msg) add_event( name=AGENT_NAME, @@ -144,7 +151,7 @@ def init_protocols(self): def daemon(self): # Delay the first collector on start up to give short lived VMs (that might be dead before the second # collection has a chance to run) an opportunity to do produce meaningful logs to collect. - time.sleep(_INITIAL_LOG_COLLECTION_DELAY) + time.sleep(conf.get_log_collector_initial_delay()) try: CollectLogsHandler.enable_monitor_cgroups_check() @@ -171,15 +178,13 @@ def collect_and_send_logs(self): def _collect_logs(self): logger.info("Starting log collection...") - # Invoke the command line tool in the agent to collect logs, with resource limits on CPU. - # Some distros like ubuntu20 by default cpu and memory accounting enabled. Thus create nested cgroups under the logcollector slice - # So disabling CPU and Memory accounting prevents from creating nested cgroups, so that all the counters will be present in logcollector Cgroup - + # Invoke the command line tool in the agent to collect logs. The --scope option starts the process as a systemd + # transient scope unit. The --property option is used to set systemd memory and cpu properties on the scope. systemd_cmd = [ - "systemd-run", "--property=CPUAccounting=no", "--property=MemoryAccounting=no", + "systemd-run", "--unit={0}".format(logcollector.CGROUPS_UNIT), "--slice={0}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE), "--scope" - ] + ] + CGroupConfigurator.get_instance().get_logcollector_unit_properties() # The log tool is invoked from the current agent's egg with the command line option collect_logs_cmd = [sys.executable, "-u", sys.argv[0], "-collect-logs"] @@ -208,8 +213,7 @@ def exec_command(): # pylint has limited (i.e. no) awareness of control flow w.r.t. typing. we disable=no-member # here because we know e must be a CommandError but pylint still considers the case where # e is a different type of exception. - err_msg = ustr("Log Collector exited with code {0}").format( - e.returncode) # pylint: disable=no-member + err_msg = ustr("Log Collector exited with code {0}").format(e.returncode) # pylint: disable=no-member if e.returncode == logcollector.INVALID_CGROUPS_ERRCODE: # pylint: disable=no-member logger.info("Disabling periodic log collection until service restart due to process error.") @@ -262,8 +266,8 @@ def _send_logs(self): log_event=False) -def get_log_collector_monitor_handler(cgroups): - return LogCollectorMonitorHandler(cgroups) +def get_log_collector_monitor_handler(controllers): + return LogCollectorMonitorHandler(controllers) class LogCollectorMonitorHandler(ThreadHandlerInterface): @@ -277,12 +281,13 @@ class LogCollectorMonitorHandler(ThreadHandlerInterface): def get_thread_name(): return LogCollectorMonitorHandler._THREAD_NAME - def __init__(self, cgroups): + def __init__(self, controllers): self.event_thread = None self.should_run = True self.period = 2 # Log collector monitor runs every 2 secs. 
- self.cgroups = cgroups - self.__log_metrics = conf.get_cgroup_log_metrics() + self.controllers = controllers + self.max_recorded_metrics = {} + self.__should_log_metrics = conf.get_cgroup_log_metrics() def run(self): self.start() @@ -312,7 +317,8 @@ def daemon(self): while not self.stopped(): try: metrics = self._poll_resource_usage() - self._send_telemetry(metrics) + if self.__should_log_metrics: + self._log_metrics(metrics) self._verify_memory_limit(metrics) except Exception as e: logger.error("An error occurred in the log collection monitor thread loop; " @@ -324,30 +330,54 @@ def daemon(self): "An error occurred in the MonitorLogCollectorCgroupsHandler thread; will exit the thread.\n{0}", ustr(e)) + def get_max_recorded_metrics(self): + return self.max_recorded_metrics + def _poll_resource_usage(self): metrics = [] - for cgroup in self.cgroups: - metrics.extend(cgroup.get_tracked_metrics(track_throttled_time=True)) + for controller in self.controllers: + metrics.extend(controller.get_tracked_metrics(track_throttled_time=True)) + + for metric in metrics: + current_max = self.max_recorded_metrics.get(metric.counter) + self.max_recorded_metrics[metric.counter] = metric.value if current_max is None else max(current_max, metric.value) + return metrics - def _send_telemetry(self, metrics): + def _log_metrics(self, metrics): for metric in metrics: - report_metric(metric.category, metric.counter, metric.instance, metric.value, log_event=self.__log_metrics) + logger.info("Metric {0}/{1} [{2}] = {3}".format(metric.category, metric.counter, metric.instance, metric.value)) def _verify_memory_limit(self, metrics): - current_usage = 0 + current_anon_and_swap_usage = 0 + current_cache_usage = 0 + memory_throttled_events = 0 for metric in metrics: - if metric.counter == MetricsCounter.TOTAL_MEM_USAGE: - current_usage += metric.value + if metric.counter == MetricsCounter.ANON_MEM_USAGE: + current_anon_and_swap_usage += metric.value elif metric.counter == MetricsCounter.SWAP_MEM_USAGE: - current_usage += metric.value - - if current_usage > LOGCOLLECTOR_MEMORY_LIMIT: - msg = "Log collector memory limit {0} bytes exceeded. The max reported usage is {1} bytes.".format(LOGCOLLECTOR_MEMORY_LIMIT, current_usage) + current_anon_and_swap_usage += metric.value + elif metric.counter == MetricsCounter.CACHE_MEM_USAGE: + current_cache_usage = metric.value + elif metric.counter == MetricsCounter.MEM_THROTTLED: + memory_throttled_events = metric.value + + mem_limit_exceeded = False + if current_anon_and_swap_usage > LOGCOLLECTOR_ANON_MEMORY_LIMIT_FOR_V1_AND_V2: + mem_limit_exceeded = True + msg = "Log collector anon + swap memory limit {0} bytes exceeded. The reported usage is {1} bytes.".format(LOGCOLLECTOR_ANON_MEMORY_LIMIT_FOR_V1_AND_V2, current_anon_and_swap_usage) logger.info(msg) - add_event( - name=AGENT_NAME, - version=CURRENT_VERSION, - op=WALAEventOperation.LogCollection, - message=msg) + add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.LogCollection, message=msg) + if current_cache_usage > LOGCOLLECTOR_CACHE_MEMORY_LIMIT_FOR_V1_AND_V2: + mem_limit_exceeded = True + msg = "Log collector cache memory limit {0} bytes exceeded. 
The reported usage is {1} bytes.".format(LOGCOLLECTOR_CACHE_MEMORY_LIMIT_FOR_V1_AND_V2, current_cache_usage) + logger.info(msg) + add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.LogCollection, message=msg) + if memory_throttled_events > LOGCOLLECTOR_MAX_THROTTLED_EVENTS_FOR_V2: + mem_limit_exceeded = True + msg = "Log collector memory throttled events limit {0} exceeded. The reported number of throttled events is {1}.".format(LOGCOLLECTOR_MAX_THROTTLED_EVENTS_FOR_V2, memory_throttled_events) + logger.info(msg) + add_event(name=AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.LogCollection, message=msg) + + if mem_limit_exceeded: os._exit(GRACEFUL_KILL_ERRCODE) diff --git a/azurelinuxagent/ga/controllermetrics.py b/azurelinuxagent/ga/controllermetrics.py deleted file mode 100644 index 3aaeab319..000000000 --- a/azurelinuxagent/ga/controllermetrics.py +++ /dev/null @@ -1,404 +0,0 @@ -# Copyright 2018 Microsoft Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Requires Python 2.6+ and Openssl 1.0+ - -import errno -import os -import re -from datetime import timedelta - -from azurelinuxagent.common import logger, conf -from azurelinuxagent.common.exception import CGroupsException -from azurelinuxagent.common.future import ustr -from azurelinuxagent.common.osutil import get_osutil -from azurelinuxagent.common.utils import fileutil - -_REPORT_EVERY_HOUR = timedelta(hours=1) -_DEFAULT_REPORT_PERIOD = timedelta(seconds=conf.get_cgroup_check_period()) - -AGENT_NAME_TELEMETRY = "walinuxagent.service" # Name used for telemetry; it needs to be consistent even if the name of the service changes -AGENT_LOG_COLLECTOR = "azure-walinuxagent-logcollector" - - -class CounterNotFound(Exception): - pass - - -class MetricValue(object): - - """ - Class for defining all the required metric fields to send telemetry. 
- """ - - def __init__(self, category, counter, instance, value, report_period=_DEFAULT_REPORT_PERIOD): - self._category = category - self._counter = counter - self._instance = instance - self._value = value - self._report_period = report_period - - @property - def category(self): - return self._category - - @property - def counter(self): - return self._counter - - @property - def instance(self): - return self._instance - - @property - def value(self): - return self._value - - @property - def report_period(self): - return self._report_period - - -class MetricsCategory(object): - MEMORY_CATEGORY = "Memory" - CPU_CATEGORY = "CPU" - - -class MetricsCounter(object): - PROCESSOR_PERCENT_TIME = "% Processor Time" - TOTAL_MEM_USAGE = "Total Memory Usage" - MAX_MEM_USAGE = "Max Memory Usage" - THROTTLED_TIME = "Throttled Time" - SWAP_MEM_USAGE = "Swap Memory Usage" - AVAILABLE_MEM = "Available MBytes" - USED_MEM = "Used MBytes" - - -re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n') - - -class ControllerMetrics(object): - def __init__(self, name, cgroup_path): - """ - Initialize _data collection for the Memory controller - :param: name: Name of the CGroup - :param: cgroup_path: Path of the controller - :return: - """ - self.name = name - self.path = cgroup_path - - def __str__(self): - return "{0} [{1}]".format(self.name, self.path) - - def _get_cgroup_file(self, file_name): - return os.path.join(self.path, file_name) - - def _get_file_contents(self, file_name): - """ - Retrieve the contents to file. - - :param str file_name: Name of file within that metric controller - :return: Entire contents of the file - :rtype: str - """ - parameter_file = self._get_cgroup_file(file_name) - - return fileutil.read_file(parameter_file) - - def _get_parameters(self, parameter_name, first_line_only=False): - """ - Retrieve the values of a parameter from a controller. - Returns a list of values in the file. - - :param first_line_only: return only the first line. - :param str parameter_name: Name of file within that metric controller - :return: The first line of the file, without line terminator - :rtype: [str] - """ - result = [] - try: - values = self._get_file_contents(parameter_name).splitlines() - result = values[0] if first_line_only else values - except IndexError: - parameter_filename = self._get_cgroup_file(parameter_name) - logger.error("File {0} is empty but should not be".format(parameter_filename)) - raise CGroupsException("File {0} is empty but should not be".format(parameter_filename)) - except Exception as e: - if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101 - raise e - parameter_filename = self._get_cgroup_file(parameter_name) - raise CGroupsException("Exception while attempting to read {0}".format(parameter_filename), e) - return result - - def is_active(self): - try: - tasks = self._get_parameters("tasks") - if tasks: - return len(tasks) != 0 - except (IOError, OSError) as e: - if e.errno == errno.ENOENT: - # only suppressing file not found exceptions. - pass - else: - logger.periodic_warn(logger.EVERY_HALF_HOUR, - 'Could not get list of tasks from "tasks" file in the cgroup: {0}.' - ' Internal error: {1}'.format(self.path, ustr(e))) - except CGroupsException as e: - logger.periodic_warn(logger.EVERY_HALF_HOUR, - 'Could not get list of tasks from "tasks" file in the cgroup: {0}.' 
- ' Internal error: {1}'.format(self.path, ustr(e))) - return False - - def get_tracked_metrics(self, **_): - """ - Retrieves the current value of the metrics tracked for this cgroup and returns them as an array. - - Note: Agent won't track the metrics if the current cpu ticks less than previous value and returns empty array. - """ - raise NotImplementedError() - - def get_unit_properties(self): - """ - Returns a list of the unit properties to collect for the controller. - """ - raise NotImplementedError() - - -class CpuMetrics(ControllerMetrics): - def __init__(self, name, cgroup_path): - super(CpuMetrics, self).__init__(name, cgroup_path) - - self._osutil = get_osutil() - self._previous_cgroup_cpu = None - self._previous_system_cpu = None - self._current_cgroup_cpu = None - self._current_system_cpu = None - self._previous_throttled_time = None - self._current_throttled_time = None - - def _get_cpu_ticks(self, allow_no_such_file_or_directory_error=False): - """ - Returns the number of USER_HZ of CPU time (user and system) consumed by this cgroup. - - If allow_no_such_file_or_directory_error is set to True and cpuacct.stat does not exist the function - returns 0; this is useful when the function can be called before the cgroup has been created. - """ - try: - cpuacct_stat = self._get_file_contents('cpuacct.stat') - except Exception as e: - if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101 - raise CGroupsException("Failed to read cpuacct.stat: {0}".format(ustr(e))) - if not allow_no_such_file_or_directory_error: - raise e - cpuacct_stat = None - - cpu_ticks = 0 - - if cpuacct_stat is not None: - # - # Sample file: - # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpuacct.stat - # user 10190 - # system 3160 - # - match = re_user_system_times.match(cpuacct_stat) - if not match: - raise CGroupsException( - "The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpuacct_stat)) - cpu_ticks = int(match.groups()[0]) + int(match.groups()[1]) - - return cpu_ticks - - def get_throttled_time(self): - try: - with open(os.path.join(self.path, 'cpu.stat')) as cpu_stat: - # - # Sample file: - # - # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpu.stat - # nr_periods 51660 - # nr_throttled 19461 - # throttled_time 1529590856339 - # - for line in cpu_stat: - match = re.match(r'throttled_time\s+(\d+)', line) - if match is not None: - return int(match.groups()[0]) - raise Exception("Cannot find throttled_time") - except (IOError, OSError) as e: - if e.errno == errno.ENOENT: - return 0 - raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e))) - except Exception as e: - raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e))) - - def _cpu_usage_initialized(self): - return self._current_cgroup_cpu is not None and self._current_system_cpu is not None - - def initialize_cpu_usage(self): - """ - Sets the initial values of CPU usage. This function must be invoked before calling get_cpu_usage(). - """ - if self._cpu_usage_initialized(): - raise CGroupsException("initialize_cpu_usage() should be invoked only once") - self._current_cgroup_cpu = self._get_cpu_ticks(allow_no_such_file_or_directory_error=True) - self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot() - self._current_throttled_time = self.get_throttled_time() - - def get_cpu_usage(self): - """ - Computes the CPU used by the cgroup since the last call to this function. 
- - The usage is measured as a percentage of utilization of 1 core in the system. For example, - using 1 core all of the time on a 4-core system would be reported as 100%. - - NOTE: initialize_cpu_usage() must be invoked before calling get_cpu_usage() - """ - if not self._cpu_usage_initialized(): - raise CGroupsException("initialize_cpu_usage() must be invoked before the first call to get_cpu_usage()") - - self._previous_cgroup_cpu = self._current_cgroup_cpu - self._previous_system_cpu = self._current_system_cpu - self._current_cgroup_cpu = self._get_cpu_ticks() - self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot() - - cgroup_delta = self._current_cgroup_cpu - self._previous_cgroup_cpu - system_delta = max(1, self._current_system_cpu - self._previous_system_cpu) - - return round(100.0 * self._osutil.get_processor_cores() * float(cgroup_delta) / float(system_delta), 3) - - def get_cpu_throttled_time(self, read_previous_throttled_time=True): - """ - Computes the throttled time (in seconds) since the last call to this function. - NOTE: initialize_cpu_usage() must be invoked before calling this function - Compute only current throttled time if read_previous_throttled_time set to False - """ - if not read_previous_throttled_time: - return float(self.get_throttled_time() / 1E9) - - if not self._cpu_usage_initialized(): - raise CGroupsException( - "initialize_cpu_usage() must be invoked before the first call to get_throttled_time()") - - self._previous_throttled_time = self._current_throttled_time - self._current_throttled_time = self.get_throttled_time() - - return float(self._current_throttled_time - self._previous_throttled_time) / 1E9 - - def get_tracked_metrics(self, **kwargs): - tracked = [] - cpu_usage = self.get_cpu_usage() - if cpu_usage >= float(0): - tracked.append( - MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage)) - - if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']: - throttled_time = self.get_cpu_throttled_time() - if cpu_usage >= float(0) and throttled_time >= float(0): - tracked.append( - MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time)) - - return tracked - - def get_unit_properties(self): - return ["CPUAccounting", "CPUQuotaPerSecUSec"] - - -class MemoryMetrics(ControllerMetrics): - def __init__(self, name, cgroup_path): - super(MemoryMetrics, self).__init__(name, cgroup_path) - - self._counter_not_found_error_count = 0 - - def _get_memory_stat_counter(self, counter_name): - try: - with open(os.path.join(self.path, 'memory.stat')) as memory_stat: - # cat /sys/fs/cgroup/memory/azure.slice/memory.stat - # cache 67178496 - # rss 42340352 - # rss_huge 6291456 - # swap 0 - for line in memory_stat: - re_memory_counter = r'{0}\s+(\d+)'.format(counter_name) - match = re.match(re_memory_counter, line) - if match is not None: - return int(match.groups()[0]) - except (IOError, OSError) as e: - if e.errno == errno.ENOENT: - raise - raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e))) - except Exception as e: - raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e))) - - raise CounterNotFound("Cannot find counter: {0}".format(counter_name)) - - def get_memory_usage(self): - """ - Collect RSS+CACHE from memory.stat cgroup. 
- - :return: Memory usage in bytes - :rtype: int - """ - - cache = self._get_memory_stat_counter("cache") - rss = self._get_memory_stat_counter("rss") - return cache + rss - - def try_swap_memory_usage(self): - """ - Collect SWAP from memory.stat cgroup. - - :return: Memory usage in bytes - :rtype: int - Note: stat file is the only place to get the SWAP since other swap related file memory.memsw.usage_in_bytes is for total Memory+SWAP. - """ - try: - return self._get_memory_stat_counter("swap") - except CounterNotFound as e: - if self._counter_not_found_error_count < 1: - logger.periodic_info(logger.EVERY_HALF_HOUR, - '{0} from "memory.stat" file in the cgroup: {1}---[Note: This log for informational purpose only and can be ignored]'.format(ustr(e), self.path)) - self._counter_not_found_error_count += 1 - return 0 - - def get_max_memory_usage(self): - """ - Collect memory.max_usage_in_bytes from the cgroup. - - :return: Memory usage in bytes - :rtype: int - """ - usage = 0 - try: - usage = int(self._get_parameters('memory.max_usage_in_bytes', first_line_only=True)) - except Exception as e: - if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101 - raise - raise CGroupsException("Exception while attempting to read {0}".format("memory.max_usage_in_bytes"), e) - - return usage - - def get_tracked_metrics(self, **_): - return [ - MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, self.name, - self.get_memory_usage()), - MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, self.name, - self.get_max_memory_usage(), _REPORT_EVERY_HOUR), - MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.SWAP_MEM_USAGE, self.name, - self.try_swap_memory_usage(), _REPORT_EVERY_HOUR) - ] - - def get_unit_properties(self): - return["MemoryAccounting"] diff --git a/azurelinuxagent/ga/cpucontroller.py b/azurelinuxagent/ga/cpucontroller.py new file mode 100644 index 000000000..b4f56dd15 --- /dev/null +++ b/azurelinuxagent/ga/cpucontroller.py @@ -0,0 +1,293 @@ +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Requires Python 2.6+ and Openssl 1.0+ + +import errno +import os +import re + +from azurelinuxagent.common.exception import CGroupsException +from azurelinuxagent.common.future import ustr +from azurelinuxagent.common.osutil import get_osutil +from azurelinuxagent.common.utils import fileutil +from azurelinuxagent.ga.cgroupcontroller import _CgroupController, MetricValue, MetricsCategory, MetricsCounter + +re_v1_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n') +re_v2_usage_time = re.compile(r'[\s\S]*usage_usec (\d+)[\s\S]*') + + +class _CpuController(_CgroupController): + def __init__(self, name, cgroup_path): + super(_CpuController, self).__init__(name, cgroup_path) + + self._osutil = get_osutil() + self._previous_cgroup_cpu = None + self._previous_system_cpu = None + self._current_cgroup_cpu = None + self._current_system_cpu = None + self._previous_throttled_time = None + self._current_throttled_time = None + + def _get_cpu_stat_counter(self, counter_name): + """ + Gets the value for the provided counter in cpu.stat + """ + try: + with open(os.path.join(self.path, 'cpu.stat')) as cpu_stat: + # + # Sample file v1: + # # cat cpu.stat + # nr_periods 51660 + # nr_throttled 19461 + # throttled_time 1529590856339 + # + # Sample file v2 + # # cat cpu.stat + # usage_usec 200161503 + # user_usec 199388368 + # system_usec 773134 + # core_sched.force_idle_usec 0 + # nr_periods 40059 + # nr_throttled 40022 + # throttled_usec 3565247992 + # nr_bursts 0 + # burst_usec 0 + # + for line in cpu_stat: + match = re.match(r'{0}\s+(\d+)'.format(counter_name), line) + if match is not None: + return int(match.groups()[0]) + raise Exception("Cannot find {0}".format(counter_name)) + except (IOError, OSError) as e: + if e.errno == errno.ENOENT: + return 0 + raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e))) + except Exception as e: + raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e))) + + def _cpu_usage_initialized(self): + """ + Returns True if cpu usage has been initialized, False otherwise. + """ + return self._current_cgroup_cpu is not None and self._current_system_cpu is not None + + def initialize_cpu_usage(self): + """ + Sets the initial values of CPU usage. This function must be invoked before calling get_cpu_usage(). + """ + raise NotImplementedError() + + def get_cpu_usage(self): + """ + Computes the CPU used by the cgroup since the last call to this function. + + The usage is measured as a percentage of utilization of 1 core in the system. For example, + using 1 core all of the time on a 4-core system would be reported as 100%. + + NOTE: initialize_cpu_usage() must be invoked before calling get_cpu_usage() + """ + raise NotImplementedError() + + def get_cpu_throttled_time(self, read_previous_throttled_time=True): + """ + Computes the throttled time (in seconds) since the last call to this function. + NOTE: initialize_cpu_usage() must be invoked before calling this function + Compute only current throttled time if read_previous_throttled_time set to False + """ + raise NotImplementedError() + + def get_tracked_metrics(self, **kwargs): + # Note: If the current cpu usage is less than the previous usage (metric is negative), then an empty array will + # be returned and the agent won't track the metrics. 
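# Editor's illustration (not part of the patch): the get_cpu_usage() docstring above defines
# usage as a percentage of one core. For reference while reading the V1/V2 implementations
# further below, this is the arithmetic with sample numbers; only the fixture-derived values
# in the last line come from this patch, everything else is hypothetical.
def percent_of_one_core_v1(cgroup_ticks_delta, system_ticks_delta, processor_cores):
    # v1: both deltas are USER_HZ ticks; the system total aggregates every core, so the
    # core count rescales the ratio to a single core.
    return round(100.0 * processor_cores * float(cgroup_ticks_delta) / float(max(1, system_ticks_delta)), 3)

def percent_of_one_core_v2(cgroup_cpu_seconds_delta, uptime_seconds_delta):
    # v2: cpu.stat usage_usec already sums CPU time across all cores and the system delta
    # is wall-clock uptime, so no core multiplier is applied.
    return round(100.0 * float(cgroup_cpu_seconds_delta) / float(max(1.0, uptime_seconds_delta)), 3)

# One core used continuously on a 4-core machine accrues ~1/4 of the system-wide ticks:
print(percent_of_one_core_v1(250, 1000, processor_cores=4))  # 100.0
# Deltas taken from the v2 cpu.stat_t0/t1 and proc_uptime_t0/t1 fixtures added in this patch:
print(percent_of_one_core_v2((819624087 - 817045397) / 1E6, 777350.57 - 776968.02))  # ~0.674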
+ tracked = [] + cpu_usage = self.get_cpu_usage() + if cpu_usage >= float(0): + tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage)) + + if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']: + throttled_time = self.get_cpu_throttled_time() + if cpu_usage >= float(0) and throttled_time >= float(0): + tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time)) + + return tracked + + def get_unit_properties(self): + return ["CPUAccounting", "CPUQuotaPerSecUSec"] + + +class CpuControllerV1(_CpuController): + def initialize_cpu_usage(self): + if self._cpu_usage_initialized(): + raise CGroupsException("initialize_cpu_usage() should be invoked only once") + self._current_cgroup_cpu = self._get_cpu_ticks(allow_no_such_file_or_directory_error=True) + self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot() + self._current_throttled_time = self._get_cpu_stat_counter(counter_name='throttled_time') + + def _get_cpu_ticks(self, allow_no_such_file_or_directory_error=False): + """ + Returns the number of USER_HZ of CPU time (user and system) consumed by this cgroup. + + If allow_no_such_file_or_directory_error is set to True and cpuacct.stat does not exist the function + returns 0; this is useful when the function can be called before the cgroup has been created. + """ + try: + cpuacct_stat = self._get_file_contents('cpuacct.stat') + except Exception as e: + if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101 + raise CGroupsException("Failed to read cpuacct.stat: {0}".format(ustr(e))) + if not allow_no_such_file_or_directory_error: + raise e + cpuacct_stat = None + + cpu_ticks = 0 + + if cpuacct_stat is not None: + # + # Sample file: + # # cat /sys/fs/cgroup/cpuacct/azure.slice/walinuxagent.service/cpuacct.stat + # user 10190 + # system 3160 + # + match = re_v1_user_system_times.match(cpuacct_stat) + if not match: + raise CGroupsException("The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpuacct.stat'), cpuacct_stat)) + cpu_ticks = int(match.groups()[0]) + int(match.groups()[1]) + + return cpu_ticks + + def get_cpu_usage(self): + if not self._cpu_usage_initialized(): + raise CGroupsException("initialize_cpu_usage() must be invoked before the first call to get_cpu_usage()") + + self._previous_cgroup_cpu = self._current_cgroup_cpu + self._previous_system_cpu = self._current_system_cpu + self._current_cgroup_cpu = self._get_cpu_ticks() + self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot() + + cgroup_delta = self._current_cgroup_cpu - self._previous_cgroup_cpu + system_delta = max(1, self._current_system_cpu - self._previous_system_cpu) + + return round(100.0 * self._osutil.get_processor_cores() * float(cgroup_delta) / float(system_delta), 3) + + def get_cpu_throttled_time(self, read_previous_throttled_time=True): + # Throttled time is reported in nanoseconds in v1 + if not read_previous_throttled_time: + return float(self._get_cpu_stat_counter(counter_name='throttled_time') / 1E9) + + if not self._cpu_usage_initialized(): + raise CGroupsException("initialize_cpu_usage() must be invoked before the first call to get_cpu_throttled_time()") + + self._previous_throttled_time = self._current_throttled_time + self._current_throttled_time = self._get_cpu_stat_counter(counter_name='throttled_time') + + return round(float(self._current_throttled_time - self._previous_throttled_time) / 
1E9, 3) + + +class CpuControllerV2(_CpuController): + @staticmethod + def get_system_uptime(): + """ + Get the uptime of the system (including time spent in suspend) in seconds. + /proc/uptime contains two numbers (values in seconds): the uptime of the system (including time spent in + suspend) and the amount of time spent in the idle process: + # cat /proc/uptime + 365380.48 722644.81 + + :return: System uptime in seconds + :rtype: float + """ + uptime_contents = fileutil.read_file('/proc/uptime').split() + return float(uptime_contents[0]) + + def _get_system_usage(self): + try: + return self.get_system_uptime() + except (OSError, IOError) as e: + raise CGroupsException("Couldn't read /proc/uptime: {0}".format(ustr(e))) + except Exception as e: + raise CGroupsException("Couldn't parse /proc/uptime: {0}".format(ustr(e))) + + def initialize_cpu_usage(self): + if self._cpu_usage_initialized(): + raise CGroupsException("initialize_cpu_usage() should be invoked only once") + self._current_cgroup_cpu = self._get_cpu_time(allow_no_such_file_or_directory_error=True) + self._current_system_cpu = self._get_system_usage() + self._current_throttled_time = self._get_cpu_stat_counter(counter_name='throttled_usec') + + def _get_cpu_time(self, allow_no_such_file_or_directory_error=False): + """ + Returns the CPU time (user and system) consumed by this cgroup in seconds. + + If allow_no_such_file_or_directory_error is set to True and cpu.stat does not exist the function + returns 0; this is useful when the function can be called before the cgroup has been created. + """ + try: + cpu_stat = self._get_file_contents('cpu.stat') + except Exception as e: + if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: # pylint: disable=E1101 + raise CGroupsException("Failed to read cpu.stat: {0}".format(ustr(e))) + if not allow_no_such_file_or_directory_error: + raise e + cpu_stat = None + + cpu_time = 0 + + if cpu_stat is not None: + # + # Sample file: + # # cat /sys/fs/cgroup/azure.slice/azure-walinuxagent.slice/azure-walinuxagent-logcollector.slice/collect-logs.scope/cpu.stat + # usage_usec 1990707 + # user_usec 1939858 + # system_usec 50848 + # core_sched.force_idle_usec 0 + # nr_periods 397 + # nr_throttled 397 + # throttled_usec 37994949 + # nr_bursts 0 + # burst_usec 0 + # + match = re_v2_usage_time.match(cpu_stat) + if not match: + raise CGroupsException("The contents of {0} are invalid: {1}".format(self._get_cgroup_file('cpu.stat'), cpu_stat)) + cpu_time = int(match.groups()[0]) / 1E6 + + return cpu_time + + def get_cpu_usage(self): + if not self._cpu_usage_initialized(): + raise CGroupsException("initialize_cpu_usage() must be invoked before the first call to get_cpu_usage()") + + self._previous_cgroup_cpu = self._current_cgroup_cpu + self._previous_system_cpu = self._current_system_cpu + self._current_cgroup_cpu = self._get_cpu_time() + self._current_system_cpu = self._get_system_usage() + + cgroup_delta = self._current_cgroup_cpu - self._previous_cgroup_cpu + system_delta = max(1.0, self._current_system_cpu - self._previous_system_cpu) + + return round(100.0 * float(cgroup_delta) / float(system_delta), 3) + + def get_cpu_throttled_time(self, read_previous_throttled_time=True): + # Throttled time is reported in microseconds in v2 + if not read_previous_throttled_time: + return float(self._get_cpu_stat_counter(counter_name='throttled_usec') / 1E6) + + if not self._cpu_usage_initialized(): + raise CGroupsException("initialize_cpu_usage() must be invoked before the first call to 
get_cpu_throttled_time()") + + self._previous_throttled_time = self._current_throttled_time + self._current_throttled_time = self._get_cpu_stat_counter(counter_name='throttled_usec') + + return round(float(self._current_throttled_time - self._previous_throttled_time) / 1E6, 3) diff --git a/azurelinuxagent/ga/extensionprocessutil.py b/azurelinuxagent/ga/extensionprocessutil.py index 9061fd3ff..8eb65d459 100644 --- a/azurelinuxagent/ga/extensionprocessutil.py +++ b/azurelinuxagent/ga/extensionprocessutil.py @@ -31,7 +31,7 @@ TELEMETRY_MESSAGE_MAX_LEN = 3200 -def wait_for_process_completion_or_timeout(process, timeout, cpu_metrics): +def wait_for_process_completion_or_timeout(process, timeout, cpu_controller): """ Utility function that waits for the process to complete within the given time frame. This function will terminate the process if when the given time frame elapses. @@ -47,7 +47,7 @@ def wait_for_process_completion_or_timeout(process, timeout, cpu_metrics): throttled_time = 0 if timeout == 0: - throttled_time = get_cpu_throttled_time(cpu_metrics) + throttled_time = get_cpu_throttled_time(cpu_controller) os.killpg(os.getpgid(process.pid), signal.SIGKILL) else: # process completed or forked; sleep 1 sec to give the child process (if any) a chance to start @@ -57,7 +57,7 @@ def wait_for_process_completion_or_timeout(process, timeout, cpu_metrics): return timeout == 0, return_code, throttled_time -def handle_process_completion(process, command, timeout, stdout, stderr, error_code, cpu_metrics=None): +def handle_process_completion(process, command, timeout, stdout, stderr, error_code, cpu_controller=None): """ Utility function that waits for process completion and retrieves its output (stdout and stderr) if it completed before the timeout period. Otherwise, the process will get killed and an ExtensionError will be raised. @@ -68,15 +68,15 @@ def handle_process_completion(process, command, timeout, stdout, stderr, error_c :param stdout: Must be a file since we seek on it when parsing the subprocess output :param stderr: Must be a file since we seek on it when parsing the subprocess outputs :param error_code: The error code to set if we raise an ExtensionError - :param cpu_metrics: References the cpu metrics for the cgroup + :param cpu_controller: References the cpu controller for the cgroup :return: """ # Wait for process completion or timeout - timed_out, return_code, throttled_time = wait_for_process_completion_or_timeout(process, timeout, cpu_metrics) + timed_out, return_code, throttled_time = wait_for_process_completion_or_timeout(process, timeout, cpu_controller) process_output = read_output(stdout, stderr) if timed_out: - if cpu_metrics is not None: # Report CPUThrottledTime when timeout happens + if cpu_controller is not None: # Report CPUThrottledTime when timeout happens raise ExtensionError("Timeout({0});CPUThrottledTime({1}secs): {2}\n{3}".format(timeout, throttled_time, command, process_output), code=ExtensionErrorCodes.PluginHandlerScriptTimedout) @@ -211,14 +211,14 @@ def to_s(captured_stdout, stdout_offset, captured_stderr, stderr_offset): return to_s(stdout, -1*max_len_each, stderr, -1*max_len_each) -def get_cpu_throttled_time(cpu_metrics): +def get_cpu_throttled_time(cpu_controller): """ return the throttled time for the given cgroup. 
""" throttled_time = 0 - if cpu_metrics is not None: + if cpu_controller is not None: try: - throttled_time = cpu_metrics.get_cpu_throttled_time(read_previous_throttled_time=False) + throttled_time = cpu_controller.get_cpu_throttled_time(read_previous_throttled_time=False) except Exception as e: logger.warn("Failed to get cpu throttled time for the extension: {0}", ustr(e)) diff --git a/azurelinuxagent/ga/logcollector.py b/azurelinuxagent/ga/logcollector.py index eda6106b6..dfd5bfaf1 100644 --- a/azurelinuxagent/ga/logcollector.py +++ b/azurelinuxagent/ga/logcollector.py @@ -314,21 +314,21 @@ def _get_final_list_for_archive(self, priority_file_queue): if os.path.getsize(file_path) <= _FILE_SIZE_LIMIT: final_files_to_collect.append(file_path) + total_uncompressed_size += file_size _LOGGER.info("Adding file %s, size %s b", file_path, file_size) else: truncated_file_path = self._truncate_large_file(file_path) if truncated_file_path: _LOGGER.info("Adding truncated file %s, size %s b", truncated_file_path, file_size) final_files_to_collect.append(truncated_file_path) - - total_uncompressed_size += file_size + total_uncompressed_size += file_size except IOError as e: if e.errno == 2: # [Errno 2] No such file or directory _LOGGER.warning("File %s does not exist, skipping collection for this file", file_path) _LOGGER.info("Uncompressed archive size is %s b", total_uncompressed_size) - return final_files_to_collect + return final_files_to_collect, total_uncompressed_size def _create_list_of_files_to_collect(self): # The final list of files to be collected by zip is created in three steps: @@ -338,8 +338,8 @@ def _create_list_of_files_to_collect(self): # the size limit. parsed_file_paths = self._process_manifest_file() prioritized_file_paths = self._get_priority_files_list(parsed_file_paths) - files_to_collect = self._get_final_list_for_archive(prioritized_file_paths) - return files_to_collect + files_to_collect, total_uncompressed_size = self._get_final_list_for_archive(prioritized_file_paths) + return files_to_collect, total_uncompressed_size def collect_logs_and_get_archive(self): """ @@ -347,6 +347,7 @@ def collect_logs_and_get_archive(self): :return: Returns the path of the collected compressed archive """ files_to_collect = [] + total_uncompressed_size = 0 try: # Clear previous run's output and create base directories if they don't exist already. 
@@ -356,7 +357,7 @@ def collect_logs_and_get_archive(self): _LOGGER.info("Starting log collection at %s", start_time.strftime("%Y-%m-%dT%H:%M:%SZ")) _LOGGER.info("Using log collection mode %s", "full" if self._is_full_mode else "normal") - files_to_collect = self._create_list_of_files_to_collect() + files_to_collect, total_uncompressed_size = self._create_list_of_files_to_collect() _LOGGER.info("### Creating compressed archive ###") compressed_archive = None @@ -402,7 +403,7 @@ def handle_add_file_to_archive_error(error_count, max_errors, file_to_collect, e if compressed_archive is not None: compressed_archive.close() - return COMPRESSED_ARCHIVE_PATH + return COMPRESSED_ARCHIVE_PATH, total_uncompressed_size except Exception as e: msg = "Failed to collect logs: {0}".format(ustr(e)) _LOGGER.error(msg) diff --git a/azurelinuxagent/ga/memorycontroller.py b/azurelinuxagent/ga/memorycontroller.py new file mode 100644 index 000000000..30e7540cf --- /dev/null +++ b/azurelinuxagent/ga/memorycontroller.py @@ -0,0 +1,220 @@ +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Requires Python 2.6+ and Openssl 1.0+ + +import errno +import os +import re + +from azurelinuxagent.common import logger +from azurelinuxagent.common.exception import CGroupsException +from azurelinuxagent.common.future import ustr +from azurelinuxagent.ga.cgroupcontroller import _CgroupController, CounterNotFound, MetricValue, MetricsCategory, \ + MetricsCounter, _REPORT_EVERY_HOUR + + +class _MemoryController(_CgroupController): + def __init__(self, name, cgroup_path): + super(_MemoryController, self).__init__(name, cgroup_path) + self._counter_not_found_error_count = 0 + + def _get_memory_stat_counter(self, counter_name): + """ + Gets the value for the provided counter in memory.stat + """ + try: + with open(os.path.join(self.path, 'memory.stat')) as memory_stat: + # + # Sample file v1: + # # cat memory.stat + # cache 0 + # rss 0 + # rss_huge 0 + # shmem 0 + # mapped_file 0 + # dirty 0 + # writeback 0 + # swap 0 + # ... + # + # Sample file v2 + # # cat memory.stat + # anon 0 + # file 147140608 + # kernel 1421312 + # kernel_stack 0 + # pagetables 0 + # sec_pagetables 0 + # percpu 130752 + # sock 0 + # ... 
+ # + for line in memory_stat: + re_memory_counter = r'{0}\s+(\d+)'.format(counter_name) + match = re.match(re_memory_counter, line) + if match is not None: + return int(match.groups()[0]) + except (IOError, OSError) as e: + if e.errno == errno.ENOENT: + raise + raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e))) + except Exception as e: + raise CGroupsException("Failed to read memory.stat: {0}".format(ustr(e))) + + raise CounterNotFound("Cannot find counter: {0}".format(counter_name)) + + def get_memory_usage(self): + """ + Collects anon and cache usage for the cgroup and returns as a tuple + Returns anon and cache memory usage for the cgroup as a tuple -> (anon, cache) + + :return: Anon and cache memory usage in bytes + :rtype: tuple[int, int] + """ + raise NotImplementedError() + + def try_swap_memory_usage(self): + """ + Collects swap usage for the cgroup + + :return: Memory usage in bytes + :rtype: int + """ + raise NotImplementedError() + + def get_max_memory_usage(self): + """ + Collect max memory usage for the cgroup. + + :return: Memory usage in bytes + :rtype: int + """ + raise NotImplementedError() + + def get_tracked_metrics(self, **_): + # The log collector monitor tracks anon and cache memory separately. + anon_mem_usage, cache_mem_usage = self.get_memory_usage() + total_mem_usage = anon_mem_usage + cache_mem_usage + return [ + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, self.name, total_mem_usage), + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.ANON_MEM_USAGE, self.name, anon_mem_usage), + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.CACHE_MEM_USAGE, self.name, cache_mem_usage), + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, self.name, + self.get_max_memory_usage(), _REPORT_EVERY_HOUR), + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.SWAP_MEM_USAGE, self.name, + self.try_swap_memory_usage(), _REPORT_EVERY_HOUR) + ] + + def get_unit_properties(self): + return["MemoryAccounting"] + + +class MemoryControllerV1(_MemoryController): + def get_memory_usage(self): + # In v1, anon memory is reported in the 'rss' counter + return self._get_memory_stat_counter("rss"), self._get_memory_stat_counter("cache") + + def try_swap_memory_usage(self): + # In v1, swap memory should be collected from memory.stat, because memory.memsw.usage_in_bytes reports total Memory+SWAP. 
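# Editor's illustration (standalone, not part of the patch): both memory controllers read
# individual counters out of memory.stat through _get_memory_stat_counter using the regex
# shown above. The sample contents below reuse counter values that appear elsewhere in this
# patch; v1 reports anon memory as 'rss' and cache as 'cache', v2 reports 'anon' and 'file'.
import re

def read_memory_stat_counter(stat_text, counter_name):
    for line in stat_text.splitlines():
        match = re.match(r'{0}\s+(\d+)'.format(counter_name), line)
        if match is not None:
            return int(match.groups()[0])
    return None  # the real code raises CounterNotFound here

sample_v1 = "cache 67178496\nrss 42340352\nswap 0\n"
sample_v2 = "anon 17589300\nfile 134553600\n"
print(read_memory_stat_counter(sample_v1, "rss"))   # 42340352 (v1 anon memory)
print(read_memory_stat_counter(sample_v2, "anon"))  # 17589300 (v2 anon memory)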
+ try: + return self._get_memory_stat_counter("swap") + except CounterNotFound as e: + if self._counter_not_found_error_count < 1: + logger.periodic_info(logger.EVERY_HALF_HOUR, + '{0} from "memory.stat" file in the cgroup: {1}---[Note: This log for informational purpose only and can be ignored]'.format(ustr(e), self.path)) + self._counter_not_found_error_count += 1 + return 0 + + def get_max_memory_usage(self): + # In v1, max memory usage is reported in memory.max_usage_in_bytes + usage = 0 + try: + usage = int(self._get_parameters('memory.max_usage_in_bytes', first_line_only=True)) + except Exception as e: + if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101 + raise + raise CGroupsException("Exception while attempting to read {0}".format("memory.max_usage_in_bytes"), e) + + return usage + + +class MemoryControllerV2(_MemoryController): + def get_memory_usage(self): + # In v2, cache memory is reported in the 'file' counter + return self._get_memory_stat_counter("anon"), self._get_memory_stat_counter("file") + + def get_memory_throttled_events(self): + """ + Returns the number of times processes of the cgroup are throttled and routed to perform memory recliam because + the high memory boundary was exceeded. + + :return: Number of memory throttling events for the cgroup + :rtype: int + """ + try: + with open(os.path.join(self.path, 'memory.events')) as memory_events: + # + # Sample file: + # # cat memory.events + # low 0 + # high 0 + # max 0 + # oom 0 + # oom_kill 0 + # oom_group_kill 0 + # + for line in memory_events: + match = re.match(r'high\s+(\d+)', line) + if match is not None: + return int(match.groups()[0]) + except (IOError, OSError) as e: + if e.errno == errno.ENOENT: + raise + raise CGroupsException("Failed to read memory.events: {0}".format(ustr(e))) + except Exception as e: + raise CGroupsException("Failed to read memory.events: {0}".format(ustr(e))) + + raise CounterNotFound("Cannot find memory.events counter: high") + + def try_swap_memory_usage(self): + # In v2, swap memory is reported in memory.swap.current + usage = 0 + try: + usage = int(self._get_parameters('memory.swap.current', first_line_only=True)) + except Exception as e: + if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101 + raise + raise CGroupsException("Exception while attempting to read {0}".format("memory.swap.current"), e) + + return usage + + def get_max_memory_usage(self): + # In v2, max memory usage is reported in memory.peak + usage = 0 + try: + usage = int(self._get_parameters('memory.peak', first_line_only=True)) + except Exception as e: + if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: # pylint: disable=E1101 + raise + raise CGroupsException("Exception while attempting to read {0}".format("memory.peak"), e) + + return usage + + def get_tracked_metrics(self, **_): + metrics = super(MemoryControllerV2, self).get_tracked_metrics() + throttled_value = MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MEM_THROTTLED, self.name, + self.get_memory_throttled_events()) + metrics.append(throttled_value) + return metrics diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index bdf2603fa..c1340ed69 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -22,7 +22,7 @@ import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.networkutil as networkutil -from azurelinuxagent.ga.controllermetrics 
import MetricValue, MetricsCategory, MetricsCounter +from azurelinuxagent.ga.cgroupcontroller import MetricValue, MetricsCategory, MetricsCounter from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.errorstate import ErrorState diff --git a/tests/common/utils/test_extension_process_util.py b/tests/common/utils/test_extension_process_util.py index 7771de4fd..8058292b9 100644 --- a/tests/common/utils/test_extension_process_util.py +++ b/tests/common/utils/test_extension_process_util.py @@ -19,9 +19,9 @@ import subprocess import tempfile -from azurelinuxagent.ga.controllermetrics import CpuMetrics from azurelinuxagent.common.exception import ExtensionError, ExtensionErrorCodes from azurelinuxagent.common.future import ustr +from azurelinuxagent.ga.cpucontroller import CpuControllerV1 from azurelinuxagent.ga.extensionprocessutil import format_stdout_stderr, read_output, \ wait_for_process_completion_or_timeout, handle_process_completion from tests.lib.tools import AgentTestCase, patch, data_dir @@ -52,7 +52,7 @@ def test_wait_for_process_completion_or_timeout_should_terminate_cleanly(self): stdout=subprocess.PIPE, stderr=subprocess.PIPE) - timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=5, cpu_metrics=None) + timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=5, cpu_controller=None) self.assertEqual(timed_out, False) self.assertEqual(ret, 0) @@ -71,7 +71,7 @@ def test_wait_for_process_completion_or_timeout_should_kill_process_on_timeout(s with patch('azurelinuxagent.ga.extensionprocessutil.os.killpg', wraps=os.killpg) as patch_kill: with patch('time.sleep') as mock_sleep: timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=timeout, - cpu_metrics=None) + cpu_controller=None) # We're mocking sleep to avoid prolonging the test execution time, but we still want to make sure # we're "waiting" the correct amount of time before killing the process @@ -90,7 +90,7 @@ def test_handle_process_completion_should_return_nonzero_when_process_fails(self stdout=subprocess.PIPE, stderr=subprocess.PIPE) - timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=5, cpu_metrics=None) + timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=5, cpu_controller=None) self.assertEqual(timed_out, False) self.assertEqual(ret, 2) @@ -149,9 +149,9 @@ def test_handle_process_completion_should_log_throttled_time_on_timeout(self): with patch('time.sleep') as mock_sleep: with self.assertRaises(ExtensionError) as context_manager: test_file = os.path.join(self.tmp_dir, "cpu.stat") - shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t0"), + shutil.copyfile(os.path.join(data_dir, "cgroups", "v1", "cpu.stat_t0"), test_file) # throttled_time = 50 - cgroup = CpuMetrics("test", self.tmp_dir) + cpu_controller = CpuControllerV1("test", self.tmp_dir) process = subprocess.Popen(command, # pylint: disable=subprocess-popen-preexec-fn shell=True, cwd=self.tmp_dir, @@ -161,7 +161,7 @@ def test_handle_process_completion_should_log_throttled_time_on_timeout(self): preexec_fn=os.setsid) handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, - stderr=stderr, error_code=42, cpu_metrics=cgroup) + stderr=stderr, error_code=42, cpu_controller=cpu_controller) # We're mocking sleep to avoid prolonging the test execution time, but we still want 
to make sure # we're "waiting" the correct amount of time before killing the process and raising an exception diff --git a/tests/data/cgroups/cpu_mount/cpuacct.stat b/tests/data/cgroups/cpu_mount/cpuacct.stat deleted file mode 100644 index dbdaec701..000000000 --- a/tests/data/cgroups/cpu_mount/cpuacct.stat +++ /dev/null @@ -1,2 +0,0 @@ -user 50000 -system 100000 diff --git a/tests/data/cgroups/cpu.stat b/tests/data/cgroups/v1/cpu.stat similarity index 100% rename from tests/data/cgroups/cpu.stat rename to tests/data/cgroups/v1/cpu.stat diff --git a/tests/data/cgroups/cpu.stat_t0 b/tests/data/cgroups/v1/cpu.stat_t0 similarity index 100% rename from tests/data/cgroups/cpu.stat_t0 rename to tests/data/cgroups/v1/cpu.stat_t0 diff --git a/tests/data/cgroups/cpu.stat_t1 b/tests/data/cgroups/v1/cpu.stat_t1 similarity index 100% rename from tests/data/cgroups/cpu.stat_t1 rename to tests/data/cgroups/v1/cpu.stat_t1 diff --git a/tests/data/cgroups/cpuacct.stat b/tests/data/cgroups/v1/cpuacct.stat similarity index 100% rename from tests/data/cgroups/cpuacct.stat rename to tests/data/cgroups/v1/cpuacct.stat diff --git a/tests/data/cgroups/cpuacct.stat_t0 b/tests/data/cgroups/v1/cpuacct.stat_t0 similarity index 100% rename from tests/data/cgroups/cpuacct.stat_t0 rename to tests/data/cgroups/v1/cpuacct.stat_t0 diff --git a/tests/data/cgroups/cpuacct.stat_t1 b/tests/data/cgroups/v1/cpuacct.stat_t1 similarity index 100% rename from tests/data/cgroups/cpuacct.stat_t1 rename to tests/data/cgroups/v1/cpuacct.stat_t1 diff --git a/tests/data/cgroups/cpuacct.stat_t2 b/tests/data/cgroups/v1/cpuacct.stat_t2 similarity index 100% rename from tests/data/cgroups/cpuacct.stat_t2 rename to tests/data/cgroups/v1/cpuacct.stat_t2 diff --git a/tests/data/cgroups/memory_mount/memory.max_usage_in_bytes b/tests/data/cgroups/v1/memory.max_usage_in_bytes similarity index 100% rename from tests/data/cgroups/memory_mount/memory.max_usage_in_bytes rename to tests/data/cgroups/v1/memory.max_usage_in_bytes diff --git a/tests/data/cgroups/memory_mount/memory.stat b/tests/data/cgroups/v1/memory.stat similarity index 100% rename from tests/data/cgroups/memory_mount/memory.stat rename to tests/data/cgroups/v1/memory.stat diff --git a/tests/data/cgroups/missing_memory_counters/memory.stat b/tests/data/cgroups/v1/memory.stat_missing similarity index 100% rename from tests/data/cgroups/missing_memory_counters/memory.stat rename to tests/data/cgroups/v1/memory.stat_missing diff --git a/tests/data/cgroups/proc_stat_t0 b/tests/data/cgroups/v1/proc_stat_t0 similarity index 100% rename from tests/data/cgroups/proc_stat_t0 rename to tests/data/cgroups/v1/proc_stat_t0 diff --git a/tests/data/cgroups/proc_stat_t1 b/tests/data/cgroups/v1/proc_stat_t1 similarity index 100% rename from tests/data/cgroups/proc_stat_t1 rename to tests/data/cgroups/v1/proc_stat_t1 diff --git a/tests/data/cgroups/proc_stat_t2 b/tests/data/cgroups/v1/proc_stat_t2 similarity index 100% rename from tests/data/cgroups/proc_stat_t2 rename to tests/data/cgroups/v1/proc_stat_t2 diff --git a/tests/data/cgroups/v2/cpu.stat b/tests/data/cgroups/v2/cpu.stat new file mode 100644 index 000000000..6fcb7b86f --- /dev/null +++ b/tests/data/cgroups/v2/cpu.stat @@ -0,0 +1,9 @@ +usage_usec 817045397 +user_usec 742283732 +system_usec 74761665 +core_sched.force_idle_usec 0 +nr_periods 165261 +nr_throttled 162912 +throttled_usec 15735198706 +nr_bursts 0 +burst_usec 0 diff --git a/tests/data/cgroups/v2/cpu.stat_t0 b/tests/data/cgroups/v2/cpu.stat_t0 new file mode 100644 index 
000000000..6fcb7b86f --- /dev/null +++ b/tests/data/cgroups/v2/cpu.stat_t0 @@ -0,0 +1,9 @@ +usage_usec 817045397 +user_usec 742283732 +system_usec 74761665 +core_sched.force_idle_usec 0 +nr_periods 165261 +nr_throttled 162912 +throttled_usec 15735198706 +nr_bursts 0 +burst_usec 0 diff --git a/tests/data/cgroups/v2/cpu.stat_t1 b/tests/data/cgroups/v2/cpu.stat_t1 new file mode 100644 index 000000000..a2eaecf6e --- /dev/null +++ b/tests/data/cgroups/v2/cpu.stat_t1 @@ -0,0 +1,9 @@ +usage_usec 819624087 +user_usec 744545316 +system_usec 75078770 +core_sched.force_idle_usec 0 +nr_periods 165783 +nr_throttled 163430 +throttled_usec 15796563650 +nr_bursts 0 +burst_usec 0 diff --git a/tests/data/cgroups/v2/cpu.stat_t2 b/tests/data/cgroups/v2/cpu.stat_t2 new file mode 100644 index 000000000..cca6a6e42 --- /dev/null +++ b/tests/data/cgroups/v2/cpu.stat_t2 @@ -0,0 +1,9 @@ +usage_usec 822052295 +user_usec 746640066 +system_usec 75412229 +core_sched.force_idle_usec 0 +nr_periods 166274 +nr_throttled 163917 +throttled_usec 15853013984 +nr_bursts 0 +burst_usec 0 diff --git a/tests/data/cgroups/v2/memory.events b/tests/data/cgroups/v2/memory.events new file mode 100644 index 000000000..ee154297a --- /dev/null +++ b/tests/data/cgroups/v2/memory.events @@ -0,0 +1,6 @@ +low 0 +high 9 +max 0 +oom 0 +oom_kill 0 +oom_group_kill 0 diff --git a/tests/data/cgroups/v2/memory.events_missing b/tests/data/cgroups/v2/memory.events_missing new file mode 100644 index 000000000..5a5d05a34 --- /dev/null +++ b/tests/data/cgroups/v2/memory.events_missing @@ -0,0 +1,5 @@ +low 0 +max 0 +oom 0 +oom_kill 0 +oom_group_kill 0 diff --git a/tests/data/cgroups/v2/memory.peak b/tests/data/cgroups/v2/memory.peak new file mode 100644 index 000000000..25140d458 --- /dev/null +++ b/tests/data/cgroups/v2/memory.peak @@ -0,0 +1 @@ +194494464 diff --git a/tests/data/cgroups/v2/memory.stat b/tests/data/cgroups/v2/memory.stat new file mode 100644 index 000000000..0b0d4c52d --- /dev/null +++ b/tests/data/cgroups/v2/memory.stat @@ -0,0 +1,53 @@ +anon 17589300 +file 134553600 +kernel 25653248 +kernel_stack 0 +pagetables 0 +sec_pagetables 0 +percpu 726400 +sock 0 +vmalloc 0 +shmem 0 +zswap 0 +zswapped 0 +file_mapped 0 +file_dirty 12288 +file_writeback 0 +swapcached 0 +anon_thp 0 +file_thp 0 +shmem_thp 0 +inactive_anon 0 +active_anon 0 +inactive_file 127213568 +active_file 7340032 +unevictable 0 +slab_reclaimable 24061424 +slab_unreclaimable 0 +slab 24061424 +workingset_refault_anon 0 +workingset_refault_file 0 +workingset_activate_anon 0 +workingset_activate_file 0 +workingset_restore_anon 0 +workingset_restore_file 0 +workingset_nodereclaim 128 +pgscan 56624 +pgsteal 56622 +pgscan_kswapd 56624 +pgscan_direct 0 +pgscan_khugepaged 0 +pgsteal_kswapd 56622 +pgsteal_direct 0 +pgsteal_khugepaged 0 +pgfault 3673191 +pgmajfault 1 +pgrefill 124195 +pgactivate 2 +pgdeactivate 0 +pglazyfree 0 +pglazyfreed 0 +zswpin 0 +zswpout 0 +thp_fault_alloc 255 +thp_collapse_alloc 111 diff --git a/tests/data/cgroups/v2/memory.stat_missing b/tests/data/cgroups/v2/memory.stat_missing new file mode 100644 index 000000000..96d43db68 --- /dev/null +++ b/tests/data/cgroups/v2/memory.stat_missing @@ -0,0 +1,51 @@ +kernel 25653248 +kernel_stack 0 +pagetables 0 +sec_pagetables 0 +percpu 726400 +sock 0 +vmalloc 0 +shmem 0 +zswap 0 +zswapped 0 +file_mapped 0 +file_dirty 12288 +file_writeback 0 +swapcached 0 +anon_thp 0 +file_thp 0 +shmem_thp 0 +inactive_anon 0 +active_anon 0 +inactive_file 127213568 +active_file 7340032 +unevictable 0 +slab_reclaimable 24061424 +slab_unreclaimable 
0 +slab 24061424 +workingset_refault_anon 0 +workingset_refault_file 0 +workingset_activate_anon 0 +workingset_activate_file 0 +workingset_restore_anon 0 +workingset_restore_file 0 +workingset_nodereclaim 128 +pgscan 56624 +pgsteal 56622 +pgscan_kswapd 56624 +pgscan_direct 0 +pgscan_khugepaged 0 +pgsteal_kswapd 56622 +pgsteal_direct 0 +pgsteal_khugepaged 0 +pgfault 3673191 +pgmajfault 1 +pgrefill 124195 +pgactivate 2 +pgdeactivate 0 +pglazyfree 0 +pglazyfreed 0 +zswpin 0 +zswpout 0 +thp_fault_alloc 255 +thp_collapse_alloc 111 diff --git a/tests/data/cgroups/v2/memory.swap.current b/tests/data/cgroups/v2/memory.swap.current new file mode 100644 index 000000000..b92677edb --- /dev/null +++ b/tests/data/cgroups/v2/memory.swap.current @@ -0,0 +1 @@ +20000 diff --git a/tests/data/cgroups/v2/proc_uptime_t0 b/tests/data/cgroups/v2/proc_uptime_t0 new file mode 100644 index 000000000..d035316d9 --- /dev/null +++ b/tests/data/cgroups/v2/proc_uptime_t0 @@ -0,0 +1 @@ +776968.02 1495073.30 diff --git a/tests/data/cgroups/v2/proc_uptime_t1 b/tests/data/cgroups/v2/proc_uptime_t1 new file mode 100644 index 000000000..f0660cf12 --- /dev/null +++ b/tests/data/cgroups/v2/proc_uptime_t1 @@ -0,0 +1 @@ +777350.57 1495797.44 diff --git a/tests/data/cgroups/v2/proc_uptime_t2 b/tests/data/cgroups/v2/proc_uptime_t2 new file mode 100644 index 000000000..ae3e36aad --- /dev/null +++ b/tests/data/cgroups/v2/proc_uptime_t2 @@ -0,0 +1 @@ +779218.68 1499425.34 diff --git a/tests/ga/test_cgroupapi.py b/tests/ga/test_cgroupapi.py index 717adbb6f..ae091ed9d 100644 --- a/tests/ga/test_cgroupapi.py +++ b/tests/ga/test_cgroupapi.py @@ -28,7 +28,8 @@ from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.osutil import systemd from azurelinuxagent.common.utils import fileutil -from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics +from azurelinuxagent.ga.cpucontroller import CpuControllerV1, CpuControllerV2 +from azurelinuxagent.ga.memorycontroller import MemoryControllerV1, MemoryControllerV2 from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, mock_cgroup_v2_environment, \ mock_cgroup_hybrid_environment from tests.lib.mock_environment import MockCommand @@ -505,7 +506,7 @@ class CgroupsApiv1TestCase(AgentTestCase): def test_get_supported_controllers_returns_v1_controllers(self): with mock_cgroup_v1_environment(self.tmp_dir): cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") - controllers = cgroup.get_supported_controllers() + controllers = cgroup.get_supported_controller_names() self.assertEqual(len(controllers), 2) self.assertIn('cpu,cpuacct', controllers) self.assertIn('memory', controllers) @@ -536,55 +537,55 @@ def test_check_in_expected_slice_returns_False_if_any_paths_not_in_expected_slic cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") self.assertFalse(cgroup.check_in_expected_slice(expected_slice='system.slice')) - def test_get_controller_metrics_returns_all_supported_controllers_v1(self): + def test_get_controllers_returns_all_supported_controllers_v1(self): with mock_cgroup_v1_environment(self.tmp_dir): cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") - metrics = cgroup.get_controller_metrics() - self.assertEqual(len(metrics), 2) - self.assertIsInstance(metrics[0], CpuMetrics) - self.assertEqual(metrics[0].name, "walinuxagent") - self.assertEqual(metrics[0].path, 
"/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service") - self.assertIsInstance(metrics[1], MemoryMetrics) - self.assertEqual(metrics[1].name, "walinuxagent") - self.assertEqual(metrics[1].path, "/sys/fs/cgroup/memory/system.slice/walinuxagent.service") - - def test_get_controller_metrics_returns_only_mounted_controllers_v1(self): + controllers = cgroup.get_controllers() + self.assertEqual(len(controllers), 2) + self.assertIsInstance(controllers[0], CpuControllerV1) + self.assertEqual(controllers[0].name, "walinuxagent") + self.assertEqual(controllers[0].path, "/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service") + self.assertIsInstance(controllers[1], MemoryControllerV1) + self.assertEqual(controllers[1].name, "walinuxagent") + self.assertEqual(controllers[1].path, "/sys/fs/cgroup/memory/system.slice/walinuxagent.service") + + def test_get_controllers_returns_only_mounted_controllers_v1(self): with mock_cgroup_v1_environment(self.tmp_dir): with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct'}): cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") - metrics = cgroup.get_controller_metrics() - self.assertEqual(len(metrics), 1) - self.assertIsInstance(metrics[0], CpuMetrics) - self.assertEqual(metrics[0].name, "walinuxagent") - self.assertEqual(metrics[0].path, "/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service") + controllers = cgroup.get_controllers() + self.assertEqual(len(controllers), 1) + self.assertIsInstance(controllers[0], CpuControllerV1) + self.assertEqual(controllers[0].name, "walinuxagent") + self.assertEqual(controllers[0].path, "/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service") with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'memory': '/sys/fs/cgroup/memory'}): cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") - metrics = cgroup.get_controller_metrics() - self.assertEqual(len(metrics), 1) - self.assertIsInstance(metrics[0], MemoryMetrics) - self.assertEqual(metrics[0].name, "walinuxagent") - self.assertEqual(metrics[0].path, "/sys/fs/cgroup/memory/system.slice/walinuxagent.service") + controllers = cgroup.get_controllers() + self.assertEqual(len(controllers), 1) + self.assertIsInstance(controllers[0], MemoryControllerV1) + self.assertEqual(controllers[0].name, "walinuxagent") + self.assertEqual(controllers[0].path, "/sys/fs/cgroup/memory/system.slice/walinuxagent.service") with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={}): cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") - metrics = cgroup.get_controller_metrics() - self.assertEqual(len(metrics), 0) + controllers = cgroup.get_controllers() + self.assertEqual(len(controllers), 0) - def test_get_controller_metrics_returns_only_controllers_at_expected_path_v1(self): + def test_get_controllers_returns_only_controllers_at_expected_path_v1(self): with mock_cgroup_v1_environment(self.tmp_dir): with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_process_relative_controller_paths', return_value={'cpu,cpuacct': 'system.slice/walinuxagent.service', 'memory': 'unexpected/path'}): cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") - metrics = 
cgroup.get_controller_metrics(expected_relative_path="system.slice/walinuxagent.service") - self.assertEqual(len(metrics), 1) - self.assertIsInstance(metrics[0], CpuMetrics) - self.assertEqual(metrics[0].name, "walinuxagent") - self.assertEqual(metrics[0].path, "/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service") + controllers = cgroup.get_controllers(expected_relative_path="system.slice/walinuxagent.service") + self.assertEqual(len(controllers), 1) + self.assertIsInstance(controllers[0], CpuControllerV1) + self.assertEqual(controllers[0].name, "walinuxagent") + self.assertEqual(controllers[0].path, "/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service") with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_process_relative_controller_paths', return_value={'cpu,cpuacct': 'unexpected/path', 'memory': 'unexpected/path'}): cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") - metrics = cgroup.get_controller_metrics(expected_relative_path="system.slice/walinuxagent.service") - self.assertEqual(len(metrics), 0) + controllers = cgroup.get_controllers(expected_relative_path="system.slice/walinuxagent.service") + self.assertEqual(len(controllers), 0) def test_get_procs_path_returns_correct_path_v1(self): with mock_cgroup_v1_environment(self.tmp_dir): @@ -625,7 +626,7 @@ class CgroupsApiv2TestCase(AgentTestCase): def test_get_supported_controllers_returns_v2_controllers(self): with mock_cgroup_v2_environment(self.tmp_dir): cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") - controllers = cgroup.get_supported_controllers() + controllers = cgroup.get_supported_controller_names() self.assertEqual(len(controllers), 2) self.assertIn('cpu', controllers) self.assertIn('memory', controllers) @@ -644,6 +645,57 @@ def test_check_in_expected_slice_returns_False_if_cgroup_path_not_in_expected_sl cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") self.assertFalse(cgroup.check_in_expected_slice(expected_slice='system.slice')) + def test_get_controllers_returns_all_supported_controllers_v2(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + controllers = cgroup.get_controllers() + self.assertEqual(len(controllers), 2) + self.assertIsInstance(controllers[0], CpuControllerV2) + self.assertEqual(controllers[0].name, "walinuxagent") + self.assertEqual(controllers[0].path, "/sys/fs/cgroup/system.slice/walinuxagent.service") + self.assertIsInstance(controllers[1], MemoryControllerV2) + self.assertEqual(controllers[1].name, "walinuxagent") + self.assertEqual(controllers[1].path, "/sys/fs/cgroup/system.slice/walinuxagent.service") + + def test_get_controllers_returns_only_enabled_controllers_v2(self): + with mock_cgroup_v2_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_controllers_enabled_at_root', return_value=["cpu"]): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + controllers = cgroup.get_controllers() + self.assertEqual(len(controllers), 1) + self.assertIsInstance(controllers[0], CpuControllerV2) + self.assertEqual(controllers[0].name, "walinuxagent") + self.assertEqual(controllers[0].path, "/sys/fs/cgroup/system.slice/walinuxagent.service") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_controllers_enabled_at_root', return_value=["memory"]): + cgroup = 
get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + controllers = cgroup.get_controllers() + self.assertEqual(len(controllers), 1) + self.assertIsInstance(controllers[0], MemoryControllerV2) + self.assertEqual(controllers[0].name, "walinuxagent") + self.assertEqual(controllers[0].path, "/sys/fs/cgroup/system.slice/walinuxagent.service") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_controllers_enabled_at_root', return_value=[]): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + controllers = cgroup.get_controllers() + self.assertEqual(len(controllers), 0) + + def test_get_controllers_returns_empty_if_cgroup_path_is_empty_v2(self): + with mock_cgroup_v2_environment(self.tmp_dir): + mock_cgroup_empty_path = CgroupV2(cgroup_name="test", root_cgroup_path="/sys/fs/cgroup", cgroup_path="", enabled_controllers=["cpu", "memory"]) + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_process_cgroup", return_value=mock_cgroup_empty_path): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + controllers = cgroup.get_controllers() + self.assertEqual(len(controllers), 0) + + def test_get_controllers_returns_only_controllers_at_expected_path_v2(self): + with mock_cgroup_v2_environment(self.tmp_dir): + mock_cgroup_unexpected_path = CgroupV2(cgroup_name="test", root_cgroup_path="/sys/fs/cgroup", cgroup_path="/sys/fs/cgroup/unexpected/path", enabled_controllers=["cpu", "memory"]) + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_process_cgroup", return_value=mock_cgroup_unexpected_path): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + controllers = cgroup.get_controllers(expected_relative_path="system.slice/walinuxagent.service") + self.assertEqual(len(controllers), 0) + def test_get_procs_path_returns_empty_if_root_cgroup_empty_v2(self): with mock_cgroup_v2_environment(self.tmp_dir): with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=""): diff --git a/tests/ga/test_cgroupconfigurator.py b/tests/ga/test_cgroupconfigurator.py index 1d1465a47..9af0d88d7 100644 --- a/tests/ga/test_cgroupconfigurator.py +++ b/tests/ga/test_cgroupconfigurator.py @@ -27,13 +27,14 @@ import threading from azurelinuxagent.common import conf -from azurelinuxagent.ga.controllermetrics import AGENT_NAME_TELEMETRY, MetricsCounter, MetricValue, MetricsCategory, CpuMetrics +from azurelinuxagent.ga.cgroupcontroller import AGENT_NAME_TELEMETRY, MetricsCounter, MetricValue, MetricsCategory from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator, DisableCgroups from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.event import WALAEventOperation from azurelinuxagent.common.exception import CGroupsException, AgentMemoryExceededException from azurelinuxagent.common.future import ustr from azurelinuxagent.common.utils import shellutil, fileutil +from azurelinuxagent.ga.cpucontroller import CpuControllerV1 from tests.lib.mock_environment import MockCommand from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, UnitFilePaths, mock_cgroup_v2_environment from tests.lib.tools import AgentTestCase, patch, mock_sleep, data_dir, is_python_version_26_or_34, skip_if_predicate_true @@ -272,7 +273,7 @@ def test_remove_extension_slice_should_remove_unit_files(self): 
CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/' \ 'azure-vmextensions-Microsoft.CPlat.Extension.slice'] = \ - CpuMetrics('Microsoft.CPlat.Extension', + CpuControllerV1('Microsoft.CPlat.Extension', '/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/azure-vmextensions-Microsoft.CPlat.Extension.slice') configurator.remove_extension_slice(extension_name="Microsoft.CPlat.Extension") @@ -369,10 +370,10 @@ def test_disable_should_reset_cpu_quota_for_all_cgroups(self): configurator.setup_extension_slice(extension_name=extension_name, cpu_quota=5) configurator.set_extension_services_cpu_memory_quota(service_list) CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service'] = \ - CpuMetrics('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service') + CpuControllerV1('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service') CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/' \ 'azure-vmextensions-Microsoft.CPlat.Extension.slice'] = \ - CpuMetrics('Microsoft.CPlat.Extension', + CpuControllerV1('Microsoft.CPlat.Extension', '/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/azure-vmextensions-Microsoft.CPlat.Extension.slice') configurator.disable("UNIT TEST", DisableCgroups.ALL) @@ -717,7 +718,8 @@ def test_it_should_stop_tracking_extension_services_cgroups(self): with self._get_cgroup_configurator() as configurator: with patch("os.path.exists") as mock_path: mock_path.return_value = True - CGroupsTelemetry.track_cgroup(CpuMetrics('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service')) + CGroupsTelemetry.track_cgroup_controller( + CpuControllerV1('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service')) configurator.stop_tracking_extension_services_cgroups(service_list) tracked = CGroupsTelemetry._tracked @@ -776,7 +778,7 @@ def side_effect(path): with patch("os.path.exists") as mock_path: mock_path.side_effect = side_effect CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service'] = \ - CpuMetrics('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service') + CpuControllerV1('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service') configurator.stop_tracking_unit_cgroups("extension.service") tracked = CGroupsTelemetry._tracked @@ -1012,8 +1014,15 @@ def test_check_agent_memory_usage_should_raise_a_cgroups_exception_when_the_limi with self.assertRaises(AgentMemoryExceededException) as context_manager: with self._get_cgroup_configurator() as configurator: - with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_tracked_metrics") as tracked_metrics: + with patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_tracked_metrics") as tracked_metrics: tracked_metrics.return_value = metrics configurator.check_agent_memory_usage() self.assertIn("The agent memory limit {0} bytes exceeded".format(conf.get_agent_memory_quota()), ustr(context_manager.exception), "An incorrect exception was raised") + + def test_get_log_collector_properties_should_return_correct_props(self): + with self._get_cgroup_configurator() as configurator: + self.assertEqual(configurator.get_logcollector_unit_properties(), ["--property=CPUAccounting=yes", "--property=MemoryAccounting=yes", "--property=CPUQuota=5%"]) + + with self._get_cgroup_configurator_v2() as configurator: + 
self.assertEqual(configurator.get_logcollector_unit_properties(), ["--property=CPUAccounting=yes", "--property=MemoryAccounting=yes", "--property=CPUQuota=5%", "--property=MemoryHigh=170M"]) diff --git a/tests/ga/test_cgroupcontroller.py b/tests/ga/test_cgroupcontroller.py new file mode 100644 index 000000000..a01237e96 --- /dev/null +++ b/tests/ga/test_cgroupcontroller.py @@ -0,0 +1,55 @@ +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Requires Python 2.4+ and Openssl 1.0+ +# + +from __future__ import print_function + +import os +import random + +from azurelinuxagent.ga.cgroupcontroller import _CgroupController +from tests.lib.tools import AgentTestCase, patch + + +def consume_cpu_time(): + waste = 0 + for x in range(1, 200000): # pylint: disable=unused-variable + waste += random.random() + return waste + + +class TestCgroupController(AgentTestCase): + def test_is_active(self): + test_metrics = _CgroupController("test_extension", self.tmp_dir) + + with open(os.path.join(self.tmp_dir, "cgroup.procs"), mode="wb") as tasks: + tasks.write(str(1000).encode()) + + self.assertEqual(True, test_metrics.is_active()) + + @patch("azurelinuxagent.common.logger.periodic_warn") + def test_is_active_file_not_present(self, patch_periodic_warn): + test_metrics = _CgroupController("test_extension", self.tmp_dir) + self.assertFalse(test_metrics.is_active()) + + self.assertEqual(0, patch_periodic_warn.call_count) + + @patch("azurelinuxagent.common.logger.periodic_warn") + def test_is_active_incorrect_file(self, patch_periodic_warn): + open(os.path.join(self.tmp_dir, "cgroup.procs"), mode="wb").close() + test_metrics = _CgroupController("test_extension", os.path.join(self.tmp_dir, "cgroup.procs")) + self.assertEqual(False, test_metrics.is_active()) + self.assertEqual(1, patch_periodic_warn.call_count) diff --git a/tests/ga/test_cgroupstelemetry.py b/tests/ga/test_cgroupstelemetry.py index 457b20e47..ab4e33048 100644 --- a/tests/ga/test_cgroupstelemetry.py +++ b/tests/ga/test_cgroupstelemetry.py @@ -19,9 +19,11 @@ import random import time -from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics +from azurelinuxagent.ga.cgroupcontroller import MetricsCounter from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.utils import fileutil +from azurelinuxagent.ga.cpucontroller import CpuControllerV1 +from azurelinuxagent.ga.memorycontroller import MemoryControllerV1 from tests.lib.tools import AgentTestCase, data_dir, patch @@ -80,9 +82,9 @@ def setUpClass(cls): def mock_read_file(filepath, **args): if filepath == "/proc/stat": - filepath = os.path.join(data_dir, "cgroups", "proc_stat_t0") + filepath = os.path.join(data_dir, "cgroups", "v1", "proc_stat_t0") elif filepath.endswith("/cpuacct.stat"): - filepath = os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") + filepath = os.path.join(data_dir, "cgroups", "v1", "cpuacct.stat_t0") return original_read_file(filepath, **args) cls._mock_read_cpu_cgroup_file = 
patch("azurelinuxagent.common.utils.fileutil.read_file", side_effect=mock_read_file) @@ -103,76 +105,81 @@ def tearDown(self): CGroupsTelemetry.reset() @staticmethod - def _track_new_extension_cgroups(num_extensions): + def _track_new_extension_cgroup_controllers(num_extensions): for i in range(num_extensions): - dummy_cpu_cgroup = CpuMetrics("dummy_extension_{0}".format(i), "dummy_cpu_path_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) + dummy_cpu_controller = CpuControllerV1("dummy_extension_{0}".format(i), "dummy_cpu_path_{0}".format(i)) + CGroupsTelemetry.track_cgroup_controller(dummy_cpu_controller) - dummy_memory_cgroup = MemoryMetrics("dummy_extension_{0}".format(i), "dummy_memory_path_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) + dummy_memory_controller = MemoryControllerV1("dummy_extension_{0}".format(i), "dummy_memory_path_{0}".format(i)) + CGroupsTelemetry.track_cgroup_controller(dummy_memory_controller) - def _assert_cgroups_are_tracked(self, num_extensions): + def _assert_cgroup_controllers_are_tracked(self, num_extensions): for i in range(num_extensions): self.assertTrue(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) self.assertTrue(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) - def _assert_polled_metrics_equal(self, metrics, cpu_metric_value, memory_metric_value, max_memory_metric_value, swap_memory_value): + def _assert_polled_metrics_equal(self, metrics, cpu_metric_value, current_total_memory_metric_value, current_anon_memory_metric_value, current_cache_memory_metric_value, max_memory_metric_value, swap_memory_value): for metric in metrics: self.assertIn(metric.category, ["CPU", "Memory"]) if metric.category == "CPU": self.assertEqual(metric.counter, "% Processor Time") self.assertEqual(metric.value, cpu_metric_value) if metric.category == "Memory": - self.assertIn(metric.counter, ["Total Memory Usage", "Max Memory Usage", "Swap Memory Usage"]) - if metric.counter == "Total Memory Usage": - self.assertEqual(metric.value, memory_metric_value) - elif metric.counter == "Max Memory Usage": + self.assertIn(metric.counter, [MetricsCounter.TOTAL_MEM_USAGE, MetricsCounter.ANON_MEM_USAGE, MetricsCounter.CACHE_MEM_USAGE, MetricsCounter.MAX_MEM_USAGE, MetricsCounter.SWAP_MEM_USAGE]) + if metric.counter == MetricsCounter.TOTAL_MEM_USAGE: + self.assertEqual(metric.value, current_total_memory_metric_value) + elif metric.counter == MetricsCounter.ANON_MEM_USAGE: + self.assertEqual(metric.value, current_anon_memory_metric_value) + elif metric.counter == MetricsCounter.CACHE_MEM_USAGE: + self.assertEqual(metric.value, current_cache_memory_metric_value) + elif metric.counter == MetricsCounter.MAX_MEM_USAGE: self.assertEqual(metric.value, max_memory_metric_value) - elif metric.counter == "Swap Memory Usage": + elif metric.counter == MetricsCounter.SWAP_MEM_USAGE: self.assertEqual(metric.value, swap_memory_value) def test_telemetry_polling_with_active_cgroups(self, *args): # pylint: disable=unused-argument num_extensions = 3 - self._track_new_extension_cgroups(num_extensions) - - with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage") as patch_get_memory_max_usage: - with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.try_swap_memory_usage") as 
patch_try_swap_memory_usage: - with patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") as patch_get_cpu_usage: - with patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") as patch_is_active: - patch_is_active.return_value = True - - current_cpu = 30 - current_memory = 209715200 - current_max_memory = 471859200 - current_swap_memory = 20971520 - - # 1 CPU metric + 1 Current Memory + 1 Max memory + 1 swap memory - num_of_metrics_per_extn_expected = 4 - patch_get_cpu_usage.return_value = current_cpu - patch_get_memory_usage.return_value = current_memory # example 200 MB - patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB - patch_try_swap_memory_usage.return_value = current_swap_memory # example 20MB - num_polls = 12 - - for data_count in range(1, num_polls + 1): # pylint: disable=unused-variable - metrics = CGroupsTelemetry.poll_all_tracked() - - self.assertEqual(len(metrics), num_extensions * num_of_metrics_per_extn_expected) - self._assert_polled_metrics_equal(metrics, current_cpu, current_memory, current_max_memory, current_swap_memory) - - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage", side_effect=raise_ioerror) - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage", side_effect=raise_ioerror) - @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage", side_effect=raise_ioerror) - @patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active", return_value=False) + self._track_new_extension_cgroup_controllers(num_extensions) + + with patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_max_memory_usage") as patch_get_memory_max_usage: + with patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_memory_usage") as patch_get_memory_usage: + with patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.try_swap_memory_usage") as patch_try_swap_memory_usage: + with patch("azurelinuxagent.ga.cpucontroller.CpuControllerV1.get_cpu_usage") as patch_get_cpu_usage: + with patch("azurelinuxagent.ga.cgroupcontroller._CgroupController.is_active") as patch_is_active: + patch_is_active.return_value = True + + current_cpu = 30 + current_anon_memory = 209715200 + current_cache_memory = 314572800 + current_total_memory = 209715200 + 314572800 + current_max_memory = 471859200 + current_swap_memory = 20971520 + + # 1 CPU metric + 1 total Memory + 1 anon memory + 1 cache memory + 1 Max memory + 1 swap memory + num_of_metrics_per_extn_expected = 6 + patch_get_cpu_usage.return_value = current_cpu + patch_get_memory_usage.return_value = current_anon_memory, current_cache_memory # example 200 MB, 300 MB + patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB + patch_try_swap_memory_usage.return_value = current_swap_memory # example 20MB + num_polls = 18 + + for data_count in range(1, num_polls + 1): # pylint: disable=unused-variable + metrics = CGroupsTelemetry.poll_all_tracked() + + self.assertEqual(len(metrics), num_extensions * num_of_metrics_per_extn_expected) + self._assert_polled_metrics_equal(metrics, current_cpu, current_total_memory, current_anon_memory, current_cache_memory, current_max_memory, current_swap_memory) + + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_max_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_memory_usage", side_effect=raise_ioerror) + 
@patch("azurelinuxagent.ga.cpucontroller.CpuControllerV1.get_cpu_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.cgroupcontroller._CgroupController.is_active", return_value=False) def test_telemetry_polling_with_inactive_cgroups(self, *_): num_extensions = 5 no_extensions_expected = 0 # pylint: disable=unused-variable - self._track_new_extension_cgroups(num_extensions) - self._assert_cgroups_are_tracked(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) + self._assert_cgroup_controllers_are_tracked(num_extensions) metrics = CGroupsTelemetry.poll_all_tracked() @@ -182,14 +189,14 @@ def test_telemetry_polling_with_inactive_cgroups(self, *_): self.assertEqual(len(metrics), 0) - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage") - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") - @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") - @patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_max_memory_usage") + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_memory_usage") + @patch("azurelinuxagent.ga.cpucontroller.CpuControllerV1.get_cpu_usage") + @patch("azurelinuxagent.ga.cgroupcontroller._CgroupController.is_active") def test_telemetry_polling_with_changing_cgroups_state(self, patch_is_active, patch_get_cpu_usage, # pylint: disable=unused-argument patch_get_mem, patch_get_max_mem, *args): num_extensions = 5 - self._track_new_extension_cgroups(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) patch_is_active.return_value = True @@ -197,17 +204,18 @@ def test_telemetry_polling_with_changing_cgroups_state(self, patch_is_active, pa expected_data_count = 1 # pylint: disable=unused-variable current_cpu = 30 - current_memory = 209715200 + current_anon_memory = 104857600 + current_cache_memory = 104857600 current_max_memory = 471859200 patch_get_cpu_usage.return_value = current_cpu - patch_get_mem.return_value = current_memory # example 200 MB + patch_get_mem.return_value = current_anon_memory, current_cache_memory # example 100 MB, 100 MB patch_get_max_mem.return_value = current_max_memory # example 450 MB - self._assert_cgroups_are_tracked(num_extensions) + self._assert_cgroup_controllers_are_tracked(num_extensions) CGroupsTelemetry.poll_all_tracked() - self._assert_cgroups_are_tracked(num_extensions) + self._assert_cgroup_controllers_are_tracked(num_extensions) patch_is_active.return_value = False patch_get_cpu_usage.side_effect = raise_ioerror @@ -225,7 +233,7 @@ def test_telemetry_polling_with_changing_cgroups_state(self, patch_is_active, pa @patch("azurelinuxagent.common.logger.periodic_warn") def test_telemetry_polling_to_not_generate_transient_logs_ioerror_file_not_found(self, patch_periodic_warn): num_extensions = 1 - self._track_new_extension_cgroups(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) self.assertEqual(0, patch_periodic_warn.call_count) # Not expecting logs present for io_error with errno=errno.ENOENT @@ -243,7 +251,7 @@ def test_telemetry_polling_to_generate_transient_logs_ioerror_permission_denied( num_extensions = 1 num_controllers = 1 is_active_check_per_controller = 2 - self._track_new_extension_cgroups(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) self.assertEqual(0, patch_periodic_warn.call_count) @@ -254,7 +262,7 @@ def 
test_telemetry_polling_to_generate_transient_logs_ioerror_permission_denied( with patch("azurelinuxagent.common.utils.fileutil.read_file", side_effect=io_error_3): poll_count = 1 expected_count_per_call = num_controllers + is_active_check_per_controller - # get_max_memory_usage memory controller would generate a log statement, and each cgroup would invoke a + # get_cpu_usage cpu controller would generate a log statement, and each cgroup controller would invoke a # is active check raising an exception for data_count in range(poll_count, 10): # pylint: disable=unused-variable @@ -263,23 +271,23 @@ def test_telemetry_polling_to_generate_transient_logs_ioerror_permission_denied( def test_telemetry_polling_to_generate_transient_logs_index_error(self): num_extensions = 1 - self._track_new_extension_cgroups(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) # Generating a different kind of error (non-IOError) to check the logging. # Trying to invoke IndexError during the getParameter call with patch("azurelinuxagent.common.utils.fileutil.read_file", return_value=''): with patch("azurelinuxagent.common.logger.periodic_warn") as patch_periodic_warn: - expected_call_count = 1 # 1 periodic warning for memory + expected_call_count = 1 # 1 periodic warning for cpu for data_count in range(1, 10): # pylint: disable=unused-variable CGroupsTelemetry.poll_all_tracked() self.assertEqual(expected_call_count, patch_periodic_warn.call_count) - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.try_swap_memory_usage") - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage") - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") - @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") - @patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") - def test_telemetry_calculations(self, patch_is_active, patch_get_cpu_usage, patch_get_memory_usage, patch_get_memory_max_usage, patch_try_memory_swap_usage, + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.try_swap_memory_usage") + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_max_memory_usage") + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_memory_usage") + @patch("azurelinuxagent.ga.cpucontroller.CpuControllerV1.get_cpu_usage") + @patch("azurelinuxagent.ga.cgroupcontroller._CgroupController.is_active") + def test_telemetry_calculations(self, patch_is_active, patch_get_cpu_usage, patch_get_memory_usage, patch_get_memory_max_usage, patch_try_memory_swap_usage, *args): # pylint: disable=unused-argument num_polls = 10 num_extensions = 1 @@ -287,47 +295,48 @@ def test_telemetry_calculations(self, patch_is_active, patch_get_cpu_usage, pat cpu_percent_values = [random.randint(0, 100) for _ in range(num_polls)] # only verifying calculations and not validity of the values. 
- memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)] + anon_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)] + cache_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)] max_memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)] swap_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)] - self._track_new_extension_cgroups(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) self.assertEqual(2 * num_extensions, len(CGroupsTelemetry._tracked)) for i in range(num_polls): patch_is_active.return_value = True patch_get_cpu_usage.return_value = cpu_percent_values[i] - patch_get_memory_usage.return_value = memory_usage_values[i] + patch_get_memory_usage.return_value = anon_usage_values[i], cache_usage_values[i] patch_get_memory_max_usage.return_value = max_memory_usage_values[i] patch_try_memory_swap_usage.return_value = swap_usage_values[i] metrics = CGroupsTelemetry.poll_all_tracked() - # 1 CPU metric + 1 Current Memory + 1 Max memory + 1 swap memory - self.assertEqual(len(metrics), 4 * num_extensions) - self._assert_polled_metrics_equal(metrics, cpu_percent_values[i], memory_usage_values[i], max_memory_usage_values[i], swap_usage_values[i]) + # 1 CPU metric + 1 Total Memory + 1 anon memory + 1 cache memory + 1 Max memory + 1 swap memory + self.assertEqual(len(metrics), 6 * num_extensions) + self._assert_polled_metrics_equal(metrics, cpu_percent_values[i], anon_usage_values[i] + cache_usage_values[i], anon_usage_values[i], cache_usage_values[i], max_memory_usage_values[i], swap_usage_values[i]) def test_cgroup_tracking(self, *args): # pylint: disable=unused-argument num_extensions = 5 num_controllers = 2 - self._track_new_extension_cgroups(num_extensions) - self._assert_cgroups_are_tracked(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) + self._assert_cgroup_controllers_are_tracked(num_extensions) self.assertEqual(num_extensions * num_controllers, len(CGroupsTelemetry._tracked)) def test_cgroup_is_tracked(self, *args): # pylint: disable=unused-argument num_extensions = 5 - self._track_new_extension_cgroups(num_extensions) - self._assert_cgroups_are_tracked(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) + self._assert_cgroup_controllers_are_tracked(num_extensions) self.assertFalse(CGroupsTelemetry.is_tracked("not_present_cpu_dummy_path")) self.assertFalse(CGroupsTelemetry.is_tracked("not_present_memory_dummy_path")) - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_memory_usage", side_effect=raise_ioerror) def test_process_cgroup_metric_with_no_memory_cgroup_mounted(self, *args): # pylint: disable=unused-argument num_extensions = 5 - self._track_new_extension_cgroups(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) - with patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") as patch_get_cpu_usage: - with patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") as patch_is_active: + with patch("azurelinuxagent.ga.cpucontroller.CpuControllerV1.get_cpu_usage") as patch_get_cpu_usage: + with patch("azurelinuxagent.ga.cgroupcontroller._CgroupController.is_active") as patch_is_active: patch_is_active.return_value = True current_cpu = 30 @@ -339,42 +348,44 @@ def 
test_process_cgroup_metric_with_no_memory_cgroup_mounted(self, *args): # py metrics = CGroupsTelemetry.poll_all_tracked() self.assertEqual(len(metrics), num_extensions * 1) # Only CPU populated - self._assert_polled_metrics_equal(metrics, current_cpu, 0, 0, 0) + self._assert_polled_metrics_equal(metrics, current_cpu, 0, 0, 0, 0, 0) - @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.cpucontroller.CpuControllerV1.get_cpu_usage", side_effect=raise_ioerror) def test_process_cgroup_metric_with_no_cpu_cgroup_mounted(self, *args): # pylint: disable=unused-argument num_extensions = 5 - self._track_new_extension_cgroups(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) - with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage") as patch_get_memory_max_usage: - with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.try_swap_memory_usage") as patch_try_swap_memory_usage: - with patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") as patch_is_active: + with patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_max_memory_usage") as patch_get_memory_max_usage: + with patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_memory_usage") as patch_get_memory_usage: + with patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.try_swap_memory_usage") as patch_try_swap_memory_usage: + with patch("azurelinuxagent.ga.cgroupcontroller._CgroupController.is_active") as patch_is_active: patch_is_active.return_value = True - current_memory = 209715200 + current_total_memory = 209715200 + current_anon_memory = 104857600 + current_cache_memory = 104857600 current_max_memory = 471859200 current_swap_memory = 20971520 - patch_get_memory_usage.return_value = current_memory # example 200 MB + patch_get_memory_usage.return_value = current_anon_memory, current_cache_memory # example 100 MB, 100 MB patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB patch_try_swap_memory_usage.return_value = current_swap_memory # example 20MB num_polls = 10 for data_count in range(1, num_polls + 1): # pylint: disable=unused-variable metrics = CGroupsTelemetry.poll_all_tracked() - # Memory is only populated, CPU is not. Thus 3 metrics for memory. - self.assertEqual(len(metrics), num_extensions * 3) - self._assert_polled_metrics_equal(metrics, 0, current_memory, current_max_memory, current_swap_memory) + # Memory is only populated, CPU is not. Thus 5 metrics for memory. 
+ self.assertEqual(len(metrics), num_extensions * 5) + self._assert_polled_metrics_equal(metrics, 0, current_total_memory, current_anon_memory, current_cache_memory, current_max_memory, current_swap_memory) - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage", side_effect=raise_ioerror) - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage", side_effect=raise_ioerror) - @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_max_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.cpucontroller.CpuControllerV1.get_cpu_usage", side_effect=raise_ioerror) def test_extension_telemetry_not_sent_for_empty_perf_metrics(self, *args): # pylint: disable=unused-argument num_extensions = 5 - self._track_new_extension_cgroups(num_extensions) + self._track_new_extension_cgroup_controllers(num_extensions) - with patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") as patch_is_active: + with patch("azurelinuxagent.ga.cgroupcontroller._CgroupController.is_active") as patch_is_active: patch_is_active.return_value = False poll_count = 1 @@ -383,9 +394,9 @@ def test_extension_telemetry_not_sent_for_empty_perf_metrics(self, *args): # py metrics = CGroupsTelemetry.poll_all_tracked() self.assertEqual(0, len(metrics)) - @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") - @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_throttled_time") - @patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") + @patch("azurelinuxagent.ga.cpucontroller.CpuControllerV1.get_cpu_usage") + @patch("azurelinuxagent.ga.cpucontroller.CpuControllerV1.get_cpu_throttled_time") + @patch("azurelinuxagent.ga.cgroupcontroller._CgroupController.is_active") def test_cgroup_telemetry_should_not_report_cpu_negative_value(self, patch_is_active, path_get_throttled_time, patch_get_cpu_usage): num_polls = 5 @@ -396,8 +407,8 @@ def test_cgroup_telemetry_should_not_report_cpu_negative_value(self, patch_is_ac cpu_percent_values.append(-1) cpu_throttled_values = [random.randint(0, 60 * 60) for _ in range(num_polls)] - dummy_cpu_cgroup = CpuMetrics("dummy_extension_name", "dummy_cpu_path") - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) + dummy_cpu_cgroup = CpuControllerV1("dummy_extension_name", "dummy_cpu_path") + CGroupsTelemetry.track_cgroup_controller(dummy_cpu_cgroup) self.assertEqual(1, len(CGroupsTelemetry._tracked)) for i in range(num_polls): diff --git a/tests/ga/test_collect_logs.py b/tests/ga/test_collect_logs.py index 2b8c4f412..458cd2e69 100644 --- a/tests/ga/test_collect_logs.py +++ b/tests/ga/test_collect_logs.py @@ -18,13 +18,15 @@ import os from azurelinuxagent.common import logger, conf -from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics, MetricValue +from azurelinuxagent.ga.cgroupcontroller import MetricValue, MetricsCounter from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator from azurelinuxagent.common.logger import Logger from azurelinuxagent.common.protocol.util import ProtocolUtil from azurelinuxagent.common.utils import fileutil from azurelinuxagent.ga.collect_logs import get_collect_logs_handler, is_log_collection_allowed, \ get_log_collector_monitor_handler +from azurelinuxagent.ga.cpucontroller import CpuControllerV1, 
CpuControllerV2 +from azurelinuxagent.ga.memorycontroller import MemoryControllerV1, MemoryControllerV2 from tests.lib.mock_wire_protocol import mock_wire_protocol, MockHttpResponse from tests.lib.http_request_predicates import HttpRequestPredicates from tests.lib.wire_protocol_data import DATA_FILE @@ -32,8 +34,13 @@ is_python_version_26, data_dir +class CgroupVersions: + V1 = "v1" + V2 = "v2" + + @contextlib.contextmanager -def _create_collect_logs_handler(iterations=1, cgroups_enabled=True, collect_logs_conf=True): +def _create_collect_logs_handler(iterations=1, cgroup_version=CgroupVersions.V1, cgroups_enabled=True, collect_logs_conf=True, cgroupv2_resource_limiting_conf=False): """ Creates an instance of CollectLogsHandler that * Uses a mock_wire_protocol for network requests, @@ -52,19 +59,33 @@ def _create_collect_logs_handler(iterations=1, cgroups_enabled=True, collect_log with patch("azurelinuxagent.ga.collect_logs.CollectLogsHandler.stopped", side_effect=[False] * iterations + [True]): with patch("time.sleep"): - # Grab the singleton to patch it - cgroups_configurator_singleton = CGroupConfigurator.get_instance() - with patch.object(cgroups_configurator_singleton, "enabled", return_value=cgroups_enabled): - with patch("azurelinuxagent.ga.collect_logs.conf.get_collect_logs", - return_value=collect_logs_conf): - def run_and_wait(): - collect_logs_handler.run() - collect_logs_handler.join() - - collect_logs_handler = get_collect_logs_handler() - collect_logs_handler.get_mock_wire_protocol = lambda: protocol - collect_logs_handler.run_and_wait = run_and_wait - yield collect_logs_handler + with patch("azurelinuxagent.ga.collect_logs.conf.get_collect_logs", return_value=collect_logs_conf): + + # Grab the singleton to patch it + cgroups_configurator_singleton = CGroupConfigurator.get_instance() + + if cgroup_version == CgroupVersions.V1: + with patch.object(cgroups_configurator_singleton, "enabled", return_value=cgroups_enabled): + def run_and_wait(): + collect_logs_handler.run() + collect_logs_handler.join() + + collect_logs_handler = get_collect_logs_handler() + collect_logs_handler.get_mock_wire_protocol = lambda: protocol + collect_logs_handler.run_and_wait = run_and_wait + yield collect_logs_handler + else: + with patch("azurelinuxagent.ga.collect_logs.conf.get_enable_cgroup_v2_resource_limiting", return_value=cgroupv2_resource_limiting_conf): + with patch.object(cgroups_configurator_singleton, "enabled", return_value=False): + with patch("azurelinuxagent.ga.cgroupconfigurator.CGroupConfigurator._Impl.using_cgroup_v2", return_value=True): + def run_and_wait(): + collect_logs_handler.run() + collect_logs_handler.join() + + collect_logs_handler = get_collect_logs_handler() + collect_logs_handler.get_mock_wire_protocol = lambda: protocol + collect_logs_handler.run_and_wait = run_and_wait + yield collect_logs_handler @skip_if_predicate_true(is_python_version_26, "Disabled on Python 2.6") @@ -101,26 +122,124 @@ def _create_dummy_archive(self, size=1024): def test_it_should_only_collect_logs_if_conditions_are_met(self): # In order to collect logs, three conditions have to be met: - # 1) the flag must be set to true in the conf file - # 2) cgroups must be managing services - # 3) python version 2.7+ which is automatically true for these tests since they are disabled on py2.6 + # 1) It should be enabled in the configuration. + # 2) The system must be using cgroups to manage services - needed for resource limiting of the log collection. 
The + # agent currently fully supports resource limiting for v1, but only supports log collector resource limiting for v2 + # if enabled via configuration. + # This condition is True if either: + # a. cgroup usage in the agent is enabled; OR + # b. the machine is using cgroup v2 and v2 resource limiting is enabled in the configuration. + # 3) The python version must be greater than 2.6 in order to support the ZipFile library used when collecting. + + # Note, cgroups should not be in an 'enabled' state in the configurator if v2 is in use. Resource governance is + # not fully supported on v2 yet. + + # If collect logs is not enabled in the configuration, then log collection should always be disabled + + # Case 1: + # - Cgroups are enabled in the configurator + # - Cgroup v2 is not in use + # - Cgroup v2 resource limiting conf is True + # - collect logs config flag false + with _create_collect_logs_handler(cgroups_enabled=True, cgroup_version=CgroupVersions.V1, cgroupv2_resource_limiting_conf=True, collect_logs_conf=False): + self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled") + + # Case 2: + # - Cgroups are enabled in the configurator + # - Cgroup v2 is not in use + # - Cgroup v2 resource limiting conf is False + # - collect logs config flag false + with _create_collect_logs_handler(cgroups_enabled=True, cgroup_version=CgroupVersions.V1, cgroupv2_resource_limiting_conf=False, collect_logs_conf=False): + self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled") + + # Case 3: + # - Cgroups are disabled in the configurator + # - Cgroup v2 is in use + # - Cgroup v2 resource limiting conf is True + # - collect logs config flag false + with _create_collect_logs_handler(cgroups_enabled=False, cgroup_version=CgroupVersions.V2, cgroupv2_resource_limiting_conf=True, collect_logs_conf=False): + self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled") - # cgroups not enabled, config flag false - with _create_collect_logs_handler(cgroups_enabled=False, collect_logs_conf=False): + # Case 4: + # - Cgroups are disabled in the configurator + # - Cgroup v2 is in use + # - Cgroup v2 resource limiting conf is False + # - collect logs config flag false + with _create_collect_logs_handler(cgroups_enabled=False, cgroup_version=CgroupVersions.V2, cgroupv2_resource_limiting_conf=False, collect_logs_conf=False): self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled") - # cgroups enabled, config flag false - with _create_collect_logs_handler(cgroups_enabled=True, collect_logs_conf=False): + # Case 5: + # - Cgroups are disabled in the configurator + # - Cgroup v2 is not in use + # - Cgroup v2 resource limiting conf is True + # - collect logs config flag false + with _create_collect_logs_handler(cgroups_enabled=False, cgroup_version=CgroupVersions.V1, cgroupv2_resource_limiting_conf=True, collect_logs_conf=False): self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled") - # cgroups not enabled, config flag true - with _create_collect_logs_handler(cgroups_enabled=False, collect_logs_conf=True): + # Case 6: + # - Cgroups are disabled in the configurator + # - Cgroup v2 is not in use + # - Cgroup v2 resource limiting conf is False + # - collect logs config flag false + with _create_collect_logs_handler(cgroups_enabled=False, cgroup_version=CgroupVersions.V1, cgroupv2_resource_limiting_conf=False, 
collect_logs_conf=False): self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled") - # cgroups enabled, config flag true - with _create_collect_logs_handler(cgroups_enabled=True, collect_logs_conf=True): + # If collect logs is enabled in the configuration and cgroups are enabled in the configurator, then log collection should always be enabled + + # Case 7: + # - Cgroups are enabled in the configurator + # - Cgroup v2 is not in use + # - Cgroup v2 resource limiting conf is True + # - collect logs config flag true + with _create_collect_logs_handler(cgroups_enabled=True, cgroup_version=CgroupVersions.V1, cgroupv2_resource_limiting_conf=True, collect_logs_conf=True): self.assertEqual(True, is_log_collection_allowed(), "Log collection should have been enabled") + # Case 8: + # - Cgroups are enabled in the configurator + # - Cgroup v2 is not in use + # - Cgroup v2 resource limiting conf is False + # - collect logs config flag true + with _create_collect_logs_handler(cgroups_enabled=True, cgroup_version=CgroupVersions.V1, cgroupv2_resource_limiting_conf=False, collect_logs_conf=True): + self.assertEqual(True, is_log_collection_allowed(), "Log collection should have been enabled") + + # If collect logs is enabled in the configuration and v2 is in use with the v2 resource limiting conf enabled, then log collection should always be enabled + + # Case 9: + # - Cgroups are disabled in the configurator + # - Cgroup v2 is in use + # - Cgroup v2 resource limiting conf is True + # - collect logs config flag true + with _create_collect_logs_handler(cgroups_enabled=False, cgroup_version=CgroupVersions.V2, cgroupv2_resource_limiting_conf=True, collect_logs_conf=True): + self.assertEqual(True, is_log_collection_allowed(), "Log collection should have been enabled") + + # If collect logs is enabled in the configuration and v2 is in use but the v2 resource limiting conf disabled, then log collection should always be disabled + + # Case 10: + # - Cgroups are disabled in the configurator + # - Cgroup v2 is in use + # - Cgroup v2 resource limiting conf is False + # - collect logs config flag true + with _create_collect_logs_handler(cgroups_enabled=False, cgroup_version=CgroupVersions.V2, cgroupv2_resource_limiting_conf=False, collect_logs_conf=True): + self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled") + + # If collect logs is enabled in the configuration but cgroups are disabled in the configurator and v2 is not in use, then log collection should always be disabled + + # Case 11: + # - Cgroups are disabled in the configurator + # - Cgroup v2 is not in use + # - Cgroup v2 resource limiting conf is True + # - collect logs config flag true + with _create_collect_logs_handler(cgroups_enabled=False, cgroup_version=CgroupVersions.V1, cgroupv2_resource_limiting_conf=True, collect_logs_conf=True): + self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled") + + # Case 12: + # - Cgroups are disabled in the configurator + # - Cgroup v2 is not in use + # - Cgroup v2 resource limiting conf is False + # - collect logs config flag true + with _create_collect_logs_handler(cgroups_enabled=False, cgroup_version=CgroupVersions.V1, cgroupv2_resource_limiting_conf=False, collect_logs_conf=True): + self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled") + def test_it_uploads_logs_when_collection_is_successful(self): archive_size = 42 @@ -168,7 +287,7 @@ def 
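
Taken together, the twelve cases above pin down a single rule; the following is a minimal sketch of that rule with illustrative function and parameter names, not the agent's actual is_log_collection_allowed implementation:

    # Log collection requires the config flag plus some form of resource limiting:
    # either cgroups (v1) enabled in the configurator, or cgroup v2 in use with the
    # experimental v2 resource limiting setting turned on.
    def log_collection_allowed(collect_logs_conf, cgroups_enabled,
                               using_cgroup_v2, v2_resource_limiting_conf):
        resource_limiting = cgroups_enabled or (using_cgroup_v2 and v2_resource_limiting_conf)
        return collect_logs_conf and resource_limiting

    assert log_collection_allowed(True, True, False, False)      # Case 8: allowed
    assert log_collection_allowed(True, False, True, True)       # Case 9: allowed
    assert not log_collection_allowed(True, False, True, False)  # Case 10: blocked
    assert not log_collection_allowed(False, True, False, True)  # Case 1: blocked
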
http_put_handler(url, _, **__): @contextlib.contextmanager -def _create_log_collector_monitor_handler(iterations=1): +def _create_log_collector_monitor_handler(iterations=1, cgroup_version=CgroupVersions.V1): """ Creates an instance of LogCollectorMonitorHandler that * Runs its main loop only the number of times given in the 'iterations' parameter, and @@ -184,22 +303,40 @@ def _create_log_collector_monitor_handler(iterations=1): original_read_file = fileutil.read_file - def mock_read_file(filepath, **args): + def mock_read_file_v1(filepath, **args): if filepath == "/proc/stat": - filepath = os.path.join(data_dir, "cgroups", "proc_stat_t0") + filepath = os.path.join(data_dir, "cgroups", "v1", "proc_stat_t0") elif filepath.endswith("/cpuacct.stat"): - filepath = os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") + filepath = os.path.join(data_dir, "cgroups", "v1", "cpuacct.stat_t0") + return original_read_file(filepath, **args) + + def mock_read_file_v2(filepath, **args): + if filepath == "/proc/uptime": + filepath = os.path.join(data_dir, "cgroups", "v2", "proc_uptime_t0") + elif filepath.endswith("/cpu.stat"): + filepath = os.path.join(data_dir, "cgroups", "v2", "cpu.stat_t0") return original_read_file(filepath, **args) + mock_read_file = None + cgroups = [] + if cgroup_version == "v1": + mock_read_file = mock_read_file_v1 + cgroups = [ + CpuControllerV1("test", "dummy_cpu_path"), + MemoryControllerV1("test", "dummy_memory_path") + ] + else: + mock_read_file = mock_read_file_v2 + cgroups = [ + CpuControllerV2("test", "dummy_cpu_path"), + MemoryControllerV2("test", "dummy_memory_path") + ] + with patch("azurelinuxagent.common.utils.fileutil.read_file", side_effect=mock_read_file): def run_and_wait(): monitor_log_collector.run() monitor_log_collector.join() - cgroups = [ - CpuMetrics("test", "dummy_cpu_path"), - MemoryMetrics("test", "dummy_memory_path") - ] monitor_log_collector = get_log_collector_monitor_handler(cgroups) monitor_log_collector.run_and_wait = run_and_wait yield monitor_log_collector @@ -207,33 +344,78 @@ def run_and_wait(): class TestLogCollectorMonitorHandler(AgentTestCase): - @patch('azurelinuxagent.common.event.EventLogger.add_metric') - @patch("azurelinuxagent.ga.collect_logs.LogCollectorMonitorHandler._poll_resource_usage") - def test_send_extension_metrics_telemetry(self, patch_poll_resource_usage, patch_add_metric): + def test_get_max_recorded_metrics(self): + with _create_log_collector_monitor_handler(iterations=2) as log_collector_monitor_handler: + nonlocal_vars = { + 'cpu_iteration': 0, + 'mem_iteration': 0, + 'multiplier': 5 + } + + def get_different_cpu_metrics(**kwargs): # pylint: disable=W0613 + metrics = [MetricValue("Process", MetricsCounter.PROCESSOR_PERCENT_TIME, "service", 4.5), MetricValue("Process", MetricsCounter.THROTTLED_TIME, "service", nonlocal_vars['cpu_iteration']*nonlocal_vars['multiplier'] + 10.000)] + nonlocal_vars['cpu_iteration'] += 1 + return metrics + + def get_different_memory_metrics(**kwargs): # pylint: disable=W0613 + metrics = [MetricValue("Memory", MetricsCounter.TOTAL_MEM_USAGE, "service", 20), + MetricValue("Memory", MetricsCounter.ANON_MEM_USAGE, "service", 15), + MetricValue("Memory", MetricsCounter.CACHE_MEM_USAGE, "service", nonlocal_vars['mem_iteration']*nonlocal_vars['multiplier'] + 5), + MetricValue("Memory", MetricsCounter.MAX_MEM_USAGE, "service", 30), + MetricValue("Memory", MetricsCounter.SWAP_MEM_USAGE, "service", 0)] + nonlocal_vars['mem_iteration'] += 1 + return metrics + + with 
patch("azurelinuxagent.ga.cpucontroller._CpuController.get_tracked_metrics", side_effect=get_different_cpu_metrics): + with patch("azurelinuxagent.ga.memorycontroller._MemoryController.get_tracked_metrics", side_effect=get_different_memory_metrics): + log_collector_monitor_handler.run_and_wait() + max_recorded_metrics = log_collector_monitor_handler.get_max_recorded_metrics() + self.assertEqual(len(max_recorded_metrics), 7) + self.assertEqual(max_recorded_metrics[MetricsCounter.PROCESSOR_PERCENT_TIME], 4.5) + self.assertEqual(max_recorded_metrics[MetricsCounter.THROTTLED_TIME], 15.0) + self.assertEqual(max_recorded_metrics[MetricsCounter.TOTAL_MEM_USAGE], 20) + self.assertEqual(max_recorded_metrics[MetricsCounter.ANON_MEM_USAGE], 15) + self.assertEqual(max_recorded_metrics[MetricsCounter.CACHE_MEM_USAGE], 10) + self.assertEqual(max_recorded_metrics[MetricsCounter.MAX_MEM_USAGE], 30) + self.assertEqual(max_recorded_metrics[MetricsCounter.SWAP_MEM_USAGE], 0) + + def test_verify_log_collector_memory_limit_exceeded(self): with _create_log_collector_monitor_handler() as log_collector_monitor_handler: - patch_poll_resource_usage.return_value = [MetricValue("Process", "% Processor Time", "service", 1), - MetricValue("Process", "Throttled Time", "service", 1), - MetricValue("Memory", "Total Memory Usage", "service", 1), - MetricValue("Memory", "Max Memory Usage", "service", 1), - MetricValue("Memory", "Swap Memory Usage", "service", 1) - ] - log_collector_monitor_handler.run_and_wait() - self.assertEqual(1, patch_poll_resource_usage.call_count) - self.assertEqual(5, patch_add_metric.call_count) # Five metrics being sent. - - @patch("os._exit", side_effect=Exception) - @patch("azurelinuxagent.ga.collect_logs.LogCollectorMonitorHandler._poll_resource_usage") - def test_verify_log_collector_memory_limit_exceeded(self, patch_poll_resource_usage, mock_exit): + cache_exceeded = [MetricValue("Process", MetricsCounter.PROCESSOR_PERCENT_TIME, "service", 4.5), + MetricValue("Process", MetricsCounter.THROTTLED_TIME, "service", 10.281), + MetricValue("Memory", MetricsCounter.TOTAL_MEM_USAGE, "service", 170 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.ANON_MEM_USAGE, "service", 15 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.CACHE_MEM_USAGE, "service", 160 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.MAX_MEM_USAGE, "service", 171 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.SWAP_MEM_USAGE, "service", 0)] + with patch("azurelinuxagent.ga.collect_logs.LogCollectorMonitorHandler._poll_resource_usage", return_value=cache_exceeded): + with patch("os._exit") as mock_exit: + log_collector_monitor_handler.run_and_wait() + self.assertEqual(mock_exit.call_count, 1) + with _create_log_collector_monitor_handler() as log_collector_monitor_handler: - with patch("azurelinuxagent.ga.cgroupconfigurator.LOGCOLLECTOR_MEMORY_LIMIT", 8): - patch_poll_resource_usage.return_value = [MetricValue("Process", "% Processor Time", "service", 1), - MetricValue("Process", "Throttled Time", "service", 1), - MetricValue("Memory", "Total Memory Usage", "service", 9), - MetricValue("Memory", "Max Memory Usage", "service", 7), - MetricValue("Memory", "Swap Memory Usage", "service", 0) - - ] - try: + anon_exceeded = [MetricValue("Process", MetricsCounter.PROCESSOR_PERCENT_TIME, "service", 4.5), + MetricValue("Process", MetricsCounter.THROTTLED_TIME, "service", 10.281), + MetricValue("Memory", MetricsCounter.TOTAL_MEM_USAGE, "service", 170 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.ANON_MEM_USAGE, 
"service", 30 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.CACHE_MEM_USAGE, "service", 140 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.MAX_MEM_USAGE, "service", 171 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.SWAP_MEM_USAGE, "service", 0)] + with patch("azurelinuxagent.ga.collect_logs.LogCollectorMonitorHandler._poll_resource_usage", return_value=anon_exceeded): + with patch("os._exit") as mock_exit: + log_collector_monitor_handler.run_and_wait() + self.assertEqual(mock_exit.call_count, 1) + + with _create_log_collector_monitor_handler(cgroup_version=CgroupVersions.V2) as log_collector_monitor_handler: + mem_throttled_exceeded = [MetricValue("Process", MetricsCounter.PROCESSOR_PERCENT_TIME, "service", 4.5), + MetricValue("Process", MetricsCounter.THROTTLED_TIME, "service", 10.281), + MetricValue("Memory", MetricsCounter.TOTAL_MEM_USAGE, "service", 170 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.ANON_MEM_USAGE, "service", 15 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.CACHE_MEM_USAGE, "service", 140 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.MAX_MEM_USAGE, "service", 171 * 1024 ** 2), + MetricValue("Memory", MetricsCounter.SWAP_MEM_USAGE, "service", 0), + MetricValue("Memory", MetricsCounter.MEM_THROTTLED, "service", 11)] + with patch("azurelinuxagent.ga.collect_logs.LogCollectorMonitorHandler._poll_resource_usage", return_value=mem_throttled_exceeded): + with patch("os._exit") as mock_exit: log_collector_monitor_handler.run_and_wait() - except Exception: self.assertEqual(mock_exit.call_count, 1) diff --git a/tests/ga/test_controllermetrics.py b/tests/ga/test_controllermetrics.py deleted file mode 100644 index cdd31395f..000000000 --- a/tests/ga/test_controllermetrics.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright 2018 Microsoft Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Requires Python 2.4+ and Openssl 1.0+ -# - -from __future__ import print_function - -import errno -import os -import random -import shutil - -from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics, MetricsCounter, CounterNotFound -from azurelinuxagent.common.exception import CGroupsException -from azurelinuxagent.common.osutil import get_osutil -from azurelinuxagent.common.utils import fileutil -from tests.lib.tools import AgentTestCase, patch, data_dir - - -def consume_cpu_time(): - waste = 0 - for x in range(1, 200000): # pylint: disable=unused-variable - waste += random.random() - return waste - - -class TestControllerMetrics(AgentTestCase): - def test_is_active(self): - test_metrics = CpuMetrics("test_extension", self.tmp_dir) - self.assertEqual(False, test_metrics.is_active()) - - with open(os.path.join(self.tmp_dir, "tasks"), mode="wb") as tasks: - tasks.write(str(1000).encode()) - - self.assertEqual(True, test_metrics.is_active()) - - @patch("azurelinuxagent.common.logger.periodic_warn") - def test_is_active_file_not_present(self, patch_periodic_warn): - test_metrics = CpuMetrics("test_extension", self.tmp_dir) - self.assertEqual(False, test_metrics.is_active()) - - test_metrics = MemoryMetrics("test_extension", os.path.join(self.tmp_dir, "this_cgroup_does_not_exist")) - self.assertEqual(False, test_metrics.is_active()) - - self.assertEqual(0, patch_periodic_warn.call_count) - - @patch("azurelinuxagent.common.logger.periodic_warn") - def test_is_active_incorrect_file(self, patch_periodic_warn): - open(os.path.join(self.tmp_dir, "tasks"), mode="wb").close() - test_metrics = CpuMetrics("test_extension", os.path.join(self.tmp_dir, "tasks")) - self.assertEqual(False, test_metrics.is_active()) - self.assertEqual(1, patch_periodic_warn.call_count) - - -class TestCpuMetrics(AgentTestCase): - @classmethod - def setUpClass(cls): - AgentTestCase.setUpClass() - - original_read_file = fileutil.read_file - - # - # Tests that need to mock the contents of /proc/stat or */cpuacct/stat can set this map from - # the file that needs to be mocked to the mock file (each test starts with an empty map). 
If - # an Exception is given instead of a path, the exception is raised - # - cls.mock_read_file_map = {} - - def mock_read_file(filepath, **args): - if filepath in cls.mock_read_file_map: - mapped_value = cls.mock_read_file_map[filepath] - if isinstance(mapped_value, Exception): - raise mapped_value - filepath = mapped_value - return original_read_file(filepath, **args) - - cls.mock_read_file = patch("azurelinuxagent.common.utils.fileutil.read_file", side_effect=mock_read_file) - cls.mock_read_file.start() - - @classmethod - def tearDownClass(cls): - cls.mock_read_file.stop() - AgentTestCase.tearDownClass() - - def setUp(self): - AgentTestCase.setUp(self) - TestCpuMetrics.mock_read_file_map.clear() - - def test_initialize_cpu_usage_should_set_current_cpu_usage(self): - metrics = CpuMetrics("test", "/sys/fs/cgroup/cpu/system.slice/test") - - TestCpuMetrics.mock_read_file_map = { - "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"), - os.path.join(metrics.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") - } - - metrics.initialize_cpu_usage() - - self.assertEqual(metrics._current_cgroup_cpu, 63763) - self.assertEqual(metrics._current_system_cpu, 5496872) - - def test_get_cpu_usage_should_return_the_cpu_usage_since_its_last_invocation(self): - osutil = get_osutil() - - metrics = CpuMetrics("test", "/sys/fs/cgroup/cpu/system.slice/test") - - TestCpuMetrics.mock_read_file_map = { - "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"), - os.path.join(metrics.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") - } - - metrics.initialize_cpu_usage() - - TestCpuMetrics.mock_read_file_map = { - "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t1"), - os.path.join(metrics.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t1") - } - - cpu_usage = metrics.get_cpu_usage() - - self.assertEqual(cpu_usage, round(100.0 * 0.000307697876885 * osutil.get_processor_cores(), 3)) - - TestCpuMetrics.mock_read_file_map = { - "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t2"), - os.path.join(metrics.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t2") - } - - cpu_usage = metrics.get_cpu_usage() - - self.assertEqual(cpu_usage, round(100.0 * 0.000445181085968 * osutil.get_processor_cores(), 3)) - - def test_initialize_cpu_usage_should_set_the_cgroup_usage_to_0_when_the_cgroup_does_not_exist(self): - metrics = CpuMetrics("test", "/sys/fs/cgroup/cpu/system.slice/test") - - io_error_2 = IOError() - io_error_2.errno = errno.ENOENT # "No such directory" - - TestCpuMetrics.mock_read_file_map = { - "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"), - os.path.join(metrics.path, "cpuacct.stat"): io_error_2 - } - - metrics.initialize_cpu_usage() - - self.assertEqual(metrics._current_cgroup_cpu, 0) - self.assertEqual(metrics._current_system_cpu, 5496872) # check the system usage just for test sanity - - def test_initialize_cpu_usage_should_raise_an_exception_when_called_more_than_once(self): - metrics = CpuMetrics("test", "/sys/fs/cgroup/cpu/system.slice/test") - - TestCpuMetrics.mock_read_file_map = { - "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"), - os.path.join(metrics.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") - } - - metrics.initialize_cpu_usage() - - with self.assertRaises(CGroupsException): - metrics.initialize_cpu_usage() - - def 
test_get_cpu_usage_should_raise_an_exception_when_initialize_cpu_usage_has_not_been_invoked(self): - metrics = CpuMetrics("test", "/sys/fs/cgroup/cpu/system.slice/test") - - with self.assertRaises(CGroupsException): - cpu_usage = metrics.get_cpu_usage() # pylint: disable=unused-variable - - def test_get_throttled_time_should_return_the_value_since_its_last_invocation(self): - test_file = os.path.join(self.tmp_dir, "cpu.stat") - shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t0"), test_file) # throttled_time = 50 - metrics = CpuMetrics("test", self.tmp_dir) - metrics.initialize_cpu_usage() - shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t1"), test_file) # throttled_time = 2075541442327 - - throttled_time = metrics.get_cpu_throttled_time() - - self.assertEqual(throttled_time, float(2075541442327 - 50) / 1E9, "The value of throttled_time is incorrect") - - def test_get_tracked_metrics_should_return_the_throttled_time(self): - metrics = CpuMetrics("test", os.path.join(data_dir, "cgroups")) - metrics.initialize_cpu_usage() - - def find_throttled_time(metrics): - return [m for m in metrics if m.counter == MetricsCounter.THROTTLED_TIME] - - found = find_throttled_time(metrics.get_tracked_metrics()) - self.assertTrue(len(found) == 0, "get_tracked_metrics should not fetch the throttled time by default. Found: {0}".format(found)) - - found = find_throttled_time(metrics.get_tracked_metrics(track_throttled_time=True)) - self.assertTrue(len(found) == 1, "get_tracked_metrics should have fetched the throttled time by default. Found: {0}".format(found)) - - -class TestMemoryMetrics(AgentTestCase): - def test_get_metrics(self): - test_mem_metrics = MemoryMetrics("test_extension", os.path.join(data_dir, "cgroups", "memory_mount")) - - memory_usage = test_mem_metrics.get_memory_usage() - self.assertEqual(150000, memory_usage) - - max_memory_usage = test_mem_metrics.get_max_memory_usage() - self.assertEqual(1000000, max_memory_usage) - - swap_memory_usage = test_mem_metrics.try_swap_memory_usage() - self.assertEqual(20000, swap_memory_usage) - - def test_get_metrics_when_files_not_present(self): - test_mem_metrics = MemoryMetrics("test_extension", os.path.join(data_dir, "cgroups")) - - with self.assertRaises(IOError) as e: - test_mem_metrics.get_memory_usage() - - self.assertEqual(e.exception.errno, errno.ENOENT) - - with self.assertRaises(IOError) as e: - test_mem_metrics.get_max_memory_usage() - - self.assertEqual(e.exception.errno, errno.ENOENT) - - with self.assertRaises(IOError) as e: - test_mem_metrics.try_swap_memory_usage() - - self.assertEqual(e.exception.errno, errno.ENOENT) - - def test_get_memory_usage_counters_not_found(self): - test_mem_metrics = MemoryMetrics("test_extension", os.path.join(data_dir, "cgroups", "missing_memory_counters")) - - with self.assertRaises(CounterNotFound): - test_mem_metrics.get_memory_usage() - - swap_memory_usage = test_mem_metrics.try_swap_memory_usage() - self.assertEqual(0, swap_memory_usage) diff --git a/tests/ga/test_cpucontroller.py b/tests/ga/test_cpucontroller.py new file mode 100644 index 000000000..bc5fc4070 --- /dev/null +++ b/tests/ga/test_cpucontroller.py @@ -0,0 +1,313 @@ +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Requires Python 2.4+ and Openssl 1.0+ +# + +from __future__ import print_function + +import errno +import os +import random +import shutil + +from azurelinuxagent.ga.cgroupcontroller import MetricsCounter +from azurelinuxagent.ga.cpucontroller import CpuControllerV1, CpuControllerV2 +from azurelinuxagent.common.exception import CGroupsException +from azurelinuxagent.common.osutil import get_osutil +from azurelinuxagent.common.utils import fileutil +from tests.lib.tools import AgentTestCase, patch, data_dir + + +def consume_cpu_time(): + waste = 0 + for x in range(1, 200000): # pylint: disable=unused-variable + waste += random.random() + return waste + + +class TestCpuControllerV1(AgentTestCase): + @classmethod + def setUpClass(cls): + AgentTestCase.setUpClass() + + original_read_file = fileutil.read_file + + # + # Tests that need to mock the contents of /proc/stat or */cpuacct/stat can set this map from + # the file that needs to be mocked to the mock file (each test starts with an empty map). If + # an Exception is given instead of a path, the exception is raised + # + cls.mock_read_file_map = {} + + def mock_read_file(filepath, **args): + if filepath in cls.mock_read_file_map: + mapped_value = cls.mock_read_file_map[filepath] + if isinstance(mapped_value, Exception): + raise mapped_value + filepath = mapped_value + return original_read_file(filepath, **args) + + cls.mock_read_file = patch("azurelinuxagent.common.utils.fileutil.read_file", side_effect=mock_read_file) + cls.mock_read_file.start() + + @classmethod + def tearDownClass(cls): + cls.mock_read_file.stop() + AgentTestCase.tearDownClass() + + def setUp(self): + AgentTestCase.setUp(self) + TestCpuControllerV1.mock_read_file_map.clear() + + def test_initialize_cpu_usage_v1_should_set_current_cpu_usage(self): + controller = CpuControllerV1("test", "/sys/fs/cgroup/cpu/system.slice/test") + + TestCpuControllerV1.mock_read_file_map = { + "/proc/stat": os.path.join(data_dir, "cgroups", "v1", "proc_stat_t0"), + os.path.join(controller.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "v1", "cpuacct.stat_t0") + } + + controller.initialize_cpu_usage() + + self.assertEqual(controller._current_cgroup_cpu, 63763) + self.assertEqual(controller._current_system_cpu, 5496872) + + def test_get_cpu_usage_v1_should_return_the_cpu_usage_since_its_last_invocation(self): + osutil = get_osutil() + + controller = CpuControllerV1("test", "/sys/fs/cgroup/cpu/system.slice/test") + + TestCpuControllerV1.mock_read_file_map = { + "/proc/stat": os.path.join(data_dir, "cgroups", "v1", "proc_stat_t0"), + os.path.join(controller.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "v1", "cpuacct.stat_t0") + } + + controller.initialize_cpu_usage() + + TestCpuControllerV1.mock_read_file_map = { + "/proc/stat": os.path.join(data_dir, "cgroups", "v1", "proc_stat_t1"), + os.path.join(controller.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "v1", "cpuacct.stat_t1") + } + + cpu_usage = controller.get_cpu_usage() + + self.assertEqual(cpu_usage, round(100.0 * 0.000307697876885 * osutil.get_processor_cores(), 3)) + + 
TestCpuControllerV1.mock_read_file_map = { + "/proc/stat": os.path.join(data_dir, "cgroups", "v1", "proc_stat_t2"), + os.path.join(controller.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "v1", "cpuacct.stat_t2") + } + + cpu_usage = controller.get_cpu_usage() + + self.assertEqual(cpu_usage, round(100.0 * 0.000445181085968 * osutil.get_processor_cores(), 3)) + + def test_initialize_cpu_usage_v1_should_set_the_cgroup_usage_to_0_when_the_cgroup_does_not_exist(self): + controller = CpuControllerV1("test", "/sys/fs/cgroup/cpu/system.slice/test") + + io_error_2 = IOError() + io_error_2.errno = errno.ENOENT # "No such directory" + + TestCpuControllerV1.mock_read_file_map = { + "/proc/stat": os.path.join(data_dir, "cgroups", "v1", "proc_stat_t0"), + os.path.join(controller.path, "cpuacct.stat"): io_error_2 + } + + controller.initialize_cpu_usage() + + self.assertEqual(controller._current_cgroup_cpu, 0) + self.assertEqual(controller._current_system_cpu, 5496872) # check the system usage just for test sanity + + def test_initialize_cpu_usage_v1_should_raise_an_exception_when_called_more_than_once(self): + controller = CpuControllerV1("test", "/sys/fs/cgroup/cpu/system.slice/test") + + TestCpuControllerV1.mock_read_file_map = { + "/proc/stat": os.path.join(data_dir, "cgroups", "v1", "proc_stat_t0"), + os.path.join(controller.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "v1", "cpuacct.stat_t0") + } + + controller.initialize_cpu_usage() + + with self.assertRaises(CGroupsException): + controller.initialize_cpu_usage() + + def test_get_cpu_usage_v1_should_raise_an_exception_when_initialize_cpu_usage_has_not_been_invoked(self): + controller = CpuControllerV1("test", "/sys/fs/cgroup/cpu/system.slice/test") + + with self.assertRaises(CGroupsException): + cpu_usage = controller.get_cpu_usage() # pylint: disable=unused-variable + + def test_get_throttled_time_v1_should_return_the_value_since_its_last_invocation(self): + test_file = os.path.join(self.tmp_dir, "cpu.stat") + shutil.copyfile(os.path.join(data_dir, "cgroups", "v1", "cpu.stat_t0"), test_file) # throttled_time = 50 + controller = CpuControllerV1("test", self.tmp_dir) + controller.initialize_cpu_usage() + shutil.copyfile(os.path.join(data_dir, "cgroups", "v1", "cpu.stat_t1"), test_file) # throttled_time = 2075541442327 + + throttled_time = controller.get_cpu_throttled_time() + + self.assertEqual(throttled_time, round(float(2075541442327 - 50) / 1E9, 3), "The value of throttled_time is incorrect") + + def test_get_tracked_metrics_v1_should_return_the_throttled_time(self): + controller = CpuControllerV1("test", os.path.join(data_dir, "cgroups", "v1")) + controller.initialize_cpu_usage() + + def find_throttled_time(metrics): + return [m for m in metrics if m.counter == MetricsCounter.THROTTLED_TIME] + + found = find_throttled_time(controller.get_tracked_metrics()) + self.assertTrue(len(found) == 0, "get_tracked_metrics should not fetch the throttled time by default. Found: {0}".format(found)) + + found = find_throttled_time(controller.get_tracked_metrics(track_throttled_time=True)) + self.assertTrue(len(found) == 1, "get_tracked_metrics should have fetched the throttled time by default. 
Found: {0}".format(found)) + + +class TestCpuControllerV2(AgentTestCase): + @classmethod + def setUpClass(cls): + AgentTestCase.setUpClass() + + original_read_file = fileutil.read_file + + # + # Tests that need to mock the contents of /proc/stat or */cpuacct/stat can set this map from + # the file that needs to be mocked to the mock file (each test starts with an empty map). If + # an Exception is given instead of a path, the exception is raised + # + cls.mock_read_file_map = {} + + def mock_read_file(filepath, **args): + if filepath in cls.mock_read_file_map: + mapped_value = cls.mock_read_file_map[filepath] + if isinstance(mapped_value, Exception): + raise mapped_value + filepath = mapped_value + return original_read_file(filepath, **args) + + cls.mock_read_file = patch("azurelinuxagent.common.utils.fileutil.read_file", side_effect=mock_read_file) + cls.mock_read_file.start() + + @classmethod + def tearDownClass(cls): + cls.mock_read_file.stop() + AgentTestCase.tearDownClass() + + def setUp(self): + AgentTestCase.setUp(self) + TestCpuControllerV2.mock_read_file_map.clear() + + def test_initialize_cpu_usage_v2_should_set_current_cpu_usage(self): + controller = CpuControllerV2("test", "/sys/fs/cgroup/cpu/system.slice/test") + + TestCpuControllerV2.mock_read_file_map = { + "/proc/uptime": os.path.join(data_dir, "cgroups", "v2", "proc_uptime_t0"), + os.path.join(controller.path, "cpu.stat"): os.path.join(data_dir, "cgroups", "v2", "cpu.stat_t0") + } + + controller.initialize_cpu_usage() + + self.assertEqual(controller._current_cgroup_cpu, 817045397 / 1E6) + self.assertEqual(controller._current_system_cpu, 776968.02) + + def test_get_cpu_usage_v2_should_return_the_cpu_usage_since_its_last_invocation(self): + controller = CpuControllerV2("test", "/sys/fs/cgroup/cpu/system.slice/test") + + TestCpuControllerV2.mock_read_file_map = { + "/proc/uptime": os.path.join(data_dir, "cgroups", "v2", "proc_uptime_t0"), + os.path.join(controller.path, "cpu.stat"): os.path.join(data_dir, "cgroups", "v2", "cpu.stat_t0") + } + + controller.initialize_cpu_usage() + + TestCpuControllerV2.mock_read_file_map = { + "/proc/uptime": os.path.join(data_dir, "cgroups", "v2", "proc_uptime_t1"), + os.path.join(controller.path, "cpu.stat"): os.path.join(data_dir, "cgroups", "v2", "cpu.stat_t1") + } + + cpu_usage = controller.get_cpu_usage() + + cgroup_usage_delta = (819624087 / 1E6) - (817045397 / 1E6) + system_usage_delta = 777350.57 - 776968.02 + self.assertEqual(cpu_usage, round(100.0 * cgroup_usage_delta/system_usage_delta, 3)) + + TestCpuControllerV2.mock_read_file_map = { + "/proc/uptime": os.path.join(data_dir, "cgroups", "v2", "proc_uptime_t2"), + os.path.join(controller.path, "cpu.stat"): os.path.join(data_dir, "cgroups", "v2", "cpu.stat_t2") + } + + cpu_usage = controller.get_cpu_usage() + + cgroup_usage_delta = (822052295 / 1E6) - (819624087 / 1E6) + system_usage_delta = 779218.68 - 777350.57 + self.assertEqual(cpu_usage, round(100.0 * cgroup_usage_delta/system_usage_delta, 3)) + + def test_initialize_cpu_usage_v2_should_set_the_cgroup_usage_to_0_when_the_cgroup_does_not_exist(self): + controller = CpuControllerV2("test", "/sys/fs/cgroup/cpu/system.slice/test") + + io_error_2 = IOError() + io_error_2.errno = errno.ENOENT # "No such directory" + + TestCpuControllerV2.mock_read_file_map = { + "/proc/uptime": os.path.join(data_dir, "cgroups", "v2", "proc_uptime_t0"), + os.path.join(controller.path, "cpu.stat"): io_error_2 + } + + controller.initialize_cpu_usage() + + self.assertEqual(controller._current_cgroup_cpu, 
0) + self.assertEqual(controller._current_system_cpu, 776968.02) # check the system usage just for test sanity + + def test_initialize_cpu_usage_v2_should_raise_an_exception_when_called_more_than_once(self): + controller = CpuControllerV2("test", "/sys/fs/cgroup/cpu/system.slice/test") + + TestCpuControllerV2.mock_read_file_map = { + "/proc/uptime": os.path.join(data_dir, "cgroups", "v2", "proc_uptime_t0"), + os.path.join(controller.path, "cpu.stat"): os.path.join(data_dir, "cgroups", "v2", "cpu.stat_t0") + } + + controller.initialize_cpu_usage() + + with self.assertRaises(CGroupsException): + controller.initialize_cpu_usage() + + def test_get_cpu_usage_v2_should_raise_an_exception_when_initialize_cpu_usage_has_not_been_invoked(self): + controller = CpuControllerV2("test", "/sys/fs/cgroup/cpu/system.slice/test") + + with self.assertRaises(CGroupsException): + cpu_usage = controller.get_cpu_usage() # pylint: disable=unused-variable + + def test_get_throttled_time_v2_should_return_the_value_since_its_last_invocation(self): + test_file = os.path.join(self.tmp_dir, "cpu.stat") + shutil.copyfile(os.path.join(data_dir, "cgroups", "v2", "cpu.stat_t0"), test_file) # throttled_usec = 15735198706 + controller = CpuControllerV2("test", self.tmp_dir) + controller.initialize_cpu_usage() + shutil.copyfile(os.path.join(data_dir, "cgroups", "v2", "cpu.stat_t1"), test_file) # throttled_usec = 15796563650 + + throttled_time = controller.get_cpu_throttled_time() + + self.assertEqual(throttled_time, round(float(15796563650 - 15735198706) / 1E6, 3), "The value of throttled_time is incorrect") + + def test_get_tracked_metrics_v2_should_return_the_throttled_time(self): + controller = CpuControllerV2("test", os.path.join(data_dir, "cgroups", "v2")) + controller.initialize_cpu_usage() + + def find_throttled_time(metrics): + return [m for m in metrics if m.counter == MetricsCounter.THROTTLED_TIME] + + found = find_throttled_time(controller.get_tracked_metrics()) + self.assertTrue(len(found) == 0, "get_tracked_metrics should not fetch the throttled time by default. Found: {0}".format(found)) + + found = find_throttled_time(controller.get_tracked_metrics(track_throttled_time=True)) + self.assertTrue(len(found) == 1, "get_tracked_metrics should have fetched the throttled time when track_throttled_time is set. 
Found: {0}".format(found)) diff --git a/tests/ga/test_logcollector.py b/tests/ga/test_logcollector.py index cedf894b0..6a8be83af 100644 --- a/tests/ga/test_logcollector.py +++ b/tests/ga/test_logcollector.py @@ -79,9 +79,9 @@ def _mock_cgroup(cls): def mock_read_file(filepath, **args): if filepath == "/proc/stat": - filepath = os.path.join(data_dir, "cgroups", "proc_stat_t0") + filepath = os.path.join(data_dir, "cgroups", "v1", "proc_stat_t0") elif filepath.endswith("/cpuacct.stat"): - filepath = os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") + filepath = os.path.join(data_dir, "cgroups", "v1", "cpuacct.stat_t0") return original_read_file(filepath, **args) cls._mock_read_cpu_cgroup_file = patch("azurelinuxagent.common.utils.fileutil.read_file", side_effect=mock_read_file) @@ -213,7 +213,7 @@ def test_log_collector_parses_commands_in_manifest(self): with patch("azurelinuxagent.ga.logcollector.MANIFEST_NORMAL", manifest): with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'): log_collector = LogCollector() - archive = log_collector.collect_logs_and_get_archive() + archive, uncompressed_file_size = log_collector.collect_logs_and_get_archive() with open(self.output_results_file_path, "r") as fh: results = fh.readlines() @@ -227,6 +227,7 @@ def test_log_collector_parses_commands_in_manifest(self): # Assert copy was parsed self._assert_archive_created(archive) self._assert_files_are_in_archive(expected_files=[file_to_collect]) + self.assertEqual(uncompressed_file_size, os.path.getsize(file_to_collect)) no_files = self._get_number_of_files_in_archive() self.assertEqual(1, no_files, "Expected 1 file in archive, found {0}!".format(no_files)) @@ -242,10 +243,11 @@ def test_log_collector_uses_full_manifest_when_full_mode_enabled(self): with patch("azurelinuxagent.ga.logcollector.MANIFEST_FULL", manifest): with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'): log_collector = LogCollector(is_full_mode=True) - archive = log_collector.collect_logs_and_get_archive() + archive, uncompressed_file_size = log_collector.collect_logs_and_get_archive() self._assert_archive_created(archive) self._assert_files_are_in_archive(expected_files=[file_to_collect]) + self.assertEqual(uncompressed_file_size, os.path.getsize(file_to_collect)) no_files = self._get_number_of_files_in_archive() self.assertEqual(1, no_files, "Expected 1 file in archive, found {0}!".format(no_files)) @@ -256,7 +258,7 @@ def test_log_collector_should_collect_all_files(self): with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'): log_collector = LogCollector() - archive = log_collector.collect_logs_and_get_archive() + archive, uncompressed_file_size = log_collector.collect_logs_and_get_archive() self._assert_archive_created(archive) @@ -269,6 +271,10 @@ def test_log_collector_should_collect_all_files(self): os.path.join(self.root_collect_dir, "another_dir", "least_important_file") ] self._assert_files_are_in_archive(expected_files) + expected_total_uncompressed_size = 0 + for file in expected_files: + expected_total_uncompressed_size += os.path.getsize(file) + self.assertEqual(uncompressed_file_size, expected_total_uncompressed_size) no_files = self._get_number_of_files_in_archive() self.assertEqual(6, no_files, "Expected 6 files in archive, found {0}!".format(no_files)) @@ -278,7 +284,7 @@ def test_log_collector_should_truncate_large_text_files_and_ignore_large_binary_ with patch("azurelinuxagent.ga.logcollector._FILE_SIZE_LIMIT", SMALL_FILE_SIZE): with 
patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'): log_collector = LogCollector() - archive = log_collector.collect_logs_and_get_archive() + archive, uncompressed_file_size = log_collector.collect_logs_and_get_archive() self._assert_archive_created(archive) @@ -294,6 +300,13 @@ def test_log_collector_should_truncate_large_text_files_and_ignore_large_binary_ ] self._assert_files_are_in_archive(expected_files) self._assert_files_are_not_in_archive(unexpected_files) + total_uncompressed_file_size = 0 + for file in expected_files: + if file.startswith("truncated_"): + total_uncompressed_file_size += SMALL_FILE_SIZE + else: + total_uncompressed_file_size += os.path.getsize(file) + self.assertEqual(total_uncompressed_file_size, uncompressed_file_size) no_files = self._get_number_of_files_in_archive() self.assertEqual(5, no_files, "Expected 5 files in archive, found {0}!".format(no_files)) @@ -312,7 +325,7 @@ def test_log_collector_should_prioritize_important_files_if_archive_too_big(self with patch("azurelinuxagent.ga.logcollector._MUST_COLLECT_FILES", must_collect_files): with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'): log_collector = LogCollector() - archive = log_collector.collect_logs_and_get_archive() + archive, uncompressed_file_size = log_collector.collect_logs_and_get_archive() self._assert_archive_created(archive) @@ -328,6 +341,10 @@ def test_log_collector_should_prioritize_important_files_if_archive_too_big(self ] self._assert_files_are_in_archive(expected_files) self._assert_files_are_not_in_archive(unexpected_files) + expected_total_uncompressed_size = 0 + for file in expected_files: + expected_total_uncompressed_size += os.path.getsize(file) + self.assertEqual(uncompressed_file_size, expected_total_uncompressed_size) no_files = self._get_number_of_files_in_archive() self.assertEqual(3, no_files, "Expected 3 files in archive, found {0}!".format(no_files)) @@ -338,7 +355,7 @@ def test_log_collector_should_prioritize_important_files_if_archive_too_big(self with patch("azurelinuxagent.ga.logcollector._UNCOMPRESSED_ARCHIVE_SIZE_LIMIT", 10 * 1024 * 1024): with patch("azurelinuxagent.ga.logcollector._MUST_COLLECT_FILES", must_collect_files): - second_archive = log_collector.collect_logs_and_get_archive() + second_archive, second_uncompressed_file_size = log_collector.collect_logs_and_get_archive() expected_files = [ os.path.join(self.root_collect_dir, "waagent.log"), @@ -352,6 +369,10 @@ def test_log_collector_should_prioritize_important_files_if_archive_too_big(self ] self._assert_files_are_in_archive(expected_files) self._assert_files_are_not_in_archive(unexpected_files) + expected_total_uncompressed_size = 0 + for file in expected_files: + expected_total_uncompressed_size += os.path.getsize(file) + self.assertEqual(second_uncompressed_file_size, expected_total_uncompressed_size) self._assert_archive_created(second_archive) @@ -363,7 +384,7 @@ def test_log_collector_should_update_archive_when_files_are_new_or_modified_or_d # needs to be updated in the archive, deleted if removed from disk, and added if not previously seen. 
with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'): log_collector = LogCollector() - first_archive = log_collector.collect_logs_and_get_archive() + first_archive, first_uncompressed_file_size = log_collector.collect_logs_and_get_archive() self._assert_archive_created(first_archive) # Everything should be in the archive @@ -376,6 +397,10 @@ def test_log_collector_should_update_archive_when_files_are_new_or_modified_or_d os.path.join(self.root_collect_dir, "another_dir", "least_important_file") ] self._assert_files_are_in_archive(expected_files) + expected_total_uncompressed_size = 0 + for file in expected_files: + expected_total_uncompressed_size += os.path.getsize(file) + self.assertEqual(first_uncompressed_file_size, expected_total_uncompressed_size) no_files = self._get_number_of_files_in_archive() self.assertEqual(6, no_files, "Expected 6 files in archive, found {0}!".format(no_files)) @@ -392,7 +417,7 @@ def test_log_collector_should_update_archive_when_files_are_new_or_modified_or_d LARGE_FILE_SIZE) rm_files(os.path.join(self.root_collect_dir, "waagent.log.1")) - second_archive = log_collector.collect_logs_and_get_archive() + second_archive, second_uncompressed_file_size = log_collector.collect_logs_and_get_archive() self._assert_archive_created(second_archive) expected_files = [ @@ -408,6 +433,10 @@ def test_log_collector_should_update_archive_when_files_are_new_or_modified_or_d ] self._assert_files_are_in_archive(expected_files) self._assert_files_are_not_in_archive(unexpected_files) + expected_total_uncompressed_size = 0 + for file in expected_files: + expected_total_uncompressed_size += os.path.getsize(file) + self.assertEqual(second_uncompressed_file_size, expected_total_uncompressed_size) file = os.path.join(self.root_collect_dir, "waagent.log") # pylint: disable=redefined-builtin new_file_size = self._get_uncompressed_file_size(file) @@ -434,7 +463,7 @@ def test_log_collector_should_clean_up_uncollected_truncated_files(self): with patch("azurelinuxagent.ga.logcollector._FILE_SIZE_LIMIT", SMALL_FILE_SIZE): with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'): log_collector = LogCollector() - archive = log_collector.collect_logs_and_get_archive() + archive, uncompressed_file_size = log_collector.collect_logs_and_get_archive() self._assert_archive_created(archive) @@ -443,6 +472,13 @@ def test_log_collector_should_clean_up_uncollected_truncated_files(self): self._truncated_path(os.path.join(self.root_collect_dir, "waagent.log.1")), # this file should be truncated ] self._assert_files_are_in_archive(expected_files) + expected_total_uncompressed_size = 0 + for file in expected_files: + if file.startswith("truncated_"): + expected_total_uncompressed_size += SMALL_FILE_SIZE + else: + expected_total_uncompressed_size += os.path.getsize(file) + self.assertEqual(uncompressed_file_size, expected_total_uncompressed_size) no_files = self._get_number_of_files_in_archive() self.assertEqual(2, no_files, "Expected 2 files in archive, found {0}!".format(no_files)) @@ -456,7 +492,7 @@ def test_log_collector_should_clean_up_uncollected_truncated_files(self): with patch("azurelinuxagent.ga.logcollector._FILE_SIZE_LIMIT", SMALL_FILE_SIZE): with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'): log_collector = LogCollector() - second_archive = log_collector.collect_logs_and_get_archive() + second_archive, second_uncompressed_file_size = log_collector.collect_logs_and_get_archive() expected_files = [ 
os.path.join(self.root_collect_dir, "waagent.log"), @@ -467,6 +503,13 @@ def test_log_collector_should_clean_up_uncollected_truncated_files(self): ] self._assert_files_are_in_archive(expected_files) self._assert_files_are_not_in_archive(unexpected_files) + expected_total_uncompressed_size = 0 + for file in expected_files: + if file.startswith("truncated_"): + expected_total_uncompressed_size += SMALL_FILE_SIZE + else: + expected_total_uncompressed_size += os.path.getsize(file) + self.assertEqual(second_uncompressed_file_size, expected_total_uncompressed_size) self._assert_archive_created(second_archive) diff --git a/tests/ga/test_memorycontroller.py b/tests/ga/test_memorycontroller.py new file mode 100644 index 000000000..1beb9a33f --- /dev/null +++ b/tests/ga/test_memorycontroller.py @@ -0,0 +1,124 @@ +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Requires Python 2.4+ and Openssl 1.0+ +# + +from __future__ import print_function + +import errno +import os +import shutil + +from azurelinuxagent.ga.cgroupcontroller import CounterNotFound +from azurelinuxagent.ga.memorycontroller import MemoryControllerV1, MemoryControllerV2 +from tests.lib.tools import AgentTestCase, data_dir + + +class TestMemoryControllerV1(AgentTestCase): + def test_get_metrics_v1(self): + test_mem_controller = MemoryControllerV1("test_extension", os.path.join(data_dir, "cgroups", "v1")) + + rss_memory_usage, cache_memory_usage = test_mem_controller.get_memory_usage() + self.assertEqual(100000, rss_memory_usage) + self.assertEqual(50000, cache_memory_usage) + + max_memory_usage = test_mem_controller.get_max_memory_usage() + self.assertEqual(1000000, max_memory_usage) + + swap_memory_usage = test_mem_controller.try_swap_memory_usage() + self.assertEqual(20000, swap_memory_usage) + + def test_get_metrics_v1_when_files_not_present(self): + test_mem_controller = MemoryControllerV1("test_extension", os.path.join(data_dir, "cgroups")) + + with self.assertRaises(IOError) as e: + test_mem_controller.get_memory_usage() + + self.assertEqual(e.exception.errno, errno.ENOENT) + + with self.assertRaises(IOError) as e: + test_mem_controller.get_max_memory_usage() + + self.assertEqual(e.exception.errno, errno.ENOENT) + + with self.assertRaises(IOError) as e: + test_mem_controller.try_swap_memory_usage() + + self.assertEqual(e.exception.errno, errno.ENOENT) + + def test_get_memory_usage_v1_counters_not_found(self): + test_file = os.path.join(self.tmp_dir, "memory.stat") + shutil.copyfile(os.path.join(data_dir, "cgroups", "v1", "memory.stat_missing"), test_file) + test_mem_controller = MemoryControllerV1("test_extension", self.tmp_dir) + + with self.assertRaises(CounterNotFound): + test_mem_controller.get_memory_usage() + + swap_memory_usage = test_mem_controller.try_swap_memory_usage() + self.assertEqual(0, swap_memory_usage) + + +class TestMemoryControllerV2(AgentTestCase): + def test_get_metrics_v2(self): + test_mem_controller = MemoryControllerV2("test_extension", os.path.join(data_dir, 
"cgroups", "v2")) + + anon_memory_usage, cache_memory_usage = test_mem_controller.get_memory_usage() + self.assertEqual(17589300, anon_memory_usage) + self.assertEqual(134553600, cache_memory_usage) + + max_memory_usage = test_mem_controller.get_max_memory_usage() + self.assertEqual(194494464, max_memory_usage) + + swap_memory_usage = test_mem_controller.try_swap_memory_usage() + self.assertEqual(20000, swap_memory_usage) + + memory_throttled_events = test_mem_controller.get_memory_throttled_events() + self.assertEqual(9, memory_throttled_events) + + def test_get_metrics_v2_when_files_not_present(self): + test_mem_controller = MemoryControllerV2("test_extension", os.path.join(data_dir, "cgroups")) + + with self.assertRaises(IOError) as e: + test_mem_controller.get_memory_usage() + + self.assertEqual(e.exception.errno, errno.ENOENT) + + with self.assertRaises(IOError) as e: + test_mem_controller.get_max_memory_usage() + + self.assertEqual(e.exception.errno, errno.ENOENT) + + with self.assertRaises(IOError) as e: + test_mem_controller.try_swap_memory_usage() + + self.assertEqual(e.exception.errno, errno.ENOENT) + + with self.assertRaises(IOError) as e: + test_mem_controller.get_memory_throttled_events() + + self.assertEqual(e.exception.errno, errno.ENOENT) + + def test_get_memory_usage_v1_counters_not_found(self): + test_stat_file = os.path.join(self.tmp_dir, "memory.stat") + shutil.copyfile(os.path.join(data_dir, "cgroups", "v2", "memory.stat_missing"), test_stat_file) + test_events_file = os.path.join(self.tmp_dir, "memory.events") + shutil.copyfile(os.path.join(data_dir, "cgroups", "v2", "memory.stat_missing"), test_events_file) + test_mem_controller = MemoryControllerV2("test_extension", self.tmp_dir) + + with self.assertRaises(CounterNotFound): + test_mem_controller.get_memory_usage() + + with self.assertRaises(CounterNotFound): + test_mem_controller.get_memory_throttled_events() diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 420645fe0..a2100cde5 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -21,12 +21,14 @@ import string from azurelinuxagent.common import event, logger -from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics, MetricValue, _REPORT_EVERY_HOUR +from azurelinuxagent.ga.cgroupcontroller import MetricValue, _REPORT_EVERY_HOUR from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.event import EVENTS_DIRECTORY from azurelinuxagent.common.protocol.healthservice import HealthService from azurelinuxagent.common.protocol.util import ProtocolUtil from azurelinuxagent.common.protocol.wire import WireProtocol +from azurelinuxagent.ga.cpucontroller import CpuControllerV1 +from azurelinuxagent.ga.memorycontroller import MemoryControllerV1 from azurelinuxagent.ga.monitor import get_monitor_handler, PeriodicOperation, SendImdsHeartbeat, \ ResetPeriodicLogMessages, SendHostPluginHeartbeat, PollResourceUsage, \ ReportNetworkErrors, ReportNetworkConfigurationChanges, PollSystemWideResourceUsage @@ -222,23 +224,23 @@ def test_send_extension_metrics_telemetry_for_empty_cgroup(self, patch_poll_all_ self.assertEqual(0, patch_add_metric.call_count) @patch('azurelinuxagent.common.event.EventLogger.add_metric') - @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") + @patch("azurelinuxagent.ga.memorycontroller.MemoryControllerV1.get_memory_usage") @patch('azurelinuxagent.common.logger.Logger.periodic_warn') def 
test_send_extension_metrics_telemetry_handling_memory_cgroup_exceptions_errno2(self, patch_periodic_warn, # pylint: disable=unused-argument - patch_get_memory_usage, + get_memory_usage, patch_add_metric, *args): ioerror = IOError() ioerror.errno = 2 - patch_get_memory_usage.side_effect = ioerror + get_memory_usage.side_effect = ioerror - CGroupsTelemetry._tracked["/test/path"] = MemoryMetrics("_cgroup_name", "/test/path") + CGroupsTelemetry._tracked["/test/path"] = MemoryControllerV1("_cgroup_name", "/test/path") PollResourceUsage().run() self.assertEqual(0, patch_periodic_warn.call_count) self.assertEqual(0, patch_add_metric.call_count) # No metrics should be sent. @patch('azurelinuxagent.common.event.EventLogger.add_metric') - @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") + @patch("azurelinuxagent.ga.cpucontroller.CpuControllerV1.get_cpu_usage") @patch('azurelinuxagent.common.logger.Logger.periodic_warn') def test_send_extension_metrics_telemetry_handling_cpu_cgroup_exceptions_errno2(self, patch_periodic_warn, # pylint: disable=unused-argument patch_cpu_usage, patch_add_metric, @@ -247,7 +249,7 @@ def test_send_extension_metrics_telemetry_handling_cpu_cgroup_exceptions_errno2( ioerror.errno = 2 patch_cpu_usage.side_effect = ioerror - CGroupsTelemetry._tracked["/test/path"] = CpuMetrics("_cgroup_name", "/test/path") + CGroupsTelemetry._tracked["/test/path"] = CpuControllerV1("_cgroup_name", "/test/path") PollResourceUsage().run() self.assertEqual(0, patch_periodic_warn.call_count) diff --git a/tests/test_agent.py b/tests/test_agent.py index df1a7ca13..7073af6b7 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -24,10 +24,10 @@ from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.ga import logcollector, cgroupconfigurator from azurelinuxagent.common.utils import fileutil -from azurelinuxagent.ga.cgroupapi import InvalidCgroupMountpointException, CgroupV1 +from azurelinuxagent.ga.cgroupapi import InvalidCgroupMountpointException, CgroupV1, CgroupV2 from azurelinuxagent.ga.collect_logs import CollectLogsHandler -from azurelinuxagent.ga.controllermetrics import AGENT_LOG_COLLECTOR -from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment +from azurelinuxagent.ga.cgroupcontroller import AGENT_LOG_COLLECTOR +from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, mock_cgroup_v2_environment from tests.lib.tools import AgentTestCase, data_dir, Mock, patch EXPECTED_CONFIGURATION = \ @@ -48,10 +48,12 @@ Debug.CgroupMonitorExpiryTime = 2022-03-31 Debug.CgroupMonitorExtensionName = Microsoft.Azure.Monitor.AzureMonitorLinuxAgent Debug.EnableAgentMemoryUsageCheck = False +Debug.EnableCgroupV2ResourceLimiting = False Debug.EnableFastTrack = True Debug.EnableGAVersioning = True Debug.EtpCollectionPeriod = 300 Debug.FirewallRulesLogPeriod = 86400 +Debug.LogCollectorInitialDelay = 300 DetectScvmmEnv = False EnableOverProvisioning = True Extension.LogDir = /var/log/azure @@ -232,7 +234,7 @@ def test_rejects_invalid_log_collector_mode(self, mock_exit, mock_stderr): # py @patch("azurelinuxagent.agent.LogCollector") def test_calls_collect_logs_with_proper_mode(self, mock_log_collector, *args): # pylint: disable=unused-argument agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) - mock_log_collector.run = Mock() + mock_log_collector.return_value.collect_logs_and_get_archive.return_value = (Mock(), Mock()) # LogCollector.collect_logs_and_get_archive returns a tuple 
agent.collect_logs(is_full_mode=True) full_mode = mock_log_collector.call_args_list[0][0][0] @@ -246,7 +248,7 @@ def test_calls_collect_logs_with_proper_mode(self, mock_log_collector, *args): def test_calls_collect_logs_on_valid_cgroups_v1(self, mock_log_collector): try: CollectLogsHandler.enable_monitor_cgroups_check() - mock_log_collector.run = Mock() + mock_log_collector.return_value.collect_logs_and_get_archive.return_value = (Mock(), Mock()) # LogCollector.collect_logs_and_get_archive returns a tuple # Mock cgroup so process is in the log collector slice def mock_cgroup(*args, **kwargs): # pylint: disable=W0613 @@ -274,6 +276,33 @@ def mock_cgroup(*args, **kwargs): # pylint: disable=W0613 finally: CollectLogsHandler.disable_monitor_cgroups_check() + @patch("azurelinuxagent.agent.LogCollector") + def test_calls_collect_logs_on_valid_cgroups_v2(self, mock_log_collector): + try: + CollectLogsHandler.enable_monitor_cgroups_check() + mock_log_collector.return_value.collect_logs_and_get_archive.return_value = ( + Mock(), Mock()) # LogCollector.collect_logs_and_get_archive returns a tuple + + # Mock cgroup so process is in the log collector slice + def mock_cgroup(*args, **kwargs): # pylint: disable=W0613 + relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) + return CgroupV2( + cgroup_name=AGENT_LOG_COLLECTOR, + root_cgroup_path="/sys/fs/cgroup", + cgroup_path="/sys/fs/cgroup/{0}".format(relative_path), + enabled_controllers=["cpu", "memory"] + ) + + with mock_cgroup_v2_environment(self.tmp_dir): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_process_cgroup", side_effect=mock_cgroup): + agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + agent.collect_logs(is_full_mode=True) + + mock_log_collector.assert_called_once() + + finally: + CollectLogsHandler.disable_monitor_cgroups_check() + @patch("azurelinuxagent.agent.LogCollector") def test_doesnt_call_collect_logs_when_cgroup_api_cannot_be_determined(self, mock_log_collector): try: @@ -336,6 +365,39 @@ def raise_on_sys_exit(*args): finally: CollectLogsHandler.disable_monitor_cgroups_check() + @patch("azurelinuxagent.agent.LogCollector") + def test_doesnt_call_collect_logs_on_invalid_cgroups_v2(self, mock_log_collector): + try: + CollectLogsHandler.enable_monitor_cgroups_check() + mock_log_collector.run = Mock() + + # Mock cgroup so process is in incorrect slice + def mock_cgroup(*args, **kwargs): # pylint: disable=W0613 + relative_path = "NOT_THE_CORRECT_PATH" + return CgroupV2( + cgroup_name=AGENT_LOG_COLLECTOR, + root_cgroup_path="/sys/fs/cgroup", + cgroup_path="/sys/fs/cgroup/{0}".format(relative_path), + enabled_controllers=["cpu", "memory"] + ) + + def raise_on_sys_exit(*args): + raise RuntimeError(args[0] if args else "Exiting") + + with mock_cgroup_v2_environment(self.tmp_dir): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_process_cgroup", + side_effect=mock_cgroup): + agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + + with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: + try: + agent.collect_logs(is_full_mode=True) + except RuntimeError as re: + self.assertEqual(logcollector.INVALID_CGROUPS_ERRCODE, re.args[0]) + mock_exit.assert_called_once_with(logcollector.INVALID_CGROUPS_ERRCODE) + finally: + CollectLogsHandler.disable_monitor_cgroups_check() + @patch('azurelinuxagent.agent.get_cgroup_api', side_effect=InvalidCgroupMountpointException("Test")) 
@patch("azurelinuxagent.agent.LogCollector") def test_doesnt_call_collect_logs_on_non_systemd_cgroups_v1_mountpoints(self, mock_log_collector, _): @@ -393,7 +455,41 @@ def raise_on_sys_exit(*args): mock_exit.assert_called_once_with(logcollector.INVALID_CGROUPS_ERRCODE) finally: CollectLogsHandler.disable_monitor_cgroups_check() - + + @patch("azurelinuxagent.agent.LogCollector") + @patch("azurelinuxagent.ga.collect_logs.LogCollectorMonitorHandler.get_max_recorded_metrics") + def test_collect_log_should_output_resource_usage_summary(self, mock_get_metrics_summary, mock_log_collector): + try: + CollectLogsHandler.enable_monitor_cgroups_check() + mock_log_collector.return_value.collect_logs_and_get_archive.return_value = (Mock(), Mock()) # LogCollector.collect_logs_and_get_archive returns a tuple + mock_get_metrics_summary.return_value = "metric summary" + + # Mock cgroup so process is in the log collector slice + def mock_cgroup(*args, **kwargs): # pylint: disable=W0613 + relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) + return CgroupV1( + cgroup_name=AGENT_LOG_COLLECTOR, + controller_mountpoints={ + 'cpu,cpuacct': "/sys/fs/cgroup/cpu,cpuacct", + 'memory': "/sys/fs/cgroup/memory" + }, + controller_paths={ + 'cpu,cpuacct': "/sys/fs/cgroup/cpu,cpuacct/{0}".format(relative_path), + 'memory': "/sys/fs/cgroup/memory/{0}".format(relative_path) + } + ) + + with mock_cgroup_v1_environment(self.tmp_dir): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup", side_effect=mock_cgroup): + agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) + agent.collect_logs(is_full_mode=True) + + mock_log_collector.assert_called_once() + mock_get_metrics_summary.assert_called_once() + + finally: + CollectLogsHandler.disable_monitor_cgroups_check() + def test_it_should_parse_setup_firewall_properly(self): test_firewall_meta = { diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index dfed709af..b96cc5107 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -54,6 +54,7 @@ variable: - publish_hostname - recover_network_interface - cgroup_v2_disabled + - log_collector # # Additional arguments pass to the test suites diff --git a/tests_e2e/test_suites/images.yml b/tests_e2e/test_suites/images.yml index 976c987f9..d7d578cba 100644 --- a/tests_e2e/test_suites/images.yml +++ b/tests_e2e/test_suites/images.yml @@ -55,6 +55,19 @@ image-sets: - "oracle_610" - "rhel_610" + # These are the distros which have periodic log collector support. + log-collector-endorsed: + - "centos_82" + - "rhel_82" + - "ubuntu_1604" + - "ubuntu_1804" + - "ubuntu_2004" + - "ubuntu_2204" + - "ubuntu_2204_minimal" + - "ubuntu_2204_arm64" + - "ubuntu_2404" + - "ubuntu_2404_minimal" + # # An image can be specified by a string giving its urn, as in # diff --git a/tests_e2e/test_suites/log_collector.yml b/tests_e2e/test_suites/log_collector.yml new file mode 100644 index 000000000..496198f92 --- /dev/null +++ b/tests_e2e/test_suites/log_collector.yml @@ -0,0 +1,8 @@ +# +# This test is used to verify that the log collector logs the expected behavior on periodic runs. 
+# +name: "LogCollector" +tests: + - "log_collector/log_collector.py" +images: + - "random(log-collector-endorsed, 1)" diff --git a/tests_e2e/tests/log_collector/log_collector.py b/tests_e2e/tests/log_collector/log_collector.py new file mode 100755 index 000000000..c98a33ae8 --- /dev/null +++ b/tests_e2e/tests/log_collector/log_collector.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import re +import time + +from assertpy import fail + +import tests_e2e.tests.lib.logging +from azurelinuxagent.common.utils.shellutil import CommandError +from tests_e2e.tests.lib.agent_test import AgentVmTest + + +class LogCollector(AgentVmTest): + """ + Tests that the log collector logs the expected behavior on periodic runs. + """ + def run(self): + ssh_client = self._context.create_ssh_client() + ssh_client.run_command("update-waagent-conf Logs.Collect=y Debug.EnableCgroupV2ResourceLimiting=y Debug.LogCollectorInitialDelay=60", use_sudo=True) + # Wait for log collector to finish uploading logs + for _ in range(3): + time.sleep(90) + try: + ssh_client.run_command("grep 'Successfully uploaded logs' /var/log/waagent.log") + break + except CommandError: + tests_e2e.tests.lib.logging.log.info("The Agent has not finished log collection, will check again after a short delay") + else: + raise Exception("Timeout while waiting for the Agent to finish log collection") + + # Get any agent logs between log collector start and finish + try: + output = ssh_client.run_command( + "sed -n " + + "'/INFO CollectLogsHandler ExtHandler Starting log collection/, /INFO CollectLogsHandler ExtHandler Successfully uploaded logs/p' " + + "/var/log/waagent.log").rstrip().splitlines() + except Exception as e: + raise Exception("Unable to get log collector logs from waagent.log: {0}".format(e)) + + # These logs indicate a successful log collector run with resource enforcement and monitoring + expected = [ + r'.*Starting log collection', + r'.*Using cgroup v\d for resource enforcement and monitoring', + r'.*cpu(,cpuacct)? 
controller for cgroup: azure-walinuxagent-logcollector \[\/sys\/fs\/cgroup(\/cpu,cpuacct)?\/azure.slice\/azure-walinuxagent.slice\/azure-walinuxagent\-logcollector.slice\/collect\-logs.scope\]', + r'.*memory controller for cgroup: azure-walinuxagent-logcollector \[\/sys\/fs\/cgroup(\/memory)?\/azure.slice\/azure-walinuxagent.slice\/azure-walinuxagent\-logcollector.slice\/collect\-logs.scope\]', + r'.*Log collection successfully completed', + r'.*Successfully collected logs', + r'.*Successfully uploaded logs' + ] + + # Filter output to only include relevant log collector logs + lc_logs = [log for log in output if len([pattern for pattern in expected if re.match(pattern, log)]) > 0] + + # Check that all expected logs exist and are in the correct order + indent = lambda lines: "\n".join([f" {ln}" for ln in lines]) + if len(lc_logs) == len(expected) and all([re.match(expected[i], lc_logs[i]) is not None for i in range(len(expected))]): + tests_e2e.tests.lib.logging.log.info("The log collector run completed as expected.\nLog messages:\n%s", indent(lc_logs)) + else: + fail(f"The log collector run did not complete as expected.\nExpected:\n{indent(expected)}\nActual:\n{indent(lc_logs)}") + + ssh_client.run_command("update-waagent-conf Debug.EnableCgroupV2ResourceLimiting=n Debug.LogCollectorInitialDelay=5*60", + use_sudo=True) + + +if __name__ == "__main__": + LogCollector.run_from_command_line() diff --git a/tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py b/tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py index 5dfc55be8..29758d02b 100755 --- a/tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py +++ b/tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py @@ -115,7 +115,7 @@ def check_agent_log_for_metrics() -> bool: if match is not None: processor_time.append(float(match.group(1))) else: - match = re.search(r"Throttled Time\s*\[walinuxagent.service\]\s*=\s*([0-9.]+)", record.message) + match = re.search(r"Throttled Time \(s\)\s*\[walinuxagent.service\]\s*=\s*([0-9.]+)", record.message) if match is not None: throttled_time.append(float(match.group(1))) if len(processor_time) < 1 or len(throttled_time) < 1:
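
Note on the arithmetic exercised above (illustrative only, not part of the patch): the TestCpuControllerV2 cases pin the cgroup v2 CPU percentage to one formula — the delta of usage_usec from cpu.stat (converted from microseconds to seconds) divided by the delta of the system CPU time sample, scaled to a percentage and rounded to three decimals. The names below are hypothetical; the production code is the CpuControllerV2 class in azurelinuxagent/ga/cpucontroller.py, which the mocked /proc/uptime and cpu.stat reads drive.

    def v2_cpu_usage_percent(cgroup_usage_usec_prev, cgroup_usage_usec_curr, system_cpu_prev, system_cpu_curr):
        # usage_usec in cpu.stat is reported in microseconds; convert to seconds first
        cgroup_delta = (cgroup_usage_usec_curr / 1E6) - (cgroup_usage_usec_prev / 1E6)
        system_delta = system_cpu_curr - system_cpu_prev
        return round(100.0 * cgroup_delta / system_delta, 3)

    # t0 -> t1 samples asserted in test_get_cpu_usage_v2_should_return_the_cpu_usage_since_its_last_invocation
    print(v2_cpu_usage_percent(817045397, 819624087, 776968.02, 777350.57))  # roughly 0.674

With the mocked t0 -> t1 samples this works out to roughly 0.674%, which is the value the test recomputes inline and compares against get_cpu_usage().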