-
Notifications
You must be signed in to change notification settings - Fork 376
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add cgroupv2 support for log collector #3188
Changes from all commits
611bf5e
c23faff
3ad51db
01d7c0a
2dc8fe2
a210ca1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -146,7 +146,8 @@ def load_conf_from_file(conf_file_path, conf=__conf__): | |
"Debug.CgroupDisableOnQuotaCheckFailure": True, | ||
"Debug.EnableAgentMemoryUsageCheck": False, | ||
"Debug.EnableFastTrack": True, | ||
"Debug.EnableGAVersioning": True | ||
"Debug.EnableGAVersioning": True, | ||
"Debug.EnableCgroupV2ResourceLimiting": False | ||
} | ||
|
||
|
||
|
@@ -200,7 +201,8 @@ def load_conf_from_file(conf_file_path, conf=__conf__): | |
"Debug.EtpCollectionPeriod": 300, | ||
"Debug.AutoUpdateHotfixFrequency": 14400, | ||
"Debug.AutoUpdateNormalFrequency": 86400, | ||
"Debug.FirewallRulesLogPeriod": 86400 | ||
"Debug.FirewallRulesLogPeriod": 86400, | ||
"Debug.LogCollectorInitialDelay": 5 * 60 | ||
} | ||
|
||
|
||
|
@@ -680,3 +682,20 @@ def get_firewall_rules_log_period(conf=__conf__): | |
NOTE: This option is experimental and may be removed in later versions of the Agent. | ||
""" | ||
return conf.get_int("Debug.FirewallRulesLogPeriod", 86400) | ||
|
||
|
||
def get_enable_cgroup_v2_resource_limiting(conf=__conf__): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The cgroup v2 log collector will be disabled by default. This conf option will be used to opt in. |
||
""" | ||
If True, the agent will enable resource monitoring and enforcement for the log collector on machines using cgroup v2. | ||
NOTE: This option is experimental and may be removed in later versions of the Agent. | ||
""" | ||
return conf.get_switch("Debug.EnableCgroupV2ResourceLimiting", False) | ||
|
||
|
||
def get_log_collector_initial_delay(conf=__conf__): | ||
""" | ||
Determine the initial delay at service start before the first periodic log collection. | ||
|
||
NOTE: This option is experimental and may be removed in later versions of the Agent. | ||
""" | ||
return conf.get_int("Debug.LogCollectorInitialDelay", 5 * 60) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,8 +24,9 @@ | |
|
||
from azurelinuxagent.common import logger | ||
from azurelinuxagent.common.event import WALAEventOperation, add_event | ||
from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics | ||
from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry | ||
from azurelinuxagent.ga.cpucontroller import _CpuController, CpuControllerV1, CpuControllerV2 | ||
from azurelinuxagent.ga.memorycontroller import MemoryControllerV1, MemoryControllerV2 | ||
from azurelinuxagent.common.conf import get_agent_pid_file_path | ||
from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \ | ||
ExtensionOperationError | ||
|
@@ -292,7 +293,7 @@ def _get_controller_mountpoints(): | |
if match is not None: | ||
path = match.group('path') | ||
controller = match.group('controller') | ||
if controller is not None and path is not None and controller in CgroupV1.get_supported_controllers(): | ||
if controller is not None and path is not None and controller in CgroupV1.get_supported_controller_names(): | ||
mount_points[controller] = path | ||
return mount_points | ||
|
||
|
@@ -335,7 +336,7 @@ def _get_process_relative_controller_paths(process_id): | |
if match is not None: | ||
controller = match.group('controller') | ||
path = match.group('path').lstrip('/') if match.group('path') != '/' else None | ||
if path is not None and controller in CgroupV1.get_supported_controllers(): | ||
if path is not None and controller in CgroupV1.get_supported_controller_names(): | ||
conroller_relative_paths[controller] = path | ||
|
||
return conroller_relative_paths | ||
|
@@ -371,7 +372,7 @@ def get_process_cgroup(self, process_id, cgroup_name): | |
controller_paths=process_controller_paths) | ||
|
||
def log_root_paths(self): | ||
for controller in CgroupV1.get_supported_controllers(): | ||
for controller in CgroupV1.get_supported_controller_names(): | ||
mount_point = self._cgroup_mountpoints.get(controller) | ||
if mount_point is None: | ||
log_cgroup_info("The {0} controller is not mounted".format(controller), send_event=False) | ||
|
@@ -402,14 +403,14 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh | |
|
||
log_cgroup_info("Started extension in unit '{0}'".format(scope_name), send_event=False) | ||
|
||
cpu_metrics = None | ||
cpu_controller = None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for this rename; the code cleaned up pretty nicely! |
||
try: | ||
cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name) | ||
cgroup = self.get_cgroup_from_relative_path(cgroup_relative_path, extension_name) | ||
for metrics in cgroup.get_controller_metrics(): | ||
if isinstance(metrics, CpuMetrics): | ||
cpu_metrics = metrics | ||
CGroupsTelemetry.track_cgroup(metrics) | ||
for controller in cgroup.get_controllers(): | ||
if isinstance(controller, _CpuController): | ||
cpu_controller = controller | ||
CGroupsTelemetry.track_cgroup_controller(controller) | ||
|
||
except IOError as e: | ||
if e.errno == 2: # 'No such file or directory' | ||
|
@@ -421,7 +422,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh | |
# Wait for process completion or timeout | ||
try: | ||
return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, | ||
stderr=stderr, error_code=error_code, cpu_metrics=cpu_metrics) | ||
stderr=stderr, error_code=error_code, cpu_controller=cpu_controller) | ||
except ExtensionError as e: | ||
# The extension didn't terminate successfully. Determine whether it was due to systemd errors or | ||
# extension errors. | ||
|
@@ -498,7 +499,7 @@ def _get_controllers_enabled_at_root(root_cgroup_path): | |
enabled_controllers_file = os.path.join(root_cgroup_path, 'cgroup.subtree_control') | ||
if os.path.exists(enabled_controllers_file): | ||
controllers_enabled_at_root = fileutil.read_file(enabled_controllers_file).rstrip().split() | ||
return list(set(controllers_enabled_at_root) & set(CgroupV2.get_supported_controllers())) | ||
return list(set(controllers_enabled_at_root) & set(CgroupV2.get_supported_controller_names())) | ||
return [] | ||
|
||
@staticmethod | ||
|
@@ -546,7 +547,7 @@ def get_process_cgroup(self, process_id, cgroup_name): | |
|
||
def log_root_paths(self): | ||
log_cgroup_info("The root cgroup path is {0}".format(self._root_cgroup_path), send_event=False) | ||
for controller in CgroupV2.get_supported_controllers(): | ||
for controller in CgroupV2.get_supported_controller_names(): | ||
if controller in self._controllers_enabled_at_root: | ||
log_cgroup_info("The {0} controller is enabled at the root cgroup".format(controller), send_event=False) | ||
else: | ||
|
@@ -564,9 +565,9 @@ def __init__(self, cgroup_name): | |
self._cgroup_name = cgroup_name | ||
|
||
@staticmethod | ||
def get_supported_controllers(): | ||
def get_supported_controller_names(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Renamed to 'get_supported_controller_names' to reflect that this returns a list of strings, not controller objects. |
||
""" | ||
Cgroup version specific. Returns a list of the controllers which the agent supports. | ||
Cgroup version specific. Returns a list of the controllers which the agent supports as strings. | ||
""" | ||
raise NotImplementedError() | ||
|
||
|
@@ -578,12 +579,12 @@ def check_in_expected_slice(self, expected_slice): | |
""" | ||
raise NotImplementedError() | ||
|
||
def get_controller_metrics(self, expected_relative_path=None): | ||
def get_controllers(self, expected_relative_path=None): | ||
""" | ||
Cgroup version specific. Returns a list of the metrics for the agent supported controllers which are | ||
mounted/enabled for the cgroup. | ||
Cgroup version specific. Returns a list of the agent supported controllers which are mounted/enabled for the cgroup. | ||
|
||
:param expected_relative_path: The expected relative path of the cgroup. If provided, only metrics for controllers at this expected path will be returned. | ||
:param expected_relative_path: The expected relative path of the cgroup. If provided, only controllers mounted | ||
at this expected path will be returned. | ||
""" | ||
raise NotImplementedError() | ||
|
||
|
@@ -608,7 +609,7 @@ def __init__(self, cgroup_name, controller_mountpoints, controller_paths): | |
self._controller_paths = controller_paths | ||
|
||
@staticmethod | ||
def get_supported_controllers(): | ||
def get_supported_controller_names(): | ||
return [CgroupV1.CPU_CONTROLLER, CgroupV1.MEMORY_CONTROLLER] | ||
|
||
def check_in_expected_slice(self, expected_slice): | ||
|
@@ -620,39 +621,39 @@ def check_in_expected_slice(self, expected_slice): | |
|
||
return in_expected_slice | ||
|
||
def get_controller_metrics(self, expected_relative_path=None): | ||
metrics = [] | ||
def get_controllers(self, expected_relative_path=None): | ||
controllers = [] | ||
|
||
for controller in self.get_supported_controllers(): | ||
controller_metrics = None | ||
controller_path = self._controller_paths.get(controller) | ||
controller_mountpoint = self._controller_mountpoints.get(controller) | ||
for supported_controller_name in self.get_supported_controller_names(): | ||
controller = None | ||
controller_path = self._controller_paths.get(supported_controller_name) | ||
controller_mountpoint = self._controller_mountpoints.get(supported_controller_name) | ||
|
||
if controller_mountpoint is None: | ||
log_cgroup_warning("{0} controller is not mounted; will not track metrics".format(controller), send_event=False) | ||
log_cgroup_warning("{0} controller is not mounted; will not track".format(supported_controller_name), send_event=False) | ||
continue | ||
|
||
if controller_path is None: | ||
log_cgroup_warning("{0} is not mounted for the {1} cgroup; will not track metrics".format(controller, self._cgroup_name), send_event=False) | ||
log_cgroup_warning("{0} is not mounted for the {1} cgroup; will not track".format(supported_controller_name, self._cgroup_name), send_event=False) | ||
continue | ||
|
||
if expected_relative_path is not None: | ||
expected_path = os.path.join(controller_mountpoint, expected_relative_path) | ||
if controller_path != expected_path: | ||
log_cgroup_warning("The {0} controller is not mounted at the expected path for the {1} cgroup; will not track metrics. Actual cgroup path:[{2}] Expected:[{3}]".format(controller, self._cgroup_name, controller_path, expected_path), send_event=False) | ||
log_cgroup_warning("The {0} controller is not mounted at the expected path for the {1} cgroup; will not track. Actual cgroup path:[{2}] Expected:[{3}]".format(supported_controller_name, self._cgroup_name, controller_path, expected_path), send_event=False) | ||
continue | ||
|
||
if controller == self.CPU_CONTROLLER: | ||
controller_metrics = CpuMetrics(self._cgroup_name, controller_path) | ||
elif controller == self.MEMORY_CONTROLLER: | ||
controller_metrics = MemoryMetrics(self._cgroup_name, controller_path) | ||
if supported_controller_name == self.CPU_CONTROLLER: | ||
controller = CpuControllerV1(self._cgroup_name, controller_path) | ||
elif supported_controller_name == self.MEMORY_CONTROLLER: | ||
controller = MemoryControllerV1(self._cgroup_name, controller_path) | ||
|
||
if controller_metrics is not None: | ||
msg = "{0} metrics for cgroup: {1}".format(controller, controller_metrics) | ||
if controller is not None: | ||
msg = "{0} controller for cgroup: {1}".format(supported_controller_name, controller) | ||
log_cgroup_info(msg, send_event=False) | ||
metrics.append(controller_metrics) | ||
controllers.append(controller) | ||
|
||
return metrics | ||
return controllers | ||
|
||
def get_controller_procs_path(self, controller): | ||
controller_path = self._controller_paths.get(controller) | ||
|
@@ -687,7 +688,7 @@ def __init__(self, cgroup_name, root_cgroup_path, cgroup_path, enabled_controlle | |
self._enabled_controllers = enabled_controllers | ||
|
||
@staticmethod | ||
def get_supported_controllers(): | ||
def get_supported_controller_names(): | ||
return [CgroupV2.CPU_CONTROLLER, CgroupV2.MEMORY_CONTROLLER] | ||
|
||
def check_in_expected_slice(self, expected_slice): | ||
|
@@ -697,9 +698,41 @@ def check_in_expected_slice(self, expected_slice): | |
|
||
return True | ||
|
||
def get_controller_metrics(self, expected_relative_path=None): | ||
# TODO - Implement controller metrics for cgroup v2 | ||
raise NotImplementedError() | ||
def get_controllers(self, expected_relative_path=None): | ||
controllers = [] | ||
|
||
for supported_controller_name in self.get_supported_controller_names(): | ||
controller = None | ||
|
||
if supported_controller_name not in self._enabled_controllers: | ||
log_cgroup_warning("{0} controller is not enabled; will not track".format(supported_controller_name), | ||
send_event=False) | ||
continue | ||
|
||
if self._cgroup_path == "": | ||
log_cgroup_warning("Cgroup path for {0} cannot be determined; will not track".format(self._cgroup_name), | ||
send_event=False) | ||
continue | ||
|
||
if expected_relative_path is not None: | ||
expected_path = os.path.join(self._root_cgroup_path, expected_relative_path) | ||
if self._cgroup_path != expected_path: | ||
log_cgroup_warning( | ||
"The {0} cgroup is not mounted at the expected path; will not track. Actual cgroup path:[{1}] Expected:[{2}]".format( | ||
self._cgroup_name, self._cgroup_path, expected_path), send_event=False) | ||
continue | ||
|
||
if supported_controller_name == self.CPU_CONTROLLER: | ||
controller = CpuControllerV2(self._cgroup_name, self._cgroup_path) | ||
elif supported_controller_name == self.MEMORY_CONTROLLER: | ||
controller = MemoryControllerV2(self._cgroup_name, self._cgroup_path) | ||
|
||
if controller is not None: | ||
msg = "{0} controller for cgroup: {1}".format(supported_controller_name, controller) | ||
log_cgroup_info(msg, send_event=False) | ||
controllers.append(controller) | ||
|
||
return controllers | ||
|
||
def get_procs_path(self): | ||
if self._cgroup_path != "": | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have mixed feelings about whether the summary is useful. This message contains a bunch of information, and someone has to parse it to understand the different metrics across the fleet. Don't we already capture this as part of the monitor thread?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't capture log collector metrics in telemetry right now. I'm not sure how we would determine whether these new limits are appropriate without sending events for these metrics.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As part of the LogCollectorMonitorHandler thread, when we poll for metrics, we send them to telemetry as well. I think you used the same data to evaluate the limits. I feel that logging the same data again in a different table is not useful.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How often is this telemetry sent? The log collector process is short in duration. If the polling period is too long, we won't capture enough info; if it is too short, we may be flooding telemetry. A summary seems appropriate.
As for the parsing, should we send this as JSON instead?