Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

agent cpu quota scenario #2937

Merged
merged 4 commits into from
Oct 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion tests_e2e/test_suites/agent_cgroups.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#
# The test suite verify the agent running in expected cgroups and also, checks agent tracking the cgroups for polling resource metrics.
# The test suite verify the agent running in expected cgroups and also, checks agent tracking the cgroups for polling resource metrics. Also, it verifies the agent cpu quota is set as expected.
#
name: "AgentCgroups"
tests:
- "agent_cgroups/agent_cgroups.py"
- "agent_cgroups/agent_cpu_quota.py"
images: "cgroups-endorsed"
9 changes: 8 additions & 1 deletion tests_e2e/test_suites/images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,11 @@ images:
AzureChinaCloud: []
centos_610: "OpenLogic CentOS 6.10 latest"
centos_79: "OpenLogic CentOS 7_9 latest"
centos_82: "OpenLogic CentOS 8_2 latest"
centos_82:
urn: "OpenLogic CentOS 8_2 latest"
vm_sizes:
# Since centos derived from redhat, please see the comment for vm size in rhel_82
- "Standard_B2s"
narrieta marked this conversation as resolved.
Show resolved Hide resolved
debian_8: "credativ Debian 8 latest"
debian_9: "credativ Debian 9 latest"
debian_10: "Debian debian-10 10 latest"
Expand Down Expand Up @@ -126,6 +130,9 @@ images:
urn: "RedHat RHEL 8.2 latest"
locations:
AzureChinaCloud: []
vm_sizes:
# Previously one user reported agent hang on this VM size for redhat 7+ but not observed in rhel 8. So I'm using same vm size to test agent cgroups scenario for rhel 8 to make sure we don't see any issue in automation.
- "Standard_B2s"
narrieta marked this conversation as resolved.
Show resolved Hide resolved
rhel_90:
urn: "RedHat RHEL 9_0 latest"
locations:
Expand Down
39 changes: 39 additions & 0 deletions tests_e2e/tests/agent_cgroups/agent_cpu_quota.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import List, Dict, Any

from tests_e2e.tests.lib.agent_test import AgentTest
from tests_e2e.tests.lib.logging import log


class AgentCPUQuota(AgentTest):
"""
The test verify that the agent detects when it is throttled for using too much CPU, that it detects processes that do belong to the agent's cgroup, and that resource metrics are generated.
"""
def __init__(self, context):
super().__init__(context)
self._ssh_client = self._context.create_ssh_client()

def run(self):
log.info("=====Validating agent cpu quota checks")
self._run_remote_test("agent_cpu_quota-check_agent_cpu_quota.py", use_sudo=True)
log.info("Successfully Verified that agent running in expected CPU quotas")

def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
ignore_rules = [
# This is produced by the test, so it is expected
# Examples:
# 2023-10-03T17:59:03.007572Z INFO MonitorHandler ExtHandler [CGW] Disabling resource usage monitoring. Reason: Check on cgroups failed:
# [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 3190] /usr/bin/python3\x00/home/azureuser/bin/agent_cpu_quota-start_servi', '[PID: 3293] dd\x00if=/dev/zero\x00of=/dev/null\x00']
# [CGroupsException] The agent has been throttled for 5.7720997 seconds
{'message': r"Disabling resource usage monitoring. Reason: Check on cgroups failed"},
# This may happen during service stop while terminating the process
# Example:
# 2022-03-11T21:11:11.713161Z ERROR E2ETest [Errno 3] No such process:
{'message': r'E2ETest.*No such process'},
# 2022-10-26T15:38:39.655677Z ERROR E2ETest 'dd if=/dev/zero of=/dev/null' failed: -15 ():
{'message': r"E2ETest.*dd.*failed: -15"}
]
return ignore_rules


if __name__ == "__main__":
AgentCPUQuota.run_from_command_line()
1 change: 1 addition & 0 deletions tests_e2e/tests/lib/cgroup_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def check_agent_quota_disabled():
Returns True if the cpu quota is infinity
"""
cpu_quota = get_agent_cpu_quota()
# the quota can be expressed as seconds (s) or milliseconds (ms); no quota is expressed as "infinity"
return cpu_quota == 'infinity'


Expand Down
213 changes: 213 additions & 0 deletions tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#!/usr/bin/env pypy3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import datetime
import os
import re
import shutil
import time

from assertpy import fail

from azurelinuxagent.common.osutil import systemd
from azurelinuxagent.common.utils import shellutil
from azurelinuxagent.ga.cgroupconfigurator import _DROP_IN_FILE_CPU_QUOTA
from tests_e2e.tests.lib.agent_log import AgentLog
from tests_e2e.tests.lib.cgroup_helpers import check_agent_quota_disabled, \
get_agent_cpu_quota
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.remote_test import run_remote_test
from tests_e2e.tests.lib.retry import retry_if_false


def prepare_agent():
# This function prepares the agent:
# 1) It modifies the service unit file to wrap the agent process with a script that starts the actual agent and then
# launches an instance of the dummy process to consume the CPU. Since all these processes are in the same cgroup,
# this has the same effect as the agent itself consuming the CPU.
#
# The process tree is similar to
#
# /usr/bin/python3 /home/azureuser/bin/agent_cpu_quota-start_service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon
# ├─/usr/bin/python3 -u /usr/sbin/waagent -daemon
# │ └─python3 -u bin/WALinuxAgent-9.9.9.9-py3.8.egg -run-exthandlers
# │ └─4*[{python3}]
# ├─dd if=/dev/zero of=/dev/null
# │
# └─{python3}
#
# And the agent's cgroup looks like
#
# CGroup: /azure.slice/walinuxagent.service
# ├─10507 /usr/bin/python3 /home/azureuser/bin/agent_cpu_quota-start_service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon
# ├─10508 /usr/bin/python3 -u /usr/sbin/waagent -daemon
# ├─10516 python3 -u bin/WALinuxAgent-9.9.9.9-py3.8.egg -run-exthandlers
# ├─10711 dd if=/dev/zero of=/dev/null
#
# 2) It turns on a few debug flags and resart the agent
log.info("***Preparing agent for testing cpu quota")
#
# Create a drop in file to wrap "start-service.py" around the actual agent: This will ovveride the ExecStart line in the agent's unit file
#
# ExecStart= (need to be empty to clear the original ExecStart)
# ExecStart=/home/.../agent_cgroups-start-service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon
#
service_file = systemd.get_agent_unit_file()
exec_start = None
with open(service_file, "r") as file_:
for line in file_:
match = re.match("ExecStart=(.+)", line)
if match is not None:
exec_start = match.group(1)
break
else:
file_.seek(0)
raise Exception("Could not find ExecStart in {0}\n:{1}".format(service_file, file_.read()))
agent_python = exec_start.split()[0]
current_directory = os.path.dirname(os.path.abspath(__file__))
start_service_script = os.path.join(current_directory, "agent_cpu_quota-start_service.py")
drop_in_file = os.path.join(systemd.get_agent_drop_in_path(), "99-ExecStart.conf")
log.info("Creating %s...", drop_in_file)
with open(drop_in_file, "w") as file_:
file_.write("""
[Service]
ExecStart=
ExecStart={0} {1} {2}
""".format(agent_python, start_service_script, exec_start))
log.info("Executing daemon-reload")
shellutil.run_command(["systemctl", "daemon-reload"])

# Disable all checks on cgroups and enable log metrics every 20 sec
log.info("Executing script update-waagent-conf to enable agent cgroups config flag")
result = shellutil.run_command(["update-waagent-conf", "Debug.CgroupCheckPeriod=20", "Debug.CgroupLogMetrics=y",
"Debug.CgroupDisableOnProcessCheckFailure=n", "Debug.CgroupDisableOnQuotaCheckFailure=n"])
log.info("Successfully enabled agent cgroups config flag: {0}".format(result))


def verify_agent_reported_metrics():
"""
This method verifies that the agent reports % Processor Time and Throttled Time metrics
"""
log.info("** Verifying agent reported metrics")
log.info("Parsing agent log for metrics")
processor_time = []
throttled_time = []

def check_agent_log_for_metrics() -> bool:
for record in AgentLog().read():
match = re.search(r"% Processor Time\s*\[walinuxagent.service\]\s*=\s*([0-9.]+)", record.message)
if match is not None:
processor_time.append(float(match.group(1)))
else:
match = re.search(r"Throttled Time\s*\[walinuxagent.service\]\s*=\s*([0-9.]+)", record.message)
if match is not None:
throttled_time.append(float(match.group(1)))
if len(processor_time) < 1 or len(throttled_time) < 1:
return False
return True

found: bool = retry_if_false(check_agent_log_for_metrics)
if found:
log.info("%% Processor Time: %s", processor_time)
log.info("Throttled Time: %s", throttled_time)
log.info("Successfully verified agent reported resource metrics")
else:
fail(
"The agent doesn't seem to be collecting % Processor Time and Throttled Time metrics. Agent found Processor Time: {0}, Throttled Time: {1}".format(
processor_time, throttled_time))


def wait_for_log_message(message, timeout=datetime.timedelta(minutes=5)):
log.info("Checking agent's log for message matching [%s]", message)
start_time = datetime.datetime.now()
while datetime.datetime.now() - start_time <= timeout:
for record in AgentLog().read():
match = re.search(message, record.message, flags=re.DOTALL)
if match is not None:
log.info("Found message:\n\t%s", record.text.replace("\n", "\n\t"))
return
time.sleep(30)
fail("The agent did not find [{0}] in its log within the allowed timeout".format(message))


def verify_process_check_on_agent_cgroups():
"""
This method checks agent detect unexpected processes in its cgroup and disables the CPUQuota
"""
log.info("***Verifying process check on agent cgroups")
log.info("Ensuring agent CPUQuota is enabled and backup the drop-in file to restore later in further tests")
if check_agent_quota_disabled():
fail("The agent's CPUQuota is not enabled: {0}".format(get_agent_cpu_quota()))
quota_drop_in = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA)
quota_drop_in_backup = quota_drop_in + ".bk"
log.info("Backing up %s to %s...", quota_drop_in, quota_drop_in_backup)
shutil.copy(quota_drop_in, quota_drop_in_backup)
#
# Re-enable Process checks on cgroups and verify that the agent detects unexpected processes in its cgroup and disables the CPUQuota wehen
# that happens
#
shellutil.run_command(["update-waagent-conf", "Debug.CgroupDisableOnProcessCheckFailure=y"])

# The log message indicating the check failed is similar to
# 2021-03-29T23:33:15.603530Z INFO MonitorHandler ExtHandler Disabling resource usage monitoring. Reason: Check on cgroups failed:
# [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 25826] python3\x00/home/nam/Compute-Runtime-Tux-Pipeline/dungeon_crawler/s']
wait_for_log_message(
"Disabling resource usage monitoring. Reason: Check on cgroups failed:.+The agent's cgroup includes unexpected processes")
if not check_agent_quota_disabled():
fail("The agent did not disable its CPUQuota: {0}".format(get_agent_cpu_quota()))


def verify_throttling_time_check_on_agent_cgroups():
"""
This method checks agent disables its CPUQuota when it exceeds its throttling limit
"""
log.info("***Verifying CPU throttling check on agent cgroups")
# Now disable the check on unexpected processes and enable the check on throttledtime and verify that the agent disables its CPUQuota when it exceeds its throttling limit
log.info("Re-enabling CPUQuota...")
quota_drop_in = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA)
quota_drop_in_backup = quota_drop_in + ".bk"
log.info("Restoring %s from %s...", quota_drop_in, quota_drop_in_backup)
shutil.copy(quota_drop_in_backup, quota_drop_in)
shellutil.run_command(["systemctl", "daemon-reload"])
shellutil.run_command(["update-waagent-conf", "Debug.CgroupDisableOnProcessCheckFailure=n", "Debug.CgroupDisableOnQuotaCheckFailure=y", "Debug.AgentCpuThrottledTimeThreshold=5"])

# The log message indicating the check failed is similar to
# 2021-04-01T20:47:55.892569Z INFO MonitorHandler ExtHandler Disabling resource usage monitoring. Reason: Check on cgroups failed:
# [CGroupsException] The agent has been throttled for 121.339916938 seconds
#
# After we need to wait for a little longer for the agent to update systemd:
# 2021-04-14T01:51:44.399860Z INFO MonitorHandler ExtHandler Executing systemctl daemon-reload...
#
wait_for_log_message(
"Disabling resource usage monitoring. Reason: Check on cgroups failed:.+The agent has been throttled",
timeout=datetime.timedelta(minutes=10))
wait_for_log_message("Stopped tracking cgroup walinuxagent.service", timeout=datetime.timedelta(minutes=10))
wait_for_log_message("Executing systemctl daemon-reload...", timeout=datetime.timedelta(minutes=5))
if not check_agent_quota_disabled():
fail("The agent did not disable its CPUQuota: {0}".format(get_agent_cpu_quota()))


def main():
prepare_agent()
verify_agent_reported_metrics()
verify_process_check_on_agent_cgroups()
verify_throttling_time_check_on_agent_cgroups()


run_remote_test(main)
96 changes: 96 additions & 0 deletions tests_e2e/tests/scripts/agent_cpu_quota-start_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env pypy3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script starts the actual agent and then launches an instance of the dummy process periodically to consume the CPU
#
import signal
import subprocess
import sys
import threading
import time
import traceback

from azurelinuxagent.common import logger


class CpuConsumer(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self._stopped = False

def run(self):
threading.current_thread().setName("*Stress*")

while not self._stopped:
try:
# Dummy operation(reads empty streams and drops) which creates load on the CPU
dd_command = ["dd", "if=/dev/zero", "of=/dev/null"]
logger.info("Starting dummy dd command: {0} to stress CPU", ' '.join(dd_command))
subprocess.Popen(dd_command)
logger.info("dd command completed; sleeping...")
i = 0
while i < 30 and not self._stopped:
time.sleep(1)
i += 1
except Exception as exception:
logger.error("{0}:\n{1}", exception, traceback.format_exc())

def stop(self):
self._stopped = True


try:
threading.current_thread().setName("*StartService*")
logger.set_prefix("E2ETest")
logger.add_logger_appender(logger.AppenderType.FILE, logger.LogLevel.INFO, "/var/log/waagent.log")

agent_command_line = sys.argv[1:]

logger.info("Starting Agent: {0}", ' '.join(agent_command_line))
agent_process = subprocess.Popen(agent_command_line)

# sleep a little to give the agent a chance to initialize
time.sleep(15)

cpu_consumer = CpuConsumer()
cpu_consumer.start()


def forward_signal(signum, _):
if signum == signal.SIGTERM:
logger.info("Stopping stress thread...")
cpu_consumer.stop()
logger.info("Forwarding signal {0} to Agent", signum)
agent_process.send_signal(signum)


signal.signal(signal.SIGTERM, forward_signal)

agent_process.wait()
logger.info("Agent completed")

cpu_consumer.stop()
cpu_consumer.join()
logger.info("Stress completed")

logger.info("Exiting...")
sys.exit(agent_process.returncode)

except Exception as exception:
logger.error("Unexpected error occurred while starting agent service : {0}", exception)
raise