Skip to content

Commit

Permalink
agent: capture output from enter commands, handle exceptions
Browse files Browse the repository at this point in the history
Exceptions from _enter_maintenance don't bubble up anymore but are
logged and treated like temporary failures now.

Output from enter commands is now shown directly on the trace log level
and added to the exception if the command fails or logged when
postpone/tempfail is requested.

PL-131813
  • Loading branch information
dpausp committed Nov 9, 2023
1 parent ad0e262 commit e4593f7
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 54 deletions.
12 changes: 8 additions & 4 deletions pkgs/fc/agent/fc/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def agent_maintenance_config(tmp_path):


@fixture
def reqmanager(tmp_path, agent_maintenance_config):
def reqmanager(tmp_path, logger, agent_maintenance_config):
spooldir = tmp_path / "maintenance"
spooldir.mkdir()
enc_path = tmp_path / "enc.json"
Expand All @@ -44,6 +44,7 @@ def reqmanager(tmp_path, agent_maintenance_config):
spooldir=spooldir,
enc_path=enc_path,
config_file=agent_maintenance_config,
log=logger,
) as rm:
yield rm

Expand All @@ -70,9 +71,12 @@ def _request_population(n):

@fixture
def logger():
_logger = structlog.get_logger()
_logger.trace = lambda *a, **k: None
return _logger
# pytest-structlog patches away structlog.config, but we can still use
# structlog._config.configure...
# We need to do that to support our custom `trace` logging method.
# It fails with BoundLoggingFilteringAtNotset which is the default wrapper.
structlog._config.configure(wrapper_class=structlog.BoundLogger)
return structlog.get_logger()


@fixture
Expand Down
87 changes: 69 additions & 18 deletions pkgs/fc/agent/fc/maintenance/reqmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@
import structlog
from fc.maintenance.activity import RebootType
from fc.util.checks import CheckResult
from fc.util.subprocess_helper import get_popen_stdout_lines
from fc.util.time_date import format_datetime, utcnow
from rich.table import Table

from . import state
from .request import Request, RequestMergeResult
from .state import ARCHIVE, EXIT_POSTPONE, EXIT_TEMPFAIL, State

Expand Down Expand Up @@ -560,34 +562,66 @@ def _enter_maintenance(self):
for name, command in self.config["maintenance-enter"].items():
if not command.strip():
continue
self.log.info(
"enter-maintenance-subsystem", subsystem=name, command=command

log = self.log.bind(
subsystem=name,
)
try:
# XXX: capture output
subprocess.run(command, shell=True, check=True)
except subprocess.CalledProcessError as e:
if e.returncode == EXIT_POSTPONE:
self.log.info(

proc = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
shell=True,
text=True,
)
log.info(
"enter-maintenance-cmd",
_replace_msg=(
"{subsystem}: Maintenance enter command started with PID "
"{cmd_pid}: `{command}`"
),
command=command,
cmd_pid=proc.pid,
)

stdout_lines = get_popen_stdout_lines(
proc, log, "enter-maintenance-out"
)
stdout = "".join(stdout_lines)
proc.wait()

match proc.returncode:
case 0:
log.debug("enter-maintenance-cmd-success")
case state.EXIT_POSTPONE:
log.info(
"enter-maintenance-postpone",
command=command,
_replace_msg=(
"Command `{command}` requested to postpone all requests."
"Command `{command}` requested to postpone all "
"requests."
),
)
log.debug("enter-maintenance-postpone-out", stdout=stdout)
postpone_seen = True
elif e.returncode == EXIT_TEMPFAIL:
self.log.info(
case state.EXIT_TEMPFAIL:
log.info(
"enter-maintenance-tempfail",
command=command,
_replace_msg=(
"Command `{command}` failed temporarily. "
"Requests should be tried again next time."
),
)
log.debug("enter-maintenance-tempfail-out", stdout=stdout)
tempfail_seen = True
else:
raise
case error:
log.error(
"enter-maintenance-fail",
command=command,
exit_code=error,
)
raise subprocess.CalledProcessError(error, command, stdout)

if postpone_seen:
raise PostponeMaintenance()
Expand Down Expand Up @@ -749,8 +783,8 @@ def _handle_enter_tempfail(
self.log.warn(
"execute-requests-force",
_replace_msg=(
"Due requests will be executed regardless of the temporary failure "
"of a maintenance enter command."
"Due requests will be executed regardless of the "
"(temporary) failure of a maintenance enter command."
),
)
return HandleEnterExceptionResult()
Expand All @@ -760,8 +794,8 @@ def _handle_enter_tempfail(
"run-all-now-force",
_replace_msg=(
"Run all mode requested and force mode activated: "
"All requests will be executed now regardless of the temporary "
"failure of a maintenance enter command."
"All requests will be executed now regardless of the "
"(temporary) failure of a maintenance enter command."
),
)
return HandleEnterExceptionResult()
Expand Down Expand Up @@ -847,6 +881,9 @@ def execute(self, run_all_now: bool = False, force_run: bool = False):
are ignored and requests are run regardless. This also runs requests in state
'success' again when they are still in the queue after a recent system reboot.
"""
self.log.debug(
"execute-start", run_all_now=run_all_now, force_run=force_run
)

runnable_requests = self._runnable(run_all_now, force_run)
if not runnable_requests:
Expand All @@ -871,7 +908,21 @@ def execute(self, run_all_now: bool = False, force_run: bool = False):
except TempfailMaintenance:
res = self._handle_enter_tempfail(run_all_now, force_run)
if res.exit:
# Stay in maintenance mode.
# Stay in maintenance mode as we expect the temporary failure
# to go away on the next agent run.
self._write_stats_for_execute()
return

except Exception:
# Other exceptions are similar to tempfail, just with additional
# logging. Could an error from a enter command, from the directory
# or an internal one.
self.log.error("execute-enter-maintenance-failed", exc_info=True)
res = self._handle_enter_tempfail(run_all_now, force_run)
if res.exit:
# Might already be in maintenance or not, depending on where
# _enter_maintenance failed. That's ok, the next agent
# run can continue in either case.
self._write_stats_for_execute()
return

Expand Down
68 changes: 36 additions & 32 deletions pkgs/fc/agent/fc/maintenance/tests/test_reqmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,10 +233,20 @@ def test_enter_maintenance(log, reqmanager, monkeypatch):
reqmanager._enter_maintenance()

assert log.has("enter-maintenance")
assert log.has(
"enter-maintenance-cmd",
subsystem="demo",
command='echo "entering demo"',
)
assert log.has("enter-maintenance-cmd-success", subsystem="demo")
connect_mock().mark_node_service_status.assert_called_with("host", False)
maintenance_entered_at = reqmanager.maintenance_marker_path.read_text()
assert maintenance_entered_at == "2016-04-20T11:00:00+00:00"

import rich

rich.print(log.events)


def test_enter_maintenance_postpone(log, reqmanager, monkeypatch):
monkeypatch.setattr(
Expand Down Expand Up @@ -264,7 +274,7 @@ def test_enter_maintenance_tempfail(log, reqmanager, monkeypatch):
assert reqmanager.maintenance_marker_path.exists()


def test_execute_postpone(log, reqmanager, monkeypatch):
def test_execute_postpone(log, reqmanager):
req = reqmanager.add(Request(Activity(), 1))
req.state = State.due
req.execute = Mock()
Expand All @@ -280,9 +290,10 @@ def enter_maintenance_postpone():
assert req.state == State.postpone
assert not req.execute.called
reqmanager._leave_maintenance.assert_called()
log.has("execute-requests-postpone")


def test_execute_tempfail(log, reqmanager, monkeypatch):
def test_execute_tempfail(log, reqmanager):
req = reqmanager.add(Request(Activity(), 1))
req.state = State.due
req.execute = Mock()
Expand All @@ -298,51 +309,45 @@ def enter_maintenance_tempfail():
assert req.state == State.due
assert not req.execute.called
reqmanager._leave_maintenance.assert_not_called()
log.has("execute-requests-tempfail")


@unittest.mock.patch("subprocess.run")
@unittest.mock.patch("fc.util.directory.connect")
def test_execute_activity_no_reboot(connect, run, reqmanager, log):
def test_execute_activity_no_reboot(reqmanager, log):
req = reqmanager.add(Request(Activity(), 1))
reqmanager._runnable = lambda run_all_now, force_run: [req]
reqmanager._enter_maintenance = Mock()
reqmanager._leave_maintenance = Mock()

reqmanager.execute()
run.assert_has_calls(
[
call('echo "entering demo"', shell=True, check=True),
call('echo "leaving demo"', shell=True, check=True),
]
)
assert log.has("enter-maintenance")
assert log.has("leave-maintenance")

reqmanager._enter_maintenance.assert_called_once()
reqmanager._leave_maintenance.assert_called_once()
assert log.has("no-reboot-requested")


def test_execute_activity_with_reboot(reqmanager, log, monkeypatch):
monkeypatch.setattr("time.sleep", sleep := Mock())
monkeypatch.setattr("subprocess.run", run := Mock())
reqmanager._enter_maintenance = Mock()
reqmanager._leave_maintenance = Mock()

@unittest.mock.patch("subprocess.run")
@unittest.mock.patch("fc.util.directory.connect")
@unittest.mock.patch("time.sleep")
def test_execute_activity_with_reboot(
sleep: Mock, connect, run: Mock, reqmanager, log
):
activity = Activity()
activity.reboot_needed = RebootType.WARM
req = reqmanager.add(Request(activity, 1))
req.state = State.due
reqmanager._runnable = lambda run_all_now, force_run: [req]

with pytest.raises(SystemExit) as e:
reqmanager.execute(run_all_now=True)
reqmanager.execute()

assert e.value.code == 0

run.assert_has_calls(
[
call('echo "entering demo"', shell=True, check=True),
call("reboot", check=True, capture_output=True, text=True),
]
)
reqmanager._enter_maintenance.assert_called_once()

sleep.assert_called_once_with(5)

assert log.has("enter-maintenance")
run.assert_called_once()
# Should stay in maintenance mode during the reboot.
reqmanager._leave_maintenance.assert_not_called()
assert log.has("maintenance-reboot")
assert not log.has("leave-maintenance")


@unittest.mock.patch("subprocess.run")
Expand Down Expand Up @@ -516,8 +521,7 @@ def test_execute_not_performed_on_connection_error(
connect().mark_node_service_status.side_effect = socket.error()
req = reqmanager.add(Request(Activity(), 1))
req.state = State.due
with pytest.raises(OSError):
reqmanager.execute()
reqmanager.execute()
assert execute.mock_calls == []


Expand Down

0 comments on commit e4593f7

Please sign in to comment.