agent: capture output from enter commands, handle exceptions

Exceptions from _enter_maintenance don't bubble up anymore but are logged and treated like temporary failures now. Output from enter commands is now shown directly on the trace log level and added to the exception if the command fails or logged when postpone/tempfail is requested. PL-131813
flyingcircusio · Nov 9, 2023 · e4593f7 · e4593f7
1 parent ad0e262
commit e4593f7
Show file tree

Hide file tree

Showing 3 changed files with 113 additions and 54 deletions.
diff --git a/pkgs/fc/agent/fc/conftest.py b/pkgs/fc/agent/fc/conftest.py
@@ -34,7 +34,7 @@ def agent_maintenance_config(tmp_path):
 
 
 @fixture
-def reqmanager(tmp_path, agent_maintenance_config):
+def reqmanager(tmp_path, logger, agent_maintenance_config):
     spooldir = tmp_path / "maintenance"
     spooldir.mkdir()
     enc_path = tmp_path / "enc.json"
@@ -44,6 +44,7 @@ def reqmanager(tmp_path, agent_maintenance_config):
             spooldir=spooldir,
             enc_path=enc_path,
             config_file=agent_maintenance_config,
+            log=logger,
         ) as rm:
             yield rm
 
@@ -70,9 +71,12 @@ def _request_population(n):
 
 @fixture
 def logger():
-    _logger = structlog.get_logger()
-    _logger.trace = lambda *a, **k: None
-    return _logger
+    # pytest-structlog patches away structlog.config, but we can still use
+    # structlog._config.configure...
+    # We need to do that to support our custom `trace` logging method.
+    # It fails with BoundLoggingFilteringAtNotset which is the default wrapper.
+    structlog._config.configure(wrapper_class=structlog.BoundLogger)
+    return structlog.get_logger()
 
 
 @fixture

diff --git a/pkgs/fc/agent/fc/maintenance/reqmanager.py b/pkgs/fc/agent/fc/maintenance/reqmanager.py
@@ -21,9 +21,11 @@
 import structlog
 from fc.maintenance.activity import RebootType
 from fc.util.checks import CheckResult
+from fc.util.subprocess_helper import get_popen_stdout_lines
 from fc.util.time_date import format_datetime, utcnow
 from rich.table import Table
 
+from . import state
 from .request import Request, RequestMergeResult
 from .state import ARCHIVE, EXIT_POSTPONE, EXIT_TEMPFAIL, State
 
@@ -560,34 +562,66 @@ def _enter_maintenance(self):
         for name, command in self.config["maintenance-enter"].items():
             if not command.strip():
                 continue
-            self.log.info(
-                "enter-maintenance-subsystem", subsystem=name, command=command
+
+            log = self.log.bind(
+                subsystem=name,
             )
-            try:
-                # XXX: capture output
-                subprocess.run(command, shell=True, check=True)
-            except subprocess.CalledProcessError as e:
-                if e.returncode == EXIT_POSTPONE:
-                    self.log.info(
+
+            proc = subprocess.Popen(
+                command,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                shell=True,
+                text=True,
+            )
+            log.info(
+                "enter-maintenance-cmd",
+                _replace_msg=(
+                    "{subsystem}: Maintenance enter command started with PID "
+                    "{cmd_pid}: `{command}`"
+                ),
+                command=command,
+                cmd_pid=proc.pid,
+            )
+
+            stdout_lines = get_popen_stdout_lines(
+                proc, log, "enter-maintenance-out"
+            )
+            stdout = "".join(stdout_lines)
+            proc.wait()
+
+            match proc.returncode:
+                case 0:
+                    log.debug("enter-maintenance-cmd-success")
+                case state.EXIT_POSTPONE:
+                    log.info(
                         "enter-maintenance-postpone",
                         command=command,
                         _replace_msg=(
-                            "Command `{command}` requested to postpone all requests."
+                            "Command `{command}` requested to postpone all "
+                            "requests."
                         ),
                     )
+                    log.debug("enter-maintenance-postpone-out", stdout=stdout)
                     postpone_seen = True
-                elif e.returncode == EXIT_TEMPFAIL:
-                    self.log.info(
+                case state.EXIT_TEMPFAIL:
+                    log.info(
                         "enter-maintenance-tempfail",
                         command=command,
                         _replace_msg=(
                             "Command `{command}` failed temporarily. "
                             "Requests should be tried again next time."
                         ),
                     )
+                    log.debug("enter-maintenance-tempfail-out", stdout=stdout)
                     tempfail_seen = True
-                else:
-                    raise
+                case error:
+                    log.error(
+                        "enter-maintenance-fail",
+                        command=command,
+                        exit_code=error,
+                    )
+                    raise subprocess.CalledProcessError(error, command, stdout)
 
         if postpone_seen:
             raise PostponeMaintenance()
@@ -749,8 +783,8 @@ def _handle_enter_tempfail(
             self.log.warn(
                 "execute-requests-force",
                 _replace_msg=(
-                    "Due requests will be executed regardless of the temporary failure "
-                    "of a maintenance enter command."
+                    "Due requests will be executed regardless of the "
+                    "(temporary) failure of a maintenance enter command."
                 ),
             )
             return HandleEnterExceptionResult()
@@ -760,8 +794,8 @@ def _handle_enter_tempfail(
                 "run-all-now-force",
                 _replace_msg=(
                     "Run all mode requested and force mode activated: "
-                    "All requests will be executed now regardless of the temporary "
-                    "failure of a maintenance enter command."
+                    "All requests will be executed now regardless of the "
+                    "(temporary) failure of a maintenance enter command."
                 ),
             )
             return HandleEnterExceptionResult()
@@ -847,6 +881,9 @@ def execute(self, run_all_now: bool = False, force_run: bool = False):
         are ignored and requests are run regardless. This also runs requests in state
         'success' again when they are still in the queue after a recent system reboot.
         """
+        self.log.debug(
+            "execute-start", run_all_now=run_all_now, force_run=force_run
+        )
 
         runnable_requests = self._runnable(run_all_now, force_run)
         if not runnable_requests:
@@ -871,7 +908,21 @@ def execute(self, run_all_now: bool = False, force_run: bool = False):
         except TempfailMaintenance:
             res = self._handle_enter_tempfail(run_all_now, force_run)
             if res.exit:
-                # Stay in maintenance mode.
+                # Stay in maintenance mode as we expect the temporary failure
+                # to go away on the next agent run.
+                self._write_stats_for_execute()
+                return
+
+        except Exception:
+            # Other exceptions are similar to tempfail, just with additional
+            # logging. Could an error from a enter command, from the directory
+            # or an internal one.
+            self.log.error("execute-enter-maintenance-failed", exc_info=True)
+            res = self._handle_enter_tempfail(run_all_now, force_run)
+            if res.exit:
+                # Might already be in maintenance or not, depending on where
+                # _enter_maintenance failed. That's ok, the next agent
+                # run can continue in either case.
                 self._write_stats_for_execute()
                 return
 

diff --git a/pkgs/fc/agent/fc/maintenance/tests/test_reqmanager.py b/pkgs/fc/agent/fc/maintenance/tests/test_reqmanager.py
@@ -233,10 +233,20 @@ def test_enter_maintenance(log, reqmanager, monkeypatch):
     reqmanager._enter_maintenance()
 
     assert log.has("enter-maintenance")
+    assert log.has(
+        "enter-maintenance-cmd",
+        subsystem="demo",
+        command='echo "entering demo"',
+    )
+    assert log.has("enter-maintenance-cmd-success", subsystem="demo")
     connect_mock().mark_node_service_status.assert_called_with("host", False)
     maintenance_entered_at = reqmanager.maintenance_marker_path.read_text()
     assert maintenance_entered_at == "2016-04-20T11:00:00+00:00"
 
+    import rich
+
+    rich.print(log.events)
+
 
 def test_enter_maintenance_postpone(log, reqmanager, monkeypatch):
     monkeypatch.setattr(
@@ -264,7 +274,7 @@ def test_enter_maintenance_tempfail(log, reqmanager, monkeypatch):
     assert reqmanager.maintenance_marker_path.exists()
 
 
-def test_execute_postpone(log, reqmanager, monkeypatch):
+def test_execute_postpone(log, reqmanager):
     req = reqmanager.add(Request(Activity(), 1))
     req.state = State.due
     req.execute = Mock()
@@ -280,9 +290,10 @@ def enter_maintenance_postpone():
     assert req.state == State.postpone
     assert not req.execute.called
     reqmanager._leave_maintenance.assert_called()
+    log.has("execute-requests-postpone")
 
 
-def test_execute_tempfail(log, reqmanager, monkeypatch):
+def test_execute_tempfail(log, reqmanager):
     req = reqmanager.add(Request(Activity(), 1))
     req.state = State.due
     req.execute = Mock()
@@ -298,51 +309,45 @@ def enter_maintenance_tempfail():
     assert req.state == State.due
     assert not req.execute.called
     reqmanager._leave_maintenance.assert_not_called()
+    log.has("execute-requests-tempfail")
 
 
-@unittest.mock.patch("subprocess.run")
-@unittest.mock.patch("fc.util.directory.connect")
-def test_execute_activity_no_reboot(connect, run, reqmanager, log):
+def test_execute_activity_no_reboot(reqmanager, log):
     req = reqmanager.add(Request(Activity(), 1))
     reqmanager._runnable = lambda run_all_now, force_run: [req]
+    reqmanager._enter_maintenance = Mock()
+    reqmanager._leave_maintenance = Mock()
+
     reqmanager.execute()
-    run.assert_has_calls(
-        [
-            call('echo "entering demo"', shell=True, check=True),
-            call('echo "leaving demo"', shell=True, check=True),
-        ]
-    )
-    assert log.has("enter-maintenance")
-    assert log.has("leave-maintenance")
 
+    reqmanager._enter_maintenance.assert_called_once()
+    reqmanager._leave_maintenance.assert_called_once()
+    assert log.has("no-reboot-requested")
+
+
+def test_execute_activity_with_reboot(reqmanager, log, monkeypatch):
+    monkeypatch.setattr("time.sleep", sleep := Mock())
+    monkeypatch.setattr("subprocess.run", run := Mock())
+    reqmanager._enter_maintenance = Mock()
+    reqmanager._leave_maintenance = Mock()
 
-@unittest.mock.patch("subprocess.run")
-@unittest.mock.patch("fc.util.directory.connect")
-@unittest.mock.patch("time.sleep")
-def test_execute_activity_with_reboot(
-    sleep: Mock, connect, run: Mock, reqmanager, log
-):
     activity = Activity()
     activity.reboot_needed = RebootType.WARM
     req = reqmanager.add(Request(activity, 1))
-    req.state = State.due
+    reqmanager._runnable = lambda run_all_now, force_run: [req]
+
     with pytest.raises(SystemExit) as e:
-        reqmanager.execute(run_all_now=True)
+        reqmanager.execute()
 
     assert e.value.code == 0
 
-    run.assert_has_calls(
-        [
-            call('echo "entering demo"', shell=True, check=True),
-            call("reboot", check=True, capture_output=True, text=True),
-        ]
-    )
+    reqmanager._enter_maintenance.assert_called_once()
 
     sleep.assert_called_once_with(5)
-
-    assert log.has("enter-maintenance")
+    run.assert_called_once()
+    # Should stay in maintenance mode during the reboot.
+    reqmanager._leave_maintenance.assert_not_called()
     assert log.has("maintenance-reboot")
-    assert not log.has("leave-maintenance")
 
 
 @unittest.mock.patch("subprocess.run")
@@ -516,8 +521,7 @@ def test_execute_not_performed_on_connection_error(
     connect().mark_node_service_status.side_effect = socket.error()
     req = reqmanager.add(Request(Activity(), 1))
     req.state = State.due
-    with pytest.raises(OSError):
-        reqmanager.execute()
+    reqmanager.execute()
     assert execute.mock_calls == []