From bf4044075c41ac66ffcc23a7450938237f4bdeb2 Mon Sep 17 00:00:00 2001 From: Jonathan Dowland <63694+jmtd@users.noreply.github.com> Date: Wed, 13 Mar 2024 09:37:41 +0000 Subject: [PATCH] rework container.execute to use multiprocessing as watchdog (#50) Work around `docker.APIClient.exec_start` sometimes blocking indefinitely by running in a sub-process and throwing an exception if the sub-process does not complete within a given timeout. Remove the existing post-exec code which polled the value of `docker.APIClient.exec_inspect` for 15 seconds to determine if the command had completed. This is effectively performed by the new sub-process waiting. I've set the timeout to 60 seconds, up from 15, which (from experimentation) seems to be necessary to account for the extra time it takes to invoke `exec_start` within the timeout period. A future change should make this timeout configurable. This general pattern (of watchdogging the docker library code) might be useful elsewhere, in particular for any future efforts to support parallel test execution. 
Signed-off-by: Jonathan Dowland --- steps/container.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/steps/container.py b/steps/container.py index f169a97..575c940 100644 --- a/steps/container.py +++ b/steps/container.py @@ -30,6 +30,7 @@ import tarfile import tempfile import time +import multiprocessing as mp try: d = docker.Client(version="1.22") @@ -148,26 +149,30 @@ def startWithCommand(self, **kwargs): def execute(self, cmd, detach=False): """ executes cmd in container and return its output """ + self.logger.debug("container.execute(%s,%s)" % (cmd,detach)) inst = d.exec_create(container=self.container, cmd=cmd) if detach: d.exec_start(inst, detach) return None - output = d.exec_start(inst, detach=detach) - retcode = d.exec_inspect(inst)['ExitCode'] + ctx = mp.get_context('fork') + q = ctx.Queue() + p = ctx.Process(target=lambda q: q.put(d.exec_start(inst, detach=detach)), args=(q,)) + p.start() + + if None == p.join(60): # timeout in secs + p.terminate() + raise ExecException("container.execute: timeout reading from exec (command '{}')".format(cmd)) - count = 0 + output = q.get() + retcode = d.exec_inspect(inst)['ExitCode'] - while retcode is None: - count += 1 - retcode = d.exec_inspect(inst)['ExitCode'] - time.sleep(1) - if count > 15: - raise ExecException("Command %s timed out, output: %s" % (cmd, output)) + if retcode is None: + raise ExecException("Command %s timed out, output: %s" % (cmd, output)) if retcode != 0: - raise ExecException("Command %s failed to execute, return code: %s" % (cmd, retcode), output) + raise ExecException("Command %s failed, return code: %s" % (cmd, retcode), output) return output