diff --git a/README.md b/README.md index 50a87f4..38e9559 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ The list of configurable values is: * localid_var: (str) environment variable to determine the local node rank (default: `SLURM_LOCALID`) * tasks_per_node_var: (str) environment variable to determine the tasks per node (default: `SLURM_STEP_TASKS_PER_NDOE`) * ntasks_pattern: (str) regular expression pattern to filter the tasks per node (default: `[0-9]+`) +* wait_timeout: (str) timeout in seconds to wait for a shared-run container to start (default: 10) +* wait_poll_interval: (str) interval in seconds to poll for a shared-run container to start (default: 0.2) ### Templating diff --git a/podman_hpc/podman_hpc.py b/podman_hpc/podman_hpc.py index a8f5131..74e139f 100755 --- a/podman_hpc/podman_hpc.py +++ b/podman_hpc/podman_hpc.py @@ -269,6 +269,7 @@ def _shared_run(conf, run_args, **site_opts): # Start monitor and run threads monitor_thread = None run_thread = None + proc = None if (localid is None or int(localid) == 0): monitor_thread = Process(target=monitor, args=(sock_name, ntasks, container_name, conf)) @@ -276,21 +277,41 @@ def _shared_run(conf, run_args, **site_opts): run_thread = Process(target=shared_run_exec, args=(run_cmd, conf.env)) run_thread.start() - # wait for container to exist - comm = ["container", "exists", container_name] - while podman_devnull(comm, conf) != 0: - time.sleep(0.2) - comm = ["wait", "--condition", "running", container_name] - podman_devnull(comm, conf) - proc = Popen(exec_cmd, env=conf.env) - proc.communicate() - send_complete(sock_name, localid) - # Close out threads - if monitor_thread: - monitor_thread.join() - if run_thread: - run_thread.join() - sys.exit(proc.returncode) + try: + # wait for container to exist + comm = ["container", "exists", container_name] + start_time = time.time() + while podman_devnull(comm, conf) != 0: + time.sleep(conf.wait_poll_interval) + if time.time() - start_time > conf.wait_timeout: + msg = "Timeout waiting for shared-run start" + raise OSError(msg) + if run_thread and run_thread.exitcode: + raise OSError("Failed to start container") + comm = ["wait", "--condition", "running", container_name] + podman_devnull(comm, conf) + proc = Popen(exec_cmd, env=conf.env) + proc.communicate() + send_complete(sock_name, localid) + # Close out threads + if monitor_thread: + monitor_thread.join() + if run_thread: + run_thread.join() + except Exception as ex: + sys.stderr.write(str(ex)) + if monitor_thread: + sys.stderr.write("Killing monitor thread") + monitor_thread.kill() + if run_thread: + run_thread.kill() + if os.path.exists(sock_name): + os.remove(sock_name) + finally: + exit_code = 1 + if proc: + exit_code = proc.returncode + sys.exit(exit_code) # podman-hpc call_podman subcommand (default, hidden, passthrough) ######### @@ -350,7 +371,9 @@ def call_podman(ctx, siteconf, help, podman_args, **site_opts): def shared_run_exec(run_cmd, env): proc = Popen(run_cmd, stdout=PIPE, stderr=PIPE, env=env) - proc.communicate() + out, err = proc.communicate() + if proc.returncode != 0: + sys.stderr.write(err.decode()) def monitor(sockfile, ntasks, container_name, conf): diff --git a/podman_hpc/siteconfig.py b/podman_hpc/siteconfig.py index 04afc42..9916e73 100644 --- a/podman_hpc/siteconfig.py +++ b/podman_hpc/siteconfig.py @@ -31,7 +31,8 @@ class SiteConfig: "default_args", "default_run_args", "additional_stores", "hooks_dir", "localid_var", "tasks_per_node_var", "ntasks_pattern", - "config_home", "mksquashfs_bin"] + "config_home", "mksquashfs_bin", + "wait_timeout", "wait_poll_interval"] _valid_templates = ["shared_run_args_template", "graph_root_template", "run_root_template", @@ -64,6 +65,8 @@ class SiteConfig: tasks_per_node_var = "SLURM_STEP_TASKS_PER_NODE" ntasks_pattern = r'[0-9]+' mksquashfs_bin = "mksquashfs.static" + wait_poll_interval = 0.2 + wait_timeout = 10 shared_run = False source = dict() @@ -112,6 +115,11 @@ def __init__(self, squash_dir=None, log_level=None): "--annotation", f"{_HOOKS_ANNO}=true", "--security-opt", "seccomp=unconfined", ] + if isinstance(self.wait_poll_interval, str): + self.wait_poll_interval = \ + float(self.wait_poll_interval) + if isinstance(self.wait_timeout, str): + self.wait_timeout = float(self.wait_timeout) if len(self.default_pull_args) == 0: self.default_pull_args = [ "--root", self.graph_root, diff --git a/test/mock_bin/mock_podman b/test/mock_bin/mock_podman index d61ced1..8fb642f 100755 --- a/test/mock_bin/mock_podman +++ b/test/mock_bin/mock_podman @@ -11,6 +11,17 @@ elif [ $(echo $@|grep -c 'mksq ') -gt 0 ] ; then P=$(echo $@|sed 's/.*mksq -v //'|sed 's/:.*//') SQ=$(echo $@|sed 's|.*/sqout/||'|sed 's/ .*//') touch $P/$SQ +elif [ $(echo $@|grep -c 'container exists ') -gt 0 ] ; then + if [ ! -z "$MOCK_FAILURE" ] ; then + echo "no container" >& 2 + echo $@ >> $MOCK_OUT + exit 1 + fi +elif [ $(echo $@|grep -c ' run ') -gt 0 ] && [ ! -z "$MOCK_FAILURE" ] ;then + uid=$(echo $@|sed 's/.*--name //'|sed 's/ .*//') + echo "Failed to start $uid" >& 2 + echo $@ >> $MOCK_OUT + exit 1 elif [ $(echo $@|grep -c 'run ') -gt 0 ] ; then echo "bogusid" fi diff --git a/test/test_podman_hpc.py b/test/test_podman_hpc.py index 5dcc025..def94a8 100644 --- a/test/test_podman_hpc.py +++ b/test/test_podman_hpc.py @@ -82,11 +82,11 @@ def test_shared_run(monkeypatch, fix_paths, mock_podman, mock_exit): phpc.main() run = None with open(mock_podman) as f: - for line in f: - items = line.split() + for line in f.read().split("\n"): + items = line.split(" ") if items[0] == "run": run = items - elif items[0] == "exec": + elif "exec --root" in line: exec = items uid = os.getuid() assert run is not None @@ -102,6 +102,21 @@ def test_shared_run(monkeypatch, fix_paths, mock_podman, mock_exit): assert "--rm" not in exec +def test_run_fail(monkeypatch, fix_paths, mock_podman, mock_exit): + sys.argv = ["podman_hpc", "shared-run", "-it", "--rm", + "-e", "FAILME=1", "ubuntu", "uptime"] + monkeypatch.setenv("SLURM_LOCALID", "0") + monkeypatch.setenv("SLURM_STEP_TASKS_PER_NODE", "1") + monkeypatch.setenv("PODMANHPC_WAIT_TIMEOUT", "0.5") + monkeypatch.setenv("MOCK_FAILURE", "1") + phpc.main() + run = None + out = open(mock_podman).read() + assert "run --rm" in out + assert "exec --root" not in out + fn = f"/tmp/uid-{os.getuid()}-pid-{os.getppid()}.txt" + + def test_shared_run_auto(monkeypatch, fix_paths, mock_podman, mock_exit): sys.argv = ["podman_hpc", "run", "-it", "--rm", "--mpi", "--volume", "/a:/b", "ubuntu", "uptime"]