Skip to content

Commit

Permalink
Merge pull request #28 from NERSC/scanon/shared_run_error_handling
Browse files Browse the repository at this point in the history
Share run error handling (WIP)
  • Loading branch information
lastephey authored Jul 27, 2023
2 parents 6c2f4ab + de24807 commit 4dab7e1
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 20 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ The list of configurable values is:
* localid_var: (str) environment variable to determine the local node rank (default: `SLURM_LOCALID`)
* tasks_per_node_var: (str) environment variable to determine the tasks per node (default: `SLURM_STEP_TASKS_PER_NODE`)
* ntasks_pattern: (str) regular expression pattern to filter the tasks per node (default: `[0-9]+`)
* wait_timeout: (str) timeout in seconds to wait for a shared-run container to start (default: 10)
* wait_poll_interval: (str) interval in seconds to poll for a shared-run container to start (default: 0.2)

### Templating

Expand Down
55 changes: 39 additions & 16 deletions podman_hpc/podman_hpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,28 +269,49 @@ def _shared_run(conf, run_args, **site_opts):
# Start monitor and run threads
monitor_thread = None
run_thread = None
proc = None
if (localid is None or int(localid) == 0):
monitor_thread = Process(target=monitor, args=(sock_name, ntasks,
container_name, conf))
monitor_thread.start()
run_thread = Process(target=shared_run_exec, args=(run_cmd, conf.env))
run_thread.start()

# wait for container to exist
comm = ["container", "exists", container_name]
while podman_devnull(comm, conf) != 0:
time.sleep(0.2)
comm = ["wait", "--condition", "running", container_name]
podman_devnull(comm, conf)
proc = Popen(exec_cmd, env=conf.env)
proc.communicate()
send_complete(sock_name, localid)
# Close out threads
if monitor_thread:
monitor_thread.join()
if run_thread:
run_thread.join()
sys.exit(proc.returncode)
try:
# wait for container to exist
comm = ["container", "exists", container_name]
start_time = time.time()
while podman_devnull(comm, conf) != 0:
time.sleep(conf.wait_poll_interval)
if time.time() - start_time > conf.wait_timeout:
msg = "Timeout waiting for shared-run start"
raise OSError(msg)
if run_thread and run_thread.exitcode:
raise OSError("Failed to start container")
comm = ["wait", "--condition", "running", container_name]
podman_devnull(comm, conf)
proc = Popen(exec_cmd, env=conf.env)
proc.communicate()
send_complete(sock_name, localid)
# Close out threads
if monitor_thread:
monitor_thread.join()
if run_thread:
run_thread.join()
except Exception as ex:
sys.stderr.write(str(ex))
if monitor_thread:
sys.stderr.write("Killing monitor thread")
monitor_thread.kill()
if run_thread:
run_thread.kill()
if os.path.exists(sock_name):
os.remove(sock_name)
finally:
exit_code = 1
if proc:
exit_code = proc.returncode
sys.exit(exit_code)


# podman-hpc call_podman subcommand (default, hidden, passthrough) #########
Expand Down Expand Up @@ -350,7 +371,9 @@ def call_podman(ctx, siteconf, help, podman_args, **site_opts):

def shared_run_exec(run_cmd, env):
    """Run the shared-run container command and relay its stderr on failure.

    Used as the target of the ``run_thread`` process started by
    ``_shared_run``.

    Args:
        run_cmd: argument list handed to ``Popen``.
        env: environment mapping for the child process (``None`` inherits
            the current environment).

    Returns:
        None. When the command exits non-zero, its captured stderr is
        written to this process's stderr; stdout is captured and discarded.
    """
    proc = Popen(run_cmd, stdout=PIPE, stderr=PIPE, env=env)
    # Drain the pipes exactly once; a second communicate() call would find
    # them already consumed and lose the error text we want to report.
    _, err = proc.communicate()
    if proc.returncode != 0:
        # Container tooling may emit non-UTF-8 bytes; never let the decode
        # itself mask the real failure.
        sys.stderr.write(err.decode(errors="replace"))
    # NOTE(review): this function returns normally even when the command
    # fails, so the hosting Process keeps exit code 0. _shared_run tests
    # run_thread.exitcode to detect a failed start -- confirm whether the
    # return code should be propagated (e.g. via sys.exit).


def monitor(sockfile, ntasks, container_name, conf):
Expand Down
10 changes: 9 additions & 1 deletion podman_hpc/siteconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ class SiteConfig:
"default_args", "default_run_args",
"additional_stores", "hooks_dir",
"localid_var", "tasks_per_node_var", "ntasks_pattern",
"config_home", "mksquashfs_bin"]
"config_home", "mksquashfs_bin",
"wait_timeout", "wait_poll_interval"]
_valid_templates = ["shared_run_args_template",
"graph_root_template",
"run_root_template",
Expand Down Expand Up @@ -64,6 +65,8 @@ class SiteConfig:
tasks_per_node_var = "SLURM_STEP_TASKS_PER_NODE"
ntasks_pattern = r'[0-9]+'
mksquashfs_bin = "mksquashfs.static"
wait_poll_interval = 0.2
wait_timeout = 10
shared_run = False
source = dict()

Expand Down Expand Up @@ -112,6 +115,11 @@ def __init__(self, squash_dir=None, log_level=None):
"--annotation", f"{_HOOKS_ANNO}=true",
"--security-opt", "seccomp=unconfined",
]
if isinstance(self.wait_poll_interval, str):
self.wait_poll_interval = \
float(self.wait_poll_interval)
if isinstance(self.wait_timeout, str):
self.wait_timeout = float(self.wait_timeout)
if len(self.default_pull_args) == 0:
self.default_pull_args = [
"--root", self.graph_root,
Expand Down
11 changes: 11 additions & 0 deletions test/mock_bin/mock_podman
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@ elif [ $(echo $@|grep -c 'mksq ') -gt 0 ] ; then
P=$(echo $@|sed 's/.*mksq -v //'|sed 's/:.*//')
SQ=$(echo $@|sed 's|.*/sqout/||'|sed 's/ .*//')
touch $P/$SQ
elif [ $(echo $@|grep -c 'container exists ') -gt 0 ] ; then
if [ ! -z "$MOCK_FAILURE" ] ; then
echo "no container" >& 2
echo $@ >> $MOCK_OUT
exit 1
fi
elif [ $(echo $@|grep -c ' run ') -gt 0 ] && [ ! -z "$MOCK_FAILURE" ] ;then
uid=$(echo $@|sed 's/.*--name //'|sed 's/ .*//')
echo "Failed to start $uid" >& 2
echo $@ >> $MOCK_OUT
exit 1
elif [ $(echo $@|grep -c 'run ') -gt 0 ] ; then
echo "bogusid"
fi
Expand Down
21 changes: 18 additions & 3 deletions test/test_podman_hpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,11 @@ def test_shared_run(monkeypatch, fix_paths, mock_podman, mock_exit):
phpc.main()
run = None
with open(mock_podman) as f:
for line in f:
items = line.split()
for line in f.read().split("\n"):
items = line.split(" ")
if items[0] == "run":
run = items
elif items[0] == "exec":
elif "exec --root" in line:
exec = items
uid = os.getuid()
assert run is not None
Expand All @@ -102,6 +102,21 @@ def test_shared_run(monkeypatch, fix_paths, mock_podman, mock_exit):
assert "--rm" not in exec


def test_run_fail(monkeypatch, fix_paths, mock_podman, mock_exit):
    """Check that shared-run stops before ``exec`` when the start fails.

    With ``MOCK_FAILURE`` set, the mock podman exits non-zero for both
    ``run`` and ``container exists``, so the launcher should record the
    ``run`` attempt but never issue an ``exec``.
    """
    sys.argv = ["podman_hpc", "shared-run", "-it", "--rm",
                "-e", "FAILME=1", "ubuntu", "uptime"]
    monkeypatch.setenv("SLURM_LOCALID", "0")
    monkeypatch.setenv("SLURM_STEP_TASKS_PER_NODE", "1")
    # Presumably maps onto the wait_timeout setting; kept short so the test
    # does not sit out the 10 second default.
    monkeypatch.setenv("PODMANHPC_WAIT_TIMEOUT", "0.5")
    monkeypatch.setenv("MOCK_FAILURE", "1")
    phpc.main()
    # mock_podman is read as the log of podman invocations written by the
    # mock binary; close the handle deterministically.
    with open(mock_podman) as f:
        out = f.read()
    assert "run --rm" in out
    assert "exec --root" not in out


def test_shared_run_auto(monkeypatch, fix_paths, mock_podman, mock_exit):
sys.argv = ["podman_hpc", "run", "-it", "--rm", "--mpi",
"--volume", "/a:/b", "ubuntu", "uptime"]
Expand Down

0 comments on commit 4dab7e1

Please sign in to comment.