diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index 5b2894cf35..ba3a8d01c9 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -24,17 +24,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from __future__ import annotations + import abc import typing as t -from ...entity import SmartSimEntity from ...error import AllocationError, LauncherError, SSUnsupportedError -from ...settings import SettingsBase from .step import Step from .step_info import StepInfo, UnmanagedStepInfo from .step_mapping import StepMap, StepMapping from .task_manager import TaskManager +if t.TYPE_CHECKING: + from smartsim._core.launcher.stepMapping import StepMap + from smartsim.entity.entity import SmartSimEntity + from smartsim.settings.launchSettings import SettingsBase + class Launcher(abc.ABC): # pragma: no cover """Abstract base class of all launchers diff --git a/smartsim/_core/launcher/slurm/slurm_launcher.py b/smartsim/_core/launcher/slurm/slurm_launcher.py index 038176d937..316b944c58 100644 --- a/smartsim/_core/launcher/slurm/slurm_launcher.py +++ b/smartsim/_core/launcher/slurm/slurm_launcher.py @@ -29,6 +29,13 @@ import typing as t from shutil import which +from smartsim._core.launcher_.slurm.slurm_commands import sacct, scancel, sstat +from smartsim._core.launcher_.slurm.slurm_parser import ( + parse_sacct, + parse_sstat_nodes, + parse_step_id_from_sacct, +) + from ....error import LauncherError from ....log import get_logger from ....settings import ( @@ -53,8 +60,6 @@ Step, ) from ..step_info import SlurmStepInfo, StepInfo -from .slurm_commands import sacct, scancel, sstat -from .slurm_parser import parse_sacct, parse_sstat_nodes, parse_step_id_from_sacct logger = get_logger(__name__) diff --git a/smartsim/_core/launcher/step/alps_step.py b/smartsim/_core/launcher/step/alps_step.py index dc9f3bff61..233043cb41 100644 --- a/smartsim/_core/launcher/step/alps_step.py +++ b/smartsim/_core/launcher/step/alps_step.py @@ -24,19 +24,25 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + import os import shutil import typing as t from shlex import split as sh_split -from ....entity import Application, FSNode from ....error import AllocationError from ....log import get_logger -from ....settings import AprunSettings, RunSettings, Singularity +from ....settings import Singularity from .step import Step, proxyable_launch_cmd logger = get_logger(__name__) +if t.TYPE_CHECKING: + from smartsim.entity import Application, FSNode + from smartsim.settings import AprunSettings, RunSettings + class AprunStep(Step): def __init__( diff --git a/smartsim/_core/shell/__init__.py b/smartsim/_core/launcher_/__init__.py similarity index 100% rename from smartsim/_core/shell/__init__.py rename to smartsim/_core/launcher_/__init__.py diff --git a/smartsim/_core/launcher_/shell/__init__.py b/smartsim/_core/launcher_/shell/__init__.py new file mode 100644 index 0000000000..efe03908e0 --- /dev/null +++ b/smartsim/_core/launcher_/shell/__init__.py @@ -0,0 +1,25 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/smartsim/_core/shell/shell_launcher.py b/smartsim/_core/launcher_/shell/shell_launcher.py similarity index 100% rename from smartsim/_core/shell/shell_launcher.py rename to smartsim/_core/launcher_/shell/shell_launcher.py diff --git a/smartsim/_core/launcher_/slurm/__init__.py b/smartsim/_core/launcher_/slurm/__init__.py new file mode 100644 index 0000000000..efe03908e0 --- /dev/null +++ b/smartsim/_core/launcher_/slurm/__init__.py @@ -0,0 +1,25 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/smartsim/_core/launcher/slurm/slurm_commands.py b/smartsim/_core/launcher_/slurm/slurm_commands.py similarity index 100% rename from smartsim/_core/launcher/slurm/slurm_commands.py rename to smartsim/_core/launcher_/slurm/slurm_commands.py diff --git a/smartsim/_core/launcher_/slurm/slurm_launcher.py b/smartsim/_core/launcher_/slurm/slurm_launcher.py new file mode 100644 index 0000000000..0f8caa0330 --- /dev/null +++ b/smartsim/_core/launcher_/slurm/slurm_launcher.py @@ -0,0 +1,330 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +import collections +import os +import os.path +import subprocess as sp +import time +import typing as t + +from typing_extensions import TypeAlias + +from smartsim._core.config import CONFIG +from smartsim._core.launcher_.slurm import slurm_commands as commands +from smartsim._core.launcher_.slurm import slurm_parser as parser +from smartsim._core.utils import helpers, launcher +from smartsim.error import errors +from smartsim.log import get_logger +from smartsim.status import JobStatus +from smartsim.types import LaunchedJobID + +if t.TYPE_CHECKING: + from smartsim.experiment import Experiment + + +logger = get_logger(__name__) + + +class SrunCommand: + """Produces a properly formatted `srun` command from raw job information. + Ensures the job status can be tracked using the supplied name. + """ + + def __init__( + self, + name: str, + srun_args: t.Sequence[str], + executable: t.Sequence[str], + job_id: str | None = None, + environment: t.Mapping[str, str] | None = None, + ) -> None: + """Initialize a new trackable srun command. + + :param name: The name of the job. + :param srun_args: Any command line args to feed to `srun`. + :param executable: The command and any command line arguments that + should be executed by `srun`. + :param job_id: The id of the allocated job under which step should be + executed. + :param environment: Any additional environment variables to place in + the environment before starting the `srun` subprocess. + :raises errors.AllocationError: If the `job_id` was not supplied and + could not be inferred. + """ + self.name: t.Final = f"{name}-{helpers.create_short_id_str()}" + self.srun_args: t.Final = tuple(srun_args) + if job_id is None: + try: + job_id = os.environ["SLURM_JOB_ID"] + except KeyError as e: + raise errors.AllocationError( + "No allocation specified and could be found" + ) from e + logger.debug(f"Using allocation {job_id} gleaned from user environment") + self.job_id: t.Final = job_id + self.executable: t.Final = tuple(executable) + self.env: t.Final[t.Mapping[str, str]] = ( + dict(environment) if environment is not None else {} + ) + + def as_command_line_args(self) -> tuple[str, ...]: + """Format the `srun` command with job id and name information + + :returns: A sequence of symbols that can be opened in a subshell + """ + srun = helpers.expand_exe_path("srun") + return ( + srun, + *self.srun_args, + f"--job-name={self.name}", + f"--jobid={self.job_id}", + "--", + *self.executable, + ) + + def start(self) -> None: + """Start the `srun` command in a subshell""" + # pylint: disable-next=consider-using-with + sp.Popen( + self.as_command_line_args(), + env={**os.environ, **self.env}, + stdout=sp.DEVNULL, + stderr=sp.DEVNULL, + ) + + +_SlurmCommandType: TypeAlias = SrunCommand +"""Types that are capable of being launched by the `SlurmLauncher`""" + + +class SlurmLauncher: + """A launcher for launching/tracking slurm specific commands""" + + def __init__( + self, *, launched: t.Mapping[LaunchedJobID, _LaunchedJobInfo] | None = None + ) -> None: + """Initialize a new slurm launcher. + + :param launched: Any previously launched slurm jobs that the launcher + should be aware of. Primarily used for testing. + """ + self._launched: t.Final = dict(launched) if launched is not None else {} + + @classmethod + def create(cls, _: Experiment) -> SlurmLauncher: + """Create a new launcher instance from an experiment instance. + + :param _: An experiment instance. + :returns: A new launcher instance. + """ + return cls() + + def start(self, launchable: _SlurmCommandType, interval: int = 2) -> LaunchedJobID: + """Have the slurm launcher start and track the progress of a new + subprocess. + + :param launchable: The template of a slurm subprocess to start. + :param interval: The amount of time in seconds to wait between `sacct` + calls to get the step id. + :returns: An id to reference the process for status tracking. + """ + launchable.start() + trials = CONFIG.wlm_trials + step_id = None + while step_id is None and trials > 0: + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + # TODO: Really don't like the implied sacct format and required + # order to call these fns in order to get the step id + # ------------------------------------------------------------------------- + out, _ = commands.sacct( + ["--noheader", "-p", "--format=jobname,jobid"], raise_on_err=True + ) + step_id = parser.parse_step_id_from_sacct(out, launchable.name) + # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + trials -= 1 + if step_id is None: + time.sleep(interval) + if step_id is None: + raise errors.LauncherError("Could not find id of launched job step") + id_ = launcher.create_job_id() + self._launched[id_] = _LaunchedJobInfo(step_id, launchable.name) + return id_ + + def _get_slurm_info_from_job_id(self, id_: LaunchedJobID, /) -> _LaunchedJobInfo: + """Find the info for a slurm subprocess given a launched job id issued + to a user. + + :param id_: The job id issued to the user. + :raises errors.LauncherJobNotFound: The id is not recognized. + :returns: Info about the launched slurm job. + """ + if (info := self._launched.get(id_)) is None: + msg = f"Launcher `{self}` has not launched a job with id `{id_}`" + raise errors.LauncherJobNotFound(msg) + return info + + def get_status(self, *ids: LaunchedJobID) -> t.Mapping[LaunchedJobID, JobStatus]: + """Take a collection of job ids and return the status of the + corresponding slrum processes started by the slurm launcher. + + :param ids: The collection of ids of launched jobs to query for current + status. + :returns: A mapping of launched ids to their current status. + """ + id_to_info = {id_: self._get_slurm_info_from_job_id(id_) for id_ in ids} + + def status_override(info: _LaunchedJobInfo) -> JobStatus | None: + return info.status_override + + status_to_infos = helpers.group_by(status_override, id_to_info.values()) + needs_fetch = status_to_infos.get(None, ()) + fetched_status = ( + self._get_status(*(info.slurm_id for info in needs_fetch)) + if needs_fetch + else {} + ) + has_overwrite = { + info.slurm_id: status + for status, infos in status_to_infos.items() + for info in infos + if status is not None + } + slurm_ids_to_status = collections.ChainMap(has_overwrite, fetched_status) + return {id_: slurm_ids_to_status[id_to_info[id_].slurm_id] for id_ in ids} + + @staticmethod + def _get_status( + id_: parser.StepID, *ids: parser.StepID + ) -> dict[parser.StepID, JobStatus]: + """Given a collection of step ids, interogate slurm for the status of + the steps. + + :param id_: The first step id to get the status for. + :param ids: Any additional step ids to get the status for. + :returns: A mapping of step ids to statuses + """ + ids = (id_,) + ids + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + # TODO: Really don't like the implied sacct format and required + # order to call these fns in order to get the status + # ------------------------------------------------------------------------- + out, _ = commands.sacct( + ["--noheader", "-p", "-b", "--jobs", ",".join(ids)], raise_on_err=True + ) + stats = ((id_, parser.parse_sacct(out, id_)[0]) for id_ in ids) + # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + to_ss_stat = { + "RUNNING": JobStatus.RUNNING, + "CONFIGURING": JobStatus.RUNNING, + "STAGE_OUT": JobStatus.RUNNING, + "COMPLETED": JobStatus.COMPLETED, + "DEADLINE": JobStatus.COMPLETED, + "TIMEOUT": JobStatus.COMPLETED, + "BOOT_FAIL": JobStatus.FAILED, + "FAILED": JobStatus.FAILED, + "NODE_FAIL": JobStatus.FAILED, + "OUT_OF_MEMORY": JobStatus.FAILED, + "CANCELLED": JobStatus.CANCELLED, + "CANCELLED+": JobStatus.CANCELLED, + "REVOKED": JobStatus.CANCELLED, + "PENDING": JobStatus.PAUSED, + "PREEMPTED": JobStatus.PAUSED, + "RESV_DEL_HOLD": JobStatus.PAUSED, + "REQUEUE_FED": JobStatus.PAUSED, + "REQUEUE_HOLD": JobStatus.PAUSED, + "REQUEUED": JobStatus.PAUSED, + "RESIZING": JobStatus.PAUSED, + "SIGNALING": JobStatus.PAUSED, + "SPECIAL_EXIT": JobStatus.PAUSED, + "STOPPED": JobStatus.PAUSED, + "SUSPENDED": JobStatus.PAUSED, + } + return {id_: to_ss_stat.get(stat, JobStatus.UNKNOWN) for id_, stat in stats} + + def stop_jobs(self, *ids: LaunchedJobID) -> t.Mapping[LaunchedJobID, JobStatus]: + """Take a collection of job ids and kill the corresponding processes + started by the slurm launcher. + + :param ids: The ids of the launched jobs to stop. + :returns: A mapping of ids to their reported status after attempting to + stop them. + """ + slurm_infos = map(self._get_slurm_info_from_job_id, ids) + helpers.threaded_map(self._stop_job, slurm_infos) + return self.get_status(*ids) + + @staticmethod + def _stop_job(job_info: _LaunchedJobInfo) -> None: + """Given the launch information for a slurm process, attempt to kill + that process. + + :param job_info: The info for the job that the launcher should attempt + to kill. + """ + step_id = job_info.slurm_id + is_het_job = os.getenv("SLURM_HET_SIZE") is not None + # If the step is a substep and not part of a heterogenous job, it is a + # MPMD srun. We need to stop parent step because sub-steps of MPMD jobs + # cannot be stopped singularly. + if not is_het_job and parser.is_substep(step_id): + step_id = parser.get_step_id_from_substep_id(step_id) + # If it is a heterogeneous job, we can stop them like a normal step. + # Slurm will throw an error, but will actually kill steps correctly. + ret_code, _, err = commands.scancel([step_id]) + if ret_code != 0: + if is_het_job: + # However, in the case of a heterogenous job, we do need to + # manually mark the step as 'cancelled'. + msg = ( + "SmartSim received a non-zero exit code while canceling" + f" a heterogeneous job step {job_info.name}!\n" + "The following error might be internal to Slurm\n" + "and the heterogeneous job step could have been correctly" + " canceled.\n" + "SmartSim will consider it canceled.\n" + ) + job_info.status_override = JobStatus.CANCELLED + else: + msg = f"Unable to cancel job step {job_info.name}\n{err}" + logger.warning(msg) + + +class _LaunchedJobInfo: + """Slurm specific launch information for a launched job""" + + def __init__( + self, + slurm_id: parser.StepID, + name: str, + *, + status_override: JobStatus | None = None, + ) -> None: + self.slurm_id: t.Final = slurm_id + self.name: t.Final = name + self.status_override = status_override diff --git a/smartsim/_core/launcher/slurm/slurm_parser.py b/smartsim/_core/launcher_/slurm/slurm_parser.py similarity index 82% rename from smartsim/_core/launcher/slurm/slurm_parser.py rename to smartsim/_core/launcher_/slurm/slurm_parser.py index 4ec187f196..89f262d15c 100644 --- a/smartsim/_core/launcher/slurm/slurm_parser.py +++ b/smartsim/_core/launcher_/slurm/slurm_parser.py @@ -121,7 +121,10 @@ def parse_sstat_nodes(output: str, job_id: str) -> t.List[str]: return list(set(nodes)) -def parse_step_id_from_sacct(output: str, step_name: str) -> t.Optional[str]: +StepID = t.NewType("StepID", str) + + +def parse_step_id_from_sacct(output: str, step_name: str) -> t.Optional[StepID]: """Parse and return the step id from a sacct command :param output: output of sacct --noheader -p @@ -135,4 +138,29 @@ def parse_step_id_from_sacct(output: str, step_name: str) -> t.Optional[str]: if len(sacct_string) >= 2: if sacct_string[0] == step_name: step_id = sacct_string[1] - return step_id + return StepID(step_id) if step_id is not None else None + + +def is_substep(step_id: StepID) -> bool: + """Check if `step_id` is part of colon-separated run, this is reflected in a + `+` in the step id, so that the format becomes `12345+1.0`. + + If we find a `+` in the step ID, it traditionally can mean two things: a + MPMD srun command, or a heterogeneous job. + + :param step_id: A step id to check to see if it is a substep. + :returns: Whether or not the step ID is a substep + """ + return step_id != get_step_id_from_substep_id(step_id) + + +def get_step_id_from_substep_id(substep: StepID) -> StepID: + """Parse the base step ID from a substep. In the case where a step ID is + provided that is not referenceing a substep, return the original step ID + without change. + + :param step_id: A substep step ID. + :returns: The parent step ID of the provided substep ID. + """ + step_id, *_ = substep.split("+", maxsplit=1) + return StepID(step_id) diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index 265205bef4..d133aa517c 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -31,6 +31,7 @@ import base64 import collections.abc +import concurrent.futures import functools import itertools import os @@ -57,6 +58,7 @@ _TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime"] _T = t.TypeVar("_T") +_R = t.TypeVar("_R") _HashableT = t.TypeVar("_HashableT", bound=t.Hashable) _TSignalHandlerFn = t.Callable[[int, t.Optional["FrameType"]], object] @@ -527,6 +529,20 @@ def packed(args: tuple[Unpack[_Ts]]) -> _T: return packed +def threaded_map( + fn: t.Callable[[_T], _R], iterable: t.Iterable[_T], / +) -> tuple[_R, ...]: + """Map a function over an iterable with each call being executed in a + separate concurrent thread. + + :param fn: The function to map over the sequence + :param iterable: The iterable over which the function is mapped + :returns: The results of the mapped function + """ + with concurrent.futures.ThreadPoolExecutor() as pool: + return tuple(pool.map(fn, iterable)) + + @t.final class SignalInterceptionStack(collections.abc.Collection[_TSignalHandlerFn]): """Registers a stack of callables to be called when a signal is diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 60a69b5222..5a401d8f98 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -24,6 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from __future__ import annotations + import fileinput import itertools import json @@ -37,9 +39,11 @@ from .._core.utils.helpers import expand_exe_path from ..error import SSDBFilesNotParseable from ..log import get_logger -from ..settings import RunSettings from .entity import SmartSimEntity +if t.TYPE_CHECKING: + from smartsim.settings import RunSettings + logger = get_logger(__name__) diff --git a/smartsim/launchable/mpmd_job.py b/smartsim/launchable/mpmd_job.py index e526f10746..42ebaca0c0 100644 --- a/smartsim/launchable/mpmd_job.py +++ b/smartsim/launchable/mpmd_job.py @@ -33,11 +33,11 @@ from smartsim.error.errors import SSUnsupportedError from smartsim.launchable.base_job import BaseJob from smartsim.launchable.mpmd_pair import MPMDPair -from smartsim.settings.launch_settings import LaunchSettings if t.TYPE_CHECKING: from smartsim._core.commands.launch_commands import LaunchCommands from smartsim.entity.entity import SmartSimEntity + from smartsim.settings.launch_settings import LaunchSettings def _check_launcher(mpmd_pairs: t.List[MPMDPair]) -> None: diff --git a/smartsim/launchable/mpmd_pair.py b/smartsim/launchable/mpmd_pair.py index 722a16cdee..b63be613e0 100644 --- a/smartsim/launchable/mpmd_pair.py +++ b/smartsim/launchable/mpmd_pair.py @@ -29,10 +29,9 @@ import copy import typing as t -from smartsim.settings.launch_settings import LaunchSettings - if t.TYPE_CHECKING: from smartsim.entity.entity import SmartSimEntity + from smartsim.settings.launch_settings import LaunchSettings class MPMDPair: diff --git a/smartsim/settings/arguments/launch/alps.py b/smartsim/settings/arguments/launch/alps.py index 356a443d65..15617ea7c7 100644 --- a/smartsim/settings/arguments/launch/alps.py +++ b/smartsim/settings/arguments/launch/alps.py @@ -30,7 +30,10 @@ from smartsim._core.arguments.shell import ShellLaunchArguments from smartsim._core.dispatch import dispatch -from smartsim._core.shell.shell_launcher import ShellLauncher, make_shell_format_fn +from smartsim._core.launcher_.shell.shell_launcher import ( + ShellLauncher, + make_shell_format_fn, +) from smartsim.log import get_logger from ...common import set_check_input diff --git a/smartsim/settings/arguments/launch/local.py b/smartsim/settings/arguments/launch/local.py index 2c589cb48d..d96e287c42 100644 --- a/smartsim/settings/arguments/launch/local.py +++ b/smartsim/settings/arguments/launch/local.py @@ -30,7 +30,10 @@ from smartsim._core.arguments.shell import ShellLaunchArguments from smartsim._core.dispatch import dispatch -from smartsim._core.shell.shell_launcher import ShellLauncher, make_shell_format_fn +from smartsim._core.launcher_.shell.shell_launcher import ( + ShellLauncher, + make_shell_format_fn, +) from smartsim.log import get_logger from ...common import StringArgument, set_check_input diff --git a/smartsim/settings/arguments/launch/lsf.py b/smartsim/settings/arguments/launch/lsf.py index ed24271985..92106cd561 100644 --- a/smartsim/settings/arguments/launch/lsf.py +++ b/smartsim/settings/arguments/launch/lsf.py @@ -32,7 +32,10 @@ from smartsim._core.arguments.shell import ShellLaunchArguments from smartsim._core.dispatch import EnvironMappingType, dispatch -from smartsim._core.shell.shell_launcher import ShellLauncher, ShellLauncherCommand +from smartsim._core.launcher_.shell.shell_launcher import ( + ShellLauncher, + ShellLauncherCommand, +) from smartsim.log import get_logger from ...common import set_check_input diff --git a/smartsim/settings/arguments/launch/mpi.py b/smartsim/settings/arguments/launch/mpi.py index ce8c43aa5c..1f8e009a95 100644 --- a/smartsim/settings/arguments/launch/mpi.py +++ b/smartsim/settings/arguments/launch/mpi.py @@ -30,7 +30,10 @@ from smartsim._core.arguments.shell import ShellLaunchArguments from smartsim._core.dispatch import dispatch -from smartsim._core.shell.shell_launcher import ShellLauncher, make_shell_format_fn +from smartsim._core.launcher_.shell.shell_launcher import ( + ShellLauncher, + make_shell_format_fn, +) from smartsim.log import get_logger from ...common import set_check_input diff --git a/smartsim/settings/arguments/launch/pals.py b/smartsim/settings/arguments/launch/pals.py index d48dc799b9..9ff2c8652d 100644 --- a/smartsim/settings/arguments/launch/pals.py +++ b/smartsim/settings/arguments/launch/pals.py @@ -30,7 +30,10 @@ from smartsim._core.arguments.shell import ShellLaunchArguments from smartsim._core.dispatch import dispatch -from smartsim._core.shell.shell_launcher import ShellLauncher, make_shell_format_fn +from smartsim._core.launcher_.shell.shell_launcher import ( + ShellLauncher, + make_shell_format_fn, +) from smartsim.log import get_logger from ...common import set_check_input diff --git a/smartsim/settings/arguments/launch/slurm.py b/smartsim/settings/arguments/launch/slurm.py index c5dceff628..9de2333065 100644 --- a/smartsim/settings/arguments/launch/slurm.py +++ b/smartsim/settings/arguments/launch/slurm.py @@ -27,14 +27,14 @@ from __future__ import annotations import os +import os.path import pathlib import re -import subprocess import typing as t from smartsim._core.arguments.shell import ShellLaunchArguments from smartsim._core.dispatch import EnvironMappingType, dispatch -from smartsim._core.shell.shell_launcher import ShellLauncher, ShellLauncherCommand +from smartsim._core.launcher_.slurm.slurm_launcher import SlurmLauncher, SrunCommand from smartsim.log import get_logger from ...common import set_check_input @@ -44,27 +44,42 @@ def _as_srun_command( - args: ShellLaunchArguments, + args: SlurmLaunchArguments, exe: t.Sequence[str], path: pathlib.Path, env: EnvironMappingType, stdout_path: pathlib.Path, stderr_path: pathlib.Path, -) -> ShellLauncherCommand: - command_tuple = ( - "srun", +) -> SrunCommand: + # TODO: Should probably use the `SmartSimEntity` name here, but right now + # there is no way to access it. In the meantime, we will just use the + # exe being called. + app, *_ = exe + *_, job_name = app.rsplit(os.path.sep, maxsplit=1) + + # TODO: This logic should probably be moved onto the `SlurmLaunchArguments` + # themselves to remove the protected access, and remove it from the + # formatted launch args sequence + # pylint: disable-next=protected-access + job_id = args._launch_args.get("jobid", None) + + csv_env, extra_env_str = args.format_comma_sep_env_vars(dict(env)) + extra_env = dict(var.split("=", maxsplit=1) for var in extra_env_str) + # ^^^^^^^^^^^^^^^^^^^^^^^^^^ + # TODO: Really dislike how we are assuming the format of these strings to + # be in the same format as if they were passed to `env`. Can we + # refactor this method to return a `Mapping[str, str]` instead? + srun_args = ( *(args.format_launch_args() or ()), - f"--output={stdout_path}", - f"--error={stderr_path}", - "--", - *exe, - ) - return ShellLauncherCommand( - env, path, subprocess.DEVNULL, subprocess.DEVNULL, command_tuple + f"--chdir={os.fspath(path)}", + f"--output={os.fspath(stdout_path)}", + f"--error={os.fspath(stderr_path)}", + f"--export=ALL,{csv_env}" if csv_env else "--export=ALL", ) + return SrunCommand(job_name, srun_args, exe, job_id, extra_env) -@dispatch(with_format=_as_srun_command, to_launcher=ShellLauncher) +@dispatch(with_format=_as_srun_command, to_launcher=SlurmLauncher) class SlurmLaunchArguments(ShellLaunchArguments): def launcher_str(self) -> str: """Get the string representation of the launcher @@ -284,7 +299,7 @@ def format_env_vars(self, env_vars: t.Mapping[str, str | None]) -> list[str]: def format_comma_sep_env_vars( self, env_vars: t.Dict[str, t.Optional[str]] - ) -> t.Union[t.Tuple[str, t.List[str]], None]: + ) -> t.Tuple[str, t.List[str]]: """Build environment variable string for Slurm Slurm takes exports in comma separated lists diff --git a/smartsim/wlm/slurm.py b/smartsim/wlm/slurm.py index e1b24b906d..722f4cec36 100644 --- a/smartsim/wlm/slurm.py +++ b/smartsim/wlm/slurm.py @@ -29,9 +29,9 @@ import typing as t from shutil import which -from .._core.launcher.slurm.slurm_commands import salloc, scancel, scontrol, sinfo -from .._core.launcher.slurm.slurm_parser import parse_salloc, parse_salloc_error from .._core.launcher.util.launcher_util import ComputeNode, Partition +from .._core.launcher_.slurm.slurm_commands import salloc, scancel, scontrol, sinfo +from .._core.launcher_.slurm.slurm_parser import parse_salloc, parse_salloc_error from ..error import ( AllocationError, LauncherError, diff --git a/tests/_legacy/on_wlm/test_slurm_commands.py b/tests/_legacy/on_wlm/test_slurm_commands.py index b44d309650..68e9e88199 100644 --- a/tests/_legacy/on_wlm/test_slurm_commands.py +++ b/tests/_legacy/on_wlm/test_slurm_commands.py @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest -from smartsim._core.launcher.slurm.slurm_commands import * +from smartsim._core.launcher_.slurm.slurm_commands import * from smartsim.error.errors import LauncherError # retrieved from pytest fixtures diff --git a/tests/_legacy/test_slurm_parser.py b/tests/_legacy/test_slurm_parser.py deleted file mode 100644 index e73ec7ed7e..0000000000 --- a/tests/_legacy/test_slurm_parser.py +++ /dev/null @@ -1,285 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -from smartsim._core.launcher.slurm import slurm_parser - -# The tests in this file belong to the group_b group -pytestmark = pytest.mark.group_b - - -# -- Salloc --------------------------------------------------------- - - -def test_parse_salloc(): - output = ( - "salloc: Granted job allocation 118568\n" - "salloc: Waiting for resource configuration\n" - "salloc: Nodes nid00116 are ready for job" - ) - alloc_id = slurm_parser.parse_salloc(output) - assert alloc_id == "118568" - - -def test_parse_salloc_extra(): - output = ( - "salloc: Running node verification test prior to job execution.\n" - "salloc: Will use xperf arguments in SLURM_SUBMIT_DIR/xperf-args if it exists.\n" - "salloc: Results saved to SLURM_SUBMIT_DIR/nodeverify.jobid\n" - "\nsalloc: Granted job allocation 22942\n" - "salloc: Waiting for resource configuration\n" - "salloc: Nodes prod76-0006 are ready for job\n" - ) - alloc_id = slurm_parser.parse_salloc(output) - assert alloc_id == "22942" - - -def test_parse_salloc_high(): - output = ( - "salloc: Granted job allocation 29917893\n" - "salloc: Waiting for resource configuration\n" - "salloc: Nodes nid00034 are ready for job\n" - ) - alloc_id = slurm_parser.parse_salloc(output) - assert alloc_id == "29917893" - - -def test_parse_salloc_error(): - output = "salloc: error: Job submit/allocate failed: Job dependency problem" - error = "Job submit/allocate failed: Job dependency problem" - parsed_error = slurm_parser.parse_salloc_error(output) - assert error == parsed_error - - -def test_parse_salloc_error_2(): - output = ( - "salloc: unrecognized option '--no-a-option'\n" - "Try 'salloc --help' for more information\n" - ) - error = "unrecognized option '--no-a-option'" - parsed_error = slurm_parser.parse_salloc_error(output) - assert error == parsed_error - - -def test_parse_salloc_error_3(): - output = ( - "salloc: Running node verification test prior to job execution.\n" - "salloc: Will use xperf arguments in SLURM_SUBMIT_DIR/xperf-args if it exists.\n" - "salloc: Results saved to SLURM_SUBMIT_DIR/nodeverify.jobid\n" - "\nsalloc: error: Job submit/allocate failed: Invalid node name specified\n" - ) - error = "Job submit/allocate failed: Invalid node name specified" - parsed_error = slurm_parser.parse_salloc_error(output) - assert error == parsed_error - - -def test_parse_salloc_error_4(): - output = ( - "salloc: error: No hardware architecture specified (-C)!\n" - "salloc: error: Job submit/allocate failed: Unspecified error\n" - ) - error = "No hardware architecture specified (-C)!" - parsed_error = slurm_parser.parse_salloc_error(output) - assert error == parsed_error - - -# -- sstat --------------------------------------------------------- - - -def test_parse_sstat_nodes(): - """Parse nodes from sstat called with args -i -a -p -n - PrologFlags=Alloc, Contain - """ - output = "118594.extern|nid00028|38671|\n" "118594.0|nid00028|38703|" - nodes = ["nid00028"] - parsed_nodes = slurm_parser.parse_sstat_nodes(output, "118594") - assert nodes == parsed_nodes - - -def test_parse_sstat_nodes_1(): - """Parse nodes from sstat called with args -i -a -p -n - PrologFlags=Alloc - """ - output = "22942.0|prod76-0006|354345|" - nodes = ["prod76-0006"] - parsed_nodes = slurm_parser.parse_sstat_nodes(output, "22942.0") - assert nodes == parsed_nodes - - -def test_parse_sstat_nodes_2(): - """Parse nodes from sstat called with args -i -a -p -n - PrologFlags=Alloc,Contain - """ - output = "29917893.extern|nid00034|44860|\n" "29917893.0|nid00034|44887|\n" - nodes = ["nid00034"] - parsed_nodes = slurm_parser.parse_sstat_nodes(output, "29917893.0") - assert nodes == parsed_nodes - - -def test_parse_sstat_nodes_3(): - """Parse nodes from sstat called with args -i -a -p -n - Special case where interactive queue also causes there - to be a constantly running .0 job - PrologFlags=Alloc,Contain - """ - output = ( - "29917893.extern|nid00034|44860|\n" - "29917893.0|nid00034|44887,45151,45152,45153,45154,45155|\n" - "29917893.2|nid00034|45174|\n" - ) - nodes = ["nid00034"] - parsed_nodes = slurm_parser.parse_sstat_nodes(output, "29917893.2") - assert nodes == parsed_nodes - - -def test_parse_sstat_nodes_4(): - """Parse nodes from sstat called with args -i -a -p -n - - with extra steps - - PrologFlags=Alloc,Contain - """ - output = ( - "30000.extern|nid00034|44860|\n" - "30000.batch|nid00034|42352" - "30000.0|nid00034|44887,45151,45152,45153,45154,45155|\n" - "30000.1|nid00035|45174|\n" - "30000.2|nid00036|45174,32435|\n" - ) - nodes = set(["nid00034", "nid00035", "nid00036"]) - parsed_nodes = set(slurm_parser.parse_sstat_nodes(output, "30000")) - assert nodes == parsed_nodes - - -def test_parse_sstat_nodes_4(): - """Parse nodes from sstat called with args -i -a -p -n - - with extra steps - - PrologFlags=Alloc,Contain - """ - output = ( - "30000.extern|nid00034|44860|\n" - "30000.batch|nid00034|42352" - "30000.0|nid00034|44887,45151,45152,45153,45154,45155|\n" - "30000.1|nid00035|45174|\n" - "30000.2|nid00036|45174,32435|\n" - ) - nodes = set(["nid00034", "nid00035", "nid00036"]) - parsed_nodes = set(slurm_parser.parse_sstat_nodes(output, "30000")) - assert nodes == parsed_nodes - - -def test_parse_sstat_nodes_5(): - """Parse nodes from sstat called with args -i -a -p -n - Special case where interactive queue also causes there - to be a constantly running .0 job - PrologFlags=Alloc,Contain - """ - output = ( - "29917893.extern|nid00034|44860|\n" - "29917893.20|nid00034|44887,45151,45152,45153,45154,45155|\n" - "29917893.2|nid00034|45174|\n" - ) - nodes = ["nid00034"] - parsed_nodes = slurm_parser.parse_sstat_nodes(output, "29917893.2") - assert nodes == parsed_nodes - - -# -- sacct --------------------------------------------------------- - - -def test_parse_sacct_step_id(): - output = ( - "SmartSim|119225|\n" - "extern|119225.extern|\n" - "m1-119225.0|119225.0|\n" - "m2-119225.1|119225.1|" - ) - step_id = "119225.0" - parsed_step_id = slurm_parser.parse_step_id_from_sacct(output, "m1-119225.0") - assert step_id == parsed_step_id - - -def test_parse_sacct_step_id_2(): - output = ( - "SmartSim|119225|\n" - "extern|119225.extern|\n" - "m1-119225.0|119225.0|\n" - "m2-119225.1|119225.1|\n" - "featurestore_0-119225.2|119225.2|\n" - "n1-119225.3|119225.3|" - ) - step_id = "119225.2" - parsed_step_id = slurm_parser.parse_step_id_from_sacct( - output, "featurestore_0-119225.2" - ) - assert step_id == parsed_step_id - - -def test_parse_sacct_step_id_2(): - output = ( - "SmartSim|962333|\n" - "extern|962333.extern|\n" - "python-962333.0|962333.0|\n" - "python-962333.1|962333.1|\n" - "cti_dlaunch1.0|962333.2|\n" - "cti_dlaunch1.0|962333.3|" - ) - step_id = "962333.1" - parsed_step_id = slurm_parser.parse_step_id_from_sacct(output, "python-962333.1") - assert step_id == parsed_step_id - - -def test_parse_sacct_status(): - """test retrieval of status and exitcode - PrologFlags=Alloc,Contain - """ - output = "29917893.2|COMPLETED|0:0|\n" - status = ("COMPLETED", "0") - parsed_status = slurm_parser.parse_sacct(output, "29917893.2") - assert status == parsed_status - - -def test_parse_sacct_status_1(): - """test retrieval of status and exitcode - PrologFlags=Alloc - """ - output = "22999.0|FAILED|1:0|\n" - status = ("FAILED", "1") - parsed_status = slurm_parser.parse_sacct(output, "22999.0") - assert status == parsed_status - - -def test_parse_sacct_status_2(): - """test retrieval of status and exitcode - PrologFlags=Alloc - """ - output = "22999.10|COMPLETED|0:0|\n22999.1|FAILED|1:0|\n" - status = ("FAILED", "1") - parsed_status = slurm_parser.parse_sacct(output, "22999.1") - assert status == parsed_status diff --git a/tests/temp_tests/test_settings/test_alpsLauncher.py b/tests/temp_tests/test_settings/test_alpsLauncher.py index 5abfbb9c76..9e2aa74e9b 100644 --- a/tests/temp_tests/test_settings/test_alpsLauncher.py +++ b/tests/temp_tests/test_settings/test_alpsLauncher.py @@ -29,7 +29,7 @@ import pytest -from smartsim._core.shell.shell_launcher import ShellLauncherCommand +from smartsim._core.launcher_.shell.shell_launcher import ShellLauncherCommand from smartsim.settings import LaunchSettings from smartsim.settings.arguments.launch.alps import ( AprunLaunchArguments, diff --git a/tests/temp_tests/test_settings/test_localLauncher.py b/tests/temp_tests/test_settings/test_localLauncher.py index 6576b2249c..84c0bf1b59 100644 --- a/tests/temp_tests/test_settings/test_localLauncher.py +++ b/tests/temp_tests/test_settings/test_localLauncher.py @@ -29,7 +29,7 @@ import pytest -from smartsim._core.shell.shell_launcher import ShellLauncherCommand +from smartsim._core.launcher_.shell.shell_launcher import ShellLauncherCommand from smartsim.settings import LaunchSettings from smartsim.settings.arguments.launch.local import ( LocalLaunchArguments, diff --git a/tests/temp_tests/test_settings/test_mpiLauncher.py b/tests/temp_tests/test_settings/test_mpiLauncher.py index 57be23ee2b..b0cfdbb051 100644 --- a/tests/temp_tests/test_settings/test_mpiLauncher.py +++ b/tests/temp_tests/test_settings/test_mpiLauncher.py @@ -31,7 +31,7 @@ import pytest -from smartsim._core.shell.shell_launcher import ShellLauncherCommand +from smartsim._core.launcher_.shell.shell_launcher import ShellLauncherCommand from smartsim.settings import LaunchSettings from smartsim.settings.arguments.launch.mpi import ( MpiexecLaunchArguments, diff --git a/tests/temp_tests/test_settings/test_palsLauncher.py b/tests/temp_tests/test_settings/test_palsLauncher.py index d38d1842c6..2831adf83c 100644 --- a/tests/temp_tests/test_settings/test_palsLauncher.py +++ b/tests/temp_tests/test_settings/test_palsLauncher.py @@ -30,7 +30,7 @@ import pytest -from smartsim._core.shell.shell_launcher import ShellLauncherCommand +from smartsim._core.launcher_.shell.shell_launcher import ShellLauncherCommand from smartsim.settings import LaunchSettings from smartsim.settings.arguments.launch.pals import ( PalsMpiexecLaunchArguments, diff --git a/tests/temp_tests/test_settings/test_slurmLauncher.py b/tests/temp_tests/test_settings/test_slurmLauncher.py index 6be9b5542a..86b69c0537 100644 --- a/tests/temp_tests/test_settings/test_slurmLauncher.py +++ b/tests/temp_tests/test_settings/test_slurmLauncher.py @@ -27,7 +27,7 @@ import pytest -from smartsim._core.shell.shell_launcher import ShellLauncherCommand +from smartsim._core.launcher_.slurm.slurm_launcher import SrunCommand from smartsim.settings import LaunchSettings from smartsim.settings.arguments.launch.slurm import ( SlurmLaunchArguments, @@ -291,108 +291,94 @@ def test_set_het_groups(monkeypatch): @pytest.mark.parametrize( - "args, expected", + "args, expected_srun_cmd_line_args", ( pytest.param( {}, - ( - "srun", - "--output=output.txt", - "--error=error.txt", - "--", - "echo", - "hello", - "world", - ), + (), id="Empty Args", ), pytest.param( {"N": "1"}, ( - "srun", "-N", "1", - "--output=output.txt", - "--error=error.txt", - "--", - "echo", - "hello", - "world", ), id="Short Arg", ), pytest.param( {"nodes": "1"}, - ( - "srun", - "--nodes=1", - "--output=output.txt", - "--error=error.txt", - "--", - "echo", - "hello", - "world", - ), + ("--nodes=1",), id="Long Arg", ), pytest.param( {"v": None}, - ( - "srun", - "-v", - "--output=output.txt", - "--error=error.txt", - "--", - "echo", - "hello", - "world", - ), + ("-v",), id="Short Arg (No Value)", ), pytest.param( {"verbose": None}, - ( - "srun", - "--verbose", - "--output=output.txt", - "--error=error.txt", - "--", - "echo", - "hello", - "world", - ), + ("--verbose",), id="Long Arg (No Value)", ), pytest.param( {"nodes": "1", "n": "123"}, ( - "srun", "--nodes=1", "-n", "123", - "--output=output.txt", - "--error=error.txt", - "--", - "echo", - "hello", - "world", ), id="Short and Long Args", ), ), ) -def test_formatting_launch_args(args, expected, test_dir): - shell_launch_cmd = _as_srun_command( +@pytest.mark.parametrize( + "env, env_csv, extra_env", + ( + pytest.param({}, None, {}, id="Empty Env"), + pytest.param( + {"SPAM": "eggs"}, "SPAM=eggs", {}, id="Env with no commas in var vals" + ), + pytest.param( + {"SPAM": "eggs,ham"}, + "SPAM", + {"SPAM": "eggs,ham"}, + id="Env with commas in var vals", + ), + ), +) +def test_formatting_launch_args( + test_dir, monkeypatch, args, expected_srun_cmd_line_args, env, env_csv, extra_env +): + monkeypatch.setenv("SLURM_JOB_ID", "MOCK-JOB-ID") + monkeypatch.setattr( + "smartsim._core.utils.helpers.expand_exe_path", + lambda exe: f"/full/path/to/{exe}", + ) + monkeypatch.setattr( + "smartsim._core.utils.helpers.create_short_id_str", lambda: "12345" + ) + srun_cmd = _as_srun_command( args=SlurmLaunchArguments(args), exe=("echo", "hello", "world"), path=test_dir, - env={}, + env=env, stdout_path="output.txt", stderr_path="error.txt", ) - assert isinstance(shell_launch_cmd, ShellLauncherCommand) - assert shell_launch_cmd.command_tuple == expected - assert shell_launch_cmd.path == test_dir - assert shell_launch_cmd.env == {} - assert shell_launch_cmd.stdout == subprocess.DEVNULL - assert shell_launch_cmd.stderr == subprocess.DEVNULL + assert isinstance(srun_cmd, SrunCommand) + assert srun_cmd.as_command_line_args() == ( + "/full/path/to/srun", + *expected_srun_cmd_line_args, + f"--chdir={test_dir}", + "--output=output.txt", + "--error=error.txt", + "--export=ALL" + (f",{env_csv}" if env_csv else ""), + "--job-name=echo-12345", + f"--jobid=MOCK-JOB-ID", + "--", + "echo", + "hello", + "world", + ) + assert srun_cmd.env == extra_env diff --git a/tests/_legacy/test_helpers.py b/tests/test_helpers.py similarity index 93% rename from tests/_legacy/test_helpers.py rename to tests/test_helpers.py index 7b453905cb..5a193e470d 100644 --- a/tests/_legacy/test_helpers.py +++ b/tests/test_helpers.py @@ -26,6 +26,7 @@ import collections import signal +import time import pytest @@ -93,6 +94,25 @@ def test_decode_raises_on_empty(): helpers.decode_cmd("") +def test_threaded_map(): + assert (0, 1, 4, 9, 16) == tuple(helpers.threaded_map(lambda x: x * x, range(5))) + + +def test_threaded_map_is_async(): + sleep_time = 1 + seq = range(5) + + def some_long_io_op(x): + time.sleep(sleep_time) + return x + x + + start = time.perf_counter() + res = helpers.threaded_map(some_long_io_op, seq) + end = time.perf_counter() + assert (0, 2, 4, 6, 8) == tuple(res) + assert end - start < (sleep_time * len(seq)) / 2 + + class MockSignal: def __init__(self): self.signal_handlers = collections.defaultdict(lambda: signal.SIG_IGN) diff --git a/tests/test_shell_launcher.py b/tests/test_shell_launcher.py index f371d793f1..5fad8e58b1 100644 --- a/tests/test_shell_launcher.py +++ b/tests/test_shell_launcher.py @@ -29,7 +29,7 @@ import contextlib import os import pathlib -import subprocess +import subprocess as sp import sys import textwrap import unittest.mock @@ -37,7 +37,10 @@ import psutil import pytest -from smartsim._core.shell.shell_launcher import ShellLauncher, ShellLauncherCommand, sp +from smartsim._core.launcher_.shell.shell_launcher import ( + ShellLauncher, + ShellLauncherCommand, +) from smartsim._core.utils import helpers from smartsim._core.utils.shell import * from smartsim.entity import entity @@ -143,8 +146,8 @@ def test_check_popen_inputs(shell_launcher: ShellLauncher, test_dir: str): cmd = ShellLauncherCommand( {}, pathlib.Path(test_dir) / "directory_dne", - subprocess.DEVNULL, - subprocess.DEVNULL, + sp.DEVNULL, + sp.DEVNULL, EchoHelloWorldEntity().as_executable_sequence(), ) with pytest.raises(ValueError): @@ -156,7 +159,7 @@ def test_shell_launcher_start_calls_popen( ): """Test that the process leading up to the shell launcher popen call was correct""" with unittest.mock.patch( - "smartsim._core.shell.shell_launcher.sp.Popen" + "smartsim._core.launcher_.shell.shell_launcher.sp.Popen" ) as mock_open: _ = shell_launcher.start(shell_cmd) mock_open.assert_called_once() @@ -167,7 +170,7 @@ def test_shell_launcher_start_calls_popen_with_value( ): """Test that popen was called with correct values""" with unittest.mock.patch( - "smartsim._core.shell.shell_launcher.sp.Popen" + "smartsim._core.launcher_.shell.shell_launcher.sp.Popen" ) as mock_open: _ = shell_launcher.start(shell_cmd) mock_open.assert_called_once_with( diff --git a/tests/test_slurm_launcher.py b/tests/test_slurm_launcher.py new file mode 100644 index 0000000000..70c8102c0b --- /dev/null +++ b/tests/test_slurm_launcher.py @@ -0,0 +1,342 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +from __future__ import annotations + +import itertools +import os +import pathlib +import shutil +import time +import typing as t + +import pytest + +from smartsim._core.launcher_.slurm.slurm_launcher import ( + SlurmLauncher, + SrunCommand, + _LaunchedJobInfo, +) +from smartsim._core.utils.launcher import create_job_id +from smartsim.error import errors +from smartsim.status import JobStatus + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +def test_srun_command_raises_if_no_alloc_provided(monkeypatch, test_dir): + monkeypatch.delenv("SLURM_JOB_ID", raising=False) + with pytest.raises(errors.AllocationError): + SrunCommand( + "HelloWorld", + ["-N", "1", "-n", "3", f"--chdir={os.fspath(test_dir)}"], + ["echo", "hello", "world"], + None, + {}, + ) + + +def test_srun_command_appends_job_tracking_flags(monkeypatch): + monkeypatch.setattr( + "smartsim._core.utils.helpers.expand_exe_path", + lambda exe: f"/full/path/to/{exe}", + ) + monkeypatch.setattr( + "smartsim._core.utils.helpers.create_short_id_str", lambda: "12345" + ) + srun = SrunCommand( + "HelloWorld", + ["-N", "1", "-n", "3"], + ["echo", "hello", "world"], + "mock-job-id", + {}, + ) + assert srun.as_command_line_args() == ( + "/full/path/to/srun", + "-N", + "1", + "-n", + "3", + "--job-name=HelloWorld-12345", + "--jobid=mock-job-id", + "--", + "echo", + "hello", + "world", + ) + + +@pytest.fixture +def make_srun_command(test_dir): + def inner( + srun_flags: t.Sequence[str], + exe: t.Sequence[str], + *, + use_current_alloc: bool = False, + ) -> SrunCommand: + *_, name = exe[0].split(os.path.sep) + return SrunCommand( + name, + [ + *srun_flags, + f"--chdir={os.fspath(test_dir)}", + f"--output={os.path.join(test_dir, f'{name}.out')}", + f"--error={os.path.join(test_dir, f'{name}.err')}", + ], + exe, + job_id=None if use_current_alloc else "MOCK-JOB-ID", + ) + + yield inner + + +def test_slurm_launcher_can_start_a_command(monkeypatch, make_srun_command): + monkeypatch.setattr( + "smartsim._core.launcher_.slurm.slurm_commands.sacct", + lambda *_, **__: ("out", "err"), + ) + monkeypatch.setattr( + "smartsim._core.launcher_.slurm.slurm_parser.parse_step_id_from_sacct", + lambda *_, **__: "mock-step-id", + ) + launcher = SlurmLauncher() + srun = make_srun_command(["-N", "1", "-n", "1"], ["echo", "spam", "eggs"]) + monkeypatch.setattr(srun, "start", lambda *_, **__: ...) + id_ = launcher.start(srun) + info = launcher._launched[id_] + assert info.slurm_id == "mock-step-id" + assert info.name == srun.name + assert info.status_override is None + + +def test_slurm_launcher_errors_if_cannot_parse_id(monkeypatch, make_srun_command): + monkeypatch.setattr( + "smartsim._core.launcher_.slurm.slurm_commands.sacct", + lambda *_, **__: ("out", "err"), + ) + monkeypatch.setattr( + "smartsim._core.launcher_.slurm.slurm_parser.parse_step_id_from_sacct", + lambda *_, **__: None, + ) + monkeypatch.setattr(time, "sleep", lambda *_, **__: ...) + launcher = SlurmLauncher() + srun = make_srun_command(["-N", "1", "-n", "1"], ["echo", "spam", "eggs"]) + monkeypatch.setattr(srun, "start", lambda *_, **__: ...) + with pytest.raises( + errors.LauncherError, match=r"Could not find id of launched job step" + ): + launcher.start(srun) + + +def fail_if_called(*_, **__): + assert False, "Function unexpectedly called" + + +def test_slurm_launcher_will_not_fetch_statuses_if_any_id_is_not_recognized( + monkeypatch, +): + known_id = create_job_id() + launcher = SlurmLauncher( + launched={known_id: _LaunchedJobInfo("mock-id", "mock-job")} + ) + unknown_id = create_job_id() + + monkeypatch.setattr(launcher, "_get_status", fail_if_called) + with pytest.raises( + errors.LauncherError, match=f"has not launched a job with id `{unknown_id}`" + ): + launcher.get_status(known_id, unknown_id) + + +@pytest.mark.parametrize( + "launched", + ( + pytest.param( + { + create_job_id(): _LaunchedJobInfo("mock-id-1", "mock-job-1"), + create_job_id(): _LaunchedJobInfo("mock-id-2", "mock-job-2"), + }, + id="No override", + ), + pytest.param( + { + create_job_id(): _LaunchedJobInfo( + "mock-id-1", "mock-job-1", status_override=JobStatus.FAILED + ), + create_job_id(): _LaunchedJobInfo( + "mock-id-2", "mock-job-2", status_override=JobStatus.CANCELLED + ), + }, + id="override", + ), + pytest.param( + { + create_job_id(): _LaunchedJobInfo("mock-id-1", "mock-job-1"), + create_job_id(): _LaunchedJobInfo( + "mock-id-2", "mock-job-2", status_override=JobStatus.CANCELLED + ), + create_job_id(): _LaunchedJobInfo("mock-id-3", "mock-job-3"), + }, + id="Both overrids and no override", + ), + ), +) +@pytest.mark.parametrize( + "mock_sacct_out, mock_fetch_status", + ( + pytest.param("RUNNING", JobStatus.RUNNING, id="running"), + pytest.param("CONFIGURING", JobStatus.RUNNING, id="configuring"), + pytest.param("STAGE_OUT", JobStatus.RUNNING, id="stage_out"), + pytest.param("COMPLETED", JobStatus.COMPLETED, id="completed"), + pytest.param("DEADLINE", JobStatus.COMPLETED, id="deadline"), + pytest.param("TIMEOUT", JobStatus.COMPLETED, id="timeout"), + pytest.param("BOOT_FAIL", JobStatus.FAILED, id="boot_fail"), + pytest.param("FAILED", JobStatus.FAILED, id="failed"), + pytest.param("NODE_FAIL", JobStatus.FAILED, id="node_fail"), + pytest.param("OUT_OF_MEMORY", JobStatus.FAILED, id="out_of_memory"), + pytest.param("CANCELLED", JobStatus.CANCELLED, id="cancelled"), + pytest.param("CANCELLED+", JobStatus.CANCELLED, id="cancelled"), + pytest.param("REVOKED", JobStatus.CANCELLED, id="revoked"), + pytest.param("PENDING", JobStatus.PAUSED, id="pending"), + pytest.param("PREEMPTED", JobStatus.PAUSED, id="preempted"), + pytest.param("RESV_DEL_HOLD", JobStatus.PAUSED, id="resv_del_hold"), + pytest.param("REQUEUE_FED", JobStatus.PAUSED, id="requeue_fed"), + pytest.param("REQUEUE_HOLD", JobStatus.PAUSED, id="requeue_hold"), + pytest.param("REQUEUED", JobStatus.PAUSED, id="requeued"), + pytest.param("RESIZING", JobStatus.PAUSED, id="resizing"), + pytest.param("SIGNALING", JobStatus.PAUSED, id="signaling"), + pytest.param("SPECIAL_EXIT", JobStatus.PAUSED, id="special_exit"), + pytest.param("STOPPED", JobStatus.PAUSED, id="stopped"), + pytest.param("SUSPENDED", JobStatus.PAUSED, id="suspended"), + pytest.param("NONSENSE", JobStatus.UNKNOWN, id="nonsense"), + ), +) +def test_slurm_will_fetch_statuses( + monkeypatch, launched, mock_sacct_out, mock_fetch_status +): + launcher = SlurmLauncher(launched=launched) + monkeypatch.setattr( + "smartsim._core.launcher_.slurm.slurm_commands.sacct", + lambda *_, **__: ("out", "err"), + ) + monkeypatch.setattr( + "smartsim._core.launcher_.slurm.slurm_parser.parse_sacct", + lambda *_, **__: (mock_sacct_out,), + ) + assert launcher.get_status(*launched) == { + k: v.status_override or mock_fetch_status for k, v in launched.items() + } + + +def test_slurm_launcher_will_not_stop_jobs_if_any_id_is_not_recognized(monkeypatch): + known_id = create_job_id() + launcher = SlurmLauncher( + launched={known_id: _LaunchedJobInfo("mock-id", "mock-job")} + ) + unknown_id = create_job_id() + + monkeypatch.setattr(launcher, "_stop_job", fail_if_called) + with pytest.raises( + errors.LauncherError, match=f"has not launched a job with id `{unknown_id}`" + ): + launcher.get_status(known_id, unknown_id) + + +@pytest.mark.parametrize("is_het_job", [False, True]) +@pytest.mark.parametrize("scancel_rc", [0, 123]) +def test_slurm_launcher_stops_jobs(monkeypatch, is_het_job, scancel_rc): + if is_het_job: + monkeypatch.setenv("SLURM_HET_SIZE", "123456") + else: + monkeypatch.delenv("SLURM_HET_SIZE", raising=False) + monkeypatch.setattr( + "smartsim._core.launcher_.slurm.slurm_commands.scancel", + lambda *_, **__: (scancel_rc, "out", "err"), + ) + id_ = create_job_id() + info = _LaunchedJobInfo("mock-id", "mock-job") + launcher = SlurmLauncher(launched={id_: info}) + monkeypatch.setattr( + launcher, + "get_status", + lambda *ids: dict(zip(ids, itertools.repeat(JobStatus.CANCELLED))), + ) + assert launcher.stop_jobs(id_) == {id_: JobStatus.CANCELLED} + assert info.status_override == ( + JobStatus.CANCELLED if is_het_job and scancel_rc != 0 else None + ) + + +requires_slurm = pytest.mark.skipif( + shutil.which("srun") is None + or shutil.which("sbatch") is None + or shutil.which("sacct") is None + or shutil.which("scancel") is None, + reason="Slurm utilities could be found", +) + + +def requires_alloc_size(num_nodes): + try: + alloc_size = int(os.environ.get("SLURM_NNODES", None)) + except (TypeError, ValueError): + alloc_size = None + return pytest.mark.skipif( + alloc_size is None or alloc_size < num_nodes, + reason=f"Test requires an allocation with at least {num_nodes} nodes", + ) + + +@requires_slurm +@requires_alloc_size(1) +def test_srun_hello_world(make_srun_command, test_dir): + launcher = SlurmLauncher() + srun = make_srun_command( + ["-N", "1", "-n", "3"], ["echo", "hello world"], use_current_alloc=True + ) + id_ = launcher.start(srun) + time.sleep(1) + assert launcher.get_status(id_)[id_] == JobStatus.COMPLETED + with open(os.path.join(test_dir, "echo.out"), "r") as fd: + assert fd.read() == "hello world\n" * 3 + + +@pytest.mark.xfail(reason=r"Slurm launcher cannout parse `CANCELLED by \d+` syntax") +@requires_slurm +@requires_alloc_size(1) +def test_srun_sleep_for_two_min_with_cancel(make_srun_command): + launcher = SlurmLauncher() + srun = make_srun_command( + ["-N", "1", "-n", "1"], ["sleep", "120"], use_current_alloc=True + ) + id_ = launcher.start(srun) + time.sleep(1) + assert launcher.get_status(id_)[id_] == JobStatus.RUNNING + launcher.stop_jobs(id_) + time.sleep(1) + assert launcher.get_status(id_)[id_] == JobStatus.CANCELLED diff --git a/tests/test_slurm_parser.py b/tests/test_slurm_parser.py new file mode 100644 index 0000000000..5f117e18cf --- /dev/null +++ b/tests/test_slurm_parser.py @@ -0,0 +1,295 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import textwrap + +import pytest + +from smartsim._core.launcher_.slurm import slurm_parser + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +# -- Salloc --------------------------------------------------------- + + +@pytest.mark.parametrize( + "output, expected_id", + ( + pytest.param( + textwrap.dedent("""\ + salloc: Granted job allocation 118568 + salloc: Waiting for resource configuration + salloc: Nodes nid00116 are ready for job + """), + "118568", + id="Simple", + ), + pytest.param( + textwrap.dedent("""\ + salloc: Running node verification test prior to job execution. + salloc: Will use xperf arguments in SLURM_SUBMIT_DIR/xperf-args if it exists. + salloc: Results saved to SLURM_SUBMIT_DIR/nodeverify.jobid + + salloc: Granted job allocation 22942 + salloc: Waiting for resource configuration + salloc: Nodes prod76-0006 are ready for job + """), + "22942", + id="Extra", + ), + pytest.param( + textwrap.dedent("""\ + salloc: Granted job allocation 29917893 + salloc: Waiting for resource configuration + salloc: Nodes nid00034 are ready for job + """), + "29917893", + id="High", + ), + ), +) +def test_parse_salloc(output, expected_id): + alloc_id = slurm_parser.parse_salloc(output) + assert alloc_id == expected_id + + +@pytest.mark.parametrize( + "output, expected_error", + ( + pytest.param( + "salloc: error: Job submit/allocate failed: Job dependency problem", + "Job submit/allocate failed: Job dependency problem", + id="Dependency Problem", + ), + pytest.param( + textwrap.dedent("""\ + salloc: unrecognized option '--no-a-option' + Try 'salloc --help' for more information + """), + "unrecognized option '--no-a-option'", + id="Bad Option", + ), + pytest.param( + textwrap.dedent("""\ + salloc: Running node verification test prior to job execution. + salloc: Will use xperf arguments in SLURM_SUBMIT_DIR/xperf-args if it exists. + salloc: Results saved to SLURM_SUBMIT_DIR/nodeverify.jobid + + salloc: error: Job submit/allocate failed: Invalid node name specified + """), + "Job submit/allocate failed: Invalid node name specified", + id="Bad Node Name", + ), + pytest.param( + textwrap.dedent("""\ + salloc: error: No hardware architecture specified (-C)! + salloc: error: Job submit/allocate failed: Unspecified error + """), + "No hardware architecture specified (-C)!", + id="Missing HW Architecture", + ), + ), +) +def test_parse_salloc_error(output, expected_error): + parsed_error = slurm_parser.parse_salloc_error(output) + assert expected_error == parsed_error + + +# -- sstat --------------------------------------------------------- + + +@pytest.mark.parametrize( + "output, job_id, nodes", + ( + pytest.param( + textwrap.dedent("""\ + 118594.extern|nid00028|38671| + 118594.0|nid00028|38703|"""), + "118594", + {"nid00028"}, + id="No suffix", + ), + pytest.param( + "22942.0|prod76-0006|354345|", "22942.0", {"prod76-0006"}, id="with suffix" + ), + pytest.param( + textwrap.dedent("""\ + 29917893.extern|nid00034|44860| + 29917893.0|nid00034|44887| + """), + "29917893.0", + {"nid00034"}, + id="with suffix and extern", + ), + pytest.param( + textwrap.dedent("""\ + 29917893.extern|nid00034|44860| + 29917893.0|nid00034|44887,45151,45152,45153,45154,45155| + 29917893.2|nid00034|45174| + """), + "29917893.2", + {"nid00034"}, + id="With interactive queue runnning `.0` job", + ), + pytest.param( + textwrap.dedent("""\ + 30000.extern|nid00034|44860| + 30000.batch|nid00034|42352 + 30000.0|nid00034|44887,45151,45152,45153,45154,45155| + 30000.1|nid00035|45174| + 30000.2|nid00036|45174,32435| + """), + "30000", + {"nid00034", "nid00035", "nid00036"}, + id="With extra steps", + ), + pytest.param( + textwrap.dedent("""\ + 29917893.extern|nid00034|44860| + 29917893.20|nid00034|44887,45151,45152,45153,45154,45155| + 29917893.2|nid00034|45174| + """), + "29917893.2", + {"nid00034"}, + id="with suffix and lines with same prefix", + ), + ), +) +def test_parse_sstat_nodes(output, job_id, nodes): + """Parse nodes from sstat called with args -i -a -p -n + PrologFlags=Alloc, Contain + """ + parsed_nodes = slurm_parser.parse_sstat_nodes(output, job_id) + assert len(nodes) == len(parsed_nodes) + assert nodes == set(parsed_nodes) + + +# -- sacct --------------------------------------------------------- + + +@pytest.mark.parametrize( + "output, step_name, step_id", + ( + pytest.param( + textwrap.dedent("""\ + SmartSim|119225| + extern|119225.extern| + m1-119225.0|119225.0| + m2-119225.1|119225.1|"""), + "m1-119225.0", + "119225.0", + id="Basic", + ), + pytest.param( + textwrap.dedent("""\ + SmartSim|119225| + extern|119225.extern| + m1-119225.0|119225.0| + m2-119225.1|119225.1| + featurestore_0-119225.2|119225.2| + n1-119225.3|119225.3|"""), + "featurestore_0-119225.2", + "119225.2", + id="New Job", + ), + pytest.param( + textwrap.dedent("""\ + SmartSim|962333| + extern|962333.extern| + python-962333.0|962333.0| + python-962333.1|962333.1| + cti_dlaunch1.0|962333.2| + cti_dlaunch1.0|962333.3|"""), + "python-962333.1", + "962333.1", + id="Very similar names", + ), + ), +) +def test_parse_sacct_step_id(output, step_name, step_id): + parsed_step_id = slurm_parser.parse_step_id_from_sacct(output, step_name) + assert step_id == parsed_step_id + + +@pytest.mark.parametrize( + "output, job_id, status", + ( + pytest.param( + "29917893.2|COMPLETED|0:0|\n", + "29917893.2", + ("COMPLETED", "0"), + id="Completed", + ), + pytest.param( + "22999.0|FAILED|1:0|\n", + "22999.0", + ("FAILED", "1"), + id="Failed", + ), + pytest.param( + textwrap.dedent("""\ + 22999.10|COMPLETED|0:0| + 22999.1|FAILED|1:0| + """), + "22999.1", + ("FAILED", "1"), + id="Failed with extra", + ), + ), +) +def test_parse_sacct_status(output, job_id, status): + """test retrieval of status and exitcode + PrologFlags=Alloc,Contain + """ + parsed_status = slurm_parser.parse_sacct(output, job_id) + assert status == parsed_status + + +# -- utils --------------------------------------------------------- + + +@pytest.mark.parametrize( + "step_id, expected", + [ + pytest.param("12345", False), + pytest.param("12345+1.0", True), + ], +) +def test_is_substep(step_id, expected): + assert slurm_parser.is_substep(step_id) == expected + + +@pytest.mark.parametrize( + "step_id, expected", + [ + pytest.param("12345", "12345"), + pytest.param("12345+1.0", "12345"), + ], +) +def test_get_step_id_from_substep(step_id, expected): + assert slurm_parser.get_step_id_from_substep_id(step_id) == expected