Added more informative messages

kessler-frost committed Jan 26, 2024
1 parent f8f6ffc commit 8e78962

Showing 7 changed files with 177 additions and 7 deletions.
26 changes: 19 additions & 7 deletions covalent_slurm_plugin/job_script.py
@@ -20,7 +20,7 @@
from typing import Dict, List, Optional

SLURM_JOB_SCRIPT_TEMPLATE = """\
-#!/bin/bash -i
+#!/bin/bash
{sbatch_directives}
{shell_env_setup}
@@ -29,13 +29,16 @@
if [ $? -ne 0 ] ; then
>&2 echo "Failed to activate conda env '$__env_name' on compute node."
>&2 echo "In case you have the conda env installed, please make sure your .bashrc file doesn't ignore non-interactive shells."
exit 99
fi
remote_py_version=$(python -c "print('.'.join(map(str, __import__('sys').version_info[:2])))")
-if [[ $remote_py_version != "{python_version}" ]] ; then
+if [[ $remote_py_version != "{python_version}" && {ignore_versions} != 1 ]] ; then
>&2 echo "Python version mismatch."
>&2 echo "Environment '$__env_name' (python=$remote_py_version) does not match task (python={python_version})."
+>&2 echo "The task might still be runnable, but if it fails the error may be less informative."
+>&2 echo "You can skip this check by passing 'ignore_versions=True' to the SlurmExecutor constructor."
exit 199
fi
@@ -44,9 +47,11 @@
>&2 echo "Covalent may not be installed in the compute environment."
>&2 echo "Please install covalent=={covalent_version} in the '$__env_name' conda env."
exit 299
-elif [[ $covalent_version != "{covalent_version}" ]] ; then
+elif [[ $covalent_version != "{covalent_version}" && {ignore_versions} != 1 ]] ; then
>&2 echo "Covalent version mismatch."
>&2 echo "Environment '$__env_name' (covalent==$covalent_version) does not match task (covalent=={covalent_version})."
+>&2 echo "The task might still be runnable, but if it fails the error may be less informative."
+>&2 echo "You can skip this check by passing 'ignore_versions=True' to the SlurmExecutor constructor."
exit 299
fi
@@ -55,9 +60,11 @@
>&2 echo "Cloudpickle may not be installed in the compute environment."
>&2 echo "Please install cloudpickle=={cloudpickle_version} in the '$__env_name' conda env."
exit 399
-elif [[ $cloudpickle_version != "{cloudpickle_version}" ]] ; then
+elif [[ $cloudpickle_version != "{cloudpickle_version}" && {ignore_versions} != 1 ]] ; then
>&2 echo "Cloudpickle version mismatch."
>&2 echo "Environment '$__env_name' (cloudpickle==$cloudpickle_version) does not match task (cloudpickle=={cloudpickle_version})."
+>&2 echo "The task might still be runnable, but if it fails the error may be less informative."
+>&2 echo "You can skip this check by passing 'ignore_versions=True' to the SlurmExecutor constructor."
exit 399
fi
@@ -81,6 +88,7 @@ def __init__(
srun_append: Optional[str] = "",
postrun_commands: Optional[List[str]] = None,
use_srun: bool = True,
ignore_versions: bool = False,
):
"""Create a job script formatter.
@@ -98,6 +106,9 @@ def __init__(
self._postrun_commands = postrun_commands or []
self._use_srun = use_srun

# Convert the flag to an int (0 or 1) for easier comparison in bash
self._ignore_versions = int(ignore_versions)

@property
def sbatch_directives(self) -> str:
"""Get the sbatch directives."""
@@ -116,9 +127,9 @@ def shell_env_setup(self) -> str:
setup_lines = [
f"source {self._bashrc_path}" if self._bashrc_path else "",
]
-for key, value in self._variables.items():
-    setup_lines.append(f'export {key}="{value}"')
-
+setup_lines.extend(
+    f'export {key}="{value}"' for key, value in self._variables.items()
+)
return "\n".join(setup_lines)

@property
@@ -214,6 +225,7 @@ def format(
"sbatch_directives": self.sbatch_directives,
"shell_env_setup": self.shell_env_setup,
"conda_env_setup": self.conda_env_setup,
"ignore_versions": self._ignore_versions,
"covalent_version": self.covalent_version,
"cloudpickle_version": self.cloudpickle_version,
"python_version": python_version,
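Why the int conversion above matters: Python's `True`/`False` would render into the template as the strings `True`/`False`, whereas the bash tests compare against the literal `1`. Here is a self-contained sketch of the rendering; the template fragment is trimmed from `SLURM_JOB_SCRIPT_TEMPLATE`, and `render_check` is illustrative only, not part of the plugin's API:

```python
# Sketch: how the boolean ignore_versions flag reaches the bash template.
TEMPLATE = """\
if [[ $remote_py_version != "{python_version}" && {ignore_versions} != 1 ]] ; then
  >&2 echo "Python version mismatch."
  exit 199
fi
"""

def render_check(python_version: str, ignore_versions: bool) -> str:
    # bool -> int, as in the formatter's __init__, so bash sees a literal 0 or 1
    return TEMPLATE.format(
        python_version=python_version,
        ignore_versions=int(ignore_versions),
    )

print(render_check("3.10", ignore_versions=True))
# With ignore_versions=True the condition renders as `... && 1 != 1`,
# which can never be true, so the mismatch branch is skipped and the
# task is attempted regardless of the remote Python version.
```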
5 changes: 5 additions & 0 deletions covalent_slurm_plugin/slurm.py
@@ -64,6 +64,7 @@ class ExecutorPluginDefaults(BaseModel):
cache_dir: Optional[str] = str(
Path.home() / ".config/covalent/executor_plugins/covalent-slurm-cache"
)
ignore_versions: bool = False


_EXECUTOR_PLUGIN_DEFAULTS = ExecutorPluginDefaults().model_dump()
@@ -110,6 +111,7 @@ class SlurmExecutor(AsyncBaseExecutor):
log_stderr: The path to the file to be used for redirecting stderr.
time_limit: time limit for the task
retries: Number of times to retry execution upon failure
ignore_versions: Whether to ignore the Python, Covalent, and Cloudpickle version mismatch on the remote machine and try running the task anyway. Default is False.
"""

def __init__(
@@ -138,6 +140,7 @@ def __init__(
log_stderr: str = "",
time_limit: int = -1,
retries: int = 0,
ignore_versions: Optional[bool] = None,
):
super().__init__(
log_stdout=log_stdout, log_stderr=log_stderr, time_limit=time_limit, retries=retries
@@ -156,6 +159,7 @@ def __init__(
self.slurm_path = slurm_path or get_config("executors.slurm.slurm_path")
self.poll_freq = poll_freq or get_config("executors.slurm.poll_freq")
self.cache_dir = Path(cache_dir or get_config("executors.slurm.cache_dir"))
self.ignore_versions = (
    ignore_versions if ignore_versions is not None else get_config("executors.slurm.ignore_versions")
)

# Resolve ssh_key_file and cert_file to absolute paths.
self.ssh_key_file = Path(self.ssh_key_file).expanduser().resolve()
@@ -306,6 +310,7 @@ def _format_submit_script(
srun_append=self.srun_append,
postrun_commands=self.postrun_commands,
use_srun=self.use_srun,
ignore_versions=self.ignore_versions,
)

return job_script.format(
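Note how the constructor default is `None` rather than `False`: this lets the executor distinguish "not set" (fall back to the config value) from an explicit choice. A short sketch of both paths; the connection parameters are placeholders matching the docker test setup below:

```python
from covalent_slurm_plugin import SlurmExecutor

# Explicit: skip the remote version checks for this executor instance.
lenient = SlurmExecutor(
    username="slurmuser",
    address="localhost",
    ssh_key_file="./slurm_test",
    ignore_versions=True,
)

# Implicit: leave ignore_versions unset (None); the executor falls back to
# get_config("executors.slurm.ignore_versions"), whose plugin default is False.
strict = SlurmExecutor(
    username="slurmuser",
    address="localhost",
    ssh_key_file="./slurm_test",
)
```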
2 changes: 2 additions & 0 deletions tests/docker_tests/.gitignore
@@ -0,0 +1,2 @@
# Ignore the public key for ssh
*.pub
32 changes: 32 additions & 0 deletions tests/docker_tests/Dockerfile
@@ -0,0 +1,32 @@
FROM turuncu/slurm:latest

RUN apt update && apt install openssh-server vim less wget sudo -y

# Create a user "slurmuser" and group "slurmgroup"
RUN groupadd slurmgroup && useradd -ms /bin/bash -g slurmgroup slurmuser
RUN echo 'slurmuser:root123' | chpasswd

# Create slurmuser directory in home
RUN mkdir -p /home/slurmuser/.ssh

# Copy the ssh public key into the authorized_keys file. slurm_test.pub is the public key generated with ssh-keygen (see the README); by default ssh-keygen puts keys under ~/.ssh.
COPY slurm_test.pub /home/slurmuser/.ssh/authorized_keys

# Copy the test.job file to the home directory
COPY test.job /home/slurmuser/test.job

# Copy covalent install file to the home directory
COPY covalent_install.sh /home/slurmuser/covalent_install.sh

# Run the covalent install file
RUN chmod +x /home/slurmuser/covalent_install.sh && /home/slurmuser/covalent_install.sh

# Change ownership and permissions of the authorized_keys file.
RUN chown slurmuser:slurmgroup /home/slurmuser/.ssh/authorized_keys && chmod 600 /home/slurmuser/.ssh/authorized_keys

# Start SSH service
RUN service ssh start

# Expose docker port 22
EXPOSE 22
CMD ["/usr/sbin/sshd","-D"]
83 changes: 83 additions & 0 deletions tests/docker_tests/README.md
@@ -0,0 +1,83 @@
# Testing with a Slurm docker container

## Prerequisites

Ensure you have Docker installed and running on your system. You can check this by running `docker ps`, which should return a (possibly empty) list of running containers rather than an error.

We will be using the Slurm docker image from [turuncu/slurm](https://hub.docker.com/r/turuncu/slurm). You can pull it by running:

```bash
docker pull turuncu/slurm
```

Also make sure your current directory is `tests/docker_tests` for all the commands mentioned below.

## Building the image

### Generating a keypair

We need to generate a keypair to allow the executor to ssh into the container. To do this, run:

```bash
ssh-keygen -t ed25519 -f slurm_test -N ''
```

This will generate a private key `slurm_test` and a public key `slurm_test.pub` in the current directory. The executor will use this keypair to ssh into the container.

The key's name is important, as the `Dockerfile` copies `slurm_test.pub` into the container by that name.

### Running docker build

The image needs some additional setup so that the executor can ssh into the container. Make sure you are in the right directory (`tests/docker_tests`) and run:

```bash
docker build -t slurm-image .
```

This will build the image and tag it as `slurm-image`.

## Running the container

To run the container, run:

```bash
docker run -d -p 22:22 --name slurm-container slurm-image
```

This runs the container in the background under the name `slurm-container`, mapping the container's port 22 to port 22 on the host machine so that we can ssh into it.

### Changing the permissions of the slurm config

We need to change the permissions of the Slurm config file so that `slurmuser` can read it. To do this, run:

```bash
docker exec slurm-container chmod +r /etc/slurm/slurm.conf
```

## (Optional) Try running a basic slurm job

To test that the container is working, we can try running a basic slurm job. To do this, ssh into the container by running:

```bash
ssh -i slurm_test slurmuser@localhost
```

Then inside the container, run:

```bash
sbatch test.job
```

This will submit the test job to the Slurm scheduler and create two new files in the current directory (which should be `/home/slurmuser`): `test_<job-id>.out` and `test_<job-id>.err`. The `.out` file should contain the job's stdout ("Hello World"), and the `.err` file should contain the path of the Python interpreter, since `test.job` redirects the output of `which python` to stderr.

## Running the tests

Now that everything is set up, take your favourite workflow and assign the executor via `@ct.electron(executor=slurm_executor)` to any of its electrons, where `slurm_executor` is defined as:

```python
from covalent_slurm_plugin import SlurmExecutor

slurm_executor = SlurmExecutor(username="slurmuser", address="localhost", ssh_key_file="./slurm_test", conda_env="covalent", ignore_versions=True)
```

You can set `ignore_versions` to `False` (the default) if you want to make sure the Slurm job uses the same versions of Python, Covalent, and Cloudpickle as your local machine.
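For a concrete end-to-end check, a minimal workflow might look like the sketch below. It assumes a Covalent server is running locally (`covalent start`); the electron body is arbitrary:

```python
import covalent as ct
from covalent_slurm_plugin import SlurmExecutor

slurm_executor = SlurmExecutor(
    username="slurmuser",
    address="localhost",
    ssh_key_file="./slurm_test",
    conda_env="covalent",
    ignore_versions=True,
)

# Any electron assigned this executor runs inside the container via sbatch.
@ct.electron(executor=slurm_executor)
def add(x, y):
    return x + y

@ct.lattice
def workflow(x, y):
    return add(x, y)

# Dispatch the workflow and block until the Slurm job finishes.
dispatch_id = ct.dispatch(workflow)(1, 2)
result = ct.get_result(dispatch_id, wait=True)
print(result.result)  # 3
```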
23 changes: 23 additions & 0 deletions tests/docker_tests/covalent_install.sh
@@ -0,0 +1,23 @@
#!/bin/bash

set -eu -o pipefail
export HOME=/home/slurmuser

# Strip the interactivity guard from .bashrc so conda activation also works in non-interactive shells
sed -i '/^case \$-.*/,+3d' /home/slurmuser/.bashrc
cd $HOME

# Download and install Miniconda into the user's home directory
MINICONDA_EXE="Miniconda3-py38_23.3.1-0-Linux-x86_64.sh"
wget https://repo.anaconda.com/miniconda/$MINICONDA_EXE
chmod +x $MINICONDA_EXE
./$MINICONDA_EXE -b -p $HOME/miniconda3
rm $MINICONDA_EXE

# Put conda on the PATH and initialize it for bash
export PATH=$HOME/miniconda3/bin:$PATH
eval "$(conda shell.bash hook)"
conda init bash

# Create the 'covalent' env and activate it by default in login shells
conda create -n covalent python=3.10 -y
echo "conda activate covalent" >> $HOME/.bashrc

# Hand ownership to slurmuser and install covalent into the env
chown -R slurmuser:slurmgroup $HOME/{.cache,.conda,miniconda3}
conda run -n covalent python -m pip install covalent
13 changes: 13 additions & 0 deletions tests/docker_tests/test.job
@@ -0,0 +1,13 @@
#!/bin/bash
#
#SBATCH --job-name=test
#SBATCH --nodes=1
#SBATCH --ntasks=1
##SBATCH --mem=1G
##SBATCH --partition=debug
#SBATCH --time=00:10:00
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err

echo "Hello World"
echo "$(which python)" 1>&2
