Added more informative messages

kessler-frost committed Jan 26, 2024
1 parent f8f6ffc commit 8e78962

Showing 7 changed files with 177 additions and 7 deletions.
26 changes: 19 additions & 7 deletions covalent_slurm_plugin/job_script.py
@@ -20,7 +20,7 @@
from typing import Dict, List, Optional

SLURM_JOB_SCRIPT_TEMPLATE = """\
-#!/bin/bash -i
+#!/bin/bash
{sbatch_directives}
{shell_env_setup}
@@ -29,13 +29,16 @@
if [ $? -ne 0 ] ; then
>&2 echo "Failed to activate conda env '$__env_name' on compute node."
>&2 echo "In case you have the conda env installed, please make sure your .bashrc file doesn't ignore non-interactive shells."
exit 99
fi
remote_py_version=$(python -c "print('.'.join(map(str, __import__('sys').version_info[:2])))")
-if [[ $remote_py_version != "{python_version}" ]] ; then
+if [[ $remote_py_version != "{python_version}" && {ignore_versions} != 1 ]] ; then
>&2 echo "Python version mismatch."
>&2 echo "Environment '$__env_name' (python=$remote_py_version) does not match task (python={python_version})."
+>&2 echo "The task might still be runnable, but if it fails the error may be less informative."
+>&2 echo "You can skip this check by passing 'ignore_versions=True' to the SlurmExecutor constructor."
exit 199
fi
@@ -44,9 +47,11 @@
>&2 echo "Covalent may not be installed in the compute environment."
>&2 echo "Please install covalent=={covalent_version} in the '$__env_name' conda env."
exit 299
-elif [[ $covalent_version != "{covalent_version}" ]] ; then
+elif [[ $covalent_version != "{covalent_version}" && {ignore_versions} != 1 ]] ; then
>&2 echo "Covalent version mismatch."
>&2 echo "Environment '$__env_name' (covalent==$covalent_version) does not match task (covalent=={covalent_version})."
+>&2 echo "The task might still be runnable, but if it fails the error may be less informative."
+>&2 echo "You can skip this check by passing 'ignore_versions=True' to the SlurmExecutor constructor."
exit 299
fi
@@ -55,9 +60,11 @@
>&2 echo "Cloudpickle may not be installed in the compute environment."
>&2 echo "Please install cloudpickle=={cloudpickle_version} in the '$__env_name' conda env."
exit 399
-elif [[ $cloudpickle_version != "{cloudpickle_version}" ]] ; then
+elif [[ $cloudpickle_version != "{cloudpickle_version}" && {ignore_versions} != 1 ]] ; then
>&2 echo "Cloudpickle version mismatch."
>&2 echo "Environment '$__env_name' (cloudpickle==$cloudpickle_version) does not match task (cloudpickle=={cloudpickle_version})."
+>&2 echo "The task might still be runnable, but if it fails the error may be less informative."
+>&2 echo "You can skip this check by passing 'ignore_versions=True' to the SlurmExecutor constructor."
exit 399
fi
@@ -81,6 +88,7 @@ def __init__(
srun_append: Optional[str] = "",
postrun_commands: Optional[List[str]] = None,
use_srun: bool = True,
ignore_versions: bool = False,
):
"""Create a job script formatter.
@@ -98,6 +106,9 @@ def __init__(
self._postrun_commands = postrun_commands or []
self._use_srun = use_srun

# Convert the flag to an int (0 or 1) for easier comparison in bash
self._ignore_versions = int(ignore_versions)

@property
def sbatch_directives(self) -> str:
"""Get the sbatch directives."""
@@ -116,9 +127,9 @@ def shell_env_setup(self) -> str:
setup_lines = [
f"source {self._bashrc_path}" if self._bashrc_path else "",
]
-for key, value in self._variables.items():
-    setup_lines.append(f'export {key}="{value}"')
-
+setup_lines.extend(
+    f'export {key}="{value}"' for key, value in self._variables.items()
+)
return "\n".join(setup_lines)

@property
@@ -214,6 +225,7 @@ def format(
"sbatch_directives": self.sbatch_directives,
"shell_env_setup": self.shell_env_setup,
"conda_env_setup": self.conda_env_setup,
"ignore_versions": self._ignore_versions,
"covalent_version": self.covalent_version,
"cloudpickle_version": self.cloudpickle_version,
"python_version": python_version,
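Why the int conversion above matters: Python's `True`/`False` would render into the template as the strings `True`/`False`, whereas the bash tests compare against the literal `1`. Here is a self-contained sketch of the rendering; the template fragment is trimmed from `SLURM_JOB_SCRIPT_TEMPLATE`, and `render_check` is illustrative only, not part of the plugin's API:

```python
# Sketch: how the boolean ignore_versions flag reaches the bash template.
TEMPLATE = """\
if [[ $remote_py_version != "{python_version}" && {ignore_versions} != 1 ]] ; then
  >&2 echo "Python version mismatch."
  exit 199
fi
"""

def render_check(python_version: str, ignore_versions: bool) -> str:
    # bool -> int, as in the formatter's __init__, so bash sees a literal 0 or 1
    return TEMPLATE.format(
        python_version=python_version,
        ignore_versions=int(ignore_versions),
    )

print(render_check("3.10", ignore_versions=True))
# With ignore_versions=True the condition renders as `... && 1 != 1`,
# which can never be true, so the mismatch branch is skipped and the
# task is attempted regardless of the remote Python version.
```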
5 changes: 5 additions & 0 deletions covalent_slurm_plugin/slurm.py
@@ -64,6 +64,7 @@ class ExecutorPluginDefaults(BaseModel):
cache_dir: Optional[str] = str(
Path.home() / ".config/covalent/executor_plugins/covalent-slurm-cache"
)
ignore_versions: bool = False


_EXECUTOR_PLUGIN_DEFAULTS = ExecutorPluginDefaults().model_dump()
@@ -110,6 +111,7 @@ class SlurmExecutor(AsyncBaseExecutor):
log_stderr: The path to the file to be used for redirecting stderr.
time_limit: time limit for the task
retries: Number of times to retry execution upon failure
ignore_versions: Whether to ignore the Python, Covalent, and Cloudpickle version mismatch on the remote machine and try running the task anyway. Default is False.
"""

def __init__(
@@ -138,6 +140,7 @@ def __init__(
log_stderr: str = "",
time_limit: int = -1,
retries: int = 0,
ignore_versions: Optional[bool] = None,
):
super().__init__(
log_stdout=log_stdout, log_stderr=log_stderr, time_limit=time_limit, retries=retries
@@ -156,6 +159,7 @@ def __init__(
self.slurm_path = slurm_path or get_config("executors.slurm.slurm_path")
self.poll_freq = poll_freq or get_config("executors.slurm.poll_freq")
self.cache_dir = Path(cache_dir or get_config("executors.slurm.cache_dir"))
self.ignore_versions = (
    ignore_versions if ignore_versions is not None else get_config("executors.slurm.ignore_versions")
)

# Resolve ssh_key_file and cert_file to absolute paths.
self.ssh_key_file = Path(self.ssh_key_file).expanduser().resolve()
@@ -306,6 +310,7 @@ def _format_submit_script(
srun_append=self.srun_append,
postrun_commands=self.postrun_commands,
use_srun=self.use_srun,
ignore_versions=self.ignore_versions,
)

return job_script.format(
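Note how the constructor default is `None` rather than `False`: this lets the executor distinguish "not set" (fall back to the config value) from an explicit choice. A short sketch of both paths; the connection parameters are placeholders matching the docker test setup below:

```python
from covalent_slurm_plugin import SlurmExecutor

# Explicit: skip the remote version checks for this executor instance.
lenient = SlurmExecutor(
    username="slurmuser",
    address="localhost",
    ssh_key_file="./slurm_test",
    ignore_versions=True,
)

# Implicit: leave ignore_versions unset (None); the executor falls back to
# get_config("executors.slurm.ignore_versions"), whose plugin default is False.
strict = SlurmExecutor(
    username="slurmuser",
    address="localhost",
    ssh_key_file="./slurm_test",
)
```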
2 changes: 2 additions & 0 deletions tests/docker_tests/.gitignore
@@ -0,0 +1,2 @@
# Ignore the public key for ssh
*.pub
32 changes: 32 additions & 0 deletions tests/docker_tests/Dockerfile
@@ -0,0 +1,32 @@
FROM turuncu/slurm:latest

RUN apt update && apt install openssh-server vim less wget sudo -y

# Create a user "slurmuser" and group "slurmgroup"
RUN groupadd slurmgroup && useradd -ms /bin/bash -g slurmgroup slurmuser
RUN echo 'slurmuser:root123' | chpasswd

# Create slurmuser directory in home
RUN mkdir -p /home/slurmuser/.ssh

# Copy the ssh public key into the authorized_keys file. slurm_test.pub is the public key generated with ssh-keygen (see the README); by default ssh-keygen puts keys under ~/.ssh.
COPY slurm_test.pub /home/slurmuser/.ssh/authorized_keys

# Copy the test.job file to the home directory
COPY test.job /home/slurmuser/test.job

# Copy covalent install file to the home directory
COPY covalent_install.sh /home/slurmuser/covalent_install.sh

# Run the covalent install file
RUN chmod +x /home/slurmuser/covalent_install.sh && /home/slurmuser/covalent_install.sh

# Change ownership and permissions of the authorized_keys file.
RUN chown slurmuser:slurmgroup /home/slurmuser/.ssh/authorized_keys && chmod 600 /home/slurmuser/.ssh/authorized_keys

# Start SSH service
RUN service ssh start

# Expose docker port 22
EXPOSE 22
CMD ["/usr/sbin/sshd","-D"]
83 changes: 83 additions & 0 deletions tests/docker_tests/README.md
@@ -0,0 +1,83 @@
# Testing with a Slurm docker container

## Prerequisites

Ensure you have Docker installed and running on your system. You can check this by running `docker ps`, which should return a (possibly empty) list of running containers rather than an error.

We will be using the Slurm docker image from [turuncu/slurm](https://hub.docker.com/r/turuncu/slurm). You can pull it by running:

```bash
docker pull turuncu/slurm
```

Also make sure your current directory is `tests/docker_tests` for all the commands mentioned below.

## Building the image

### Generating a keypair

We need to generate a keypair to allow the executor to ssh into the container. To do this, run:

```bash
ssh-keygen -t ed25519 -f slurm_test -N ''
```

This will generate a private key `slurm_test` and a public key `slurm_test.pub` in the current directory. The executor will use this keypair to ssh into the container.

The key's name is important, as the `Dockerfile` copies `slurm_test.pub` into the container by that name.

### Running docker build

The image needs some additional setup so that the executor can ssh into the container. Make sure you are in the right directory (`tests/docker_tests`) and run:

```bash
docker build -t slurm-image .
```

This will build the image and tag it as `slurm-image`.

## Running the container

To run the container, run:

```bash
docker run -d -p 22:22 --name slurm-container slurm-image
```

This runs the container in the background under the name `slurm-container`, mapping the container's port 22 to port 22 on the host machine so that we can ssh into it.

### Changing the permissions of the slurm config

We need to change the permissions of the Slurm config file so that `slurmuser` can read it. To do this, run:

```bash
docker exec slurm-container chmod +r /etc/slurm/slurm.conf
```

## (Optional) Try running a basic slurm job

To test that the container is working, we can try running a basic slurm job. To do this, ssh into the container by running:

```bash
ssh -i slurm_test slurmuser@localhost
```

Then inside the container, run:

```bash
sbatch test.job
```

This will submit the test job to the Slurm scheduler and create two new files in the current directory (which should be `/home/slurmuser`): `test_<job-id>.out` and `test_<job-id>.err`. The `.out` file should contain the job's stdout ("Hello World"), and the `.err` file should contain the path of the Python interpreter, since `test.job` redirects the output of `which python` to stderr.

## Running the tests

Now that everything is set up, take your favourite workflow and assign the executor via `@ct.electron(executor=slurm_executor)` to any of its electrons, where `slurm_executor` is defined as:

```python
from covalent_slurm_plugin import SlurmExecutor

slurm_executor = SlurmExecutor(username="slurmuser", address="localhost", ssh_key_file="./slurm_test", conda_env="covalent", ignore_versions=True)
```

You can set `ignore_versions` to `False` (the default) if you want to make sure the Slurm job uses the same versions of Python, Covalent, and Cloudpickle as your local machine.
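For a concrete end-to-end check, a minimal workflow might look like the sketch below. It assumes a Covalent server is running locally (`covalent start`); the electron body is arbitrary:

```python
import covalent as ct
from covalent_slurm_plugin import SlurmExecutor

slurm_executor = SlurmExecutor(
    username="slurmuser",
    address="localhost",
    ssh_key_file="./slurm_test",
    conda_env="covalent",
    ignore_versions=True,
)

# Any electron assigned this executor runs inside the container via sbatch.
@ct.electron(executor=slurm_executor)
def add(x, y):
    return x + y

@ct.lattice
def workflow(x, y):
    return add(x, y)

# Dispatch the workflow and block until the Slurm job finishes.
dispatch_id = ct.dispatch(workflow)(1, 2)
result = ct.get_result(dispatch_id, wait=True)
print(result.result)  # 3
```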
23 changes: 23 additions & 0 deletions tests/docker_tests/covalent_install.sh
@@ -0,0 +1,23 @@
#!/bin/bash

set -eu -o pipefail
export HOME=/home/slurmuser

# Strip the interactivity guard from .bashrc so conda activation also works in non-interactive shells
sed -i '/^case \$-.*/,+3d' /home/slurmuser/.bashrc
cd $HOME

# Download and install Miniconda into the user's home directory
MINICONDA_EXE="Miniconda3-py38_23.3.1-0-Linux-x86_64.sh"
wget https://repo.anaconda.com/miniconda/$MINICONDA_EXE
chmod +x $MINICONDA_EXE
./$MINICONDA_EXE -b -p $HOME/miniconda3
rm $MINICONDA_EXE

# Put conda on the PATH and initialize it for bash
export PATH=$HOME/miniconda3/bin:$PATH
eval "$(conda shell.bash hook)"
conda init bash

# Create the 'covalent' env and activate it by default in login shells
conda create -n covalent python=3.10 -y
echo "conda activate covalent" >> $HOME/.bashrc

# Hand ownership to slurmuser and install covalent into the env
chown -R slurmuser:slurmgroup $HOME/{.cache,.conda,miniconda3}
conda run -n covalent python -m pip install covalent
13 changes: 13 additions & 0 deletions tests/docker_tests/test.job
@@ -0,0 +1,13 @@
#!/bin/bash
#
#SBATCH --job-name=test
#SBATCH --nodes=1
#SBATCH --ntasks=1
##SBATCH --mem=1G
##SBATCH --partition=debug
#SBATCH --time=00:10:00
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err

echo "Hello World"
echo "$(which python)" 1>&2
