From fe2bbb7c077f3ef14665899f6045805be30cc5b9 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Wed, 20 Nov 2024 15:08:02 +0100 Subject: [PATCH] Try to skip BlockingIOError --- executorlib/standalone/hdf.py | 61 ++++--- notebooks/2-hpc-submission.ipynb | 278 +------------------------------ 2 files changed, 36 insertions(+), 303 deletions(-) diff --git a/executorlib/standalone/hdf.py b/executorlib/standalone/hdf.py index 06048f6b..0e7f957e 100644 --- a/executorlib/standalone/hdf.py +++ b/executorlib/standalone/hdf.py @@ -20,7 +20,7 @@ def dump(file_name: str, data_dict: dict) -> None: "output": "output", "queue_id": "queue_id", } - with h5py.File(file_name, "a") as fname: + with h5py.File(name=file_name, mode="a") as fname: for data_key, data_value in data_dict.items(): if data_key in group_dict.keys(): fname.create_dataset( @@ -39,21 +39,24 @@ def load(file_name: str) -> dict: Returns: dict: dictionary containing the python function to be executed {"fn": ..., "args": (), "kwargs": {}} """ - with h5py.File(file_name, "r") as hdf: - data_dict = {} - if "function" in hdf: - data_dict["fn"] = cloudpickle.loads(np.void(hdf["/function"])) - else: - raise TypeError("Function not found in HDF5 file.") - if "input_args" in hdf: - data_dict["args"] = cloudpickle.loads(np.void(hdf["/input_args"])) - else: - data_dict["args"] = () - if "input_kwargs" in hdf: - data_dict["kwargs"] = cloudpickle.loads(np.void(hdf["/input_kwargs"])) - else: - data_dict["kwargs"] = {} - return data_dict + try: + with h5py.File(name=file_name, mode="r") as hdf: + data_dict = {} + if "function" in hdf: + data_dict["fn"] = cloudpickle.loads(np.void(hdf["/function"])) + else: + raise TypeError("Function not found in HDF5 file.") + if "input_args" in hdf: + data_dict["args"] = cloudpickle.loads(np.void(hdf["/input_args"])) + else: + data_dict["args"] = () + if "input_kwargs" in hdf: + data_dict["kwargs"] = cloudpickle.loads(np.void(hdf["/input_kwargs"])) + else: + data_dict["kwargs"] = {} + return data_dict + except BlockingIOError: + return load(file_name=file_name) def get_output(file_name: str) -> Tuple[bool, object]: @@ -66,16 +69,22 @@ def get_output(file_name: str) -> Tuple[bool, object]: Returns: Tuple[bool, object]: boolean flag indicating if output is available and the output object itself """ - with h5py.File(file_name, "r") as hdf: - if "output" in hdf: - return True, cloudpickle.loads(np.void(hdf["/output"])) - else: - return False, None + try: + with h5py.File(name=file_name, mode="r") as hdf: + if "output" in hdf: + return True, cloudpickle.loads(np.void(hdf["/output"])) + else: + return False, None + except BlockingIOError: + return get_output(file_name=file_name) def get_queue_id(file_name: str) -> Optional[int]: - with h5py.File(file_name, "r") as hdf: - if "queue_id" in hdf: - return cloudpickle.loads(np.void(hdf["/queue_id"])) - else: - return None + try: + with h5py.File(name=file_name, mode="r") as hdf: + if "queue_id" in hdf: + return cloudpickle.loads(np.void(hdf["/queue_id"])) + else: + return None + except BlockingIOError: + return get_queue_id(file_name=file_name) diff --git a/notebooks/2-hpc-submission.ipynb b/notebooks/2-hpc-submission.ipynb index f0998333..e54467b5 100644 --- a/notebooks/2-hpc-submission.ipynb +++ b/notebooks/2-hpc-submission.ipynb @@ -1,277 +1 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "ddf66f38-dc4a-4306-8b1c-b923fdb76922", - "metadata": {}, - "source": [ - "# HPC Submission Mode\n", - "In contrast to the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html) and the [HPC allocation mode](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html) the HPC Submission Mode does not communicate via the [zero message queue](https://zeromq.org) but instead stores the python functions on the file system and uses the job scheduler to handle the dependencies of the Python functions. Consequently, the block allocation `block_allocation` and the init function `init_function` are not available in HPC Submission mode. At the same time it is possible to close the Python process which created the `Executor`, wait until the execution of the submitted Python functions is completed and afterwards reload the results from the cache. \n", - "\n", - "Internally the HPC submission mode is using the [Python simple queuing system adatper (pysqa)](https://pysqa.readthedocs.io) to connect to HPC job schedulers and the [h5py](https://www.h5py.org) package for serializing the Python functions to store them on the file system. Both packages are optional dependency of executorlib. The installation of the [pysqa](https://pysqa.readthedocs.io) package and the [h5py](https://www.h5py.org) package are covered in the installation section. " - ] - }, - { - "cell_type": "markdown", - "id": "d56862a6-8279-421d-a090-7ca2a3c4d416", - "metadata": {}, - "source": [ - "## SLURM\n", - "The [Simple Linux Utility for Resource Management (SLURM)](https://slurm.schedmd.com) job scheduler is currently the most commonly used job scheduler for HPC clusters. In the HPC submission mode executorlib internally uses the [sbatch](https://slurm.schedmd.com/sbatch.html) command this is in contrast to the [HPC allocatiom mode] which internally uses the [srun](https://slurm.schedmd.com/srun.html) command. \n", - "\n", - "The connection to the job scheduler is based on the [Python simple queuing system adatper (pysqa)](https://pysqa.readthedocs.io). It provides a default configuration for most commonly used job schedulers including SLURM, in addition it is also possible to provide the submission template as part of the resource dictionary `resource_dict` or via the path to the configuration directory with the `pysqa_config_directory` parameter. All three options are covered in more detail on the [pysqa documentation](https://pysqa.readthedocs.io)." - ] - }, - { - "cell_type": "markdown", - "id": "db7760e8-35a6-4a1c-8b0f-410b536c3835", - "metadata": {}, - "source": [ - "```python\n", - "from executorlib import Executor\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "b20913f3-59e4-418c-a399-866124f8e497", - "metadata": {}, - "source": [ - "In comparison to the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html), the only two parameters which are changed are the specification of the backend as `backend=\"slurm_submission\"` and the requirement to specify the cache directory using the `cache_directory=\"./cache\"`. The rest of the syntax remains exactly the same, to simplify the up-scaling of simulation workflows. " - ] - }, - { - "cell_type": "markdown", - "id": "0b8f3b77-6199-4736-9f28-3058c5230777", - "metadata": {}, - "source": [ - "```python\n", - "with Executor(backend=\"slurm_submission\", cache_directory=\"./cache\") as exe:\n", - " future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]\n", - " print([f.result() for f in future_lst])\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "37bef7ac-ce3e-4d8a-b848-b1474c370bca", - "metadata": {}, - "source": [ - "Specific parameters for HPC submission mode like the maximum run time `\"run_time_max\"`, the maximum memory `\"memory_max\"` or the submission template for the job submission script `\"submission_template\"` can be specified as part of the resource dictionary. Again it is possible to specify the resource dictonary `resource_dicionary` either for each function in the `submit()` function or during the initialization of the `Executor`. " - ] - }, - { - "cell_type": "markdown", - "id": "658781de-f222-4235-8c26-b0f77a0831b3", - "metadata": {}, - "source": [ - "```python\n", - "submission_template = \"\"\"\\\n", - "#!/bin/bash\n", - "#SBATCH --output=time.out\n", - "#SBATCH --job-name={{job_name}}\n", - "#SBATCH --chdir={{working_directory}}\n", - "#SBATCH --get-user-env=L\n", - "#SBATCH --partition={{partition}}\n", - "{%- if run_time_max %}\n", - "#SBATCH --time={{ [1, run_time_max // 60]|max }}\n", - "{%- endif %}\n", - "{%- if dependency %}\n", - "#SBATCH --dependency=afterok:{{ dependency | join(',') }}\n", - "{%- endif %}\n", - "{%- if memory_max %}\n", - "#SBATCH --mem={{memory_max}}G\n", - "{%- endif %}\n", - "#SBATCH --cpus-per-task={{cores}}\n", - "\n", - "{{command}}\n", - "\"\"\"\n", - "\n", - "with Executor(backend=\"slurm_submission\", cache_directory=\"./cache\") as exe:\n", - " future = exe.submit(\n", - " sum, [4, 4], \n", - " resource_dict={\n", - " \"submission_template\": submission_template, \n", - " \"run_time_max\": 180, # in seconds \n", - " })\n", - " print([f.result() for f in future_lst])\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "f7ad9c97-7743-4f87-9344-4299b2b31a56", - "metadata": {}, - "source": [ - "With these options executorlib in combination with the SLURM job scheduler provides a lot flexibility to configure the submission of Python functions depending on the specific configuration of the job scheduler. " - ] - }, - { - "cell_type": "markdown", - "id": "2a814efb-2fbc-41ba-98df-cf121d19ea66", - "metadata": {}, - "source": [ - "## Flux\n", - "While most HPC job schedulers require extensive configuration before they can be tested, the [flux framework](http://flux-framework.org) can be installed with the conda package manager, as explained in the [installation section](https://executorlib.readthedocs.io/en/latest/installation.html#alternative-installations). This simple installation makes the flux framework especially suitable for demonstrations, testing and continous integration. So below a number of features for the HPC submission mode are demonstrated based on the example of the [flux framework](http://flux-framework.org) still the same applies to other job schedulers like SLURM introduced above." - ] - }, - { - "cell_type": "markdown", - "id": "29d7aa18-357e-416e-805c-1322b59abec1", - "metadata": {}, - "source": [ - "### Dependencies\n", - "As already demonstrated for the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html) the `Executor` class from executorlib is capable of resolving the dependencies of serial functions, when [concurrent futures Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects) objects are used as inputs for subsequent function calls. For the case of the HPC submission these dependencies are communicated to the job scheduler, which allows to stop the Python process which created the `Executor` class, wait until the execution of the submitted Python functions is completed and afterwards restart the Python process for the `Executor` class and reload the calculation results from the cache defined by the `cache_directory` parameter." - ] - }, - { - "cell_type": "markdown", - "id": "3d55176a-facc-4ff5-91cd-690d480bd5b8", - "metadata": {}, - "source": [ - "```python\n", - "def add_funct(a, b):\n", - " return a + b\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "77125681-9344-43c4-8904-46d48cb90104", - "metadata": {}, - "source": [ - "```python\n", - "with Executor(backend=\"flux_submission\", cache_directory=\"./cache\") as exe:\n", - " future = 0\n", - " for i in range(4, 8):\n", - " future = exe.submit(add_funct, i, future)\n", - " print(future.result())\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "ca75cb6c-c50f-4bee-9b09-d8d29d6c263b", - "metadata": {}, - "source": [ - "### Resource Assignment\n", - "In analogy to the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html) the resource assignment for the HPC submission mode is handled by either including the resource dictionary parameter `resource_dict` in the initialization of the `Executor` class or in every call of the `submit()` function. \n", - "\n", - "Below this is demonstrated once for the assignment of muliple CPU cores for the execution of a Python function which internally uses the message passing interface (MPI) via the [mpi4py](https://mpi4py.readthedocs.io) package. " - ] - }, - { - "cell_type": "markdown", - "id": "ea800f9a-6915-4b5a-bc57-2e072cc95437", - "metadata": {}, - "source": [ - "```python\n", - "def calc(i):\n", - " from mpi4py import MPI\n", - "\n", - " size = MPI.COMM_WORLD.Get_size()\n", - " rank = MPI.COMM_WORLD.Get_rank()\n", - " return i, size, rank\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "8bebb1b4-25fc-4f57-8633-a2677b712a87", - "metadata": {}, - "source": [ - "```python\n", - "with Executor(backend=\"flux_submission\", cache_directory=\"./cache\") as exe:\n", - " fs = exe.submit(calc, 3, resource_dict={\"cores\": 2})\n", - " print(fs.result())\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "d91499d7-5c6c-4c10-b7b7-bfc4b87ddaa8", - "metadata": {}, - "source": [ - "Beyond CPU cores and threads which were previously also introduced for the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html) the HPC submission mode also provides the option to select the available accelerator cards or GPUs, by specifying the `\"gpus_per_core\"` parameter in the resource dictionary `resource_dict`. For demonstration we create a Python function which reads the GPU device IDs and submit it to the `Executor` class:\n", - "```python\n", - "def get_available_gpus():\n", - " import socket\n", - " from tensorflow.python.client import device_lib\n", - " local_device_protos = device_lib.list_local_devices()\n", - " return [\n", - " (x.name, x.physical_device_desc, socket.gethostname()) \n", - " for x in local_device_protos if x.device_type == 'GPU'\n", - " ]\n", - "```\n", - "\n", - "```python\n", - "with Executor(\n", - " backend=\"flux_submission\",\n", - " cache_directory=\"./cache\",\n", - " resource_dict={\"gpus_per_core\": 1}\n", - ") as exe:\n", - " fs_1 = exe.submit(get_available_gpus)\n", - " fs_2 = exe.submit(get_available_gpus)\n", - " print(fs_1.result(), fs_2.result())\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "3f47fd34-04d1-42a7-bb06-6821dc99a648", - "metadata": {}, - "source": [ - "### Cleaning Cache\n", - "Finally, as the HPC Submission Mode leverages the file system to communicate serialized Python functions, it is important to clean up the cache directory specified by the `cache_directory` parameter once the results of the submitted Python functions are no longer needed. The serialized Python functions are stored in binary format using the [cloudpickle](https://github.com/cloudpipe/cloudpickle) library for serialization. This format is design for caching but not for long-term storage. The user is responsible for the long-term storage of their data." - ] - }, - { - "cell_type": "markdown", - "id": "481eeb82-9240-4fdf-84ab-87e39681d201", - "metadata": {}, - "source": [ - "```python\n", - "import os\n", - "import shutil\n", - "\n", - "cache_dir = \"./cache\"\n", - "if os.path.exists(cache_dir):\n", - " print(os.listdir(cache_dir))\n", - " try:\n", - " shutil.rmtree(cache_dir)\n", - " except OSError:\n", - " pass\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1de93586-d302-4aa6-878a-51acfb1d3009", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} +{"metadata":{"kernelspec":{"display_name":"Flux","language":"python","name":"flux"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.12.7"}},"nbformat_minor":5,"nbformat":4,"cells":[{"id":"ddf66f38-dc4a-4306-8b1c-b923fdb76922","cell_type":"markdown","source":"# HPC Submission Mode\nIn contrast to the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html) and the [HPC allocation mode](https://executorlib.readthedocs.io/en/latest/3-hpc-allocation.html) the HPC Submission Mode does not communicate via the [zero message queue](https://zeromq.org) but instead stores the python functions on the file system and uses the job scheduler to handle the dependencies of the Python functions. Consequently, the block allocation `block_allocation` and the init function `init_function` are not available in HPC Submission mode. At the same time it is possible to close the Python process which created the `Executor`, wait until the execution of the submitted Python functions is completed and afterwards reload the results from the cache. \n\nInternally the HPC submission mode is using the [Python simple queuing system adatper (pysqa)](https://pysqa.readthedocs.io) to connect to HPC job schedulers and the [h5py](https://www.h5py.org) package for serializing the Python functions to store them on the file system. Both packages are optional dependency of executorlib. The installation of the [pysqa](https://pysqa.readthedocs.io) package and the [h5py](https://www.h5py.org) package are covered in the installation section. ","metadata":{}},{"id":"d56862a6-8279-421d-a090-7ca2a3c4d416","cell_type":"markdown","source":"## SLURM\nThe [Simple Linux Utility for Resource Management (SLURM)](https://slurm.schedmd.com) job scheduler is currently the most commonly used job scheduler for HPC clusters. In the HPC submission mode executorlib internally uses the [sbatch](https://slurm.schedmd.com/sbatch.html) command this is in contrast to the [HPC allocatiom mode] which internally uses the [srun](https://slurm.schedmd.com/srun.html) command. \n\nThe connection to the job scheduler is based on the [Python simple queuing system adatper (pysqa)](https://pysqa.readthedocs.io). It provides a default configuration for most commonly used job schedulers including SLURM, in addition it is also possible to provide the submission template as part of the resource dictionary `resource_dict` or via the path to the configuration directory with the `pysqa_config_directory` parameter. All three options are covered in more detail on the [pysqa documentation](https://pysqa.readthedocs.io).","metadata":{}},{"id":"cd9977b9-91f2-4c0e-aa8c-8386109fcd1c","cell_type":"code","source":"from executorlib import Executor","metadata":{"trusted":true},"outputs":[],"execution_count":1},{"id":"b20913f3-59e4-418c-a399-866124f8e497","cell_type":"markdown","source":"In comparison to the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html), the only two parameters which are changed are the specification of the backend as `backend=\"slurm_submission\"` and the requirement to specify the cache directory using the `cache_directory=\"./cache\"`. The rest of the syntax remains exactly the same, to simplify the up-scaling of simulation workflows. ","metadata":{}},{"id":"0b8f3b77-6199-4736-9f28-3058c5230777","cell_type":"markdown","source":"```python\nwith Executor(backend=\"slurm_submission\", cache_directory=\"./cache\") as exe:\n future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]\n print([f.result() for f in future_lst])\n```","metadata":{}},{"id":"37bef7ac-ce3e-4d8a-b848-b1474c370bca","cell_type":"markdown","source":"Specific parameters for HPC submission mode like the maximum run time `\"run_time_max\"`, the maximum memory `\"memory_max\"` or the submission template for the job submission script `\"submission_template\"` can be specified as part of the resource dictionary. Again it is possible to specify the resource dictonary `resource_dicionary` either for each function in the `submit()` function or during the initialization of the `Executor`. ","metadata":{}},{"id":"658781de-f222-4235-8c26-b0f77a0831b3","cell_type":"markdown","source":"```python\nsubmission_template = \"\"\"\\\n#!/bin/bash\n#SBATCH --output=time.out\n#SBATCH --job-name={{job_name}}\n#SBATCH --chdir={{working_directory}}\n#SBATCH --get-user-env=L\n#SBATCH --partition={{partition}}\n{%- if run_time_max %}\n#SBATCH --time={{ [1, run_time_max // 60]|max }}\n{%- endif %}\n{%- if dependency %}\n#SBATCH --dependency=afterok:{{ dependency | join(',') }}\n{%- endif %}\n{%- if memory_max %}\n#SBATCH --mem={{memory_max}}G\n{%- endif %}\n#SBATCH --cpus-per-task={{cores}}\n\n{{command}}\n\"\"\"\n\nwith Executor(backend=\"slurm_submission\", cache_directory=\"./cache\") as exe:\n future = exe.submit(\n sum, [4, 4], \n resource_dict={\n \"submission_template\": submission_template, \n \"run_time_max\": 180, # in seconds \n })\n print([f.result() for f in future_lst])\n```","metadata":{}},{"id":"f7ad9c97-7743-4f87-9344-4299b2b31a56","cell_type":"markdown","source":"With these options executorlib in combination with the SLURM job scheduler provides a lot flexibility to configure the submission of Python functions depending on the specific configuration of the job scheduler. ","metadata":{}},{"id":"2a814efb-2fbc-41ba-98df-cf121d19ea66","cell_type":"markdown","source":"## Flux\nWhile most HPC job schedulers require extensive configuration before they can be tested, the [flux framework](http://flux-framework.org) can be installed with the conda package manager, as explained in the [installation section](https://executorlib.readthedocs.io/en/latest/installation.html#alternative-installations). This simple installation makes the flux framework especially suitable for demonstrations, testing and continous integration. So below a number of features for the HPC submission mode are demonstrated based on the example of the [flux framework](http://flux-framework.org) still the same applies to other job schedulers like SLURM introduced above.","metadata":{}},{"id":"29d7aa18-357e-416e-805c-1322b59abec1","cell_type":"markdown","source":"### Dependencies\nAs already demonstrated for the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html) the `Executor` class from executorlib is capable of resolving the dependencies of serial functions, when [concurrent futures Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects) objects are used as inputs for subsequent function calls. For the case of the HPC submission these dependencies are communicated to the job scheduler, which allows to stop the Python process which created the `Executor` class, wait until the execution of the submitted Python functions is completed and afterwards restart the Python process for the `Executor` class and reload the calculation results from the cache defined by the `cache_directory` parameter.","metadata":{}},{"id":"a388eb6b-13f5-425c-97eb-97a27f85c68d","cell_type":"code","source":"def add_funct(a, b):\n return a + b","metadata":{"trusted":true},"outputs":[],"execution_count":2},{"id":"d78a5a39-ba15-4352-b7d8-f673e57f5312","cell_type":"code","source":"with Executor(backend=\"flux_submission\", cache_directory=\"./cache\") as exe:\n future = 0\n for i in range(4, 8):\n future = exe.submit(add_funct, i, future)\n print(future.result())","metadata":{"trusted":true},"outputs":[{"name":"stdout","text":"22\n","output_type":"stream"}],"execution_count":3},{"id":"ca75cb6c-c50f-4bee-9b09-d8d29d6c263b","cell_type":"markdown","source":"### Resource Assignment\nIn analogy to the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html) the resource assignment for the HPC submission mode is handled by either including the resource dictionary parameter `resource_dict` in the initialization of the `Executor` class or in every call of the `submit()` function. \n\nBelow this is demonstrated once for the assignment of muliple CPU cores for the execution of a Python function which internally uses the message passing interface (MPI) via the [mpi4py](https://mpi4py.readthedocs.io) package. ","metadata":{}},{"id":"c8599943-e145-431f-8096-6dbe30f013ac","cell_type":"code","source":"def calc(i):\n from mpi4py import MPI\n\n size = MPI.COMM_WORLD.Get_size()\n rank = MPI.COMM_WORLD.Get_rank()\n return i, size, rank","metadata":{"trusted":true},"outputs":[],"execution_count":4},{"id":"99fd8346-f23a-4351-94e0-2d0b1259a1c6","cell_type":"code","source":"with Executor(backend=\"flux_submission\", cache_directory=\"./cache\") as exe:\n fs = exe.submit(calc, 3, resource_dict={\"cores\": 2})\n print(fs.result())","metadata":{"trusted":true},"outputs":[{"name":"stdout","text":"[(3, 2, 0), (3, 2, 1)]\n","output_type":"stream"}],"execution_count":5},{"id":"d91499d7-5c6c-4c10-b7b7-bfc4b87ddaa8","cell_type":"markdown","source":"Beyond CPU cores and threads which were previously also introduced for the [Local Mode](https://executorlib.readthedocs.io/en/latest/1-local.html) the HPC submission mode also provides the option to select the available accelerator cards or GPUs, by specifying the `\"gpus_per_core\"` parameter in the resource dictionary `resource_dict`. For demonstration we create a Python function which reads the GPU device IDs and submit it to the `Executor` class:\n```python\ndef get_available_gpus():\n import socket\n from tensorflow.python.client import device_lib\n local_device_protos = device_lib.list_local_devices()\n return [\n (x.name, x.physical_device_desc, socket.gethostname()) \n for x in local_device_protos if x.device_type == 'GPU'\n ]\n```\n\n```python\nwith Executor(\n backend=\"flux_submission\",\n cache_directory=\"./cache\",\n resource_dict={\"gpus_per_core\": 1}\n) as exe:\n fs_1 = exe.submit(get_available_gpus)\n fs_2 = exe.submit(get_available_gpus)\n print(fs_1.result(), fs_2.result())\n```","metadata":{}},{"id":"3f47fd34-04d1-42a7-bb06-6821dc99a648","cell_type":"markdown","source":"### Cleaning Cache\nFinally, as the HPC Submission Mode leverages the file system to communicate serialized Python functions, it is important to clean up the cache directory specified by the `cache_directory` parameter once the results of the submitted Python functions are no longer needed. The serialized Python functions are stored in binary format using the [cloudpickle](https://github.com/cloudpipe/cloudpickle) library for serialization. This format is design for caching but not for long-term storage. The user is responsible for the long-term storage of their data.","metadata":{}},{"id":"1de93586-d302-4aa6-878a-51acfb1d3009","cell_type":"code","source":"import os\nimport shutil\n\ncache_dir = \"./cache\"\nif os.path.exists(cache_dir):\n print(os.listdir(cache_dir))\n try:\n shutil.rmtree(cache_dir)\n except OSError:\n pass","metadata":{"trusted":true},"outputs":[{"name":"stdout","text":"['add_funct3fa9a188d23bb4964cde4df14581dcf7.h5out', 'time.out', 'add_funct1fedb9b58536b1d0f3639fcd83c02176.h5out', 'run_queue.sh', 'add_funct8112d1fc6fb83fdcf61064169015cc3a.h5out', 'calce986bd53f45c57b4a48f89f1118f3f56.h5out', 'add_funct51b59596564885f953618cdf9a724462.h5out', 'error.out']\n","output_type":"stream"}],"execution_count":6}]} \ No newline at end of file