Tutorial for blob storage with test (#776)

* tut test * tut test * tut test * nbmake * nbmake * nbmake * nbmake * nbmake * nbmake * enums * enums * nbmake --------- Co-authored-by: Alex Ganose <[email protected]>
materialsproject · Jul 18, 2024 · 42cae12 · 42cae12
1 parent aaa1f7b
commit 42cae12
Show file tree

Hide file tree

Showing 8 changed files with 285 additions and 1 deletion.
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -71,6 +71,9 @@ jobs:
         if: github.event_name == 'repository_dispatch' && github.event.action == 'pymatgen-ci-trigger'
         run: pip install --upgrade 'git+https://github.com/materialsproject/pymatgen@${{ github.event.client_payload.pymatgen_ref }}'
 
+      - name: Test Notebooks
+        run: pytest --nbmake ./tutorials
+
       - name: Test
         env:
           MP_API_KEY: ${{ secrets.MP_API_KEY }}

diff --git a/.gitignore b/.gitignore
@@ -66,3 +66,5 @@ docs/reference/atomate2.*
 
 # see https://github.com/materialsproject/atomate2/issues/345
 *.doctrees*
+
+.ipynb_checkpoints
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -44,3 +44,12 @@ repos:
     stages: [commit, commit-msg]
     args: [--ignore-words-list, 'titel,statics,ba,nd,te,atomate']
     types_or: [python, rst, markdown]
+- repo: https://github.com/kynan/nbstripout
+  rev: 0.7.1
+  hooks:
+    - id: nbstripout
+      args: [
+        --drop-empty-cells,
+        --strip-init-cells,
+        --extra-keys=metadata.kernelspec,
+        ]
diff --git a/pyproject.toml b/pyproject.toml
@@ -75,6 +75,7 @@ tests = [
     "pytest-cov==5.0.0",
     "pytest-mock==3.14.0",
     "pytest==8.0.2",
+    "nbmake==1.5.3"
 ]
 strict = [
     "PyYAML==6.0.1",

diff --git a/tests/vasp/conftest.py b/tests/vasp/conftest.py
@@ -84,6 +84,15 @@ def mock_vasp(
 
     For examples, see the tests in tests/vasp/makers/core.py.
     """
+    yield from _mock_vasp(monkeypatch, vasp_test_dir)
+
+
+def _mock_vasp(
+    monkeypatch: MonkeyPatch, vasp_test_dir: Path
+) -> Generator[Callable[[Any, Any], Any], None, None]:
+    """
+    Isolated version of the mock_vasp fixture that can be used in other contexts.
+    """
 
     def mock_run_vasp(*args, **kwargs):
         name = CURRENT_JOB.job.name
@@ -94,7 +103,6 @@ def mock_run_vasp(*args, **kwargs):
                 f"no reference directory found for job {name!r}; "
                 f"reference paths received={_REF_PATHS}"
             ) from None
-
         fake_run_vasp(ref_path, **_FAKE_RUN_VASP_KWARGS.get(name, {}))
 
     get_input_set_orig = VaspInputGenerator.get_input_set

diff --git a/tutorials/__init__.py b/tutorials/__init__.py
@@ -0,0 +1 @@
+"""Tutorials."""
diff --git a/tutorials/blob_storage.ipynb b/tutorials/blob_storage.ipynb
@@ -0,0 +1,218 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from atomate2.vasp.flows.core import StaticMaker\n",
+    "from jobflow import JobStore, run_locally\n",
+    "from maggma.stores import MemoryStore\n",
+    "from pymatgen.core import Structure\n",
+    "from mock_vasp import mock_vasp, TEST_DIR\n",
+    "from monty.json import MontyDecoder\n",
+    "from pymatgen.io.vasp import Chgcar\n",
+    "import numpy as np\n",
+    "\n",
+    "def decode(d):\n",
+    "    return MontyDecoder().process_decoded(d)\n",
+    "\n",
+    "jobstore = JobStore(MemoryStore(), additional_stores={\"data\": MemoryStore()})\n",
+    "si_structure = Structure.from_file(TEST_DIR / \"structures\" / \"Si.cif\")\n",
+    "ref_paths = {\"static\": \"Si_band_structure/static\"}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using Blob Storage\n",
+    "\n",
+    "While most of the output data from `atomate2` is serialized and stored in a MongoDB database, some objects exceed the 16MB limit for MongoDB documents and must be placed into blob storage.  Objects like the electronic charge density (`Chgcar`) are routinely larger than this limit and require special treatment.  `jobflow`'s method of dealing with these objects is shown below:\n",
+    "\n",
+    "```python\n",
+    "@job(data=Chgcar)\n",
+    "def some_job():\n",
+    "    # return a document/dictionary that contains a Chgcar\n",
+    "    return dictionary\n",
+    "```\n",
+    "\n",
+    "where the argument to the `@job` decorator indicates that all `Chgcar` objects will be automatically dispatched to \n",
+    "\n",
+    "```python\n",
+    "JOB_STORE.additional_stores[\"data\"]\n",
+    "```\n",
+    "\n",
+    "Which should already be configured in your `jobflow.yaml` file.\n",
+    "\n",
+    "For more details on how `additional_store` works please check out this [example](https://github.com/materialsproject/jobflow/blob/main/examples/data_store.py).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "`atomate2` will automatically dispatch some well-known large objects to the `data` blob storage.\n",
+    "\n",
+    "A full list of the objects that will be automatically dispatched to blob storage can be found [here](https://github.com/materialsproject/atomate2/blob/22b2fa0f7152aa7716906da4cf08672b8960d45d/src/atomate2/vasp/jobs/base.py#L39-L52):\n",
+    "\n",
+    "\n",
+    "\n",
+    "A common use case of object storage is in storing volumetric data from VASP outputs.  The storage of volumetric data is turned off by default, but specific files can be turned on by setting the `task_document_kwargs` for any child class of `BaseVaspMaker`.\n",
+    "For example, to store the `CHGCAR` file, you would set the `task_document_kwargs` in StaticMaker as follows:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "static_maker = StaticMaker(task_document_kwargs={\"store_volumetric_data\": (\"chgcar\",)})\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that a valid list of object `Enum` values must be provided to `store_volumetric_data` in order to store the data.  The list of valid objects can be found [here](https://github.com/materialsproject/emmet/blob/183d74c8ed640b64ba596eedbebba7072bc4f1af/emmet-core/emmet/core/vasp/calculation.py#L48)\n",
+    "\n",
+    "```python\n",
+    "class VaspObject(ValueEnum):\n",
+    "    \"\"\"Types of VASP data objects.\"\"\"\n",
+    "\n",
+    "    BANDSTRUCTURE = \"bandstructure\"\n",
+    "    DOS = \"dos\"\n",
+    "    CHGCAR = \"chgcar\"\n",
+    "    AECCAR0 = \"aeccar0\"\n",
+    "    AECCAR1 = \"aeccar1\"\n",
+    "    AECCAR2 = \"aeccar2\"\n",
+    "    TRAJECTORY = \"trajectory\"\n",
+    "    ELFCAR = \"elfcar\"\n",
+    "    WAVECAR = \"wavecar\"\n",
+    "    LOCPOT = \"locpot\"\n",
+    "    OPTIC = \"optic\"\n",
+    "    PROCAR = \"procar\"\n",
+    "```\n",
+    "\n",
+    "\n",
+    "Using the `static_maker` we can create a job and execute it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# create the job\n",
+    "job = static_maker.make(si_structure)\n",
+    "# run the job in a mock vasp environment\n",
+    "# make sure to send the results to the temporary jobstore\n",
+    "with mock_vasp(ref_paths=ref_paths) as mf:\n",
+    "    responses = run_locally(\n",
+    "        job,\n",
+    "        create_folders=True,\n",
+    "        ensure_success=True,\n",
+    "        store=jobstore,\n",
+    "        raise_immediately=True,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once the job completes, you can retrieve the full task document along with the serialized `Chgcar` object from the blob storage and reconstruct the `Chgcar` object using the `load=True` flag as shown below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "with jobstore as s:\n",
+    "    result = s.get_output(job.uuid, load=True)\n",
+    "\n",
+    "chgcar = decode(result[\"vasp_objects\"][\"chgcar\"])\n",
+    "assert isinstance(chgcar, Chgcar)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "However, if the object is too big to keep around while you are exploring the data structure, you can use the default `load=False` flag and only load the reference to the object.  This will allow you to explore the data structure without loading the object into memory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with jobstore as s:\n",
+    "    result_no_obj = s.get_output(job.uuid)\n",
+    "result_no_obj[\"vasp_objects\"]\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then you can query for the object at any time using the `blob_uuid`.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "seach_data = result_no_obj[\"vasp_objects\"][\"chgcar\"]\n",
+    "with jobstore.additional_stores[\"data\"] as s:\n",
+    "    blob_data = s.query_one(criteria={\"blob_uuid\": seach_data[\"blob_uuid\"]})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then we can deserialize the object again from the `data` subfield of the blob query result."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chgcar2 = decode(blob_data[\"data\"])"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tutorials/mock_vasp.py b/tutorials/mock_vasp.py
@@ -0,0 +1,42 @@
+"""Mock VASP functions for executing tutorials."""
+
+import contextlib
+import os
+import shutil
+import sys
+import tempfile
+from collections.abc import Generator
+from pathlib import Path
+
+from pytest import MonkeyPatch
+
+# load the vasp conftest
+TEST_ROOT = Path(__file__).parent.parent / "tests"
+TEST_DIR = TEST_ROOT / "test_data"
+VASP_TEST_DATA = TEST_ROOT / "test_data/vasp"
+sys.path.insert(0, str(TEST_ROOT / "vasp"))
+from conftest import _mock_vasp  # noqa: E402
+
+
+@contextlib.contextmanager
+def mock_vasp(ref_paths: dict) -> Generator:
+    """Mock VASP functions.
+
+    Parameters
+    ----------
+    ref_paths (dict): A dictionary of reference paths to the test data.
+
+    Yields
+    ------
+        function: A function that mocks calls to VASP.
+    """
+    for mf in _mock_vasp(MonkeyPatch(), TEST_ROOT / "test_data/vasp"):
+        fake_run_vasp_kwargs = {k: {"check_inputs": tuple()} for k in ref_paths}
+        old_cwd = os.getcwd()
+        new_path = tempfile.mkdtemp()
+        os.chdir(new_path)
+        try:
+            yield mf(ref_paths, fake_run_vasp_kwargs)
+        finally:
+            os.chdir(old_cwd)
+            shutil.rmtree(new_path)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -66,3 +66,5 @@ docs/reference/atomate2.*

		# see https://github.com/materialsproject/atomate2/issues/345
		.doctrees

		.ipynb_checkpoints