From 3f191454db2b979e84d0c5a6bf95d838f099f74e Mon Sep 17 00:00:00 2001
From: gpwolfe <g.patrick.wolfe@gmail.com>
Date: Thu, 6 Feb 2025 15:07:53 -0500
Subject: [PATCH] Add parsing helper funcs, update example notebook

---
 colabfit/tools/parsers.py                    | 98 ++++++++++++++++----
 colabfit/tools/property.py                   |  6 +-
 examples/carbon_allotrope_vast_example.ipynb | 56 ++++-------
 3 files changed, 101 insertions(+), 59 deletions(-)

diff --git a/colabfit/tools/parsers.py b/colabfit/tools/parsers.py
index 53b0c6f..c5c21b2 100644
--- a/colabfit/tools/parsers.py
+++ b/colabfit/tools/parsers.py
@@ -1,9 +1,81 @@
 from ase import Atoms
+from ase.io import iread
 from colabfit.tools.configuration import AtomicConfiguration
 from colabfit.tools.utilities import convert_stress
 from pathlib import Path
 import re
 
+##############################################################
+# Helper functions
+##############################################################
+
+
+def name_config_by_filepath(fp: Path, dataset_path: Path) -> str:
+    """
+    Generates a configuration name from the filepath with dataset path removed.
+    Args:
+        fp (Path or str): File path from which the name is generated.
+        dataset_path (Path or str): Leading path to strip from the front of fp.
+    Returns:
+        str: Remaining path parts (file name included) joined with "__".
+    """
+    # Both made absolute so relative/absolute mixes work; not under -> ValueError.
+    relative_path = Path(fp).absolute().relative_to(Path(dataset_path).absolute())
+    return "__".join(relative_path.parts)
+
+
+def read_directory(directory: Path, parser, rglobstr="*.extxyz", **kwargs):
+    """
+    Read all files in a directory with a given parser.
+    Args:
+        directory (Path or str): Parent directory of data files.
+        parser (function): Called as parser(file, **kwargs); returns an iterable.
+        rglobstr (str): rglob string to search for data files.
+        kwargs: Additional keyword arguments to pass to the parser.
+    Returns:
+        A generator of parsed configurations, files taken in sorted path order.
+    Raises:
+        ValueError: directory is not path-like or does not exist (raised lazily,
+            on first iteration, because this function is a generator).
+    """
+    try:
+        directory = Path(directory)
+    except TypeError as e:
+        raise ValueError(f"Could not convert {directory} to Path object") from e
+    # Checked outside the try so this message is not rewrapped as a conversion error.
+    if not directory.exists():
+        raise ValueError(f"{directory} does not exist")
+    for file in sorted(directory.rglob(rglobstr)):
+        # An empty kwargs dict unpacks to nothing, so one call form covers both cases.
+        yield from parser(file, **kwargs)
+
+
+##############################################################
+# extxyz file parser
+##############################################################
+
+
+def read_extxyz(filepath: Path, dataset_path: Path):
+    "Yields each configuration in the file, named <relative path>__index__<i>"
+    base_name = name_config_by_filepath(filepath, dataset_path)  # same for all frames
+    with open(filepath, "rt") as f:
+        for i, config in enumerate(iread(f, format="extxyz", index=":")):
+            config.info["_name"] = f"{base_name}__index__{i}"
+            yield AtomicConfiguration.from_ase(config)
+
+
+def read_extxyz_no_ix(filepath: Path, dataset_path: Path):
+    "Yields configurations that all share one name (the relative path, no index)"
+    with open(filepath, "rt") as f:
+        for config in iread(f, format="extxyz", index=":"):
+            config.info["_name"] = name_config_by_filepath(filepath, dataset_path)
+            yield AtomicConfiguration.from_ase(config)
+
+
+##############################################################
+# MLIP .cfg file parser
+##############################################################
+
 
 def mlip_cfg_reader(symbol_map, filepath):
     with open(filepath, "rt") as f:
@@ -58,9 +130,7 @@ def mlip_cfg_reader(symbol_map, filepath):
                             ]
                         )
                     if "fx" in keys:
-                        forces.append(
-                            [float(f) for f in [li["fx"], li["fy"], li["fz"]]]
-                        )
+                        forces.append([float(f) for f in [li["fx"], li["fy"], li["fz"]]])
             elif line.startswith("END_CFG"):
                 if "cartes_x" in keys:
                     config = Atoms(positions=coords, symbols=symbols, cell=cell)
@@ -82,7 +152,7 @@ def mlip_cfg_reader(symbol_map, filepath):
 
 
 ##############################################################
-# VASP OUTCAR parser
+# VASP OUTCAR parser functions
 ##############################################################
 
 
@@ -131,13 +201,6 @@ def vasp_contcar_parser(fp):
         return symbol_arr
 
 
-def config_namer_by_filepath(fp, dataset_path):
-    ds_fp_str = "__".join(dataset_path.absolute().parts).replace("/", "")
-    name = "__".join(fp.absolute().parts[:-1]).replace("/", "")
-    name = name.replace(ds_fp_str + "__", "")
-    return name
-
-
 def vasp_outcar_reader(symbols, fp):
     with open(fp, "r") as f:
         incar = dict()
@@ -153,13 +216,11 @@ def vasp_outcar_reader(symbols, fp):
         for line in f:
             # Prelim handling
             if line.strip() == "":
-                pass
-
+                continue
             # handle lattice
             elif "direct lattice vectors" in line:
                 in_latt = True
                 lattice = []
-                pass
             elif in_latt is True:
                 latt = line.strip().replace("-", " -").split()
                 lattice.append([float(x) for x in [latt[0], latt[1], latt[2]]])
@@ -187,10 +248,9 @@ def vasp_outcar_reader(symbols, fp):
                     energy = None
             elif "POSITION" in line:
                 in_coords = True
-                pass
             elif in_coords is True:
                 if "--------" in line:
-                    pass
+                    continue
                 elif "total drift" in line:
                     in_coords = False
                     if energy is not None:
@@ -209,7 +269,7 @@ def vasp_outcar_reader(symbols, fp):
                         pos = []
                         energy = None
                     else:
-                        pass
+                        continue
                 else:
                     cmatch = vasp_coord_regex.search(line)
                     pos.append(
@@ -225,7 +285,7 @@ def vasp_outcar_reader(symbols, fp):
                 stress = convert_stress(stress_keys, stress)
 
             else:
-                pass
+                continue
                 # print("something went wrong")
 
 
@@ -255,7 +315,7 @@ def file_finder(fp, file_glob, count=0):
 def vasp_outcar_wrapper(data_dir: Path, dataset_path, CO_METADATA=None):
     outcars = sorted(list(data_dir.rglob("OUTCAR")))
     for filepath in outcars:
-        name = config_namer_by_filepath(filepath, dataset_path)
+        name = name_config_by_filepath(filepath, dataset_path)
         poscar = next(filepath.parent.glob(filepath.name.replace("OUTCAR", "POSCAR")))
         symbols = vasp_contcar_parser(poscar)
         kpoints_file = file_finder(filepath.parent, "KPOINTS")
diff --git a/colabfit/tools/property.py b/colabfit/tools/property.py
index 1c0f257..3b2365d 100644
--- a/colabfit/tools/property.py
+++ b/colabfit/tools/property.py
@@ -468,7 +468,7 @@ def from_definition(
                         elif isinstance(data, np.floating):
                             data = float(data)
                         elif isinstance(data, (str, bool, int, float)):
-                            pass
+                            pass  # no conversion needed; still stored below
                         instance[key] = {
                             "source-value": data,
                         }
@@ -821,7 +821,7 @@ def validate_metadata(self):
         for prop_name in self.properties.keys():
             if self._metadata["property_keys"]["value"][prop_name] is None:
                 raise ValueError(
-                    f"Metadata must have 'original_file_key' set for each property. None set for '{prop_name}'."
+                    f"Metadata must have 'original_file_key' set for each property. None set for '{prop_name}'."  # noqa: E501
                 )
 
     def validate_properties(self):
@@ -863,7 +863,7 @@ def validate_properties(self):
                     raise ValueError(f"Property '{prop_name}' must have 'units' set.")
                 elif val.get("has-unit") is False and prop_view.get("units") is not None:
                     raise ValueError(
-                        f"Property '{prop_name}' must have key {key}: 'units' set to None."
+                        f"Property '{prop_name}' must have key {key}: 'units' set to None."  # noqa: E501
                     )
             if self._metadata["property_keys"]["value"][prop_name] is None:
                 raise ValueError(
diff --git a/examples/carbon_allotrope_vast_example.ipynb b/examples/carbon_allotrope_vast_example.ipynb
index 3836de8..4a90472 100644
--- a/examples/carbon_allotrope_vast_example.ipynb
+++ b/examples/carbon_allotrope_vast_example.ipynb
@@ -35,6 +35,7 @@
     "from colabfit.tools.configuration import AtomicConfiguration\n",
     "from colabfit.tools.database import DataManager, VastDataLoader, generate_ds_id\n",
     "from colabfit.tools.configuration_set import configuration_set_info\n",
+    "from colabfit.tools.parsers import read_extxyz, read_directory\n",
     "from colabfit.tools.property import PropertyMap, property_info\n",
     "from colabfit.tools.property_definitions import (\n",
     "    atomic_forces_pd,\n",
@@ -204,7 +205,7 @@
    "id": "4f5f650d",
    "metadata": {},
    "source": [
-    "### Define reader function"
+    "### Define reader function and insert Property Objects and Configurations"
    ]
   },
   {
@@ -218,44 +219,25 @@
     "DATASET_FP = Path(\"/path/to/data/files\")\n",
     "\n",
     "\n",
-    "# Reader function should output a colabfit AtomicConfiguration object\n",
-    "def reader(fp: Path):\n",
-    "    # names and/or labels may be used later to define configuration sets\n",
-    "    name = str(fp).replace(str(DATASET_FP), \"\").split(\"/\")\n",
-    "    name = \"__\".join([x for x in name if x != \"\"])\n",
-    "    # In this dataset, there is only one configuration per file, but the following would handle files with multiple configurations\n",
-    "    iter_configs = iread(fp, format=\"extxyz\", index=\":\")\n",
-    "    for i, config in enumerate(iter_configs):\n",
-    "        config.info[\"_name\"] = name\n",
-    "        yield AtomicConfiguration.from_ase(config)\n",
+    "# Reader function may be custom, but should output a colabfit AtomicConfiguration object\n",
+    "# The following is approximately the extxyz parser from colabfit.tools.parsers\n",
+    "# Note that for a directory of extxyz files, a wrapper function will be needed (as below)\n",
+    "# def reader(fp: Path):\n",
+    "#     # names and/or labels may be used later to define configuration sets\n",
+    "#     name = str(fp).replace(str(DATASET_FP), \"\").split(\"/\")\n",
+    "#     name = \"__\".join([x for x in name if x != \"\"])\n",
+    "#     # In this dataset, there is only one configuration per file, but the following would handle files with multiple configurations\n",
+    "#     iter_configs = iread(fp, format=\"extxyz\", index=\":\")\n",
+    "#     for i, config in enumerate(iter_configs):\n",
+    "#         config.info[\"_name\"] = name\n",
+    "#         yield AtomicConfiguration.from_ase(config)\n",
     "\n",
     "\n",
-    "# Wrapper to apply reader function to directory\n",
-    "def read_directory(dir_path: str):\n",
-    "    dir_path = Path(dir_path)\n",
-    "    if not dir_path.exists():\n",
-    "        return\n",
-    "    data_paths = sorted(list(dir_path.rglob(\"*.xyz\")))\n",
-    "    for data_path in data_paths:\n",
-    "        yield from reader(data_path)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ff960c0f",
-   "metadata": {},
-   "source": [
-    "### Insert Property Objects and Configurations"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0cb311ba",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "config_generator = read_directory(DATASET_FP)\n",
+    "# See the definition of read_directory in colabfit.tools.parsers. `dataset_path` is a kwarg passed\n",
+    "# to read_extxyz (also in colabfit.tools.parsers) to help set the `config.info[\"_name\"]` field.\n",
+    "config_generator = read_directory(\n",
+    "    DATASET_FP, parser=read_extxyz, rglobstr=\"*.xyz\", dataset_path=DATASET_FP\n",
+    ")\n",
     "dm = DataManager(\n",
     "    configs=config_generator,\n",
     "    prop_defs=[energy_pd, atomic_forces_pd],\n",