From 3f191454db2b979e84d0c5a6bf95d838f099f74e Mon Sep 17 00:00:00 2001
From: gpwolfe
Date: Thu, 6 Feb 2025 15:07:53 -0500
Subject: [PATCH] Add parsing helper funcs, update example notebook

---
 colabfit/tools/parsers.py                    | 98 ++++++++++++++++----
 colabfit/tools/property.py                   |  6 +-
 examples/carbon_allotrope_vast_example.ipynb | 56 ++++-------
 3 files changed, 101 insertions(+), 59 deletions(-)

diff --git a/colabfit/tools/parsers.py b/colabfit/tools/parsers.py
index 53b0c6f..c5c21b2 100644
--- a/colabfit/tools/parsers.py
+++ b/colabfit/tools/parsers.py
@@ -1,9 +1,81 @@
 from ase import Atoms
+from ase.io import iread
 from colabfit.tools.configuration import AtomicConfiguration
 from colabfit.tools.utilities import convert_stress
 from pathlib import Path
 import re
 
+##############################################################
+# Helper functions
+##############################################################
+
+
+def name_config_by_filepath(fp: Path, dataset_path: Path) -> str:
+    """
+    Generates a configuration name from the filepath with dataset path removed.
+    Args:
+        fp (Path): File path from which the configuration name is to be generated.
+        dataset_path (Path): dataset_path will be removed from the beginning of the fp.
+    Returns:
+        str: The generated configuration name.
+    Raises:
+        ValueError: If fp is not located under dataset_path.
+    """
+    relative_path = fp.relative_to(dataset_path)
+    name = "__".join(relative_path.parts)
+    return name
+
+
+def read_directory(directory: Path, parser, rglobstr="*.extxyz", **kwargs):
+    """
+    Read all files in a directory with a given parser.
+    Args:
+        directory (Path or str): Parent directory of data files.
+        parser (function): Parser function to read data files.
+        rglobstr (str): rglob string to search for data files.
+        kwargs: Additional keyword arguments to pass to the parser.
+    Returns:
+        A generator of parsed configurations, in sorted file order.
+    Raises:
+        ValueError: If directory does not exist.
+    Note:
+        This is a generator, so the existence check runs on first iteration.
+    """
+    directory = Path(directory)
+    if not directory.exists():
+        raise ValueError(f"{directory} does not exist")
+    # rglob order is filesystem-dependent; sort for reproducible output
+    for file in sorted(directory.rglob(rglobstr)):
+        yield from parser(file, **kwargs)
+
+
+##############################################################
+# extxyz file parser
+##############################################################
+
+
+def read_extxyz(filepath: Path, dataset_path: Path):
+    """Yields configurations named by relative file path plus frame index"""
+    name = name_config_by_filepath(filepath, dataset_path)
+    with open(filepath, "rt") as f:
+        for i, config in enumerate(iread(f, format="extxyz", index=":")):
+            config.info["_name"] = f"{name}__index__{i}"
+            yield AtomicConfiguration.from_ase(config)
+
+
+def read_extxyz_no_ix(filepath: Path, dataset_path: Path):
+    """Returns configurations with no index in the configuration name"""
+    name = name_config_by_filepath(filepath, dataset_path)
+    with open(filepath, "rt") as f:
+        for config in iread(f, format="extxyz", index=":"):
+            config.info["_name"] = name
+            yield AtomicConfiguration.from_ase(config)
+
+
+##############################################################
+# MLIP .cfg file parser
+##############################################################
+
 
 def mlip_cfg_reader(symbol_map, filepath):
     with open(filepath, "rt") as f:
@@ -58,9 +130,7 @@ def mlip_cfg_reader(symbol_map, filepath):
                             ]
                         )
                     if "fx" in keys:
-                        forces.append(
-                            [float(f) for f in [li["fx"], li["fy"], li["fz"]]]
-                        )
+                        forces.append([float(f) for f in [li["fx"], li["fy"], li["fz"]]])
             elif line.startswith("END_CFG"):
                 if "cartes_x" in keys:
                     config = Atoms(positions=coords, symbols=symbols, cell=cell)
@@ -82,7 +152,7 @@ def mlip_cfg_reader(symbol_map, filepath):
 
 
 ##############################################################
-# VASP OUTCAR parser
+# VASP OUTCAR parser functions
 ##############################################################
 
 
@@ -131,13 +201,6 @@ def vasp_contcar_parser(fp):
     return symbol_arr
 
 
-def config_namer_by_filepath(fp, dataset_path):
-    ds_fp_str = "__".join(dataset_path.absolute().parts).replace("/", "")
-    name = "__".join(fp.absolute().parts[:-1]).replace("/", "")
-    name = name.replace(ds_fp_str + "__", "")
-    return name
-
-
 def vasp_outcar_reader(symbols, fp):
     with open(fp, "r") as f:
         incar = dict()
@@ -153,13 +216,11 @@ def vasp_outcar_reader(symbols, fp):
         for line in f:
             # Prelim handling
             if line.strip() == "":
-                pass
-
+                continue
             # handle lattice
             elif "direct lattice vectors" in line:
                 in_latt = True
                 lattice = []
-                pass
             elif in_latt is True:
                 latt = line.strip().replace("-", " -").split()
                 lattice.append([float(x) for x in [latt[0], latt[1], latt[2]]])
@@ -187,10 +248,9 @@ def vasp_outcar_reader(symbols, fp):
                 energy = None
             elif "POSITION" in line:
                 in_coords = True
-                pass
             elif in_coords is True:
                 if "--------" in line:
-                    pass
+                    continue
                 elif "total drift" in line:
                     in_coords = False
                     if energy is not None:
@@ -209,7 +269,7 @@ def vasp_outcar_reader(symbols, fp):
                         pos = []
                         energy = None
                     else:
-                        pass
+                        continue
                 else:
                     cmatch = vasp_coord_regex.search(line)
                     pos.append(
@@ -225,7 +285,7 @@ def vasp_outcar_reader(symbols, fp):
                 stress = convert_stress(stress_keys, stress)
 
             else:
-                pass
+                continue
                 # print("something went wrong")
 
 
@@ -255,7 +315,7 @@ def file_finder(fp, file_glob, count=0):
 def vasp_outcar_wrapper(data_dir: Path, dataset_path, CO_METADATA=None):
     outcars = sorted(list(data_dir.rglob("OUTCAR")))
     for filepath in outcars:
-        name = config_namer_by_filepath(filepath, dataset_path)
+        name = name_config_by_filepath(filepath, dataset_path)
         poscar = next(filepath.parent.glob(filepath.name.replace("OUTCAR", "POSCAR")))
         symbols = vasp_contcar_parser(poscar)
         kpoints_file = file_finder(filepath.parent, "KPOINTS")
diff --git a/colabfit/tools/property.py b/colabfit/tools/property.py
index 1c0f257..3b2365d 100644
--- a/colabfit/tools/property.py
+++ b/colabfit/tools/property.py
@@ -468,7 +468,7 @@ def from_definition(
             elif isinstance(data, np.floating):
                 data = float(data)
             elif isinstance(data, (str, bool, int, float)):
-                pass
+                pass  # plain scalars need no conversion but are still stored below
             instance[key] = {
                 "source-value": data,
             }
@@ -821,7 +821,7 @@ def validate_metadata(self):
         for prop_name in self.properties.keys():
             if self._metadata["property_keys"]["value"][prop_name] is None:
                 raise ValueError(
-                    f"Metadata must have 'original_file_key' set for each property. None set for '{prop_name}'."
+                    f"Metadata must have 'original_file_key' set for each property. None set for '{prop_name}'."  # noqa: E501
                 )
 
     def validate_properties(self):
@@ -863,7 +863,7 @@ def validate_properties(self):
                     raise ValueError(f"Property '{prop_name}' must have 'units' set.")
                 elif val.get("has-unit") is False and prop_view.get("units") is not None:
                     raise ValueError(
-                        f"Property '{prop_name}' must have key {key}: 'units' set to None."
+                        f"Property '{prop_name}' must have key {key}: 'units' set to None."  # noqa: E501
                     )
                 if self._metadata["property_keys"]["value"][prop_name] is None:
                     raise ValueError(
diff --git a/examples/carbon_allotrope_vast_example.ipynb b/examples/carbon_allotrope_vast_example.ipynb
index 3836de8..4a90472 100644
--- a/examples/carbon_allotrope_vast_example.ipynb
+++ b/examples/carbon_allotrope_vast_example.ipynb
@@ -35,6 +35,7 @@
     "from colabfit.tools.configuration import AtomicConfiguration\n",
     "from colabfit.tools.database import DataManager, VastDataLoader, generate_ds_id\n",
     "from colabfit.tools.configuration_set import configuration_set_info\n",
+    "from colabfit.tools.parsers import read_extxyz, read_directory\n",
     "from colabfit.tools.property import PropertyMap, property_info\n",
     "from colabfit.tools.property_definitions import (\n",
     "    atomic_forces_pd,\n",
@@ -204,7 +205,7 @@
    "id": "4f5f650d",
    "metadata": {},
    "source": [
-    "### Define reader function"
+    "### Define reader function and insert Property Objects and Configurations"
    ]
   },
   {
@@ -218,44 +219,25 @@
     "DATASET_FP = Path(\"/path/to/data/files\")\n",
     "\n",
     "\n",
-    "# Reader function should output a colabfit AtomicConfiguration object\n",
-    "def reader(fp: Path):\n",
-    "    # names and/or labels may be used later to define configuration sets\n",
-    "    name = str(fp).replace(str(DATASET_FP), \"\").split(\"/\")\n",
-    "    name = \"__\".join([x for x in name if x != \"\"])\n",
-    "    # In this dataset, there is only one configuration per file, but the following would handle files with multiple configurations\n",
-    "    iter_configs = iread(fp, format=\"extxyz\", index=\":\")\n",
-    "    for i, config in enumerate(iter_configs):\n",
-    "        config.info[\"_name\"] = name\n",
-    "        yield AtomicConfiguration.from_ase(config)\n",
+    "# Reader function may be custom, but should output a colabfit AtomicConfiguration object\n",
+    "# The following is approximately the extxyz parser from colabfit.tools.parsers\n",
+    "# Note that for a directory of extxyz files, a wrapper function will be needed (as below)\n",
+    "# def reader(fp: Path):\n",
+    "#     # names and/or labels may be used later to define configuration sets\n",
+    "#     name = str(fp).replace(str(DATASET_FP), \"\").split(\"/\")\n",
+    "#     name = \"__\".join([x for x in name if x != \"\"])\n",
+    "#     # In this dataset, there is only one configuration per file, but the following would handle files with multiple configurations\n",
+    "#     iter_configs = iread(fp, format=\"extxyz\", index=\":\")\n",
+    "#     for i, config in enumerate(iter_configs):\n",
+    "#         config.info[\"_name\"] = name\n",
+    "#         yield AtomicConfiguration.from_ase(config)\n",
     "\n",
     "\n",
-    "# Wrapper to apply reader function to directory\n",
-    "def read_directory(dir_path: str):\n",
-    "    dir_path = Path(dir_path)\n",
-    "    if not dir_path.exists():\n",
-    "        return\n",
-    "    data_paths = sorted(list(dir_path.rglob(\"*.xyz\")))\n",
-    "    for data_path in data_paths:\n",
-    "        yield from reader(data_path)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ff960c0f",
-   "metadata": {},
-   "source": [
-    "### Insert Property Objects and Configurations"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0cb311ba",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "config_generator = read_directory(DATASET_FP)\n",
+    "# See the definition of read_directory in colabfit.tools.parsers. `dataset_path` is a kwarg passed\n",
+    "# to read_extxyz (also in colabfit.tools.parsers) to help set the `config.info[\"_name\"]` field.\n",
+    "config_generator = read_directory(\n",
+    "    DATASET_FP, parser=read_extxyz, rglobstr=\"*.xyz\", dataset_path=DATASET_FP\n",
+    ")\n",
     "dm = DataManager(\n",
     "    configs=config_generator,\n",
     "    prop_defs=[energy_pd, atomic_forces_pd],\n",