Skip to content

Commit

Permalink
Add parsing helper funcs, update example notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
gpwolfe committed Feb 6, 2025
1 parent 74ead68 commit 3f19145
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 59 deletions.
98 changes: 79 additions & 19 deletions colabfit/tools/parsers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,81 @@
from ase import Atoms
from ase.io import iread
from colabfit.tools.configuration import AtomicConfiguration
from colabfit.tools.utilities import convert_stress
from pathlib import Path
import re

##############################################################
# Helper functions
##############################################################


def name_config_by_filepath(fp: Path, dataset_path: Path) -> str:
    """
    Generate a configuration name from a file path, relative to the dataset root.

    Args:
        fp (Path): File path from which the configuration name is generated.
        dataset_path (Path): Root path stripped from the front of ``fp``.

    Returns:
        str: The path components of ``fp`` below ``dataset_path``, joined by "__".
    """
    return "__".join(fp.relative_to(dataset_path).parts)


def read_directory(directory: Path, parser, rglobstr="*.extxyz", **kwargs):
    """
    Read all files in a directory with a given parser.

    Args:
        directory (Path or str): Parent directory of data files.
        parser (function): Parser function to read data files; called as
            ``parser(file, **kwargs)`` and expected to yield configurations.
        rglobstr (str): rglob string to search for data files.
        kwargs: Additional keyword arguments to pass to the parser.

    Returns:
        A generator of parsed configurations.

    Raises:
        ValueError: If ``directory`` cannot be converted to a Path, or if it
            does not exist.
    """
    if isinstance(directory, str):
        # Keep the try narrow: only the conversion itself should be reported
        # as a conversion failure. (Previously the existence check sat inside
        # this try, so "does not exist" was re-wrapped as a misleading
        # "Could not convert ... to Path object" error.)
        try:
            directory = Path(directory)
        except Exception as e:
            raise ValueError(f"Could not convert {directory} to Path object") from e
    # Check existence for Path inputs too, not only for str inputs.
    if not directory.exists():
        raise ValueError(f"{directory} does not exist")
    for file in directory.rglob(rglobstr):
        # parser(file, **kwargs) with empty kwargs is identical to parser(file)
        yield from parser(file, **kwargs)


##############################################################
# extxyz file parser
##############################################################


def read_extxyz(filepath: Path, dataset_path: Path):
    """
    Yield AtomicConfiguration objects read from one extxyz file.

    Each configuration's ``info["_name"]`` is set to the file-path-derived
    name suffixed with ``__index__<i>``, where ``i`` is the configuration's
    position within the file.
    """
    base_name = name_config_by_filepath(filepath, dataset_path)
    with open(filepath, "rt") as f:
        position = 0
        for config in iread(f, format="extxyz", index=":"):
            config.info["_name"] = f"{base_name}__index__{position}"
            yield AtomicConfiguration.from_ase(config)
            position += 1


def read_extxyz_no_ix(filepath: Path, dataset_path: Path):
    "Returns configurations with no index in the configuration name"
    # Every configuration in the file shares the same path-derived name,
    # so compute it once up front (the enumerate index was unused here).
    shared_name = name_config_by_filepath(filepath, dataset_path)
    with open(filepath, "rt") as f:
        for config in iread(f, format="extxyz", index=":"):
            config.info["_name"] = shared_name
            yield AtomicConfiguration.from_ase(config)


##############################################################
# MLIP .cfg file parser
##############################################################


def mlip_cfg_reader(symbol_map, filepath):
with open(filepath, "rt") as f:
Expand Down Expand Up @@ -58,9 +130,7 @@ def mlip_cfg_reader(symbol_map, filepath):
]
)
if "fx" in keys:
forces.append(
[float(f) for f in [li["fx"], li["fy"], li["fz"]]]
)
forces.append([float(f) for f in [li["fx"], li["fy"], li["fz"]]])
elif line.startswith("END_CFG"):
if "cartes_x" in keys:
config = Atoms(positions=coords, symbols=symbols, cell=cell)
Expand All @@ -82,7 +152,7 @@ def mlip_cfg_reader(symbol_map, filepath):


##############################################################
# VASP OUTCAR parser
# VASP OUTCAR parser functions
##############################################################


Expand Down Expand Up @@ -131,13 +201,6 @@ def vasp_contcar_parser(fp):
return symbol_arr


def config_namer_by_filepath(fp, dataset_path):
ds_fp_str = "__".join(dataset_path.absolute().parts).replace("/", "")
name = "__".join(fp.absolute().parts[:-1]).replace("/", "")
name = name.replace(ds_fp_str + "__", "")
return name


def vasp_outcar_reader(symbols, fp):
with open(fp, "r") as f:
incar = dict()
Expand All @@ -153,13 +216,11 @@ def vasp_outcar_reader(symbols, fp):
for line in f:
# Prelim handling
if line.strip() == "":
pass

continue
# handle lattice
elif "direct lattice vectors" in line:
in_latt = True
lattice = []
pass
elif in_latt is True:
latt = line.strip().replace("-", " -").split()
lattice.append([float(x) for x in [latt[0], latt[1], latt[2]]])
Expand Down Expand Up @@ -187,10 +248,9 @@ def vasp_outcar_reader(symbols, fp):
energy = None
elif "POSITION" in line:
in_coords = True
pass
elif in_coords is True:
if "--------" in line:
pass
continue
elif "total drift" in line:
in_coords = False
if energy is not None:
Expand All @@ -209,7 +269,7 @@ def vasp_outcar_reader(symbols, fp):
pos = []
energy = None
else:
pass
continue
else:
cmatch = vasp_coord_regex.search(line)
pos.append(
Expand All @@ -225,7 +285,7 @@ def vasp_outcar_reader(symbols, fp):
stress = convert_stress(stress_keys, stress)

else:
pass
continue
# print("something went wrong")


Expand Down Expand Up @@ -255,7 +315,7 @@ def file_finder(fp, file_glob, count=0):
def vasp_outcar_wrapper(data_dir: Path, dataset_path, CO_METADATA=None):
outcars = sorted(list(data_dir.rglob("OUTCAR")))
for filepath in outcars:
name = config_namer_by_filepath(filepath, dataset_path)
name = name_config_by_filepath(filepath, dataset_path)
poscar = next(filepath.parent.glob(filepath.name.replace("OUTCAR", "POSCAR")))
symbols = vasp_contcar_parser(poscar)
kpoints_file = file_finder(filepath.parent, "KPOINTS")
Expand Down
6 changes: 3 additions & 3 deletions colabfit/tools/property.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ def from_definition(
elif isinstance(data, np.floating):
data = float(data)
elif isinstance(data, (str, bool, int, float)):
pass
continue
instance[key] = {
"source-value": data,
}
Expand Down Expand Up @@ -821,7 +821,7 @@ def validate_metadata(self):
for prop_name in self.properties.keys():
if self._metadata["property_keys"]["value"][prop_name] is None:
raise ValueError(
f"Metadata must have 'original_file_key' set for each property. None set for '{prop_name}'."
f"Metadata must have 'original_file_key' set for each property. None set for '{prop_name}'." # noqa E501
)

def validate_properties(self):
Expand Down Expand Up @@ -863,7 +863,7 @@ def validate_properties(self):
raise ValueError(f"Property '{prop_name}' must have 'units' set.")
elif val.get("has-unit") is False and prop_view.get("units") is not None:
raise ValueError(
f"Property '{prop_name}' must have key {key}: 'units' set to None."
f"Property '{prop_name}' must have key {key}: 'units' set to None." # noqa E501
)
if self._metadata["property_keys"]["value"][prop_name] is None:
raise ValueError(
Expand Down
56 changes: 19 additions & 37 deletions examples/carbon_allotrope_vast_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"from colabfit.tools.configuration import AtomicConfiguration\n",
"from colabfit.tools.database import DataManager, VastDataLoader, generate_ds_id\n",
"from colabfit.tools.configuration_set import configuration_set_info\n",
"from colabfit.tools.parsers import read_extxyz, read_directory\n",
"from colabfit.tools.property import PropertyMap, property_info\n",
"from colabfit.tools.property_definitions import (\n",
" atomic_forces_pd,\n",
Expand Down Expand Up @@ -204,7 +205,7 @@
"id": "4f5f650d",
"metadata": {},
"source": [
"### Define reader function"
"### Define reader function and insert Property Objects and Configurations"
]
},
{
Expand All @@ -218,44 +219,25 @@
"DATASET_FP = Path(\"/path/to/data/files\")\n",
"\n",
"\n",
"# Reader function should output a colabfit AtomicConfiguration object\n",
"def reader(fp: Path):\n",
" # names and/or labels may be used later to define configuration sets\n",
" name = str(fp).replace(str(DATASET_FP), \"\").split(\"/\")\n",
" name = \"__\".join([x for x in name if x != \"\"])\n",
" # In this dataset, there is only one configuration per file, but the following would handle files with multiple configurations\n",
" iter_configs = iread(fp, format=\"extxyz\", index=\":\")\n",
" for i, config in enumerate(iter_configs):\n",
" config.info[\"_name\"] = name\n",
" yield AtomicConfiguration.from_ase(config)\n",
"# Reader function may be custom, but should output a colabfit AtomicConfiguration object\n",
"# The following is approximately the extxyz parser from colabfit.tools.parsers\n",
"# Note that for a directory of extxyz files, a wrapper function will be needed (as below)\n",
"# def reader(fp: Path):\n",
"# # names and/or labels may be used later to define configuration sets\n",
"# name = str(fp).replace(str(DATASET_FP), \"\").split(\"/\")\n",
"# name = \"__\".join([x for x in name if x != \"\"])\n",
"# # In this dataset, there is only one configuration per file, but the following would handle files with multiple configurations\n",
"# iter_configs = iread(fp, format=\"extxyz\", index=\":\")\n",
"# for i, config in enumerate(iter_configs):\n",
"# config.info[\"_name\"] = name\n",
"# yield AtomicConfiguration.from_ase(config)\n",
"\n",
"\n",
"# Wrapper to apply reader function to directory\n",
"def read_directory(dir_path: str):\n",
" dir_path = Path(dir_path)\n",
" if not dir_path.exists():\n",
" return\n",
" data_paths = sorted(list(dir_path.rglob(\"*.xyz\")))\n",
" for data_path in data_paths:\n",
" yield from reader(data_path)"
]
},
{
"cell_type": "markdown",
"id": "ff960c0f",
"metadata": {},
"source": [
"### Insert Property Objects and Configurations"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0cb311ba",
"metadata": {},
"outputs": [],
"source": [
"config_generator = read_directory(DATASET_FP)\n",
"# See the definition of read_directory in colabfit.tools.parsers. `dataset_path` is a kwarg passed\n",
"# to read_extxyz (also in colabfit.tools.parsers) to help set the `config.info[\"_name\"]` field.\n",
"config_generator = read_directory(\n",
" DATASET_FP, reader=read_extxyz, rglobstr=\"*.xyz\", dataset_path=DATASET_FP\n",
")\n",
"dm = DataManager(\n",
" configs=config_generator,\n",
" prop_defs=[energy_pd, atomic_forces_pd],\n",
Expand Down

0 comments on commit 3f19145

Please sign in to comment.