Add support for non-supercell clusters/molecules #209

Merged
11 commits, merged Nov 15, 2023
5 changes: 5 additions & 0 deletions CHANGES.md
@@ -1,5 +1,10 @@
# Changelog

## v1.2.0

- Molecules treated as "big supercells" are now supported via `Database.from_files_molecule`.
- Fixed a Materials Project API issue: `mpr.materials.summary.search` no longer works, so calls were switched to `mpr.materials.search`, which appears to provide the same functionality.

## v1.1.0

- Added compatibility with the [new Materials Project API (v2)](https://next-gen.materialsproject.org/api). Old users of Lightshow will have to update their API key and might notice that some materials that were previously available no longer are.
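As a quick illustration of the first changelog entry above, here is a minimal usage sketch of the new `Database.from_files_molecule` classmethod. The `my_molecules/` directory and the printed values are illustrative assumptions; only the classmethod and its defaults come from this PR.

    from lightshow import Database

    # Recursively collect every *.xyz file under my_molecules/ (a placeholder
    # path) and place each molecule in the default 20 x 20 x 20 Angstrom box.
    db = Database.from_files_molecule("my_molecules", filename="*.xyz")

    # Entries are keyed by zero-padded indices; each metadata entry records
    # the file it came from.
    print(list(db.structures.keys())[:2])      # e.g. ['00000000', '00000001']
    print(db.metadata["00000000"]["origin"])   # e.g. 'my_molecules/water.xyz'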
4 changes: 1 addition & 3 deletions lightshow/_tests/conftest.py
@@ -49,9 +49,7 @@ def test_from_materials_project_structure_names():

@pytest.fixture
def database_from_file():
dat = Database.from_files(
STRUCTURE_FILES_PATH, filename="POSCAR", cleanup_paths=True
)
dat = Database.from_files(STRUCTURE_FILES_PATH, filename="POSCAR")
return deepcopy(dat)


9 changes: 0 additions & 9 deletions lightshow/_tests/test_database.py
@@ -23,15 +23,6 @@
from geometry import consistency_check # noqa # type: ignore


def test_database_from_disk(database_from_file, test_structure_names):
dat = database_from_file
dat.initialize_supercells(9.0)
dat.initialize_inequivalent_sites()
assert set(dat.structures.keys()) == set(test_structure_names)
assert set(dat.metadata.keys()) == set(test_structure_names)
assert set(dat.supercells.keys()) == set(test_structure_names)


def test_from_materials_project(test_from_materials_project_structure_names):
try:
dat = Database.from_materials_project(
43 changes: 0 additions & 43 deletions lightshow/_tests/test_vasp.py

This file was deleted.

155 changes: 83 additions & 72 deletions lightshow/database.py
@@ -5,39 +5,19 @@

from datetime import datetime
import json
import os
from pathlib import Path
from shutil import copy2
from warnings import warn

from monty.json import MSONable
from mp_api.client import MPRester
from pymatgen.core.structure import Structure
from pymatgen.core.structure import Structure, Molecule
from tqdm import tqdm

from lightshow import _get_API_key_from_environ
from lightshow import pymatgen_utils


def _delete_common_strings(old_list_of_strings):
list_of_strings = [str(Path(xx).parent) for xx in old_list_of_strings]
list_of_names = [str(Path(xx).name) for xx in old_list_of_strings]

commonprefix = os.path.commonprefix(list_of_strings)
new_p = [x[len(commonprefix) :] for x in list_of_strings]

# Reverse every string in the list then do it again
list_of_strings_reversed = [xx[::-1] for xx in new_p]
commonsuffix = os.path.commonprefix(list_of_strings_reversed)
new_p = [x[len(commonsuffix) :] for x in list_of_strings_reversed]

final_p = [xx[::-1] for xx in new_p]

return [
str(Path(xx) / Path(name)) for xx, name in zip(final_p, list_of_names)
]


def _get_api_key(api_key):
if api_key is None:
api_key = _get_API_key_from_environ()
@@ -49,38 +29,61 @@ def _get_api_key(api_key):
class Database(MSONable):
"""Contains all materials and metadata for some database."""

def cleanup_paths(self):
"""When loading data from disk, paths can become very repetitive. This
method strips all common prefixes and suffixes from the keys of the
structures and metadata properties."""
@classmethod
def from_files_molecule(
cls,
root,
filename="*.xyz",
lattice=None,
pbar=True,
):
"""Searches for files matching the provided ``filename``, and assumes
those files are structural files in a format compatible with
``Molecule.from_file``.

old_keys = list(self._structures.keys())
new_keys = _delete_common_strings(old_keys)
for old_key, new_key in zip(old_keys, new_keys):
self._structures[new_key] = self._structures.pop(old_key)
self._metadata[new_key] = self._metadata.pop(old_key)
Parameters
----------
root : str
The directory in which to begin the search.
filename : str, optional
The files to search for. Uses ``rglob`` to recursively find any
files matching ``filename`` within the provided directory.
lattice : list of floats, optional
The a, b and c box dimensions, in Angstroms, passed to ``Molecule.get_boxed_structure`` to place each molecule in a periodic cell. Defaults to ``[20.0, 20.0, 20.0]``.
pbar : bool, optional
If True, will show a tqdm progress bar.

if self._supercells_initialized:
self._supercells[new_key] = self._supercells.pop(old_key)
Returns
-------
Database
"""

if lattice is None:
lattice = [20.0, 20.0, 20.0]

structures = {}
metadata = {}
for key, path in enumerate(
tqdm(Path(root).rglob(filename), disable=not pbar)
):
key = f"{key:08}"
molecule = Molecule.from_file(path)
structures[key] = molecule.get_boxed_structure(*lattice)
metadata[key] = {"origin": str(path)}
return cls(structures=structures, metadata=metadata, supercells=dict())
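The essential step inside the loop above is pairing `Molecule.from_file` with `get_boxed_structure`. A standalone sketch of that one step, using a hypothetical `water.xyz` file:

    from pymatgen.core.structure import Molecule

    molecule = Molecule.from_file("water.xyz")  # hypothetical input file

    # Center the molecule in a 20 x 20 x 20 Angstrom periodic cell, matching
    # the default `lattice` used by `from_files_molecule`.
    structure = molecule.get_boxed_structure(20.0, 20.0, 20.0)
    print(structure.lattice.abc)  # (20.0, 20.0, 20.0)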

@classmethod
def from_files(cls, root, filename="CONTCAR", cleanup_paths=True):
def from_files(cls, root, filename="CONTCAR", pbar=True):
"""Searches for files matching the provided ``filename``, which can
include wildcards, and assumes those files are structural files in CIF
format. The names/ids of these files is given by the full directory
structure where that file was found. For example, if
``root == "my_dir"``, ``filename == "CONTCAR"`` and we have a single
structure file in ``my_dir/test/CONTCAR``, then the resulting
structures will be ``{"my_dir/test/CONTCAR": struct}``. Similarly, if
``filename == "CONTCAR*", then all files of the form ``CONTCAR*`` will
be found and used. The directory structure produced will be something
like
include wildcards, and assumes those files are structural files in a
format that can be processed by ``Structure.from_file``. Each structure
is given its own index, with the origin path stored in its metadata.

.. code-block:: python

{
"my_dir/test/CONTCAR1": struct1,
"my_dir/test/CONTCAR2": struct2,
"0": struct1,
"1": struct2,
...
}

@@ -91,43 +94,30 @@ def from_files(cls, root, filename="CONTCAR", cleanup_paths=True):
filename : str, optional
The files to search for. Uses ``rglob`` to recursively find any
files matching ``filename`` within the provided directory.
cleanup_paths : bool, optional
If True, runs :class:`cleanup_paths()` after initializing the
Database.
pbar : bool, optional
If True, will show a tqdm progress bar.

Returns
-------
Database
"""

structures = {
str(Path(path.parent) / path.stem): Structure.from_file(
path
).get_primitive_structure()
for path in Path(root).rglob(filename)
}

# Check for any duplicate names
names = list(structures.keys())
names = [Path(name).name for name in names]
if len(list(set(names))) < len(names):
new_structures = dict()
for key, value in structures.items():
new_structures[str(Path(key).parent)] = value
structures = new_structures

metadata = {key: dict() for key in structures.keys()}

kls = cls(structures=structures, metadata=metadata, supercells=dict())
if cleanup_paths:
kls.cleanup_paths()
return kls
structures = {}
metadata = {}
for key, path in enumerate(
tqdm(Path(root).rglob(filename), disable=not pbar)
):
key = f"{key:08}"
struct = Structure.from_file(path)
structures[key] = struct.get_primitive_structure()
metadata[key] = {"origin": str(path)}
return cls(structures=structures, metadata=metadata, supercells=dict())
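For comparison with the old path-based keys described in the removed docstring, here is a sketch of the updated `from_files` behavior, mirroring the test fixture earlier in this diff; `structure_files/` is a placeholder directory:

    from lightshow import Database

    # Every file named POSCAR under structure_files/ becomes one entry,
    # reduced to its primitive cell and keyed by a zero-padded index.
    db = Database.from_files("structure_files", filename="POSCAR", pbar=False)

    for key, meta in db.metadata.items():
        # e.g. "00000000 structure_files/TiO2/POSCAR"
        print(key, meta["origin"])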

@classmethod
def from_materials_project(cls, **kwargs):
"""Constructs the :class:`.Database` object by pulling structures and
metadata directly from the Materials Project. This is a simple
passthrough method which utilizes the MPRester.materials.summary.search
passthrough method which utilizes the MPRester.materials.search
API of the Materials Project v2 API.

Parameters
@@ -141,7 +131,7 @@ def from_materials_project(cls, **kwargs):
Deleted Parameters
------------------
mpr_query_kwargs : dict
Direct passthrough to MPRester.materials.summary.search. See
Direct passthrough to MPRester.materials.search. See
examples below.
api_key : None, optional
API key which can either be provided directly or is read from
@@ -160,7 +150,7 @@
pass

with MPRester(api_key) as mpr:
searched = mpr.materials.summary.search(**kwargs)
searched = mpr.materials.search(**kwargs)

structures = {s.material_id.string: s.structure for s in searched}
metadata = {s.material_id.string: s.dict() for s in searched}
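Because the keyword arguments are forwarded straight to `mpr.materials.search`, the updated endpoint can be exercised as sketched below. The material IDs are placeholders, and a valid Materials Project API key is assumed to be available, either passed directly or read from the environment:

    from lightshow import Database

    # kwargs are passed through verbatim to MPRester.materials.search (v2 API).
    db = Database.from_materials_project(material_ids=["mp-390", "mp-1143"])

    # Entries are keyed by their Materials Project IDs.
    print(sorted(db.structures.keys()))  # ['mp-1143', 'mp-390']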
@@ -345,6 +335,25 @@ def _write_unit_cells(self, root, pbar=False):
fname = Path(root) / key / "POSCAR"
structure.to(fmt="POSCAR", filename=str(fname))

def _write_origin_paths(self, root, pbar=False):
"""A helper method for writing important metadata for each of the
structures if the data was loaded from disk.

Parameters
----------
root : os.PathLike
pbar : bool, optional
"""

for key, metadata in tqdm(self._metadata.items(), disable=not pbar):
if "origin" not in metadata.keys():
continue
fname = Path(root) / key / "metadata.json"
origin = str(Path(metadata["origin"]).resolve())
new_metadata = {"origin": origin}
with open(fname, "w") as outfile:
json.dump(new_metadata, outfile, indent=4, sort_keys=True)

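The helper above only acts on entries whose metadata carries an `origin` key, writing one small JSON file per structure directory. A sketch of reading such a file back, with the output root and source path as illustrative assumptions:

    import json
    from pathlib import Path

    # After writing, each structure directory that originated on disk holds a
    # metadata.json recording the absolute path of its source file.
    with open(Path("out") / "00000000" / "metadata.json") as f:
        print(json.load(f))  # e.g. {'origin': '/data/TiO2/POSCAR'}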
def write(
self,
root,
@@ -495,6 +504,8 @@ def write(
if write_unit_cells:
self._write_unit_cells(root, pbar=pbar)

self._write_origin_paths(root, pbar=pbar)

# Save a metadata file (not a serialized version of this class) to
# disk along with the input files
with open(writer_metadata_path, "w") as outfile:
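Taken together with `_write_unit_cells`, the hook added above means a write call now leaves both a POSCAR and, for disk-loaded structures, a metadata.json in each per-key directory. A layout sketch; the call and output root are illustrative, since the full signature of `write` is not visible in this diff:

    # Illustrative call; any additional keyword arguments of Database.write
    # are elided here.
    db.write("out")

    # Expected layout per structure key:
    #   out/00000000/POSCAR          # unit cell (when unit-cell writing is on)
    #   out/00000000/metadata.json   # {"origin": "..."} for disk-loaded entries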
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -36,7 +36,7 @@ classifiers = [
dependencies = [
"numpy==1.26.1",
"pymatgen==2023.10.11",
"mp-api",
"mp-api==0.37.5",
"ase",
"tqdm",
"monty"