Skip to content

Commit

Permalink
Update POTCAR summary stats to include 6.4 POTCARs and add `dev_scrip…
Browse files Browse the repository at this point in the history
…t` utils for future updates (#3370)

* Added option for 64 POTCARs, hidden func to regenerate potcar_summary_stats file used in validation

* Updated potcar_summary_stats.json.gz to include 64 POTCARs

* Verify new POTCAR_64 summary_stats work; prep for PR

* pre-commit auto-fixes

* Added (1) support for LDA 64 POTCARs; (2) ability to generate fake POTCARs from existing POTCARs by randomizing the data contained in them, dev_scripts/potcar_scrambler.py; (3) unit test for pymatgen.io.vasp.inputs._gen_potcar_summary_stats by generating summary stats for a library of fake POTCARs and then checking that the fake set passes PotcarSingle.is_valid with overridden stats

* google-style doc str

* replace print with warnings.warn

* refactor test_gen_potcar_summary_stats using pytest fixtures

* rename function arg PMG_VASP_PSP_DIR to vasp_psp_dir

* cleanup fake potcar library to only include a few required examples

* replace os.system('rm -rf') with shutil.rmtree() and system(f"mkdir -p") with os.makedirs(exist_ok=True)

* git mv tests/files/fake_{POTCAR,potcar}_library

* generate_fake_potcar_libraries prefix src_dirs with SETTINGS["PMG_VASP_PSP_DIR"]

---------

Co-authored-by: Aaron Kaplan <[email protected]>
Co-authored-by: Janosh Riebesell <[email protected]>
  • Loading branch information
3 people authored Oct 3, 2023
1 parent 2a43d25 commit 8c594d7
Show file tree
Hide file tree
Showing 10 changed files with 258 additions and 13 deletions.
146 changes: 146 additions & 0 deletions dev_scripts/potcar_scrambler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
from __future__ import annotations

import os
import shutil
import warnings

import numpy as np
from monty.serialization import zopen

from pymatgen.core import SETTINGS
from pymatgen.io.vasp import Potcar, PotcarSingle
from pymatgen.io.vasp.sets import _load_yaml_config


class PotcarScrambler:
    """Replace the values of a POTCAR with random values.

    Does type matching and attempts precision matching on floats so that the
    scrambled text keeps the token layout of the input POTCAR.
    Used to generate copyright-compliant POTCARs for PMG tests.
    In case of questions, contact Aaron Kaplan <[email protected]>.

    Recommended use:
        PotcarScrambler.from_file(
            input_filename = <input POTCAR name as str>,
            output_filename = <name of desired randomized POTCAR as str>
        )
    to generate a POTCAR with name `output_filename` with completely random values
    from existing POTCAR `input_filename`.

    Attributes:
        PSP_list: The PotcarSingle objects that were scrambled (a one-element
            list when a single PotcarSingle was passed in).
        scrambled_potcars_str: Concatenated scrambled text of every entry in PSP_list.
    """

    def __init__(self, potcars: Potcar | PotcarSingle):
        """Scramble every POTCAR in `potcars` and store the resulting text.

        Args:
            potcars (Potcar | PotcarSingle): One POTCAR or an iterable of them.
        """
        self.PSP_list = [potcars] if isinstance(potcars, PotcarSingle) else potcars
        self.scrambled_potcars_str = "".join(self.scramble_single_potcar(psp) for psp in self.PSP_list)

    def _rand_float_from_str_with_prec(self, input_str: str, bloat: float = 1.5):
        """Return a random non-negative float rounded like `input_str`.

        Args:
            input_str (str): Text that `float()` can parse.
            bloat (float): Factor applied to |float(input_str)| to get the upper
                bound of the random value; the bound is never below 1.

        Returns:
            A random float in [0, bound), rounded to the number of characters
            that follow the decimal point in `input_str` (0 when there is none).
        """
        # partition (instead of split(".")[1]) so that tokens without a decimal
        # point, e.g. "1E5", are rounded to 0 digits rather than raising IndexError
        n_prec = len(input_str.partition(".")[2])
        bd = max(1, bloat * abs(float(input_str)))
        return round(bd * np.random.rand(1)[0], n_prec)

    def _read_fortran_str_and_scramble(self, input_str: str, bloat: float = 1.5):
        """Interpret a FORTRAN-style token and return a random value of the same type.

        Args:
            input_str (str): A single whitespace-free token.
            bloat (float): Factor controlling the range of the random numbers.

        Returns:
            A random bool for t/f/true/false (case-insensitive), a random int of
            the same sign for integers, a random float for finite floats, and the
            stripped token itself for anything else (including empty strings).
        """
        input_str = input_str.strip()

        if input_str.lower() in {"t", "f", "true", "false"}:
            return bool(np.random.randint(2))

        try:
            integer = int(input_str)
        except ValueError:
            integer = None

        if integer is not None:
            fac = int(np.sign(integer))  # return int of same sign
            # take abs() before bloating so negative inputs keep a proper range
            upper = max(1, int(np.ceil(bloat * abs(integer))))
            return fac * np.random.randint(upper)

        try:
            value = float(input_str)
        except ValueError:
            return input_str

        # words such as "inf" / "nan" parse as floats but are not numeric data
        if not np.isfinite(value):
            return input_str
        return self._rand_float_from_str_with_prec(input_str, bloat=bloat)

    def scramble_single_potcar(self, potcar: PotcarSingle):
        """Return the text of `potcar` with every numeric/boolean token randomized.

        Lines containing "SHA256" or "COPYR" are dropped (the scrambled file is
        not copyrighted and the hash no longer applies). " FAKE" is appended to
        lines containing "TITEL". Whatever follows the last newline of
        `potcar.data` is ignored.

        Args:
            potcar (PotcarSingle): Object whose `data` attribute holds the POTCAR text.

        Returns:
            str: The scrambled text, one newline-terminated line per kept input line.
        """
        scrambled_lines = []
        for line in potcar.data.split("\n")[:-1]:
            if "SHA256" in line or "COPYR" in line:
                # files not copyrighted, remove copyright statement
                # sha256 no longer applicable
                continue

            # a line may hold several ";"-separated statements; scramble each one
            # and restore the separator between *all* of them
            segments = [
                " ".join(f"{self._read_fortran_str_and_scramble(token)}" for token in segment.split())
                for segment in line.split(";")
            ]
            cline = "; ".join(segments)

            aux_str = " FAKE" if "TITEL" in line else ""
            scrambled_lines.append(f"{cline}{aux_str}\n")
        return "".join(scrambled_lines)

    def to_file(self, filename: str):
        """Write the scrambled POTCAR text to `filename` (opened with zopen in text mode)."""
        with zopen(filename, "wt") as f:
            f.write(self.scrambled_potcars_str)

    @staticmethod
    def from_file(input_filename: str, output_filename: str | None = None):
        """Scramble the POTCAR stored in `input_filename`.

        Args:
            input_filename (str): Path of the POTCAR to read.
            output_filename (str | None): If given, the scrambled POTCAR is
                written to this path.

        Returns:
            PotcarScrambler: The scrambler holding the randomized text.
        """
        psp = Potcar.from_file(input_filename)
        psp_scrambled = PotcarScrambler(psp)
        if output_filename:
            psp_scrambled.to_file(output_filename)
        return psp_scrambled


def generate_fake_potcar_libraries():
    """Build a library of scrambled (fake) POTCARs in ./fake_potcar_library/.

    To test the `_gen_potcar_summary_stats` function in `pymatgen.io.vasp.inputs`,
    need a library of fake POTCARs which do not violate copyright.

    For every functional directory listed in `PotcarSingle.functional_dir` that
    exists under SETTINGS["PMG_VASP_PSP_DIR"], each POTCAR named in the
    MPRelaxSet config is scrambled and written to
    ./fake_potcar_library/<functional dir>/<psp name>/POTCAR.gz.
    Any existing ./fake_potcar_library/ is removed first.

    Raises:
        RuntimeError: If none of the functional directories exists.

    Warns:
        UserWarning: For each POTCAR that cannot be found in an existing
            functional directory.
    """
    mp_relax_set = _load_yaml_config("MPRelaxSet")
    psp_variants = list(mp_relax_set["POTCAR"].values())

    output_dir = "./fake_potcar_library/"

    vasp_psp_dir = SETTINGS.get("PMG_VASP_PSP_DIR")
    func_dir_names = list(PotcarSingle.functional_dir.values())
    src_dirs = [f"{vasp_psp_dir}/{name}" for name in func_dir_names]

    # verify there is something to read *before* wiping a previously generated library
    if not any(map(os.path.isdir, src_dirs)):
        raise RuntimeError(f"No input POTCAR library found, tried {src_dirs}")

    shutil.rmtree(output_dir, ignore_errors=True)

    for func_dir_name, func_dir in zip(func_dir_names, src_dirs):
        if not os.path.isdir(func_dir):
            continue

        for psp_name in psp_variants:
            # use only the functional directory name here: func_dir already carries
            # the PMG_VASP_PSP_DIR prefix and must not be replicated in the output tree
            rebase_dir = f"{output_dir}/{func_dir_name}/{psp_name}/"
            paths_to_try = [
                f"{func_dir}/POTCAR.{psp_name}",
                f"{func_dir}/POTCAR.{psp_name}.gz",
                f"{func_dir}/{psp_name}/POTCAR",
                f"{func_dir}/{psp_name}/POTCAR.gz",
            ]
            potcar_path = next(filter(os.path.isfile, paths_to_try), None)
            if potcar_path is None:
                warnings.warn(f"Could not find {psp_name} in {paths_to_try}")
                continue

            os.makedirs(rebase_dir, exist_ok=True)
            PotcarScrambler.from_file(input_filename=potcar_path, output_filename=f"{rebase_dir}/POTCAR.gz")


# Running this file as a script regenerates the fake POTCAR library.
if __name__ == "__main__":
    generate_fake_potcar_libraries()
91 changes: 79 additions & 12 deletions pymatgen/io/vasp/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from monty.json import MontyDecoder, MSONable
from monty.os import cd
from monty.os.path import zpath
from monty.serialization import loadfn
from monty.serialization import dumpfn, loadfn
from tabulate import tabulate

from pymatgen.core import SETTINGS
Expand Down Expand Up @@ -1597,13 +1597,26 @@ class PotcarSingle:
are raised if a POTCAR hash fails validation.
"""

"""
NB: there are multiple releases of the {LDA,PBE} {52,54} POTCARs
the original (univie) releases include no SHA256 hashes nor COPYR fields
in the PSCTR/header field.
We indicate the older release in `functional_dir` as PBE_52, PBE_54, LDA_52, LDA_54.
The newer release is indicated as PBE_52_W_HASH, etc.
"""
functional_dir = dict(
PBE="POT_GGA_PAW_PBE",
PBE_52="POT_GGA_PAW_PBE_52",
PBE_52_W_HASH="POTPAW_PBE_52",
PBE_54="POT_GGA_PAW_PBE_54",
PBE_54_W_HASH="POTPAW_PBE_54",
PBE_64="POT_PAW_PBE_64",
LDA="POT_LDA_PAW",
LDA_52="POT_LDA_PAW_52",
LDA_52_W_HASH="POTPAW_LDA_52",
LDA_54="POT_LDA_PAW_54",
LDA_54_W_HASH="POTPAW_LDA_54",
LDA_64="POT_LDA_PAW_64",
PW91="POT_GGA_PAW_PW91",
LDA_US="POT_LDA_US",
PW91_US="POT_GGA_US_PW91",
Expand Down Expand Up @@ -2106,8 +2119,8 @@ def md5_header_hash(self) -> str:
def is_valid(self) -> bool:
"""
Check that POTCAR matches reference metadata.
Parsed metadata is stored in self._meta as a human-readable dict,
self._meta = {
Parsed metadata is stored in self._summary_stats as a human-readable dict,
self._summary_stats = {
"keywords": {
"header": list[str],
"data": list[str],
Expand Down Expand Up @@ -2135,17 +2148,17 @@ def is_valid(self) -> bool:
Note also that POTCARs can contain **different** data keywords
All keywords found in the header, essentially self.keywords, and the data block
(<Data Keyword> above) are stored in self._meta["keywords"]
(<Data Keyword> above) are stored in self._summary_stats["keywords"]
To avoid issues of copyright, statistics (mean, mean of abs vals, variance, max, min)
for the numeric values in the header and data sections of POTCAR are stored
in self._meta["stats"]
in self._summary_stats["stats"]
tol is then used to match statistical values within a tolerance
"""
functional_lexch = {
"PE": ["PBE", "PBE_52", "PBE_54"],
"CA": ["LDA", "LDA_52", "LDA_54", "LDA_US", "Perdew_Zunger81"],
"PE": ["PBE", "PBE_52", "PBE_52_W_HASH", "PBE_54", "PBE_54_W_HASH", "PBE_64"],
"CA": ["LDA", "LDA_52", "LDA_52_W_HASH", "LDA_54", "LDA_54_W_HASH", "LDA_64", "LDA_US", "Perdew_Zunger81"],
"91": ["PW91", "PW91_US"],
}

Expand All @@ -2164,8 +2177,9 @@ def is_valid(self) -> bool:
)

def parse_fortran_style_str(input_str: str) -> Any:
"""Parse any input string as bool, int, float, or failing that, str. Used to parse FORTRAN-generated
POTCAR files where it's unknown a priori what type of data will be encountered.
"""Parse any input string as bool, int, float, or failing that, str.
Used to parse FORTRAN-generated POTCAR files where it's unknown
a priori what type of data will be encountered.
"""
input_str = input_str.strip()

Expand Down Expand Up @@ -2225,7 +2239,9 @@ def data_stats(data_list: Sequence) -> dict:
"MAX": arr.max(),
}

summary_stats = { # for this PotcarSingle instance
# NB: to add future summary stats in a way that's consistent with PMG,
# it's easiest to save the summary stats as an attr of PotcarSingle
self._summary_stats = { # for this PotcarSingle instance
"keywords": {
"header": [kwd.lower() for kwd in self.keywords],
"data": psp_keys,
Expand All @@ -2239,12 +2255,12 @@ def data_stats(data_list: Sequence) -> dict:
data_match_tol = 1e-6
for ref_psp in possible_potcar_matches:
key_match = all(
set(ref_psp["keywords"][key]) == set(summary_stats["keywords"][key]) # type: ignore
set(ref_psp["keywords"][key]) == set(self._summary_stats["keywords"][key]) # type: ignore
for key in ["header", "data"]
)

data_diff = [
abs(ref_psp["stats"][key][stat] - summary_stats["stats"][key][stat]) # type: ignore
abs(ref_psp["stats"][key][stat] - self._summary_stats["stats"][key][stat]) # type: ignore
for stat in ["MEAN", "ABSMEAN", "VAR", "MIN", "MAX"]
for key in ["header", "data"]
]
Expand Down Expand Up @@ -2274,6 +2290,57 @@ def __repr__(self) -> str:
return f"{cls_name}({symbol=}, {functional=}, {TITEL=}, {VRHFIN=}, {n_valence_elec=:.0f})"


def _gen_potcar_summary_stats(
    append: bool = False,
    vasp_psp_dir: str | None = None,
    summary_stats_filename: str = f"{module_dir}/potcar_summary_stats.json.gz",
):
    """Regenerate the potcar_summary_stats.json.gz file used to validate POTCARs.

    This function is solely intended to be used for PMG development.

    THIS FUNCTION IS DESTRUCTIVE. It will completely overwrite your potcar_summary_stats.json.gz.

    Args:
        append (bool): Change whether data is appended to the existing potcar_summary_stats.json.gz,
            or if a completely new file is generated. Defaults to False.
        vasp_psp_dir (str): Change where this function searches for POTCARs.
            Defaults to the PMG_VASP_PSP_DIR setting if not set. Defaults to None.
        summary_stats_filename (str): Name of the output summary stats file. Defaults to
            '<pymatgen_install_dir>/io/vasp/potcar_summary_stats.json.gz'.

    Raises:
        ValueError: If no POTCAR directory is passed and PMG_VASP_PSP_DIR is not configured.

    Warns:
        UserWarning: For every directory in PotcarSingle.functional_dir that is
            missing from the POTCAR directory.
    """
    vasp_psp_dir = vasp_psp_dir or SETTINGS.get("PMG_VASP_PSP_DIR")
    if not vasp_psp_dir:
        # without this guard every lookup would target "None/..." and the
        # summary stats file would be overwritten with an empty dict
        raise ValueError("No POTCAR directory given: pass vasp_psp_dir or set PMG_VASP_PSP_DIR")

    func_dir_exist: dict[str, str] = {}
    for func, func_dir in PotcarSingle.functional_dir.items():
        if os.path.isdir(f"{vasp_psp_dir}/{func_dir}"):
            func_dir_exist[func] = func_dir
        else:
            warnings.warn(f"missing {func_dir} POTCAR directory")

    # use append = True if a new POTCAR library is released to add new summary stats
    # without completely regenerating the dict of summary stats
    # use append = False to completely regenerate the summary stats dict
    new_summary_stats = loadfn(summary_stats_filename) if append else {}

    for func, func_dir in func_dir_exist.items():
        new_summary_stats.setdefault(func, {})  # initialize dict if key missing

        # POTCARs may sit directly in the functional dir or one directory below it
        potcar_list = [
            *glob(f"{vasp_psp_dir}/{func_dir}/POTCAR*"),
            *glob(f"{vasp_psp_dir}/{func_dir}/*/POTCAR*"),
        ]
        for potcar in potcar_list:
            psp = PotcarSingle.from_file(potcar)
            # NOTE(review): relies on psp._summary_stats already being populated
            # after from_file (it is assigned while evaluating is_valid) - confirm
            new_summary_stats[func][psp.TITEL.replace(" ", "")] = {
                "LEXCH": psp.LEXCH,
                "VRHFIN": psp.VRHFIN.replace(" ", ""),
                **psp._summary_stats,
            }

    dumpfn(new_summary_stats, summary_stats_filename)


class Potcar(list, MSONable):
"""
Object for reading and writing POTCAR files for calculations. Consists of a
Expand Down
Binary file modified pymatgen/io/vasp/potcar_summary_stats.json.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
34 changes: 33 additions & 1 deletion tests/io/vasp/test_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
import pytest
import scipy.constants as const
from monty.io import zopen
from monty.serialization import loadfn
from numpy.testing import assert_allclose
from pytest import approx
from pytest import MonkeyPatch, approx

from pymatgen.core import SETTINGS
from pymatgen.core.composition import Composition
Expand All @@ -26,6 +27,7 @@
PotcarSingle,
UnknownPotcarWarning,
VaspInput,
_gen_potcar_summary_stats,
)
from pymatgen.util.testing import TEST_FILES_DIR, PymatgenTest

Expand Down Expand Up @@ -1210,3 +1212,33 @@ def test_from_directory(self):
dct = vi.as_dict()
vasp_input = VaspInput.from_dict(dct)
assert "CONTCAR.Li2O" in vasp_input


def test_gen_potcar_summary_stats(tmp_path: Path, monkeypatch: MonkeyPatch):
    """Build summary stats from the scrambled POTCAR library and validate the library against them."""
    expected_funcs = {"LDA_64", "PBE_54_W_HASH"}
    psp_path = f"{TEST_FILES_DIR}/fake_potcar_library/"
    summ_stats_file = f"{tmp_path}/fake_potcar_summary_stats.json.gz"

    _gen_potcar_summary_stats(append=False, vasp_psp_dir=psp_path, summary_stats_filename=summ_stats_file)

    # the fake POTCAR library only ships two functional directories to save space
    summ_stats = loadfn(summ_stats_file)
    assert set(summ_stats) == expected_funcs

    # PotcarSingle.is_valid needs entries for the other functionals too,
    # so fill them in with copies of the two that were generated
    for func in PotcarSingle.functional_dir:
        if func in expected_funcs:
            continue
        func_lower = func.lower()
        if "pbe" in func_lower or "pw91" in func_lower:
            summ_stats[func] = summ_stats["PBE_54_W_HASH"].copy()
        elif "lda" in func_lower or "perdew_zunger81" in func_lower:
            summ_stats[func] = summ_stats["LDA_64"].copy()

    # swap the reference potcar_summary_stats for the fake data
    monkeypatch.setattr(PotcarSingle, "potcar_summary_stats", summ_stats)

    for func in expected_funcs:
        bdir = f"{psp_path}/{PotcarSingle.functional_dir[func]}"
        valid_elements = []
        for entry in os.listdir(bdir):
            # keep non-hidden sub-directories only
            if entry[0] != "." and os.path.isdir(f"{bdir}/{entry}"):
                valid_elements.append(entry)

        for element in valid_elements:
            potcar_single = PotcarSingle.from_file(f"{bdir}/POTCAR.{element}.gz")
            assert potcar_single.is_valid

0 comments on commit 8c594d7

Please sign in to comment.