diff --git a/emmet-builders/emmet/builders/utils.py b/emmet-builders/emmet/builders/utils.py index 272f55d2a8..7a417fe983 100644 --- a/emmet-builders/emmet/builders/utils.py +++ b/emmet-builders/emmet/builders/utils.py @@ -10,6 +10,9 @@ from itertools import chain, combinations from pymatgen.core import Structure from pymatgen.analysis.diffusion.neb.full_path_mapper import MigrationGraph +from pymatgen.io.vasp.inputs import PotcarSingle + +from emmet.builders.settings import EmmetBuildSettings def maximal_spanning_non_intersecting_subsets(sets) -> Set[Set]: @@ -211,3 +214,31 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): sys.stdout.close() sys.stdout = self._original_stdout + + +def get_potcar_stats(): + default_settings = EmmetBuildSettings() + + stats: dict[str, dict] = {} # type: ignore + + for ( + calc_type, + input_set, + ) in default_settings.VASP_DEFAULT_INPUT_SETS.items(): + _input = input_set() + + stats[calc_type] = {} + functional = _input._config_dict["POTCAR_FUNCTIONAL"] + + for potcar_symbol in _input.CONFIG["POTCAR"].values(): + potcar = PotcarSingle.from_symbol_and_functional( + symbol=potcar_symbol, functional=functional + ) + summary_stats = potcar._summary_stats.copy() + # fallback method for validation - use header hash and symbol + # note that the potcar_spec assigns PotcarSingle.symbol to "titel" + summary_stats["titel"] = potcar.TITEL + summary_stats["hash"] = potcar.md5_header_hash + stats[calc_type].update({potcar_symbol: summary_stats}) + + return stats diff --git a/emmet-builders/emmet/builders/vasp/task_validator.py b/emmet-builders/emmet/builders/vasp/task_validator.py index a2dc6bc5e0..3f7637686d 100644 --- a/emmet-builders/emmet/builders/vasp/task_validator.py +++ b/emmet-builders/emmet/builders/vasp/task_validator.py @@ -1,12 +1,12 @@ from typing import Dict, Optional -from collections import defaultdict from maggma.builders import MapBuilder from maggma.core import Store from emmet.builders.settings import EmmetBuildSettings -from emmet.core.vasp.task_valid import TaskDocument +from emmet.builders.utils import get_potcar_stats from emmet.core.vasp.calc_types.enums import CalcType +from emmet.core.vasp.task_valid import TaskDocument from emmet.core.vasp.validation import DeprecationMessage, ValidationDoc @@ -15,7 +15,7 @@ def __init__( self, tasks: Store, task_validation: Store, - potcar_hashes: Optional[Dict[CalcType, Dict[str, str]]] = None, + potcar_stats: Optional[Dict[CalcType, Dict[str, str]]] = None, settings: Optional[EmmetBuildSettings] = None, query: Optional[Dict] = None, **kwargs, @@ -26,7 +26,7 @@ def __init__( Args: tasks: Store of task documents task_validation: Store of task_types for tasks - potcar_hashes: Optional dictionary of potcar hash data. + potcar_stats: Optional dictionary of potcar hash data. Mapping is calculation type -> potcar symbol -> hash value. """ self.tasks = tasks @@ -34,29 +34,14 @@ def __init__( self.settings = EmmetBuildSettings.autoload(settings) self.query = query self.kwargs = kwargs - self.potcar_hashes = potcar_hashes + self.potcar_stats = potcar_stats # Set up potcar cache if appropriate - if self.settings.VASP_VALIDATE_POTCAR_HASHES: - if not self.potcar_hashes: - from pymatgen.io.vasp.inputs import PotcarSingle - - hashes = defaultdict(dict) # type: dict - - for ( - calc_type, - input_set, - ) in self.settings.VASP_DEFAULT_INPUT_SETS.items(): - functional = input_set.CONFIG["POTCAR_FUNCTIONAL"] - for potcar_symbol in input_set.CONFIG["POTCAR"].values(): - potcar = PotcarSingle.from_symbol_and_functional( - symbol=potcar_symbol, functional=functional - ) - hashes[calc_type][potcar_symbol] = potcar._summary_stats - - self.potcar_hashes = potcar_hashes + if self.settings.VASP_VALIDATE_POTCAR_STATS: + if not self.potcar_stats: + self.potcar_stats = get_potcar_stats() else: - self.potcar_hashes = None + self.potcar_stats = None super().__init__( source=tasks, @@ -88,7 +73,7 @@ def unary_function(self, item): input_sets=self.settings.VASP_DEFAULT_INPUT_SETS, LDAU_fields=self.settings.VASP_CHECKED_LDAU_FIELDS, max_allowed_scf_gradient=self.settings.VASP_MAX_SCF_GRADIENT, - potcar_hashes=self.potcar_hashes, + potcar_stats=self.potcar_stats, ) bad_tags = list(set(task_doc.tags).intersection(self.settings.DEPRECATED_TAGS)) diff --git a/emmet-builders/tests/test_materials.py b/emmet-builders/tests/test_materials.py index 401262947e..4e63d73cf5 100644 --- a/emmet-builders/tests/test_materials.py +++ b/emmet-builders/tests/test_materials.py @@ -16,7 +16,7 @@ def tasks_store(test_dir): @pytest.fixture(scope="session") def validation_store(tasks_store): - settings = EmmetBuildSettings(VASP_VALIDATE_POTCAR_HASHES=False) + settings = EmmetBuildSettings(VASP_VALIDATE_POTCAR_STATS=False) validation_store = MemoryStore() builder = TaskValidator( tasks=tasks_store, task_validation=validation_store, settings=settings diff --git a/emmet-builders/tests/test_utils.py b/emmet-builders/tests/test_utils.py index b0f4931e6e..6fd1e12a19 100644 --- a/emmet-builders/tests/test_utils.py +++ b/emmet-builders/tests/test_utils.py @@ -2,10 +2,12 @@ chemsys_permutations, maximal_spanning_non_intersecting_subsets, get_hop_cutoff, + get_potcar_stats, ) from pymatgen.analysis.diffusion.neb.full_path_mapper import MigrationGraph from numpy.testing import assert_almost_equal from monty.serialization import loadfn +from emmet.core.settings import EmmetSettings def test_maximal_spanning_non_intersecting_subsets(): @@ -55,3 +57,28 @@ def test_get_hop_cutoff(test_dir): check_mg = MigrationGraph.with_distance(nasicon_mg.structure, "Mg", d) assert_almost_equal(d, 4.59, decimal=2) assert len(check_mg.unique_hops) == 6 + + +def test_get_potcar_stats(): + calc_type = EmmetSettings().VASP_DEFAULT_INPUT_SETS + + try: + potcar_stats = get_potcar_stats() + except Exception as exc: + if "No POTCAR for" in str(exc): + # No Potcar library available, skip test + return + else: + raise exc + + # ensure that all calc types are included in potcar_stats + assert potcar_stats.keys() == calc_type.keys() + + for calc_type in potcar_stats: + # ensure that each entry has needed fields for both + # legacy and modern potcar validation + assert all( + set(potcar_stats[calc_type][symb]) + == set(["hash", "keywords", "titel", "stats"]) + for symb in potcar_stats[calc_type] + ) diff --git a/emmet-builders/tests/test_vasp.py b/emmet-builders/tests/test_vasp.py index ad6dec2dc9..52887739d4 100644 --- a/emmet-builders/tests/test_vasp.py +++ b/emmet-builders/tests/test_vasp.py @@ -18,7 +18,7 @@ def validation_store(): def test_validator(tasks_store, validation_store): - settings = EmmetBuildSettings(VASP_VALIDATE_POTCAR_HASHES=False) + settings = EmmetBuildSettings(VASP_VALIDATE_POTCAR_STATS=False) builder = TaskValidator( tasks=tasks_store, task_validation=validation_store, settings=settings ) diff --git a/emmet-core/emmet/core/settings.py b/emmet-core/emmet/core/settings.py index ed509d288f..27070397b9 100644 --- a/emmet-core/emmet/core/settings.py +++ b/emmet-core/emmet/core/settings.py @@ -149,8 +149,8 @@ class EmmetSettings(BaseSettings): description="Default input sets for task validation", ) - VASP_VALIDATE_POTCAR_HASHES: bool = Field( - True, description="Whether to validate POTCAR hash values." + VASP_VALIDATE_POTCAR_STATS: bool = Field( + True, description="Whether to validate POTCAR stat values." ) VASP_CHECKED_LDAU_FIELDS: List[str] = Field( diff --git a/emmet-core/emmet/core/vasp/validation.py b/emmet-core/emmet/core/vasp/validation.py index 28c8648637..b4dde876e9 100644 --- a/emmet-core/emmet/core/vasp/validation.py +++ b/emmet-core/emmet/core/vasp/validation.py @@ -18,6 +18,10 @@ class DeprecationMessage(DocEnum): MANUAL = "M", "Manual deprecation" + SYMMETRY = ( + "S001", + "Could not determine crystalline space group, needed for input set check.", + ) KPTS = "C001", "Too few KPoints" KSPACING = "C002", "KSpacing not high enough" ENCUT = "C002", "ENCUT too low" @@ -66,7 +70,7 @@ def from_task_doc( input_sets: Dict[str, ImportString] = SETTINGS.VASP_DEFAULT_INPUT_SETS, LDAU_fields: List[str] = SETTINGS.VASP_CHECKED_LDAU_FIELDS, max_allowed_scf_gradient: float = SETTINGS.VASP_MAX_SCF_GRADIENT, - potcar_hashes: Optional[Dict[CalcType, Dict[str, str]]] = None, + potcar_stats: Optional[Dict[CalcType, Dict[str, str]]] = None, ) -> "ValidationDoc": """ Determines if a calculation is valid based on expected input parameters from a pymatgen inputset @@ -80,7 +84,7 @@ def from_task_doc( LDAU_fields: LDAU fields to check for consistency max_allowed_scf_gradient: maximum uphill gradient allowed for SCF steps after the initial equillibriation period - potcar_hashes: Dictionary of potcar hash data. Mapping is calculation type -> potcar symbol -> hash value. + potcar_stats: Dictionary of potcar stat data. Mapping is calculation type -> potcar symbol -> hash value. """ bandgap = task_doc.output.bandgap @@ -110,10 +114,19 @@ def from_task_doc( reasons.append(DeprecationMessage.SET) valid_input_set = None + try: + # Sometimes spglib can't determine space group with the default + # `symprec` and `angle_tolerance`. In these cases, + # `Structure.get_space_group_info()` fails + valid_input_set.structure.get_space_group_info() + except Exception: + reasons.append(DeprecationMessage.SYMMETRY) + valid_input_set = None + if valid_input_set: # Checking POTCAR summary_stats if a directory is supplied - if potcar_hashes: - if _potcar_hash_check(task_doc, potcar_hashes): + if potcar_stats: + if _potcar_stats_check(task_doc, potcar_stats): if task_type in [ TaskType.NSCF_Line, TaskType.NSCF_Uniform, @@ -130,6 +143,7 @@ def from_task_doc( if task_type != task_type.NSCF_Line: # Not validating k-point data for line-mode calculations as constructing # the k-path is too costly for the builder and the uniform input set is used. + if valid_input_set.kpoints is not None: if _kpoint_check( valid_input_set, @@ -311,7 +325,7 @@ def _kspacing_warnings(input_set, inputs, data, warnings, kspacing_tolerance): ) -def _potcar_hash_check(task_doc, potcar_hashes): +def _potcar_stats_check(task_doc, potcar_stats: dict): """ Checks to make sure the POTCAR summary stats is equal to the correct value from the pymatgen input set. @@ -325,32 +339,47 @@ def _potcar_hash_check(task_doc, potcar_hashes): # Assume it is an old calculation without potcar_spec data and treat it as passing POTCAR hash check return False + use_legacy_hash_check = False + if any(len(entry.get("summary_stats", {})) == 0 for entry in potcar_details): + # potcar_spec doesn't include summary_stats kwarg needed to check potcars + # fall back to header hash checking + use_legacy_hash_check = True + all_match = True for entry in potcar_details: symbol = entry["titel"].split(" ")[1] - ref_summ_stats = potcar_hashes[str(task_doc.calc_type)].get(symbol, None) + ref_summ_stats = potcar_stats[str(task_doc.calc_type)].get(symbol, None) + if not ref_summ_stats: + # Symbol differs from reference set - deprecate all_match = False break - key_match = all( - set(ref_summ_stats["keywords"][key]) - == set(entry["summary_stats"]["keywords"][key]) - for key in ["header", "data"] - ) + if use_legacy_hash_check: + all_match = all( + entry[key] == ref_summ_stats[key] + for key in ( + "hash", + "titel", + ) + ) - data_match = all( - abs( - ref_summ_stats["stats"][key][stat] - - entry["summary_stats"]["stats"][key][stat] + else: + all_match = all( + set(ref_summ_stats["keywords"][key]) + == set(entry["summary_stats"]["keywords"][key]) + for key in ["header", "data"] + ) and all( + abs( + ref_summ_stats["stats"][key][stat] + - entry["summary_stats"]["stats"][key][stat] + ) + < data_tol + for stat in ["MEAN", "ABSMEAN", "VAR", "MIN", "MAX"] + for key in ["header", "data"] ) - < data_tol - for stat in ["MEAN", "ABSMEAN", "VAR", "MIN", "MAX"] - for key in ["header", "data"] - ) - if (not key_match) or (not data_match): - all_match = False + if not all_match: break return not all_match diff --git a/emmet-core/tests/vasp/test_vasp.py b/emmet-core/tests/vasp/test_vasp.py index 60527745c2..0d24ea6f9f 100644 --- a/emmet-core/tests/vasp/test_vasp.py +++ b/emmet-core/tests/vasp/test_vasp.py @@ -5,7 +5,7 @@ from emmet.core.vasp.calc_types import RunType, TaskType, run_type, task_type from emmet.core.vasp.task_valid import TaskDocument -from emmet.core.vasp.validation import ValidationDoc, _potcar_hash_check +from emmet.core.vasp.validation import ValidationDoc, _potcar_stats_check def test_task_type(): @@ -55,6 +55,14 @@ def test_validator(tasks): assert all([doc.valid for doc in validation_docs]) +def test_validator_failed_symmetry(test_dir): + with zopen(test_dir / "failed_elastic_task.json.gz", "r") as f: + failed_task = json.load(f) + taskdoc = TaskDocument(**failed_task) + validation = ValidationDoc.from_task_doc(taskdoc) + assert any("SYMMETRY" in repr(reason) for reason in validation.reasons) + + def test_computed_entry(tasks): entries = [task.entry for task in tasks] ids = {e.entry_id for e in entries} @@ -87,7 +95,7 @@ def test_ldau_validation(test_dir): assert valid.valid -def test_potcar_hash_check(test_dir): +def test_potcar_stats_check(test_dir): from pymatgen.io.vasp import PotcarSingle with zopen(test_dir / "CoF_TaskDoc.json") as f: @@ -106,27 +114,29 @@ def test_potcar_hash_check(test_dir): I cannot rebuild the TaskDoc without excluding the `orig_inputs` key. """ task_doc = TaskDocument(**{key: data[key] for key in data if key != "last_updated"}) - - # First check: generate hashes from POTCARs in TaskDoc, check should pass - calc_type = str(task_doc.calc_type) - expected_hashes = {calc_type: {}} try: + # First check: generate hashes from POTCARs in TaskDoc, check should pass + calc_type = str(task_doc.calc_type) + expected_hashes = {calc_type: {}} for spec in task_doc.calcs_reversed[0]["input"]["potcar_spec"]: symbol = spec["titel"].split(" ")[1] - expected_hashes[calc_type][ - symbol - ] = PotcarSingle.from_symbol_and_functional( + potcar = PotcarSingle.from_symbol_and_functional( symbol=symbol, functional="PBE" - )._summary_stats + ) + expected_hashes[calc_type][symbol] = { + **potcar._summary_stats, + "hash": potcar.md5_header_hash, + "titel": potcar.TITEL, + } - assert not _potcar_hash_check(task_doc, expected_hashes) + assert not _potcar_stats_check(task_doc, expected_hashes) # Second check: remove POTCAR from expected_hashes, check should fail - missing_hashes = {calc_type: {**expected_hashes[calc_type]}} + missing_hashes = {calc_type: expected_hashes[calc_type].copy()} first_element = list(missing_hashes[calc_type])[0] missing_hashes[calc_type].pop(first_element) - assert _potcar_hash_check(task_doc, missing_hashes) + assert _potcar_stats_check(task_doc, missing_hashes) # Third check: change data in expected hashes, check should fail @@ -134,7 +144,46 @@ def test_potcar_hash_check(test_dir): for key in wrong_hashes[calc_type][first_element]["stats"]["data"]: wrong_hashes[calc_type][first_element]["stats"]["data"][key] *= 1.1 - assert _potcar_hash_check(task_doc, wrong_hashes) + assert _potcar_stats_check(task_doc, wrong_hashes) + + # Fourth check: use legacy hash check if `summary_stats` + # field not populated. This should pass + legacy_data = data.copy() + legacy_data["calcs_reversed"][0]["input"]["potcar_spec"] = [ + { + key: potcar[key] + for key in ( + "titel", + "hash", + ) + } + for potcar in legacy_data["calcs_reversed"][0]["input"]["potcar_spec"] + ] + legacy_task_doc = TaskDocument( + **{key: legacy_data[key] for key in legacy_data if key != "last_updated"} + ) + assert not _potcar_stats_check(legacy_task_doc, expected_hashes) + + # Fifth check: use legacy hash check if `summary_stats` + # field not populated, but one hash is wrong. This should fail + legacy_data = data.copy() + legacy_data["calcs_reversed"][0]["input"]["potcar_spec"] = [ + { + key: potcar[key] + for key in ( + "titel", + "hash", + ) + } + for potcar in legacy_data["calcs_reversed"][0]["input"]["potcar_spec"] + ] + legacy_data["calcs_reversed"][0]["input"]["potcar_spec"][0][ + "hash" + ] = legacy_data["calcs_reversed"][0]["input"]["potcar_spec"][0]["hash"][:-1] + legacy_task_doc = TaskDocument( + **{key: legacy_data[key] for key in legacy_data if key != "last_updated"} + ) + assert _potcar_stats_check(legacy_task_doc, expected_hashes) except (OSError, ValueError): # missing Pymatgen POTCARs, cannot perform test diff --git a/test_files/failed_elastic_task.json.gz b/test_files/failed_elastic_task.json.gz new file mode 100644 index 0000000000..a2437623dd Binary files /dev/null and b/test_files/failed_elastic_task.json.gz differ