(OTF) Normalization and element references #715

Merged
merged 94 commits on Aug 5, 2024

Changes from 92 commits
94 commits
4f0d91a
denorm targets in _forward only
lbluque May 20, 2024
c5e997b
linear reference class
lbluque May 20, 2024
03a3f66
atomref in normalizer
lbluque May 21, 2024
57174cc
raise input error
lbluque May 21, 2024
80d71c4
clean up normalizer interface
lbluque May 21, 2024
2219d2c
add element refs
lbluque May 22, 2024
390a19e
add element refs correctly
lbluque May 22, 2024
fb99a52
ruff
lbluque May 22, 2024
bc6b864
fix save_checkpoint
lbluque May 22, 2024
2a7804f
reference and dereference
lbluque May 23, 2024
c2914f4
2xnorm linref trainer add
lbluque May 23, 2024
8e4f491
clean-up
lbluque May 23, 2024
578a73f
otf linear reference fit
lbluque May 24, 2024
7607591
fix tensor device
lbluque May 24, 2024
64eb32d
otf element references and normalizers
lbluque May 24, 2024
8de9a30
use only present elements when fitting
lbluque May 24, 2024
caad844
lint
lbluque May 24, 2024
75e72b1
_forward norm and derefd values
lbluque May 24, 2024
e944a06
Merge branch 'main' into norms-and-refs
lbluque Jun 20, 2024
ad36406
fix list of paths in src
lbluque Jun 25, 2024
27b9e7f
total mean and std
lbluque Jun 25, 2024
0ca227a
fitted flag to avoid refitting normalizers/references on rerun
lbluque Jun 25, 2024
d2af7c9
allow passing lstsq driver
lbluque Jun 25, 2024
2913e12
Merge branch 'main' into norms-and-refs
lbluque Jun 25, 2024
4295330
element ref unit tests
lbluque Jun 26, 2024
75f3a51
remove superfluous type
lbluque Jun 26, 2024
d7b4a98
lint fix
lbluque Jun 26, 2024
c26362f
Merge branch 'main' of https://github.com/FAIR-Chem/fairchem into nor…
lbluque Jun 26, 2024
029e1db
allow setting batch_size explicitly
lbluque Jun 26, 2024
143d7a6
test applying element refs
lbluque Jun 26, 2024
d6b5925
normalizer tests
lbluque Jun 26, 2024
c6fbf80
increase distributed timeout
lbluque Jun 27, 2024
6c1d3c3
save normalizers and linear refs in otf_fit
lbluque Jun 27, 2024
8b39de1
remove debug code
lbluque Jun 27, 2024
e82898b
fix removing refs
lbluque Jun 28, 2024
afe944f
swap otf_fit for fit, and save all normalizers in one file
lbluque Jun 28, 2024
8d21eb2
log loading and saving normalizers
lbluque Jun 28, 2024
e5af2de
fit references and normalizer scripts
lbluque Jun 28, 2024
9f4d32d
Merge branch 'main' into norms-and-refs
lbluque Jun 28, 2024
072c8a1
Merge branch 'norms-and-refs' of https://github.com/FAIR-Chem/fairche…
lbluque Jun 28, 2024
8072bc6
lint fixes
lbluque Jun 28, 2024
90c2cc7
allow absent optim key in config
lbluque Jul 8, 2024
c4aaceb
Merge branch 'main' into norms-and-refs
lbluque Jul 8, 2024
1bad0d2
Merge branch 'norms-and-refs' of https://github.com/FAIR-Chem/fairche…
lbluque Jul 8, 2024
410f711
Merge branch 'main' into norms-and-refs
lbluque Jul 10, 2024
d27b555
Merge branch 'norms-and-refs' of https://github.com/FAIR-Chem/fairche…
lbluque Jul 11, 2024
ca4f9ce
lin-ref description
lbluque Jul 11, 2024
62a4ff2
read files based on extension
lbluque Jul 11, 2024
ea17989
pass seed
lbluque Jul 11, 2024
1fcc354
rename dataset fixture
lbluque Jul 11, 2024
ebc5c87
check if file is none
lbluque Jul 11, 2024
1c457f7
pass generator correctly
lbluque Jul 11, 2024
200f62a
separate method for norms and refs
lbluque Jul 11, 2024
0c7b2e6
add normalizer code back
lbluque Jul 11, 2024
ec1b25e
fix Generator construction
lbluque Jul 11, 2024
2e7fa0a
import order
lbluque Jul 11, 2024
2e71534
log warnings if multiple inputs are passed
lbluque Jul 11, 2024
8ec9e54
raise Error if duplicate references or norms are set
lbluque Jul 11, 2024
2b2bd30
use len batch
lbluque Jul 11, 2024
92ac259
assert element reference targets are scalar
lbluque Jul 11, 2024
e0f921c
fix name and rename method
lbluque Jul 11, 2024
4ec71e5
load and save norms and refs using same logic
lbluque Jul 11, 2024
6356d6b
fix creating normalizer
lbluque Jul 12, 2024
efb6c1a
Merge branch 'main' of https://github.com/FAIR-Chem/fairchem into nor…
lbluque Jul 12, 2024
30db452
remove print statements
lbluque Jul 12, 2024
5743a59
adding new notebook for using fairchem models with NEBs without CatTS…
brookwander Jul 16, 2024
18a5fe8
warn instead of error when duplicate norm/ref target names
lbluque Jul 16, 2024
7a2f3c9
allow timeout to be read from config
lbluque Jul 16, 2024
661efea
test seed noseed ref fits
lbluque Jul 16, 2024
079d042
merge upstream
lbluque Jul 16, 2024
462257e
lotsa refactoring
lbluque Jul 16, 2024
8b1288d
lotsa fixing
lbluque Jul 17, 2024
fe555a8
more fixing...
lbluque Jul 17, 2024
074521f
num_workers zero to prevent mp issues
lbluque Jul 17, 2024
17ae426
add otf norms smoke test and fixes
lbluque Jul 18, 2024
6c3b20f
allow overriding normalization fit values
lbluque Jul 19, 2024
8d6e306
update tests
lbluque Jul 19, 2024
0d976d0
fix normalizer loading
lbluque Jul 19, 2024
f27aa3e
Merge branch 'main' into norms-and-refs
mshuaibii Jul 19, 2024
7b8af9e
use rmsd instead of only stdev
lbluque Jul 19, 2024
91364a3
fix tests
lbluque Jul 19, 2024
e6c2252
correct rmsd calc and fix loading
lbluque Jul 19, 2024
02ebac2
clean up norm loading and log values
lbluque Jul 19, 2024
72a5f50
logg linear reference metrics
lbluque Jul 19, 2024
b82b0dc
load element references state dict
lbluque Jul 19, 2024
c2896a5
fix loading and tests
lbluque Jul 20, 2024
f7bae74
fix imports in scripts
lbluque Jul 22, 2024
a8ebdbd
fix test?
lbluque Jul 22, 2024
71d19da
fix test
lbluque Jul 23, 2024
6875b93
use numpy as default to fit references
lbluque Jul 23, 2024
998c401
minor fixes
lbluque Aug 1, 2024
86b65e9
Merge branch 'main' into norms-and-refs
lbluque Aug 2, 2024
b4f3096
merge upstream
lbluque Aug 2, 2024
f35ce82
rm torch_tempdir fixture
lbluque Aug 2, 2024
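Several of the commits above (linear reference class, otf linear reference fit, allow passing lstsq driver, use rmsd instead of only stdev) revolve around fitting per-element linear references by least squares and then normalizing the referenced targets. The snippet below is only an illustrative sketch of that idea with made-up numbers, not the code added in this PR:

import numpy as np

# Toy composition matrix: each row counts the atoms of each element in one structure.
composition = np.array(
    [
        [2, 1, 0],
        [0, 1, 2],
        [1, 1, 1],
        [4, 0, 2],
    ],
    dtype=float,
)
energies = np.array([-14.2, -21.7, -18.9, -35.4])  # made-up total energies

# Linear element references: per-element energies from a least-squares fit.
ref_coeffs, *_ = np.linalg.lstsq(composition, energies, rcond=None)

# Referenced target = raw energy minus the composition-weighted reference.
referenced = energies - composition @ ref_coeffs

# Normalization of the referenced values, using the RMSD rather than only the stdev.
mean = referenced.mean()
rmsd = np.sqrt(np.mean(referenced**2))
normalized = (referenced - mean) / rmsd
print(ref_coeffs, mean, rmsd, normalized)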
16 changes: 14 additions & 2 deletions src/fairchem/core/common/distutils.py
@@ -10,7 +10,8 @@
 import logging
 import os
 import subprocess
-from typing import TypeVar
+from datetime import timedelta
+from typing import Any, TypeVar
 
 import torch
 import torch.distributed as dist
@@ -27,6 +28,7 @@ def os_environ_get_or_throw(x: str) -> str:
 
 
 def setup(config) -> None:
+    timeout = timedelta(minutes=config.get("timeout", 30))
     if config["submit"]:
         node_list = os.environ.get("SLURM_STEP_NODELIST")
         if node_list is None:
@@ -72,6 +74,7 @@ def setup(config) -> None:
                 init_method=config["init_method"],
                 world_size=config["world_size"],
                 rank=config["rank"],
+                timeout=timeout,
             )
         except subprocess.CalledProcessError as e: # scontrol failed
             raise e
@@ -95,10 +98,11 @@ def setup(config) -> None:
             rank=world_rank,
             world_size=world_size,
             init_method="env://",
+            timeout=timeout,
         )
     else:
         config["local_rank"] = int(os.environ.get("LOCAL_RANK", config["local_rank"]))
-        dist.init_process_group(backend="nccl")
+        dist.init_process_group(backend="nccl", timeout=timeout)
 
 
 def cleanup() -> None:
@@ -135,6 +139,14 @@ def broadcast(
     dist.broadcast(tensor, src, group, async_op)
 
 
+def broadcast_object_list(
+    object_list: list[Any], src: int, group=dist.group.WORLD, device: str | None = None
+) -> None:
+    if get_world_size() == 1:
+        return
+    dist.broadcast_object_list(object_list, src, group, device)
+
+
 def all_reduce(
     data, group=dist.group.WORLD, average: bool = False, device=None
 ) -> torch.Tensor:
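The distributed setup now threads a configurable timeout (in minutes, defaulting to 30) into every init_process_group call and adds a broadcast_object_list helper that no-ops for single-process runs. Below is a minimal, self-contained sketch of how those pieces fit together; the gloo backend and the hard-coded single-process environment variables are stand-ins for the launcher-provided NCCL setup, and the config dict is illustrative:

import os
from datetime import timedelta

import torch.distributed as dist

# Illustrative config; in the trainer this value comes from the YAML config.
config = {"timeout": 60}  # minutes; config.get("timeout", 30) falls back to 30
timeout = timedelta(minutes=config.get("timeout", 30))

# Single-process setup so the example runs without a distributed launcher.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(
    backend="gloo", rank=0, world_size=1, init_method="env://", timeout=timeout
)

# The broadcast_object_list wrapper above exists so rank 0 can share fitted
# normalizers/element references; with a world size of 1 it simply returns.
payload = [{"energy_mean": 0.0}]  # hypothetical fitted statistics
dist.broadcast_object_list(payload, src=0)

dist.destroy_process_group()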
17 changes: 9 additions & 8 deletions src/fairchem/core/datasets/ase_datasets.py
@@ -13,7 +13,7 @@
 import os
 import warnings
 from abc import ABC, abstractmethod
-from functools import cache, reduce
+from functools import cache
 from glob import glob
 from pathlib import Path
 from typing import Any, Callable
@@ -469,13 +469,14 @@ class AseDBDataset(AseAtomsDataset):
 
     def _load_dataset_get_ids(self, config: dict) -> list[int]:
         if isinstance(config["src"], list):
-            if os.path.isdir(config["src"][0]):
-                filepaths = reduce(
-                    lambda x, y: x + y,
-                    (glob(f"{path}/*") for path in config["src"]),
-                )
-            else:
-                filepaths = config["src"]
+            filepaths = []
+            for path in config["src"]:
+                if os.path.isdir(path):
+                    filepaths.extend(glob(f"{path}/*"))
+                elif os.path.isfile(path):
+                    filepaths.append(path)
+                else:
+                    raise RuntimeError(f"Error reading dataset in {path}!")
         elif os.path.isfile(config["src"]):
             filepaths = [config["src"]]
         elif os.path.isdir(config["src"]):
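With this change each entry of a list-valued src is inspected on its own, so directories and individual database files can be mixed and a bad path fails loudly. A hypothetical config illustrating the accepted shapes; the paths, and the assumption that AseDBDataset is importable from fairchem.core.datasets, are made up for the example:

from fairchem.core.datasets import AseDBDataset  # import path assumed

config = {
    "src": [
        "/data/asedbs/train",            # a directory: expanded via glob("<dir>/*")
        "/data/asedbs/extra_frames.db",  # a single file: appended as-is
    ],
}
dataset = AseDBDataset(config)  # raises RuntimeError if an entry is neither a file nor a directory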
Empty file.
113 changes: 113 additions & 0 deletions src/fairchem/core/modules/normalization/_load_utils.py
@@ -0,0 +1,113 @@
"""
Copyright (c) Meta, Inc. and its affiliates.

This source code is licensed under the MIT license found in the
LICENSE file in the root directory of this source tree.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Callable

import torch

from fairchem.core.common.utils import save_checkpoint

if TYPE_CHECKING:
from pathlib import Path

from torch.nn import Module
from torch.utils.data import Dataset


def _load_check_duplicates(config: dict, name: str) -> dict[str, torch.nn.Module]:
"""Attempt to load a single file with normalizers/element references and check config for duplicate targets.

Args:
config: configuration dictionary
name: Name of module to use for logging

Returns:
dictionary of normalizer or element reference modules
"""
modules = {}
if "file" in config:
modules = torch.load(config["file"])
logging.info(f"Loaded {name} for the following targets: {list(modules.keys())}")
# make sure that element-refs are not specified both as fit and file
fit_targets = config["fit"]["targets"] if "fit" in config else []
duplicates = list(
filter(
lambda x: x in fit_targets,
list(config) + list(modules.keys()),
)
)
if len(duplicates) > 0:
logging.warning(
f"{name} values for the following targets {duplicates} have been specified to be fit and also read"
f" from a file. The files read from file will be used instead of fitting."
)
duplicates = list(filter(lambda x: x in modules, config))
if len(duplicates) > 0:
logging.warning(
f"Duplicate {name} values for the following targets {duplicates} where specified in the file "
f"{config['file']} and an explicitly set file. The normalization values read from "
f"{config['file']} will be used."
)
return modules


def _load_from_config(
config: dict,
name: str,
fit_fun: Callable[[list[str], Dataset, Any, ...], dict[str, Module]],
create_fun: Callable[[str | Path], Module],
dataset: Dataset,
checkpoint_dir: str | Path | None = None,
**fit_kwargs,
) -> dict[str, torch.nn.Module]:
"""Load or fit normalizers or element references from config

If a fit is done, a fitted key with value true is added to the config to avoid re-fitting
once a checkpoint has been saved.

Args:
config: configuration dictionary
name: Name of module to use for logging
fit_fun: Function to fit modules
create_fun: Function to create a module from file
checkpoint_dir: directory to save modules. If not given, modules won't be saved.

Returns:
dictionary of normalizer or element reference modules

"""
modules = _load_check_duplicates(config, name)
for target in config:
if target == "fit" and not config["fit"].get("fitted", False):
# remove values for output targets that have already been read from files
targets = [
target for target in config["fit"]["targets"] if target not in modules
]
fit_kwargs.update(
{k: v for k, v in config["fit"].items() if k != "targets"}
)
modules.update(fit_fun(targets=targets, dataset=dataset, **fit_kwargs))
config["fit"]["fitted"] = True
# if a single file for all outputs is not provided,
# then check if a single file is provided for a specific output
elif target != "file":
modules[target] = create_fun(**config[target])
# save the linear references for possible subsequent use
if checkpoint_dir is not None:
path = save_checkpoint(
modules,
checkpoint_dir,
f"{name}.pt",
)
logging.info(
f"{name} checkpoint for targets {list(modules.keys())} have been saved to: {path}"
)

return modules
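For orientation, here is a hypothetical config fragment shaped the way _load_from_config iterates over it; the target names, file names, and the batch_size fit option are invented for illustration:

# Sketch of a normalizer/element-reference config, mirroring the three branches above:
normalizer_config = {
    # a single file holding saved modules for several targets ("file" branch)
    "file": "normalizers.pt",
    # a per-target entry; its keys are forwarded to create_fun(**config[target])
    "forces": {"file": "forces_norm.pt"},
    # targets to fit on the fly; "fitted" is set to True after the first fit
    "fit": {"targets": ["energy"], "batch_size": 64},
}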