deepmodeling · wanghan-iapcm · Feb 10, 2024 · Feb 7, 2024 · Feb 7, 2024 · Feb 7, 2024
diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py b/deepmd/dpmodel/descriptor/make_base_descriptor.py
@@ -69,15 +69,13 @@
             """
             pass
 
-        @abstractmethod
         def compute_input_stats(self, merged):
             """Update mean and stddev for descriptor elements."""
-            pass
+            raise NotImplementedError
 
-        @abstractmethod
-        def init_desc_stat(self, sumr, suma, sumn, sumr2, suma2):
+        def init_desc_stat(self, **kwargs):
             """Initialize the model bias by the statistics."""
-            pass
+            raise NotImplementedError
 
         @abstractmethod
         def fwd(

diff --git a/deepmd/dpmodel/fitting/invar_fitting.py b/deepmd/dpmodel/fitting/invar_fitting.py
@@ -236,6 +236,14 @@
         else:
             raise KeyError(key)
 
+    def compute_output_stats(self, merged):
+        """Update the output bias for fitting net."""
+        raise NotImplementedError
+
+    def init_fitting_stat(self, result_dict):
+        """Initialize the model bias by the statistics."""
+        raise NotImplementedError
+
     def serialize(self) -> dict:
         """Serialize the fitting to dict."""
         return {

diff --git a/deepmd/dpmodel/fitting/make_base_fitting.py b/deepmd/dpmodel/fitting/make_base_fitting.py
@@ -52,6 +52,14 @@
             """Calculate fitting."""
             pass
 
+        def compute_output_stats(self, merged):
+            """Update the output bias for fitting net."""
+            raise NotImplementedError
+
+        def init_fitting_stat(self, **kwargs):
+            """Initialize the model bias by the statistics."""
+            raise NotImplementedError
+
         @abstractmethod
         def serialize(self) -> dict:
             """Serialize the obj to dict."""

diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
@@ -43,6 +43,9 @@
 from deepmd.pt.model.descriptor import (
     Descriptor,
 )
+from deepmd.pt.model.task import (
+    Fitting,
+)
 from deepmd.pt.train import (
     training,
 )
@@ -60,6 +63,7 @@
 )
 from deepmd.pt.utils.stat import (
     make_stat_input,
+    process_stat_path,
 )
 from deepmd.utils.summary import SummaryPrinter as BaseSummaryPrinter
 
@@ -125,51 +129,18 @@
 
         # stat files
         hybrid_descrpt = model_params_single["descriptor"]["type"] == "hybrid"
-        has_stat_file_path = True
         if not hybrid_descrpt:
-            ### this design requires "rcut", "rcut_smth" and "sel" in the descriptor
-            ### VERY BAD DESIGN!!!!
-            ### not all descriptors provides these parameter in their constructor
-            default_stat_file_name = Descriptor.get_stat_name(
-                model_params_single["descriptor"]
-            )
-            model_params_single["stat_file_dir"] = data_dict_single.get(
-                "stat_file_dir", f"stat_files{suffix}"
-            )
-            model_params_single["stat_file"] = data_dict_single.get(
-                "stat_file", default_stat_file_name
-            )
-            model_params_single["stat_file_path"] = os.path.join(
-                model_params_single["stat_file_dir"], model_params_single["stat_file"]
-            )
-            if not os.path.exists(model_params_single["stat_file_path"]):
-                has_stat_file_path = False
-        else:  ### need to remove this
-            default_stat_file_name = []
-            for descrpt in model_params_single["descriptor"]["list"]:
-                default_stat_file_name.append(
-                    f'stat_file_rcut{descrpt["rcut"]:.2f}_'
-                    f'smth{descrpt["rcut_smth"]:.2f}_'
-                    f'sel{descrpt["sel"]}_{descrpt["type"]}.npz'
-                )
-            model_params_single["stat_file_dir"] = data_dict_single.get(
-                "stat_file_dir", f"stat_files{suffix}"
+            stat_file_path_single, has_stat_file_path = process_stat_path(
+                data_dict_single.get("stat_file", None),
+                data_dict_single.get("stat_file_dir", f"stat_files{suffix}"),
+                model_params_single,
+                Descriptor,
+                Fitting,
             )
-            model_params_single["stat_file"] = data_dict_single.get(
-                "stat_file", default_stat_file_name
+        else:  ### TODO hybrid descriptor not implemented
+            raise NotImplementedError(
+                "data stat for hybrid descriptor is not implemented!"
             )
-            assert isinstance(
-                model_params_single["stat_file"], list
-            ), "Stat file of hybrid descriptor must be a list!"
-            stat_file_path = []
-            for stat_file_path_item in model_params_single["stat_file"]:
-                single_file_path = os.path.join(
-                    model_params_single["stat_file_dir"], stat_file_path_item
-                )
-                stat_file_path.append(single_file_path)
-                if not os.path.exists(single_file_path):
-                    has_stat_file_path = False
-            model_params_single["stat_file_path"] = stat_file_path
 
         # validation and training data
         validation_data_single = DpLoaderSet(
@@ -209,19 +180,30 @@
                     type_split=type_split,
                     noise_settings=noise_settings,
                 )
-        return train_data_single, validation_data_single, sampled_single
+        return (
+            train_data_single,
+            validation_data_single,
+            sampled_single,
+            stat_file_path_single,
+        )
 
     if not multi_task:
-        train_data, validation_data, sampled = prepare_trainer_input_single(
+        (
+            train_data,
+            validation_data,
+            sampled,
+            stat_file_path,
+        ) = prepare_trainer_input_single(
             config["model"], config["training"], config["loss"]
         )
     else:
-        train_data, validation_data, sampled = {}, {}, {}
+        train_data, validation_data, sampled, stat_file_path = {}, {}, {}, {}
         for model_key in config["model"]["model_dict"]:
             (
                 train_data[model_key],
                 validation_data[model_key],
                 sampled[model_key],
+                stat_file_path[model_key],
             ) = prepare_trainer_input_single(
                 config["model"]["model_dict"][model_key],
                 config["training"]["data_dict"][model_key],
@@ -232,7 +214,8 @@
     trainer = training.Trainer(
         config,
         train_data,
-        sampled,
+        sampled=sampled,
+        stat_file_path=stat_file_path,
         validation_data=validation_data,
         init_model=init_model,
         restart_model=restart_model,

diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py
@@ -88,7 +88,7 @@ def __init__(
         assert not self.multi_task, "multitask mode currently not supported!"
         self.type_split = self.input_param["descriptor"]["type"] in ["se_e2_a"]
         self.type_map = self.input_param["type_map"]
-        self.dp = ModelWrapper(get_model(self.input_param, None).to(DEVICE))
+        self.dp = ModelWrapper(get_model(self.input_param).to(DEVICE))
         self.dp.load_state_dict(state_dict)
         self.rcut = self.dp.model["Default"].descriptor.get_rcut()
         self.sec = np.cumsum(self.dp.model["Default"].descriptor.get_sel())

diff --git a/deepmd/pt/model/descriptor/descriptor.py b/deepmd/pt/model/descriptor/descriptor.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+import logging
 from abc import (
     ABC,
     abstractmethod,
@@ -7,6 +8,7 @@
     Callable,
     List,
     Optional,
+    Union,
 )
 
 import numpy as np
@@ -23,6 +25,8 @@
     BaseDescriptor,
 )
 
+log = logging.getLogger(__name__)
+
 
 class Descriptor(torch.nn.Module, BaseDescriptor):
     """The descriptor.
@@ -56,15 +60,130 @@
         return Descriptor.__plugins.register(key)
 
     @classmethod
-    def get_stat_name(cls, config):
-        descrpt_type = config["type"]
-        return Descriptor.__plugins.plugins[descrpt_type].get_stat_name(config)
+    def get_stat_name(cls, ntypes, type_name, **kwargs):
+        """
+        Get the name for the statistic file of the descriptor.
+        Usually use the combination of descriptor name, rcut, rcut_smth and sel as the statistic file name.
+        """
+        if cls is not Descriptor:
+            raise NotImplementedError("get_stat_name is not implemented!")
+        descrpt_type = type_name
+        return Descriptor.__plugins.plugins[descrpt_type].get_stat_name(
+            ntypes, type_name, **kwargs
+        )
 
     @classmethod
     def get_data_process_key(cls, config):
+        """
+        Get the keys for the data preprocess.
+        Usually need the information of rcut and sel.
+        TODO Need to be deprecated when the dataloader has been cleaned up.
+        """
+        if cls is not Descriptor:
+            raise NotImplementedError("get_data_process_key is not implemented!")
         descrpt_type = config["type"]
         return Descriptor.__plugins.plugins[descrpt_type].get_data_process_key(config)
 
+    @property
+    def data_stat_key(self):
+        """
+        Get the keys for the data statistic of the descriptor.
+        Return a list of statistic names needed, such as "sumr", "suma" or "sumn".
+        """
+        raise NotImplementedError("data_stat_key is not implemented!")
+
+    def compute_or_load_stat(
+        self,
+        type_map: List[str],
+        sampled=None,
+        stat_file_path: Optional[Union[str, List[str]]] = None,
+    ):
+        """
+        Compute or load the statistics parameters of the descriptor.
+        Calculate and save the mean and standard deviation of the descriptor to `stat_file_path`
+        if `sampled` is not None, otherwise load them from `stat_file_path`.
+
+        Parameters
+        ----------
+        type_map
+            Mapping atom type to the name (str) of the type.
+            For example `type_map[1]` gives the name of the type 1.
+        sampled
+            The sampled data frames from different data systems.
+        stat_file_path
+            The path to the statistics files.
+        """
+        # TODO support hybrid descriptor
+        descrpt_stat_key = self.data_stat_key
+        if sampled is not None:  # compute the statistics results
+            tmp_dict = self.compute_input_stats(sampled)
+            result_dict = {key: tmp_dict[key] for key in descrpt_stat_key}
+            result_dict["type_map"] = type_map
+            if stat_file_path is not None:
+                self.save_stats(result_dict, stat_file_path)
+        else:  # load the statistics results
+            assert stat_file_path is not None, "No stat file to load!"
+            result_dict = self.load_stats(type_map, stat_file_path)
+        self.init_desc_stat(**result_dict)
+
+    def save_stats(self, result_dict, stat_file_path: Union[str, List[str]]):
+        """
+        Save the statistics results to `stat_file_path`.
+
+        Parameters
+        ----------
+        result_dict
+            The dictionary of statistics results.
+        stat_file_path
+            The path to the statistics file(s).
+        """
+        if not isinstance(stat_file_path, list):
+            log.info(f"Saving stat file to {stat_file_path}")
+            np.savez_compressed(stat_file_path, **result_dict)
+        else:  # TODO hybrid descriptor not implemented
+            raise NotImplementedError(
+                "save_stats for hybrid descriptor is not implemented!"
+            )
+
+    def load_stats(self, type_map, stat_file_path: Union[str, List[str]]):
+        """
+        Load the statistics results to `stat_file_path`.
+
+        Parameters
+        ----------
+        type_map
+            Mapping atom type to the name (str) of the type.
+            For example `type_map[1]` gives the name of the type 1.
+        stat_file_path
+            The path to the statistics file(s).
+
+        Returns
+        -------
+        result_dict
+            The dictionary of statistics results.
+        """
+        descrpt_stat_key = self.data_stat_key
+        target_type_map = type_map
+        if not isinstance(stat_file_path, list):
+            log.info(f"Loading stat file from {stat_file_path}")
+            stats = np.load(stat_file_path)
+            stat_type_map = list(stats["type_map"])
+            missing_type = [i for i in target_type_map if i not in stat_type_map]
+            assert not missing_type, (
+                f"These type are not in stat file {stat_file_path}: {missing_type}! "
+                f"Please change the stat file path!"
+            )
+            idx_map = [stat_type_map.index(i) for i in target_type_map]
+            if stats[descrpt_stat_key[0]].size:  # not empty
+                result_dict = {key: stats[key][idx_map] for key in descrpt_stat_key}
+            else:
+                result_dict = {key: [] for key in descrpt_stat_key}
+        else:  # TODO hybrid descriptor not implemented
+            raise NotImplementedError(
+                "load_stats for hybrid descriptor is not implemented!"
+            )
+        return result_dict
+
     def __new__(cls, *args, **kwargs):
         if cls is Descriptor:
             try:
@@ -156,15 +275,13 @@
         """Returns the embedding dimension."""
         pass
 
-    @abstractmethod
     def compute_input_stats(self, merged):
         """Update mean and stddev for DescriptorBlock elements."""
-        pass
+        raise NotImplementedError
 
-    @abstractmethod
-    def init_desc_stat(self, sumr, suma, sumn, sumr2, suma2):
-        """Initialize the model bias by the statistics."""
-        pass
+    def init_desc_stat(self, **kwargs):
+        """Initialize mean and stddev by the statistics."""
+        raise NotImplementedError
 
     def share_params(self, base_class, shared_level, resume=False):
         assert (
@@ -188,13 +305,14 @@
                     self.sumr2,
                     self.suma2,
                 )
-                base_class.init_desc_stat(
-                    sumr_base + sumr,
-                    suma_base + suma,
-                    sumn_base + sumn,
-                    sumr2_base + sumr2,
-                    suma2_base + suma2,
-                )
+                stat_dict = {
+                    "sumr": sumr_base + sumr,
+                    "suma": suma_base + suma,
+                    "sumn": sumn_base + sumn,
+                    "sumr2": sumr2_base + sumr2,
+                    "suma2": suma2_base + suma2,
+                }
+                base_class.init_desc_stat(**stat_dict)
                 self.mean = base_class.mean
                 self.stddev = base_class.stddev
             # self.load_state_dict(base_class.state_dict()) # this does not work, because it only inits the model