From 38128666627c6d1dc6f56e55d2f9c7313a7902bf Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:42:09 +0800 Subject: [PATCH 01/37] Fix single-task training&data stat --- deepmd/pt/model/descriptor/dpa2.py | 2 +- deepmd/pt/model/model/__init__.py | 9 ++++----- deepmd/pt/model/model/model.py | 4 ++-- deepmd/utils/path.py | 1 + examples/water/dpa2/input_torch.json | 8 ++------ examples/water/se_atten/input_torch.json | 2 ++ examples/water/se_e2_a/input_torch.json | 1 + 7 files changed, 13 insertions(+), 14 deletions(-) diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index e693116cf4..b1df56a004 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -304,7 +304,7 @@ def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None) } for item in merged ] - descrpt.compute_input_stats(merged_tmp) + descrpt.compute_input_stats(merged_tmp, path) def serialize(self) -> dict: """Serialize the obj to dict.""" diff --git a/deepmd/pt/model/model/__init__.py b/deepmd/pt/model/model/__init__.py index 0dc9ae20af..b823a051f5 100644 --- a/deepmd/pt/model/model/__init__.py +++ b/deepmd/pt/model/model/__init__.py @@ -20,7 +20,7 @@ BaseDescriptor, ) from deepmd.pt.model.task import ( - Fitting, + BaseFitting, ) from .dp_model import ( @@ -61,7 +61,7 @@ def get_zbl_model(model_params): fitting_net["out_dim"] = descriptor.get_dim_emb() if "ener" in fitting_net["type"]: fitting_net["return_energy"] = True - fitting = Fitting(**fitting_net) + fitting = BaseFitting(**fitting_net) dp_model = DPAtomicModel(descriptor, fitting, type_map=model_params["type_map"]) # pairtab filepath = model_params["use_srtab"] @@ -97,9 +97,8 @@ def get_model(model_params): fitting_net["out_dim"] = descriptor.get_dim_emb() if "ener" in fitting_net["type"]: fitting_net["return_energy"] = True - fitting = Fitting(**fitting_net) - - model = EnergyModel(descriptor, fitting, type_map=model_params["type_map"]) + fitting = BaseFitting(**fitting_net) + model = DPModel(descriptor, fitting, type_map=model_params["type_map"]) model.model_def_script = json.dumps(model_params) return model diff --git a/deepmd/pt/model/model/model.py b/deepmd/pt/model/model/model.py index 0f5e27aea9..e32d2f307d 100644 --- a/deepmd/pt/model/model/model.py +++ b/deepmd/pt/model/model/model.py @@ -59,9 +59,9 @@ # in DPAtomicModel (and other classes), but this requires the developer aware # of it when developing it... class BaseModel(make_base_model()): - def __init__(self): + def __init__(self, *args, **kwargs): """Construct a basic model for different tasks.""" - super().__init__() + super().__init__(*args, **kwargs) def compute_or_load_stat( self, diff --git a/deepmd/utils/path.py b/deepmd/utils/path.py index c9a7cd8554..79361b6c23 100644 --- a/deepmd/utils/path.py +++ b/deepmd/utils/path.py @@ -355,6 +355,7 @@ def save_numpy(self, arr: np.ndarray) -> None: if self._name in self._keys: del self.root[self._name] self.root.create_dataset(self._name, data=arr) + self.root.flush() def glob(self, pattern: str) -> List["DPPath"]: """Search path using the glob pattern. 
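The `self.root.flush()` added to `DPPath.save_numpy` above guards the statistics cache: h5py buffers writes, so a dataset created for the cached stats may not reach disk before it is read back (or before the process exits). A minimal sketch of the save/load round trip, assuming a plain `h5py.File` rather than the `DPH5Path` wrapper used in `deepmd/utils/path.py`:

    import h5py
    import numpy as np

    def save_numpy(root: h5py.File, name: str, arr: np.ndarray) -> None:
        # Overwrite any stale dataset of the same name, then persist to disk.
        if name in root:
            del root[name]
        root.create_dataset(name, data=arr)
        root.flush()  # without this, the write can sit in h5py's buffer

    with h5py.File("stat.h5", "a") as f:
        save_numpy(f, "bias_atom_e", np.zeros(2))
        assert np.allclose(f["bias_atom_e"][...], 0.0)
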
diff --git a/examples/water/dpa2/input_torch.json b/examples/water/dpa2/input_torch.json index 9d783b35d5..108e75df62 100644 --- a/examples/water/dpa2/input_torch.json +++ b/examples/water/dpa2/input_torch.json @@ -1,18 +1,13 @@ { "_comment": "that's all", "model": { - "type_embedding": { - "neuron": [ - 8 - ], - "tebd_input_mode": "concat" - }, "type_map": [ "O", "H" ], "descriptor": { "type": "dpa2", + "tebd_dim": 8, "repinit_rcut": 9.0, "repinit_rcut_smth": 8.0, "repinit_nsel": 120, @@ -74,6 +69,7 @@ "_comment": " that's all" }, "training": { + "stat_file": "./dpa2", "training_data": { "systems": [ "../data/data_0", diff --git a/examples/water/se_atten/input_torch.json b/examples/water/se_atten/input_torch.json index 7da3d64164..bc948cc2a0 100644 --- a/examples/water/se_atten/input_torch.json +++ b/examples/water/se_atten/input_torch.json @@ -15,6 +15,7 @@ 50, 100 ], + "tebd_dim": 8, "axis_neuron": 16, "attn": 128, "attn_layer": 2, @@ -59,6 +60,7 @@ "_comment": " that's all" }, "training": { + "stat_file": "./dpa1", "training_data": { "systems": [ "../data/data_0", diff --git a/examples/water/se_e2_a/input_torch.json b/examples/water/se_e2_a/input_torch.json index 053a721a44..c686b49d45 100644 --- a/examples/water/se_e2_a/input_torch.json +++ b/examples/water/se_e2_a/input_torch.json @@ -51,6 +51,7 @@ "_comment": " that's all" }, "training": { + "stat_file": "./se_e2_a", "training_data": { "systems": [ "../data/data_0", From ae27607b38d7c0a1b9ed1b9c3219f2bfaa106d56 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Wed, 28 Feb 2024 17:12:03 +0800 Subject: [PATCH 02/37] Fix EnergyFittingNetDirect --- deepmd/pt/model/model/dp_model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deepmd/pt/model/model/dp_model.py b/deepmd/pt/model/model/dp_model.py index 5410f518d1..79c129334a 100644 --- a/deepmd/pt/model/model/dp_model.py +++ b/deepmd/pt/model/model/dp_model.py @@ -10,6 +10,7 @@ ) from deepmd.pt.model.task.ener import ( EnergyFittingNet, + EnergyFittingNetDirect, ) from deepmd.pt.model.task.polarizability import ( PolarFittingNet, @@ -36,7 +37,9 @@ def __new__(cls, descriptor, fitting, *args, **kwargs): # according to the fitting network to decide the type of the model if cls is DPModel: # map fitting to model - if isinstance(fitting, EnergyFittingNet): + if isinstance(fitting, EnergyFittingNet) or isinstance( + fitting, EnergyFittingNetDirect + ): cls = EnergyModel elif isinstance(fitting, DipoleFittingNet): cls = DipoleModel From f9265d5868ccc8c7c23d5ebb44aa4dbdc99063de Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Wed, 28 Feb 2024 20:39:47 +0800 Subject: [PATCH 03/37] Add data_requirement for dataloader --- .../descriptor/make_base_descriptor.py | 4 +- deepmd/dpmodel/model/base_model.py | 4 + deepmd/pt/entrypoints/main.py | 41 +-------- .../pt/model/atomic_model/dp_atomic_model.py | 17 ++-- deepmd/pt/model/descriptor/descriptor.py | 6 +- deepmd/pt/model/descriptor/dpa1.py | 6 +- deepmd/pt/model/descriptor/dpa2.py | 15 ++-- deepmd/pt/model/descriptor/hybrid.py | 16 ++-- deepmd/pt/model/descriptor/repformers.py | 16 +++- deepmd/pt/model/descriptor/se_a.py | 16 +++- deepmd/pt/model/descriptor/se_atten.py | 16 +++- deepmd/pt/model/descriptor/se_r.py | 16 +++- deepmd/pt/model/model/dipole_model.py | 20 +++++ deepmd/pt/model/model/dp_zbl_model.py | 37 ++++++++ deepmd/pt/model/model/ener_model.py | 37 ++++++++ deepmd/pt/model/model/polar_model.py | 20 +++++ deepmd/pt/model/task/dipole.py | 9 +- 
deepmd/pt/model/task/ener.py | 13 +-- deepmd/pt/train/training.py | 90 ++++++++++++------- deepmd/pt/train/wrapper.py | 1 + deepmd/pt/utils/dataloader.py | 22 +++++ deepmd/pt/utils/dataset.py | 19 +++- deepmd/pt/utils/stat.py | 36 +++----- source/tests/pt/model/test_dipole_fitting.py | 6 -- source/tests/pt/test_stat.py | 2 - 25 files changed, 327 insertions(+), 158 deletions(-) diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py b/deepmd/dpmodel/descriptor/make_base_descriptor.py index 18416ff16b..fe911551d5 100644 --- a/deepmd/dpmodel/descriptor/make_base_descriptor.py +++ b/deepmd/dpmodel/descriptor/make_base_descriptor.py @@ -84,9 +84,7 @@ def mixed_types(self) -> bool: """ pass - def compute_input_stats( - self, merged: List[dict], path: Optional[DPPath] = None - ): + def compute_input_stats(self, merged: callable, path: Optional[DPPath] = None): """Update mean and stddev for descriptor elements.""" raise NotImplementedError diff --git a/deepmd/dpmodel/model/base_model.py b/deepmd/dpmodel/model/base_model.py index faf3e7cfff..c4b998d763 100644 --- a/deepmd/dpmodel/model/base_model.py +++ b/deepmd/dpmodel/model/base_model.py @@ -92,6 +92,10 @@ def is_aparam_nall(self) -> bool: def model_output_type(self) -> str: """Get the output type for the model.""" + @abstractmethod + def data_requirement(self) -> dict: + """Get the data requirement for the model.""" + @abstractmethod def serialize(self) -> dict: """Serialize the model. diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index a317cea6a9..c9eba6e579 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -47,9 +47,6 @@ from deepmd.pt.utils.multi_task import ( preprocess_shared_params, ) -from deepmd.pt.utils.stat import ( - make_stat_input, -) from deepmd.utils.path import ( DPPath, ) @@ -83,7 +80,6 @@ def get_trainer( multi_task=multi_task, model_branch=model_branch, ) - config["model"]["resuming"] = (finetune_model is not None) or (ckpt is not None) shared_links = None if multi_task: config["model"], shared_links = preprocess_shared_params(config["model"]) @@ -98,24 +94,6 @@ def prepare_trainer_input_single( validation_dataset_params = data_dict_single["validation_data"] training_systems = training_dataset_params["systems"] validation_systems = validation_dataset_params["systems"] - - # noise params - noise_settings = None - if loss_dict_single.get("type", "ener") == "denoise": - noise_settings = { - "noise_type": loss_dict_single.pop("noise_type", "uniform"), - "noise": loss_dict_single.pop("noise", 1.0), - "noise_mode": loss_dict_single.pop("noise_mode", "fix_num"), - "mask_num": loss_dict_single.pop("mask_num", 8), - "mask_prob": loss_dict_single.pop("mask_prob", 0.15), - "same_mask": loss_dict_single.pop("same_mask", False), - "mask_coord": loss_dict_single.pop("mask_coord", False), - "mask_type": loss_dict_single.pop("mask_type", False), - "max_fail_num": loss_dict_single.pop("max_fail_num", 10), - "mask_type_idx": len(model_params_single["type_map"]) - 1, - } - # noise_settings = None - # stat files stat_file_path_single = data_dict_single.get("stat_file", None) if stat_file_path_single is not None: @@ -140,29 +118,15 @@ def prepare_trainer_input_single( training_dataset_params["batch_size"], model_params_single, ) - sampled_single = None else: train_data_single = DpLoaderSet( training_systems, training_dataset_params["batch_size"], model_params_single, ) - data_stat_nbatch = model_params_single.get("data_stat_nbatch", 10) - sampled_single = make_stat_input( - 
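# eager statistics sampling, removed here at the entrypoint level:
# the Trainer now wraps make_stat_input in a lazy callable that only
# runs when no precomputed stat file exists.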
train_data_single.systems, - train_data_single.dataloaders, - data_stat_nbatch, - ) - if noise_settings is not None: - train_data_single = DpLoaderSet( - training_systems, - training_dataset_params["batch_size"], - model_params_single, - ) return ( train_data_single, validation_data_single, - sampled_single, stat_file_path_single, ) @@ -170,18 +134,16 @@ def prepare_trainer_input_single( ( train_data, validation_data, - sampled, stat_file_path, ) = prepare_trainer_input_single( config["model"], config["training"], config["loss"] ) else: - train_data, validation_data, sampled, stat_file_path = {}, {}, {}, {} + train_data, validation_data, stat_file_path = {}, {}, {} for model_key in config["model"]["model_dict"]: ( train_data[model_key], validation_data[model_key], - sampled[model_key], stat_file_path[model_key], ) = prepare_trainer_input_single( config["model"]["model_dict"][model_key], @@ -193,7 +155,6 @@ def prepare_trainer_input_single( trainer = training.Trainer( config, train_data, - sampled=sampled, stat_file_path=stat_file_path, validation_data=validation_data, init_model=init_model, diff --git a/deepmd/pt/model/atomic_model/dp_atomic_model.py b/deepmd/pt/model/atomic_model/dp_atomic_model.py index d2c1743d30..5c41499ace 100644 --- a/deepmd/pt/model/atomic_model/dp_atomic_model.py +++ b/deepmd/pt/model/atomic_model/dp_atomic_model.py @@ -18,9 +18,6 @@ from deepmd.pt.model.task.base_fitting import ( BaseFitting, ) -from deepmd.pt.utils.utils import ( - dict_to_device, -) from deepmd.utils.path import ( DPPath, ) @@ -170,7 +167,7 @@ def forward_atomic( def compute_or_load_stat( self, - sampled, + sampled_func, stat_file_path: Optional[DPPath] = None, ): """ @@ -183,8 +180,8 @@ def compute_or_load_stat( Parameters ---------- - sampled - The sampled data frames from different data systems. + sampled_func + The lazy sampled function to get data frames from different data systems. stat_file_path The dictionary of paths to the statistics files. 
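        Notes
        -----
        A sketch of the intended call site (this mirrors the lazy wrapping
        done in ``deepmd/pt/train/training.py`` in this patch set; names are
        illustrative):

            @lazy
            def get_sample():
                return make_stat_input(datasets, dataloaders, nbatches)

            model.compute_or_load_stat(
                sampled_func=get_sample,
                stat_file_path=stat_file_path,
            )

        The callable is only invoked when no precomputed statistics file is
        found, so repeated runs skip the expensive sampling.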
""" @@ -192,13 +189,9 @@ def compute_or_load_stat( # descriptors and fitting net with different type_map # should not share the same parameters stat_file_path /= " ".join(self.type_map) - for data_sys in sampled: - dict_to_device(data_sys) - if sampled is None: - sampled = [] - self.descriptor.compute_input_stats(sampled, stat_file_path) + self.descriptor.compute_input_stats(sampled_func, stat_file_path) if self.fitting_net is not None: - self.fitting_net.compute_output_stats(sampled, stat_file_path) + self.fitting_net.compute_output_stats(sampled_func, stat_file_path) @torch.jit.export def get_dim_fparam(self) -> int: diff --git a/deepmd/pt/model/descriptor/descriptor.py b/deepmd/pt/model/descriptor/descriptor.py index 964cdb01eb..d400e42f75 100644 --- a/deepmd/pt/model/descriptor/descriptor.py +++ b/deepmd/pt/model/descriptor/descriptor.py @@ -5,9 +5,11 @@ abstractmethod, ) from typing import ( + Callable, Dict, List, Optional, + Union, ) import torch @@ -86,7 +88,9 @@ def get_dim_emb(self) -> int: """Returns the embedding dimension.""" pass - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats( + self, merged: Union[Callable, List], path: Optional[DPPath] = None + ): """Update mean and stddev for DescriptorBlock elements.""" raise NotImplementedError diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py index b616d20cd8..3e71ea4746 100644 --- a/deepmd/pt/model/descriptor/dpa1.py +++ b/deepmd/pt/model/descriptor/dpa1.py @@ -1,7 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Callable, List, Optional, + Union, ) import torch @@ -128,7 +130,9 @@ def dim_out(self): def dim_emb(self): return self.get_dim_emb() - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats( + self, merged: Union[Callable, List], path: Optional[DPPath] = None + ): return self.se_atten.compute_input_stats(merged, path) def serialize(self) -> dict: diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index b1df56a004..49c3f76631 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -1,7 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Callable, List, Optional, + Union, ) import torch @@ -295,16 +297,11 @@ def dim_emb(self): """Returns the embedding dimension g2.""" return self.get_dim_emb() - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats( + self, merged: Union[Callable, List], path: Optional[DPPath] = None + ): for ii, descrpt in enumerate([self.repinit, self.repformers]): - merged_tmp = [ - { - key: item[key] if not isinstance(item[key], list) else item[key][ii] - for key in item - } - for item in merged - ] - descrpt.compute_input_stats(merged_tmp, path) + descrpt.compute_input_stats(merged, path) def serialize(self) -> dict: """Serialize the obj to dict.""" diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py index 688d448b81..df45217949 100644 --- a/deepmd/pt/model/descriptor/hybrid.py +++ b/deepmd/pt/model/descriptor/hybrid.py @@ -1,7 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Callable, List, Optional, + Union, ) import torch @@ -157,17 +159,13 @@ def share_params(self, base_class, shared_level, resume=False): else: raise NotImplementedError - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): 
+ def compute_input_stats( + self, merged: Union[Callable, List], path: Optional[DPPath] = None + ): """Update mean and stddev for descriptor elements.""" for ii, descrpt in enumerate(self.descriptor_list): - merged_tmp = [ - { - key: item[key] if not isinstance(item[key], list) else item[key][ii] - for key in item - } - for item in merged - ] - descrpt.compute_input_stats(merged_tmp, path) + # need support for hybrid descriptors + descrpt.compute_input_stats(merged, path) def forward( self, diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py index ad523bcc2d..000bed2cb1 100644 --- a/deepmd/pt/model/descriptor/repformers.py +++ b/deepmd/pt/model/descriptor/repformers.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Callable, Dict, List, Optional, + Union, ) import torch @@ -278,12 +280,22 @@ def forward( return g1, g2, h2, rot_mat.view(-1, nloc, self.dim_emb, 3), sw - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats( + self, merged: Union[Callable, List], path: Optional[DPPath] = None + ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) if path is not None: path = path / env_mat_stat.get_hash() - env_mat_stat.load_or_compute_stats(merged, path) + if path is None or not path.is_dir(): + if callable(merged): + # only get data for once + sampled = merged() + else: + sampled = merged + else: + sampled = [] + env_mat_stat.load_or_compute_stats(sampled, path) self.stats = env_mat_stat.stats mean, stddev = env_mat_stat() if not self.set_davg_zero: diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py index 6c29636d6d..ca2c5ef5c2 100644 --- a/deepmd/pt/model/descriptor/se_a.py +++ b/deepmd/pt/model/descriptor/se_a.py @@ -1,11 +1,13 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import itertools from typing import ( + Callable, ClassVar, Dict, List, Optional, Tuple, + Union, ) import numpy as np @@ -387,12 +389,22 @@ def __getitem__(self, key): else: raise KeyError(key) - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats( + self, merged: Union[Callable, List], path: Optional[DPPath] = None + ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) if path is not None: path = path / env_mat_stat.get_hash() - env_mat_stat.load_or_compute_stats(merged, path) + if path is None or not path.is_dir(): + if callable(merged): + # only get data for once + sampled = merged() + else: + sampled = merged + else: + sampled = [] + env_mat_stat.load_or_compute_stats(sampled, path) self.stats = env_mat_stat.stats mean, stddev = env_mat_stat() if not self.set_davg_zero: diff --git a/deepmd/pt/model/descriptor/se_atten.py b/deepmd/pt/model/descriptor/se_atten.py index 0b32bd9341..60612f6046 100644 --- a/deepmd/pt/model/descriptor/se_atten.py +++ b/deepmd/pt/model/descriptor/se_atten.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Callable, Dict, List, Optional, + Union, ) import numpy as np @@ -200,12 +202,22 @@ def dim_emb(self): """Returns the output dimension of embedding.""" return self.get_dim_emb() - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats( + self, merged: Union[Callable, List], path: Optional[DPPath] = None + ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) if 
path is not None: path = path / env_mat_stat.get_hash() - env_mat_stat.load_or_compute_stats(merged, path) + if path is None or not path.is_dir(): + if callable(merged): + # only get data for once + sampled = merged() + else: + sampled = merged + else: + sampled = [] + env_mat_stat.load_or_compute_stats(sampled, path) self.stats = env_mat_stat.stats mean, stddev = env_mat_stat() if not self.set_davg_zero: diff --git a/deepmd/pt/model/descriptor/se_r.py b/deepmd/pt/model/descriptor/se_r.py index bdb7dafe73..e8a2483da6 100644 --- a/deepmd/pt/model/descriptor/se_r.py +++ b/deepmd/pt/model/descriptor/se_r.py @@ -1,9 +1,11 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Callable, Dict, List, Optional, Tuple, + Union, ) import numpy as np @@ -148,12 +150,22 @@ def mixed_types(self) -> bool: """ return False - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats( + self, merged: Union[Callable, List], path: Optional[DPPath] = None + ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) if path is not None: path = path / env_mat_stat.get_hash() - env_mat_stat.load_or_compute_stats(merged, path) + if path is None or not path.is_dir(): + if callable(merged): + # only get data for once + sampled = merged() + else: + sampled = merged + else: + sampled = [] + env_mat_stat.load_or_compute_stats(sampled, path) self.stats = env_mat_stat.stats mean, stddev = env_mat_stat() if not self.set_davg_zero: diff --git a/deepmd/pt/model/model/dipole_model.py b/deepmd/pt/model/model/dipole_model.py index 6629541459..f6d896b5d8 100644 --- a/deepmd/pt/model/model/dipole_model.py +++ b/deepmd/pt/model/model/dipole_model.py @@ -90,3 +90,23 @@ def forward_lower( else: model_predict = model_ret return model_predict + + @property + def data_requirement(self): + data_requirement = { + "dipole": { + "ndof": 3, + "atomic": False, + "must": False, + "high_prec": False, + "type_sel": self.get_sel_type(), + }, + "atomic_dipole": { + "ndof": 3, + "atomic": True, + "must": False, + "high_prec": False, + "type_sel": self.get_sel_type(), + }, + } + return data_requirement diff --git a/deepmd/pt/model/model/dp_zbl_model.py b/deepmd/pt/model/model/dp_zbl_model.py index c8264f2007..fd47b4368d 100644 --- a/deepmd/pt/model/model/dp_zbl_model.py +++ b/deepmd/pt/model/model/dp_zbl_model.py @@ -97,3 +97,40 @@ def forward_lower( model_predict["dforce"] = model_ret["dforce"] model_predict = model_ret return model_predict + + @property + def data_requirement(self): + data_requirement = { + "energy": { + "ndof": 1, + "atomic": False, + "must": False, + "high_prec": True, + }, + "force": { + "ndof": 3, + "atomic": True, + "must": False, + "high_prec": False, + }, + "virial": { + "ndof": 9, + "atomic": False, + "must": False, + "high_prec": False, + }, + "atom_ener": { + "ndof": 1, + "atomic": True, + "must": False, + "high_prec": False, + }, + "atom_pref": { + "ndof": 1, + "atomic": True, + "must": False, + "high_prec": False, + "repeat": 3, + }, + } + return data_requirement diff --git a/deepmd/pt/model/model/ener_model.py b/deepmd/pt/model/model/ener_model.py index 1a5706dbbf..1497cbade4 100644 --- a/deepmd/pt/model/model/ener_model.py +++ b/deepmd/pt/model/model/ener_model.py @@ -95,3 +95,40 @@ def forward_lower( else: model_predict = model_ret return model_predict + + @property + def data_requirement(self): + data_requirement = { + "energy": { + "ndof": 1, + "atomic": False, + "must": False, + "high_prec": True, + }, + "force": 
{ + "ndof": 3, + "atomic": True, + "must": False, + "high_prec": False, + }, + "virial": { + "ndof": 9, + "atomic": False, + "must": False, + "high_prec": False, + }, + "atom_ener": { + "ndof": 1, + "atomic": True, + "must": False, + "high_prec": False, + }, + "atom_pref": { + "ndof": 1, + "atomic": True, + "must": False, + "high_prec": False, + "repeat": 3, + }, + } + return data_requirement diff --git a/deepmd/pt/model/model/polar_model.py b/deepmd/pt/model/model/polar_model.py index d956a0344c..450f5f2fb5 100644 --- a/deepmd/pt/model/model/polar_model.py +++ b/deepmd/pt/model/model/polar_model.py @@ -74,3 +74,23 @@ def forward_lower( else: model_predict = model_ret return model_predict + + @property + def get_data_requirement(self): + data_requirement = { + "polar": { + "ndof": 9, + "atomic": False, + "must": False, + "high_prec": False, + "type_sel": self.get_sel_type(), + }, + "atomic_polar": { + "ndof": 9, + "atomic": True, + "must": False, + "high_prec": False, + "type_sel": self.get_sel_type(), + }, + } + return data_requirement diff --git a/deepmd/pt/model/task/dipole.py b/deepmd/pt/model/task/dipole.py index bff3dd93bc..5336e9ed5d 100644 --- a/deepmd/pt/model/task/dipole.py +++ b/deepmd/pt/model/task/dipole.py @@ -20,6 +20,9 @@ from deepmd.pt.utils.env import ( DEFAULT_PRECISION, ) +from deepmd.utils.path import ( + DPPath, +) log = logging.getLogger(__name__) @@ -67,7 +70,6 @@ class DipoleFittingNet(GeneralFitting): def __init__( self, - var_name: str, ntypes: int, dim_descrpt: int, embedding_width: int, @@ -89,7 +91,7 @@ def __init__( self.r_differentiable = r_differentiable self.c_differentiable = c_differentiable super().__init__( - var_name=var_name, + var_name="dipole", ntypes=ntypes, dim_descrpt=dim_descrpt, neuron=neuron, @@ -140,6 +142,9 @@ def data_stat_key(self): """ return [] + def compute_output_stats(self, merged, stat_file_path: Optional[DPPath] = None): + raise NotImplementedError + def forward( self, descriptor: torch.Tensor, diff --git a/deepmd/pt/model/task/ener.py b/deepmd/pt/model/task/ener.py index 8479111819..f00e4e01be 100644 --- a/deepmd/pt/model/task/ener.py +++ b/deepmd/pt/model/task/ener.py @@ -144,17 +144,18 @@ def data_stat_key(self): return ["bias_atom_e"] def compute_output_stats(self, merged, stat_file_path: Optional[DPPath] = None): - energy = [item["energy"] for item in merged] - data_mixed_type = "real_natoms_vec" in merged[0] - if data_mixed_type: - input_natoms = [item["real_natoms_vec"] for item in merged] - else: - input_natoms = [item["natoms"] for item in merged] if stat_file_path is not None: stat_file_path = stat_file_path / "bias_atom_e" if stat_file_path is not None and stat_file_path.is_file(): bias_atom_e = stat_file_path.load_numpy() else: + sampled = merged() + energy = [item["energy"] for item in sampled] + data_mixed_type = "real_natoms_vec" in sampled[0] + if data_mixed_type: + input_natoms = [item["real_natoms_vec"] for item in sampled] + else: + input_natoms = [item["natoms"] for item in sampled] bias_atom_e = compute_output_bias(energy, input_natoms, rcond=self.rcond) if stat_file_path is not None: stat_file_path.save_numpy(bias_atom_e) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 152c69a444..01a8d0ca28 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -38,6 +38,7 @@ from deepmd.pt.utils.dataloader import ( BufferedIterator, get_weighted_sampler, + lazy, ) from deepmd.pt.utils.env import ( DEVICE, @@ -49,10 +50,14 @@ from deepmd.pt.utils.learning_rate import ( 
LearningRateExp, ) +from deepmd.pt.utils.stat import ( + make_stat_input, +) if torch.__version__.startswith("2"): import torch._dynamo + import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import ( @@ -67,7 +72,6 @@ def __init__( self, config: Dict[str, Any], training_data, - sampled=None, stat_file_path=None, validation_data=None, init_model=None, @@ -82,7 +86,15 @@ def __init__( Args: - config: The Dict-like configuration with training options. """ - resume_model = init_model if init_model is not None else restart_model + if init_model is not None: + resume_model = init_model + elif restart_model is not None: + resume_model = restart_model + elif finetune_model is not None: + resume_model = finetune_model + else: + resume_model = None + resuming = resume_model is not None self.restart_training = restart_model is not None model_params = config["model"] training_params = config["training"] @@ -93,8 +105,8 @@ def __init__( self.model_keys = ( list(model_params["model_dict"]) if self.multi_task else ["Default"] ) - if self.multi_task and sampled is None: - sampled = {key: None for key in self.model_keys} + # if self.multi_task and sampled is None: + # sampled = {key: None for key in self.model_keys} self.rank = dist.get_rank() if dist.is_initialized() else 0 self.world_size = dist.get_world_size() if dist.is_initialized() else 1 self.num_model = len(self.model_keys) @@ -184,11 +196,26 @@ def get_data_loader(_training_data, _validation_data, _training_params): valid_numb_batch, ) - def get_single_model(_model_params, _sampled, _stat_file_path): + def get_single_model( + _model_params, _training_data, _validation_data, _stat_file_path + ): model = get_model(deepcopy(_model_params)).to(DEVICE) - if not model_params.get("resuming", False): + _training_data.add_data_requirement(model.data_requirement) + if _validation_data is not None: + _validation_data.add_data_requirement(model.data_requirement) + if not resuming: + + @lazy + def get_sample(): + sampled = make_stat_input( + _training_data.systems, + _training_data.dataloaders, + _model_params.get("data_stat_nbatch", 10), + ) + return sampled + model.compute_or_load_stat( - sampled=_sampled, + sampled_func=get_sample, stat_file_path=_stat_file_path, ) return model @@ -233,6 +260,9 @@ def get_loss(loss_params, start_lr, _ntypes): # Data + Model dp_random.seed(training_params["seed"]) if not self.multi_task: + self.model = get_single_model( + model_params, training_data, validation_data, stat_file_path + ) ( self.training_dataloader, self.training_data, @@ -240,7 +270,6 @@ def get_loss(loss_params, start_lr, _ntypes): self.validation_data, self.valid_numb_batch, ) = get_data_loader(training_data, validation_data, training_params) - self.model = get_single_model(model_params, sampled, stat_file_path) else: ( self.training_dataloader, @@ -251,6 +280,12 @@ def get_loss(loss_params, start_lr, _ntypes): self.model, ) = {}, {}, {}, {}, {}, {} for model_key in self.model_keys: + self.model[model_key] = get_single_model( + model_params["model_dict"][model_key], + training_data[model_key], + validation_data[model_key], + stat_file_path[model_key], + ) ( self.training_dataloader[model_key], self.training_data[model_key], @@ -262,11 +297,6 @@ def get_loss(loss_params, start_lr, _ntypes): validation_data[model_key], training_params["data_dict"][model_key], ) - self.model[model_key] = get_single_model( - model_params["model_dict"][model_key], - sampled[model_key], - stat_file_path[model_key], - 
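                    # note: each model is constructed before its dataloaders so that
                    # add_data_requirement() has already registered the label keys on
                    # the datasets by the time statistics batches are drawn lazily in
                    # compute_or_load_stat().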
) # Learning rate self.warmup_steps = training_params.get("warmup_steps", 0) @@ -309,7 +339,7 @@ def get_loss(loss_params, start_lr, _ntypes): # resuming and finetune optimizer_state_dict = None - if model_params["resuming"]: + if resuming: ntest = model_params.get("data_bias_nsample", 1) origin_model = ( finetune_model if finetune_model is not None else resume_model @@ -404,7 +434,7 @@ def get_loss(loss_params, start_lr, _ntypes): # Multi-task share params if shared_links is not None: - self.wrapper.share_params(shared_links, resume=model_params["resuming"]) + self.wrapper.share_params(shared_links, resume=resuming) if dist.is_initialized(): torch.cuda.set_device(LOCAL_RANK) @@ -812,28 +842,22 @@ def get_data(self, is_train=True, task_key="Default"): batch_data[key] = batch_data[key].to(DEVICE) else: batch_data[key] = [item.to(DEVICE) for item in batch_data[key]] - input_dict = {} - for item in [ + # we may need a better way to classify which are inputs and which are labels + # now wrapper only supports the following inputs: + input_keys = [ "coord", "atype", "box", - ]: - if item in batch_data: - input_dict[item] = batch_data[item] - else: - input_dict[item] = None + "spin", + ] + input_dict = {item_key: None for item_key in input_keys} label_dict = {} - for item in [ - "energy", - "force", - "virial", - "clean_coord", - "clean_type", - "coord_mask", - "type_mask", - ]: - if item in batch_data: - label_dict[item] = batch_data[item] + for item_key in batch_data: + if item_key in input_keys: + input_dict[item_key] = batch_data[item_key] + else: + if item_key not in ["sid", "fid"] and "find_" not in item_key: + label_dict[item_key] = batch_data[item_key] log_dict = {} if "fid" in batch_data: log_dict["fid"] = batch_data["fid"] diff --git a/deepmd/pt/train/wrapper.py b/deepmd/pt/train/wrapper.py index 2207f111a0..ba9cd8c288 100644 --- a/deepmd/pt/train/wrapper.py +++ b/deepmd/pt/train/wrapper.py @@ -159,6 +159,7 @@ def forward( coord, atype, box: Optional[torch.Tensor] = None, + spin: Optional[torch.Tensor] = None, cur_lr: Optional[torch.Tensor] = None, label: Optional[torch.Tensor] = None, task_key: Optional[torch.Tensor] = None, diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index 2125f9cdee..9d30748321 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -147,6 +147,11 @@ def __getitem__(self, idx): batch["sid"] = idx return batch + def add_data_requirement(self, dict_of_keys): + """Add data requirement for each system in multiple systems.""" + for system in self.systems: + system.add_data_requirement(dict_of_keys) + _sentinel = object() QUEUESIZE = 32 @@ -254,3 +259,20 @@ def get_weighted_sampler(training_data, prob_style, sys_prob=False): with torch.device("cpu"): sampler = WeightedRandomSampler(probs, len_sampler, replacement=True) return sampler + + +class LazyFunction: + def __init__(self, func): + self.func = func + self.result = None + self.called = False + + def __call__(self, *args, **kwargs): + if not self.called: + self.result = self.func(*args, **kwargs) + self.called = True + return self.result + + +def lazy(func): + return LazyFunction(func) diff --git a/deepmd/pt/utils/dataset.py b/deepmd/pt/utils/dataset.py index 4619b6417f..9de82778dc 100644 --- a/deepmd/pt/utils/dataset.py +++ b/deepmd/pt/utils/dataset.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later + from torch.utils.data import ( Dataset, ) @@ -27,9 +28,6 @@ def __init__( self._data_system = DeepmdData( sys_path=system, shuffle_test=shuffle, 
type_map=self._type_map ) - self._data_system.add("energy", 1, atomic=False, must=False, high_prec=True) - self._data_system.add("force", 3, atomic=True, must=False, high_prec=False) - self._data_system.add("virial", 9, atomic=False, must=False, high_prec=False) self.mixed_type = self._data_system.mixed_type self._ntypes = self._data_system.get_ntypes() self._natoms = self._data_system.get_natoms() @@ -43,3 +41,18 @@ def __getitem__(self, index): b_data = self._data_system.get_item_torch(index) b_data["natoms"] = self._natoms_vec return b_data + + def add_data_requirement(self, dict_of_keys): + """Add data requirement for this data system.""" + for data_key in dict_of_keys: + self._data_system.add( + data_key, + dict_of_keys[data_key]["ndof"], + atomic=dict_of_keys[data_key].get("atomic", False), + must=dict_of_keys[data_key].get("must", False), + high_prec=dict_of_keys[data_key].get("high_prec", False), + type_sel=dict_of_keys[data_key].get("type_sel", None), + repeat=dict_of_keys[data_key].get("repeat", 1), + default=dict_of_keys[data_key].get("default", 0.0), + dtype=dict_of_keys[data_key].get("dtype", None), + ) diff --git a/deepmd/pt/utils/stat.py b/deepmd/pt/utils/stat.py index 38f71d6994..661653b41e 100644 --- a/deepmd/pt/utils/stat.py +++ b/deepmd/pt/utils/stat.py @@ -4,6 +4,10 @@ import numpy as np import torch +from deepmd.pt.utils.utils import ( + dict_to_device, +) + log = logging.getLogger(__name__) @@ -19,19 +23,9 @@ def make_stat_input(datasets, dataloaders, nbatches): - a list of dicts, each of which contains data from a system """ lst = [] - keys = [ - "coord", - "force", - "energy", - "atype", - "box", - "natoms", - ] - if datasets[0].mixed_type: - keys.append("real_natoms_vec") log.info(f"Packing data for statistics from {len(datasets)} systems") for i in range(len(datasets)): - sys_stat = {key: [] for key in keys} + sys_stat = {} with torch.device("cpu"): iterator = iter(dataloaders[i]) for _ in range(nbatches): @@ -41,20 +35,16 @@ def make_stat_input(datasets, dataloaders, nbatches): iterator = iter(dataloaders[i]) stat_data = next(iterator) for dd in stat_data: - if dd in keys: + if isinstance(stat_data[dd], torch.Tensor): + if dd not in sys_stat: + sys_stat[dd] = [] sys_stat[dd].append(stat_data[dd]) - for key in keys: - if not isinstance(sys_stat[key][0], list): - if sys_stat[key][0] is None: - sys_stat[key] = None - else: - sys_stat[key] = torch.cat(sys_stat[key], dim=0) + for key in sys_stat: + if sys_stat[key][0] is None: + sys_stat[key] = None else: - sys_stat_list = [] - for ii, _ in enumerate(sys_stat[key][0]): - tmp_stat = [x[ii] for x in sys_stat[key]] - sys_stat_list.append(torch.cat(tmp_stat, dim=0)) - sys_stat[key] = sys_stat_list + sys_stat[key] = torch.cat(sys_stat[key], dim=0) + dict_to_device(sys_stat) lst.append(sys_stat) return lst diff --git a/source/tests/pt/model/test_dipole_fitting.py b/source/tests/pt/model/test_dipole_fitting.py index fcdd408726..83054f1042 100644 --- a/source/tests/pt/model/test_dipole_fitting.py +++ b/source/tests/pt/model/test_dipole_fitting.py @@ -79,7 +79,6 @@ def test_consistency( [0, 4], ): ft0 = DipoleFittingNet( - "foo", self.nt, self.dd0.dim_out, embedding_width=self.dd0.get_dim_emb(), @@ -132,7 +131,6 @@ def test_jit( [0, 4], ): ft0 = DipoleFittingNet( - "foo", self.nt, self.dd0.dim_out, embedding_width=self.dd0.get_dim_emb(), @@ -168,7 +166,6 @@ def test_rot(self): [0, 4], ): ft0 = DipoleFittingNet( - "foo", 3, # ntype self.dd0.dim_out, # dim_descrpt embedding_width=self.dd0.get_dim_emb(), @@ -218,7 +215,6 @@ def 
test_rot(self): def test_permu(self): coord = torch.matmul(self.coord, self.cell) ft0 = DipoleFittingNet( - "foo", 3, # ntype self.dd0.dim_out, embedding_width=self.dd0.get_dim_emb(), @@ -260,7 +256,6 @@ def test_trans(self): self.cell, ) ft0 = DipoleFittingNet( - "foo", 3, # ntype self.dd0.dim_out, embedding_width=self.dd0.get_dim_emb(), @@ -305,7 +300,6 @@ def setUp(self): self.atype = torch.IntTensor([0, 0, 0, 1, 1], device="cpu").to(env.DEVICE) self.dd0 = DescrptSeA(self.rcut, self.rcut_smth, self.sel).to(env.DEVICE) self.ft0 = DipoleFittingNet( - "dipole", self.nt, self.dd0.dim_out, embedding_width=self.dd0.get_dim_emb(), diff --git a/source/tests/pt/test_stat.py b/source/tests/pt/test_stat.py index 1e3c707d6f..7ef2d85e18 100644 --- a/source/tests/pt/test_stat.py +++ b/source/tests/pt/test_stat.py @@ -176,8 +176,6 @@ def test_descriptor(self): for sys in sampled: for key in [ "coord", - "force", - "energy", "atype", "natoms", "box", From c9eb767581a9de7f9837de918b53afa378bf9c8c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Feb 2024 12:44:16 +0000 Subject: [PATCH 04/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt/model/task/dipole.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/deepmd/pt/model/task/dipole.py b/deepmd/pt/model/task/dipole.py index 000600cdf2..68ce09a080 100644 --- a/deepmd/pt/model/task/dipole.py +++ b/deepmd/pt/model/task/dipole.py @@ -134,11 +134,9 @@ def output_def(self) -> FittingOutputDef: ] ) - def compute_output_stats(self, merged, stat_file_path: Optional[DPPath] = None): raise NotImplementedError - def forward( self, descriptor: torch.Tensor, From 00105c7d548e297724d223d5667245d0e84102bf Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Wed, 28 Feb 2024 20:47:21 +0800 Subject: [PATCH 05/37] Update make_base_descriptor.py --- deepmd/dpmodel/descriptor/make_base_descriptor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py b/deepmd/dpmodel/descriptor/make_base_descriptor.py index fe911551d5..0f55916111 100644 --- a/deepmd/dpmodel/descriptor/make_base_descriptor.py +++ b/deepmd/dpmodel/descriptor/make_base_descriptor.py @@ -4,8 +4,10 @@ abstractmethod, ) from typing import ( + Callable, List, Optional, + Union, ) from deepmd.common import ( @@ -84,7 +86,9 @@ def mixed_types(self) -> bool: """ pass - def compute_input_stats(self, merged: callable, path: Optional[DPPath] = None): + def compute_input_stats( + self, merged: Union[Callable, List], path: Optional[DPPath] = None + ): """Update mean and stddev for descriptor elements.""" raise NotImplementedError From 5a9df83fbf4c60f09706fe7fbc7e3553650cf5ad Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Wed, 28 Feb 2024 20:49:41 +0800 Subject: [PATCH 06/37] Update typing --- deepmd/dpmodel/descriptor/make_base_descriptor.py | 2 +- deepmd/pt/model/descriptor/descriptor.py | 2 +- deepmd/pt/model/descriptor/dpa1.py | 2 +- deepmd/pt/model/descriptor/dpa2.py | 2 +- deepmd/pt/model/descriptor/hybrid.py | 2 +- deepmd/pt/model/descriptor/repformers.py | 2 +- deepmd/pt/model/descriptor/se_a.py | 6 ++++-- deepmd/pt/model/descriptor/se_atten.py | 2 +- deepmd/pt/model/descriptor/se_r.py | 2 +- 9 files changed, 12 insertions(+), 10 deletions(-) diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py 
b/deepmd/dpmodel/descriptor/make_base_descriptor.py index 0f55916111..ab4c206fdf 100644 --- a/deepmd/dpmodel/descriptor/make_base_descriptor.py +++ b/deepmd/dpmodel/descriptor/make_base_descriptor.py @@ -87,7 +87,7 @@ def mixed_types(self) -> bool: pass def compute_input_stats( - self, merged: Union[Callable, List], path: Optional[DPPath] = None + self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): """Update mean and stddev for descriptor elements.""" raise NotImplementedError diff --git a/deepmd/pt/model/descriptor/descriptor.py b/deepmd/pt/model/descriptor/descriptor.py index d400e42f75..02cd657c7c 100644 --- a/deepmd/pt/model/descriptor/descriptor.py +++ b/deepmd/pt/model/descriptor/descriptor.py @@ -89,7 +89,7 @@ def get_dim_emb(self) -> int: pass def compute_input_stats( - self, merged: Union[Callable, List], path: Optional[DPPath] = None + self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): """Update mean and stddev for DescriptorBlock elements.""" raise NotImplementedError diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py index 3e71ea4746..1d8da3469f 100644 --- a/deepmd/pt/model/descriptor/dpa1.py +++ b/deepmd/pt/model/descriptor/dpa1.py @@ -131,7 +131,7 @@ def dim_emb(self): return self.get_dim_emb() def compute_input_stats( - self, merged: Union[Callable, List], path: Optional[DPPath] = None + self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): return self.se_atten.compute_input_stats(merged, path) diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index 49c3f76631..ee86d81fae 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -298,7 +298,7 @@ def dim_emb(self): return self.get_dim_emb() def compute_input_stats( - self, merged: Union[Callable, List], path: Optional[DPPath] = None + self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): for ii, descrpt in enumerate([self.repinit, self.repformers]): descrpt.compute_input_stats(merged, path) diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py index df45217949..a761204d64 100644 --- a/deepmd/pt/model/descriptor/hybrid.py +++ b/deepmd/pt/model/descriptor/hybrid.py @@ -160,7 +160,7 @@ def share_params(self, base_class, shared_level, resume=False): raise NotImplementedError def compute_input_stats( - self, merged: Union[Callable, List], path: Optional[DPPath] = None + self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): """Update mean and stddev for descriptor elements.""" for ii, descrpt in enumerate(self.descriptor_list): diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py index 000bed2cb1..174daf14af 100644 --- a/deepmd/pt/model/descriptor/repformers.py +++ b/deepmd/pt/model/descriptor/repformers.py @@ -281,7 +281,7 @@ def forward( return g1, g2, h2, rot_mat.view(-1, nloc, self.dim_emb, 3), sw def compute_input_stats( - self, merged: Union[Callable, List], path: Optional[DPPath] = None + self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py index ca2c5ef5c2..5843534e04 100644 --- a/deepmd/pt/model/descriptor/se_a.py +++ b/deepmd/pt/model/descriptor/se_a.py @@ -131,7 +131,9 @@ def dim_out(self): """Returns the output dimension of this 
descriptor.""" return self.sea.dim_out - def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None): + def compute_input_stats( + self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None + ): """Update mean and stddev for descriptor elements.""" return self.sea.compute_input_stats(merged, path) @@ -390,7 +392,7 @@ def __getitem__(self, key): raise KeyError(key) def compute_input_stats( - self, merged: Union[Callable, List], path: Optional[DPPath] = None + self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) diff --git a/deepmd/pt/model/descriptor/se_atten.py b/deepmd/pt/model/descriptor/se_atten.py index 60612f6046..a056fbe889 100644 --- a/deepmd/pt/model/descriptor/se_atten.py +++ b/deepmd/pt/model/descriptor/se_atten.py @@ -203,7 +203,7 @@ def dim_emb(self): return self.get_dim_emb() def compute_input_stats( - self, merged: Union[Callable, List], path: Optional[DPPath] = None + self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) diff --git a/deepmd/pt/model/descriptor/se_r.py b/deepmd/pt/model/descriptor/se_r.py index e8a2483da6..e3832b16e4 100644 --- a/deepmd/pt/model/descriptor/se_r.py +++ b/deepmd/pt/model/descriptor/se_r.py @@ -151,7 +151,7 @@ def mixed_types(self) -> bool: return False def compute_input_stats( - self, merged: Union[Callable, List], path: Optional[DPPath] = None + self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) From 75da5b1f8f6bb24ba38a4616d3ba952a52a035c9 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Wed, 28 Feb 2024 20:54:39 +0800 Subject: [PATCH 07/37] Update training.py --- deepmd/pt/train/training.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 01a8d0ca28..a7f0d9956a 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -105,8 +105,6 @@ def __init__( self.model_keys = ( list(model_params["model_dict"]) if self.multi_task else ["Default"] ) - # if self.multi_task and sampled is None: - # sampled = {key: None for key in self.model_keys} self.rank = dist.get_rank() if dist.is_initialized() else 0 self.world_size = dist.get_world_size() if dist.is_initialized() else 1 self.num_model = len(self.model_keys) From 6c171c5614ecc8508b63eca2632ac085a715e403 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Wed, 28 Feb 2024 21:48:01 +0800 Subject: [PATCH 08/37] Fix uts --- deepmd/dpmodel/model/dp_model.py | 4 ++- deepmd/pt/model/task/dipole.py | 10 ++++-- deepmd/pt/model/task/ener.py | 14 ++++++-- deepmd/pt/model/task/polarizability.py | 14 ++++++-- source/tests/pt/model/test_descriptor.py | 4 +++ source/tests/pt/model/test_dipole_fitting.py | 14 ++++---- source/tests/pt/model/test_embedding_net.py | 5 +++ .../pt/model/test_polarizability_fitting.py | 23 +++++------- source/tests/pt/test_loss.py | 4 +++ source/tests/pt/test_stat.py | 35 +++++++++++++++++++ 10 files changed, 99 insertions(+), 28 deletions(-) diff --git a/deepmd/dpmodel/model/dp_model.py b/deepmd/dpmodel/model/dp_model.py index 804ce51dfd..705750414b 100644 --- a/deepmd/dpmodel/model/dp_model.py +++ b/deepmd/dpmodel/model/dp_model.py @@ -14,4 +14,6 @@ # use "class" to resolve 
"Variable not allowed in type expression" @BaseModel.register("standard") class DPModel(make_model(DPAtomicModel), BaseModel): - pass + def data_requirement(self) -> dict: + """Get the data requirement for the model.""" + raise NotImplementedError diff --git a/deepmd/pt/model/task/dipole.py b/deepmd/pt/model/task/dipole.py index 68ce09a080..08a3673a8c 100644 --- a/deepmd/pt/model/task/dipole.py +++ b/deepmd/pt/model/task/dipole.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Callable, List, Optional, + Union, ) import torch @@ -91,7 +93,7 @@ def __init__( self.r_differentiable = r_differentiable self.c_differentiable = c_differentiable super().__init__( - var_name="dipole", + var_name="dipole" if "var_name" not in kwargs else kwargs.pop("var_name"), ntypes=ntypes, dim_descrpt=dim_descrpt, neuron=neuron, @@ -134,7 +136,11 @@ def output_def(self) -> FittingOutputDef: ] ) - def compute_output_stats(self, merged, stat_file_path: Optional[DPPath] = None): + def compute_output_stats( + self, + merged: Union[Callable, List[dict]], + stat_file_path: Optional[DPPath] = None, + ): raise NotImplementedError def forward( diff --git a/deepmd/pt/model/task/ener.py b/deepmd/pt/model/task/ener.py index ed9d517763..55ee79db25 100644 --- a/deepmd/pt/model/task/ener.py +++ b/deepmd/pt/model/task/ener.py @@ -2,9 +2,11 @@ import copy import logging from typing import ( + Callable, List, Optional, Tuple, + Union, ) import numpy as np @@ -138,13 +140,21 @@ def serialize(self) -> dict: data["atom_ener"] = self.atom_ener return data - def compute_output_stats(self, merged, stat_file_path: Optional[DPPath] = None): + def compute_output_stats( + self, + merged: Union[Callable, List[dict]], + stat_file_path: Optional[DPPath] = None, + ): if stat_file_path is not None: stat_file_path = stat_file_path / "bias_atom_e" if stat_file_path is not None and stat_file_path.is_file(): bias_atom_e = stat_file_path.load_numpy() else: - sampled = merged() + if callable(merged): + # only get data for once + sampled = merged() + else: + sampled = merged energy = [item["energy"] for item in sampled] data_mixed_type = "real_natoms_vec" in sampled[0] if data_mixed_type: diff --git a/deepmd/pt/model/task/polarizability.py b/deepmd/pt/model/task/polarizability.py index 1bc4798c48..0fe817084e 100644 --- a/deepmd/pt/model/task/polarizability.py +++ b/deepmd/pt/model/task/polarizability.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Callable, List, Optional, Union, @@ -24,6 +25,9 @@ from deepmd.pt.utils.utils import ( to_numpy_array, ) +from deepmd.utils.path import ( + DPPath, +) log = logging.getLogger(__name__) @@ -72,7 +76,6 @@ class PolarFittingNet(GeneralFitting): def __init__( self, - var_name: str, ntypes: int, dim_descrpt: int, embedding_width: int, @@ -112,7 +115,7 @@ def __init__( ).view(ntypes, 1) self.shift_diag = shift_diag super().__init__( - var_name=var_name, + var_name="polar" if "var_name" not in kwargs else kwargs.pop("var_name"), ntypes=ntypes, dim_descrpt=dim_descrpt, neuron=neuron, @@ -160,6 +163,13 @@ def output_def(self) -> FittingOutputDef: ] ) + def compute_output_stats( + self, + merged: Union[Callable, List[dict]], + stat_file_path: Optional[DPPath] = None, + ): + raise NotImplementedError + def forward( self, descriptor: torch.Tensor, diff --git a/source/tests/pt/model/test_descriptor.py b/source/tests/pt/model/test_descriptor.py index ffad27201a..7d21d1c13d 100644 --- 
a/source/tests/pt/model/test_descriptor.py +++ b/source/tests/pt/model/test_descriptor.py @@ -38,6 +38,9 @@ op_module, ) +from ..test_stat import ( + energy_data_requirement, +) from .test_embedding_net import ( get_single_batch, ) @@ -114,6 +117,7 @@ def setUp(self): self.systems[0], model_config["type_map"], ) + ds.add_data_requirement(energy_data_requirement) self.np_batch, self.pt_batch = get_single_batch(ds) self.sec = np.cumsum(self.sel) self.ntypes = len(self.sel) diff --git a/source/tests/pt/model/test_dipole_fitting.py b/source/tests/pt/model/test_dipole_fitting.py index 83054f1042..fa4be9171c 100644 --- a/source/tests/pt/model/test_dipole_fitting.py +++ b/source/tests/pt/model/test_dipole_fitting.py @@ -114,12 +114,12 @@ def test_consistency( ) ret2 = ft2(rd0, atype, gr, fparam=ifp, aparam=iap) np.testing.assert_allclose( - to_numpy_array(ret0["foo"]), - ret1["foo"], + to_numpy_array(ret0["dipole"]), + ret1["dipole"], ) np.testing.assert_allclose( - to_numpy_array(ret0["foo"]), - to_numpy_array(ret2["foo"]), + to_numpy_array(ret0["dipole"]), + to_numpy_array(ret2["dipole"]), ) def test_jit( @@ -206,7 +206,7 @@ def test_rot(self): ) ret0 = ft0(rd0, extended_atype, gr0, fparam=ifp, aparam=iap) - res.append(ret0["foo"]) + res.append(ret0["dipole"]) np.testing.assert_allclose( to_numpy_array(res[1]), to_numpy_array(torch.matmul(res[0], rmat)) @@ -241,7 +241,7 @@ def test_permu(self): ) ret0 = ft0(rd0, extended_atype, gr0, fparam=0, aparam=0) - res.append(ret0["foo"]) + res.append(ret0["dipole"]) np.testing.assert_allclose( to_numpy_array(res[0][:, idx_perm]), to_numpy_array(res[1]) @@ -281,7 +281,7 @@ def test_trans(self): ) ret0 = ft0(rd0, extended_atype, gr0, fparam=0, aparam=0) - res.append(ret0["foo"]) + res.append(ret0["dipole"]) np.testing.assert_allclose(to_numpy_array(res[0]), to_numpy_array(res[1])) diff --git a/source/tests/pt/model/test_embedding_net.py b/source/tests/pt/model/test_embedding_net.py index 87e8a97444..a1895718dd 100644 --- a/source/tests/pt/model/test_embedding_net.py +++ b/source/tests/pt/model/test_embedding_net.py @@ -39,6 +39,10 @@ ) from deepmd.tf.descriptor import DescrptSeA as DescrptSeA_tf +from ..test_stat import ( + energy_data_requirement, +) + CUR_DIR = os.path.dirname(__file__) @@ -128,6 +132,7 @@ def setUp(self): self.systems[0], model_config["type_map"], ) + ds.add_data_requirement(energy_data_requirement) self.filter_neuron = model_config["descriptor"]["neuron"] self.axis_neuron = model_config["descriptor"]["axis_neuron"] self.np_batch, self.torch_batch = get_single_batch(ds) diff --git a/source/tests/pt/model/test_polarizability_fitting.py b/source/tests/pt/model/test_polarizability_fitting.py index f76a9e28ac..3b55f8bc05 100644 --- a/source/tests/pt/model/test_polarizability_fitting.py +++ b/source/tests/pt/model/test_polarizability_fitting.py @@ -67,7 +67,6 @@ def test_consistency( [None, self.scale], ): ft0 = PolarFittingNet( - "foo", self.nt, self.dd0.dim_out, embedding_width=self.dd0.get_dim_emb(), @@ -113,16 +112,16 @@ def test_consistency( aparam=to_numpy_array(iap), ) np.testing.assert_allclose( - to_numpy_array(ret0["foo"]), - ret1["foo"], + to_numpy_array(ret0["polar"]), + ret1["polar"], ) np.testing.assert_allclose( - to_numpy_array(ret0["foo"]), - to_numpy_array(ret2["foo"]), + to_numpy_array(ret0["polar"]), + to_numpy_array(ret2["polar"]), ) np.testing.assert_allclose( - to_numpy_array(ret0["foo"]), - ret3["foo"], + to_numpy_array(ret0["polar"]), + ret3["polar"], ) def test_jit( @@ -135,7 +134,6 @@ def test_jit( [True, False], 
): ft0 = PolarFittingNet( - "foo", self.nt, self.dd0.dim_out, embedding_width=self.dd0.get_dim_emb(), @@ -177,7 +175,6 @@ def test_rot(self): [None, self.scale], ): ft0 = PolarFittingNet( - "foo", self.nt, self.dd0.dim_out, # dim_descrpt embedding_width=self.dd0.get_dim_emb(), @@ -220,7 +217,7 @@ def test_rot(self): ) ret0 = ft0(rd0, extended_atype, gr0, fparam=ifp, aparam=iap) - res.append(ret0["foo"]) + res.append(ret0["polar"]) np.testing.assert_allclose( to_numpy_array(res[1]), to_numpy_array( @@ -235,7 +232,6 @@ def test_permu(self): coord = torch.matmul(self.coord, self.cell) for fit_diag, scale in itertools.product([True, False], [None, self.scale]): ft0 = PolarFittingNet( - "foo", self.nt, self.dd0.dim_out, embedding_width=self.dd0.get_dim_emb(), @@ -264,7 +260,7 @@ def test_permu(self): ) ret0 = ft0(rd0, extended_atype, gr0, fparam=None, aparam=None) - res.append(ret0["foo"]) + res.append(ret0["polar"]) np.testing.assert_allclose( to_numpy_array(res[0][:, idx_perm]), @@ -281,7 +277,6 @@ def test_trans(self): ) for fit_diag, scale in itertools.product([True, False], [None, self.scale]): ft0 = PolarFittingNet( - "foo", self.nt, self.dd0.dim_out, embedding_width=self.dd0.get_dim_emb(), @@ -309,7 +304,7 @@ def test_trans(self): ) ret0 = ft0(rd0, extended_atype, gr0, fparam=0, aparam=0) - res.append(ret0["foo"]) + res.append(ret0["polar"]) np.testing.assert_allclose(to_numpy_array(res[0]), to_numpy_array(res[1])) diff --git a/source/tests/pt/test_loss.py b/source/tests/pt/test_loss.py index e117c7f05a..484d62a3ad 100644 --- a/source/tests/pt/test_loss.py +++ b/source/tests/pt/test_loss.py @@ -28,6 +28,9 @@ from .model.test_embedding_net import ( get_single_batch, ) +from .test_stat import ( + energy_data_requirement, +) CUR_DIR = os.path.dirname(__file__) @@ -47,6 +50,7 @@ def get_batch(): if isinstance(systems, str): systems = expand_sys_str(systems) dataset = DeepmdDataSetForLoader(systems[0], model_config["type_map"]) + dataset.add_data_requirement(energy_data_requirement) np_batch, pt_batch = get_single_batch(dataset) return np_batch, pt_batch diff --git a/source/tests/pt/test_stat.py b/source/tests/pt/test_stat.py index 318b2e042f..54810fcc8f 100644 --- a/source/tests/pt/test_stat.py +++ b/source/tests/pt/test_stat.py @@ -47,6 +47,40 @@ CUR_DIR = os.path.dirname(__file__) +energy_data_requirement = { + "energy": { + "ndof": 1, + "atomic": False, + "must": False, + "high_prec": True, + }, + "force": { + "ndof": 3, + "atomic": True, + "must": False, + "high_prec": False, + }, + "virial": { + "ndof": 9, + "atomic": False, + "must": False, + "high_prec": False, + }, + "atom_ener": { + "ndof": 1, + "atomic": True, + "must": False, + "high_prec": False, + }, + "atom_pref": { + "ndof": 1, + "atomic": True, + "must": False, + "high_prec": False, + "repeat": 3, + }, +} + def compare(ut, base, given): if isinstance(base, list): @@ -111,6 +145,7 @@ def setUp(self): self.filter_neuron = model_config["descriptor"]["neuron"] self.axis_neuron = model_config["descriptor"]["axis_neuron"] self.n_neuron = model_config["fitting_net"]["neuron"] + self.my_dataset.add_data_requirement(energy_data_requirement) self.my_sampled = my_make( self.my_dataset.systems, self.my_dataset.dataloaders, self.data_stat_nbatch From 2e87e1d11150cb164f1bc2b08e86e2ca5954ce08 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Wed, 28 Feb 2024 22:47:41 +0800 Subject: [PATCH 09/37] Fix uts --- deepmd/pt/model/model/model.py | 4 ++++ source/tests/pt/model/test_model.py | 5 +++++ 
source/tests/pt/model/test_polarizability_fitting.py | 1 - 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/deepmd/pt/model/model/model.py b/deepmd/pt/model/model/model.py index e32d2f307d..0e2afadd14 100644 --- a/deepmd/pt/model/model/model.py +++ b/deepmd/pt/model/model/model.py @@ -84,3 +84,7 @@ def compute_or_load_stat( The path to the statistics files. """ raise NotImplementedError + + def data_requirement(self) -> dict: + """Get the data requirement for the model.""" + raise NotImplementedError diff --git a/source/tests/pt/model/test_model.py b/source/tests/pt/model/test_model.py index d8c7de39c3..69ec88f5d7 100644 --- a/source/tests/pt/model/test_model.py +++ b/source/tests/pt/model/test_model.py @@ -51,6 +51,10 @@ LearningRateExp, ) +from ..test_stat import ( + energy_data_requirement, +) + VariableState = collections.namedtuple("VariableState", ["value", "gradient"]) @@ -281,6 +285,7 @@ def test_consistency(self): "type_map": self.type_map, }, ) + my_ds.add_data_requirement(energy_data_requirement) my_model = get_model( model_params={ "descriptor": { diff --git a/source/tests/pt/model/test_polarizability_fitting.py b/source/tests/pt/model/test_polarizability_fitting.py index 3b55f8bc05..b1a5e3f730 100644 --- a/source/tests/pt/model/test_polarizability_fitting.py +++ b/source/tests/pt/model/test_polarizability_fitting.py @@ -323,7 +323,6 @@ def setUp(self): self.atype = torch.IntTensor([0, 0, 0, 1, 1], device="cpu") self.dd0 = DescrptSeA(self.rcut, self.rcut_smth, self.sel).to(env.DEVICE) self.ft0 = PolarFittingNet( - "polar", self.nt, self.dd0.dim_out, embedding_width=self.dd0.get_dim_emb(), From 2618d988aa9cf35f14e2259a84e99675e71c0da7 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 01:08:46 +0800 Subject: [PATCH 10/37] Support multi-task training --- .../descriptor/make_base_descriptor.py | 5 + deepmd/dpmodel/descriptor/se_e2_a.py | 4 + deepmd/dpmodel/descriptor/se_r.py | 4 + deepmd/pt/model/descriptor/__init__.py | 4 + deepmd/pt/model/descriptor/dpa1.py | 18 ++ deepmd/pt/model/descriptor/dpa2.py | 35 ++++ deepmd/pt/model/descriptor/hybrid.py | 2 - deepmd/pt/model/descriptor/se_a.py | 13 ++ deepmd/pt/model/task/fitting.py | 12 -- deepmd/pt/train/wrapper.py | 4 +- deepmd/pt/utils/multi_task.py | 101 ++++++---- source/tests/pt/model/water/multitask.json | 139 ++++++++++++++ source/tests/pt/test_multitask.py | 173 ++++++++++++++++++ source/tests/pt/test_training.py | 9 - 14 files changed, 462 insertions(+), 61 deletions(-) create mode 100644 source/tests/pt/model/water/multitask.json create mode 100644 source/tests/pt/test_multitask.py diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py b/deepmd/dpmodel/descriptor/make_base_descriptor.py index ab4c206fdf..db0611b184 100644 --- a/deepmd/dpmodel/descriptor/make_base_descriptor.py +++ b/deepmd/dpmodel/descriptor/make_base_descriptor.py @@ -86,6 +86,11 @@ def mixed_types(self) -> bool: """ pass + @abstractmethod + def share_params(self, base_class, shared_level, resume=False): + """Share the parameters of self to the base_class with shared_level.""" + pass + def compute_input_stats( self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): diff --git a/deepmd/dpmodel/descriptor/se_e2_a.py b/deepmd/dpmodel/descriptor/se_e2_a.py index b102933ac9..45dff13129 100644 --- a/deepmd/dpmodel/descriptor/se_e2_a.py +++ b/deepmd/dpmodel/descriptor/se_e2_a.py @@ -240,6 +240,10 @@ def mixed_types(self): """ return False + def share_params(self, base_class, 
shared_level, resume=False): + """Share the parameters of self to the base_class with shared_level.""" + raise NotImplementedError + def get_ntypes(self) -> int: """Returns the number of element types.""" return self.ntypes diff --git a/deepmd/dpmodel/descriptor/se_r.py b/deepmd/dpmodel/descriptor/se_r.py index 5973c55353..f2f60ca6c1 100644 --- a/deepmd/dpmodel/descriptor/se_r.py +++ b/deepmd/dpmodel/descriptor/se_r.py @@ -200,6 +200,10 @@ def mixed_types(self): """ return False + def share_params(self, base_class, shared_level, resume=False): + """Share the parameters of self to the base_class with shared_level.""" + raise NotImplementedError + def get_ntypes(self) -> int: """Returns the number of element types.""" return self.ntypes diff --git a/deepmd/pt/model/descriptor/__init__.py b/deepmd/pt/model/descriptor/__init__.py index 5fd644f149..4796357faa 100644 --- a/deepmd/pt/model/descriptor/__init__.py +++ b/deepmd/pt/model/descriptor/__init__.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from .base_descriptor import ( + BaseDescriptor, +) from .descriptor import ( DescriptorBlock, make_default_type_embedding, @@ -31,6 +34,7 @@ ) __all__ = [ + "BaseDescriptor", "DescriptorBlock", "make_default_type_embedding", "DescrptBlockSeA", diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py index 1d8da3469f..c64f1e7f9a 100644 --- a/deepmd/pt/model/descriptor/dpa1.py +++ b/deepmd/pt/model/descriptor/dpa1.py @@ -122,6 +122,24 @@ def mixed_types(self) -> bool: """ return self.se_atten.mixed_types() + def share_params(self, base_class, shared_level, resume=False): + assert ( + self.__class__ == base_class.__class__ + ), "Only descriptors of the same type can share params!" + # For DPA1 descriptors, the user-defined share-level + # shared_level: 0 + # share all parameters in both type_embedding and se_atten + if shared_level == 0: + self._modules["type_embedding"] = base_class._modules["type_embedding"] + self.se_atten.share_params(base_class.se_atten, 0, resume=resume) + # shared_level: 1 + # share all parameters in type_embedding + elif shared_level == 1: + self._modules["type_embedding"] = base_class._modules["type_embedding"] + # Other shared levels + else: + raise NotImplementedError + @property def dim_out(self): return self.get_dim_out() diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index ee86d81fae..69269b1b56 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -288,6 +288,41 @@ def mixed_types(self) -> bool: """ return True + def share_params(self, base_class, shared_level, resume=False): + assert ( + self.__class__ == base_class.__class__ + ), "Only descriptors of the same type can share params!" 
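The share levels above are always driven from the follower model's side. A minimal, hedged sketch (the two `DescrptDPA1` objects and their common config are assumed to exist; only `share_params` and the `type_embedding` link come from this patch):

    # `base` and `other` are two DescrptDPA1 instances built from the same config
    other.share_params(base, shared_level=1)
    # shared_level 1 links only the type embedding, so both descriptors now
    # hold the very same submodule object:
    assert other._modules["type_embedding"] is base._modules["type_embedding"]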
+ # For DPA2 descriptors, the user-defined share-level + # shared_level: 0 + # share all parameters in type_embedding, repinit and repformers + if shared_level == 0: + self._modules["type_embedding"] = base_class._modules["type_embedding"] + self.repinit.share_params(base_class.repinit, 0, resume=resume) + self._modules["g1_shape_tranform"] = base_class._modules[ + "g1_shape_tranform" + ] + self.repformers.share_params(base_class.repformers, 0, resume=resume) + # shared_level: 1 + # share all parameters in type_embedding and repinit + elif shared_level == 1: + self._modules["type_embedding"] = base_class._modules["type_embedding"] + self.repinit.share_params(base_class.repinit, 0, resume=resume) + # shared_level: 2 + # share all parameters in type_embedding and repformers + elif shared_level == 2: + self._modules["type_embedding"] = base_class._modules["type_embedding"] + self._modules["g1_shape_tranform"] = base_class._modules[ + "g1_shape_tranform" + ] + self.repformers.share_params(base_class.repformers, 0, resume=resume) + # shared_level: 3 + # share all parameters in type_embedding + elif shared_level == 3: + self._modules["type_embedding"] = base_class._modules["type_embedding"] + # Other shared levels + else: + raise NotImplementedError + @property def dim_out(self): return self.get_dim_out() diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py index a761204d64..872f642ef5 100644 --- a/deepmd/pt/model/descriptor/hybrid.py +++ b/deepmd/pt/model/descriptor/hybrid.py @@ -154,8 +154,6 @@ def share_params(self, base_class, shared_level, resume=False): self.descriptor_list[ii].share_params( base_class.descriptor_list[ii], shared_level, resume=resume ) - if self.hybrid_mode == "sequential": - self.sequential_transform = base_class.sequential_transform else: raise NotImplementedError diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py index 5843534e04..355c917dcf 100644 --- a/deepmd/pt/model/descriptor/se_a.py +++ b/deepmd/pt/model/descriptor/se_a.py @@ -126,6 +126,19 @@ def mixed_types(self): """ return self.sea.mixed_types() + def share_params(self, base_class, shared_level, resume=False): + assert ( + self.__class__ == base_class.__class__ + ), "Only descriptors of the same type can share params!" + # For SeA descriptors, the user-defined share-level + # shared_level: 0 + # share all parameters in sea + if shared_level == 0: + self.sea.share_params(base_class.sea, 0, resume=resume) + # Other shared levels + else: + raise NotImplementedError + @property def dim_out(self): """Returns the output dimension of this descriptor.""" diff --git a/deepmd/pt/model/task/fitting.py b/deepmd/pt/model/task/fitting.py index 20876d9be7..a964f0222d 100644 --- a/deepmd/pt/model/task/fitting.py +++ b/deepmd/pt/model/task/fitting.py @@ -77,18 +77,6 @@ def share_params(self, base_class, shared_level, resume=False): # the following will successfully link all the params except buffers, which need manually link. for item in self._modules: self._modules[item] = base_class._modules[item] - elif shared_level == 2: - # share all the layers before final layer - # the following will successfully link all the params except buffers, which need manually link. - self._modules["filter_layers"][0].deep_layers = base_class._modules[ - "filter_layers" - ][0].deep_layers - elif shared_level == 3: - # share the first layers - # the following will successfully link all the params except buffers, which need manually link. 
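A side note on the linking idiom these `share_params` implementations rely on: re-registering the same submodule in another module's `_modules` makes the parameters genuinely shared. A toy, pure-PyTorch illustration (not deepmd code):

    import torch

    shared = torch.nn.Linear(4, 4)
    a = torch.nn.Module()
    b = torch.nn.Module()
    a._modules["net"] = shared  # register the same object under both holders
    b._modules["net"] = shared
    # a single underlying tensor: an optimizer step through a.net also moves b.net
    assert a.net.weight is b.net.weight

Buffers such as the mean/stddev statistics are not covered by a loop over `_modules`, which is why they are linked by hand in the descriptor implementations.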
- self._modules["filter_layers"][0].deep_layers[0] = base_class._modules[ - "filter_layers" - ][0].deep_layers[0] else: raise NotImplementedError diff --git a/deepmd/pt/train/wrapper.py b/deepmd/pt/train/wrapper.py index 49619b19ea..52cc636c10 100644 --- a/deepmd/pt/train/wrapper.py +++ b/deepmd/pt/train/wrapper.py @@ -61,7 +61,7 @@ def __init__( self.inference_only = self.loss is None def set_trainable_params(self): - supported_types = ["type_embedding", "descriptor", "fitting_net"] + supported_types = ["descriptor", "fitting_net"] for model_item in self.model: for net_type in supported_types: trainable = True @@ -83,7 +83,7 @@ def set_trainable_params(self): param.requires_grad = trainable def share_params(self, shared_links, resume=False): - supported_types = ["type_embedding", "descriptor", "fitting_net"] + supported_types = ["descriptor", "fitting_net"] for shared_item in shared_links: class_name = shared_links[shared_item]["type"] shared_base = shared_links[shared_item]["links"][0] diff --git a/deepmd/pt/utils/multi_task.py b/deepmd/pt/utils/multi_task.py index f97a826b03..ae3933a101 100644 --- a/deepmd/pt/utils/multi_task.py +++ b/deepmd/pt/utils/multi_task.py @@ -4,17 +4,10 @@ ) from deepmd.pt.model.descriptor import ( - DescrptDPA1, - DescrptDPA2, - DescrptSeA, -) -from deepmd.pt.model.network.network import ( - TypeEmbedNet, + BaseDescriptor, ) from deepmd.pt.model.task import ( - EnergyFittingNet, - EnergyFittingNetDirect, - FittingNetAttenLcc, + BaseFitting, ) @@ -37,9 +30,68 @@ def preprocess_shared_params(model_config): - "shared_level": Shared level (int) of this item in this model. Lower for more params to share, 0 means to share all params in this item. This list are sorted by "shared_level". + For example, if one has `model_config` like this: + "model": { + "shared_dict": { + "my_type_map": ["foo", "bar"], + "my_des1": { + "type": "se_e2_a", + "neuron": [10, 20, 40] + }, + }, + "model_dict": { + "model_1": { + "type_map": "my_type_map", + "descriptor": "my_des1", + "fitting_net": { + "neuron": [100, 100, 100] + } + }, + "model_2": { + "type_map": "my_type_map", + "descriptor": "my_des1", + "fitting_net": { + "neuron": [100, 100, 100] + } + } + "model_3": { + "type_map": "my_type_map", + "descriptor": "my_des1:1", + "fitting_net": { + "neuron": [100, 100, 100] + } + } + } + } + The above config will init three model branches named `model_1` and `model_2` and `model_3`, + in which: + - `model_2` and `model_3` will have the same `type_map` as that in `model_1`. + - `model_2` will share all the parameters of `descriptor` with `model_1`, + while `model_3` will share part of parameters of `descriptor` with `model_1` + on human-defined share-level `1` (default is `0`, meaning share all the parameters). + - `model_1`, `model_2` and `model_3` have three different `fitting_net`s. + The returned `model_config` will automatically fulfill the input `model_config` as if there's no sharing, + and the `shared_links` will keep all the sharing information with looking: + { + 'my_des1': { + 'type': 'DescrptSeA', + 'links': [ + {'model_key': 'model_1', + 'shared_type': 'descriptor', + 'shared_level': 0}, + {'model_key': 'model_2', + 'shared_type': 'descriptor', + 'shared_level': 0}, + {'model_key': 'model_3', + 'shared_type': 'descriptor', + 'shared_level': 1} + ] + } + } + """ assert "model_dict" in model_config, "only multi-task model can use this method!" 
- supported_types = ["type_map", "type_embedding", "descriptor", "fitting_net"] + supported_types = ["type_map", "descriptor", "fitting_net"] shared_dict = model_config.get("shared_dict", {}) shared_links = {} type_map_keys = [] @@ -98,32 +150,9 @@ def replace_one_item(params_dict, key_type, key_in_dict, suffix="", index=None): def get_class_name(item_key, item_params): - if item_key == "type_embedding": - return TypeEmbedNet.__name__ - elif item_key == "descriptor": - item_type = item_params.get("type", "se_e2_a") - if item_type == "se_e2_a": - return DescrptSeA.__name__ - elif item_type in ["se_atten", "dpa1"]: - return DescrptDPA1.__name__ - elif item_type in ["dpa2"]: - return DescrptDPA2.__name__ - # todo add support for other combination - # elif item_type == "gaussian_lcc": - # return DescrptGaussianLcc.__name__ - # elif item_type == "hybrid": - # return DescrptHybrid.__name__ - else: - raise RuntimeError(f"Unknown descriptor type {item_type}") + if item_key == "descriptor": + return BaseDescriptor.get_class_by_type(item_params.get("type", "se_e2_a")) elif item_key == "fitting_net": - item_type = item_params.get("type", "ener") - if item_type == "ener": - return EnergyFittingNet.__name__ - elif item_type in ["direct_force", "direct_force_ener"]: - return EnergyFittingNetDirect.__name__ - elif item_type == "atten_vec_lcc": - return FittingNetAttenLcc.__name__ - else: - raise RuntimeError(f"Unknown fitting_net type {item_type}") + return BaseFitting.get_class_by_type(item_params.get("type", "ener")) else: raise RuntimeError(f"Unknown class_name type {item_key}") diff --git a/source/tests/pt/model/water/multitask.json b/source/tests/pt/model/water/multitask.json new file mode 100644 index 0000000000..6baddd672b --- /dev/null +++ b/source/tests/pt/model/water/multitask.json @@ -0,0 +1,139 @@ +{ + "model": { + "shared_dict": { + "my_type_map": [ + "O", + "H", + "B" + ], + "my_descriptor": { + "type": "se_e2_a", + "sel": [ + 46, + 92 + ], + "rcut_smth": 0.50, + "rcut": 6.00, + "neuron": [ + 25, + 50, + 100 + ], + "resnet_dt": false, + "axis_neuron": 16, + "seed": 1, + "_comment": " that's all" + }, + "_comment": "that's all" + }, + "model_dict": { + "model_1": { + "type_map": "my_type_map", + "descriptor": "my_descriptor", + "fitting_net": { + "neuron": [ + 240, + 240, + 240 + ], + "resnet_dt": true, + "seed": 1, + "_comment": " that's all" + } + }, + "model_2": { + "type_map": "my_type_map", + "descriptor": "my_descriptor", + "fitting_net": { + "neuron": [ + 240, + 240, + 240 + ], + "resnet_dt": true, + "seed": 1, + "_comment": " that's all" + } + } + } + }, + "learning_rate": { + "type": "exp", + "decay_steps": 5000, + "start_lr": 0.0002, + "decay_rate": 0.98, + "stop_lr": 3.51e-08, + "_comment": "that's all" + }, + "loss_dict": { + "_comment": " that's all", + "model_1": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0 + }, + "model_2": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0 + } + }, + "training": { + "model_prob": { + "model_1": 0.5, + "model_2": 0.5 + }, + "data_dict": { + "model_1": { + "stat_file": "./stat_files/model_1", + "training_data": { + "systems": [ + "pt/water/data/data_0" + ], + "batch_size": 1, + "_comment": "that's all" + }, + "validation_data": { + "systems": [ + "pt/water/data/data_0" + ], + "batch_size": 1, + "_comment": "that's all" + } + }, + "model_2": { 
+ "stat_file": "./stat_files/model_2", + "training_data": { + "systems": [ + "pt/water/data/data_0" + ], + "batch_size": 1, + "_comment": "that's all" + }, + "validation_data": { + "systems": [ + "pt/water/data/data_0" + ], + "batch_size": 1, + "_comment": "that's all" + } + } + }, + "numb_steps": 100000, + "warmup_steps": 0, + "gradient_max_norm": 5.0, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 100, + "_comment": "that's all" + } +} diff --git a/source/tests/pt/test_multitask.py b/source/tests/pt/test_multitask.py new file mode 100644 index 0000000000..c7a2784367 --- /dev/null +++ b/source/tests/pt/test_multitask.py @@ -0,0 +1,173 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import shutil +import unittest +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) + +import torch + +from deepmd.pt.entrypoints.main import ( + get_trainer, +) + +from .model.test_permutation import ( + model_dpa1, + model_dpa2, + model_se_e2_a, +) + +multitask_template_json = str(Path(__file__).parent / "water/multitask.json") +with open(multitask_template_json) as f: + multitask_template = json.load(f) + + +class MultiTaskTrainTest: + def test_multitask_train(self): + trainer = get_trainer(deepcopy(self.config)) + trainer.run() + # check model keys + self.assertEqual(len(trainer.wrapper.model), 2) + self.assertTrue("model_1" in trainer.wrapper.model) + self.assertTrue("model_2" in trainer.wrapper.model) + + # check shared parameters + multi_state_dict = trainer.wrapper.model.state_dict() + for state_key in multi_state_dict: + if "model_1" in state_key: + self.assertTrue( + state_key.replace("model_1", "model_2") in multi_state_dict + ) + if "model_2" in state_key: + self.assertTrue( + state_key.replace("model_2", "model_1") in multi_state_dict + ) + if "model_1.descriptor" in state_key: + torch.testing.assert_allclose( + multi_state_dict[state_key], + multi_state_dict[state_key.replace("model_1", "model_2")], + ) + self.tearDown() + + def tearDown(self): + for f in os.listdir("."): + if f.startswith("model") and f.endswith(".pt"): + os.remove(f) + if f in ["lcurve.out"]: + os.remove(f) + if f in [self.stat_files]: + shutil.rmtree(f) + + +class TestMultiTaskSeA(unittest.TestCase, MultiTaskTrainTest): + def setUp(self): + multitask_se_e2_a = deepcopy(multitask_template) + multitask_se_e2_a["model"]["shared_dict"]["my_descriptor"] = model_se_e2_a[ + "descriptor" + ] + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.stat_files = "se_e2_a" + os.makedirs(self.stat_files, exist_ok=True) + self.config = multitask_se_e2_a + self.config["training"]["data_dict"]["model_1"]["training_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_1"]["validation_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_1"][ + "stat_file" + ] = f"{self.stat_files}/model_1" + self.config["training"]["data_dict"]["model_2"]["training_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_2"]["validation_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_2"][ + "stat_file" + ] = f"{self.stat_files}/model_2" + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + + def tearDown(self) -> None: + MultiTaskTrainTest.tearDown(self) + + +class TestMultiTaskDPA1(unittest.TestCase, MultiTaskTrainTest): + def setUp(self): + multitask_DPA1 = deepcopy(multitask_template) + 
multitask_DPA1["model"]["shared_dict"]["my_descriptor"] = model_dpa1[ + "descriptor" + ] + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.stat_files = "DPA1" + os.makedirs(self.stat_files, exist_ok=True) + self.config = multitask_DPA1 + self.config["training"]["data_dict"]["model_1"]["training_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_1"]["validation_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_1"][ + "stat_file" + ] = f"{self.stat_files}/model_1" + self.config["training"]["data_dict"]["model_2"]["training_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_2"]["validation_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_2"][ + "stat_file" + ] = f"{self.stat_files}/model_2" + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + + def tearDown(self) -> None: + MultiTaskTrainTest.tearDown(self) + + +class TestMultiTaskDPA2(unittest.TestCase, MultiTaskTrainTest): + def setUp(self): + multitask_DPA2 = deepcopy(multitask_template) + multitask_DPA2["model"]["shared_dict"]["my_descriptor"] = model_dpa2[ + "descriptor" + ] + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.stat_files = "DPA2" + os.makedirs(self.stat_files, exist_ok=True) + self.config = multitask_DPA2 + self.config["training"]["data_dict"]["model_1"]["training_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_1"]["validation_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_1"][ + "stat_file" + ] = f"{self.stat_files}/model_1" + self.config["training"]["data_dict"]["model_2"]["training_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_2"]["validation_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_2"][ + "stat_file" + ] = f"{self.stat_files}/model_2" + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + + def tearDown(self) -> None: + MultiTaskTrainTest.tearDown(self) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 2186467788..4e73fc4f8a 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -79,15 +79,6 @@ def setUp(self): self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file self.config["model"] = deepcopy(model_dpa2) - self.config["model"]["descriptor"]["rcut"] = self.config["model"]["descriptor"][ - "repinit_rcut" - ] - self.config["model"]["descriptor"]["rcut_smth"] = self.config["model"][ - "descriptor" - ]["repinit_rcut_smth"] - self.config["model"]["descriptor"]["sel"] = self.config["model"]["descriptor"][ - "repinit_nsel" - ] self.config["training"]["numb_steps"] = 1 self.config["training"]["save_freq"] = 1 From f1585b2d7aa088b349bda8ec6916c410c9bc82cf Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 01:32:19 +0800 Subject: [PATCH 11/37] Take advice from QL scan --- source/tests/pt/test_multitask.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/source/tests/pt/test_multitask.py b/source/tests/pt/test_multitask.py index c7a2784367..0c1abf1f44 100644 --- a/source/tests/pt/test_multitask.py +++ b/source/tests/pt/test_multitask.py @@ -33,20 +33,16 @@ def 
test_multitask_train(self): trainer.run() # check model keys self.assertEqual(len(trainer.wrapper.model), 2) - self.assertTrue("model_1" in trainer.wrapper.model) - self.assertTrue("model_2" in trainer.wrapper.model) + self.assertIn("model_1", trainer.wrapper.model) + self.assertIn("model_2", trainer.wrapper.model) # check shared parameters multi_state_dict = trainer.wrapper.model.state_dict() for state_key in multi_state_dict: if "model_1" in state_key: - self.assertTrue( - state_key.replace("model_1", "model_2") in multi_state_dict - ) + self.assertIn(state_key.replace("model_1", "model_2"), multi_state_dict) if "model_2" in state_key: - self.assertTrue( - state_key.replace("model_2", "model_1") in multi_state_dict - ) + self.assertIn(state_key.replace("model_2", "model_1"), multi_state_dict) if "model_1.descriptor" in state_key: torch.testing.assert_allclose( multi_state_dict[state_key], From 463f9fbafaba27c0782678bb50e531d8b37e267f Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 01:33:47 +0800 Subject: [PATCH 12/37] Support no validation --- deepmd/pt/entrypoints/main.py | 19 ++++++--- deepmd/pt/train/training.py | 72 +++++++++++++++++++++-------------- 2 files changed, 56 insertions(+), 35 deletions(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index c9eba6e579..340f783539 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -91,9 +91,12 @@ def prepare_trainer_input_single( type_split = False if model_params_single["descriptor"]["type"] in ["se_e2_a"]: type_split = True - validation_dataset_params = data_dict_single["validation_data"] + validation_dataset_params = data_dict_single.get("validation_data", None) + validation_systems = ( + validation_dataset_params["systems"] if validation_dataset_params else None + ) training_systems = training_dataset_params["systems"] - validation_systems = validation_dataset_params["systems"] + # stat files stat_file_path_single = data_dict_single.get("stat_file", None) if stat_file_path_single is not None: @@ -107,10 +110,14 @@ def prepare_trainer_input_single( stat_file_path_single = DPPath(stat_file_path_single, "a") # validation and training data - validation_data_single = DpLoaderSet( - validation_systems, - validation_dataset_params["batch_size"], - model_params_single, + validation_data_single = ( + DpLoaderSet( + validation_systems, + validation_dataset_params["batch_size"], + model_params_single, + ) + if validation_systems + else None ) if ckpt or finetune_model: train_data_single = DpLoaderSet( diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index a7f0d9956a..55609d6dc3 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -57,7 +57,6 @@ if torch.__version__.startswith("2"): import torch._dynamo - import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import ( @@ -142,20 +141,7 @@ def get_data_loader(_training_data, _validation_data, _training_params): else: train_sampler = get_weighted_sampler(_training_data, "prob_sys_size") - if "auto_prob" in _training_params["validation_data"]: - valid_sampler = get_weighted_sampler( - _validation_data, _training_params["validation_data"]["auto_prob"] - ) - elif "sys_probs" in _training_params["validation_data"]: - valid_sampler = get_weighted_sampler( - _validation_data, - _training_params["validation_data"]["sys_probs"], - sys_prob=True, - ) - else: - valid_sampler = 
get_weighted_sampler(_validation_data, "prob_sys_size") - - if train_sampler is None or valid_sampler is None: + if train_sampler is None: log.warning( "Sampler not specified!" ) # None sampler will lead to a premature stop iteration. Replacement should be True in attribute of the sampler to produce expected number of items in one iteration. @@ -169,22 +155,43 @@ def get_data_loader(_training_data, _validation_data, _training_params): ) with torch.device("cpu"): training_data_buffered = BufferedIterator(iter(training_dataloader)) - validation_dataloader = DataLoader( - _validation_data, - sampler=valid_sampler, - batch_size=None, - num_workers=min(NUM_WORKERS, 1), - drop_last=False, - pin_memory=True, - ) - - with torch.device("cpu"): - validation_data_buffered = BufferedIterator(iter(validation_dataloader)) - if _training_params.get("validation_data", None) is not None: - valid_numb_batch = _training_params["validation_data"].get( - "numb_btch", 1 + if _validation_data is not None: + if "auto_prob" in _training_params["validation_data"]: + valid_sampler = get_weighted_sampler( + _validation_data, + _training_params["validation_data"]["auto_prob"], + ) + elif "sys_probs" in _training_params["validation_data"]: + valid_sampler = get_weighted_sampler( + _validation_data, + _training_params["validation_data"]["sys_probs"], + sys_prob=True, + ) + else: + valid_sampler = get_weighted_sampler( + _validation_data, "prob_sys_size" + ) + validation_dataloader = DataLoader( + _validation_data, + sampler=valid_sampler, + batch_size=None, + num_workers=min(NUM_WORKERS, 1), + drop_last=False, + pin_memory=True, ) + with torch.device("cpu"): + validation_data_buffered = BufferedIterator( + iter(validation_dataloader) + ) + if _training_params.get("validation_data", None) is not None: + valid_numb_batch = _training_params["validation_data"].get( + "numb_btch", 1 + ) + else: + valid_numb_batch = 1 else: + validation_dataloader = None + validation_data_buffered = None valid_numb_batch = 1 return ( training_dataloader, @@ -645,6 +652,9 @@ def log_loss_valid(_task_key="Default"): input_dict, label_dict, _ = self.get_data( is_train=False, task_key=_task_key ) + if input_dict == {}: + # no validation data + return "", None _, loss, more_loss = self.wrapper( **input_dict, cur_lr=pref_lr, @@ -806,6 +816,8 @@ def get_data(self, is_train=True, task_key="Default"): ) batch_data = next(iter(self.training_data)) else: + if self.validation_data is None: + return {}, {}, {} try: batch_data = next(iter(self.validation_data)) except StopIteration: @@ -824,6 +836,8 @@ def get_data(self, is_train=True, task_key="Default"): ) batch_data = next(iter(self.training_data[task_key])) else: + if self.validation_data[task_key] is None: + return {}, {}, {} try: batch_data = next(iter(self.validation_data[task_key])) except StopIteration: From e8575affb1ced965eab3cfbd64d40ecbfaff607a Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 02:46:02 +0800 Subject: [PATCH 13/37] Update se_r.py --- deepmd/pt/model/descriptor/se_r.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/deepmd/pt/model/descriptor/se_r.py b/deepmd/pt/model/descriptor/se_r.py index e3832b16e4..e16297b5e6 100644 --- a/deepmd/pt/model/descriptor/se_r.py +++ b/deepmd/pt/model/descriptor/se_r.py @@ -150,6 +150,34 @@ def mixed_types(self) -> bool: """ return False + def share_params(self, base_class, shared_level, resume=False): + assert ( + self.__class__ == base_class.__class__ + ), "Only 
descriptors of the same type can share params!" + # For SeR descriptors, the user-defined share-level + # shared_level: 0 + if shared_level == 0: + # link buffers + if hasattr(self, "mean") and not resume: + # in case of change params during resume + base_env = EnvMatStatSe(base_class) + base_env.stats = base_class.stats + for kk in base_class.get_stats(): + base_env.stats[kk] += self.get_stats()[kk] + mean, stddev = base_env() + if not base_class.set_davg_zero: + base_class.mean.copy_(torch.tensor(mean, device=env.DEVICE)) + base_class.stddev.copy_(torch.tensor(stddev, device=env.DEVICE)) + self.mean = base_class.mean + self.stddev = base_class.stddev + # self.load_state_dict(base_class.state_dict()) # this does not work, because it only inits the model + # the following will successfully link all the params except buffers + for item in self._modules: + self._modules[item] = base_class._modules[item] + # Other shared levels + else: + raise NotImplementedError + def compute_input_stats( self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None ): From 66d03b8754b05e816e55113cd4ff64eb4b3ae245 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 02:49:42 +0800 Subject: [PATCH 14/37] omit data prob log --- deepmd/pt/utils/dataloader.py | 2 +- deepmd/pt/utils/finetune.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index 9d30748321..70993c21a0 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -253,7 +253,7 @@ def get_weighted_sampler(training_data, prob_style, sys_prob=False): probs = prob_sys_size_ext(style, len(training_data), training_data.index) else: probs = process_sys_probs(prob_style, training_data.index) - log.info("Generated weighted sampler with prob array: " + str(probs)) + log.debug("Generated weighted sampler with prob array: " + str(probs)) # training_data.total_batch is the size of one epoch, you can increase it to avoid too many rebuilding of iteraters len_sampler = training_data.total_batch * max(env.NUM_WORKERS, 1) with torch.device("cpu"): diff --git a/deepmd/pt/utils/finetune.py b/deepmd/pt/utils/finetune.py index 13749da151..b08dc9fbef 100644 --- a/deepmd/pt/utils/finetune.py +++ b/deepmd/pt/utils/finetune.py @@ -21,7 +21,8 @@ def change_finetune_model_params( """ if multi_task: # TODO - log.error("finetune mode need modification for multitask mode!") + pass + # log.error("finetune mode need modification for multitask mode!") if finetune_model is not None: state_dict = torch.load(finetune_model, map_location=env.DEVICE) if "model" in state_dict: From e9e0d95a61beeeef51d5f3bc24857487a4708ba6 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 02:51:00 +0800 Subject: [PATCH 15/37] omit seed log --- deepmd/pt/model/task/ener.py | 1 - deepmd/pt/model/task/fitting.py | 1 - 2 files changed, 2 deletions(-) diff --git a/deepmd/pt/model/task/ener.py b/deepmd/pt/model/task/ener.py index 55ee79db25..edd73009c5 100644 --- a/deepmd/pt/model/task/ener.py +++ b/deepmd/pt/model/task/ener.py @@ -331,7 +331,6 @@ def __init__( self.filter_layers = torch.nn.ModuleList(filter_layers) if "seed" in kwargs: - log.info("Set seed to %d in fitting net.", kwargs["seed"]) torch.manual_seed(kwargs["seed"]) def output_def(self): diff --git a/deepmd/pt/model/task/fitting.py b/deepmd/pt/model/task/fitting.py index a964f0222d..0afc718684 100644 --- 
a/deepmd/pt/model/task/fitting.py +++ b/deepmd/pt/model/task/fitting.py @@ -343,7 +343,6 @@ def __init__( self.filter_layers_old = None if seed is not None: - log.info("Set seed to %d in fitting net.", seed) torch.manual_seed(seed) def serialize(self) -> dict: From ab35653b5ad5c39cc72670814c516d6a8b8431ca Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 12:32:33 +0800 Subject: [PATCH 16/37] Add fparam and aparam --- deepmd/pt/train/training.py | 2 ++ deepmd/utils/env_mat_stat.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 55609d6dc3..6b62282817 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -861,6 +861,8 @@ def get_data(self, is_train=True, task_key="Default"): "atype", "box", "spin", + "fparam", + "aparam", ] input_dict = {item_key: None for item_key in input_keys} label_dict = {} diff --git a/deepmd/utils/env_mat_stat.py b/deepmd/utils/env_mat_stat.py index 2fa497b9b6..217c46844b 100644 --- a/deepmd/utils/env_mat_stat.py +++ b/deepmd/utils/env_mat_stat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import logging from abc import ( ABC, abstractmethod, @@ -19,6 +20,8 @@ DPPath, ) +log = logging.getLogger(__name__) + class StatItem: """A class to store the statistics of the environment matrix. @@ -170,10 +173,12 @@ def load_or_compute_stats( """ if path is not None and path.is_dir(): self.load_stats(path) + log.info(f"Load stats from {path}.") else: self.compute_stats(data) if path is not None: self.save_stats(path) + log.info(f"Save stats to {path}.") def get_avg(self, default: float = 0) -> Dict[str, float]: """Get the average of the environment matrix. From 64d60797bd640c6e3af794a87e09a5038d491d47 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 12:37:32 +0800 Subject: [PATCH 17/37] Add type hint for `Callable` --- deepmd/dpmodel/descriptor/make_base_descriptor.py | 4 +++- deepmd/pt/model/descriptor/descriptor.py | 4 +++- deepmd/pt/model/descriptor/dpa1.py | 4 +++- deepmd/pt/model/descriptor/dpa2.py | 4 +++- deepmd/pt/model/descriptor/hybrid.py | 4 +++- deepmd/pt/model/descriptor/repformers.py | 4 +++- deepmd/pt/model/descriptor/se_a.py | 8 ++++++-- deepmd/pt/model/descriptor/se_atten.py | 4 +++- deepmd/pt/model/descriptor/se_r.py | 4 +++- deepmd/pt/model/task/dipole.py | 2 +- deepmd/pt/model/task/ener.py | 2 +- deepmd/pt/model/task/polarizability.py | 2 +- 12 files changed, 33 insertions(+), 13 deletions(-) diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py b/deepmd/dpmodel/descriptor/make_base_descriptor.py index db0611b184..9a954a7f0b 100644 --- a/deepmd/dpmodel/descriptor/make_base_descriptor.py +++ b/deepmd/dpmodel/descriptor/make_base_descriptor.py @@ -92,7 +92,9 @@ def share_params(self, base_class, shared_level, resume=False): pass def compute_input_stats( - self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None + self, + merged: Union[Callable[[], List[dict]], List[dict]], + path: Optional[DPPath] = None, ): """Update mean and stddev for descriptor elements.""" raise NotImplementedError diff --git a/deepmd/pt/model/descriptor/descriptor.py b/deepmd/pt/model/descriptor/descriptor.py index 02cd657c7c..778523a14d 100644 --- a/deepmd/pt/model/descriptor/descriptor.py +++ b/deepmd/pt/model/descriptor/descriptor.py @@ -89,7 +89,9 @@ def get_dim_emb(self) -> int: pass def compute_input_stats( - self, merged: Union[Callable, 
List[dict]], path: Optional[DPPath] = None + self, + merged: Union[Callable[[], List[dict]], List[dict]], + path: Optional[DPPath] = None, ): """Update mean and stddev for DescriptorBlock elements.""" raise NotImplementedError diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py index c64f1e7f9a..7a65df3f11 100644 --- a/deepmd/pt/model/descriptor/dpa1.py +++ b/deepmd/pt/model/descriptor/dpa1.py @@ -149,7 +149,9 @@ def dim_emb(self): return self.get_dim_emb() def compute_input_stats( - self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None + self, + merged: Union[Callable[[], List[dict]], List[dict]], + path: Optional[DPPath] = None, ): return self.se_atten.compute_input_stats(merged, path) diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index 69269b1b56..87319f29be 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -333,7 +333,9 @@ def dim_emb(self): return self.get_dim_emb() def compute_input_stats( - self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None + self, + merged: Union[Callable[[], List[dict]], List[dict]], + path: Optional[DPPath] = None, ): for ii, descrpt in enumerate([self.repinit, self.repformers]): descrpt.compute_input_stats(merged, path) diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py index 872f642ef5..40ff3e5c7f 100644 --- a/deepmd/pt/model/descriptor/hybrid.py +++ b/deepmd/pt/model/descriptor/hybrid.py @@ -158,7 +158,9 @@ def share_params(self, base_class, shared_level, resume=False): raise NotImplementedError def compute_input_stats( - self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None + self, + merged: Union[Callable[[], List[dict]], List[dict]], + path: Optional[DPPath] = None, ): """Update mean and stddev for descriptor elements.""" for ii, descrpt in enumerate(self.descriptor_list): diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py index 174daf14af..14fdbc60cb 100644 --- a/deepmd/pt/model/descriptor/repformers.py +++ b/deepmd/pt/model/descriptor/repformers.py @@ -281,7 +281,9 @@ def forward( return g1, g2, h2, rot_mat.view(-1, nloc, self.dim_emb, 3), sw def compute_input_stats( - self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None + self, + merged: Union[Callable[[], List[dict]], List[dict]], + path: Optional[DPPath] = None, ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py index 7a68d347dc..0fc32e6ac3 100644 --- a/deepmd/pt/model/descriptor/se_a.py +++ b/deepmd/pt/model/descriptor/se_a.py @@ -145,7 +145,9 @@ def dim_out(self): return self.sea.dim_out def compute_input_stats( - self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None + self, + merged: Union[Callable[[], List[dict]], List[dict]], + path: Optional[DPPath] = None, ): """Update mean and stddev for descriptor elements.""" return self.sea.compute_input_stats(merged, path) @@ -405,7 +407,9 @@ def __getitem__(self, key): raise KeyError(key) def compute_input_stats( - self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None + self, + merged: Union[Callable[[], List[dict]], List[dict]], + path: Optional[DPPath] = None, ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) diff --git a/deepmd/pt/model/descriptor/se_atten.py 
b/deepmd/pt/model/descriptor/se_atten.py index a056fbe889..9981a199de 100644 --- a/deepmd/pt/model/descriptor/se_atten.py +++ b/deepmd/pt/model/descriptor/se_atten.py @@ -203,7 +203,9 @@ def dim_emb(self): return self.get_dim_emb() def compute_input_stats( - self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None + self, + merged: Union[Callable[[], List[dict]], List[dict]], + path: Optional[DPPath] = None, ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) diff --git a/deepmd/pt/model/descriptor/se_r.py b/deepmd/pt/model/descriptor/se_r.py index e16297b5e6..ad9b2ae9d9 100644 --- a/deepmd/pt/model/descriptor/se_r.py +++ b/deepmd/pt/model/descriptor/se_r.py @@ -179,7 +179,9 @@ def share_params(self, base_class, shared_level, resume=False): raise NotImplementedError def compute_input_stats( - self, merged: Union[Callable, List[dict]], path: Optional[DPPath] = None + self, + merged: Union[Callable[[], List[dict]], List[dict]], + path: Optional[DPPath] = None, ): """Update mean and stddev for descriptor elements.""" env_mat_stat = EnvMatStatSe(self) diff --git a/deepmd/pt/model/task/dipole.py b/deepmd/pt/model/task/dipole.py index 08a3673a8c..00de5276ee 100644 --- a/deepmd/pt/model/task/dipole.py +++ b/deepmd/pt/model/task/dipole.py @@ -138,7 +138,7 @@ def output_def(self) -> FittingOutputDef: def compute_output_stats( self, - merged: Union[Callable, List[dict]], + merged: Union[Callable[[], List[dict]], List[dict]], stat_file_path: Optional[DPPath] = None, ): raise NotImplementedError diff --git a/deepmd/pt/model/task/ener.py b/deepmd/pt/model/task/ener.py index edd73009c5..404b92a10a 100644 --- a/deepmd/pt/model/task/ener.py +++ b/deepmd/pt/model/task/ener.py @@ -142,7 +142,7 @@ def serialize(self) -> dict: def compute_output_stats( self, - merged: Union[Callable, List[dict]], + merged: Union[Callable[[], List[dict]], List[dict]], stat_file_path: Optional[DPPath] = None, ): if stat_file_path is not None: diff --git a/deepmd/pt/model/task/polarizability.py b/deepmd/pt/model/task/polarizability.py index 0fe817084e..37c802613a 100644 --- a/deepmd/pt/model/task/polarizability.py +++ b/deepmd/pt/model/task/polarizability.py @@ -165,7 +165,7 @@ def output_def(self) -> FittingOutputDef: def compute_output_stats( self, - merged: Union[Callable, List[dict]], + merged: Union[Callable[[], List[dict]], List[dict]], stat_file_path: Optional[DPPath] = None, ): raise NotImplementedError From 6020a2b33d71cebdd6c9ea2d3fef77d4a9b4d6b7 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 15:37:19 +0800 Subject: [PATCH 18/37] Fix nopbc --- deepmd/pt/utils/stat.py | 8 ++++++-- deepmd/utils/data.py | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/deepmd/pt/utils/stat.py b/deepmd/pt/utils/stat.py index 4dbb633de5..3b246a0ec2 100644 --- a/deepmd/pt/utils/stat.py +++ b/deepmd/pt/utils/stat.py @@ -34,12 +34,16 @@ def make_stat_input(datasets, dataloaders, nbatches): iterator = iter(dataloaders[i]) stat_data = next(iterator) for dd in stat_data: - if isinstance(stat_data[dd], torch.Tensor): + if stat_data[dd] is None: + sys_stat[dd] = None + elif isinstance(stat_data[dd], torch.Tensor): if dd not in sys_stat: sys_stat[dd] = [] sys_stat[dd].append(stat_data[dd]) + else: + pass for key in sys_stat: - if sys_stat[key][0] is None: + if sys_stat[key] is None or sys_stat[key][0] is None: sys_stat[key] = None else: sys_stat[key] = torch.cat(sys_stat[key], dim=0) diff --git a/deepmd/utils/data.py 
b/deepmd/utils/data.py index 6e0c47881f..9e726fbe19 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -490,6 +490,8 @@ def reformat_data_torch(self, data): if self.data_dict[kk]["atomic"]: data[kk] = data[kk].reshape(-1, self.data_dict[kk]["ndof"]) data["atype"] = data["type"] + if not self.pbc: + data["box"] = None return data def _load_set(self, set_name: DPPath): From 5db7883457df7d7cdc7523a7354d6fb4a5432cfb Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 16:13:17 +0800 Subject: [PATCH 19/37] Add DataRequirementItem --- deepmd/dpmodel/model/base_model.py | 5 +- deepmd/dpmodel/model/dp_model.py | 9 +++- deepmd/pt/model/model/dipole_model.py | 41 ++++++++------ deepmd/pt/model/model/dp_zbl_model.py | 77 ++++++++++++++------------ deepmd/pt/model/model/ener_model.py | 78 +++++++++++++++------------ deepmd/pt/model/model/model.py | 6 ++- deepmd/pt/model/model/polar_model.py | 41 ++++++++------ deepmd/pt/utils/dataloader.py | 7 ++- deepmd/pt/utils/dataset.py | 27 ++++++---- deepmd/utils/data.py | 70 ++++++++++++++++++++++++ 10 files changed, 243 insertions(+), 118 deletions(-) diff --git a/deepmd/dpmodel/model/base_model.py b/deepmd/dpmodel/model/base_model.py index c4b998d763..ee22dec132 100644 --- a/deepmd/dpmodel/model/base_model.py +++ b/deepmd/dpmodel/model/base_model.py @@ -10,6 +10,9 @@ Type, ) +from deepmd.utils.data import ( + DataRequirementItem, +) from deepmd.utils.plugin import ( PluginVariant, make_plugin_registry, @@ -93,7 +96,7 @@ def model_output_type(self) -> str: """Get the output type for the model.""" @abstractmethod - def data_requirement(self) -> dict: + def data_requirement(self) -> List[DataRequirementItem]: """Get the data requirement for the model.""" @abstractmethod diff --git a/deepmd/dpmodel/model/dp_model.py b/deepmd/dpmodel/model/dp_model.py index 705750414b..88243c8742 100644 --- a/deepmd/dpmodel/model/dp_model.py +++ b/deepmd/dpmodel/model/dp_model.py @@ -1,10 +1,17 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + List, +) + from deepmd.dpmodel.atomic_model import ( DPAtomicModel, ) from deepmd.dpmodel.model.base_model import ( BaseModel, ) +from deepmd.utils.data import ( + DataRequirementItem, +) from .make_model import ( make_model, @@ -14,6 +21,6 @@ # use "class" to resolve "Variable not allowed in type expression" @BaseModel.register("standard") class DPModel(make_model(DPAtomicModel), BaseModel): - def data_requirement(self) -> dict: + def data_requirement(self) -> List[DataRequirementItem]: """Get the data requirement for the model.""" raise NotImplementedError diff --git a/deepmd/pt/model/model/dipole_model.py b/deepmd/pt/model/model/dipole_model.py index f6d896b5d8..106202d00c 100644 --- a/deepmd/pt/model/model/dipole_model.py +++ b/deepmd/pt/model/model/dipole_model.py @@ -1,11 +1,16 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Dict, + List, Optional, ) import torch +from deepmd.utils.data import ( + DataRequirementItem, +) + from .dp_model import ( DPModel, ) @@ -92,21 +97,23 @@ def forward_lower( return model_predict @property - def data_requirement(self): - data_requirement = { - "dipole": { - "ndof": 3, - "atomic": False, - "must": False, - "high_prec": False, - "type_sel": self.get_sel_type(), - }, - "atomic_dipole": { - "ndof": 3, - "atomic": True, - "must": False, - "high_prec": False, - "type_sel": self.get_sel_type(), - }, - } + def data_requirement(self) -> List[DataRequirementItem]: + data_requirement = [ + 
DataRequirementItem( + "dipole", + ndof=3, + atomic=False, + must=False, + high_prec=False, + type_sel=self.get_sel_type(), + ), + DataRequirementItem( + "atomic_dipole", + ndof=3, + atomic=True, + must=False, + high_prec=False, + type_sel=self.get_sel_type(), + ), + ] return data_requirement diff --git a/deepmd/pt/model/model/dp_zbl_model.py b/deepmd/pt/model/model/dp_zbl_model.py index fd47b4368d..fed9d89bf5 100644 --- a/deepmd/pt/model/model/dp_zbl_model.py +++ b/deepmd/pt/model/model/dp_zbl_model.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Dict, + List, Optional, ) @@ -12,6 +13,9 @@ from deepmd.pt.model.model.model import ( BaseModel, ) +from deepmd.utils.data import ( + DataRequirementItem, +) from .make_model import ( make_model, @@ -99,38 +103,43 @@ def forward_lower( return model_predict @property - def data_requirement(self): - data_requirement = { - "energy": { - "ndof": 1, - "atomic": False, - "must": False, - "high_prec": True, - }, - "force": { - "ndof": 3, - "atomic": True, - "must": False, - "high_prec": False, - }, - "virial": { - "ndof": 9, - "atomic": False, - "must": False, - "high_prec": False, - }, - "atom_ener": { - "ndof": 1, - "atomic": True, - "must": False, - "high_prec": False, - }, - "atom_pref": { - "ndof": 1, - "atomic": True, - "must": False, - "high_prec": False, - "repeat": 3, - }, - } + def data_requirement(self) -> List[DataRequirementItem]: + data_requirement = [ + DataRequirementItem( + "energy", + ndof=1, + atomic=False, + must=False, + high_prec=True, + ), + DataRequirementItem( + "force", + ndof=3, + atomic=True, + must=False, + high_prec=False, + ), + DataRequirementItem( + "virial", + ndof=9, + atomic=False, + must=False, + high_prec=False, + ), + DataRequirementItem( + "atom_ener", + ndof=1, + atomic=True, + must=False, + high_prec=False, + ), + DataRequirementItem( + "atom_pref", + ndof=1, + atomic=True, + must=False, + high_prec=False, + repeat=3, + ), + ] return data_requirement diff --git a/deepmd/pt/model/model/ener_model.py b/deepmd/pt/model/model/ener_model.py index 1497cbade4..92b2b95e34 100644 --- a/deepmd/pt/model/model/ener_model.py +++ b/deepmd/pt/model/model/ener_model.py @@ -1,11 +1,16 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Dict, + List, Optional, ) import torch +from deepmd.utils.data import ( + DataRequirementItem, +) + from .dp_model import ( DPModel, ) @@ -97,38 +102,43 @@ def forward_lower( return model_predict @property - def data_requirement(self): - data_requirement = { - "energy": { - "ndof": 1, - "atomic": False, - "must": False, - "high_prec": True, - }, - "force": { - "ndof": 3, - "atomic": True, - "must": False, - "high_prec": False, - }, - "virial": { - "ndof": 9, - "atomic": False, - "must": False, - "high_prec": False, - }, - "atom_ener": { - "ndof": 1, - "atomic": True, - "must": False, - "high_prec": False, - }, - "atom_pref": { - "ndof": 1, - "atomic": True, - "must": False, - "high_prec": False, - "repeat": 3, - }, - } + def data_requirement(self) -> List[DataRequirementItem]: + data_requirement = [ + DataRequirementItem( + "energy", + ndof=1, + atomic=False, + must=False, + high_prec=True, + ), + DataRequirementItem( + "force", + ndof=3, + atomic=True, + must=False, + high_prec=False, + ), + DataRequirementItem( + "virial", + ndof=9, + atomic=False, + must=False, + high_prec=False, + ), + DataRequirementItem( + "atom_ener", + ndof=1, + atomic=True, + must=False, + high_prec=False, + ), + DataRequirementItem( + "atom_pref", + ndof=1, + 
atomic=True, + must=False, + high_prec=False, + repeat=3, + ), + ] return data_requirement diff --git a/deepmd/pt/model/model/model.py b/deepmd/pt/model/model/model.py index 0e2afadd14..1b82402747 100644 --- a/deepmd/pt/model/model/model.py +++ b/deepmd/pt/model/model/model.py @@ -1,11 +1,15 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + List, Optional, ) from deepmd.dpmodel.model.base_model import ( make_base_model, ) +from deepmd.utils.data import ( + DataRequirementItem, +) from deepmd.utils.path import ( DPPath, ) @@ -85,6 +89,6 @@ def compute_or_load_stat( """ raise NotImplementedError - def data_requirement(self) -> dict: + def data_requirement(self) -> List[DataRequirementItem]: """Get the data requirement for the model.""" raise NotImplementedError diff --git a/deepmd/pt/model/model/polar_model.py b/deepmd/pt/model/model/polar_model.py index 450f5f2fb5..c23e26afac 100644 --- a/deepmd/pt/model/model/polar_model.py +++ b/deepmd/pt/model/model/polar_model.py @@ -1,11 +1,16 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Dict, + List, Optional, ) import torch +from deepmd.utils.data import ( + DataRequirementItem, +) + from .dp_model import ( DPModel, ) @@ -76,21 +81,23 @@ def forward_lower( return model_predict @property - def get_data_requirement(self): - data_requirement = { - "polar": { - "ndof": 9, - "atomic": False, - "must": False, - "high_prec": False, - "type_sel": self.get_sel_type(), - }, - "atomic_polar": { - "ndof": 9, - "atomic": True, - "must": False, - "high_prec": False, - "type_sel": self.get_sel_type(), - }, - } + def get_data_requirement(self) -> List[DataRequirementItem]: + data_requirement = [ + DataRequirementItem( + "polar", + ndof=9, + atomic=False, + must=False, + high_prec=False, + type_sel=self.get_sel_type(), + ), + DataRequirementItem( + "atomic_polar", + ndof=9, + atomic=True, + must=False, + high_prec=False, + type_sel=self.get_sel_type(), + ), + ] return data_requirement diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index 70993c21a0..b197f46124 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -35,6 +35,9 @@ from deepmd.pt.utils.dataset import ( DeepmdDataSetForLoader, ) +from deepmd.utils.data import ( + DataRequirementItem, +) from deepmd.utils.data_system import ( prob_sys_size_ext, process_sys_probs, @@ -147,10 +150,10 @@ def __getitem__(self, idx): batch["sid"] = idx return batch - def add_data_requirement(self, dict_of_keys): + def add_data_requirement(self, data_requirement: List[DataRequirementItem]): """Add data requirement for each system in multiple systems.""" for system in self.systems: - system.add_data_requirement(dict_of_keys) + system.add_data_requirement(data_requirement) _sentinel = object() diff --git a/deepmd/pt/utils/dataset.py b/deepmd/pt/utils/dataset.py index 9de82778dc..40a513acdf 100644 --- a/deepmd/pt/utils/dataset.py +++ b/deepmd/pt/utils/dataset.py @@ -1,11 +1,16 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + List, +) + from torch.utils.data import ( Dataset, ) from deepmd.utils.data import ( + DataRequirementItem, DeepmdData, ) @@ -42,17 +47,17 @@ def __getitem__(self, index): b_data["natoms"] = self._natoms_vec return b_data - def add_data_requirement(self, dict_of_keys): + def add_data_requirement(self, data_requirement: List[DataRequirementItem]): """Add data requirement for this data system.""" - for data_key in dict_of_keys: + for data_item in data_requirement: self._data_system.add( - 
data_key, - dict_of_keys[data_key]["ndof"], - atomic=dict_of_keys[data_key].get("atomic", False), - must=dict_of_keys[data_key].get("must", False), - high_prec=dict_of_keys[data_key].get("high_prec", False), - type_sel=dict_of_keys[data_key].get("type_sel", None), - repeat=dict_of_keys[data_key].get("repeat", 1), - default=dict_of_keys[data_key].get("default", 0.0), - dtype=dict_of_keys[data_key].get("dtype", None), + data_item["key"], + data_item["ndof"], + atomic=data_item["atomic"], + must=data_item["must"], + high_prec=data_item["high_prec"], + type_sel=data_item["type_sel"], + repeat=data_item["repeat"], + default=data_item["default"], + dtype=data_item["dtype"], ) diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py index 9e726fbe19..03e39e1f21 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -666,3 +666,73 @@ def _check_pbc(self, sys_path: DPPath): def _check_mode(self, set_path: DPPath): return (set_path / "real_atom_types.npy").is_file() + + +class DataRequirementItem: + """A class to store the data requirement for data systems. + + Parameters + ---------- + key + The key of the item. The corresponding data is stored in `sys_path/set.*/key.npy` + ndof + The number of dof + atomic + The item is an atomic property. + If False, the size of the data should be nframes x ndof + If True, the size of data should be nframes x natoms x ndof + must + The data file `sys_path/set.*/key.npy` must exist. + If must is False and the data file does not exist, the `data_dict[find_key]` is set to 0.0 + high_prec + Load the data and store in float64, otherwise in float32 + type_sel + Select certain type of atoms + repeat + The data will be repeated `repeat` times. + default : float, default=0. + default value of data + dtype : np.dtype, optional + the dtype of data, overwrites `high_prec` if provided + """ + + def __init__( + self, + key: str, + ndof: int, + atomic: bool = False, + must: bool = False, + high_prec: bool = False, + type_sel: Optional[List[int]] = None, + repeat: int = 1, + default: float = 0.0, + dtype: Optional[np.dtype] = None, + ) -> None: + self.key = key + self.ndof = ndof + self.atomic = atomic + self.must = must + self.high_prec = high_prec + self.type_sel = type_sel + self.repeat = repeat + self.default = default + self.dtype = dtype + self.dict = self.to_dict() + + def to_dict(self) -> dict: + return { + "key": self.key, + "ndof": self.ndof, + "atomic": self.atomic, + "must": self.must, + "high_prec": self.high_prec, + "type_sel": self.type_sel, + "repeat": self.repeat, + "default": self.default, + "dtype": self.dtype, + } + + def __getitem__(self, key: str): + if key not in self.dict: + raise KeyError(key) + return self.dict[key] From cce52da575cb21da92e735b7dc93cf8f86134fb6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Feb 2024 08:16:14 +0000 Subject: [PATCH 20/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/dpmodel/model/dp_model.py | 2 +- deepmd/pt/entrypoints/main.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/deepmd/dpmodel/model/dp_model.py b/deepmd/dpmodel/model/dp_model.py index 15b3ca2765..d4706cb3be 100644 --- a/deepmd/dpmodel/model/dp_model.py +++ b/deepmd/dpmodel/model/dp_model.py @@ -27,7 +27,7 @@ class DPModel(make_model(DPAtomicModel), BaseModel): def data_requirement(self) -> List[DataRequirementItem]: """Get the data requirement for the model.""" raise NotImplementedError - + 
@classmethod def update_sel(cls, global_jdata: dict, local_jdata: dict): """Update the selection and perform neighbor statistics. diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index ce8ace06d5..ab35e32012 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -50,9 +50,6 @@ from deepmd.pt.utils.multi_task import ( preprocess_shared_params, ) -from deepmd.pt.utils.stat import ( - make_stat_input, -) from deepmd.utils.argcheck import ( normalize, ) From cdcfcb2fac35739b47f0f196393e922645fd52ca Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 17:29:38 +0800 Subject: [PATCH 21/37] Fix neighbor-stat for multitask (#31) --- deepmd/pt/entrypoints/main.py | 36 +++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index ab35e32012..844061d0ef 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -3,6 +3,9 @@ import json import logging import os +from copy import ( + deepcopy, +) from pathlib import ( Path, ) @@ -72,9 +75,11 @@ def get_trainer( model_branch="", force_load=False, init_frz_model=None, + shared_links=None, ): + multi_task = "model_dict" in config.get("model", {}) # argcheck - if "model_dict" not in config.get("model", {}): + if not multi_task: config = update_deepmd_input(config, warning=True, dump="input_v2_compat.json") config = normalize(config) @@ -85,7 +90,6 @@ def get_trainer( assert dist.is_nccl_available() dist.init_process_group(backend="nccl") - multi_task = "model_dict" in config["model"] ckpt = init_model if init_model is not None else restart_model config["model"] = change_finetune_model_params( ckpt, @@ -94,9 +98,6 @@ def get_trainer( multi_task=multi_task, model_branch=model_branch, ) - shared_links = None - if multi_task: - config["model"], shared_links = preprocess_shared_params(config["model"]) def prepare_trainer_input_single( model_params_single, data_dict_single, loss_dict_single, suffix="" @@ -220,11 +221,33 @@ def train(FLAGS): SummaryPrinter()() with open(FLAGS.INPUT) as fin: config = json.load(fin) + + # update multitask config + multi_task = "model_dict" in config["model"] + shared_links = None + if multi_task: + config["model"], shared_links = preprocess_shared_params(config["model"]) + + # do neighbor stat if not FLAGS.skip_neighbor_stat: log.info( "Calculate neighbor statistics... 
(add --skip-neighbor-stat to skip this step)" ) - config["model"] = BaseModel.update_sel(config, config["model"]) + if not multi_task: + config["model"] = BaseModel.update_sel(config, config["model"]) + else: + training_jdata = deepcopy(config["training"]) + training_jdata.pop("data_dict", {}) + training_jdata.pop("model_prob", {}) + for model_item in config["model"]["model_dict"]: + fake_global_jdata = { + "model": deepcopy(config["model"]["model_dict"][model_item]), + "training": deepcopy(config["training"]["data_dict"][model_item]), + } + fake_global_jdata["training"].update(training_jdata) + config["model"]["model_dict"][model_item] = BaseModel.update_sel( + fake_global_jdata, config["model"]["model_dict"][model_item] + ) trainer = get_trainer( config, @@ -234,6 +257,7 @@ def train(FLAGS): FLAGS.model_branch, FLAGS.force_load, FLAGS.init_frz_model, + shared_links=shared_links, ) trainer.run() From a7d44d1c2283bcc4a096b1f0eb5eaf6241078ef1 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 17:35:52 +0800 Subject: [PATCH 22/37] Revert "Fix neighbor-stat for multitask (#31)" This reverts commit cdcfcb2fac35739b47f0f196393e922645fd52ca. --- deepmd/pt/entrypoints/main.py | 36 ++++++----------------------------- 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 844061d0ef..ab35e32012 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -3,9 +3,6 @@ import json import logging import os -from copy import ( - deepcopy, -) from pathlib import ( Path, ) @@ -75,11 +72,9 @@ def get_trainer( model_branch="", force_load=False, init_frz_model=None, - shared_links=None, ): - multi_task = "model_dict" in config.get("model", {}) # argcheck - if not multi_task: + if "model_dict" not in config.get("model", {}): config = update_deepmd_input(config, warning=True, dump="input_v2_compat.json") config = normalize(config) @@ -90,6 +85,7 @@ def get_trainer( assert dist.is_nccl_available() dist.init_process_group(backend="nccl") + multi_task = "model_dict" in config["model"] ckpt = init_model if init_model is not None else restart_model config["model"] = change_finetune_model_params( ckpt, @@ -98,6 +94,9 @@ def get_trainer( multi_task=multi_task, model_branch=model_branch, ) + shared_links = None + if multi_task: + config["model"], shared_links = preprocess_shared_params(config["model"]) def prepare_trainer_input_single( model_params_single, data_dict_single, loss_dict_single, suffix="" @@ -221,33 +220,11 @@ def train(FLAGS): SummaryPrinter()() with open(FLAGS.INPUT) as fin: config = json.load(fin) - - # update multitask config - multi_task = "model_dict" in config["model"] - shared_links = None - if multi_task: - config["model"], shared_links = preprocess_shared_params(config["model"]) - - # do neighbor stat if not FLAGS.skip_neighbor_stat: log.info( "Calculate neighbor statistics... 
(add --skip-neighbor-stat to skip this step)" ) - if not multi_task: - config["model"] = BaseModel.update_sel(config, config["model"]) - else: - training_jdata = deepcopy(config["training"]) - training_jdata.pop("data_dict", {}) - training_jdata.pop("model_prob", {}) - for model_item in config["model"]["model_dict"]: - fake_global_jdata = { - "model": deepcopy(config["model"]["model_dict"][model_item]), - "training": deepcopy(config["training"]["data_dict"][model_item]), - } - fake_global_jdata["training"].update(training_jdata) - config["model"]["model_dict"][model_item] = BaseModel.update_sel( - fake_global_jdata, config["model"]["model_dict"][model_item] - ) + config["model"] = BaseModel.update_sel(config, config["model"]) trainer = get_trainer( config, @@ -257,7 +234,6 @@ def train(FLAGS): FLAGS.model_branch, FLAGS.force_load, FLAGS.init_frz_model, - shared_links=shared_links, ) trainer.run() From fdca653a42c9cc0e2e8490d4dfe476416590f80c Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 18:07:58 +0800 Subject: [PATCH 23/37] Move label requirement to loss func --- deepmd/dpmodel/model/base_model.py | 7 --- deepmd/dpmodel/model/dp_model.py | 10 ---- deepmd/pt/loss/ener.py | 50 ++++++++++++++++++ deepmd/pt/loss/loss.py | 13 +++++ deepmd/pt/model/model/dipole_model.py | 27 ---------- deepmd/pt/model/model/dp_zbl_model.py | 46 ----------------- deepmd/pt/model/model/ener_model.py | 47 ----------------- deepmd/pt/model/model/model.py | 8 --- deepmd/pt/model/model/polar_model.py | 27 ---------- deepmd/pt/train/training.py | 53 +++++++++++-------- source/tests/pt/test_stat.py | 74 +++++++++++++++------------ 11 files changed, 135 insertions(+), 227 deletions(-) diff --git a/deepmd/dpmodel/model/base_model.py b/deepmd/dpmodel/model/base_model.py index ec23994575..e7cc8d9272 100644 --- a/deepmd/dpmodel/model/base_model.py +++ b/deepmd/dpmodel/model/base_model.py @@ -10,9 +10,6 @@ Type, ) -from deepmd.utils.data import ( - DataRequirementItem, -) from deepmd.utils.plugin import ( PluginVariant, make_plugin_registry, @@ -95,10 +92,6 @@ def is_aparam_nall(self) -> bool: def model_output_type(self) -> str: """Get the output type for the model.""" - @abstractmethod - def data_requirement(self) -> List[DataRequirementItem]: - """Get the data requirement for the model.""" - @abstractmethod def serialize(self) -> dict: """Serialize the model. diff --git a/deepmd/dpmodel/model/dp_model.py b/deepmd/dpmodel/model/dp_model.py index d4706cb3be..15f9027d4c 100644 --- a/deepmd/dpmodel/model/dp_model.py +++ b/deepmd/dpmodel/model/dp_model.py @@ -1,7 +1,4 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -from typing import ( - List, -) from deepmd.dpmodel.atomic_model import ( DPAtomicModel, @@ -12,9 +9,6 @@ from deepmd.dpmodel.model.base_model import ( BaseModel, ) -from deepmd.utils.data import ( - DataRequirementItem, -) from .make_model import ( make_model, @@ -24,10 +18,6 @@ # use "class" to resolve "Variable not allowed in type expression" @BaseModel.register("standard") class DPModel(make_model(DPAtomicModel), BaseModel): - def data_requirement(self) -> List[DataRequirementItem]: - """Get the data requirement for the model.""" - raise NotImplementedError - @classmethod def update_sel(cls, global_jdata: dict, local_jdata: dict): """Update the selection and perform neighbor statistics. 
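Under the contract introduced by this patch, each loss declares the labels it consumes and the trainer forwards them to the data systems, as the diffs below show for EnergyStdLoss. A minimal sketch of a custom loss written against that contract (the class name and the single energy item are illustrative, not part of the patch):

    from typing import List

    from deepmd.pt.loss.loss import TaskLoss
    from deepmd.utils.data import DataRequirementItem


    class MyEnergyLoss(TaskLoss):
        @property
        def label_requirement(self) -> List[DataRequirementItem]:
            # one scalar energy label per frame, loaded in float64
            return [
                DataRequirementItem(
                    "energy", ndof=1, atomic=False, must=False, high_prec=True
                )
            ]


    loss = MyEnergyLoss()
    assert loss.label_requirement[0]["key"] == "energy"

On the trainer side only the loss is consulted, roughly as training_data.add_data_requirement(loss.label_requirement), with no model involvement.
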
diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 4ed765cf69..648e954401 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + List, +) + import torch import torch.nn.functional as F @@ -11,6 +15,9 @@ from deepmd.pt.utils.env import ( GLOBAL_PT_FLOAT_PRECISION, ) +from deepmd.utils.data import ( + DataRequirementItem, +) class EnergyStdLoss(TaskLoss): @@ -153,3 +160,46 @@ def forward(self, model_pred, label, natoms, learning_rate, mae=False): if not self.inference: more_loss["rmse"] = torch.sqrt(loss.detach()) return loss, more_loss + + @property + def label_requirement(self) -> List[DataRequirementItem]: + """Return data label requirements needed for this loss calculation.""" + data_requirement = [ + DataRequirementItem( + "energy", + ndof=1, + atomic=False, + must=False, + high_prec=True, + ), + DataRequirementItem( + "force", + ndof=3, + atomic=True, + must=False, + high_prec=False, + ), + DataRequirementItem( + "virial", + ndof=9, + atomic=False, + must=False, + high_prec=False, + ), + DataRequirementItem( + "atom_ener", + ndof=1, + atomic=True, + must=False, + high_prec=False, + ), + DataRequirementItem( + "atom_pref", + ndof=1, + atomic=True, + must=False, + high_prec=False, + repeat=3, + ), + ] + return data_requirement diff --git a/deepmd/pt/loss/loss.py b/deepmd/pt/loss/loss.py index 9f2c3a7ed7..7059d76e03 100644 --- a/deepmd/pt/loss/loss.py +++ b/deepmd/pt/loss/loss.py @@ -1,6 +1,14 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + List, +) + import torch +from deepmd.utils.data import ( + DataRequirementItem, +) + class TaskLoss(torch.nn.Module): def __init__(self, **kwargs): @@ -10,3 +18,8 @@ def __init__(self, **kwargs): def forward(self, model_pred, label, natoms, learning_rate): """Return loss .""" raise NotImplementedError + + @property + def label_requirement(self) -> List[DataRequirementItem]: + """Return data label requirements needed for this loss calculation.""" + raise NotImplementedError diff --git a/deepmd/pt/model/model/dipole_model.py b/deepmd/pt/model/model/dipole_model.py index 106202d00c..6629541459 100644 --- a/deepmd/pt/model/model/dipole_model.py +++ b/deepmd/pt/model/model/dipole_model.py @@ -1,16 +1,11 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Dict, - List, Optional, ) import torch -from deepmd.utils.data import ( - DataRequirementItem, -) - from .dp_model import ( DPModel, ) @@ -95,25 +90,3 @@ def forward_lower( else: model_predict = model_ret return model_predict - - @property - def data_requirement(self) -> List[DataRequirementItem]: - data_requirement = [ - DataRequirementItem( - "dipole", - ndof=3, - atomic=False, - must=False, - high_prec=False, - type_sel=self.get_sel_type(), - ), - DataRequirementItem( - "atomic_dipole", - ndof=3, - atomic=True, - must=False, - high_prec=False, - type_sel=self.get_sel_type(), - ), - ] - return data_requirement diff --git a/deepmd/pt/model/model/dp_zbl_model.py b/deepmd/pt/model/model/dp_zbl_model.py index c8aade5eec..f2af0fff52 100644 --- a/deepmd/pt/model/model/dp_zbl_model.py +++ b/deepmd/pt/model/model/dp_zbl_model.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Dict, - List, Optional, ) @@ -16,9 +15,6 @@ from deepmd.pt.model.model.model import ( BaseModel, ) -from deepmd.utils.data import ( - DataRequirementItem, -) from .make_model import ( make_model, @@ -105,48 +101,6 @@ def forward_lower( model_predict = 
model_ret return model_predict - @property - def data_requirement(self) -> List[DataRequirementItem]: - data_requirement = [ - DataRequirementItem( - "energy", - ndof=1, - atomic=False, - must=False, - high_prec=True, - ), - DataRequirementItem( - "force", - ndof=3, - atomic=True, - must=False, - high_prec=False, - ), - DataRequirementItem( - "virial", - ndof=9, - atomic=False, - must=False, - high_prec=False, - ), - DataRequirementItem( - "atom_ener", - ndof=1, - atomic=True, - must=False, - high_prec=False, - ), - DataRequirementItem( - "atom_pref", - ndof=1, - atomic=True, - must=False, - high_prec=False, - repeat=3, - ), - ] - return data_requirement - @classmethod def update_sel(cls, global_jdata: dict, local_jdata: dict): """Update the selection and perform neighbor statistics. diff --git a/deepmd/pt/model/model/ener_model.py b/deepmd/pt/model/model/ener_model.py index 92b2b95e34..1a5706dbbf 100644 --- a/deepmd/pt/model/model/ener_model.py +++ b/deepmd/pt/model/model/ener_model.py @@ -1,16 +1,11 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Dict, - List, Optional, ) import torch -from deepmd.utils.data import ( - DataRequirementItem, -) - from .dp_model import ( DPModel, ) @@ -100,45 +95,3 @@ def forward_lower( else: model_predict = model_ret return model_predict - - @property - def data_requirement(self) -> List[DataRequirementItem]: - data_requirement = [ - DataRequirementItem( - "energy", - ndof=1, - atomic=False, - must=False, - high_prec=True, - ), - DataRequirementItem( - "force", - ndof=3, - atomic=True, - must=False, - high_prec=False, - ), - DataRequirementItem( - "virial", - ndof=9, - atomic=False, - must=False, - high_prec=False, - ), - DataRequirementItem( - "atom_ener", - ndof=1, - atomic=True, - must=False, - high_prec=False, - ), - DataRequirementItem( - "atom_pref", - ndof=1, - atomic=True, - must=False, - high_prec=False, - repeat=3, - ), - ] - return data_requirement diff --git a/deepmd/pt/model/model/model.py b/deepmd/pt/model/model/model.py index 1b82402747..e32d2f307d 100644 --- a/deepmd/pt/model/model/model.py +++ b/deepmd/pt/model/model/model.py @@ -1,15 +1,11 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( - List, Optional, ) from deepmd.dpmodel.model.base_model import ( make_base_model, ) -from deepmd.utils.data import ( - DataRequirementItem, -) from deepmd.utils.path import ( DPPath, ) @@ -88,7 +84,3 @@ def compute_or_load_stat( The path to the statistics files. 
""" raise NotImplementedError - - def data_requirement(self) -> List[DataRequirementItem]: - """Get the data requirement for the model.""" - raise NotImplementedError diff --git a/deepmd/pt/model/model/polar_model.py b/deepmd/pt/model/model/polar_model.py index c23e26afac..d956a0344c 100644 --- a/deepmd/pt/model/model/polar_model.py +++ b/deepmd/pt/model/model/polar_model.py @@ -1,16 +1,11 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( Dict, - List, Optional, ) import torch -from deepmd.utils.data import ( - DataRequirementItem, -) - from .dp_model import ( DPModel, ) @@ -79,25 +74,3 @@ def forward_lower( else: model_predict = model_ret return model_predict - - @property - def get_data_requirement(self) -> List[DataRequirementItem]: - data_requirement = [ - DataRequirementItem( - "polar", - ndof=9, - atomic=False, - must=False, - high_prec=False, - type_sel=self.get_sel_type(), - ), - DataRequirementItem( - "atomic_polar", - ndof=9, - atomic=True, - must=False, - high_prec=False, - type_sel=self.get_sel_type(), - ), - ] - return data_requirement diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 6b62282817..21a285f540 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -202,12 +202,16 @@ def get_data_loader(_training_data, _validation_data, _training_params): ) def get_single_model( - _model_params, _training_data, _validation_data, _stat_file_path + _model_params, + _training_data, + _validation_data, + _stat_file_path, + _data_requirement, ): model = get_model(deepcopy(_model_params)).to(DEVICE) - _training_data.add_data_requirement(model.data_requirement) + _training_data.add_data_requirement(_data_requirement) if _validation_data is not None: - _validation_data.add_data_requirement(model.data_requirement) + _validation_data.add_data_requirement(_data_requirement) if not resuming: @lazy @@ -262,11 +266,33 @@ def get_loss(loss_params, start_lr, _ntypes): else: self.opt_type, self.opt_param = get_opt_param(training_params) + # Loss + if not self.multi_task: + self.loss = get_loss( + config["loss"], + config["learning_rate"]["start_lr"], + len(model_params["type_map"]), + ) + else: + self.loss = {} + for model_key in self.model_keys: + loss_param = config["loss_dict"][model_key] + if config.get("learning_rate_dict", None) is not None: + lr_param = config["learning_rate_dict"][model_key]["start_lr"] + else: + lr_param = config["learning_rate"]["start_lr"] + ntypes = len(model_params["model_dict"][model_key]["type_map"]) + self.loss[model_key] = get_loss(loss_param, lr_param, ntypes) + # Data + Model dp_random.seed(training_params["seed"]) if not self.multi_task: self.model = get_single_model( - model_params, training_data, validation_data, stat_file_path + model_params, + training_data, + validation_data, + stat_file_path, + self.loss.label_requirement, ) ( self.training_dataloader, @@ -290,6 +316,7 @@ def get_loss(loss_params, start_lr, _ntypes): training_data[model_key], validation_data[model_key], stat_file_path[model_key], + self.loss[model_key].label_requirement, ) ( self.training_dataloader[model_key], @@ -316,24 +343,6 @@ def get_loss(loss_params, start_lr, _ntypes): else: self.lr_exp = get_lr(config["learning_rate"]) - # Loss - if not self.multi_task: - self.loss = get_loss( - config["loss"], - config["learning_rate"]["start_lr"], - len(model_params["type_map"]), - ) - else: - self.loss = {} - for model_key in self.model_keys: - loss_param = config["loss_dict"][model_key] - if config.get("learning_rate_dict", 
None) is not None: - lr_param = config["learning_rate_dict"][model_key]["start_lr"] - else: - lr_param = config["learning_rate"]["start_lr"] - ntypes = len(model_params["model_dict"][model_key]["type_map"]) - self.loss[model_key] = get_loss(loss_param, lr_param, ntypes) - # JIT if JIT: self.model = torch.jit.script(self.model) diff --git a/source/tests/pt/test_stat.py b/source/tests/pt/test_stat.py index 54810fcc8f..3a09f82baf 100644 --- a/source/tests/pt/test_stat.py +++ b/source/tests/pt/test_stat.py @@ -44,42 +44,50 @@ from deepmd.tf.utils.data_system import ( DeepmdDataSystem, ) +from deepmd.utils.data import ( + DataRequirementItem, +) CUR_DIR = os.path.dirname(__file__) -energy_data_requirement = { - "energy": { - "ndof": 1, - "atomic": False, - "must": False, - "high_prec": True, - }, - "force": { - "ndof": 3, - "atomic": True, - "must": False, - "high_prec": False, - }, - "virial": { - "ndof": 9, - "atomic": False, - "must": False, - "high_prec": False, - }, - "atom_ener": { - "ndof": 1, - "atomic": True, - "must": False, - "high_prec": False, - }, - "atom_pref": { - "ndof": 1, - "atomic": True, - "must": False, - "high_prec": False, - "repeat": 3, - }, -} +energy_data_requirement = [ + DataRequirementItem( + "energy", + ndof=1, + atomic=False, + must=False, + high_prec=True, + ), + DataRequirementItem( + "force", + ndof=3, + atomic=True, + must=False, + high_prec=False, + ), + DataRequirementItem( + "virial", + ndof=9, + atomic=False, + must=False, + high_prec=False, + ), + DataRequirementItem( + "atom_ener", + ndof=1, + atomic=True, + must=False, + high_prec=False, + ), + DataRequirementItem( + "atom_pref", + ndof=1, + atomic=True, + must=False, + high_prec=False, + repeat=3, + ), +] def compare(ut, base, given): From 525ce93cc97dd8f3bbf56dd1539974dd73114cc8 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 18:38:43 +0800 Subject: [PATCH 24/37] resolve conversations --- deepmd/dpmodel/descriptor/make_base_descriptor.py | 6 +++++- deepmd/dpmodel/descriptor/se_e2_a.py | 6 +++++- deepmd/dpmodel/descriptor/se_r.py | 6 +++++- deepmd/pt/model/descriptor/descriptor.py | 5 +++++ deepmd/pt/model/descriptor/dpa1.py | 5 +++++ deepmd/pt/model/descriptor/dpa2.py | 5 +++++ deepmd/pt/model/descriptor/hybrid.py | 5 +++++ deepmd/pt/model/descriptor/se_a.py | 5 +++++ deepmd/pt/model/descriptor/se_r.py | 5 +++++ deepmd/pt/model/network/network.py | 5 +++++ deepmd/pt/model/task/dipole.py | 2 +- deepmd/pt/model/task/fitting.py | 5 +++++ deepmd/pt/model/task/polarizability.py | 2 +- deepmd/pt/train/wrapper.py | 5 +++++ 14 files changed, 62 insertions(+), 5 deletions(-) diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py b/deepmd/dpmodel/descriptor/make_base_descriptor.py index 11db208077..940bd0cd27 100644 --- a/deepmd/dpmodel/descriptor/make_base_descriptor.py +++ b/deepmd/dpmodel/descriptor/make_base_descriptor.py @@ -88,7 +88,11 @@ def mixed_types(self) -> bool: @abstractmethod def share_params(self, base_class, shared_level, resume=False): - """Share the parameters of self to the base_class with shared_level.""" + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. 
+ """ pass def compute_input_stats( diff --git a/deepmd/dpmodel/descriptor/se_e2_a.py b/deepmd/dpmodel/descriptor/se_e2_a.py index 14010c34e2..f6b1c5677e 100644 --- a/deepmd/dpmodel/descriptor/se_e2_a.py +++ b/deepmd/dpmodel/descriptor/se_e2_a.py @@ -244,7 +244,11 @@ def mixed_types(self): return False def share_params(self, base_class, shared_level, resume=False): - """Share the parameters of self to the base_class with shared_level.""" + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ raise NotImplementedError def get_ntypes(self) -> int: diff --git a/deepmd/dpmodel/descriptor/se_r.py b/deepmd/dpmodel/descriptor/se_r.py index feea008478..fda8b19474 100644 --- a/deepmd/dpmodel/descriptor/se_r.py +++ b/deepmd/dpmodel/descriptor/se_r.py @@ -204,7 +204,11 @@ def mixed_types(self): return False def share_params(self, base_class, shared_level, resume=False): - """Share the parameters of self to the base_class with shared_level.""" + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ raise NotImplementedError def get_ntypes(self) -> int: diff --git a/deepmd/pt/model/descriptor/descriptor.py b/deepmd/pt/model/descriptor/descriptor.py index 778523a14d..339a716942 100644 --- a/deepmd/pt/model/descriptor/descriptor.py +++ b/deepmd/pt/model/descriptor/descriptor.py @@ -101,6 +101,11 @@ def get_stats(self) -> Dict[str, StatItem]: raise NotImplementedError def share_params(self, base_class, shared_level, resume=False): + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ assert ( self.__class__ == base_class.__class__ ), "Only descriptors of the same type can share params!" diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py index 08c37487de..ddb1d0ea05 100644 --- a/deepmd/pt/model/descriptor/dpa1.py +++ b/deepmd/pt/model/descriptor/dpa1.py @@ -148,6 +148,11 @@ def mixed_types(self) -> bool: return self.se_atten.mixed_types() def share_params(self, base_class, shared_level, resume=False): + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ assert ( self.__class__ == base_class.__class__ ), "Only descriptors of the same type can share params!" diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index d407452e46..3a4319860f 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -292,6 +292,11 @@ def mixed_types(self) -> bool: return True def share_params(self, base_class, shared_level, resume=False): + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. 
+ """ assert ( self.__class__ == base_class.__class__ ), "Only descriptors of the same type can share params!" diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py index 40ff3e5c7f..2c68afa892 100644 --- a/deepmd/pt/model/descriptor/hybrid.py +++ b/deepmd/pt/model/descriptor/hybrid.py @@ -146,6 +146,11 @@ def dim_emb(self): raise RuntimeError def share_params(self, base_class, shared_level, resume=False): + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ assert ( self.__class__ == base_class.__class__ ), "Only descriptors of the same type can share params!" diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py index 9a9883cdb3..eddfcf4047 100644 --- a/deepmd/pt/model/descriptor/se_a.py +++ b/deepmd/pt/model/descriptor/se_a.py @@ -130,6 +130,11 @@ def mixed_types(self): return self.sea.mixed_types() def share_params(self, base_class, shared_level, resume=False): + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ assert ( self.__class__ == base_class.__class__ ), "Only descriptors of the same type can share params!" diff --git a/deepmd/pt/model/descriptor/se_r.py b/deepmd/pt/model/descriptor/se_r.py index ba483ea711..4e7e516065 100644 --- a/deepmd/pt/model/descriptor/se_r.py +++ b/deepmd/pt/model/descriptor/se_r.py @@ -154,6 +154,11 @@ def mixed_types(self) -> bool: return False def share_params(self, base_class, shared_level, resume=False): + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ assert ( self.__class__ == base_class.__class__ ), "Only descriptors of the same type can share params!" diff --git a/deepmd/pt/model/network/network.py b/deepmd/pt/model/network/network.py index 9ef7b3366a..10d0364c9b 100644 --- a/deepmd/pt/model/network/network.py +++ b/deepmd/pt/model/network/network.py @@ -575,6 +575,11 @@ def forward(self, atype): return self.embedding(atype) def share_params(self, base_class, shared_level, resume=False): + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ assert ( self.__class__ == base_class.__class__ ), "Only TypeEmbedNet of the same type can share params!" 
diff --git a/deepmd/pt/model/task/dipole.py b/deepmd/pt/model/task/dipole.py index 00de5276ee..6956d2ce25 100644 --- a/deepmd/pt/model/task/dipole.py +++ b/deepmd/pt/model/task/dipole.py @@ -93,7 +93,7 @@ def __init__( self.r_differentiable = r_differentiable self.c_differentiable = c_differentiable super().__init__( - var_name="dipole" if "var_name" not in kwargs else kwargs.pop("var_name"), + var_name=kwargs.pop("var_name", "dipole"), ntypes=ntypes, dim_descrpt=dim_descrpt, neuron=neuron, diff --git a/deepmd/pt/model/task/fitting.py b/deepmd/pt/model/task/fitting.py index d752ac964c..47535580db 100644 --- a/deepmd/pt/model/task/fitting.py +++ b/deepmd/pt/model/task/fitting.py @@ -62,6 +62,11 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls) def share_params(self, base_class, shared_level, resume=False): + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ assert ( self.__class__ == base_class.__class__ ), "Only fitting nets of the same type can share params!" diff --git a/deepmd/pt/model/task/polarizability.py b/deepmd/pt/model/task/polarizability.py index 37c802613a..3c40e9f734 100644 --- a/deepmd/pt/model/task/polarizability.py +++ b/deepmd/pt/model/task/polarizability.py @@ -115,7 +115,7 @@ def __init__( ).view(ntypes, 1) self.shift_diag = shift_diag super().__init__( - var_name="polar" if "var_name" not in kwargs else kwargs.pop("var_name"), + var_name=kwargs.pop("var_name", "polar"), ntypes=ntypes, dim_descrpt=dim_descrpt, neuron=neuron, diff --git a/deepmd/pt/train/wrapper.py b/deepmd/pt/train/wrapper.py index 52cc636c10..67f8043653 100644 --- a/deepmd/pt/train/wrapper.py +++ b/deepmd/pt/train/wrapper.py @@ -83,6 +83,11 @@ def set_trainable_params(self): param.requires_grad = trainable def share_params(self, shared_links, resume=False): + """ + Share the parameters of classes following rules defined in shared_links during multitask training. + If not start from checkpoint (resume is False), + some seperated parameters (e.g. mean and stddev) will be re-calculated across different classes. 
+ """ supported_types = ["descriptor", "fitting_net"] for shared_item in shared_links: class_name = shared_links[shared_item]["type"] From 46ee16c8dd91942329db32f32549820f1089ef62 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 18:43:55 +0800 Subject: [PATCH 25/37] set label_requirement abstractmethod --- deepmd/pt/loss/loss.py | 9 +++++++-- deepmd/pt/utils/finetune.py | 5 +---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/deepmd/pt/loss/loss.py b/deepmd/pt/loss/loss.py index 7059d76e03..925ff8f4ef 100644 --- a/deepmd/pt/loss/loss.py +++ b/deepmd/pt/loss/loss.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from abc import ( + ABC, + abstractmethod, +) from typing import ( List, ) @@ -10,7 +14,7 @@ ) -class TaskLoss(torch.nn.Module): +class TaskLoss(torch.nn.Module, ABC): def __init__(self, **kwargs): """Construct loss.""" super().__init__() @@ -20,6 +24,7 @@ def forward(self, model_pred, label, natoms, learning_rate): raise NotImplementedError @property + @abstractmethod def label_requirement(self) -> List[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" - raise NotImplementedError + pass diff --git a/deepmd/pt/utils/finetune.py b/deepmd/pt/utils/finetune.py index b08dc9fbef..c8fa1e5185 100644 --- a/deepmd/pt/utils/finetune.py +++ b/deepmd/pt/utils/finetune.py @@ -19,10 +19,7 @@ def change_finetune_model_params( - ckpt & finetune_model: origin model. - config: Read from json file. """ - if multi_task: - # TODO - pass - # log.error("finetune mode need modification for multitask mode!") + # TODO need support for multitask mode if finetune_model is not None: state_dict = torch.load(finetune_model, map_location=env.DEVICE) if "model" in state_dict: From 9d18dc4f043e9af82dcbef748de6627a95737928 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 18:54:44 +0800 Subject: [PATCH 26/37] make label_requirement dynamic --- deepmd/pt/loss/ener.py | 135 +++++++++++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 40 deletions(-) diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 648e954401..2834733112 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -30,16 +30,57 @@ def __init__( limit_pref_f=0.0, start_pref_v=0.0, limit_pref_v=0.0, + start_pref_ae: float = 0.0, + limit_pref_ae: float = 0.0, + start_pref_pf: float = 0.0, + limit_pref_pf: float = 0.0, use_l1_all: bool = False, inference=False, **kwargs, ): - """Construct a layer to compute loss on energy, force and virial.""" + r"""Construct a layer to compute loss on energy, force and virial. + + Parameters + ---------- + starter_learning_rate : float + The learning rate at the start of the training. + start_pref_e : float + The prefactor of energy loss at the start of the training. + limit_pref_e : float + The prefactor of energy loss at the end of the training. + start_pref_f : float + The prefactor of force loss at the start of the training. + limit_pref_f : float + The prefactor of force loss at the end of the training. + start_pref_v : float + The prefactor of virial loss at the start of the training. + limit_pref_v : float + The prefactor of virial loss at the end of the training. + start_pref_ae : float + The prefactor of atomic energy loss at the start of the training. + limit_pref_ae : float + The prefactor of atomic energy loss at the end of the training. 
+ start_pref_pf : float + The prefactor of atomic prefactor force loss at the start of the training. + limit_pref_pf : float + The prefactor of atomic prefactor force loss at the end of the training. + use_l1_all : bool + Whether to use L1 loss, if False (default), it will use L2 loss. + inference : bool + If true, it will output all losses found in output, ignoring the pre-factors. + **kwargs + Other keyword arguments. + """ super().__init__() self.starter_learning_rate = starter_learning_rate self.has_e = (start_pref_e != 0.0 and limit_pref_e != 0.0) or inference self.has_f = (start_pref_f != 0.0 and limit_pref_f != 0.0) or inference self.has_v = (start_pref_v != 0.0 and limit_pref_v != 0.0) or inference + + # TODO need support for atomic energy and atomic pref + self.has_ae = (start_pref_ae != 0.0 and limit_pref_ae != 0.0) or inference + self.has_pf = (start_pref_pf != 0.0 and limit_pref_pf != 0.0) or inference + self.start_pref_e = start_pref_e self.limit_pref_e = limit_pref_e self.start_pref_f = start_pref_f @@ -164,42 +205,56 @@ def forward(self, model_pred, label, natoms, learning_rate, mae=False): @property def label_requirement(self) -> List[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" - data_requirement = [ - DataRequirementItem( - "energy", - ndof=1, - atomic=False, - must=False, - high_prec=True, - ), - DataRequirementItem( - "force", - ndof=3, - atomic=True, - must=False, - high_prec=False, - ), - DataRequirementItem( - "virial", - ndof=9, - atomic=False, - must=False, - high_prec=False, - ), - DataRequirementItem( - "atom_ener", - ndof=1, - atomic=True, - must=False, - high_prec=False, - ), - DataRequirementItem( - "atom_pref", - ndof=1, - atomic=True, - must=False, - high_prec=False, - repeat=3, - ), - ] - return data_requirement + label_requirement = [] + if self.has_e: + label_requirement.append( + DataRequirementItem( + "energy", + ndof=1, + atomic=False, + must=False, + high_prec=True, + ) + ) + if self.has_f: + label_requirement.append( + DataRequirementItem( + "force", + ndof=3, + atomic=True, + must=False, + high_prec=False, + ) + ) + if self.has_v: + label_requirement.append( + DataRequirementItem( + "virial", + ndof=9, + atomic=False, + must=False, + high_prec=False, + ) + ) + if self.has_ae: + label_requirement.append( + DataRequirementItem( + "atom_ener", + ndof=1, + atomic=True, + must=False, + high_prec=False, + ) + ) + if self.has_pf: + label_requirement.append( + DataRequirementItem( + "atom_pref", + ndof=1, + atomic=True, + must=False, + high_prec=False, + repeat=3, + ) + ) + return label_requirement From ad7227dc65b97d7b4797f6fdae00990466f25368 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 19:11:48 +0800 Subject: [PATCH 27/37] update docs --- deepmd/pt/model/descriptor/descriptor.py | 17 +++++++++++- deepmd/pt/model/descriptor/dpa1.py | 16 +++++++++++ deepmd/pt/model/descriptor/dpa2.py | 16 +++++++++++ deepmd/pt/model/descriptor/hybrid.py | 17 +++++++++++- deepmd/pt/model/descriptor/repformers.py | 17 +++++++++++- deepmd/pt/model/descriptor/se_a.py | 34 ++++++++++++++++++++++-- deepmd/pt/model/descriptor/se_atten.py | 17 +++++++++++- deepmd/pt/model/descriptor/se_r.py | 17 +++++++++++- deepmd/pt/model/task/dipole.py | 16 +++++++++++ deepmd/pt/model/task/ener.py | 16 +++++++++++ deepmd/pt/model/task/polarizability.py | 16 +++++++++++ 11 files changed, 192 insertions(+), 7 deletions(-) diff --git a/deepmd/pt/model/descriptor/descriptor.py 
b/deepmd/pt/model/descriptor/descriptor.py index 339a716942..24c1ef4dab 100644 --- a/deepmd/pt/model/descriptor/descriptor.py +++ b/deepmd/pt/model/descriptor/descriptor.py @@ -93,7 +93,22 @@ def compute_input_stats( merged: Union[Callable[[], List[dict]], List[dict]], path: Optional[DPPath] = None, ): - """Update mean and stddev for DescriptorBlock elements.""" + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. + + """ raise NotImplementedError def get_stats(self) -> Dict[str, StatItem]: diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py index ddb1d0ea05..224a24d60e 100644 --- a/deepmd/pt/model/descriptor/dpa1.py +++ b/deepmd/pt/model/descriptor/dpa1.py @@ -183,6 +183,22 @@ def compute_input_stats( merged: Union[Callable[[], List[dict]], List[dict]], path: Optional[DPPath] = None, ): + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. + + """ return self.se_atten.compute_input_stats(merged, path) def serialize(self) -> dict: diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index 3a4319860f..dcb381d53a 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -345,6 +345,22 @@ def compute_input_stats( merged: Union[Callable[[], List[dict]], List[dict]], path: Optional[DPPath] = None, ): + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. 
+ + """ for ii, descrpt in enumerate([self.repinit, self.repformers]): descrpt.compute_input_stats(merged, path) diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py index 2c68afa892..bf3145f9f2 100644 --- a/deepmd/pt/model/descriptor/hybrid.py +++ b/deepmd/pt/model/descriptor/hybrid.py @@ -167,7 +167,22 @@ def compute_input_stats( merged: Union[Callable[[], List[dict]], List[dict]], path: Optional[DPPath] = None, ): - """Update mean and stddev for descriptor elements.""" + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. + + """ for ii, descrpt in enumerate(self.descriptor_list): # need support for hybrid descriptors descrpt.compute_input_stats(merged, path) diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py index eec07d8854..3e8bf72f77 100644 --- a/deepmd/pt/model/descriptor/repformers.py +++ b/deepmd/pt/model/descriptor/repformers.py @@ -285,7 +285,22 @@ def compute_input_stats( merged: Union[Callable[[], List[dict]], List[dict]], path: Optional[DPPath] = None, ): - """Update mean and stddev for descriptor elements.""" + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. + + """ env_mat_stat = EnvMatStatSe(self) if path is not None: path = path / env_mat_stat.get_hash() diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py index eddfcf4047..d836b48992 100644 --- a/deepmd/pt/model/descriptor/se_a.py +++ b/deepmd/pt/model/descriptor/se_a.py @@ -157,7 +157,22 @@ def compute_input_stats( merged: Union[Callable[[], List[dict]], List[dict]], path: Optional[DPPath] = None, ): - """Update mean and stddev for descriptor elements.""" + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. 
+ + """ return self.sea.compute_input_stats(merged, path) def reinit_exclude( @@ -440,7 +455,22 @@ def compute_input_stats( merged: Union[Callable[[], List[dict]], List[dict]], path: Optional[DPPath] = None, ): - """Update mean and stddev for descriptor elements.""" + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. + + """ env_mat_stat = EnvMatStatSe(self) if path is not None: path = path / env_mat_stat.get_hash() diff --git a/deepmd/pt/model/descriptor/se_atten.py b/deepmd/pt/model/descriptor/se_atten.py index 7165ed0cf4..c4b3757854 100644 --- a/deepmd/pt/model/descriptor/se_atten.py +++ b/deepmd/pt/model/descriptor/se_atten.py @@ -207,7 +207,22 @@ def compute_input_stats( merged: Union[Callable[[], List[dict]], List[dict]], path: Optional[DPPath] = None, ): - """Update mean and stddev for descriptor elements.""" + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. + + """ env_mat_stat = EnvMatStatSe(self) if path is not None: path = path / env_mat_stat.get_hash() diff --git a/deepmd/pt/model/descriptor/se_r.py b/deepmd/pt/model/descriptor/se_r.py index 4e7e516065..643d1ad558 100644 --- a/deepmd/pt/model/descriptor/se_r.py +++ b/deepmd/pt/model/descriptor/se_r.py @@ -191,7 +191,22 @@ def compute_input_stats( merged: Union[Callable[[], List[dict]], List[dict]], path: Optional[DPPath] = None, ): - """Update mean and stddev for descriptor elements.""" + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. 
+ + """ env_mat_stat = EnvMatStatSe(self) if path is not None: path = path / env_mat_stat.get_hash() diff --git a/deepmd/pt/model/task/dipole.py b/deepmd/pt/model/task/dipole.py index 6956d2ce25..7d2dd221db 100644 --- a/deepmd/pt/model/task/dipole.py +++ b/deepmd/pt/model/task/dipole.py @@ -141,6 +141,22 @@ def compute_output_stats( merged: Union[Callable[[], List[dict]], List[dict]], stat_file_path: Optional[DPPath] = None, ): + """ + Compute the output statistics (e.g. energy bias) for the fitting net from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + stat_file_path : Optional[DPPath] + The path to the stat file. + + """ raise NotImplementedError def forward( diff --git a/deepmd/pt/model/task/ener.py b/deepmd/pt/model/task/ener.py index 404b92a10a..29ed5acaad 100644 --- a/deepmd/pt/model/task/ener.py +++ b/deepmd/pt/model/task/ener.py @@ -145,6 +145,22 @@ def compute_output_stats( merged: Union[Callable[[], List[dict]], List[dict]], stat_file_path: Optional[DPPath] = None, ): + """ + Compute the output statistics (e.g. energy bias) for the fitting net from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + stat_file_path : Optional[DPPath] + The path to the stat file. + + """ if stat_file_path is not None: stat_file_path = stat_file_path / "bias_atom_e" if stat_file_path is not None and stat_file_path.is_file(): diff --git a/deepmd/pt/model/task/polarizability.py b/deepmd/pt/model/task/polarizability.py index 3c40e9f734..9483d1eb4a 100644 --- a/deepmd/pt/model/task/polarizability.py +++ b/deepmd/pt/model/task/polarizability.py @@ -168,6 +168,22 @@ def compute_output_stats( merged: Union[Callable[[], List[dict]], List[dict]], stat_file_path: Optional[DPPath] = None, ): + """ + Compute the output statistics (e.g. energy bias) for the fitting net from packed data. + + Parameters + ---------- + merged : Union[Callable[[], List[dict]], List[dict]] + - List[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` + originating from the `i`-th data system. + - Callable[[], List[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + stat_file_path : Optional[DPPath] + The path to the stat file. 
+ + """ raise NotImplementedError def forward( From 35598d2d49da07a81f4680a88e16fa1b4ec8e915 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 19:16:58 +0800 Subject: [PATCH 28/37] replace lazy with functools.lru_cache --- deepmd/pt/train/training.py | 4 ++-- deepmd/pt/utils/dataloader.py | 17 ----------------- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 21a285f540..6fe8562ad6 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import functools import logging import time from copy import ( @@ -38,7 +39,6 @@ from deepmd.pt.utils.dataloader import ( BufferedIterator, get_weighted_sampler, - lazy, ) from deepmd.pt.utils.env import ( DEVICE, @@ -214,7 +214,7 @@ def get_single_model( _validation_data.add_data_requirement(_data_requirement) if not resuming: - @lazy + @functools.lru_cache def get_sample(): sampled = make_stat_input( _training_data.systems, diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index b197f46124..65a96418c9 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -262,20 +262,3 @@ def get_weighted_sampler(training_data, prob_style, sys_prob=False): with torch.device("cpu"): sampler = WeightedRandomSampler(probs, len_sampler, replacement=True) return sampler - - -class LazyFunction: - def __init__(self, func): - self.func = func - self.result = None - self.called = False - - def __call__(self, *args, **kwargs): - if not self.called: - self.result = self.func(*args, **kwargs) - self.called = True - return self.result - - -def lazy(func): - return LazyFunction(func) From c0a0cfcd9d12ecec2c3ed617a47aba3b9767c4cf Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 29 Feb 2024 19:26:06 +0800 Subject: [PATCH 29/37] Update training.py --- deepmd/pt/train/training.py | 76 +++++++++++++++---------------------- 1 file changed, 30 insertions(+), 46 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 6fe8562ad6..1e25da77fb 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -128,61 +128,45 @@ def get_opt_param(params): return opt_type, opt_param def get_data_loader(_training_data, _validation_data, _training_params): - if "auto_prob" in _training_params["training_data"]: - train_sampler = get_weighted_sampler( - _training_data, _training_params["training_data"]["auto_prob"] - ) - elif "sys_probs" in _training_params["training_data"]: - train_sampler = get_weighted_sampler( - _training_data, - _training_params["training_data"]["sys_probs"], - sys_prob=True, - ) - else: - train_sampler = get_weighted_sampler(_training_data, "prob_sys_size") - - if train_sampler is None: - log.warning( - "Sampler not specified!" - ) # None sampler will lead to a premature stop iteration. Replacement should be True in attribute of the sampler to produce expected number of items in one iteration. 
- training_dataloader = DataLoader( - _training_data, - sampler=train_sampler, - batch_size=None, - num_workers=NUM_WORKERS, # setting to 0 diverges the behavior of its iterator; should be >=1 - drop_last=False, - pin_memory=True, - ) - with torch.device("cpu"): - training_data_buffered = BufferedIterator(iter(training_dataloader)) - if _validation_data is not None: - if "auto_prob" in _training_params["validation_data"]: - valid_sampler = get_weighted_sampler( - _validation_data, - _training_params["validation_data"]["auto_prob"], + def get_dataloader_and_buffer(_data, _params): + if "auto_prob" in _training_params["training_data"]: + _sampler = get_weighted_sampler( + _data, _params["training_data"]["auto_prob"] ) - elif "sys_probs" in _training_params["validation_data"]: - valid_sampler = get_weighted_sampler( - _validation_data, - _training_params["validation_data"]["sys_probs"], + elif "sys_probs" in _training_params["training_data"]: + _sampler = get_weighted_sampler( + _data, + _params["training_data"]["sys_probs"], sys_prob=True, ) else: - valid_sampler = get_weighted_sampler( - _validation_data, "prob_sys_size" - ) - validation_dataloader = DataLoader( - _validation_data, - sampler=valid_sampler, + _sampler = get_weighted_sampler(_data, "prob_sys_size") + + if _sampler is None: + log.warning( + "Sampler not specified!" + ) # None sampler will lead to a premature stop iteration. Replacement should be True in attribute of the sampler to produce expected number of items in one iteration. + _dataloader = DataLoader( + _data, + sampler=_sampler, batch_size=None, - num_workers=min(NUM_WORKERS, 1), + num_workers=NUM_WORKERS, # setting to 0 diverges the behavior of its iterator; should be >=1 drop_last=False, pin_memory=True, ) with torch.device("cpu"): - validation_data_buffered = BufferedIterator( - iter(validation_dataloader) - ) + _data_buffered = BufferedIterator(iter(_dataloader)) + return _dataloader, _data_buffered + + training_dataloader, training_data_buffered = get_dataloader_and_buffer( + _training_data, _training_params + ) + + if _validation_data is not None: + ( + validation_dataloader, + validation_data_buffered, + ) = get_dataloader_and_buffer(_validation_data, _training_params) if _training_params.get("validation_data", None) is not None: valid_numb_batch = _training_params["validation_data"].get( "numb_btch", 1 From 66edca55617edde721a0fd6b32a45b133defa118 Mon Sep 17 00:00:00 2001 From: Han Wang <92130845+wanghan-iapcm@users.noreply.github.com> Date: Thu, 29 Feb 2024 20:17:00 +0800 Subject: [PATCH 30/37] Update deepmd/pt/train/training.py Signed-off-by: Han Wang <92130845+wanghan-iapcm@users.noreply.github.com> --- deepmd/pt/train/training.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 1e25da77fb..bdbee19108 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -167,12 +167,7 @@ def get_dataloader_and_buffer(_data, _params): validation_dataloader, validation_data_buffered, ) = get_dataloader_and_buffer(_validation_data, _training_params) - if _training_params.get("validation_data", None) is not None: - valid_numb_batch = _training_params["validation_data"].get( - "numb_btch", 1 - ) - else: - valid_numb_batch = 1 + valid_numb_batch = _training_params["validation_data"].get("numb_btch", 1) else: validation_dataloader = None validation_data_buffered = None From d5a1549bbbdc1e9ed389d09bc79e856569e2a90f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
From d5a1549bbbdc1e9ed389d09bc79e856569e2a90f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 29 Feb 2024 12:17:21 +0000
Subject: [PATCH 31/37] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt/train/training.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index bdbee19108..b8d13e6f25 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -167,7 +167,9 @@ def get_dataloader_and_buffer(_data, _params):
                     validation_dataloader,
                     validation_data_buffered,
                 ) = get_dataloader_and_buffer(_validation_data, _training_params)
-                valid_numb_batch = _training_params["validation_data"].get("numb_btch", 1)
+                valid_numb_batch = _training_params["validation_data"].get(
+                    "numb_btch", 1
+                )
             else:
                 validation_dataloader = None
                 validation_data_buffered = None

From e17546ad6888257926c11a1ca0929069f4264295 Mon Sep 17 00:00:00 2001
From: Duo <50307526+iProzd@users.noreply.github.com>
Date: Fri, 1 Mar 2024 00:39:50 +0800
Subject: [PATCH 32/37] Update test_multitask.py

---
 source/tests/pt/test_multitask.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/source/tests/pt/test_multitask.py b/source/tests/pt/test_multitask.py
index 0c1abf1f44..3c0240dbdc 100644
--- a/source/tests/pt/test_multitask.py
+++ b/source/tests/pt/test_multitask.py
@@ -15,6 +15,9 @@
 from deepmd.pt.entrypoints.main import (
     get_trainer,
 )
+from deepmd.pt.utils.multi_task import (
+    preprocess_shared_params,
+)
 
 from .model.test_permutation import (
     model_dpa1,
@@ -29,7 +32,7 @@ class MultiTaskTrainTest:
     def test_multitask_train(self):
-        trainer = get_trainer(deepcopy(self.config))
+        trainer = get_trainer(deepcopy(self.config), shared_links=self.shared_links)
         trainer.run()
         # check model keys
         self.assertEqual(len(trainer.wrapper.model), 2)
@@ -90,6 +93,9 @@ def setUp(self):
         ] = f"{self.stat_files}/model_2"
         self.config["training"]["numb_steps"] = 1
         self.config["training"]["save_freq"] = 1
+        self.config["model"], self.shared_links = preprocess_shared_params(
+            self.config["model"]
+        )
 
     def tearDown(self) -> None:
         MultiTaskTrainTest.tearDown(self)
@@ -125,6 +131,9 @@ def setUp(self):
         ] = f"{self.stat_files}/model_2"
         self.config["training"]["numb_steps"] = 1
         self.config["training"]["save_freq"] = 1
+        self.config["model"], self.shared_links = preprocess_shared_params(
+            self.config["model"]
+        )
 
     def tearDown(self) -> None:
         MultiTaskTrainTest.tearDown(self)
@@ -160,6 +169,9 @@ def setUp(self):
         ] = f"{self.stat_files}/model_2"
         self.config["training"]["numb_steps"] = 1
         self.config["training"]["save_freq"] = 1
+        self.config["model"], self.shared_links = preprocess_shared_params(
+            self.config["model"]
+        )
 
     def tearDown(self) -> None:
         MultiTaskTrainTest.tearDown(self)
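Note on [PATCH 32/37]: every multitask entry point now follows the same two-step recipe: preprocess the model section once, keep the returned `shared_links`, and hand both to the trainer. A condensed usage sketch distilled from the test diff (`load_multitask_config` is a hypothetical stand-in for however the JSON input is read, not a deepmd API; it must yield a dict whose `"model"` section contains a multitask `model_dict`):

    from copy import deepcopy

    from deepmd.pt.entrypoints.main import get_trainer
    from deepmd.pt.utils.multi_task import preprocess_shared_params

    config = load_multitask_config()  # hypothetical loader, not a deepmd API
    config["model"], shared_links = preprocess_shared_params(config["model"])
    trainer = get_trainer(deepcopy(config), shared_links=shared_links)
    trainer.run()

As the surrounding diffs suggest, `shared_links` records which blocks are shared between tasks, and the trainer later replays it through `wrapper.share_params(shared_links, ...)`; [PATCH 33/37] below adjusts exactly that call for DDP.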
From 1debf4f7a00c3dbe9e60237be470ff4a04a5db91 Mon Sep 17 00:00:00 2001
From: Duo <50307526+iProzd@users.noreply.github.com>
Date: Fri, 1 Mar 2024 01:23:12 +0800
Subject: [PATCH 33/37] Fix h5py files in multitask DDP

---
 deepmd/pt/entrypoints/main.py | 13 ++++++++++---
 deepmd/pt/train/training.py   |  4 ++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index 12a3a01187..023bc5305e 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -101,7 +101,7 @@ def get_trainer(
     config["model"]["resuming"] = (finetune_model is not None) or (ckpt is not None)
 
     def prepare_trainer_input_single(
-        model_params_single, data_dict_single, loss_dict_single, suffix=""
+        model_params_single, data_dict_single, loss_dict_single, suffix="", rank=0
     ):
         training_dataset_params = data_dict_single["training_data"]
         type_split = False
@@ -115,7 +115,9 @@ def prepare_trainer_input_single(
 
         # stat files
        stat_file_path_single = data_dict_single.get("stat_file", None)
-        if stat_file_path_single is not None:
+        if rank != 0:
+            stat_file_path_single = None
+        elif stat_file_path_single is not None:
             if Path(stat_file_path_single).is_dir():
                 raise ValueError(
                     f"stat_file should be a file, not a directory: {stat_file_path_single}"
                 )
@@ -153,13 +155,17 @@ def prepare_trainer_input_single(
             stat_file_path_single,
         )
 
+    rank = dist.get_rank() if dist.is_initialized() else 0
     if not multi_task:
         (
             train_data,
             validation_data,
             stat_file_path,
         ) = prepare_trainer_input_single(
-            config["model"], config["training"], config["loss"]
+            config["model"],
+            config["training"],
+            config["loss"],
+            rank=rank,
         )
     else:
         train_data, validation_data, stat_file_path = {}, {}, {}
@@ -173,6 +179,7 @@ def prepare_trainer_input_single(
             config["training"]["data_dict"][model_key],
             config["loss_dict"][model_key],
             suffix=f"_{model_key}",
+            rank=rank,
         )
 
     trainer = training.Trainer(
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index b8d13e6f25..1003b499d6 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -193,7 +193,7 @@ def get_single_model(
             _training_data.add_data_requirement(_data_requirement)
             if _validation_data is not None:
                 _validation_data.add_data_requirement(_data_requirement)
-            if not resuming:
+            if not resuming and self.rank == 0:
 
                 @functools.lru_cache
                 def get_sample():
@@ -429,7 +429,7 @@ def get_loss(loss_params, start_lr, _ntypes):
 
         # Multi-task share params
        if shared_links is not None:
-            self.wrapper.share_params(shared_links, resume=resuming)
+            self.wrapper.share_params(shared_links, resume=resuming or self.rank != 0)
 
         if dist.is_initialized():
             torch.cuda.set_device(LOCAL_RANK)

From db31edc0d408adf9ad3542d8589d8fbaedcd7a44 Mon Sep 17 00:00:00 2001
From: Duo <50307526+iProzd@users.noreply.github.com>
Date: Fri, 1 Mar 2024 01:47:21 +0800
Subject: [PATCH 34/37] Fix h5py file read block

---
 deepmd/pt/train/training.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 1003b499d6..ef8a53e656 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -63,6 +63,10 @@
     DataLoader,
 )
 
+from deepmd.utils.path import (
+    DPH5Path,
+)
+
 log = logging.getLogger(__name__)
 
 
@@ -208,6 +212,8 @@ def get_sample():
                 sampled_func=get_sample,
                 stat_file_path=_stat_file_path,
             )
+            if isinstance(_stat_file_path, DPH5Path):
+                _stat_file_path.root.close()
             return model
 
         def get_lr(lr_params):

From 3dfc31ee3beec282fbab4f838586cfb002a4498e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 1 Mar 2024 03:24:30 +0000
Subject: [PATCH 35/37] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt/model/descriptor/hybrid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py
index 5dd550d8e3..9952c4766a 100644
--- a/deepmd/pt/model/descriptor/hybrid.py
+++ b/deepmd/pt/model/descriptor/hybrid.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
-    Callable,
     Any,
+    Callable,
     Dict,
     List,
     Optional,
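Note on [PATCH 33/37] and [PATCH 34/37]: HDF5 uses file locking by default, so a stat file held open by one process can block every other rank that tries to read it. The two patches apply the usual discipline: a single writer (rank 0; all other ranks get a `None` stat path), and the handle is closed as soon as the statistics are on disk. A generic sketch of that discipline in plain h5py, independent of the deepmd wrappers:

    import h5py
    import numpy as np

    def write_stats(path: str, mean: np.ndarray) -> None:
        f = h5py.File(path, "w")  # holds the HDF5 write lock while open
        try:
            f.create_dataset("mean", data=mean)
            f.flush()  # push buffers to disk for whoever reads next
        finally:
            f.close()  # releases the handle and the file lock

    write_stats("stat.h5", np.zeros(3))
    with h5py.File("stat.h5", "r") as f:  # now safe to reopen elsewhere
        assert f["mean"].shape == (3,)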
From 615446f8cf0768f15750f556db213af5a3752019 Mon Sep 17 00:00:00 2001
From: Duo <50307526+iProzd@users.noreply.github.com>
Date: Fri, 1 Mar 2024 12:16:35 +0800
Subject: [PATCH 36/37] Update hybrid.py

---
 deepmd/dpmodel/descriptor/hybrid.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/deepmd/dpmodel/descriptor/hybrid.py b/deepmd/dpmodel/descriptor/hybrid.py
index d2620fdcf7..46f2616b84 100644
--- a/deepmd/dpmodel/descriptor/hybrid.py
+++ b/deepmd/dpmodel/descriptor/hybrid.py
@@ -127,6 +127,14 @@ def mixed_types(self):
         """
         return any(descrpt.mixed_types() for descrpt in self.descrpt_list)
 
+    def share_params(self, base_class, shared_level, resume=False):
+        """
+        Share the parameters of self to the base_class with shared_level during multitask training.
+        If not starting from a checkpoint (resume is False),
+        some separated parameters (e.g. mean and stddev) will be re-calculated across different classes.
+        """
+        raise NotImplementedError
+
     def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None):
         """Update mean and stddev for descriptor elements."""
         for descrpt in self.descrpt_list:

From e26c118cbbb72f8e81662bab7c69cd12e8dc7b36 Mon Sep 17 00:00:00 2001
From: Duo <50307526+iProzd@users.noreply.github.com>
Date: Fri, 1 Mar 2024 14:04:39 +0800
Subject: [PATCH 37/37] Update hybrid.py

---
 deepmd/pt/model/descriptor/hybrid.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py
index 9952c4766a..b53adca462 100644
--- a/deepmd/pt/model/descriptor/hybrid.py
+++ b/deepmd/pt/model/descriptor/hybrid.py
@@ -140,6 +140,23 @@ def mixed_types(self):
         """
         return any(descrpt.mixed_types() for descrpt in self.descrpt_list)
 
+    def share_params(self, base_class, shared_level, resume=False):
+        """
+        Share the parameters of self to the base_class with shared_level during multitask training.
+        If not starting from a checkpoint (resume is False),
+        some separated parameters (e.g. mean and stddev) will be re-calculated across different classes.
+        """
+        assert (
+            self.__class__ == base_class.__class__
+        ), "Only descriptors of the same type can share params!"
+        if shared_level == 0:
+            for ii, des in enumerate(self.descrpt_list):
+                self.descrpt_list[ii].share_params(
+                    base_class.descrpt_list[ii], shared_level, resume=resume
+                )
+        else:
+            raise NotImplementedError
+
     def compute_input_stats(self, merged: List[dict], path: Optional[DPPath] = None):
         """Update mean and stddev for descriptor elements."""
         for descrpt in self.descrpt_list:
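Closing note on [PATCH 37/37]: the hybrid descriptor shares parameters by delegating to its children pairwise, which is the natural recursion for any container module. A toy PyTorch version of the same shape (illustrative only; per the docstring above, the real deepmd `share_params` additionally re-calculates separated statistics when not resuming and supports finer-grained levels):

    import torch

    class Leaf(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(4, 4)

        def share_params(self, base_class, shared_level, resume=False):
            assert self.__class__ == base_class.__class__
            # Point at the base task's module so both tasks train one set of weights.
            self.linear = base_class.linear

    class Hybrid(torch.nn.Module):
        def __init__(self, n):
            super().__init__()
            self.descrpt_list = torch.nn.ModuleList(Leaf() for _ in range(n))

        def share_params(self, base_class, shared_level, resume=False):
            assert self.__class__ == base_class.__class__
            if shared_level == 0:  # share everything, child by child
                for ii in range(len(self.descrpt_list)):
                    self.descrpt_list[ii].share_params(
                        base_class.descrpt_list[ii], shared_level, resume=resume
                    )
            else:
                raise NotImplementedError

    base, other = Hybrid(2), Hybrid(2)
    other.share_params(base, shared_level=0)
    assert other.descrpt_list[0].linear.weight is base.descrpt_list[0].linear.weight

The pairwise loop also explains the assertion that both descriptors must be of the same class: sharing is positional, so the two `descrpt_list`s must line up one-to-one.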