From 60238992104933493737855f9a64fb23b854b19d Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Thu, 9 Jun 2022 00:51:49 -0400
Subject: [PATCH 01/26] refactor simplify (#730)

* refactor simplify

1. Used `dp model-devi` to calculate the model deviation instead of computing it locally. Supported by deepmodeling/deepmd-kit#1618, released in v2.1.1; versions earlier than 2.1.1 are no longer supported.
2. Assumed all systems are MultiSystems.
3. Removed energy model deviation support.

* expand path when getting multisystems
* let `make_train` and `run_train` expand paths
* load numpy array instead
* use dpdata to get nframes
* fix tests
* update README
---
 README.md                  |   6 +-
 dpgen/generator/run.py     |  70 +++----
 dpgen/simplify/simplify.py | 381 +++++++++----------------------
 dpgen/util.py              |  22 +++
 4 files changed, 140 insertions(+), 339 deletions(-)

diff --git a/README.md b/README.md
index b59725ec9..c833ed059 100644
--- a/README.md
+++ b/README.md
@@ -499,9 +499,8 @@ The bold notation of key (such as **type_map**) means that it's a necessary key
 | **use_ele_temp** | int | 0 | Currently only supports fp_style vasp. 0 (default): no electron temperature. 1: electron temperature as frame parameter. 2: electron temperature as atom parameter.
 | *#Data*
 | init_data_prefix | String | "/sharedext4/.../data/" | Prefix of initial data directories
- | ***init_data_sys*** | List of string|["CH4.POSCAR.01x01x01/.../deepmd"] |Directories of initial data. You may use either absolute or relative path here.
+ | ***init_data_sys*** | List of string|["CH4.POSCAR.01x01x01/.../deepmd"] |Directories of initial data. You may use either an absolute or a relative path here. Systems will be detected recursively in the directories.
 | ***sys_format*** | String | "vasp/poscar" | Format of initial data. It will be `vasp/poscar` if not set.
- | init_multi_systems | Boolean | false | If set to `true`, `init_data_sys` directories should contain sub-directories of various systems. DP-GEN will regard all of these sub-directories as inital data systems.
 | init_batch_size | String of integer | [8] | Each number is the batch_size of the corresponding system for training in `init_data_sys`. One recommended rule for setting `sys_batch_size` and `init_batch_size` is that `batch_size` multiplied by the number of atoms of the structure should be larger than 32. If set to `auto`, the batch size will be 32 divided by the number of atoms. |
 | sys_configs_prefix | String | "/sharedext4/.../data/" | Prefix of `sys_configs`
 | **sys_configs** | List of list of string | [
["/sharedext4/.../POSCAR"],
["....../POSCAR"]
] | Containing directories of structures to be explored in iterations. Wildcard characters are supported here. |
@@ -1086,7 +1085,6 @@ Here is an example of `param.json` for QM7 dataset:
     },
     "_comment": "that's all"
   },
-  "use_clusters": true,
   "fp_style": "gaussian",
   "shuffle_poscar": false,
   "fp_task_max": 1000,
@@ -1109,7 +1107,7 @@ Here is an example of `param.json` for QM7 dataset:
 }
 ```
 
-Here `pick_data` is the data to simplify and currently only supports `MultiSystems` containing `System` with `deepmd/npy` format, and `use_clusters` should always be `true`. `init_pick_number` and `iter_pick_number` are the numbers of picked frames. `e_trust_lo`, `e_trust_hi` mean the range of the deviation of the frame energy, and `f_trust_lo` and `f_trust_hi` mean the range of the max deviation of atomic forces in a frame. `fp_style` can only be `gaussian` currently. Other parameters are as the same as those of generator.
+Here `pick_data` is the directory containing the data to simplify; the program recursively detects systems (`System`) in `deepmd/npy` format under it. `init_pick_number` and `iter_pick_number` are the numbers of picked frames. `e_trust_lo` and `e_trust_hi` give the range of the deviation of the frame energy, and `f_trust_lo` and `f_trust_hi` give the range of the maximum deviation of atomic forces in a frame. `fp_style` can only be `gaussian` currently. Other parameters are the same as those of the generator.
 
 ## Set up machine
 
diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py
index d06c137b3..1bd196cc6 100644
--- a/dpgen/generator/run.py
+++ b/dpgen/generator/run.py
@@ -61,7 +61,7 @@
 from dpgen.generator.lib.ele_temp import NBandsEsti
 from dpgen.remote.decide_machine import convert_mdata
 from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, make_dispatcher, make_submission
-from dpgen.util import sepline
+from dpgen.util import sepline, expand_sys_str
 from dpgen import ROOT_PATH
 from pymatgen.io.vasp import Incar,Kpoints,Potcar
 from dpgen.auto_test.lib.vasp import make_kspacing_kpoints
@@ -288,13 +288,10 @@ def make_train (iter_index,
     # make sure all init_data_sys has the batch size -- for the following `zip`
     assert (len(init_data_sys_) <= len(init_batch_size_))
     for ii, ss in zip(init_data_sys_, init_batch_size_) :
-        if jdata.get('init_multi_systems', False):
-            for single_sys in os.listdir(os.path.join(work_path, 'data.init', ii)):
-                init_data_sys.append(os.path.join('..', 'data.init', ii, single_sys))
-                init_batch_size.append(detect_batch_size(ss, os.path.join(work_path, 'data.init', ii, single_sys)))
-        else:
-            init_data_sys.append(os.path.join('..', 'data.init', ii))
-            init_batch_size.append(detect_batch_size(ss, os.path.join(work_path, 'data.init', ii)))
+        sys_paths = expand_sys_str(os.path.join(init_data_prefix, ii))
+        for single_sys in sys_paths:
+            init_data_sys.append(os.path.normpath(os.path.join('..', 'data.init', ii, os.path.relpath(single_sys, os.path.join(init_data_prefix, ii)))))
+            init_batch_size.append(detect_batch_size(ss, single_sys))
     old_range = None
     if iter_index > 0 :
         for ii in range(iter_index) :
@@ -308,25 +305,16 @@ def make_train (iter_index,
         sys_batch_size = ["auto" for aa in range(len(sys_list))]
         for jj in fp_data_sys :
             sys_idx = int(jj.split('.')[-1])
-            if jdata.get('use_clusters', False):
-                nframes = 0
-                for sys_single in os.listdir(jj):
-                    tmp_box = np.loadtxt(os.path.join(jj, sys_single, 'box.raw'))
-                    tmp_box = np.reshape(tmp_box, [-1,9])
-                    nframes += tmp_box.shape[0]
-                if nframes < fp_task_min :
-                    log_task('nframes (%d) in data sys %s is too small, skip' % 
(nframes, jj)) - continue - for sys_single in os.listdir(jj): - init_data_sys.append(os.path.join('..', 'data.iters', jj, sys_single)) - init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], os.path.join(jj, sys_single))) - else: - nframes = dpdata.System(jj, 'deepmd/npy').get_nframes() - if nframes < fp_task_min : - log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj)) - continue - init_data_sys.append(os.path.join('..', 'data.iters', jj)) - init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], jj)) + sys_paths = expand_sys_str(jj) + nframes = 0 + for sys_single in sys_paths: + nframes += dpdata.LabeledSystem(sys_single, fmt="deepmd/npy").get_nframes() + if nframes < fp_task_min : + log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj)) + continue + for sys_single in sys_paths: + init_data_sys.append(os.path.normpath(os.path.join('..', 'data.iters', sys_single))) + init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], sys_single)) # establish tasks jinput = jdata['default_training_param'] try: @@ -568,25 +556,17 @@ def run_train (iter_index, os.chdir(work_path) fp_data = glob.glob(os.path.join('data.iters', 'iter.*', '02.fp', 'data.*')) for ii in init_data_sys : - if jdata.get('init_multi_systems', False): - for single_sys in os.listdir(os.path.join(ii)): - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'nopbc')) - else: - trans_comm_data += glob.glob(os.path.join(ii, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, 'nopbc')) + sys_paths = expand_sys_str(ii) + for single_sys in sys_paths: + trans_comm_data += glob.glob(os.path.join(single_sys, 'set.*')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'type*.raw')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'nopbc')) for ii in fp_data : - if jdata.get('use_clusters', False): - for single_sys in os.listdir(os.path.join(ii)): - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'nopbc')) - else: - trans_comm_data += glob.glob(os.path.join(ii, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, 'nopbc')) + sys_paths = expand_sys_str(ii) + for single_sys in sys_paths: + trans_comm_data += glob.glob(os.path.join(single_sys, 'set.*')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'type*.raw')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'nopbc')) os.chdir(cwd) try: diff --git a/dpgen/simplify/simplify.py b/dpgen/simplify/simplify.py index 982db3114..529401519 100644 --- a/dpgen/simplify/simplify.py +++ b/dpgen/simplify/simplify.py @@ -9,6 +9,7 @@ 02: fp (optional, if the original dataset do not have fp data, same as generator) """ import logging +import warnings import queue import os import json @@ -21,7 +22,7 @@ from dpgen import dlog from dpgen import SHORT_CMD -from dpgen.util import sepline +from dpgen.util import sepline, expand_sys_str from distutils.version import LooseVersion from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, make_dispatcher, make_submission from dpgen.generator.run import make_train, run_train, post_train, run_fp, post_fp, fp_name, model_devi_name, 
train_name, train_task_fmt, sys_link_fp_vasp_pp, make_fp_vasp_incar, make_fp_vasp_kp, make_fp_vasp_cp_cvasp, data_system_fmt, model_devi_task_fmt, fp_task_fmt @@ -38,17 +39,6 @@ sys_name_fmt = 'sys.' + data_system_fmt sys_name_pattern = 'sys.[0-9]*[0-9]' -def expand_sys_str(root_dir): - matches = [] - for root, dirnames, filenames in os.walk(root_dir, followlinks=True): - for filename in fnmatch.filter(filenames, 'type.raw'): - matches.append(root) - matches.sort() - dirnames = [os.path.basename(ii) for ii in matches] - if (len(list(set(dirnames))) != len(matches)) : - raise RuntimeError('duplicated system name: it is highly recommend to place all systems in the same level of directory and has different names') - return matches - def get_system_cls(jdata): if jdata.get("labeled", False): @@ -58,28 +48,12 @@ def get_system_cls(jdata): def get_multi_system(path, jdata): system = get_system_cls(jdata) + system_paths = expand_sys_str(path) systems = dpdata.MultiSystems( - *[system(os.path.join(path, s), fmt='deepmd/npy') for s in os.listdir(path)]) - return systems - - -def get_systems(path, jdata): - system_cls = get_system_cls(jdata) - system_paths = expand_sys_str(path) - systems = {} - for ii in system_paths: - systems[os.path.basename(ii)] = system_cls(ii, fmt='deepmd/npy') + *[system(s, fmt='deepmd/npy') for s in system_paths]) return systems -def get_system_idx(path): - system_paths = expand_sys_str(path) - sys_idx_map = {} - for idx,ii in enumerate(system_paths): - sys_idx_map[os.path.basename(ii)] = idx - return sys_idx_map - - def init_model(iter_index, jdata, mdata): training_init_model = jdata.get('training_init_model', False) if not training_init_model: @@ -111,20 +85,13 @@ def init_pick(iter_index, jdata, mdata): """pick up init data from dataset randomly""" pick_data = jdata['pick_data'] init_pick_number = jdata['init_pick_number'] - use_clusters = jdata.get('use_clusters', False) # use MultiSystems with System # TODO: support System and LabeledSystem # TODO: support other format - if use_clusters: - systems = get_multi_system(pick_data, jdata) - else: - systems = get_systems(pick_data, jdata) + systems = get_multi_system(pick_data, jdata) # label the system labels = [] - if use_clusters: - items = systems.systems.items() - else: - items = systems.items() + items = systems.systems.items() for key, system in items: labels.extend([(key, j) for j in range(len(system))]) @@ -146,48 +113,18 @@ def init_pick(iter_index, jdata, mdata): _init_dump_selected_frames(systems, labels, rest_idx, sys_data_path, jdata) -def _add_system(systems, key, system): - if key in systems.keys(): - systems[key].append(system) - else: - systems[key] = system - return systems - - def _init_dump_selected_frames(systems, labels, selc_idx, sys_data_path, jdata): - pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) - if use_clusters: - selc_systems = dpdata.MultiSystems() - for j in selc_idx: - sys_name, sys_id = labels[j] - selc_systems.append(systems[sys_name][sys_id]) - selc_systems.to_deepmd_raw(sys_data_path) - selc_systems.to_deepmd_npy(sys_data_path, set_size=selc_idx.size) - else: - selc_systems = {} - for j in selc_idx: - sys_name, sys_id = labels[j] - selc_systems = _add_system(selc_systems, sys_name, systems[sys_name][sys_id]) - sys_idx_map = get_system_idx(pick_data) - for kk in selc_systems.keys(): - sub_path = os.path.join(sys_data_path, sys_name_fmt % sys_idx_map[kk]) - selc_systems[kk].to_deepmd_raw(sub_path) - selc_systems[kk].to_deepmd_npy(sub_path, 
set_size=selc_idx.size) - with open(os.path.join(sys_data_path, 'sys_idx_map.json'), 'w') as fp: - json.dump(sys_idx_map, fp, indent=4) - -def _dump_system_dict(systems, path): - for kk in systems: - sub_path = os.path.join(path, sys_name_fmt % (int(kk))) - systems[kk].to_deepmd_raw(sub_path) - systems[kk].to_deepmd_npy(sub_path, set_size=systems[kk].get_nframes()) + selc_systems = dpdata.MultiSystems() + for j in selc_idx: + sys_name, sys_id = labels[j] + selc_systems.append(systems[sys_name][sys_id]) + selc_systems.to_deepmd_raw(sys_data_path) + selc_systems.to_deepmd_npy(sys_data_path, set_size=selc_idx.size) def make_model_devi(iter_index, jdata, mdata): """calculate the model deviation of the rest idx""" pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, model_devi_name) create_path(work_path) @@ -203,25 +140,7 @@ def make_model_devi(iter_index, jdata, mdata): rest_data_path = os.path.join(last_iter_name, model_devi_name, rest_data_name) if not os.path.exists(rest_data_path): return False - if use_clusters: - for jj, subsystem in enumerate(os.listdir(rest_data_path)): - task_name = "task." + model_devi_task_fmt % (0, jj) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - os.symlink(os.path.abspath(os.path.join(rest_data_path, subsystem)), - os.path.abspath(os.path.join(task_path, rest_data_name))) - else: - rest_data_path = os.path.abspath(rest_data_path) - sys_path = glob.glob(os.path.join(rest_data_path, sys_name_pattern)) - cwd = os.getcwd() - for ii in sys_path: - task_name = "task." + model_devi_task_fmt % (int(os.path.basename(ii).split('.')[1]), 0) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - os.chdir(task_path) - os.symlink(os.path.relpath(ii), rest_data_name) - os.chdir(cwd) - os.chdir(cwd) + os.symlink(os.path.abspath(rest_data_path), os.path.join(work_path, rest_data_name + ".old")) return True @@ -231,43 +150,28 @@ def run_model_devi(iter_index, jdata, mdata): work_path = os.path.join(iter_name, model_devi_name) # generate command commands = [] - tasks = glob.glob(os.path.join(work_path, "task.*")) - run_tasks = [os.path.basename(ii) for ii in tasks] + run_tasks = ["."] # get models models = glob.glob(os.path.join(work_path, "graph*pb")) model_names = [os.path.basename(ii) for ii in models] task_model_list = [] for ii in model_names: - task_model_list.append(os.path.join('..', ii)) - # get max data size - data_size = max([len(dpdata.System(os.path.join( - task, rest_data_name), fmt="deepmd/npy")) for task in tasks]) + task_model_list.append(os.path.join('.', ii)) # models commands = [] - detail_file_names = [] - for ii, mm in enumerate(task_model_list): - detail_file_name = "{prefix}-{ii}".format( - prefix=detail_file_name_prefix, - ii=ii, - ) - # TODO: support 0.x? 
- command = "{python} -m deepmd test -m {model} -s {system} -n {numb_test} -d {detail_file}".format( - python=mdata['python_test_path'], - model=mm, - system=rest_data_name, - numb_test=data_size, - detail_file=detail_file_name, - ) - commands.append(command) - detail_file_names.append(detail_file_name) + detail_file_name = detail_file_name_prefix + command = "{dp} model-devi -m {model} -s {system} -o {detail_file}".format( + dp=mdata.get('model_devi_command', 'dp'), + model=" ".join(task_model_list), + system=rest_data_name + ".old", + detail_file=detail_file_name, + ) + commands = [command] # submit - try: - model_devi_group_size = mdata['model_devi_group_size'] - except Exception: - model_devi_group_size = 1 + model_devi_group_size = mdata.get('model_devi_group_size', 1) - forward_files = [rest_data_name] - backward_files = sum([[pf+".e.out", pf+".f.out", pf+".v.out"] for pf in detail_file_names], []) + forward_files = [rest_data_name + ".old"] + backward_files = [detail_file_name] api_version = mdata.get('api_version', '0.9') if LooseVersion(api_version) < LooseVersion('1.0'): @@ -303,102 +207,50 @@ def run_model_devi(iter_index, jdata, mdata): def post_model_devi(iter_index, jdata, mdata): """calculate the model deviation""" - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, model_devi_name) - tasks = glob.glob(os.path.join(work_path, "task.*")) - tasks.sort() - - e_trust_lo = jdata['e_trust_lo'] - e_trust_hi = jdata['e_trust_hi'] - f_trust_lo = jdata['f_trust_lo'] - f_trust_hi = jdata['f_trust_hi'] - - if use_clusters: - sys_accurate = dpdata.MultiSystems() - sys_candinate = dpdata.MultiSystems() - sys_failed = dpdata.MultiSystems() - else: - sys_accurate = {} - sys_candinate = {} - sys_failed = {} - all_names = set() - - for task in tasks: - if not use_clusters: - sys_name = os.path.basename(task).split('.')[1] - all_names.add(sys_name) - # e.out - details_e = glob.glob(os.path.join(task, "{}-*.e.out".format(detail_file_name_prefix))) - e_all = np.array([np.loadtxt(detail_e, ndmin=2)[:, 1] for detail_e in details_e]) - e_std = np.std(e_all, axis=0) - n_frame = e_std.size - - # f.out - details_f = glob.glob(os.path.join(task, "{}-*.f.out".format(detail_file_name_prefix))) - f_all = np.array([np.loadtxt(detail_f, ndmin=2)[:, 3:6].reshape((n_frame, -1, 3)) for detail_f in details_f]) - # (n_model, n_frame, n_atom, 3) - f_std = np.std(f_all, axis=0) - # (n_frame, n_atom, 3) - f_std = np.linalg.norm(f_std, axis=2) - # (n_frame, n_atom) - f_std = np.max(f_std, axis=1) - # (n_frame,) - - system_cls = get_system_cls(jdata) - for subsys, e_devi, f_devi in zip(system_cls(os.path.join(task, rest_data_name), fmt='deepmd/npy'), e_std, f_std): - if (e_devi < e_trust_hi and e_devi >= e_trust_lo) or (f_devi < f_trust_hi and f_devi >= f_trust_lo) : - if use_clusters: + + f_trust_lo = jdata['model_devi_f_trust_lo'] + f_trust_hi = jdata['model_devi_f_trust_hi'] + + sys_accurate = dpdata.MultiSystems() + sys_candinate = dpdata.MultiSystems() + sys_failed = dpdata.MultiSystems() + + sys_entire = dpdata.MultiSystems().from_deepmd_npy(os.path.join(work_path, rest_data_name + ".old")) + + detail_file_name = detail_file_name_prefix + with open(os.path.join(work_path, detail_file_name)) as f: + for line in f: + if line.startswith("# data.rest.old"): + name = (line.split()[1]).split("/")[-1] + elif line.startswith("#"): + pass + else: + idx = int(line.split()[0]) + f_devi = float(line.split()[4]) + subsys = sys_entire[name][idx] + if 
f_trust_lo <= f_devi < f_trust_hi: sys_candinate.append(subsys) - else: - sys_candinate = _add_system(sys_candinate, sys_name, subsys) - elif (e_devi >= e_trust_hi ) or (f_devi >= f_trust_hi ): - if use_clusters: + elif f_devi >= f_trust_hi: sys_failed.append(subsys) - else: - sys_failed = _add_system(sys_failed, sys_name, subsys) - elif (e_devi < e_trust_lo and f_devi < f_trust_lo ): - if use_clusters: + elif f_devi < f_trust_lo: sys_accurate.append(subsys) else: - sys_accurate = _add_system(sys_accurate, sys_name, subsys) - else: - raise RuntimeError('reach a place that should NOT be reached...') - if use_clusters: - counter = {"candidate": sys_candinate.get_nframes(), "accurate": sys_accurate.get_nframes(), "failed": sys_failed.get_nframes()} - fp_sum = sum(counter.values()) - for cc_key, cc_value in counter.items(): - dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(cc_key, cc_value, fp_sum, cc_value/fp_sum*100)) - else: - all_names = list(all_names) - all_names.sort() - counter = {"candidate": 0, "accurate": 0, "failed": 0} - for kk in all_names: - sys_counter = {"candidate": 0, "accurate": 0, "failed": 0} - if kk in sys_candinate.keys(): - sys_counter['candidate'] += sys_candinate[kk].get_nframes() - if kk in sys_accurate.keys(): - sys_counter['accurate'] += sys_accurate[kk].get_nframes() - if kk in sys_failed.keys(): - sys_counter['failed'] += sys_failed[kk].get_nframes() - fp_sum = sum(sys_counter.values()) - for cc_key, cc_value in sys_counter.items(): - if fp_sum != 0: - dlog.info("sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(kk, cc_key, cc_value, fp_sum, cc_value/fp_sum*100)) - else: - dlog.info("sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(kk, cc_key, cc_value, fp_sum, 0*100)) - for ii in ['candidate', 'accurate', 'failed']: - counter[ii] += sys_counter[ii] + raise RuntimeError('reach a place that should NOT be reached...') + + counter = {"candidate": sys_candinate.get_nframes(), "accurate": sys_accurate.get_nframes(), "failed": sys_failed.get_nframes()} + fp_sum = sum(counter.values()) + for cc_key, cc_value in counter.items(): + dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(cc_key, cc_value, fp_sum, cc_value/fp_sum*100)) if counter['candidate'] == 0 and counter['failed'] > 0: raise RuntimeError('no candidate but still have failed cases, stop. 
You may want to refine the training or to increase the trust level hi') # label the candidate system labels = [] - if use_clusters: - items = sys_candinate.systems.items() - else: - items = sys_candinate.items() + items = sys_candinate.systems.items() + for key, system in items: labels.extend([(key, j) for j in range(len(system))]) # candinate: pick up randomly @@ -412,112 +264,61 @@ def post_model_devi(iter_index, jdata, mdata): (counter['candidate'], len(pick_idx), float(len(pick_idx))/counter['candidate']*100., len(rest_idx), float(len(rest_idx))/counter['candidate']*100.)) # dump the picked candinate data - if use_clusters: - picked_systems = dpdata.MultiSystems() - for j in pick_idx: - sys_name, sys_id = labels[j] - picked_systems.append(sys_candinate[sys_name][sys_id]) - sys_data_path = os.path.join(work_path, picked_data_name) - picked_systems.to_deepmd_raw(sys_data_path) - picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number) - else: - selc_systems = {} - for j in pick_idx: - sys_name, sys_id = labels[j] - selc_systems = _add_system(selc_systems, sys_name, sys_candinate[sys_name][sys_id]) - sys_data_path = os.path.join(work_path, picked_data_name) - _dump_system_dict(selc_systems, sys_data_path) + picked_systems = dpdata.MultiSystems() + for j in pick_idx: + sys_name, sys_id = labels[j] + picked_systems.append(sys_candinate[sys_name][sys_id]) + sys_data_path = os.path.join(work_path, picked_data_name) + picked_systems.to_deepmd_raw(sys_data_path) + picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number) + # dump the rest data (not picked candinate data and failed data) - if use_clusters: - rest_systems = dpdata.MultiSystems() - for j in rest_idx: - sys_name, sys_id = labels[j] - rest_systems.append(sys_candinate[sys_name][sys_id]) - rest_systems += sys_failed - sys_data_path = os.path.join(work_path, rest_data_name) - rest_systems.to_deepmd_raw(sys_data_path) + rest_systems = dpdata.MultiSystems() + for j in rest_idx: + sys_name, sys_id = labels[j] + rest_systems.append(sys_candinate[sys_name][sys_id]) + rest_systems += sys_failed + sys_data_path = os.path.join(work_path, rest_data_name) + rest_systems.to_deepmd_raw(sys_data_path) + if rest_idx.size: rest_systems.to_deepmd_npy(sys_data_path, set_size=rest_idx.size) - else: - selc_systems = {} - for j in rest_idx: - sys_name, sys_id = labels[j] - selc_systems = _add_system(selc_systems, sys_name, sys_candinate[sys_name][sys_id]) - for kk in sys_failed.keys(): - selc_systems = _add_system(selc_systems, kk, sys_failed[kk]) - sys_data_path = os.path.join(work_path, rest_data_name) - _dump_system_dict(selc_systems, sys_data_path) + # dump the accurate data -- to another directory - if use_clusters: - sys_data_path = os.path.join(work_path, accurate_data_name) - sys_accurate.to_deepmd_raw(sys_data_path) - sys_accurate.to_deepmd_npy(sys_data_path, set_size=sys_accurate.get_nframes()) - else: - sys_data_path = os.path.join(work_path, accurate_data_name) - _dump_system_dict(sys_accurate, sys_data_path) + sys_data_path = os.path.join(work_path, accurate_data_name) + sys_accurate.to_deepmd_raw(sys_data_path) + sys_accurate.to_deepmd_npy(sys_data_path, set_size=sys_accurate.get_nframes()) def make_fp_labeled(iter_index, jdata): dlog.info("already labeled, skip make_fp and link data directly") pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, fp_name) create_path(work_path) picked_data_path = 
os.path.join(iter_name, model_devi_name, picked_data_name) - if use_clusters: - os.symlink(os.path.abspath(picked_data_path), os.path.abspath( - os.path.join(work_path, "task." + data_system_fmt % 0))) - os.symlink(os.path.abspath(picked_data_path), os.path.abspath( - os.path.join(work_path, "data." + data_system_fmt % 0))) - else: - picked_data_path = os.path.abspath(picked_data_path) - sys_path = glob.glob(os.path.join(picked_data_path, sys_name_pattern)) - cwd = os.getcwd() - os.chdir(work_path) - for ii in sys_path: - sys_idx = os.path.basename(ii).split('.')[1] - data_dir = 'data.' + data_system_fmt % int(sys_idx) - task_dir = 'task.' + data_system_fmt % int(sys_idx) - os.symlink(os.path.relpath(ii), data_dir) - os.symlink(os.path.relpath(ii), task_dir) - os.chdir(cwd) + os.symlink(os.path.abspath(picked_data_path), os.path.abspath( + os.path.join(work_path, "task." + data_system_fmt % 0))) + os.symlink(os.path.abspath(picked_data_path), os.path.abspath( + os.path.join(work_path, "data." + data_system_fmt % 0))) def make_fp_configs(iter_index, jdata): pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, fp_name) create_path(work_path) picked_data_path = os.path.join(iter_name, model_devi_name, picked_data_name) - if use_clusters: - systems = get_multi_system(picked_data_path, jdata) - jj = 0 - for system in systems: - for subsys in system: - task_name = "task." + fp_task_fmt % (0, jj) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - subsys.to('vasp/poscar', os.path.join(task_path, 'POSCAR')) - jj += 1 - else: - picked_data_path = os.path.abspath(picked_data_path) - sys_path = glob.glob(os.path.join(picked_data_path, sys_name_pattern)) - for ii in sys_path: - tmp_sys = dpdata.System(ii, fmt = 'deepmd/npy') - sys_idx = os.path.basename(ii).split('.')[1] - jj = 0 - for ss in tmp_sys: - task_name = "task." + fp_task_fmt % (int(sys_idx), jj) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - ss.to('vasp/poscar', os.path.join(task_path, 'POSCAR')) - job = {} - with open(os.path.join(task_path, 'job.json'), 'w') as fp: - json.dump(job, fp, indent=4) - jj += 1 + systems = get_multi_system(picked_data_path, jdata) + jj = 0 + for system in systems: + for subsys in system: + task_name = "task." + fp_task_fmt % (0, jj) + task_path = os.path.join(work_path, task_name) + create_path(task_path) + subsys.to('vasp/poscar', os.path.join(task_path, 'POSCAR')) + jj += 1 def make_fp_gaussian(iter_index, jdata): diff --git a/dpgen/util.py b/dpgen/util.py index aa805e7e5..9491cdc30 100644 --- a/dpgen/util.py +++ b/dpgen/util.py @@ -1,5 +1,7 @@ #!/usr/bin/env python # coding: utf-8 +from typing import Union, List +from pathlib import Path from dpgen import dlog @@ -25,3 +27,23 @@ def box_center(ch='',fill=' ',sp="|"): ''' strs=ch.center(Len,fill) dlog.info(sp+strs[1:len(strs)-1:]+sp) + + +def expand_sys_str(root_dir: Union[str, Path]) -> List[str]: + """Recursively iterate over directories taking those that contain `type.raw` file. 
+ + Parameters + ---------- + root_dir : Union[str, Path] + starting directory + + Returns + ------- + List[str] + list of string pointing to system directories + """ + root_dir = Path(root_dir) + matches = [str(d) for d in root_dir.rglob("*") if (d / "type.raw").is_file()] + if (root_dir / "type.raw").is_file(): + matches.append(str(root_dir)) + return matches From c25cea3aac2e84f86f481afb5663d6bc87252bbc Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 9 Jun 2022 00:53:16 -0400 Subject: [PATCH 02/26] supports non-list mdata (#748) * supports non-list mdata The mdata of a task is a list of a single dict. This "list" looks useless and not clear enough. So this commit supports using the dict without a list. Note that old parameters are still supported, so no breaking changes are made. The "list" is just removed from all examples. Both list and non-list parameters are in the unittest. * fix typo --- README.md | 26 +++++++------------ doc/run/example-of-machine.md | 11 +++----- dpgen/remote/decide_machine.py | 13 +++++++--- .../machine/DeePMD-kit-1.x/machine-ali.json | 13 ++++------ .../machine-lsf-slurm-cp2k.json | 13 ++++------ .../DeePMD-kit-1.x/machine-slurm-qe.json | 12 +++------ .../DeePMD-kit-2.x/lebesgue_v2_machine.json | 11 ++++---- tests/tools/machine_fp_single2.json | 14 ++++++++++ tests/tools/test_convert_mdata.py | 10 +++++-- 9 files changed, 65 insertions(+), 58 deletions(-) create mode 100644 tests/tools/machine_fp_single2.json diff --git a/README.md b/README.md index c833ed059..fb280e226 100644 --- a/README.md +++ b/README.md @@ -1137,7 +1137,7 @@ an example of new dpgen's machine.json ```json { "api_version": "1.0", - "train": [ + "train": { "command": "dp", "machine": { @@ -1161,9 +1161,8 @@ an example of new dpgen's machine.json "para_deg": 3, "source_list": ["/home/user1234/deepmd.1.2.4.env"] } - } - ], - "model_devi":[ + }, + "model_devi": { "command": "lmp", "machine":{ @@ -1184,9 +1183,8 @@ an example of new dpgen's machine.json "group_size": 5, "source_list": ["/home/user1234/deepmd.1.2.4.env"] } - } - ], - "fp":[ + }, + "fp": { "command": "vasp_std", "machine":{ @@ -1208,7 +1206,6 @@ an example of new dpgen's machine.json "source_list": ["~/vasp.env"] } } - ] } ``` note1: the key "local_root" in dpgen's machine.json is always `./` @@ -1220,7 +1217,7 @@ When switching into a new machine, you may modifying the `MACHINE`, according to An example for `MACHINE` is: ```json { - "train": [ + "train": { "machine": { "batch": "slurm", @@ -1243,9 +1240,8 @@ An example for `MACHINE` is: "qos": "data" }, "command": "USERPATH/dp" - } - ], - "model_devi": [ + }, + "model_devi": { "machine": { "batch": "slurm", @@ -1269,9 +1265,8 @@ An example for `MACHINE` is: }, "command": "lmp_serial", "group_size": 1 - } - ], - "fp": [ + }, + "fp": { "machine": { "batch": "slurm", @@ -1298,7 +1293,6 @@ An example for `MACHINE` is: "command": "vasp_gpu", "group_size": 1 } - ] } ``` Following table illustrates which key is needed for three types of machine: `train`,`model_devi` and `fp`. Each of them is a list of dicts. Each dict can be considered as an independent environmnet for calculation. diff --git a/doc/run/example-of-machine.md b/doc/run/example-of-machine.md index 569f85026..247c50e4f 100644 --- a/doc/run/example-of-machine.md +++ b/doc/run/example-of-machine.md @@ -20,7 +20,7 @@ In this section, we will show you how to perform train task at a local workstati In this example, we perform the `train` task on a local workstation. 
```json -"train": [ +"train": { "command": "dp", "machine": { @@ -36,8 +36,7 @@ In this example, we perform the `train` task on a local workstation. "group_size": 1, "source_list": ["/home/user1234/deepmd.env"] } - } - ], + }, ``` The "command" for the train task in the DeePMD-kit is "dp". @@ -51,7 +50,7 @@ In the resources parameter, "number_node", "cpu_per_node", and "gpu_per_node" sp In this example, we perform the model_devi task at a local Slurm workstation. ```json -"model_devi": [ +"model_devi": { "command": "lmp", "machine": { @@ -70,7 +69,6 @@ In this example, we perform the model_devi task at a local Slurm workstation. "source_list": ["/home/user1234/lammps.env"] } } -], ``` The "command" for the model_devi task in the LAMMPS is "lmp". @@ -84,7 +82,7 @@ In the resources parameter, we specify the name of the queue to which the task i In this example, we perform the fp task at a remote PBS cluster that can be accessed via SSH. ```json -"fp": [ +"fp": { "command": "mpirun -n 32 vasp_std", "machine": { @@ -106,7 +104,6 @@ In this example, we perform the fp task at a remote PBS cluster that can be acce "source_list": ["/home/user1234/vasp.env"] } } -], ``` VASP code is used for fp task and mpi is used for parallel computing, so "mpirun -n 32" is added to specify the number of parallel threads. diff --git a/dpgen/remote/decide_machine.py b/dpgen/remote/decide_machine.py index 31691f322..c551be44b 100644 --- a/dpgen/remote/decide_machine.py +++ b/dpgen/remote/decide_machine.py @@ -36,11 +36,18 @@ def convert_mdata(mdata, task_types=["train", "model_devi", "fp"]): ''' for task_type in task_types: if task_type in mdata: - for key, item in mdata[task_type][0].items(): + if isinstance(mdata[task_type], dict): + task_data = mdata[task_type] + elif isinstance(mdata[task_type], (list, tuple)): + task_data = mdata[task_type][0] + else: + raise TypeError("mdata/%s should be dict or list!" 
% task_type) + for key, item in task_data.items(): if "comments" not in key: mdata[task_type + "_" + key] = item - group_size = mdata[task_type][0]["resources"].get("group_size", 1) - if group_size == 1: group_size = mdata[task_type][0].get("group_size", 1) + group_size = task_data["resources"].get("group_size", 1) + if group_size == 1: + group_size = task_data.get("group_size", 1) mdata[task_type + "_" + "group_size"] = group_size return mdata diff --git a/examples/machine/DeePMD-kit-1.x/machine-ali.json b/examples/machine/DeePMD-kit-1.x/machine-ali.json index a2a338af4..e78fc9dd4 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-ali.json +++ b/examples/machine/DeePMD-kit-1.x/machine-ali.json @@ -1,5 +1,5 @@ { - "train": [ + "train": { "machine": { "batch": "shell", @@ -34,10 +34,9 @@ }, "command": "/root/deepmd-kit/bin/dp", "group_size": 2 - } - ], + }, - "model_devi": [ + "model_devi": { "machine": { "batch": "shell", @@ -71,10 +70,9 @@ }, "command": "/root/deepmd-kit/bin/lmp", "group_size": 2 - } - ], + }, - "fp": [ + "fp": { "machine": { "batch": "shell", @@ -108,7 +106,6 @@ "command": "mpirun -n 16 /root/deepmd-pkg/vasp.5.4.4/bin/vasp_std", "group_size": 1 } - ] } diff --git a/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json b/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json index b56d022ec..4fb5845ee 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json +++ b/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json @@ -1,5 +1,5 @@ { - "train": [ + "train": { "machine": { "machine_type": "slurm", @@ -25,9 +25,8 @@ "submit_wait_time": 60 }, "python_path": "/share/apps/deepmd/compress/bin/python3.8" - } - ], - "model_devi": [ + }, + "model_devi": { "machine": { "machine_type": "slurm", @@ -54,9 +53,8 @@ }, "command": "lmp_mpi", "group_size": 5 - } - ], - "fp": [ + }, + "fp": { "machine": { "machine_type": "lsf", @@ -87,5 +85,4 @@ "command": "cp2k.popt -i input.inp", "group_size": 50 } - ] } diff --git a/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json b/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json index 22a3fdbbd..5f15303d6 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json +++ b/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json @@ -1,7 +1,7 @@ { "_comment" : "This is an example of DP-GEN on Slurm", "_comment" : "Last updated on 2021.4.30 for DP-GEN 0.9.2 by Yuzhi Zhang", - "train" :[ + "train" : { "_comment" : "Specify the installed path of DeePMD-kit", "_comment" : "The version of DeePMD-kit should be 1.*", @@ -49,10 +49,9 @@ "time_limit": "23:0:0", "_comment": "that's all" } - } - ], + }, - "model_devi": [ + "model_devi": { "machine": { "machine_type": "slurm", @@ -81,10 +80,8 @@ "command": "lmp_serial", "_comment" : "DP-GEN will put 5 tasks together in one submitting script.", "group_size": 5 - } - ], + }, "fp": - [ { "machine": { "machine_type": "slurm", @@ -113,5 +110,4 @@ "command": "mpirun -n 8 /gpfs/share/home/1600017784/yuzhi/soft/QE-mpi/PW/src/pw.x < input", "group_size": 1 } - ] } diff --git a/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json b/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json index 6b9ead467..0ecba4fa6 100644 --- a/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json +++ b/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json @@ -1,7 +1,7 @@ { "api_version": "1.0", "deepmd_version": "2.0.1", - "train" :[ + "train" : { "command": "dp", "machine": { @@ -34,9 +34,9 @@ "queue_name": "GPU", "group_size": 1 } - }], + }, "model_devi": - [{ + { "command": "lmp -i 
input.lammps -v restart 0", "machine": { "batch_type": "DpCloudServer", @@ -68,9 +68,9 @@ "queue_name": "GPU", "group_size": 5 } - }], + }, "fp": - [{ + { "command": "mpirun -n 16 vasp_std", "machine": { "batch_type": "DpCloudServer", @@ -104,5 +104,4 @@ "source_list": ["/opt/intel/oneapi/setvars.sh"] } } - ] } diff --git a/tests/tools/machine_fp_single2.json b/tests/tools/machine_fp_single2.json new file mode 100644 index 000000000..8c2212927 --- /dev/null +++ b/tests/tools/machine_fp_single2.json @@ -0,0 +1,14 @@ +{ + "fp": + { + "command": "vasp_std", + "machine":{ + "batch_type": "PBS" + }, + "resources": { + "group_size" : 8 + }, + "_comments" : "In user_forward_files, define input files to be uploaded.", + "user_forward_files" : ["vdw_kernel.bindat"] + } +} \ No newline at end of file diff --git a/tests/tools/test_convert_mdata.py b/tests/tools/test_convert_mdata.py index 5458b0faa..5dc1b944e 100644 --- a/tests/tools/test_convert_mdata.py +++ b/tests/tools/test_convert_mdata.py @@ -6,12 +6,18 @@ __package__ = 'tools' from dpgen.remote.decide_machine import convert_mdata from .context import setUpModule -machine_file = 'machine_fp_single.json' + class TestConvertMdata(unittest.TestCase): + machine_file = 'machine_fp_single.json' + def test_convert_mdata (self): - mdata = json.load(open(machine_file)) + mdata = json.load(open(self.machine_file)) mdata = convert_mdata(mdata, ["fp"]) self.assertEqual(mdata["fp_command"], "vasp_std") self.assertEqual(mdata["fp_group_size"], 8) self.assertEqual(mdata["fp_machine"]["batch_type"], "PBS") self.assertEqual(mdata["fp_user_forward_files"], ["vdw_kernel.bindat"]) + + +class TestConvertMdata2(TestConvertMdata): + machine_file = 'machine_fp_single2.json' From 828024b4696b2e06dfa72d2694aae541f2003e3e Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 9 Jun 2022 00:53:45 -0400 Subject: [PATCH 03/26] upgrade all tasks to dpdispatcher (#749) * upgrade all tasks to dpdispatcher This commit upgrades init_reaction and init_surf to use dpdispatcher * fix method args * fix typo * change the variable name from `work_dir` to `work_path` --- dpgen/data/reaction.py | 38 ++++++++-------- dpgen/data/surf.py | 11 ++--- dpgen/dispatcher/Dispatcher.py | 79 ++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 22 deletions(-) diff --git a/dpgen/data/reaction.py b/dpgen/data/reaction.py index b9574d525..0abfeb965 100644 --- a/dpgen/data/reaction.py +++ b/dpgen/data/reaction.py @@ -7,7 +7,7 @@ output: data """ -import argparse +import warnings import glob import json import os @@ -15,7 +15,8 @@ import dpdata from dpgen import dlog -from dpgen.dispatcher.Dispatcher import make_dispatcher +from dpgen.dispatcher.Dispatcher import make_submission_compat +from dpgen.remote.decide_machine import convert_mdata from dpgen.generator.run import create_path, make_fp_task_name from dpgen.util import sepline @@ -73,14 +74,15 @@ def make_lmp(jdata): return lmp_string -def run_reaxff(jdata, mdata, dispatcher, log_file="reaxff_log"): +def run_reaxff(jdata, mdata, log_file="reaxff_log"): work_path = reaxff_path reaxff_command = "{} -in {}".format(mdata["reaxff_command"], lmp_path) run_tasks = glob.glob(os.path.join(work_path, 'task.*')) run_tasks.sort() run_tasks = [os.path.basename(ii) for ii in run_tasks] - dispatcher.run_jobs(mdata['reaxff_resources'], + make_submission_compat(mdata['reaxff_machine'], + mdata['reaxff_resources'], [reaxff_command], work_path, run_tasks, @@ -89,7 +91,8 @@ def run_reaxff(jdata, mdata, dispatcher, log_file="reaxff_log"): [ff_path, 
data_init_path, control_path, lmp_path], [trj_path], outlog=log_file, - errlog=log_file) + errlog=log_file, + api_version=mdata.get("api_version", "0.9")) def link_trj(jdata): @@ -102,7 +105,7 @@ def link_trj(jdata): os.path.join(task_path, trj_path))) -def run_build_dataset(jdata, mdata, dispatcher, log_file="build_log"): +def run_build_dataset(jdata, mdata, log_file="build_log"): work_path = build_path build_command = "{cmd} -n {dataset_name} -a {type_map} -d {lammpstrj} -c {cutoff} -s {dataset_size} -k \"{qmkeywords}\" --nprocjob {nprocjob} --nproc {nproc}".format( cmd=mdata["build_command"], @@ -119,7 +122,8 @@ def run_build_dataset(jdata, mdata, dispatcher, log_file="build_log"): run_tasks.sort() run_tasks = [os.path.basename(ii) for ii in run_tasks] - dispatcher.run_jobs(mdata['build_resources'], + make_submission_compat(mdata['build_machine'], + mdata['build_resources'], [build_command], work_path, run_tasks, @@ -128,7 +132,8 @@ def run_build_dataset(jdata, mdata, dispatcher, log_file="build_log"): [trj_path], [f"dataset_{dataset_name}_gjf"], outlog=log_file, - errlog=log_file) + errlog=log_file, + api_version=mdata.get("api_version", "0.9")) def link_fp_input(): @@ -146,7 +151,6 @@ def link_fp_input(): def run_fp(jdata, mdata, - dispatcher, log_file="output", forward_common_files=[]): fp_command = mdata['fp_command'] @@ -162,7 +166,8 @@ def run_fp(jdata, run_tasks = [os.path.basename(ii) for ii in fp_run_tasks] - dispatcher.run_jobs(mdata['fp_resources'], + make_submission_compat(mdata['fp_machine'], + mdata['fp_resources'], [fp_command], work_path, run_tasks, @@ -171,7 +176,8 @@ def run_fp(jdata, ["input"], [log_file], outlog=log_file, - errlog=log_file) + errlog=log_file, + api_version=mdata.get("api_version", "0.9")) def convert_data(jdata): @@ -198,6 +204,7 @@ def gen_init_reaction(args): with open(args.MACHINE, "r") as fp: mdata = json.load(fp) + mdata = convert_mdata(mdata, ["reaxff", "build", "fp"]) record = "record.reaction" iter_rec = -1 numb_task = 7 @@ -213,18 +220,15 @@ def gen_init_reaction(args): elif ii == 0: link_reaxff(jdata) elif ii == 1: - dispatcher = make_dispatcher(mdata["reaxff_machine"]) - run_reaxff(jdata, mdata, dispatcher) + run_reaxff(jdata, mdata) elif ii == 2: link_trj(jdata) elif ii == 3: - dispatcher = make_dispatcher(mdata["build_machine"]) - run_build_dataset(jdata, mdata, dispatcher) + run_build_dataset(jdata, mdata) elif ii == 4: link_fp_input() elif ii == 5: - dispatcher = make_dispatcher(mdata["fp_machine"]) - run_fp(jdata, mdata, dispatcher) + run_fp(jdata, mdata) elif ii == 6: convert_data(jdata) with open(record, "a") as frec: diff --git a/dpgen/data/surf.py b/dpgen/data/surf.py index bc31b6705..543f02bc8 100644 --- a/dpgen/data/surf.py +++ b/dpgen/data/surf.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -import time +import warnings import os,json,shutil,re,glob,argparse import numpy as np import subprocess as sp @@ -12,7 +12,7 @@ from dpgen import dlog from dpgen import ROOT_PATH from dpgen.remote.decide_machine import convert_mdata -from dpgen.dispatcher.Dispatcher import Dispatcher, make_dispatcher +from dpgen.dispatcher.Dispatcher import make_submission_compat #-----PMG--------- from pymatgen.io.vasp import Poscar from pymatgen.core import Structure, Element @@ -565,15 +565,16 @@ def run_vasp_relax(jdata, mdata): run_tasks = [ii.replace(work_dir+"/", "") for ii in relax_run_tasks] #dlog.info(run_tasks) - dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size) - 
dispatcher.run_jobs(fp_resources,
+    make_submission_compat(mdata['fp_machine'],
+                           fp_resources,
                            [fp_command],
                            work_dir,
                            run_tasks,
                            fp_group_size,
                            forward_common_files,
                            forward_files,
-                           backward_files)
+                           backward_files,
+                           api_version=mdata.get("api_version", "0.9"))
 
 def gen_init_surf(args):
     try:
diff --git a/dpgen/dispatcher/Dispatcher.py b/dpgen/dispatcher/Dispatcher.py
index 29bea5669..abbe493b8 100644
--- a/dpgen/dispatcher/Dispatcher.py
+++ b/dpgen/dispatcher/Dispatcher.py
@@ -1,5 +1,6 @@
 from distutils.version import LooseVersion
 import os,sys,time,random,json,glob
+import warnings
 from typing import List
 from dpdispatcher import Task, Submission, Resources, Machine
 from dpgen.dispatcher.LocalContext import LocalSession
@@ -406,3 +407,81 @@ def mdata_arginfo() -> List[Argument]:
     return [
         command_arginfo, machine_arginfo, resources_arginfo,
     ]
+
+
+def make_submission_compat(
+        machine: dict,
+        resources: dict,
+        commands: List[str],
+        work_path: str,
+        run_tasks: List[str],
+        group_size: int,
+        forward_common_files: List[str],
+        forward_files: List[str],
+        backward_files: List[str],
+        outlog: str="log",
+        errlog: str="err",
+        api_version: str="0.9",
+    ) -> None:
+    """Make a submission that is compatible with both dispatcher API v0 and v1.
+
+    If `api_version` is less than 1.0, use `make_dispatcher`. If
+    `api_version` is 1.0 or later, use `make_submission`.
+
+    Parameters
+    ----------
+    machine : dict
+        machine dict
+    resources : dict
+        resource dict
+    commands : list[str]
+        list of commands
+    work_path : str
+        working directory
+    run_tasks : list[str]
+        list of paths to running tasks
+    group_size : int
+        group size
+    forward_common_files : list[str]
+        forwarded common files shared by all tasks
+    forward_files : list[str]
+        forwarded files for each task
+    backward_files : list[str]
+        backward files for each task
+    outlog : str, default=log
+        path to log from stdout
+    errlog : str, default=err
+        path to log from stderr
+    api_version : str, default=0.9
+        API version. 1.0 is recommended
+    """
+    if LooseVersion(api_version) < LooseVersion('1.0'):
+        warnings.warn(f"the dpdispatcher will be updated to a new version. "
+            f"And the interface may be changed. 
Please check the documents for more details")
+        dispatcher = make_dispatcher(machine, resources, work_path, run_tasks, group_size)
+        dispatcher.run_jobs(resources,
+            commands,
+            work_path,
+            run_tasks,
+            group_size,
+            forward_common_files,
+            forward_files,
+            backward_files,
+            outlog=outlog,
+            errlog=errlog)
+
+    elif LooseVersion(api_version) >= LooseVersion('1.0'):
+        submission = make_submission(
+            machine,
+            resources,
+            commands=commands,
+            work_path=work_path,
+            run_tasks=run_tasks,
+            group_size=group_size,
+            forward_common_files=forward_common_files,
+            forward_files=forward_files,
+            backward_files=backward_files,
+            outlog=outlog,
+            errlog=errlog)
+        submission.run_submission()
+

From 7a5dc0c7fec46a2dcdc49cd758a42c88cfd53462 Mon Sep 17 00:00:00 2001
From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com>
Date: Sun, 12 Jun 2022 12:43:44 +0800
Subject: [PATCH 04/26] update: add a new option of absolute volume in
 /dpgen/dpgen/auto_test/EOS.py (#741)

* update: add a new option of absolute volume in ./dpgen/auto_test/EOS.py
* update: add doc in /dpgen/doc/toymodels/
* update: change the description for eos, change the doc in /dpgen/doc/toymodels/
* update: change the notice of absolute volume from print into dlog.info
---
 doc/toymodels/JiamengHuang_pr.md |  9 +++++++++
 dpgen/auto_test/EOS.py           | 15 +++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)
 create mode 100644 doc/toymodels/JiamengHuang_pr.md

diff --git a/doc/toymodels/JiamengHuang_pr.md b/doc/toymodels/JiamengHuang_pr.md
new file mode 100644
index 000000000..3b1210228
--- /dev/null
+++ b/doc/toymodels/JiamengHuang_pr.md
@@ -0,0 +1,9 @@
+A new parameter "vol_abs" is added. If you want to use absolute volumes to get the EOS, you can add
+
+    "vol_abs": true,
+
+in the "eos" part of property.json.
+If it is not set, it defaults to "False".
+When absolute volumes are used, a note like the following appears in the last line of the output during the "make" process:
+
+treat vol_start and vol_end as absolute volume
diff --git a/dpgen/auto_test/EOS.py b/dpgen/auto_test/EOS.py
index 4f332198b..f824dba8e 100644
--- a/dpgen/auto_test/EOS.py
+++ b/dpgen/auto_test/EOS.py
@@ -24,6 +24,8 @@ def __init__(self,
         self.vol_start = parameter['vol_start']
         self.vol_end = parameter['vol_end']
         self.vol_step = parameter['vol_step']
+        parameter['vol_abs'] = parameter.get('vol_abs', False)
+        self.vol_abs = parameter['vol_abs']
         parameter['cal_type'] = parameter.get('cal_type', 'relaxation')
         self.cal_type = parameter['cal_type']
         default_cal_setting = {"relax_pos": True,
@@ -117,6 +119,10 @@ def make_confs(self,
         else:
             print('gen eos from ' + str(self.vol_start) + ' to ' + str(self.vol_end) + ' by every ' + str(self.vol_step))
+            if self.vol_abs :
+                dlog.info('treat vol_start and vol_end as absolute volume')
+            else :
+                dlog.info('treat vol_start and vol_end as relative volume')
         equi_contcar = os.path.join(path_to_equi, 'CONTCAR')
         if not os.path.exists(equi_contcar):
             raise RuntimeError("please do relaxation first")
@@ -138,8 +144,13 @@ def make_confs(self,
             task_list.append(output_task)
             os.symlink(os.path.relpath(equi_contcar), 'POSCAR.orig')
             # scale = (vol / vol_to_poscar) ** (1. / 3.)
-            scale = vol ** (1. / 3.)
-            eos_params = {'volume': vol * vol_to_poscar, 'scale': scale}
+
+            if self.vol_abs :
+                scale = (vol / vol_to_poscar) ** (1. / 3.)
+                eos_params = {'volume': vol, 'scale': scale}
+            else :
+                scale = vol ** (1. / 3.)
+                eos_params = {'volume': vol * vol_to_poscar, 'scale': scale}
             dumpfn(eos_params, 'eos.json', indent=4)
             self.parameter['scale2equi'].append(scale)  # 06/22
             vasp.poscar_scale('POSCAR.orig', 'POSCAR', scale)

From 800fb519681b9dc00b07627ebde4f0ce3dfa20b0 Mon Sep 17 00:00:00 2001
From: Liu Renxi <75369672+Liu-RX@users.noreply.github.com>
Date: Tue, 14 Jun 2022 14:05:50 +0800
Subject: [PATCH 05/26] fix a bug in make_abacus_scf_input (#754)

Co-authored-by: LiuRenxi
---
 README.md                         | 2 +-
 dpgen/generator/lib/abacus_scf.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index fb280e226..f26fb51df 100644
--- a/README.md
+++ b/README.md
@@ -570,7 +570,7 @@ The bold notation of key (such as **type_map**) means that it's a necessary key
 | **user_fp_params** | Dict | |Parameters for the cp2k calculation. Find details at manual.cp2k.org. Only the kind section must be set before use. We assume that you have basic knowledge of cp2k input.
 | **external_input_path** | String | | Conflicts with key `user_fp_params`; use the template input provided by the user. Some rules should be followed; read the following text in detail.
 | *fp_style == ABACUS*
-| **user_fp_params** | Dict | |Parameters for ABACUS INPUT. find detail [Here](https://github.com/deepmodeling/abacus-develop/blob/develop/docs/input-main.md#out-descriptor). If `deepks_model` is set, the model file should be in the pseudopotential directory.
+| **user_fp_params** | Dict | |Parameters for ABACUS INPUT. Find details [here](https://github.com/deepmodeling/abacus-develop/blob/develop/docs/input-main.md#out-descriptor). If `deepks_model` is set, the model file should be in the pseudopotential directory. You can also set the `KPT` file by adding `k_points`, which corresponds to a list of six integers, to this dictionary.
 | **fp_orb_files** | List | |List of atomic orbital files. The files should be in the pseudopotential directory.
 | **fp_dpks_descriptor** | String | |DeePKS descriptor file name. The file should be in the pseudopotential directory.

diff --git a/dpgen/generator/lib/abacus_scf.py b/dpgen/generator/lib/abacus_scf.py
index 256eb1d9d..1a9882979 100644
--- a/dpgen/generator/lib/abacus_scf.py
+++ b/dpgen/generator/lib/abacus_scf.py
@@ -83,7 +83,7 @@ def make_abacus_scf_input(fp_params):
             ret += "deepks_scf %d\n" % fp_params["deepks_scf"]
         elif key == "deepks_model":
             ret += "deepks_model %s\n" % fp_params["deepks_model"]
-        else:
+        elif key != "k_points":  # the k_points key is used to generate the KPT file
ret += "%s %s\n" % (key, str(fp_params[key])) return ret From aca0d3229408b3082693149bd888b9527db85684 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 14 Jun 2022 02:17:46 -0400 Subject: [PATCH 06/26] init_reaction: fix compatibility with new dpdispatcher (#755) fix compatibility as the key was changed in the dpdispatcher --- dpgen/data/reaction.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dpgen/data/reaction.py b/dpgen/data/reaction.py index 0abfeb965..5e900f9de 100644 --- a/dpgen/data/reaction.py +++ b/dpgen/data/reaction.py @@ -107,6 +107,9 @@ def link_trj(jdata): def run_build_dataset(jdata, mdata, log_file="build_log"): work_path = build_path + # compatible with new dpdispatcher and old dpgen.dispatcher + build_ntasks = mdata["build_resources"].get("cpu_per_node", mdata["build_resources"]["task_per_node"]) + fp_ntasks = mdata["fp_resources"].get("cpu_per_node", mdata["fp_resources"]["task_per_node"]) build_command = "{cmd} -n {dataset_name} -a {type_map} -d {lammpstrj} -c {cutoff} -s {dataset_size} -k \"{qmkeywords}\" --nprocjob {nprocjob} --nproc {nproc}".format( cmd=mdata["build_command"], type_map=" ".join(jdata["type_map"]), @@ -114,8 +117,8 @@ def run_build_dataset(jdata, mdata, log_file="build_log"): cutoff=jdata["cutoff"], dataset_size=jdata["dataset_size"], qmkeywords=jdata["qmkeywords"], - nprocjob=mdata["fp_resources"]["task_per_node"], - nproc=mdata["build_resources"]["task_per_node"], + nprocjob=fp_ntasks, + nproc=build_ntasks, dataset_name=dataset_name ) run_tasks = glob.glob(os.path.join(work_path, 'task.*')) From bace57e624dbdc0e40b3d5e65cc5967db16af234 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 19 Jun 2022 12:09:51 -0400 Subject: [PATCH 07/26] generate machine parameter docs for simplify and init (#751) * generate machine parameter page for simplify and init * switching to new dargs directive --- doc/conf.py | 1 + doc/index.rst | 5 +++++ doc/init/init-bulk-mdata.rst | 6 ++++++ doc/init/init-reaction-mdata.rst | 6 ++++++ doc/init/init-surf-mdata.rst | 6 ++++++ doc/simplify/simplify-mdata.rst | 6 ++++++ dpgen/arginfo.py | 35 +++++++++++++++++++++++++++++++ dpgen/data/arginfo.py | 36 ++++++++++++++++++++++++++++++++ dpgen/generator/arginfo.py | 16 ++------------ dpgen/simplify/arginfo.py | 13 ++++++++++++ 10 files changed, 116 insertions(+), 14 deletions(-) create mode 100644 doc/init/init-bulk-mdata.rst create mode 100644 doc/init/init-reaction-mdata.rst create mode 100644 doc/init/init-surf-mdata.rst create mode 100644 doc/simplify/simplify-mdata.rst create mode 100644 dpgen/arginfo.py create mode 100644 dpgen/data/arginfo.py create mode 100644 dpgen/simplify/arginfo.py diff --git a/doc/conf.py b/doc/conf.py index 9d5ecc006..99dce21b5 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -40,6 +40,7 @@ extensions = [ 'deepmodeling_sphinx', + 'dargs.sphinx', "sphinx_rtd_theme", 'myst_parser', 'sphinx.ext.autosummary', diff --git a/doc/index.rst b/doc/index.rst index 341ce5d79..6eea4d95b 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -32,6 +32,9 @@ DPGEN's documentation :maxdepth: 2 :caption: Init + init/init-bulk-mdata + init/init-surf-mdata + init/init-reaction-mdata .. _autotest:: @@ -46,6 +49,8 @@ DPGEN's documentation :maxdepth: 2 :caption: Simplify + simplify/simplify-mdata + .. 
_tutorial: diff --git a/doc/init/init-bulk-mdata.rst b/doc/init/init-bulk-mdata.rst new file mode 100644 index 000000000..b3098e906 --- /dev/null +++ b/doc/init/init-bulk-mdata.rst @@ -0,0 +1,6 @@ +dpgen init_bulk machine parameters +================================== + +.. dargs:: + :module: dpgen.data.arginfo + :func: init_bulk_mdata_arginfo diff --git a/doc/init/init-reaction-mdata.rst b/doc/init/init-reaction-mdata.rst new file mode 100644 index 000000000..2fe35a0d8 --- /dev/null +++ b/doc/init/init-reaction-mdata.rst @@ -0,0 +1,6 @@ +dpgen init_reaction machine parameters +====================================== + +.. dargs:: + :module: dpgen.data.arginfo + :func: init_reaction_mdata_arginfo diff --git a/doc/init/init-surf-mdata.rst b/doc/init/init-surf-mdata.rst new file mode 100644 index 000000000..35e8e322f --- /dev/null +++ b/doc/init/init-surf-mdata.rst @@ -0,0 +1,6 @@ +dpgen init_surf machine parameters +================================== + +.. dargs:: + :module: dpgen.data.arginfo + :func: init_surf_mdata_arginfo diff --git a/doc/simplify/simplify-mdata.rst b/doc/simplify/simplify-mdata.rst new file mode 100644 index 000000000..995fc90f8 --- /dev/null +++ b/doc/simplify/simplify-mdata.rst @@ -0,0 +1,6 @@ +dpgen simplify machine parameters +================================= + +.. dargs:: + :module: dpgen.simplify.arginfo + :func: simplify_mdata_arginfo diff --git a/dpgen/arginfo.py b/dpgen/arginfo.py new file mode 100644 index 000000000..3f657942a --- /dev/null +++ b/dpgen/arginfo.py @@ -0,0 +1,35 @@ +from typing import Tuple + +from dargs import Argument + +from dpgen.dispatcher.Dispatcher import mdata_arginfo + + +def general_mdata_arginfo(name: str, tasks: Tuple[str]) -> Argument: + """Generate arginfo for general mdata. + + Parameters + ---------- + name : str + mdata name + tasks : tuple[str] + tuple of task keys, e.g. ("train", "model_devi", "fp") + + Returns + ------- + Argument + arginfo + """ + + doc_api_version = "Please set to 1.0" + doc_run_mdata = "machine.json file" + arg_api_version = Argument("api_version", str, optional=False, doc=doc_api_version) + + sub_fields = [arg_api_version] + doc_mdata = "Parameters of command, machine, and resources for %s" + for task in tasks: + sub_fields.append(Argument( + task, dict, optional=False, sub_fields=mdata_arginfo(), + doc=doc_mdata % task, + )) + return Argument(name, dict, sub_fields=sub_fields, doc=doc_run_mdata) diff --git a/dpgen/data/arginfo.py b/dpgen/data/arginfo.py new file mode 100644 index 000000000..d5814c036 --- /dev/null +++ b/dpgen/data/arginfo.py @@ -0,0 +1,36 @@ +from dargs import Argument + +from dpgen.arginfo import general_mdata_arginfo + + +def init_bulk_mdata_arginfo() -> Argument: + """Generate arginfo for dpgen init_bulk mdata. + + Returns + ------- + Argument + arginfo + """ + return general_mdata_arginfo("init_bulk_mdata", ("fp",)) + + +def init_surf_mdata_arginfo() -> Argument: + """Generate arginfo for dpgen init_surf mdata. + + Returns + ------- + Argument + arginfo + """ + return general_mdata_arginfo("init_surf_mdata", ("fp",)) + + +def init_reaction_mdata_arginfo() -> Argument: + """Generate arginfo for dpgen init_reaction mdata. 
+ + Returns + ------- + Argument + arginfo + """ + return general_mdata_arginfo("init_reaction_mdata", ("reaxff", "build", "fp")) diff --git a/dpgen/generator/arginfo.py b/dpgen/generator/arginfo.py index f8815862d..cb2fb887b 100644 --- a/dpgen/generator/arginfo.py +++ b/dpgen/generator/arginfo.py @@ -1,6 +1,6 @@ from dargs import Argument -from dpgen.dispatcher.Dispatcher import mdata_arginfo +from dpgen.arginfo import general_mdata_arginfo def run_mdata_arginfo() -> Argument: """Generate arginfo for dpgen run mdata. @@ -10,16 +10,4 @@ def run_mdata_arginfo() -> Argument: Argument arginfo """ - - doc_api_version = "Please set to 1.0" - doc_run_mdata = "machine.json file" - arg_api_version = Argument("api_version", str, optional=False, doc=doc_api_version) - - sub_fields = [arg_api_version] - doc_mdata = "Parameters of command, machine, and resources for %s" - for task in ("train", "model_devi", "fp"): - sub_fields.append(Argument( - task, dict, optional=False, sub_fields=mdata_arginfo(), - doc=doc_mdata % task, - )) - return Argument("run_mdata", dict, sub_fields=sub_fields, doc=doc_run_mdata) + return general_mdata_arginfo("run_mdata", ("train", "model_devi", "fp")) diff --git a/dpgen/simplify/arginfo.py b/dpgen/simplify/arginfo.py new file mode 100644 index 000000000..0fbfe606e --- /dev/null +++ b/dpgen/simplify/arginfo.py @@ -0,0 +1,13 @@ +from dargs import Argument + +from dpgen.arginfo import general_mdata_arginfo + +def simplify_mdata_arginfo() -> Argument: + """Generate arginfo for dpgen simplify mdata. + + Returns + ------- + Argument + arginfo + """ + return general_mdata_arginfo("simplify_mdata", ("train", "model_devi", "fp")) From 9083b1d795a8c2dcfd6c3c002705b00b96506dea Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 20 Jun 2022 23:17:34 -0400 Subject: [PATCH 08/26] add auto cli docs (#759) * add auto cli docs * fix typo * fix package name... * forgot to return parser * add the blank line --- doc/conf.py | 1 + doc/index.rst | 2 ++ doc/overview/cli.rst | 7 +++++++ doc/requirements.txt | 1 + dpgen/main.py | 16 +++++++++++++--- 5 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 doc/overview/cli.rst diff --git a/doc/conf.py b/doc/conf.py index 99dce21b5..97b4b206b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -45,6 +45,7 @@ 'myst_parser', 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', + 'sphinxarg.ext', ] diff --git a/doc/index.rst b/doc/index.rst index 6eea4d95b..eaa229813 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -7,6 +7,8 @@ DPGEN's documentation .. toctree:: :maxdepth: 2 :caption: Overview + + overview/cli .. _installation:: diff --git a/doc/overview/cli.rst b/doc/overview/cli.rst new file mode 100644 index 000000000..e57f1b064 --- /dev/null +++ b/doc/overview/cli.rst @@ -0,0 +1,7 @@ +Command line interface +====================== + +.. argparse:: + :module: dpgen.main + :func: main_parser + :prog: dpgen diff --git a/doc/requirements.txt b/doc/requirements.txt index 33ad28e39..0ae5c76f1 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -2,6 +2,7 @@ sphinx>=4.0.2 recommonmark sphinx_rtd_theme sphinx_markdown_tables +sphinx-argparse myst-parser deepmodeling_sphinx . diff --git a/dpgen/main.py b/dpgen/main.py index 6dcdc4ccd..c93c41ef4 100644 --- a/dpgen/main.py +++ b/dpgen/main.py @@ -30,9 +30,14 @@ __email__ = "" -def main(): - info() - print("Description\n------------") +def main_parser() -> argparse.ArgumentParser: + """Returns parser for `dpgen` command. 
+ + Returns + ------- + argparse.ArgumentParser + parser for `dpgen` command + """ parser = argparse.ArgumentParser(description=""" dpgen is a convenient script that uses DeepGenerator to prepare initial data, drive DeepMDkit and analyze results. This script works based on @@ -156,8 +161,13 @@ def main(): help="parameter file, json format") parser_db.set_defaults(func=db_run) + return parser +def main(): + info() + print("Description\n------------") + parser = main_parser() try: import argcomplete argcomplete.autocomplete(parser) From f5c317228d1673f9428edbd5453c7b3a41aaf330 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Wed, 29 Jun 2022 10:01:41 +0800 Subject: [PATCH 09/26] correct the wrong spelling of 'failure' (#764) --- dpgen/dispatcher/DispatcherList.py | 6 +++--- examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dpgen/dispatcher/DispatcherList.py b/dpgen/dispatcher/DispatcherList.py index 085ae5d9a..22b77fd50 100644 --- a/dpgen/dispatcher/DispatcherList.py +++ b/dpgen/dispatcher/DispatcherList.py @@ -45,7 +45,7 @@ def run_jobs(self, mark_failure = False, outlog = 'log', errlog = 'err'): - ratio_failure = self.mdata_resources.get("ratio_failue", 0) + ratio_failure = self.mdata_resources.get("ratio_failure", 0) while True: if self.check_all_dispatchers_finished(ratio_failure): self.clean() @@ -188,7 +188,7 @@ def make_dispatcher(self, ii): # Base - def check_dispatcher_status(self, ii, allow_failue=False): + def check_dispatcher_status(self, ii, allow_failure=False): '''catch running dispatcher exception if no exception occured, check finished''' if self.dispatcher_list[ii]["dispatcher_status"] == "running": @@ -198,7 +198,7 @@ def check_dispatcher_status(self, ii, allow_failue=False): clean = self.mdata_resources.get("clean", False) try: # avoid raising ssh exception in download proceess - finished = self.dispatcher_list[ii]["dispatcher"].all_finished(self.dispatcher_list[ii]["entity"].job_handler, allow_failue, clean) + finished = self.dispatcher_list[ii]["dispatcher"].all_finished(self.dispatcher_list[ii]["entity"].job_handler, allow_failure, clean) if finished: self.dispatcher_list[ii]["dispatcher_status"] = "finished" except Exception: diff --git a/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json b/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json index 3de59661f..e2db8d254 100644 --- a/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json +++ b/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json @@ -64,7 +64,7 @@ }, "resources": { "allow_failure": true, - "ratio_failue": 0.05, + "ratio_failure": 0.05, "task_per_node": 16, "with_mpi": true, "_comment" : "Load the intel compiler.", From 66d856cd9862e9b7becef596c4d8a8cd504cee06 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 1 Jul 2022 01:22:16 -0400 Subject: [PATCH 10/26] upgrade machine examples to new dpdispatcher (#762) --- .../DeePMD-kit-1.0/machine-local-4GPU.json | 165 +++++++------ .../machine/DeePMD-kit-1.x/machine-local.json | 103 ++++---- .../machine-lsf-slurm-cp2k.json | 178 +++++++------- .../DeePMD-kit-1.x/machine-pbs-gaussian.json | 163 +++++++------ .../DeePMD-kit-1.x/machine-slurm-qe.json | 223 +++++++++--------- 5 files changed, 442 insertions(+), 390 deletions(-) diff --git a/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json b/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json index e2db8d254..e0e6bfca0 100644 --- 
a/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json +++ b/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json @@ -1,79 +1,90 @@ { - "_comment" : "This is an example of DP-GEN on Local device running with 4 GPUs", - "_comment" : "Last updated on 2021.5.9 for DP-GEN 0.9.2 by Ke XU", - "train": [ - { - "_comment" : "Specify the installed path of DeePMD-kit", - "_comment" : "The version of DeePMD-kit should be 1.*", - "command": "/home/user/anaconda3/bin/dp", - "_comment" : "Specify machine settings", - "machine": { - "_comment" : "Supported batches include slurm, pbs, shell, lsf.", - "batch": "shell", - "work_path": "/tmp/dpwork", - "_comment": "that's all" - }, - "resources":{ - "_comment" : "The number of nodes.", - "numb_node": 1, - "_comment" : "If you choose to run with multiple GPUs simultaneously, just ignore numb_gpu.", - "numb_gpu": 0, - "_comment" : "The number of CPUs.", - "task_per_node": 4, - "_comment" : "The number of GPUs that can be used for each task.", - "manual_cuda_devices": 4, - "_comment" : "The number of tasks that can be run in each GPU.", - "manual_cuda_multiplicity":1, - "_comment" : "Allow the multi-GPU task running.", - "cuda_multi_task": true, - "module_list": [], - "_comment" : "Environment to be activated. This will generate source xxx/psxevars.sh in scripts. ", - "source_list": ["/opt/intel/parallel_studio_xe_2020/psxevars.sh"] - }, - "_comment" : "DP-GEN will put 4 tasks together in one submitting script.", - "group_size": 4 - } - ], - - "model_devi": [ - { - "machine": { - "batch": "shell", - "work_path": "/tmp/dpwork" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 0, - "task_per_node": 4, - "manual_cuda_devices": 4, - "manual_cuda_multiplicity":1, - "cuda_multi_task": true, - "source_list": [], - "module_list": [] - }, - "command": "/home/user/Soft/Deepmd/lammps-stable_29Oct2020/src/lmp_mpi", - "group_size": 4 - } - ], - - "fp": [ - { - "machine": { - "batch": "shell", - "work_path": "/tmp/dpwork" - }, - "resources": { - "allow_failure": true, - "ratio_failure": 0.05, - "task_per_node": 16, - "with_mpi": true, - "_comment" : "Load the intel compiler.", - "source_list": ["/opt/intel/parallel_studio_xe_2020/psxevars.sh"], - "envs": {"PATH" : "/home/user/Soft/VASP/vasp.5.4.4-allbak/bin:$PATH"}, - "_comment" : "This will generate export PATH=/home/user/Soft/VASP/vasp.5.4.4-allbak/bin:$PATH in scripts;" - }, - "command": "vasp_std", - "group_size": 1 - } - ] + "_comment" : "This is an example of DP-GEN on Local device running with 4 GPUs", + "_comment": "Last updated on 2021.5.9 for DP-GEN 0.9.2 by Ke XU", + "train": { + "_comment" : "Specify the installed path of DeePMD-kit", + "command": "/home/user/anaconda3/bin/dp", + "_comment" : "Specify machine settings", + "machine": { + "_comment": "Supported batches include slurm, pbs, shell, lsf.", + "batch_type": "shell", + "context_type": "local", + "remote_root": "/tmp/dpwork", + "local_root": "./" + }, + "resources": { + "module_list": [], + "_comment": "Environment to be activated. This will generate source xxx/psxevars.sh in scripts. 
", + "source_list": [ + "/opt/intel/parallel_studio_xe_2020/psxevars.sh" + ], + "batch_type": "shell", + "_comment": "DP-GEN will put 4 tasks together in one submitting script.", + "group_size": 4, + "_comment" : "The number of nodes.", + "number_node": 1, + "_comment" : "The number of CPUs.", + "cpu_per_node": 4, + "_comment" : "If you choose to run with multiple GPUs simultaneously, just ignore numb_gpu.", + "gpu_per_node": 0, + "kwargs": {}, + "strategy": { + "_comment" : "Allow the multi-GPU task running.", + "if_cuda_multi_devices": true + }, + "para_deg": 4, + "queue_name": "" + } + }, + "model_devi": { + "machine": { + "batch_type": "shell", + "context_type": "local", + "remote_root": "/tmp/dpwork", + "local_root": "./" + }, + "resources": { + "source_list": [], + "module_list": [], + "batch_type": "shell", + "group_size": 4, + "number_node": 1, + "cpu_per_node": 4, + "gpu_per_node": 0, + "kwargs": {}, + "strategy": { + "if_cuda_multi_devices": true + }, + "para_deg": 4, + "queue_name": "" + }, + "command": "/home/user/Soft/Deepmd/lammps-stable_29Oct2020/src/lmp_mpi" + }, + "fp": { + "machine": { + "batch_type": "shell", + "context_type": "local", + "remote_root": "/tmp/dpwork", + "local_root": "./" + }, + "resources": { + "_comment" : "Load the intel compiler.", + "source_list": [ + "/opt/intel/parallel_studio_xe_2020/psxevars.sh" + ], + "_comment": "This will generate export PATH=/home/user/Soft/VASP/vasp.5.4.4-allbak/bin:$PATH in scripts;", + "envs": { + "PATH": "/home/user/Soft/VASP/vasp.5.4.4-allbak/bin:$PATH" + }, + "batch_type": "shell", + "group_size": 1, + "cpu_per_node": 16, + "kwargs": {}, + "queue_name": "", + "number_node": 1, + "gpu_per_node": 1 + }, + "command": "mpirun -n 16 vasp_std || :" + }, + "api_version": "1.0" } diff --git a/examples/machine/DeePMD-kit-1.x/machine-local.json b/examples/machine/DeePMD-kit-1.x/machine-local.json index a266f712b..c8134d750 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-local.json +++ b/examples/machine/DeePMD-kit-1.x/machine-local.json @@ -1,42 +1,65 @@ { - "_comment": "training on localhost ", - "_comment" : "This is for DeePMD-kit 1.*", - "train_command" : "/home/wanghan/local/deepmd/1.*/dp", - "train_machine": { - "batch": "shell", - "work_path" : "/home/wanghan/tmp/subs/" - }, - "train_resources": { - "envs": { - } - }, - - - "_comment": "model_devi on localhost ", - "model_devi_command": "/home/wanghan/local/bin/lmp_mpi_010", - "model_devi_group_size": 5, - "model_devi_machine": { - "batch": "shell", - "_comment" : "If lazy_local is true, calculations are done directly in current folders.", - "lazy_local" : true - }, - "model_devi_resources": { - }, - - "_comment": "fp on localhost ", - "fp_command": "/home/wanghan/local/bin/vasp_std", - "fp_group_size": 2, - "fp_machine": { - "batch": "shell", - "work_path" : "/home/wanghan/tmp/subs/", - "_comment" : "that's all" - }, - "fp_resources": { - "module_list": ["mpi"], - "task_per_node":4, - "with_mpi": true, - "_comment": "that's all" + "api_version": "1.0", + "train": { + "_comment": "training on localhost", + "machine": { + "batch_type": "shell", + "context_type": "local", + "remote_root": "/home/wanghan/tmp/subs/", + "local_root": "./" + }, + "resources": { + "envs": {}, + "batch_type": "shell", + "group_size": 1, + "kwargs": {}, + "queue_name": "", + "number_node": 1, + "cpu_per_node": 1, + "gpu_per_node": 1 + }, + "command": "/home/wanghan/local/deepmd/1.*/dp" }, - - "_comment": " that's all " -} + "model_devi": { + "_comment": "model devi on localhost", + "machine": 
{ + "_comment": "If lazy_local, calculations are done directly in current folders.", + "batch_type": "shell", + "context_type": "lazylocal", + "local_root": "./" + }, + "resources": { + "batch_type": "shell", + "group_size": 5, + "kwargs": {}, + "queue_name": "", + "number_node": 1, + "cpu_per_node": 1, + "gpu_per_node": 1 + }, + "command": "/home/wanghan/local/bin/lmp_mpi_010" + }, + "fp": { + "_comment": "fp on localhost", + "machine": { + "batch_type": "shell", + "context_type": "local", + "remote_root": "/home/wanghan/tmp/subs/", + "local_root": "./" + }, + "resources": { + "module_list": [ + "mpi" + ], + "_comment": "that's all", + "batch_type": "shell", + "group_size": 2, + "cpu_per_node": 4, + "kwargs": {}, + "queue_name": "", + "number_node": 1, + "gpu_per_node": 1 + }, + "command": "mpirun -n 4 /home/wanghan/local/bin/vasp_std" + } +} \ No newline at end of file diff --git a/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json b/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json index 4fb5845ee..348609c1e 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json +++ b/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json @@ -1,88 +1,98 @@ { - "train": - { - "machine": { - "machine_type": "slurm", - "hostname": "210.34.15.205", - "port": 22, - "username": "ybzhuang", - "work_path": "/home/ybzhuang/workdir" - }, - "resources": { - "numb_gpu": 1, - "numb_node": 1, - "task_per_node": 1, - "partition": "gpu", - "job_name": "train", - "qos":"emergency", - "exclude_list": [], - "source_list": [ - ], - "module_list": [ - "deepmd/1.2" - ], - "time_limit": "96:0:0", - "submit_wait_time": 60 - }, - "python_path": "/share/apps/deepmd/compress/bin/python3.8" + "train": { + "machine": { + "batch_type": "slurm", + "context_type": "ssh", + "remote_profile": { + "hostname": "210.34.15.205", + "port": 22, + "username": "ybzhuang" + }, + "remote_root": "/home/ybzhuang/workdir", + "local_root": "./" + }, + "resources": { + "source_list": [], + "module_list": [ + "deepmd/1.2" + ], + "batch_type": "slurm", + "number_node": 1, + "cpu_per_node": 1, + "gpu_per_node": 1, + "queue_name": "emergency", + "custom_flags": [ + "#SBATCH -t 96:0:0" + ], + "kwargs": {}, + "wait_time": 60, + "group_size": 1 + }, + "command": "/share/apps/deepmd/compress/bin/python3.8-m deepmd" }, - "model_devi": - { - "machine": { - "machine_type": "slurm", - "hostname": "210.34.15.205", - "port": 22, - "username": "ybzhuang", - "work_path": "/home/ybzhuang/workdir" - }, - "resources": { - "numb_gpu": 1, - "numb_node": 1, - "task_per_node": 1, - "partition": "gpu", - "job_name": "md", - "qos":"emergency", - "exclude_list": [], - "source_list": [ - ], - "module_list": [ - "deepmd/1.2" - ], - "time_limit": "96:0:0", - "submit_wait_time": 60 - }, - "command": "lmp_mpi", - "group_size": 5 + "model_devi": { + "machine": { + "batch_type": "slurm", + "context_type": "ssh", + "remote_profile": { + "hostname": "210.34.15.205", + "port": 22, + "username": "ybzhuang" + }, + "remote_root": "/home/ybzhuang/workdir", + "local_root": "./" + }, + "resources": { + "source_list": [], + "module_list": [ + "deepmd/1.2" + ], + "batch_type": "slurm", + "group_size": 5, + "number_node": 1, + "cpu_per_node": 1, + "gpu_per_node": 1, + "queue_name": "emergency", + "custom_flags": [ + "#SBATCH -t 96:0:0" + ], + "kwargs": {}, + "wait_time": 60 + }, + "command": "lmp_mpi" }, - "fp": - { - "machine": { - "machine_type": "lsf", - "hostname": "localhost", - "port": 6666, - "username": "ybzhuang", - "work_path": 
"/data/ybzhuang/methane-dpgen/dpgen-tutorial-2020-08-23/dpgen-tutorial-mathane/workpath" - }, - "resources": { - "cvasp": false, - "task_per_node": 32, - "numb_node": 1, - "node_cpu": 32, - "exclude_list": [], - "with_mpi": true, - "source_list": [ - ], - "module_list": [ - "intel/17.5.239", - "mpi/intel/2017.5.239", - "gcc/5.5.0", - "cp2k/7.1" + "fp": { + "machine": { + "batch_type": "lsf", + "context_type": "ssh", + "remote_profile": { + "hostname": "localhost", + "port": 6666, + "username": "ybzhuang" + }, + "remote_root": "/data/ybzhuang/methane-dpgen/dpgen-tutorial-2020-08-23/dpgen-tutorial-mathane/workpath", + "local_root": "./" + }, + "resources": { + "source_list": [], + "module_list": [ + "intel/17.5.239", + "mpi/intel/2017.5.239", + "gcc/5.5.0", + "cp2k/7.1" + ], + "batch_type": "lsf", + "group_size": 50, + "number_node": 1, + "cpu_per_node": 32, + "queue_name": "53-medium", + "custom_flags": [ + "#BSUB -W 12:00:00" ], - "time_limit": "12:00:00", - "partition": "53-medium", - "_comment": "that's Bel" - }, - "command": "cp2k.popt -i input.inp", - "group_size": 50 - } -} + "kwargs": {}, + "gpu_per_node": 1 + }, + "command": "mpirun -n 32 cp2k.popt -i input.inp" + }, + "api_version": "1.0" +} \ No newline at end of file diff --git a/examples/machine/DeePMD-kit-1.x/machine-pbs-gaussian.json b/examples/machine/DeePMD-kit-1.x/machine-pbs-gaussian.json index 6893471c5..daa743dcc 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-pbs-gaussian.json +++ b/examples/machine/DeePMD-kit-1.x/machine-pbs-gaussian.json @@ -1,79 +1,88 @@ { - "_comment": "training on localhost ", - "train_command": "/gpfs/home/tzhu/anaconda3/envs/python3.6/bin/dp", - "train_machine": { - "machine_type": "lsf", - "hostname" : "xxx.xxx.xxx.xxx", - "port" : 22, - "username": "tzhu", - "work_path" : "/gpfs/home/tzhu/jzzeng/dpgen_workdir", - "_comment" : "that's all" - }, - "train_resources": { - "source_list": [ "activate deepmd" ], - "envs": { - "KMP_BLOCKTIME": 0, - "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" - }, - "numb_gpu": 1, - "numb_node": 1, - "node_cpu": 0, - "partition": "newgpu", - "job_name": "dpgen_jzzeng", - "with_mpi": false, - "time_limit": false, - "_comment": "that's all" - }, - - - "_comment": "model_devi on localhost ", - "model_devi_command": "/gpfs/home/tzhu/lammps-stable_5Jun2019/src/lmp_intel_cpu_intelmpi -pk intel 0 omp 2", - "model_devi_group_size": 1, - "model_devi_machine": { - "machine_type": "lsf", - "hostname" : "xxx.xxx.xxx.xxx", - "port" : 22, - "username": "tzhu", - "work_path" : "/gpfs/home/tzhu/jzzeng/dpgen_workdir", - "_comment" : "that's all" - }, - "model_devi_resources": { - "envs": { - "KMP_BLOCKTIME": 0 - }, - "source_list": [ "activate deepmd" ], - "numb_gpu": 1, - "numb_node": 1, - "node_cpu": 0, - "time_limit": false, - "partition": "newgpu", - "job_name": "dpgen_jzzeng", - "with_mpi": true, - "task_per_node": 1, - "_comment": "that's all" - }, - - "_comment": "fp on lsf //localhost ", - "fp_command": "/public/home/tzhu/g16/g16 < input", - "fp_group_size": 1, - "fp_machine": { - "machine_type": "pbs", - "hostname" : "xxx.xxx.xxx.xxx", - "port" : 2323, - "username": "tzhu", - "work_path" : "/public/home/tzhu/jzzeng/dpgen_workdir", - "_comment" : "that's all" - }, - "fp_resources": { - "node_cpu":28, - "numb_node": 1, - "job_name": "dpgen_jzzeng", - "task_per_node": 28, - "with_mpi": false, - "time_limit": "10:00:00", - "allow_failure": true, - "partition": "small", - "_comment": "that's all" + "api_version": "1.0", + "train": { + "machine": { + "batch_type": 
"lsf", + "context_type": "ssh", + "remote_profile": { + "hostname": "xxx.xxx.xxx.xxx", + "port": 22, + "username": "tzhu" + }, + "remote_root": "/gpfs/home/tzhu/jzzeng/dpgen_workdir", + "local_root": "./" + }, + "resources": { + "source_list": [ + "activate deepmd" + ], + "envs": { + "KMP_BLOCKTIME": 0, + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" + }, + "batch_type": "lsf", + "group_size": 1, + "number_node": 1, + "cpu_per_node": 0, + "gpu_per_node": 1, + "queue_name": "newgpu", + "kwargs": {} + }, + "command": "/gpfs/home/tzhu/anaconda3/envs/python3.6/bin/dp" }, - "_comment": " that's all " -} + "model_devi": { + "machine": { + "batch_type": "lsf", + "context_type": "ssh", + "remote_profile": { + "hostname": "xxx.xxx.xxx.xxx", + "port": 22, + "username": "tzhu" + }, + "remote_root": "/gpfs/home/tzhu/jzzeng/dpgen_workdir", + "local_root": "./" + }, + "resources": { + "envs": { + "KMP_BLOCKTIME": 0 + }, + "source_list": [ + "activate deepmd" + ], + "batch_type": "lsf", + "group_size": 1, + "number_node": 1, + "cpu_per_node": 0, + "gpu_per_node": 1, + "queue_name": "newgpu", + "kwargs": {} + }, + "command": "mpirun -n 0 /gpfs/home/tzhu/lammps-stable_5Jun2019/src/lmp_intel_cpu_intelmpi -pk intel 0 omp 2" + }, + "fp": { + "machine": { + "batch_type": "pbs", + "context_type": "ssh", + "remote_profile": { + "hostname": "xxx.xxx.xxx.xxx", + "port": 2323, + "username": "tzhu" + }, + "remote_root": "/public/home/tzhu/jzzeng/dpgen_workdir", + "local_root": "./" + }, + "resources": { + "batch_type": "pbs", + "group_size": 1, + "number_node": 1, + "cpu_per_node": 28, + "queue_name": "small", + "custom_flags": [ + "#PBS -l walltime=10:00:00" + ], + "kwargs": {}, + "gpu_per_node": 1 + }, + "command": "/public/home/tzhu/g16/g16 < input || :" + } +} \ No newline at end of file diff --git a/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json b/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json index 5f15303d6..2ff5b4a4b 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json +++ b/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json @@ -1,113 +1,112 @@ - { - "_comment" : "This is an example of DP-GEN on Slurm", - "_comment" : "Last updated on 2021.4.30 for DP-GEN 0.9.2 by Yuzhi Zhang", - "train" : - { - "_comment" : "Specify the installed path of DeePMD-kit", - "_comment" : "The version of DeePMD-kit should be 1.*", - "command": "PATH_TO_DEEPMD/dp", - "_comment" : "Specify machine settings", - "machine": { - "_comment" : "Supported batches include slurm, pbs, shell, lsf.", - "batch": "slurm", - "_comment" : "If your jobs are executed on a local workstation, you can let hostname be localhost.", - "_comment" : "Otherwise you should provide the IP of machine you want to connect via ssh.", - "hostname": "localhost", - "_comment" : "The port for connection, most common settings is 22", - "port": 22, - "_comment" : "Specify your username. Sometimes you may need specify password. Exactly the name of key is password. ", - "username": "USERNAME", - "_comment" : "Specify where you want your job executes, all of tasks will be sent to work_path on this machine.", - "_comment" : "You should alwasy make sure that directory of work_path exits. ", - "work_path": "PATH_TO_WORK", - "_comment": "that's all" - }, - "resources": { - "_comment" : "The number of nodes. This will generate #SBATCH -N 1 in your script. ", - "numb_node": 1, - "_comment" : "The number of GPU cards. #SBATCH --gres=gpu:1", - "numb_gpu": 1, - "_comment" : "The number of CPUs. 
#SBATCH -n 4", - "task_per_node": 4, - "_comment" : "Partition. #SBATCH -p all", - "partition": "all", - "_comment" : "Memory limit. #SBATCH --mem=16G", - "mem_limit": 16, - "_comment" : "Nodelist to be excluded. #SBATCH --exclude=gpu06,gpu07", - "exclude_list": [ - "gpu06", - "gpu07" - ], - "_comment" : "Environment to be activated. This will generate source PATH/train_new.env . ", - "source_list": [ - "PATH/train_new.env" - ], - "_comment" : " Module is a common tools on HPC clustes to manage softwares for multiple users.", - "_comment" : "Modules to be loaded. This will generate module load intel", - "module_list": ["intel"], - "_comment" : "Time limit. ", - "time_limit": "23:0:0", - "_comment": "that's all" - } +{ + "_comment": "Last updated on 2021.4.30 for DP-GEN 0.9.2 by Yuzhi Zhang", + "train": { + "_comment" : "Specify the installed path of DeePMD-kit", + "command": "PATH_TO_DEEPMD/dp", + "_comment" : "Specify machine settings", + "machine": { + "batch_type": "slurm", + "context_type": "ssh", + "remote_profile": { + "hostname": "localhost", + "_comment" : "The port for connection, most common settings is 22", + "port": 22, + "_comment" : "Specify your username.", + "username": "USERNAME" + }, + "_comment" : "You should alwasy make sure that directory of work_path exits. ", + "remote_root": "PATH_TO_WORK", + "local_root": "./" + }, + "resources": { + "_comment" : "Environment to be activated. This will generate source PATH/train_new.env . ", + "source_list": [ + "PATH/train_new.env" + ], + "_comment" : " Module is a common tools on HPC clustes to manage softwares for multiple users.", + "_comment" : "Modules to be loaded. This will generate module load intel", + "module_list": [ + "intel" + ], + "batch_type": "slurm", + "_comment" : "The number of nodes. This will generate #SBATCH -N 1 in your script. ", + "number_node": 1, + "_comment" : "The number of CPUs. #SBATCH -n 4", + "cpu_per_node": 4, + "_comment" : "The number of GPU cards. #SBATCH --gres=gpu:1", + "gpu_per_node": 1, + "queue_name": "all", + "custom_flags": [ + "#SBATCH -t 23:0:0", + "#SBATCH --mem=16G", + "#SBATCH --exclude=gpu06,gpu07" + ], + "kwargs": {}, + "group_size": 1 + } }, - - "model_devi": - { - "machine": { - "machine_type": "slurm", - "hostname": "localhost", - "port": 22, - "username": "USERNAME", - "work_path": "PATH_TO_WORK", - "_comment": "that's all" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node": 4, - "partition": "all", - "mem_limit": 16, - "exclude_list": [ - - ], - "source_list": [ - "PATH/lmp_new.env" - ], - "module_list": [], - "time_limit": "23:0:0", - "_comment": "that's all" - }, - "command": "lmp_serial", - "_comment" : "DP-GEN will put 5 tasks together in one submitting script.", - "group_size": 5 - }, - "fp": - { - "machine": { - "machine_type": "slurm", - "hostname": "xxx.xxx.xxx.xxx", - "port": 22, - "username": "USERNAME", - "work_path": "PATH_TO_WORK" - }, - "resources": { - "task_per_node": 8, - "numb_gpu": 0, - "exclude_list": [], - "_comment" : "If you set with_mpi to true, the defaulted parallelling command of Slurm, srun, will be appended as prefix.", - "_comment" : "If you do not want this, you can set with_mpi to false, and specify parallelling command yourself. ", - "_comment" : "Notice that in json format, the upper/lower case is strict. 
You should write true instead of True and false instead of False", - "with_mpi": false, - "source_list": [ - ], - "module_list": [ - "mpich/3.2.1-intel-2017.1" - ], - "time_limit": "120:0:0", - "partition": "C032M0128G", - "_comment": "that's all" - }, - "command": "mpirun -n 8 /gpfs/share/home/1600017784/yuzhi/soft/QE-mpi/PW/src/pw.x < input", - "group_size": 1 - } -} + "model_devi": { + "machine": { + "batch_type": "slurm", + "context_type": "ssh", + "remote_profile": { + "hostname": "localhost", + "port": 22, + "username": "USERNAME" + }, + "remote_root": "PATH_TO_WORK", + "local_root": "./" + }, + "resources": { + "source_list": [ + "PATH/lmp_new.env" + ], + "module_list": [], + "batch_type": "slurm", + "_comment": "DP-GEN will put 5 tasks together in one submitting script.", + "group_size": 5, + "number_node": 1, + "cpu_per_node": 4, + "gpu_per_node": 1, + "queue_name": "all", + "custom_flags": [ + "#SBATCH -t 23:0:0", + "#SBATCH --mem=16G", + "#SBATCH --exclude=" + ], + "kwargs": {} + }, + "command": "lmp_serial" + }, + "fp": { + "machine": { + "batch_type": "slurm", + "context_type": "ssh", + "remote_profile": { + "hostname": "xxx.xxx.xxx.xxx", + "port": 22, + "username": "USERNAME" + }, + "remote_root": "PATH_TO_WORK", + "local_root": "./" + }, + "resources": { + "source_list": [], + "module_list": [ + "mpich/3.2.1-intel-2017.1" + ], + "batch_type": "slurm", + "group_size": 1, + "cpu_per_node": 8, + "gpu_per_node": 0, + "queue_name": "C032M0128G", + "custom_flags": [ + "#SBATCH -t 120:0:0" + ], + "kwargs": {}, + "number_node": 1 + }, + "command": "mpirun -n 8 /gpfs/share/home/1600017784/yuzhi/soft/QE-mpi/PW/src/pw.x < input" + }, + "api_version": "1.0" +} \ No newline at end of file From 2e82464b38759c65673e309430b7547d04c46fd8 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Fri, 1 Jul 2022 13:23:55 +0800 Subject: [PATCH 11/26] =?UTF-8?q?fix=20=E2=80=98post=5Ffp=5Fcp2k=E2=80=99,?= =?UTF-8?q?=20add=20param=20rfailed=20(#765)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix ‘post_fp_cp2k’, add param rfailed * Update run.py --- dpgen/generator/run.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py index 1bd196cc6..f716e2266 100644 --- a/dpgen/generator/run.py +++ b/dpgen/generator/run.py @@ -3342,7 +3342,10 @@ def post_fp_gaussian (iter_index, def post_fp_cp2k (iter_index, - jdata): + jdata, + rfailed=None): + + ratio_failed = rfailed if rfailed else jdata.get('ratio_failed',0.10) model_devi_jobs = jdata['model_devi_jobs'] assert (iter_index < len(model_devi_jobs)) @@ -3373,7 +3376,7 @@ def post_fp_cp2k (iter_index, all_sys = None for oo in sys_output : _sys = dpdata.LabeledSystem(oo, fmt = 'cp2k/output') - _sys.check_type_map(type_map = jdata['type_map']) + #_sys.check_type_map(type_map = jdata['type_map']) if all_sys is None: all_sys = _sys else: @@ -3385,8 +3388,12 @@ def post_fp_cp2k (iter_index, sys_data_path = os.path.join(work_path, 'data.%s'%ss) all_sys.to_deepmd_raw(sys_data_path) all_sys.to_deepmd_npy(sys_data_path, set_size = len(sys_output)) - dlog.info("failed frame number: %s "%(tcount-icount)) - dlog.info("total frame number: %s "%tcount) + + rfail=float(tcount - icount)/float(tcount) + dlog.info("failed frame: %6d in %6d %6.2f %% " % (tcount - icount, tcount, rfail * 100.)) + + if rfail>ratio_failed: + raise RuntimeError("find too many unsuccessfully terminated jobs. 
Too many FP tasks are not converged. Please check your files in directories \'iter.*.*/02.fp/task.*.*/.\'") def post_fp_pwmat (iter_index, From 4948c81e33a40f1b10017e2c6066b90ad4624055 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 9 Jun 2022 00:51:49 -0400 Subject: [PATCH 12/26] refactor simplify (#730) * refactor simplify 1. Used `dp model-devi` to calculate model deviation, instead of local calculation. Supported by deepmodeling/deepmd-kit#1618, released in v2.1.1. So the version earlier than 2.1.1 is not supported any more. 2. Assumed all systems are MultiSystems. 3. Removed energy model deviation support * expand path when getting multisystems * let `make_train` and `run_train` expand paths * load numpy array instead * use dpdata to get nframes * fix tests * update README --- README.md | 6 +- dpgen/generator/run.py | 70 +++---- dpgen/simplify/simplify.py | 381 +++++++++---------------------------- dpgen/util.py | 22 +++ 4 files changed, 140 insertions(+), 339 deletions(-) diff --git a/README.md b/README.md index b59725ec9..c833ed059 100644 --- a/README.md +++ b/README.md @@ -499,9 +499,8 @@ The bold notation of key (such aas **type_map**) means that it's a necessary key | **use_ele_temp** | int | 0 | Currently only support fp_style vasp. 0(default): no electron temperature. 1: eletron temperature as frame parameter. 2: electron temperature as atom parameter. | *#Data* | init_data_prefix | String | "/sharedext4/.../data/" | Prefix of initial data directories - | ***init_data_sys*** | List of string|["CH4.POSCAR.01x01x01/.../deepmd"] |Directories of initial data. You may use either absolute or relative path here. + | ***init_data_sys*** | List of string|["CH4.POSCAR.01x01x01/.../deepmd"] |Directories of initial data. You may use either absolute or relative path here. Systems will be detected recursively in the directories. | ***sys_format*** | String | "vasp/poscar" | Format of initial data. It will be `vasp/poscar` if not set. - | init_multi_systems | Boolean | false | If set to `true`, `init_data_sys` directories should contain sub-directories of various systems. DP-GEN will regard all of these sub-directories as inital data systems. | init_batch_size | String of integer | [8] | Each number is the batch_size of corresponding system for training in `init_data_sys`. One recommended rule for setting the `sys_batch_size` and `init_batch_size` is that `batch_size` mutiply number of atoms ot the stucture should be larger than 32. If set to `auto`, batch size will be 32 divided by number of atoms. | | sys_configs_prefix | String | "/sharedext4/.../data/" | Prefix of `sys_configs` | **sys_configs** | List of list of string | [
["/sharedext4/.../POSCAR"],
["....../POSCAR"]
] | Containing directories of structures to be explored in iterations.Wildcard characters are supported here. | @@ -1086,7 +1085,6 @@ Here is an example of `param.json` for QM7 dataset: }, "_comment": "that's all" }, - "use_clusters": true, "fp_style": "gaussian", "shuffle_poscar": false, "fp_task_max": 1000, @@ -1109,7 +1107,7 @@ Here is an example of `param.json` for QM7 dataset: } ``` -Here `pick_data` is the data to simplify and currently only supports `MultiSystems` containing `System` with `deepmd/npy` format, and `use_clusters` should always be `true`. `init_pick_number` and `iter_pick_number` are the numbers of picked frames. `e_trust_lo`, `e_trust_hi` mean the range of the deviation of the frame energy, and `f_trust_lo` and `f_trust_hi` mean the range of the max deviation of atomic forces in a frame. `fp_style` can only be `gaussian` currently. Other parameters are as the same as those of generator. +Here `pick_data` is the directory to data to simplify where the program recursively detects systems `System` with `deepmd/npy` format. `init_pick_number` and `iter_pick_number` are the numbers of picked frames. `e_trust_lo`, `e_trust_hi` mean the range of the deviation of the frame energy, and `f_trust_lo` and `f_trust_hi` mean the range of the max deviation of atomic forces in a frame. `fp_style` can only be `gaussian` currently. Other parameters are as the same as those of generator. ## Set up machine diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py index d06c137b3..1bd196cc6 100644 --- a/dpgen/generator/run.py +++ b/dpgen/generator/run.py @@ -61,7 +61,7 @@ from dpgen.generator.lib.ele_temp import NBandsEsti from dpgen.remote.decide_machine import convert_mdata from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, make_dispatcher, make_submission -from dpgen.util import sepline +from dpgen.util import sepline, expand_sys_str from dpgen import ROOT_PATH from pymatgen.io.vasp import Incar,Kpoints,Potcar from dpgen.auto_test.lib.vasp import make_kspacing_kpoints @@ -288,13 +288,10 @@ def make_train (iter_index, # make sure all init_data_sys has the batch size -- for the following `zip` assert (len(init_data_sys_) <= len(init_batch_size_)) for ii, ss in zip(init_data_sys_, init_batch_size_) : - if jdata.get('init_multi_systems', False): - for single_sys in os.listdir(os.path.join(work_path, 'data.init', ii)): - init_data_sys.append(os.path.join('..', 'data.init', ii, single_sys)) - init_batch_size.append(detect_batch_size(ss, os.path.join(work_path, 'data.init', ii, single_sys))) - else: - init_data_sys.append(os.path.join('..', 'data.init', ii)) - init_batch_size.append(detect_batch_size(ss, os.path.join(work_path, 'data.init', ii))) + sys_paths = expand_sys_str(os.path.join(init_data_prefix, ii)) + for single_sys in sys_paths: + init_data_sys.append(os.path.normpath(os.path.join('..', 'data.init', ii, os.path.relpath(single_sys, os.path.join(init_data_prefix, ii))))) + init_batch_size.append(detect_batch_size(ss, single_sys)) old_range = None if iter_index > 0 : for ii in range(iter_index) : @@ -308,25 +305,16 @@ def make_train (iter_index, sys_batch_size = ["auto" for aa in range(len(sys_list))] for jj in fp_data_sys : sys_idx = int(jj.split('.')[-1]) - if jdata.get('use_clusters', False): - nframes = 0 - for sys_single in os.listdir(jj): - tmp_box = np.loadtxt(os.path.join(jj, sys_single, 'box.raw')) - tmp_box = np.reshape(tmp_box, [-1,9]) - nframes += tmp_box.shape[0] - if nframes < fp_task_min : - log_task('nframes (%d) in data sys %s is too small, skip' % 
(nframes, jj)) - continue - for sys_single in os.listdir(jj): - init_data_sys.append(os.path.join('..', 'data.iters', jj, sys_single)) - init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], os.path.join(jj, sys_single))) - else: - nframes = dpdata.System(jj, 'deepmd/npy').get_nframes() - if nframes < fp_task_min : - log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj)) - continue - init_data_sys.append(os.path.join('..', 'data.iters', jj)) - init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], jj)) + sys_paths = expand_sys_str(jj) + nframes = 0 + for sys_single in sys_paths: + nframes += dpdata.LabeledSystem(sys_single, fmt="deepmd/npy").get_nframes() + if nframes < fp_task_min : + log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj)) + continue + for sys_single in sys_paths: + init_data_sys.append(os.path.normpath(os.path.join('..', 'data.iters', sys_single))) + init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], sys_single)) # establish tasks jinput = jdata['default_training_param'] try: @@ -568,25 +556,17 @@ def run_train (iter_index, os.chdir(work_path) fp_data = glob.glob(os.path.join('data.iters', 'iter.*', '02.fp', 'data.*')) for ii in init_data_sys : - if jdata.get('init_multi_systems', False): - for single_sys in os.listdir(os.path.join(ii)): - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'nopbc')) - else: - trans_comm_data += glob.glob(os.path.join(ii, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, 'nopbc')) + sys_paths = expand_sys_str(ii) + for single_sys in sys_paths: + trans_comm_data += glob.glob(os.path.join(single_sys, 'set.*')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'type*.raw')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'nopbc')) for ii in fp_data : - if jdata.get('use_clusters', False): - for single_sys in os.listdir(os.path.join(ii)): - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'nopbc')) - else: - trans_comm_data += glob.glob(os.path.join(ii, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, 'nopbc')) + sys_paths = expand_sys_str(ii) + for single_sys in sys_paths: + trans_comm_data += glob.glob(os.path.join(single_sys, 'set.*')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'type*.raw')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'nopbc')) os.chdir(cwd) try: diff --git a/dpgen/simplify/simplify.py b/dpgen/simplify/simplify.py index 982db3114..529401519 100644 --- a/dpgen/simplify/simplify.py +++ b/dpgen/simplify/simplify.py @@ -9,6 +9,7 @@ 02: fp (optional, if the original dataset do not have fp data, same as generator) """ import logging +import warnings import queue import os import json @@ -21,7 +22,7 @@ from dpgen import dlog from dpgen import SHORT_CMD -from dpgen.util import sepline +from dpgen.util import sepline, expand_sys_str from distutils.version import LooseVersion from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, make_dispatcher, make_submission from dpgen.generator.run import make_train, run_train, post_train, run_fp, post_fp, fp_name, model_devi_name, 
train_name, train_task_fmt, sys_link_fp_vasp_pp, make_fp_vasp_incar, make_fp_vasp_kp, make_fp_vasp_cp_cvasp, data_system_fmt, model_devi_task_fmt, fp_task_fmt @@ -38,17 +39,6 @@ sys_name_fmt = 'sys.' + data_system_fmt sys_name_pattern = 'sys.[0-9]*[0-9]' -def expand_sys_str(root_dir): - matches = [] - for root, dirnames, filenames in os.walk(root_dir, followlinks=True): - for filename in fnmatch.filter(filenames, 'type.raw'): - matches.append(root) - matches.sort() - dirnames = [os.path.basename(ii) for ii in matches] - if (len(list(set(dirnames))) != len(matches)) : - raise RuntimeError('duplicated system name: it is highly recommend to place all systems in the same level of directory and has different names') - return matches - def get_system_cls(jdata): if jdata.get("labeled", False): @@ -58,28 +48,12 @@ def get_system_cls(jdata): def get_multi_system(path, jdata): system = get_system_cls(jdata) + system_paths = expand_sys_str(path) systems = dpdata.MultiSystems( - *[system(os.path.join(path, s), fmt='deepmd/npy') for s in os.listdir(path)]) - return systems - - -def get_systems(path, jdata): - system_cls = get_system_cls(jdata) - system_paths = expand_sys_str(path) - systems = {} - for ii in system_paths: - systems[os.path.basename(ii)] = system_cls(ii, fmt='deepmd/npy') + *[system(s, fmt='deepmd/npy') for s in system_paths]) return systems -def get_system_idx(path): - system_paths = expand_sys_str(path) - sys_idx_map = {} - for idx,ii in enumerate(system_paths): - sys_idx_map[os.path.basename(ii)] = idx - return sys_idx_map - - def init_model(iter_index, jdata, mdata): training_init_model = jdata.get('training_init_model', False) if not training_init_model: @@ -111,20 +85,13 @@ def init_pick(iter_index, jdata, mdata): """pick up init data from dataset randomly""" pick_data = jdata['pick_data'] init_pick_number = jdata['init_pick_number'] - use_clusters = jdata.get('use_clusters', False) # use MultiSystems with System # TODO: support System and LabeledSystem # TODO: support other format - if use_clusters: - systems = get_multi_system(pick_data, jdata) - else: - systems = get_systems(pick_data, jdata) + systems = get_multi_system(pick_data, jdata) # label the system labels = [] - if use_clusters: - items = systems.systems.items() - else: - items = systems.items() + items = systems.systems.items() for key, system in items: labels.extend([(key, j) for j in range(len(system))]) @@ -146,48 +113,18 @@ def init_pick(iter_index, jdata, mdata): _init_dump_selected_frames(systems, labels, rest_idx, sys_data_path, jdata) -def _add_system(systems, key, system): - if key in systems.keys(): - systems[key].append(system) - else: - systems[key] = system - return systems - - def _init_dump_selected_frames(systems, labels, selc_idx, sys_data_path, jdata): - pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) - if use_clusters: - selc_systems = dpdata.MultiSystems() - for j in selc_idx: - sys_name, sys_id = labels[j] - selc_systems.append(systems[sys_name][sys_id]) - selc_systems.to_deepmd_raw(sys_data_path) - selc_systems.to_deepmd_npy(sys_data_path, set_size=selc_idx.size) - else: - selc_systems = {} - for j in selc_idx: - sys_name, sys_id = labels[j] - selc_systems = _add_system(selc_systems, sys_name, systems[sys_name][sys_id]) - sys_idx_map = get_system_idx(pick_data) - for kk in selc_systems.keys(): - sub_path = os.path.join(sys_data_path, sys_name_fmt % sys_idx_map[kk]) - selc_systems[kk].to_deepmd_raw(sub_path) - selc_systems[kk].to_deepmd_npy(sub_path, 
set_size=selc_idx.size) - with open(os.path.join(sys_data_path, 'sys_idx_map.json'), 'w') as fp: - json.dump(sys_idx_map, fp, indent=4) - -def _dump_system_dict(systems, path): - for kk in systems: - sub_path = os.path.join(path, sys_name_fmt % (int(kk))) - systems[kk].to_deepmd_raw(sub_path) - systems[kk].to_deepmd_npy(sub_path, set_size=systems[kk].get_nframes()) + selc_systems = dpdata.MultiSystems() + for j in selc_idx: + sys_name, sys_id = labels[j] + selc_systems.append(systems[sys_name][sys_id]) + selc_systems.to_deepmd_raw(sys_data_path) + selc_systems.to_deepmd_npy(sys_data_path, set_size=selc_idx.size) def make_model_devi(iter_index, jdata, mdata): """calculate the model deviation of the rest idx""" pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, model_devi_name) create_path(work_path) @@ -203,25 +140,7 @@ def make_model_devi(iter_index, jdata, mdata): rest_data_path = os.path.join(last_iter_name, model_devi_name, rest_data_name) if not os.path.exists(rest_data_path): return False - if use_clusters: - for jj, subsystem in enumerate(os.listdir(rest_data_path)): - task_name = "task." + model_devi_task_fmt % (0, jj) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - os.symlink(os.path.abspath(os.path.join(rest_data_path, subsystem)), - os.path.abspath(os.path.join(task_path, rest_data_name))) - else: - rest_data_path = os.path.abspath(rest_data_path) - sys_path = glob.glob(os.path.join(rest_data_path, sys_name_pattern)) - cwd = os.getcwd() - for ii in sys_path: - task_name = "task." + model_devi_task_fmt % (int(os.path.basename(ii).split('.')[1]), 0) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - os.chdir(task_path) - os.symlink(os.path.relpath(ii), rest_data_name) - os.chdir(cwd) - os.chdir(cwd) + os.symlink(os.path.abspath(rest_data_path), os.path.join(work_path, rest_data_name + ".old")) return True @@ -231,43 +150,28 @@ def run_model_devi(iter_index, jdata, mdata): work_path = os.path.join(iter_name, model_devi_name) # generate command commands = [] - tasks = glob.glob(os.path.join(work_path, "task.*")) - run_tasks = [os.path.basename(ii) for ii in tasks] + run_tasks = ["."] # get models models = glob.glob(os.path.join(work_path, "graph*pb")) model_names = [os.path.basename(ii) for ii in models] task_model_list = [] for ii in model_names: - task_model_list.append(os.path.join('..', ii)) - # get max data size - data_size = max([len(dpdata.System(os.path.join( - task, rest_data_name), fmt="deepmd/npy")) for task in tasks]) + task_model_list.append(os.path.join('.', ii)) # models commands = [] - detail_file_names = [] - for ii, mm in enumerate(task_model_list): - detail_file_name = "{prefix}-{ii}".format( - prefix=detail_file_name_prefix, - ii=ii, - ) - # TODO: support 0.x? 
- command = "{python} -m deepmd test -m {model} -s {system} -n {numb_test} -d {detail_file}".format( - python=mdata['python_test_path'], - model=mm, - system=rest_data_name, - numb_test=data_size, - detail_file=detail_file_name, - ) - commands.append(command) - detail_file_names.append(detail_file_name) + detail_file_name = detail_file_name_prefix + command = "{dp} model-devi -m {model} -s {system} -o {detail_file}".format( + dp=mdata.get('model_devi_command', 'dp'), + model=" ".join(task_model_list), + system=rest_data_name + ".old", + detail_file=detail_file_name, + ) + commands = [command] # submit - try: - model_devi_group_size = mdata['model_devi_group_size'] - except Exception: - model_devi_group_size = 1 + model_devi_group_size = mdata.get('model_devi_group_size', 1) - forward_files = [rest_data_name] - backward_files = sum([[pf+".e.out", pf+".f.out", pf+".v.out"] for pf in detail_file_names], []) + forward_files = [rest_data_name + ".old"] + backward_files = [detail_file_name] api_version = mdata.get('api_version', '0.9') if LooseVersion(api_version) < LooseVersion('1.0'): @@ -303,102 +207,50 @@ def run_model_devi(iter_index, jdata, mdata): def post_model_devi(iter_index, jdata, mdata): """calculate the model deviation""" - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, model_devi_name) - tasks = glob.glob(os.path.join(work_path, "task.*")) - tasks.sort() - - e_trust_lo = jdata['e_trust_lo'] - e_trust_hi = jdata['e_trust_hi'] - f_trust_lo = jdata['f_trust_lo'] - f_trust_hi = jdata['f_trust_hi'] - - if use_clusters: - sys_accurate = dpdata.MultiSystems() - sys_candinate = dpdata.MultiSystems() - sys_failed = dpdata.MultiSystems() - else: - sys_accurate = {} - sys_candinate = {} - sys_failed = {} - all_names = set() - - for task in tasks: - if not use_clusters: - sys_name = os.path.basename(task).split('.')[1] - all_names.add(sys_name) - # e.out - details_e = glob.glob(os.path.join(task, "{}-*.e.out".format(detail_file_name_prefix))) - e_all = np.array([np.loadtxt(detail_e, ndmin=2)[:, 1] for detail_e in details_e]) - e_std = np.std(e_all, axis=0) - n_frame = e_std.size - - # f.out - details_f = glob.glob(os.path.join(task, "{}-*.f.out".format(detail_file_name_prefix))) - f_all = np.array([np.loadtxt(detail_f, ndmin=2)[:, 3:6].reshape((n_frame, -1, 3)) for detail_f in details_f]) - # (n_model, n_frame, n_atom, 3) - f_std = np.std(f_all, axis=0) - # (n_frame, n_atom, 3) - f_std = np.linalg.norm(f_std, axis=2) - # (n_frame, n_atom) - f_std = np.max(f_std, axis=1) - # (n_frame,) - - system_cls = get_system_cls(jdata) - for subsys, e_devi, f_devi in zip(system_cls(os.path.join(task, rest_data_name), fmt='deepmd/npy'), e_std, f_std): - if (e_devi < e_trust_hi and e_devi >= e_trust_lo) or (f_devi < f_trust_hi and f_devi >= f_trust_lo) : - if use_clusters: + + f_trust_lo = jdata['model_devi_f_trust_lo'] + f_trust_hi = jdata['model_devi_f_trust_hi'] + + sys_accurate = dpdata.MultiSystems() + sys_candinate = dpdata.MultiSystems() + sys_failed = dpdata.MultiSystems() + + sys_entire = dpdata.MultiSystems().from_deepmd_npy(os.path.join(work_path, rest_data_name + ".old")) + + detail_file_name = detail_file_name_prefix + with open(os.path.join(work_path, detail_file_name)) as f: + for line in f: + if line.startswith("# data.rest.old"): + name = (line.split()[1]).split("/")[-1] + elif line.startswith("#"): + pass + else: + idx = int(line.split()[0]) + f_devi = float(line.split()[4]) + subsys = sys_entire[name][idx] + if 
f_trust_lo <= f_devi < f_trust_hi: sys_candinate.append(subsys) - else: - sys_candinate = _add_system(sys_candinate, sys_name, subsys) - elif (e_devi >= e_trust_hi ) or (f_devi >= f_trust_hi ): - if use_clusters: + elif f_devi >= f_trust_hi: sys_failed.append(subsys) - else: - sys_failed = _add_system(sys_failed, sys_name, subsys) - elif (e_devi < e_trust_lo and f_devi < f_trust_lo ): - if use_clusters: + elif f_devi < f_trust_lo: sys_accurate.append(subsys) else: - sys_accurate = _add_system(sys_accurate, sys_name, subsys) - else: - raise RuntimeError('reach a place that should NOT be reached...') - if use_clusters: - counter = {"candidate": sys_candinate.get_nframes(), "accurate": sys_accurate.get_nframes(), "failed": sys_failed.get_nframes()} - fp_sum = sum(counter.values()) - for cc_key, cc_value in counter.items(): - dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(cc_key, cc_value, fp_sum, cc_value/fp_sum*100)) - else: - all_names = list(all_names) - all_names.sort() - counter = {"candidate": 0, "accurate": 0, "failed": 0} - for kk in all_names: - sys_counter = {"candidate": 0, "accurate": 0, "failed": 0} - if kk in sys_candinate.keys(): - sys_counter['candidate'] += sys_candinate[kk].get_nframes() - if kk in sys_accurate.keys(): - sys_counter['accurate'] += sys_accurate[kk].get_nframes() - if kk in sys_failed.keys(): - sys_counter['failed'] += sys_failed[kk].get_nframes() - fp_sum = sum(sys_counter.values()) - for cc_key, cc_value in sys_counter.items(): - if fp_sum != 0: - dlog.info("sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(kk, cc_key, cc_value, fp_sum, cc_value/fp_sum*100)) - else: - dlog.info("sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(kk, cc_key, cc_value, fp_sum, 0*100)) - for ii in ['candidate', 'accurate', 'failed']: - counter[ii] += sys_counter[ii] + raise RuntimeError('reach a place that should NOT be reached...') + + counter = {"candidate": sys_candinate.get_nframes(), "accurate": sys_accurate.get_nframes(), "failed": sys_failed.get_nframes()} + fp_sum = sum(counter.values()) + for cc_key, cc_value in counter.items(): + dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(cc_key, cc_value, fp_sum, cc_value/fp_sum*100)) if counter['candidate'] == 0 and counter['failed'] > 0: raise RuntimeError('no candidate but still have failed cases, stop. 
You may want to refine the training or to increase the trust level hi') # label the candidate system labels = [] - if use_clusters: - items = sys_candinate.systems.items() - else: - items = sys_candinate.items() + items = sys_candinate.systems.items() + for key, system in items: labels.extend([(key, j) for j in range(len(system))]) # candinate: pick up randomly @@ -412,112 +264,61 @@ def post_model_devi(iter_index, jdata, mdata): (counter['candidate'], len(pick_idx), float(len(pick_idx))/counter['candidate']*100., len(rest_idx), float(len(rest_idx))/counter['candidate']*100.)) # dump the picked candinate data - if use_clusters: - picked_systems = dpdata.MultiSystems() - for j in pick_idx: - sys_name, sys_id = labels[j] - picked_systems.append(sys_candinate[sys_name][sys_id]) - sys_data_path = os.path.join(work_path, picked_data_name) - picked_systems.to_deepmd_raw(sys_data_path) - picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number) - else: - selc_systems = {} - for j in pick_idx: - sys_name, sys_id = labels[j] - selc_systems = _add_system(selc_systems, sys_name, sys_candinate[sys_name][sys_id]) - sys_data_path = os.path.join(work_path, picked_data_name) - _dump_system_dict(selc_systems, sys_data_path) + picked_systems = dpdata.MultiSystems() + for j in pick_idx: + sys_name, sys_id = labels[j] + picked_systems.append(sys_candinate[sys_name][sys_id]) + sys_data_path = os.path.join(work_path, picked_data_name) + picked_systems.to_deepmd_raw(sys_data_path) + picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number) + # dump the rest data (not picked candinate data and failed data) - if use_clusters: - rest_systems = dpdata.MultiSystems() - for j in rest_idx: - sys_name, sys_id = labels[j] - rest_systems.append(sys_candinate[sys_name][sys_id]) - rest_systems += sys_failed - sys_data_path = os.path.join(work_path, rest_data_name) - rest_systems.to_deepmd_raw(sys_data_path) + rest_systems = dpdata.MultiSystems() + for j in rest_idx: + sys_name, sys_id = labels[j] + rest_systems.append(sys_candinate[sys_name][sys_id]) + rest_systems += sys_failed + sys_data_path = os.path.join(work_path, rest_data_name) + rest_systems.to_deepmd_raw(sys_data_path) + if rest_idx.size: rest_systems.to_deepmd_npy(sys_data_path, set_size=rest_idx.size) - else: - selc_systems = {} - for j in rest_idx: - sys_name, sys_id = labels[j] - selc_systems = _add_system(selc_systems, sys_name, sys_candinate[sys_name][sys_id]) - for kk in sys_failed.keys(): - selc_systems = _add_system(selc_systems, kk, sys_failed[kk]) - sys_data_path = os.path.join(work_path, rest_data_name) - _dump_system_dict(selc_systems, sys_data_path) + # dump the accurate data -- to another directory - if use_clusters: - sys_data_path = os.path.join(work_path, accurate_data_name) - sys_accurate.to_deepmd_raw(sys_data_path) - sys_accurate.to_deepmd_npy(sys_data_path, set_size=sys_accurate.get_nframes()) - else: - sys_data_path = os.path.join(work_path, accurate_data_name) - _dump_system_dict(sys_accurate, sys_data_path) + sys_data_path = os.path.join(work_path, accurate_data_name) + sys_accurate.to_deepmd_raw(sys_data_path) + sys_accurate.to_deepmd_npy(sys_data_path, set_size=sys_accurate.get_nframes()) def make_fp_labeled(iter_index, jdata): dlog.info("already labeled, skip make_fp and link data directly") pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, fp_name) create_path(work_path) picked_data_path = 
os.path.join(iter_name, model_devi_name, picked_data_name) - if use_clusters: - os.symlink(os.path.abspath(picked_data_path), os.path.abspath( - os.path.join(work_path, "task." + data_system_fmt % 0))) - os.symlink(os.path.abspath(picked_data_path), os.path.abspath( - os.path.join(work_path, "data." + data_system_fmt % 0))) - else: - picked_data_path = os.path.abspath(picked_data_path) - sys_path = glob.glob(os.path.join(picked_data_path, sys_name_pattern)) - cwd = os.getcwd() - os.chdir(work_path) - for ii in sys_path: - sys_idx = os.path.basename(ii).split('.')[1] - data_dir = 'data.' + data_system_fmt % int(sys_idx) - task_dir = 'task.' + data_system_fmt % int(sys_idx) - os.symlink(os.path.relpath(ii), data_dir) - os.symlink(os.path.relpath(ii), task_dir) - os.chdir(cwd) + os.symlink(os.path.abspath(picked_data_path), os.path.abspath( + os.path.join(work_path, "task." + data_system_fmt % 0))) + os.symlink(os.path.abspath(picked_data_path), os.path.abspath( + os.path.join(work_path, "data." + data_system_fmt % 0))) def make_fp_configs(iter_index, jdata): pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, fp_name) create_path(work_path) picked_data_path = os.path.join(iter_name, model_devi_name, picked_data_name) - if use_clusters: - systems = get_multi_system(picked_data_path, jdata) - jj = 0 - for system in systems: - for subsys in system: - task_name = "task." + fp_task_fmt % (0, jj) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - subsys.to('vasp/poscar', os.path.join(task_path, 'POSCAR')) - jj += 1 - else: - picked_data_path = os.path.abspath(picked_data_path) - sys_path = glob.glob(os.path.join(picked_data_path, sys_name_pattern)) - for ii in sys_path: - tmp_sys = dpdata.System(ii, fmt = 'deepmd/npy') - sys_idx = os.path.basename(ii).split('.')[1] - jj = 0 - for ss in tmp_sys: - task_name = "task." + fp_task_fmt % (int(sys_idx), jj) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - ss.to('vasp/poscar', os.path.join(task_path, 'POSCAR')) - job = {} - with open(os.path.join(task_path, 'job.json'), 'w') as fp: - json.dump(job, fp, indent=4) - jj += 1 + systems = get_multi_system(picked_data_path, jdata) + jj = 0 + for system in systems: + for subsys in system: + task_name = "task." + fp_task_fmt % (0, jj) + task_path = os.path.join(work_path, task_name) + create_path(task_path) + subsys.to('vasp/poscar', os.path.join(task_path, 'POSCAR')) + jj += 1 def make_fp_gaussian(iter_index, jdata): diff --git a/dpgen/util.py b/dpgen/util.py index aa805e7e5..9491cdc30 100644 --- a/dpgen/util.py +++ b/dpgen/util.py @@ -1,5 +1,7 @@ #!/usr/bin/env python # coding: utf-8 +from typing import Union, List +from pathlib import Path from dpgen import dlog @@ -25,3 +27,23 @@ def box_center(ch='',fill=' ',sp="|"): ''' strs=ch.center(Len,fill) dlog.info(sp+strs[1:len(strs)-1:]+sp) + + +def expand_sys_str(root_dir: Union[str, Path]) -> List[str]: + """Recursively iterate over directories taking those that contain `type.raw` file. 
+ + Parameters + ---------- + root_dir : Union[str, Path] + starting directory + + Returns + ------- + List[str] + list of string pointing to system directories + """ + root_dir = Path(root_dir) + matches = [str(d) for d in root_dir.rglob("*") if (d / "type.raw").is_file()] + if (root_dir / "type.raw").is_file(): + matches.append(str(root_dir)) + return matches From 8a27df371211264bcc302ae05b88fd9058c4f1b8 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 9 Jun 2022 00:53:16 -0400 Subject: [PATCH 13/26] supports non-list mdata (#748) * supports non-list mdata The mdata of a task is a list of a single dict. This "list" looks useless and not clear enough. So this commit supports using the dict without a list. Note that old parameters are still supported, so no breaking changes are made. The "list" is just removed from all examples. Both list and non-list parameters are in the unittest. * fix typo --- README.md | 26 +++++++------------ doc/run/example-of-machine.md | 11 +++----- dpgen/remote/decide_machine.py | 13 +++++++--- .../machine/DeePMD-kit-1.x/machine-ali.json | 13 ++++------ .../machine-lsf-slurm-cp2k.json | 13 ++++------ .../DeePMD-kit-1.x/machine-slurm-qe.json | 12 +++------ .../DeePMD-kit-2.x/lebesgue_v2_machine.json | 11 ++++---- tests/tools/machine_fp_single2.json | 14 ++++++++++ tests/tools/test_convert_mdata.py | 10 +++++-- 9 files changed, 65 insertions(+), 58 deletions(-) create mode 100644 tests/tools/machine_fp_single2.json diff --git a/README.md b/README.md index c833ed059..fb280e226 100644 --- a/README.md +++ b/README.md @@ -1137,7 +1137,7 @@ an example of new dpgen's machine.json ```json { "api_version": "1.0", - "train": [ + "train": { "command": "dp", "machine": { @@ -1161,9 +1161,8 @@ an example of new dpgen's machine.json "para_deg": 3, "source_list": ["/home/user1234/deepmd.1.2.4.env"] } - } - ], - "model_devi":[ + }, + "model_devi": { "command": "lmp", "machine":{ @@ -1184,9 +1183,8 @@ an example of new dpgen's machine.json "group_size": 5, "source_list": ["/home/user1234/deepmd.1.2.4.env"] } - } - ], - "fp":[ + }, + "fp": { "command": "vasp_std", "machine":{ @@ -1208,7 +1206,6 @@ an example of new dpgen's machine.json "source_list": ["~/vasp.env"] } } - ] } ``` note1: the key "local_root" in dpgen's machine.json is always `./` @@ -1220,7 +1217,7 @@ When switching into a new machine, you may modifying the `MACHINE`, according to An example for `MACHINE` is: ```json { - "train": [ + "train": { "machine": { "batch": "slurm", @@ -1243,9 +1240,8 @@ An example for `MACHINE` is: "qos": "data" }, "command": "USERPATH/dp" - } - ], - "model_devi": [ + }, + "model_devi": { "machine": { "batch": "slurm", @@ -1269,9 +1265,8 @@ An example for `MACHINE` is: }, "command": "lmp_serial", "group_size": 1 - } - ], - "fp": [ + }, + "fp": { "machine": { "batch": "slurm", @@ -1298,7 +1293,6 @@ An example for `MACHINE` is: "command": "vasp_gpu", "group_size": 1 } - ] } ``` Following table illustrates which key is needed for three types of machine: `train`,`model_devi` and `fp`. Each of them is a list of dicts. Each dict can be considered as an independent environmnet for calculation. diff --git a/doc/run/example-of-machine.md b/doc/run/example-of-machine.md index 569f85026..247c50e4f 100644 --- a/doc/run/example-of-machine.md +++ b/doc/run/example-of-machine.md @@ -20,7 +20,7 @@ In this section, we will show you how to perform train task at a local workstati In this example, we perform the `train` task on a local workstation. 
```json -"train": [ +"train": { "command": "dp", "machine": { @@ -36,8 +36,7 @@ In this example, we perform the `train` task on a local workstation. "group_size": 1, "source_list": ["/home/user1234/deepmd.env"] } - } - ], + }, ``` The "command" for the train task in the DeePMD-kit is "dp". @@ -51,7 +50,7 @@ In the resources parameter, "number_node", "cpu_per_node", and "gpu_per_node" sp In this example, we perform the model_devi task at a local Slurm workstation. ```json -"model_devi": [ +"model_devi": { "command": "lmp", "machine": { @@ -70,7 +69,6 @@ In this example, we perform the model_devi task at a local Slurm workstation. "source_list": ["/home/user1234/lammps.env"] } } -], ``` The "command" for the model_devi task in the LAMMPS is "lmp". @@ -84,7 +82,7 @@ In the resources parameter, we specify the name of the queue to which the task i In this example, we perform the fp task at a remote PBS cluster that can be accessed via SSH. ```json -"fp": [ +"fp": { "command": "mpirun -n 32 vasp_std", "machine": { @@ -106,7 +104,6 @@ In this example, we perform the fp task at a remote PBS cluster that can be acce "source_list": ["/home/user1234/vasp.env"] } } -], ``` VASP code is used for fp task and mpi is used for parallel computing, so "mpirun -n 32" is added to specify the number of parallel threads. diff --git a/dpgen/remote/decide_machine.py b/dpgen/remote/decide_machine.py index 31691f322..c551be44b 100644 --- a/dpgen/remote/decide_machine.py +++ b/dpgen/remote/decide_machine.py @@ -36,11 +36,18 @@ def convert_mdata(mdata, task_types=["train", "model_devi", "fp"]): ''' for task_type in task_types: if task_type in mdata: - for key, item in mdata[task_type][0].items(): + if isinstance(mdata[task_type], dict): + task_data = mdata[task_type] + elif isinstance(mdata[task_type], (list, tuple)): + task_data = mdata[task_type][0] + else: + raise TypeError("mdata/%s should be dict or list!" 
% task_type) + for key, item in task_data.items(): if "comments" not in key: mdata[task_type + "_" + key] = item - group_size = mdata[task_type][0]["resources"].get("group_size", 1) - if group_size == 1: group_size = mdata[task_type][0].get("group_size", 1) + group_size = task_data["resources"].get("group_size", 1) + if group_size == 1: + group_size = task_data.get("group_size", 1) mdata[task_type + "_" + "group_size"] = group_size return mdata diff --git a/examples/machine/DeePMD-kit-1.x/machine-ali.json b/examples/machine/DeePMD-kit-1.x/machine-ali.json index a2a338af4..e78fc9dd4 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-ali.json +++ b/examples/machine/DeePMD-kit-1.x/machine-ali.json @@ -1,5 +1,5 @@ { - "train": [ + "train": { "machine": { "batch": "shell", @@ -34,10 +34,9 @@ }, "command": "/root/deepmd-kit/bin/dp", "group_size": 2 - } - ], + }, - "model_devi": [ + "model_devi": { "machine": { "batch": "shell", @@ -71,10 +70,9 @@ }, "command": "/root/deepmd-kit/bin/lmp", "group_size": 2 - } - ], + }, - "fp": [ + "fp": { "machine": { "batch": "shell", @@ -108,7 +106,6 @@ "command": "mpirun -n 16 /root/deepmd-pkg/vasp.5.4.4/bin/vasp_std", "group_size": 1 } - ] } diff --git a/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json b/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json index b56d022ec..4fb5845ee 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json +++ b/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json @@ -1,5 +1,5 @@ { - "train": [ + "train": { "machine": { "machine_type": "slurm", @@ -25,9 +25,8 @@ "submit_wait_time": 60 }, "python_path": "/share/apps/deepmd/compress/bin/python3.8" - } - ], - "model_devi": [ + }, + "model_devi": { "machine": { "machine_type": "slurm", @@ -54,9 +53,8 @@ }, "command": "lmp_mpi", "group_size": 5 - } - ], - "fp": [ + }, + "fp": { "machine": { "machine_type": "lsf", @@ -87,5 +85,4 @@ "command": "cp2k.popt -i input.inp", "group_size": 50 } - ] } diff --git a/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json b/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json index 22a3fdbbd..5f15303d6 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json +++ b/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json @@ -1,7 +1,7 @@ { "_comment" : "This is an example of DP-GEN on Slurm", "_comment" : "Last updated on 2021.4.30 for DP-GEN 0.9.2 by Yuzhi Zhang", - "train" :[ + "train" : { "_comment" : "Specify the installed path of DeePMD-kit", "_comment" : "The version of DeePMD-kit should be 1.*", @@ -49,10 +49,9 @@ "time_limit": "23:0:0", "_comment": "that's all" } - } - ], + }, - "model_devi": [ + "model_devi": { "machine": { "machine_type": "slurm", @@ -81,10 +80,8 @@ "command": "lmp_serial", "_comment" : "DP-GEN will put 5 tasks together in one submitting script.", "group_size": 5 - } - ], + }, "fp": - [ { "machine": { "machine_type": "slurm", @@ -113,5 +110,4 @@ "command": "mpirun -n 8 /gpfs/share/home/1600017784/yuzhi/soft/QE-mpi/PW/src/pw.x < input", "group_size": 1 } - ] } diff --git a/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json b/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json index 6b9ead467..0ecba4fa6 100644 --- a/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json +++ b/examples/machine/DeePMD-kit-2.x/lebesgue_v2_machine.json @@ -1,7 +1,7 @@ { "api_version": "1.0", "deepmd_version": "2.0.1", - "train" :[ + "train" : { "command": "dp", "machine": { @@ -34,9 +34,9 @@ "queue_name": "GPU", "group_size": 1 } - }], + }, "model_devi": - [{ + { "command": "lmp -i 
input.lammps -v restart 0", "machine": { "batch_type": "DpCloudServer", @@ -68,9 +68,9 @@ "queue_name": "GPU", "group_size": 5 } - }], + }, "fp": - [{ + { "command": "mpirun -n 16 vasp_std", "machine": { "batch_type": "DpCloudServer", @@ -104,5 +104,4 @@ "source_list": ["/opt/intel/oneapi/setvars.sh"] } } - ] } diff --git a/tests/tools/machine_fp_single2.json b/tests/tools/machine_fp_single2.json new file mode 100644 index 000000000..8c2212927 --- /dev/null +++ b/tests/tools/machine_fp_single2.json @@ -0,0 +1,14 @@ +{ + "fp": + { + "command": "vasp_std", + "machine":{ + "batch_type": "PBS" + }, + "resources": { + "group_size" : 8 + }, + "_comments" : "In user_forward_files, define input files to be uploaded.", + "user_forward_files" : ["vdw_kernel.bindat"] + } +} \ No newline at end of file diff --git a/tests/tools/test_convert_mdata.py b/tests/tools/test_convert_mdata.py index 5458b0faa..5dc1b944e 100644 --- a/tests/tools/test_convert_mdata.py +++ b/tests/tools/test_convert_mdata.py @@ -6,12 +6,18 @@ __package__ = 'tools' from dpgen.remote.decide_machine import convert_mdata from .context import setUpModule -machine_file = 'machine_fp_single.json' + class TestConvertMdata(unittest.TestCase): + machine_file = 'machine_fp_single.json' + def test_convert_mdata (self): - mdata = json.load(open(machine_file)) + mdata = json.load(open(self.machine_file)) mdata = convert_mdata(mdata, ["fp"]) self.assertEqual(mdata["fp_command"], "vasp_std") self.assertEqual(mdata["fp_group_size"], 8) self.assertEqual(mdata["fp_machine"]["batch_type"], "PBS") self.assertEqual(mdata["fp_user_forward_files"], ["vdw_kernel.bindat"]) + + +class TestConvertMdata2(TestConvertMdata): + machine_file = 'machine_fp_single2.json' From a0c7333d76142c51310c68ec52988f9f28cc5bab Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 9 Jun 2022 00:53:45 -0400 Subject: [PATCH 14/26] upgrade all tasks to dpdispatcher (#749) * upgrade all tasks to dpdispatcher This commit upgrades init_reaction and init_surf to use dpdispatcher * fix method args * fix typo * change the variable name from `work_dir` to `work_path` --- dpgen/data/reaction.py | 38 ++++++++-------- dpgen/data/surf.py | 11 ++--- dpgen/dispatcher/Dispatcher.py | 79 ++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 22 deletions(-) diff --git a/dpgen/data/reaction.py b/dpgen/data/reaction.py index b9574d525..0abfeb965 100644 --- a/dpgen/data/reaction.py +++ b/dpgen/data/reaction.py @@ -7,7 +7,7 @@ output: data """ -import argparse +import warnings import glob import json import os @@ -15,7 +15,8 @@ import dpdata from dpgen import dlog -from dpgen.dispatcher.Dispatcher import make_dispatcher +from dpgen.dispatcher.Dispatcher import make_submission_compat +from dpgen.remote.decide_machine import convert_mdata from dpgen.generator.run import create_path, make_fp_task_name from dpgen.util import sepline @@ -73,14 +74,15 @@ def make_lmp(jdata): return lmp_string -def run_reaxff(jdata, mdata, dispatcher, log_file="reaxff_log"): +def run_reaxff(jdata, mdata, log_file="reaxff_log"): work_path = reaxff_path reaxff_command = "{} -in {}".format(mdata["reaxff_command"], lmp_path) run_tasks = glob.glob(os.path.join(work_path, 'task.*')) run_tasks.sort() run_tasks = [os.path.basename(ii) for ii in run_tasks] - dispatcher.run_jobs(mdata['reaxff_resources'], + make_submission_compat(mdata['reaxff_machine'], + mdata['reaxff_resources'], [reaxff_command], work_path, run_tasks, @@ -89,7 +91,8 @@ def run_reaxff(jdata, mdata, dispatcher, log_file="reaxff_log"): [ff_path, 
data_init_path, control_path, lmp_path], [trj_path], outlog=log_file, - errlog=log_file) + errlog=log_file, + api_version=mdata.get("api_version", "0.9")) def link_trj(jdata): @@ -102,7 +105,7 @@ def link_trj(jdata): os.path.join(task_path, trj_path))) -def run_build_dataset(jdata, mdata, dispatcher, log_file="build_log"): +def run_build_dataset(jdata, mdata, log_file="build_log"): work_path = build_path build_command = "{cmd} -n {dataset_name} -a {type_map} -d {lammpstrj} -c {cutoff} -s {dataset_size} -k \"{qmkeywords}\" --nprocjob {nprocjob} --nproc {nproc}".format( cmd=mdata["build_command"], @@ -119,7 +122,8 @@ def run_build_dataset(jdata, mdata, dispatcher, log_file="build_log"): run_tasks.sort() run_tasks = [os.path.basename(ii) for ii in run_tasks] - dispatcher.run_jobs(mdata['build_resources'], + make_submission_compat(mdata['build_machine'], + mdata['build_resources'], [build_command], work_path, run_tasks, @@ -128,7 +132,8 @@ def run_build_dataset(jdata, mdata, dispatcher, log_file="build_log"): [trj_path], [f"dataset_{dataset_name}_gjf"], outlog=log_file, - errlog=log_file) + errlog=log_file, + api_version=mdata.get("api_version", "0.9")) def link_fp_input(): @@ -146,7 +151,6 @@ def link_fp_input(): def run_fp(jdata, mdata, - dispatcher, log_file="output", forward_common_files=[]): fp_command = mdata['fp_command'] @@ -162,7 +166,8 @@ def run_fp(jdata, run_tasks = [os.path.basename(ii) for ii in fp_run_tasks] - dispatcher.run_jobs(mdata['fp_resources'], + make_submission_compat(mdata['fp_machine'], + mdata['fp_resources'], [fp_command], work_path, run_tasks, @@ -171,7 +176,8 @@ def run_fp(jdata, ["input"], [log_file], outlog=log_file, - errlog=log_file) + errlog=log_file, + api_version=mdata.get("api_version", "0.9")) def convert_data(jdata): @@ -198,6 +204,7 @@ def gen_init_reaction(args): with open(args.MACHINE, "r") as fp: mdata = json.load(fp) + mdata = convert_mdata(mdata, ["reaxff", "build", "fp"]) record = "record.reaction" iter_rec = -1 numb_task = 7 @@ -213,18 +220,15 @@ def gen_init_reaction(args): elif ii == 0: link_reaxff(jdata) elif ii == 1: - dispatcher = make_dispatcher(mdata["reaxff_machine"]) - run_reaxff(jdata, mdata, dispatcher) + run_reaxff(jdata, mdata) elif ii == 2: link_trj(jdata) elif ii == 3: - dispatcher = make_dispatcher(mdata["build_machine"]) - run_build_dataset(jdata, mdata, dispatcher) + run_build_dataset(jdata, mdata) elif ii == 4: link_fp_input() elif ii == 5: - dispatcher = make_dispatcher(mdata["fp_machine"]) - run_fp(jdata, mdata, dispatcher) + run_fp(jdata, mdata) elif ii == 6: convert_data(jdata) with open(record, "a") as frec: diff --git a/dpgen/data/surf.py b/dpgen/data/surf.py index bc31b6705..543f02bc8 100644 --- a/dpgen/data/surf.py +++ b/dpgen/data/surf.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -import time +import warnings import os,json,shutil,re,glob,argparse import numpy as np import subprocess as sp @@ -12,7 +12,7 @@ from dpgen import dlog from dpgen import ROOT_PATH from dpgen.remote.decide_machine import convert_mdata -from dpgen.dispatcher.Dispatcher import Dispatcher, make_dispatcher +from dpgen.dispatcher.Dispatcher import make_submission_compat #-----PMG--------- from pymatgen.io.vasp import Poscar from pymatgen.core import Structure, Element @@ -565,15 +565,16 @@ def run_vasp_relax(jdata, mdata): run_tasks = [ii.replace(work_dir+"/", "") for ii in relax_run_tasks] #dlog.info(run_tasks) - dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size) - 
dispatcher.run_jobs(fp_resources,
+    make_submission_compat(mdata['fp_machine'],
+                           fp_resources,
                            [fp_command],
                            work_dir,
                            run_tasks,
                            fp_group_size,
                            forward_common_files,
                            forward_files,
-                           backward_files)
+                           backward_files,
+                           api_version=mdata.get("api_version", "0.9"))
 
 def gen_init_surf(args):
     try:
diff --git a/dpgen/dispatcher/Dispatcher.py b/dpgen/dispatcher/Dispatcher.py
index 29bea5669..abbe493b8 100644
--- a/dpgen/dispatcher/Dispatcher.py
+++ b/dpgen/dispatcher/Dispatcher.py
@@ -1,5 +1,6 @@
 from distutils.version import LooseVersion
 import os,sys,time,random,json,glob
+import warnings
 from typing import List
 from dpdispatcher import Task, Submission, Resources, Machine
 from dpgen.dispatcher.LocalContext import LocalSession
@@ -406,3 +407,81 @@ def mdata_arginfo() -> List[Argument]:
     return [
         command_arginfo, machine_arginfo, resources_arginfo,
     ]
+
+
+def make_submission_compat(
+    machine: dict,
+    resources: dict,
+    commands: List[str],
+    work_path: str,
+    run_tasks: List[str],
+    group_size: int,
+    forward_common_files: List[str],
+    forward_files: List[str],
+    backward_files: List[str],
+    outlog: str="log",
+    errlog: str="err",
+    api_version: str="0.9",
+    ) -> None:
+    """Make submission with compatibility of both dispatcher API v0 and v1.
+
+    If `api_version` is less than 1.0, use `make_dispatcher`. If
+    `api_version` is 1.0 or above, use `make_submission`.
+
+    Parameters
+    ----------
+    machine : dict
+        machine dict
+    resources : dict
+        resource dict
+    commands : list[str]
+        list of commands
+    work_path : str
+        working directory
+    run_tasks : list[str]
+        list of paths to running tasks
+    group_size : int
+        group size
+    forward_common_files : list[str]
+        forwarded common files shared for all tasks
+    forward_files : list[str]
+        forwarded files for each task
+    backward_files : list[str]
+        files to be downloaded back for each task
+    outlog : str, default="log"
+        path to log from stdout
+    errlog : str, default="err"
+        path to log from stderr
+    api_version : str, default="0.9"
+        API version. 1.0 is recommended
+    """
+    if LooseVersion(api_version) < LooseVersion('1.0'):
+        warnings.warn("the dpdispatcher will be updated to a new version "
+            "and the interface may be changed. Please check the documents for more details")
+        dispatcher = make_dispatcher(machine, resources, work_path, run_tasks, group_size)
+        dispatcher.run_jobs(resources,
+                            commands,
+                            work_path,
+                            run_tasks,
+                            group_size,
+                            forward_common_files,
+                            forward_files,
+                            backward_files,
+                            outlog=outlog,
+                            errlog=errlog)
+
+    elif LooseVersion(api_version) >= LooseVersion('1.0'):
+        submission = make_submission(
+            machine,
+            resources,
+            commands=commands,
+            work_path=work_path,
+            run_tasks=run_tasks,
+            group_size=group_size,
+            forward_common_files=forward_common_files,
+            forward_files=forward_files,
+            backward_files=backward_files,
+            outlog=outlog,
+            errlog=errlog)
+        submission.run_submission()
+
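
For reference, a minimal sketch of how the `make_submission_compat` wrapper added above is meant to be driven (mirroring the updated call sites; `mdata` is assumed to be the dict loaded from machine.json, and all paths and file names below are illustrative placeholders, not part of the patch):

```python
# a sketch, not part of the patch: submit one group of fp tasks through the
# compatibility wrapper, which picks the old or new dpdispatcher API based
# on mdata["api_version"]
import json

from dpgen.dispatcher.Dispatcher import make_submission_compat
from dpgen.remote.decide_machine import convert_mdata

# load machine settings and flatten them into fp_machine/fp_resources keys
with open("machine.json") as f:          # path is illustrative
    mdata = convert_mdata(json.load(f), ["fp"])

make_submission_compat(
    mdata["fp_machine"],                 # machine dict
    mdata["fp_resources"],               # resources dict
    ["mpirun -n 8 vasp_std"],            # commands (illustrative)
    "iter.000000/02.fp",                 # work_path (illustrative)
    ["task.000.000000"],                 # run_tasks, relative to work_path
    1,                                   # group_size
    [],                                  # forward_common_files
    ["POSCAR", "INCAR", "POTCAR"],       # forward_files (illustrative)
    ["OUTCAR"],                          # backward_files (illustrative)
    outlog="fp.log",
    errlog="fp.log",
    api_version=mdata.get("api_version", "0.9"),
)
```
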
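
As a usage sketch of the new option (only `vol_abs` itself comes from this patch; every other value below is illustrative), the "eos" item of property.json would look like:

```json
{
    "type": "eos",
    "vol_start": 10,
    "vol_end": 30,
    "vol_step": 0.5,
    "vol_abs": true
}
```

With `"vol_abs": true`, `vol_start` and `vol_end` are read as absolute volumes; without it, they are ratios of the equilibrium volume.
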
From 7d986d9b2c5fae9ae49c8a741d87f502bf3daa11 Mon Sep 17 00:00:00 2001
From: Liu Renxi <75369672+Liu-RX@users.noreply.github.com>
Date: Tue, 14 Jun 2022 14:05:50 +0800
Subject: [PATCH 16/26] fix a bug in make_abacus_scf_input (#754)

Co-authored-by: LiuRenxi
---
 README.md                         | 2 +-
 dpgen/generator/lib/abacus_scf.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index fb280e226..f26fb51df 100644
--- a/README.md
+++ b/README.md
@@ -570,7 +570,7 @@ The bold notation of key (such aas **type_map**) means that it's a necessary key
 | **user_fp_params** | Dict | |Parameters for cp2k calculation. find detail in manual.cp2k.org. only the kind section must be set before use. we assume that you have basic knowledge for cp2k input.
 | **external_input_path** | String | | Conflict with key:user_fp_params, use the template input provided by user, some rules should be followed, read the following text in detail.
 | *fp_style == ABACUS*
-| **user_fp_params** | Dict | |Parameters for ABACUS INPUT. find detail [Here](https://github.com/deepmodeling/abacus-develop/blob/develop/docs/input-main.md#out-descriptor). If `deepks_model` is set, the model file should be in the pseudopotential directory.
+| **user_fp_params** | Dict | |Parameters for ABACUS INPUT. find detail [Here](https://github.com/deepmodeling/abacus-develop/blob/develop/docs/input-main.md#out-descriptor). If `deepks_model` is set, the model file should be in the pseudopotential directory. You can also set the `KPT` file by adding a `k_points` key, corresponding to a list of six integers, to this dictionary.
 | **fp_orb_files** | List | |List of atomic orbital files. The files should be in pseudopotential directory.
 | **fp_dpks_descriptor** | String | |DeePKS descriptor file name. The file should be in pseudopotential directory.
 
diff --git a/dpgen/generator/lib/abacus_scf.py b/dpgen/generator/lib/abacus_scf.py
index 256eb1d9d..1a9882979 100644
--- a/dpgen/generator/lib/abacus_scf.py
+++ b/dpgen/generator/lib/abacus_scf.py
@@ -83,7 +83,10 @@ def make_abacus_scf_input(fp_params):
             ret += "deepks_scf %d\n" % fp_params["deepks_scf"]
         elif key == "deepks_model":
             ret += "deepks_model %s\n" % fp_params["deepks_model"]
-        else:
+        elif key != "k_points":  # the "k_points" key is used to generate the KPT file
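+            # illustrative: with user_fp_params = {"ecutwfc": 80, "k_points": [2, 2, 2, 0, 0, 0]},
+            # "ecutwfc" is written into INPUT below, while "k_points" is consumed
+            # separately to generate the KPT file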
ret += "%s %s\n" % (key, str(fp_params[key])) return ret From ec1600592dfe9c8883dfe998818bf26e1930e9de Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 14 Jun 2022 02:17:46 -0400 Subject: [PATCH 17/26] init_reaction: fix compatibility with new dpdispatcher (#755) fix compatibility as the key was changed in the dpdispatcher --- dpgen/data/reaction.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dpgen/data/reaction.py b/dpgen/data/reaction.py index 0abfeb965..5e900f9de 100644 --- a/dpgen/data/reaction.py +++ b/dpgen/data/reaction.py @@ -107,6 +107,9 @@ def link_trj(jdata): def run_build_dataset(jdata, mdata, log_file="build_log"): work_path = build_path + # compatible with new dpdispatcher and old dpgen.dispatcher + build_ntasks = mdata["build_resources"].get("cpu_per_node", mdata["build_resources"]["task_per_node"]) + fp_ntasks = mdata["fp_resources"].get("cpu_per_node", mdata["fp_resources"]["task_per_node"]) build_command = "{cmd} -n {dataset_name} -a {type_map} -d {lammpstrj} -c {cutoff} -s {dataset_size} -k \"{qmkeywords}\" --nprocjob {nprocjob} --nproc {nproc}".format( cmd=mdata["build_command"], type_map=" ".join(jdata["type_map"]), @@ -114,8 +117,8 @@ def run_build_dataset(jdata, mdata, log_file="build_log"): cutoff=jdata["cutoff"], dataset_size=jdata["dataset_size"], qmkeywords=jdata["qmkeywords"], - nprocjob=mdata["fp_resources"]["task_per_node"], - nproc=mdata["build_resources"]["task_per_node"], + nprocjob=fp_ntasks, + nproc=build_ntasks, dataset_name=dataset_name ) run_tasks = glob.glob(os.path.join(work_path, 'task.*')) From d43fb5266b636cea521d00208cdba912dc517de2 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 19 Jun 2022 12:09:51 -0400 Subject: [PATCH 18/26] generate machine parameter docs for simplify and init (#751) * generate machine parameter page for simplify and init * switching to new dargs directive --- doc/conf.py | 1 + doc/index.rst | 5 +++++ doc/init/init-bulk-mdata.rst | 6 ++++++ doc/init/init-reaction-mdata.rst | 6 ++++++ doc/init/init-surf-mdata.rst | 6 ++++++ doc/simplify/simplify-mdata.rst | 6 ++++++ dpgen/arginfo.py | 35 +++++++++++++++++++++++++++++++ dpgen/data/arginfo.py | 36 ++++++++++++++++++++++++++++++++ dpgen/generator/arginfo.py | 16 ++------------ dpgen/simplify/arginfo.py | 13 ++++++++++++ 10 files changed, 116 insertions(+), 14 deletions(-) create mode 100644 doc/init/init-bulk-mdata.rst create mode 100644 doc/init/init-reaction-mdata.rst create mode 100644 doc/init/init-surf-mdata.rst create mode 100644 doc/simplify/simplify-mdata.rst create mode 100644 dpgen/arginfo.py create mode 100644 dpgen/data/arginfo.py create mode 100644 dpgen/simplify/arginfo.py diff --git a/doc/conf.py b/doc/conf.py index 9d5ecc006..99dce21b5 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -40,6 +40,7 @@ extensions = [ 'deepmodeling_sphinx', + 'dargs.sphinx', "sphinx_rtd_theme", 'myst_parser', 'sphinx.ext.autosummary', diff --git a/doc/index.rst b/doc/index.rst index 341ce5d79..6eea4d95b 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -32,6 +32,9 @@ DPGEN's documentation :maxdepth: 2 :caption: Init + init/init-bulk-mdata + init/init-surf-mdata + init/init-reaction-mdata .. _autotest:: @@ -46,6 +49,8 @@ DPGEN's documentation :maxdepth: 2 :caption: Simplify + simplify/simplify-mdata + .. 
_tutorial: diff --git a/doc/init/init-bulk-mdata.rst b/doc/init/init-bulk-mdata.rst new file mode 100644 index 000000000..b3098e906 --- /dev/null +++ b/doc/init/init-bulk-mdata.rst @@ -0,0 +1,6 @@ +dpgen init_bulk machine parameters +================================== + +.. dargs:: + :module: dpgen.data.arginfo + :func: init_bulk_mdata_arginfo diff --git a/doc/init/init-reaction-mdata.rst b/doc/init/init-reaction-mdata.rst new file mode 100644 index 000000000..2fe35a0d8 --- /dev/null +++ b/doc/init/init-reaction-mdata.rst @@ -0,0 +1,6 @@ +dpgen init_reaction machine parameters +====================================== + +.. dargs:: + :module: dpgen.data.arginfo + :func: init_reaction_mdata_arginfo diff --git a/doc/init/init-surf-mdata.rst b/doc/init/init-surf-mdata.rst new file mode 100644 index 000000000..35e8e322f --- /dev/null +++ b/doc/init/init-surf-mdata.rst @@ -0,0 +1,6 @@ +dpgen init_surf machine parameters +================================== + +.. dargs:: + :module: dpgen.data.arginfo + :func: init_surf_mdata_arginfo diff --git a/doc/simplify/simplify-mdata.rst b/doc/simplify/simplify-mdata.rst new file mode 100644 index 000000000..995fc90f8 --- /dev/null +++ b/doc/simplify/simplify-mdata.rst @@ -0,0 +1,6 @@ +dpgen simplify machine parameters +================================= + +.. dargs:: + :module: dpgen.simplify.arginfo + :func: simplify_mdata_arginfo diff --git a/dpgen/arginfo.py b/dpgen/arginfo.py new file mode 100644 index 000000000..3f657942a --- /dev/null +++ b/dpgen/arginfo.py @@ -0,0 +1,35 @@ +from typing import Tuple + +from dargs import Argument + +from dpgen.dispatcher.Dispatcher import mdata_arginfo + + +def general_mdata_arginfo(name: str, tasks: Tuple[str]) -> Argument: + """Generate arginfo for general mdata. + + Parameters + ---------- + name : str + mdata name + tasks : tuple[str] + tuple of task keys, e.g. ("train", "model_devi", "fp") + + Returns + ------- + Argument + arginfo + """ + + doc_api_version = "Please set to 1.0" + doc_run_mdata = "machine.json file" + arg_api_version = Argument("api_version", str, optional=False, doc=doc_api_version) + + sub_fields = [arg_api_version] + doc_mdata = "Parameters of command, machine, and resources for %s" + for task in tasks: + sub_fields.append(Argument( + task, dict, optional=False, sub_fields=mdata_arginfo(), + doc=doc_mdata % task, + )) + return Argument(name, dict, sub_fields=sub_fields, doc=doc_run_mdata) diff --git a/dpgen/data/arginfo.py b/dpgen/data/arginfo.py new file mode 100644 index 000000000..d5814c036 --- /dev/null +++ b/dpgen/data/arginfo.py @@ -0,0 +1,36 @@ +from dargs import Argument + +from dpgen.arginfo import general_mdata_arginfo + + +def init_bulk_mdata_arginfo() -> Argument: + """Generate arginfo for dpgen init_bulk mdata. + + Returns + ------- + Argument + arginfo + """ + return general_mdata_arginfo("init_bulk_mdata", ("fp",)) + + +def init_surf_mdata_arginfo() -> Argument: + """Generate arginfo for dpgen init_surf mdata. + + Returns + ------- + Argument + arginfo + """ + return general_mdata_arginfo("init_surf_mdata", ("fp",)) + + +def init_reaction_mdata_arginfo() -> Argument: + """Generate arginfo for dpgen init_reaction mdata. 
+ + Returns + ------- + Argument + arginfo + """ + return general_mdata_arginfo("init_reaction_mdata", ("reaxff", "build", "fp")) diff --git a/dpgen/generator/arginfo.py b/dpgen/generator/arginfo.py index f8815862d..cb2fb887b 100644 --- a/dpgen/generator/arginfo.py +++ b/dpgen/generator/arginfo.py @@ -1,6 +1,6 @@ from dargs import Argument -from dpgen.dispatcher.Dispatcher import mdata_arginfo +from dpgen.arginfo import general_mdata_arginfo def run_mdata_arginfo() -> Argument: """Generate arginfo for dpgen run mdata. @@ -10,16 +10,4 @@ def run_mdata_arginfo() -> Argument: Argument arginfo """ - - doc_api_version = "Please set to 1.0" - doc_run_mdata = "machine.json file" - arg_api_version = Argument("api_version", str, optional=False, doc=doc_api_version) - - sub_fields = [arg_api_version] - doc_mdata = "Parameters of command, machine, and resources for %s" - for task in ("train", "model_devi", "fp"): - sub_fields.append(Argument( - task, dict, optional=False, sub_fields=mdata_arginfo(), - doc=doc_mdata % task, - )) - return Argument("run_mdata", dict, sub_fields=sub_fields, doc=doc_run_mdata) + return general_mdata_arginfo("run_mdata", ("train", "model_devi", "fp")) diff --git a/dpgen/simplify/arginfo.py b/dpgen/simplify/arginfo.py new file mode 100644 index 000000000..0fbfe606e --- /dev/null +++ b/dpgen/simplify/arginfo.py @@ -0,0 +1,13 @@ +from dargs import Argument + +from dpgen.arginfo import general_mdata_arginfo + +def simplify_mdata_arginfo() -> Argument: + """Generate arginfo for dpgen simplify mdata. + + Returns + ------- + Argument + arginfo + """ + return general_mdata_arginfo("simplify_mdata", ("train", "model_devi", "fp")) From 4a5557e60dff84b6ff2919dd6df32286e9077474 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 20 Jun 2022 23:17:34 -0400 Subject: [PATCH 19/26] add auto cli docs (#759) * add auto cli docs * fix typo * fix package name... * forgot to return parser * add the blank line --- doc/conf.py | 1 + doc/index.rst | 2 ++ doc/overview/cli.rst | 7 +++++++ doc/requirements.txt | 1 + dpgen/main.py | 16 +++++++++++++--- 5 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 doc/overview/cli.rst diff --git a/doc/conf.py b/doc/conf.py index 99dce21b5..97b4b206b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -45,6 +45,7 @@ 'myst_parser', 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', + 'sphinxarg.ext', ] diff --git a/doc/index.rst b/doc/index.rst index 6eea4d95b..eaa229813 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -7,6 +7,8 @@ DPGEN's documentation .. toctree:: :maxdepth: 2 :caption: Overview + + overview/cli .. _installation:: diff --git a/doc/overview/cli.rst b/doc/overview/cli.rst new file mode 100644 index 000000000..e57f1b064 --- /dev/null +++ b/doc/overview/cli.rst @@ -0,0 +1,7 @@ +Command line interface +====================== + +.. argparse:: + :module: dpgen.main + :func: main_parser + :prog: dpgen diff --git a/doc/requirements.txt b/doc/requirements.txt index 33ad28e39..0ae5c76f1 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -2,6 +2,7 @@ sphinx>=4.0.2 recommonmark sphinx_rtd_theme sphinx_markdown_tables +sphinx-argparse myst-parser deepmodeling_sphinx . diff --git a/dpgen/main.py b/dpgen/main.py index 6dcdc4ccd..c93c41ef4 100644 --- a/dpgen/main.py +++ b/dpgen/main.py @@ -30,9 +30,14 @@ __email__ = "" -def main(): - info() - print("Description\n------------") +def main_parser() -> argparse.ArgumentParser: + """Returns parser for `dpgen` command. 
+ + Returns + ------- + argparse.ArgumentParser + parser for `dpgen` command + """ parser = argparse.ArgumentParser(description=""" dpgen is a convenient script that uses DeepGenerator to prepare initial data, drive DeepMDkit and analyze results. This script works based on @@ -156,8 +161,13 @@ def main(): help="parameter file, json format") parser_db.set_defaults(func=db_run) + return parser +def main(): + info() + print("Description\n------------") + parser = main_parser() try: import argcomplete argcomplete.autocomplete(parser) From 5ed5fa1529bc87f412a141349d9df4a95ff037a8 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Wed, 29 Jun 2022 10:01:41 +0800 Subject: [PATCH 20/26] correct the wrong spelling of 'failure' (#764) --- dpgen/dispatcher/DispatcherList.py | 6 +++--- examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dpgen/dispatcher/DispatcherList.py b/dpgen/dispatcher/DispatcherList.py index 085ae5d9a..22b77fd50 100644 --- a/dpgen/dispatcher/DispatcherList.py +++ b/dpgen/dispatcher/DispatcherList.py @@ -45,7 +45,7 @@ def run_jobs(self, mark_failure = False, outlog = 'log', errlog = 'err'): - ratio_failure = self.mdata_resources.get("ratio_failue", 0) + ratio_failure = self.mdata_resources.get("ratio_failure", 0) while True: if self.check_all_dispatchers_finished(ratio_failure): self.clean() @@ -188,7 +188,7 @@ def make_dispatcher(self, ii): # Base - def check_dispatcher_status(self, ii, allow_failue=False): + def check_dispatcher_status(self, ii, allow_failure=False): '''catch running dispatcher exception if no exception occured, check finished''' if self.dispatcher_list[ii]["dispatcher_status"] == "running": @@ -198,7 +198,7 @@ def check_dispatcher_status(self, ii, allow_failue=False): clean = self.mdata_resources.get("clean", False) try: # avoid raising ssh exception in download proceess - finished = self.dispatcher_list[ii]["dispatcher"].all_finished(self.dispatcher_list[ii]["entity"].job_handler, allow_failue, clean) + finished = self.dispatcher_list[ii]["dispatcher"].all_finished(self.dispatcher_list[ii]["entity"].job_handler, allow_failure, clean) if finished: self.dispatcher_list[ii]["dispatcher_status"] = "finished" except Exception: diff --git a/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json b/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json index 3de59661f..e2db8d254 100644 --- a/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json +++ b/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json @@ -64,7 +64,7 @@ }, "resources": { "allow_failure": true, - "ratio_failue": 0.05, + "ratio_failure": 0.05, "task_per_node": 16, "with_mpi": true, "_comment" : "Load the intel compiler.", From 3dec4b87b14233b39d5ea6503c286177c1ae8599 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 1 Jul 2022 01:22:16 -0400 Subject: [PATCH 21/26] upgrade machine examples to new dpdispatcher (#762) --- .../DeePMD-kit-1.0/machine-local-4GPU.json | 165 +++++++------ .../machine/DeePMD-kit-1.x/machine-local.json | 103 ++++---- .../machine-lsf-slurm-cp2k.json | 178 +++++++------- .../DeePMD-kit-1.x/machine-pbs-gaussian.json | 163 +++++++------ .../DeePMD-kit-1.x/machine-slurm-qe.json | 223 +++++++++--------- 5 files changed, 442 insertions(+), 390 deletions(-) diff --git a/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json b/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json index e2db8d254..e0e6bfca0 100644 --- 
a/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json +++ b/examples/machine/DeePMD-kit-1.0/machine-local-4GPU.json @@ -1,79 +1,90 @@ { - "_comment" : "This is an example of DP-GEN on Local device running with 4 GPUs", - "_comment" : "Last updated on 2021.5.9 for DP-GEN 0.9.2 by Ke XU", - "train": [ - { - "_comment" : "Specify the installed path of DeePMD-kit", - "_comment" : "The version of DeePMD-kit should be 1.*", - "command": "/home/user/anaconda3/bin/dp", - "_comment" : "Specify machine settings", - "machine": { - "_comment" : "Supported batches include slurm, pbs, shell, lsf.", - "batch": "shell", - "work_path": "/tmp/dpwork", - "_comment": "that's all" - }, - "resources":{ - "_comment" : "The number of nodes.", - "numb_node": 1, - "_comment" : "If you choose to run with multiple GPUs simultaneously, just ignore numb_gpu.", - "numb_gpu": 0, - "_comment" : "The number of CPUs.", - "task_per_node": 4, - "_comment" : "The number of GPUs that can be used for each task.", - "manual_cuda_devices": 4, - "_comment" : "The number of tasks that can be run in each GPU.", - "manual_cuda_multiplicity":1, - "_comment" : "Allow the multi-GPU task running.", - "cuda_multi_task": true, - "module_list": [], - "_comment" : "Environment to be activated. This will generate source xxx/psxevars.sh in scripts. ", - "source_list": ["/opt/intel/parallel_studio_xe_2020/psxevars.sh"] - }, - "_comment" : "DP-GEN will put 4 tasks together in one submitting script.", - "group_size": 4 - } - ], - - "model_devi": [ - { - "machine": { - "batch": "shell", - "work_path": "/tmp/dpwork" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 0, - "task_per_node": 4, - "manual_cuda_devices": 4, - "manual_cuda_multiplicity":1, - "cuda_multi_task": true, - "source_list": [], - "module_list": [] - }, - "command": "/home/user/Soft/Deepmd/lammps-stable_29Oct2020/src/lmp_mpi", - "group_size": 4 - } - ], - - "fp": [ - { - "machine": { - "batch": "shell", - "work_path": "/tmp/dpwork" - }, - "resources": { - "allow_failure": true, - "ratio_failure": 0.05, - "task_per_node": 16, - "with_mpi": true, - "_comment" : "Load the intel compiler.", - "source_list": ["/opt/intel/parallel_studio_xe_2020/psxevars.sh"], - "envs": {"PATH" : "/home/user/Soft/VASP/vasp.5.4.4-allbak/bin:$PATH"}, - "_comment" : "This will generate export PATH=/home/user/Soft/VASP/vasp.5.4.4-allbak/bin:$PATH in scripts;" - }, - "command": "vasp_std", - "group_size": 1 - } - ] + "_comment" : "This is an example of DP-GEN on Local device running with 4 GPUs", + "_comment": "Last updated on 2021.5.9 for DP-GEN 0.9.2 by Ke XU", + "train": { + "_comment" : "Specify the installed path of DeePMD-kit", + "command": "/home/user/anaconda3/bin/dp", + "_comment" : "Specify machine settings", + "machine": { + "_comment": "Supported batches include slurm, pbs, shell, lsf.", + "batch_type": "shell", + "context_type": "local", + "remote_root": "/tmp/dpwork", + "local_root": "./" + }, + "resources": { + "module_list": [], + "_comment": "Environment to be activated. This will generate source xxx/psxevars.sh in scripts. 
", + "source_list": [ + "/opt/intel/parallel_studio_xe_2020/psxevars.sh" + ], + "batch_type": "shell", + "_comment": "DP-GEN will put 4 tasks together in one submitting script.", + "group_size": 4, + "_comment" : "The number of nodes.", + "number_node": 1, + "_comment" : "The number of CPUs.", + "cpu_per_node": 4, + "_comment" : "If you choose to run with multiple GPUs simultaneously, just ignore numb_gpu.", + "gpu_per_node": 0, + "kwargs": {}, + "strategy": { + "_comment" : "Allow the multi-GPU task running.", + "if_cuda_multi_devices": true + }, + "para_deg": 4, + "queue_name": "" + } + }, + "model_devi": { + "machine": { + "batch_type": "shell", + "context_type": "local", + "remote_root": "/tmp/dpwork", + "local_root": "./" + }, + "resources": { + "source_list": [], + "module_list": [], + "batch_type": "shell", + "group_size": 4, + "number_node": 1, + "cpu_per_node": 4, + "gpu_per_node": 0, + "kwargs": {}, + "strategy": { + "if_cuda_multi_devices": true + }, + "para_deg": 4, + "queue_name": "" + }, + "command": "/home/user/Soft/Deepmd/lammps-stable_29Oct2020/src/lmp_mpi" + }, + "fp": { + "machine": { + "batch_type": "shell", + "context_type": "local", + "remote_root": "/tmp/dpwork", + "local_root": "./" + }, + "resources": { + "_comment" : "Load the intel compiler.", + "source_list": [ + "/opt/intel/parallel_studio_xe_2020/psxevars.sh" + ], + "_comment": "This will generate export PATH=/home/user/Soft/VASP/vasp.5.4.4-allbak/bin:$PATH in scripts;", + "envs": { + "PATH": "/home/user/Soft/VASP/vasp.5.4.4-allbak/bin:$PATH" + }, + "batch_type": "shell", + "group_size": 1, + "cpu_per_node": 16, + "kwargs": {}, + "queue_name": "", + "number_node": 1, + "gpu_per_node": 1 + }, + "command": "mpirun -n 16 vasp_std || :" + }, + "api_version": "1.0" } diff --git a/examples/machine/DeePMD-kit-1.x/machine-local.json b/examples/machine/DeePMD-kit-1.x/machine-local.json index a266f712b..c8134d750 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-local.json +++ b/examples/machine/DeePMD-kit-1.x/machine-local.json @@ -1,42 +1,65 @@ { - "_comment": "training on localhost ", - "_comment" : "This is for DeePMD-kit 1.*", - "train_command" : "/home/wanghan/local/deepmd/1.*/dp", - "train_machine": { - "batch": "shell", - "work_path" : "/home/wanghan/tmp/subs/" - }, - "train_resources": { - "envs": { - } - }, - - - "_comment": "model_devi on localhost ", - "model_devi_command": "/home/wanghan/local/bin/lmp_mpi_010", - "model_devi_group_size": 5, - "model_devi_machine": { - "batch": "shell", - "_comment" : "If lazy_local is true, calculations are done directly in current folders.", - "lazy_local" : true - }, - "model_devi_resources": { - }, - - "_comment": "fp on localhost ", - "fp_command": "/home/wanghan/local/bin/vasp_std", - "fp_group_size": 2, - "fp_machine": { - "batch": "shell", - "work_path" : "/home/wanghan/tmp/subs/", - "_comment" : "that's all" - }, - "fp_resources": { - "module_list": ["mpi"], - "task_per_node":4, - "with_mpi": true, - "_comment": "that's all" + "api_version": "1.0", + "train": { + "_comment": "training on localhost", + "machine": { + "batch_type": "shell", + "context_type": "local", + "remote_root": "/home/wanghan/tmp/subs/", + "local_root": "./" + }, + "resources": { + "envs": {}, + "batch_type": "shell", + "group_size": 1, + "kwargs": {}, + "queue_name": "", + "number_node": 1, + "cpu_per_node": 1, + "gpu_per_node": 1 + }, + "command": "/home/wanghan/local/deepmd/1.*/dp" }, - - "_comment": " that's all " -} + "model_devi": { + "_comment": "model devi on localhost", + "machine": 
{ + "_comment": "If lazy_local, calculations are done directly in current folders.", + "batch_type": "shell", + "context_type": "lazylocal", + "local_root": "./" + }, + "resources": { + "batch_type": "shell", + "group_size": 5, + "kwargs": {}, + "queue_name": "", + "number_node": 1, + "cpu_per_node": 1, + "gpu_per_node": 1 + }, + "command": "/home/wanghan/local/bin/lmp_mpi_010" + }, + "fp": { + "_comment": "fp on localhost", + "machine": { + "batch_type": "shell", + "context_type": "local", + "remote_root": "/home/wanghan/tmp/subs/", + "local_root": "./" + }, + "resources": { + "module_list": [ + "mpi" + ], + "_comment": "that's all", + "batch_type": "shell", + "group_size": 2, + "cpu_per_node": 4, + "kwargs": {}, + "queue_name": "", + "number_node": 1, + "gpu_per_node": 1 + }, + "command": "mpirun -n 4 /home/wanghan/local/bin/vasp_std" + } +} \ No newline at end of file diff --git a/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json b/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json index 4fb5845ee..348609c1e 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json +++ b/examples/machine/DeePMD-kit-1.x/machine-lsf-slurm-cp2k.json @@ -1,88 +1,98 @@ { - "train": - { - "machine": { - "machine_type": "slurm", - "hostname": "210.34.15.205", - "port": 22, - "username": "ybzhuang", - "work_path": "/home/ybzhuang/workdir" - }, - "resources": { - "numb_gpu": 1, - "numb_node": 1, - "task_per_node": 1, - "partition": "gpu", - "job_name": "train", - "qos":"emergency", - "exclude_list": [], - "source_list": [ - ], - "module_list": [ - "deepmd/1.2" - ], - "time_limit": "96:0:0", - "submit_wait_time": 60 - }, - "python_path": "/share/apps/deepmd/compress/bin/python3.8" + "train": { + "machine": { + "batch_type": "slurm", + "context_type": "ssh", + "remote_profile": { + "hostname": "210.34.15.205", + "port": 22, + "username": "ybzhuang" + }, + "remote_root": "/home/ybzhuang/workdir", + "local_root": "./" + }, + "resources": { + "source_list": [], + "module_list": [ + "deepmd/1.2" + ], + "batch_type": "slurm", + "number_node": 1, + "cpu_per_node": 1, + "gpu_per_node": 1, + "queue_name": "emergency", + "custom_flags": [ + "#SBATCH -t 96:0:0" + ], + "kwargs": {}, + "wait_time": 60, + "group_size": 1 + }, + "command": "/share/apps/deepmd/compress/bin/python3.8-m deepmd" }, - "model_devi": - { - "machine": { - "machine_type": "slurm", - "hostname": "210.34.15.205", - "port": 22, - "username": "ybzhuang", - "work_path": "/home/ybzhuang/workdir" - }, - "resources": { - "numb_gpu": 1, - "numb_node": 1, - "task_per_node": 1, - "partition": "gpu", - "job_name": "md", - "qos":"emergency", - "exclude_list": [], - "source_list": [ - ], - "module_list": [ - "deepmd/1.2" - ], - "time_limit": "96:0:0", - "submit_wait_time": 60 - }, - "command": "lmp_mpi", - "group_size": 5 + "model_devi": { + "machine": { + "batch_type": "slurm", + "context_type": "ssh", + "remote_profile": { + "hostname": "210.34.15.205", + "port": 22, + "username": "ybzhuang" + }, + "remote_root": "/home/ybzhuang/workdir", + "local_root": "./" + }, + "resources": { + "source_list": [], + "module_list": [ + "deepmd/1.2" + ], + "batch_type": "slurm", + "group_size": 5, + "number_node": 1, + "cpu_per_node": 1, + "gpu_per_node": 1, + "queue_name": "emergency", + "custom_flags": [ + "#SBATCH -t 96:0:0" + ], + "kwargs": {}, + "wait_time": 60 + }, + "command": "lmp_mpi" }, - "fp": - { - "machine": { - "machine_type": "lsf", - "hostname": "localhost", - "port": 6666, - "username": "ybzhuang", - "work_path": 
"/data/ybzhuang/methane-dpgen/dpgen-tutorial-2020-08-23/dpgen-tutorial-mathane/workpath" - }, - "resources": { - "cvasp": false, - "task_per_node": 32, - "numb_node": 1, - "node_cpu": 32, - "exclude_list": [], - "with_mpi": true, - "source_list": [ - ], - "module_list": [ - "intel/17.5.239", - "mpi/intel/2017.5.239", - "gcc/5.5.0", - "cp2k/7.1" + "fp": { + "machine": { + "batch_type": "lsf", + "context_type": "ssh", + "remote_profile": { + "hostname": "localhost", + "port": 6666, + "username": "ybzhuang" + }, + "remote_root": "/data/ybzhuang/methane-dpgen/dpgen-tutorial-2020-08-23/dpgen-tutorial-mathane/workpath", + "local_root": "./" + }, + "resources": { + "source_list": [], + "module_list": [ + "intel/17.5.239", + "mpi/intel/2017.5.239", + "gcc/5.5.0", + "cp2k/7.1" + ], + "batch_type": "lsf", + "group_size": 50, + "number_node": 1, + "cpu_per_node": 32, + "queue_name": "53-medium", + "custom_flags": [ + "#BSUB -W 12:00:00" ], - "time_limit": "12:00:00", - "partition": "53-medium", - "_comment": "that's Bel" - }, - "command": "cp2k.popt -i input.inp", - "group_size": 50 - } -} + "kwargs": {}, + "gpu_per_node": 1 + }, + "command": "mpirun -n 32 cp2k.popt -i input.inp" + }, + "api_version": "1.0" +} \ No newline at end of file diff --git a/examples/machine/DeePMD-kit-1.x/machine-pbs-gaussian.json b/examples/machine/DeePMD-kit-1.x/machine-pbs-gaussian.json index 6893471c5..daa743dcc 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-pbs-gaussian.json +++ b/examples/machine/DeePMD-kit-1.x/machine-pbs-gaussian.json @@ -1,79 +1,88 @@ { - "_comment": "training on localhost ", - "train_command": "/gpfs/home/tzhu/anaconda3/envs/python3.6/bin/dp", - "train_machine": { - "machine_type": "lsf", - "hostname" : "xxx.xxx.xxx.xxx", - "port" : 22, - "username": "tzhu", - "work_path" : "/gpfs/home/tzhu/jzzeng/dpgen_workdir", - "_comment" : "that's all" - }, - "train_resources": { - "source_list": [ "activate deepmd" ], - "envs": { - "KMP_BLOCKTIME": 0, - "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" - }, - "numb_gpu": 1, - "numb_node": 1, - "node_cpu": 0, - "partition": "newgpu", - "job_name": "dpgen_jzzeng", - "with_mpi": false, - "time_limit": false, - "_comment": "that's all" - }, - - - "_comment": "model_devi on localhost ", - "model_devi_command": "/gpfs/home/tzhu/lammps-stable_5Jun2019/src/lmp_intel_cpu_intelmpi -pk intel 0 omp 2", - "model_devi_group_size": 1, - "model_devi_machine": { - "machine_type": "lsf", - "hostname" : "xxx.xxx.xxx.xxx", - "port" : 22, - "username": "tzhu", - "work_path" : "/gpfs/home/tzhu/jzzeng/dpgen_workdir", - "_comment" : "that's all" - }, - "model_devi_resources": { - "envs": { - "KMP_BLOCKTIME": 0 - }, - "source_list": [ "activate deepmd" ], - "numb_gpu": 1, - "numb_node": 1, - "node_cpu": 0, - "time_limit": false, - "partition": "newgpu", - "job_name": "dpgen_jzzeng", - "with_mpi": true, - "task_per_node": 1, - "_comment": "that's all" - }, - - "_comment": "fp on lsf //localhost ", - "fp_command": "/public/home/tzhu/g16/g16 < input", - "fp_group_size": 1, - "fp_machine": { - "machine_type": "pbs", - "hostname" : "xxx.xxx.xxx.xxx", - "port" : 2323, - "username": "tzhu", - "work_path" : "/public/home/tzhu/jzzeng/dpgen_workdir", - "_comment" : "that's all" - }, - "fp_resources": { - "node_cpu":28, - "numb_node": 1, - "job_name": "dpgen_jzzeng", - "task_per_node": 28, - "with_mpi": false, - "time_limit": "10:00:00", - "allow_failure": true, - "partition": "small", - "_comment": "that's all" + "api_version": "1.0", + "train": { + "machine": { + "batch_type": 
"lsf", + "context_type": "ssh", + "remote_profile": { + "hostname": "xxx.xxx.xxx.xxx", + "port": 22, + "username": "tzhu" + }, + "remote_root": "/gpfs/home/tzhu/jzzeng/dpgen_workdir", + "local_root": "./" + }, + "resources": { + "source_list": [ + "activate deepmd" + ], + "envs": { + "KMP_BLOCKTIME": 0, + "KMP_AFFINITY": "granularity=fine,verbose,compact,1,0" + }, + "batch_type": "lsf", + "group_size": 1, + "number_node": 1, + "cpu_per_node": 0, + "gpu_per_node": 1, + "queue_name": "newgpu", + "kwargs": {} + }, + "command": "/gpfs/home/tzhu/anaconda3/envs/python3.6/bin/dp" }, - "_comment": " that's all " -} + "model_devi": { + "machine": { + "batch_type": "lsf", + "context_type": "ssh", + "remote_profile": { + "hostname": "xxx.xxx.xxx.xxx", + "port": 22, + "username": "tzhu" + }, + "remote_root": "/gpfs/home/tzhu/jzzeng/dpgen_workdir", + "local_root": "./" + }, + "resources": { + "envs": { + "KMP_BLOCKTIME": 0 + }, + "source_list": [ + "activate deepmd" + ], + "batch_type": "lsf", + "group_size": 1, + "number_node": 1, + "cpu_per_node": 0, + "gpu_per_node": 1, + "queue_name": "newgpu", + "kwargs": {} + }, + "command": "mpirun -n 0 /gpfs/home/tzhu/lammps-stable_5Jun2019/src/lmp_intel_cpu_intelmpi -pk intel 0 omp 2" + }, + "fp": { + "machine": { + "batch_type": "pbs", + "context_type": "ssh", + "remote_profile": { + "hostname": "xxx.xxx.xxx.xxx", + "port": 2323, + "username": "tzhu" + }, + "remote_root": "/public/home/tzhu/jzzeng/dpgen_workdir", + "local_root": "./" + }, + "resources": { + "batch_type": "pbs", + "group_size": 1, + "number_node": 1, + "cpu_per_node": 28, + "queue_name": "small", + "custom_flags": [ + "#PBS -l walltime=10:00:00" + ], + "kwargs": {}, + "gpu_per_node": 1 + }, + "command": "/public/home/tzhu/g16/g16 < input || :" + } +} \ No newline at end of file diff --git a/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json b/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json index 5f15303d6..2ff5b4a4b 100644 --- a/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json +++ b/examples/machine/DeePMD-kit-1.x/machine-slurm-qe.json @@ -1,113 +1,112 @@ - { - "_comment" : "This is an example of DP-GEN on Slurm", - "_comment" : "Last updated on 2021.4.30 for DP-GEN 0.9.2 by Yuzhi Zhang", - "train" : - { - "_comment" : "Specify the installed path of DeePMD-kit", - "_comment" : "The version of DeePMD-kit should be 1.*", - "command": "PATH_TO_DEEPMD/dp", - "_comment" : "Specify machine settings", - "machine": { - "_comment" : "Supported batches include slurm, pbs, shell, lsf.", - "batch": "slurm", - "_comment" : "If your jobs are executed on a local workstation, you can let hostname be localhost.", - "_comment" : "Otherwise you should provide the IP of machine you want to connect via ssh.", - "hostname": "localhost", - "_comment" : "The port for connection, most common settings is 22", - "port": 22, - "_comment" : "Specify your username. Sometimes you may need specify password. Exactly the name of key is password. ", - "username": "USERNAME", - "_comment" : "Specify where you want your job executes, all of tasks will be sent to work_path on this machine.", - "_comment" : "You should alwasy make sure that directory of work_path exits. ", - "work_path": "PATH_TO_WORK", - "_comment": "that's all" - }, - "resources": { - "_comment" : "The number of nodes. This will generate #SBATCH -N 1 in your script. ", - "numb_node": 1, - "_comment" : "The number of GPU cards. #SBATCH --gres=gpu:1", - "numb_gpu": 1, - "_comment" : "The number of CPUs. 
#SBATCH -n 4", - "task_per_node": 4, - "_comment" : "Partition. #SBATCH -p all", - "partition": "all", - "_comment" : "Memory limit. #SBATCH --mem=16G", - "mem_limit": 16, - "_comment" : "Nodelist to be excluded. #SBATCH --exclude=gpu06,gpu07", - "exclude_list": [ - "gpu06", - "gpu07" - ], - "_comment" : "Environment to be activated. This will generate source PATH/train_new.env . ", - "source_list": [ - "PATH/train_new.env" - ], - "_comment" : " Module is a common tools on HPC clustes to manage softwares for multiple users.", - "_comment" : "Modules to be loaded. This will generate module load intel", - "module_list": ["intel"], - "_comment" : "Time limit. ", - "time_limit": "23:0:0", - "_comment": "that's all" - } +{ + "_comment": "Last updated on 2021.4.30 for DP-GEN 0.9.2 by Yuzhi Zhang", + "train": { + "_comment" : "Specify the installed path of DeePMD-kit", + "command": "PATH_TO_DEEPMD/dp", + "_comment" : "Specify machine settings", + "machine": { + "batch_type": "slurm", + "context_type": "ssh", + "remote_profile": { + "hostname": "localhost", + "_comment" : "The port for connection, most common settings is 22", + "port": 22, + "_comment" : "Specify your username.", + "username": "USERNAME" + }, + "_comment" : "You should alwasy make sure that directory of work_path exits. ", + "remote_root": "PATH_TO_WORK", + "local_root": "./" + }, + "resources": { + "_comment" : "Environment to be activated. This will generate source PATH/train_new.env . ", + "source_list": [ + "PATH/train_new.env" + ], + "_comment" : " Module is a common tools on HPC clustes to manage softwares for multiple users.", + "_comment" : "Modules to be loaded. This will generate module load intel", + "module_list": [ + "intel" + ], + "batch_type": "slurm", + "_comment" : "The number of nodes. This will generate #SBATCH -N 1 in your script. ", + "number_node": 1, + "_comment" : "The number of CPUs. #SBATCH -n 4", + "cpu_per_node": 4, + "_comment" : "The number of GPU cards. #SBATCH --gres=gpu:1", + "gpu_per_node": 1, + "queue_name": "all", + "custom_flags": [ + "#SBATCH -t 23:0:0", + "#SBATCH --mem=16G", + "#SBATCH --exclude=gpu06,gpu07" + ], + "kwargs": {}, + "group_size": 1 + } }, - - "model_devi": - { - "machine": { - "machine_type": "slurm", - "hostname": "localhost", - "port": 22, - "username": "USERNAME", - "work_path": "PATH_TO_WORK", - "_comment": "that's all" - }, - "resources": { - "numb_node": 1, - "numb_gpu": 1, - "task_per_node": 4, - "partition": "all", - "mem_limit": 16, - "exclude_list": [ - - ], - "source_list": [ - "PATH/lmp_new.env" - ], - "module_list": [], - "time_limit": "23:0:0", - "_comment": "that's all" - }, - "command": "lmp_serial", - "_comment" : "DP-GEN will put 5 tasks together in one submitting script.", - "group_size": 5 - }, - "fp": - { - "machine": { - "machine_type": "slurm", - "hostname": "xxx.xxx.xxx.xxx", - "port": 22, - "username": "USERNAME", - "work_path": "PATH_TO_WORK" - }, - "resources": { - "task_per_node": 8, - "numb_gpu": 0, - "exclude_list": [], - "_comment" : "If you set with_mpi to true, the defaulted parallelling command of Slurm, srun, will be appended as prefix.", - "_comment" : "If you do not want this, you can set with_mpi to false, and specify parallelling command yourself. ", - "_comment" : "Notice that in json format, the upper/lower case is strict. 
You should write true instead of True and false instead of False",
-        "with_mpi": false,
-        "source_list": [
-        ],
-        "module_list": [
-          "mpich/3.2.1-intel-2017.1"
-        ],
-        "time_limit": "120:0:0",
-        "partition": "C032M0128G",
-        "_comment": "that's all"
-      },
-      "command": "mpirun -n 8 /gpfs/share/home/1600017784/yuzhi/soft/QE-mpi/PW/src/pw.x < input",
-      "group_size": 1
-    }
-}
+    "model_devi": {
+        "machine": {
+            "batch_type": "slurm",
+            "context_type": "ssh",
+            "remote_profile": {
+                "hostname": "localhost",
+                "port": 22,
+                "username": "USERNAME"
+            },
+            "remote_root": "PATH_TO_WORK",
+            "local_root": "./"
+        },
+        "resources": {
+            "source_list": [
+                "PATH/lmp_new.env"
+            ],
+            "module_list": [],
+            "batch_type": "slurm",
+            "_comment": "DP-GEN will put 5 tasks together in one submitting script.",
+            "group_size": 5,
+            "number_node": 1,
+            "cpu_per_node": 4,
+            "gpu_per_node": 1,
+            "queue_name": "all",
+            "custom_flags": [
+                "#SBATCH -t 23:0:0",
+                "#SBATCH --mem=16G",
+                "#SBATCH --exclude="
+            ],
+            "kwargs": {}
+        },
+        "command": "lmp_serial"
+    },
+    "fp": {
+        "machine": {
+            "batch_type": "slurm",
+            "context_type": "ssh",
+            "remote_profile": {
+                "hostname": "xxx.xxx.xxx.xxx",
+                "port": 22,
+                "username": "USERNAME"
+            },
+            "remote_root": "PATH_TO_WORK",
+            "local_root": "./"
+        },
+        "resources": {
+            "source_list": [],
+            "module_list": [
+                "mpich/3.2.1-intel-2017.1"
+            ],
+            "batch_type": "slurm",
+            "group_size": 1,
+            "cpu_per_node": 8,
+            "gpu_per_node": 0,
+            "queue_name": "C032M0128G",
+            "custom_flags": [
+                "#SBATCH -t 120:0:0"
+            ],
+            "kwargs": {},
+            "number_node": 1
+        },
+        "command": "mpirun -n 8 /gpfs/share/home/1600017784/yuzhi/soft/QE-mpi/PW/src/pw.x < input"
+    },
+    "api_version": "1.0"
+}
\ No newline at end of file

From 9cace602491cff395128eb8020e3c3ffa9007e32 Mon Sep 17 00:00:00 2001
From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com>
Date: Fri, 1 Jul 2022 13:23:55 +0800
Subject: [PATCH 22/26] fix 'post_fp_cp2k', add param rfailed (#765)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix 'post_fp_cp2k', add param rfailed

* Update run.py
---
 dpgen/generator/run.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py
index 1bd196cc6..f716e2266 100644
--- a/dpgen/generator/run.py
+++ b/dpgen/generator/run.py
@@ -3342,7 +3342,10 @@ def post_fp_gaussian (iter_index,
 
 
 def post_fp_cp2k (iter_index,
-                  jdata):
+                  jdata,
+                  rfailed=None):
+
+    ratio_failed = rfailed if rfailed is not None else jdata.get('ratio_failed', 0.10)
 
     model_devi_jobs = jdata['model_devi_jobs']
     assert (iter_index < len(model_devi_jobs))
@@ -3373,7 +3376,7 @@ def post_fp_cp2k (iter_index,
     all_sys = None
     for oo in sys_output :
         _sys = dpdata.LabeledSystem(oo, fmt = 'cp2k/output')
-        _sys.check_type_map(type_map = jdata['type_map'])
+        #_sys.check_type_map(type_map = jdata['type_map'])
         if all_sys is None:
             all_sys = _sys
         else:
@@ -3385,8 +3388,12 @@ def post_fp_cp2k (iter_index,
             sys_data_path = os.path.join(work_path, 'data.%s'%ss)
             all_sys.to_deepmd_raw(sys_data_path)
             all_sys.to_deepmd_npy(sys_data_path, set_size = len(sys_output))
-    dlog.info("failed frame number: %s "%(tcount-icount))
-    dlog.info("total frame number: %s "%tcount)
+
+    rfail = float(tcount - icount) / float(tcount)
+    dlog.info("failed frame: %6d in %6d %6.2f %% " % (tcount - icount, tcount, rfail * 100.))
+
+    if rfail > ratio_failed:
+        raise RuntimeError("
Too many FP tasks are not converged. Please check your files in directories \'iter.*.*/02.fp/task.*.*/.\'") def post_fp_pwmat (iter_index, From 3cb8b901f5c1bc5bf16fd10139e1d4ba2719268e Mon Sep 17 00:00:00 2001 From: Zhuoyuan <75076820+ZLI-afk@users.noreply.github.com> Date: Mon, 4 Jul 2022 16:14:17 +0800 Subject: [PATCH 23/26] improve direction dictionary --- dpgen/auto_test/Gamma.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dpgen/auto_test/Gamma.py b/dpgen/auto_test/Gamma.py index d69d2ef6e..5fd1ea041 100644 --- a/dpgen/auto_test/Gamma.py +++ b/dpgen/auto_test/Gamma.py @@ -211,15 +211,15 @@ def return_direction(self): miller_str += str(self.miller_index[ii]) for ii in range(len(self.displace_direction)): direct_str += str(self.displace_direction[ii]) - search_key = miller_str + ':' + direct_str + search_key = miller_str + '/' + direct_str # define specific cell vectors dict_directions = { - '100:010': [(0,1,0), (0,0,1), (1,0,0)], - '110:111': [(-1,1,1), (1,-1,1), (1,1,0)], - '111:110': [(-1,1,0), (-1,-1,2), (1,1,1)], - '111:112': [(-1,-1,2), (1,-1,0), (1,1,1)], - '112:111': [(-1,-1,1), (1,-1,0), (1,1,2)], - '123:111': [(-1,-1,1), (2,-1,0), (1,2,3)] + '100/010': [(0,1,0), (0,0,1), (1,0,0)], + '110/111': [(-1,1,1), (1,-1,1), (1,1,0)], + '111/110': [(-1,1,0), (-1,-1,2), (1,1,1)], + '111/112': [(-1,-1,2), (1,-1,0), (1,1,1)], + '112/111': [(-1,-1,1), (1,-1,0), (1,1,2)], + '123/111': [(-1,-1,1), (2,-1,0), (1,2,3)] } try: directions = dict_directions[search_key] From 26823b76e5aa7934dd1a54116a4d2f8e8edd9517 Mon Sep 17 00:00:00 2001 From: Zhuoyuan <75076820+ZLI-afk@users.noreply.github.com> Date: Tue, 5 Jul 2022 14:56:48 +0800 Subject: [PATCH 24/26] fix typos --- dpgen/auto_test/Gamma.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/dpgen/auto_test/Gamma.py b/dpgen/auto_test/Gamma.py index 5fd1ea041..4d32f2f20 100644 --- a/dpgen/auto_test/Gamma.py +++ b/dpgen/auto_test/Gamma.py @@ -33,11 +33,7 @@ def __init__(self, self.miller_index = parameter['miller_index'] self.displace_direction = parameter['displace_direction'] self.lattice_type = parameter['lattice_type'] - - # parameter['min_slab_size'] = parameter.get('min_slab_size', 10) - # self.min_slab_size = parameter['min_slab_size'] parameter['min_supercell_size'] = parameter.get('min_supercell_size', (1,1,5)) - self.min_supercell_size = parameter['min_supercell_size'] parameter['min_vacuum_size'] = parameter.get('min_vacuum_size', 20) self.min_vacuum_size = parameter['min_vacuum_size'] @@ -192,15 +188,12 @@ def make_confs(self, return task_list @staticmethod - def centralize_slab(slab): + def centralize_slab(slab) -> None: z_pos_list = list(set([site.position[2] for site in slab])) z_pos_list.sort() central_atoms = (z_pos_list[-1] - z_pos_list[0])/2 - #print(f"central_atoms: {central_atoms}") central_cell = slab.cell[2][2]/2 - #print(f"central_cell: {central_cell}") disp_length = central_cell - central_atoms - #print(f"disp_length: {disp_length}") for site in slab: site.position[2] += disp_length From 479df751a588691cf64e5613ea9f6b9f84850da8 Mon Sep 17 00:00:00 2001 From: Zhuoyuan <75076820+ZLI-afk@users.noreply.github.com> Date: Tue, 12 Jul 2022 17:59:02 +0800 Subject: [PATCH 25/26] fix 111 directions and post bugs --- dpgen/auto_test/Gamma.py | 14 ++++++++------ .../confs/std-fcc/relaxation/relax_task/CONTCAR | 12 ++++++++++++ tests/auto_test/test_gamma.py | 8 ++++---- 3 files changed, 24 insertions(+), 10 deletions(-) create mode 100644 
tests/auto_test/confs/std-fcc/relaxation/relax_task/CONTCAR diff --git a/dpgen/auto_test/Gamma.py b/dpgen/auto_test/Gamma.py index 4d32f2f20..5cb3209f9 100644 --- a/dpgen/auto_test/Gamma.py +++ b/dpgen/auto_test/Gamma.py @@ -209,8 +209,8 @@ def return_direction(self): dict_directions = { '100/010': [(0,1,0), (0,0,1), (1,0,0)], '110/111': [(-1,1,1), (1,-1,1), (1,1,0)], - '111/110': [(-1,1,0), (-1,-1,2), (1,1,1)], - '111/112': [(-1,-1,2), (1,-1,0), (1,1,1)], + '111/110': [(-1,1,0), (1,1,-2), (1,1,1)], + '111/112': [(1,1,-2), (-1,1,0), (1,1,1)], '112/111': [(-1,-1,1), (1,-1,0), (1,1,2)], '123/111': [(-1,-1,1), (2,-1,0), (1,2,3)] } @@ -346,7 +346,8 @@ def _compute_lower(self, if not self.reprod: ptr_data += str(tuple(self.miller_index)) + ' plane along ' + str(self.displace_direction) - ptr_data += "No_task: \tDisplacement \tStacking_Fault_E(J/m^2) EpA(eV) equi_EpA(eV)\n" + ptr_data += "No_task: \tDisplacement \tStacking_Fault_E(J/m^2) EpA(eV) slab_equi_EpA(eV)\n" + task_result_slab_equi = loadfn(os.path.join(all_tasks[0], 'result_task.json')) for ii in all_tasks: task_result = loadfn(os.path.join(ii, 'result_task.json')) natoms = np.sum(task_result['atom_numbs']) @@ -356,14 +357,15 @@ def _compute_lower(self, equi_path = os.path.abspath(os.path.join(os.path.dirname(output_file), '../relaxation/relax_task')) equi_result = loadfn(os.path.join(equi_path, 'result.json')) equi_epa = equi_result['energies'][-1] / np.sum(equi_result['atom_numbs']) + equi_epa_slab = task_result_slab_equi['energies'][-1] / np.sum(equi_result['atom_numbs']) structure_dir = os.path.basename(ii) - Cf = 1.60217657e-16 / (1e-20 * 2) * 0.001 - sfe = (task_result['energies'][-1] - equi_epa * natoms) / AA * Cf + Cf = 1.60217657e-16 / 1e-20 * 0.001 + sfe = (task_result['energies'][-1] - equi_epa_slab * natoms) / AA * Cf miller_index = loadfn(os.path.join(ii, 'miller.json')) ptr_data += "%-25s %7.2f %7.3f %8.3f %8.3f\n" % ( - str(miller_index) + '-' + structure_dir + ':', int(ii[-4:])/self.n_steps, sfe, epa, equi_epa) + str(miller_index) + '-' + structure_dir + ':', int(ii[-4:])/self.n_steps, sfe, epa, equi_epa_slab) res_data[int(ii[-4:])/self.n_steps] = [sfe, epa, equi_epa] diff --git a/tests/auto_test/confs/std-fcc/relaxation/relax_task/CONTCAR b/tests/auto_test/confs/std-fcc/relaxation/relax_task/CONTCAR new file mode 100644 index 000000000..ce4af1e57 --- /dev/null +++ b/tests/auto_test/confs/std-fcc/relaxation/relax_task/CONTCAR @@ -0,0 +1,12 @@ +Mo4 +1.0 +4.0028914311881421e+00 0.0000000000000000e+00 0.0000000000000000e+00 +-3.3847683075468209e-17 4.0028914311881421e+00 0.0000000000000000e+00 +-2.3607623963217771e-17 -2.9295144479997265e-17 4.0028914311881421e+00 +Mo +4 +Cartesian + 4.0028914312 4.0028914312 4.0028914312 + 4.0028914312 2.0014457156 2.0014457156 + 2.0014457156 4.0028914312 2.0014457156 + 2.0014457156 2.0014457156 4.0028914312 diff --git a/tests/auto_test/test_gamma.py b/tests/auto_test/test_gamma.py index 46b0df2ce..26d7f4f54 100644 --- a/tests/auto_test/test_gamma.py +++ b/tests/auto_test/test_gamma.py @@ -25,7 +25,7 @@ class TestGamma(unittest.TestCase): def setUp(self): _jdata = { - "structures": ["confs/hp-Mo"], + "structures": ["confs/std-fcc"], "interaction": { "type": "vasp", "incar": "vasp_input/INCAR_Mo", @@ -35,9 +35,9 @@ def setUp(self): "properties": [ { "type": "gamma", - "lattice_type": "bcc", - "miller_index": [1, 1, 0], - "displace_direction": [1, 1, 1], + "lattice_type": "fcc", + "miller_index": [1, 1, 1], + "displace_direction": [1, 1, 2], "min_supercell_size": [1, 1, 10], 
"min_vacuum_size": 10, "add_fix": ["true", "true", "false"], From 37b2cdbb28c2c5294308722337b1c8e54c31f1df Mon Sep 17 00:00:00 2001 From: Zhuoyuan <75076820+ZLI-afk@users.noreply.github.com> Date: Tue, 12 Jul 2022 20:15:33 +0800 Subject: [PATCH 26/26] fix post bugs --- dpgen/auto_test/Gamma.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dpgen/auto_test/Gamma.py b/dpgen/auto_test/Gamma.py index 5cb3209f9..283195dae 100644 --- a/dpgen/auto_test/Gamma.py +++ b/dpgen/auto_test/Gamma.py @@ -347,21 +347,22 @@ def _compute_lower(self, if not self.reprod: ptr_data += str(tuple(self.miller_index)) + ' plane along ' + str(self.displace_direction) ptr_data += "No_task: \tDisplacement \tStacking_Fault_E(J/m^2) EpA(eV) slab_equi_EpA(eV)\n" + all_tasks.sort() task_result_slab_equi = loadfn(os.path.join(all_tasks[0], 'result_task.json')) for ii in all_tasks: task_result = loadfn(os.path.join(ii, 'result_task.json')) natoms = np.sum(task_result['atom_numbs']) epa = task_result['energies'][-1] / natoms + equi_epa_slab = task_result_slab_equi['energies'][-1] / natoms AA = np.linalg.norm(np.cross(task_result['cells'][0][0], task_result['cells'][0][1])) equi_path = os.path.abspath(os.path.join(os.path.dirname(output_file), '../relaxation/relax_task')) equi_result = loadfn(os.path.join(equi_path, 'result.json')) equi_epa = equi_result['energies'][-1] / np.sum(equi_result['atom_numbs']) - equi_epa_slab = task_result_slab_equi['energies'][-1] / np.sum(equi_result['atom_numbs']) structure_dir = os.path.basename(ii) Cf = 1.60217657e-16 / 1e-20 * 0.001 - sfe = (task_result['energies'][-1] - equi_epa_slab * natoms) / AA * Cf + sfe = (task_result['energies'][-1] - task_result_slab_equi['energies'][-1]) / AA * Cf miller_index = loadfn(os.path.join(ii, 'miller.json')) ptr_data += "%-25s %7.2f %7.3f %8.3f %8.3f\n" % (