add an option to merge data to one H5 file #1119
Merged · 1 commit · Jan 19, 2023
2 changes: 2 additions & 0 deletions dpgen/generator/arginfo.py
@@ -80,6 +80,7 @@ def training_args() -> List[Argument]:
doc_training_reuse_start_pref_f = "The prefactor of force loss at the start of the training." + doc_reusing
doc_model_devi_activation_func = "The activation function in the model. The shape of list should be (N_models, 2), where 2 represents the embedding and fitting network. This option will override default parameters."
doc_srtab_file_path = 'The path of the table for the short-range pairwise interaction which is needed when using DP-ZBL potential'
doc_one_h5 = "Before training, all of the training data will be merged into one HDF5 file."

return [
Argument("numb_models", int, optional=False, doc=doc_numb_models),
@@ -100,6 +101,7 @@ def training_args() -> List[Argument]:
Argument("model_devi_activation_func", [None, list], optional=True, doc=doc_model_devi_activation_func),
Argument("srtab_file_path",str,optional=True,
doc=doc_srtab_file_path),
Argument("one_h5", bool, optional=True, default=False, doc=doc_one_h5),
]


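For context, a minimal sketch of how the new flag is switched on: one_h5 lives at the top level of the param.json dict that dpgen carries around as jdata (the surrounding keys here are illustrative placeholders, not a complete parameter file).

    # Illustrative param.json fragment, written as a Python dict; only "one_h5"
    # is new in this PR, the other key stands in for an existing configuration.
    jdata = {
        "numb_models": 4,
        "one_h5": True,  # merge all training data into a single data.hdf5 before training
    }

    # run.py reads the flag like this, defaulting to False when it is absent:
    merge_to_one_h5 = jdata.get("one_h5", False)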
49 changes: 28 additions & 21 deletions dpgen/generator/run.py
@@ -62,7 +62,7 @@
from dpgen.generator.lib.ele_temp import NBandsEsti
from dpgen.remote.decide_machine import convert_mdata
from dpgen.dispatcher.Dispatcher import make_submission
from dpgen.util import sepline, expand_sys_str, normalize
from dpgen.util import sepline, expand_sys_str, normalize, convert_training_data_to_hdf5
from dpgen import ROOT_PATH
from pymatgen.io.vasp import Incar,Kpoints,Potcar
from dpgen.auto_test.lib.vasp import make_kspacing_kpoints
@@ -385,7 +385,7 @@ def make_train (iter_index,
jinput['loss']['start_pref_f'] = training_reuse_start_pref_f
jinput['learning_rate']['start_lr'] = training_reuse_start_lr


input_files = []
for ii in range(numb_models) :
task_path = os.path.join(work_path, train_task_fmt % ii)
create_path(task_path)
@@ -429,6 +429,7 @@ def make_train (iter_index,
# dump the input.json
with open(os.path.join(task_path, train_input_file), 'w') as outfile:
json.dump(jinput, outfile, indent = 4)
input_files.append(os.path.join(task_path, train_input_file))

# link old models
if iter_index > 0 :
@@ -454,7 +455,9 @@ _link_old_models(work_path, old_model_files, ii)
_link_old_models(work_path, old_model_files, ii)
# Copy user defined forward files
symlink_user_forward_files(mdata=mdata, task_type="train", work_path=work_path)

# HDF5 format for training data
if jdata.get('one_h5', False):
convert_training_data_to_hdf5(input_files, os.path.join(work_path, "data.hdf5"))


def _link_old_models(work_path, old_model_files, ii):
@@ -568,24 +571,28 @@ def run_train (iter_index,
backward_files+= ['model.ckpt.meta', 'model.ckpt.index', 'model.ckpt.data-00000-of-00001', 'checkpoint']
if jdata.get("dp_compress", False):
backward_files.append('frozen_model_compressed.pb')
init_data_sys_ = jdata['init_data_sys']
init_data_sys = []
for ii in init_data_sys_ :
init_data_sys.append(os.path.join('data.init', ii))
trans_comm_data = []
cwd = os.getcwd()
os.chdir(work_path)
fp_data = glob.glob(os.path.join('data.iters', 'iter.*', '02.fp', 'data.*'))
for ii in itertools.chain(init_data_sys, fp_data) :
sys_paths = expand_sys_str(ii)
for single_sys in sys_paths:
if "#" not in single_sys:
trans_comm_data += glob.glob(os.path.join(single_sys, 'set.*'))
trans_comm_data += glob.glob(os.path.join(single_sys, 'type*.raw'))
trans_comm_data += glob.glob(os.path.join(single_sys, 'nopbc'))
else:
# H5 file
trans_comm_data.append(single_sys.split("#")[0])
if not jdata.get('one_h5', False):
init_data_sys_ = jdata['init_data_sys']
init_data_sys = []
for ii in init_data_sys_ :
init_data_sys.append(os.path.join('data.init', ii))
trans_comm_data = []
cwd = os.getcwd()
os.chdir(work_path)
fp_data = glob.glob(os.path.join('data.iters', 'iter.*', '02.fp', 'data.*'))
for ii in itertools.chain(init_data_sys, fp_data) :
sys_paths = expand_sys_str(ii)
for single_sys in sys_paths:
if "#" not in single_sys:
trans_comm_data += glob.glob(os.path.join(single_sys, 'set.*'))
trans_comm_data += glob.glob(os.path.join(single_sys, 'type*.raw'))
trans_comm_data += glob.glob(os.path.join(single_sys, 'nopbc'))
else:
# H5 file
trans_comm_data.append(single_sys.split("#")[0])
else:
cwd = os.getcwd()
trans_comm_data = ["data.hdf5"]
# remove duplicated files
trans_comm_data = list(set(trans_comm_data))
os.chdir(cwd)
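To make the effect of the run_train branch above concrete, a hedged sketch of what ends up in trans_comm_data in each case; the entries are illustrative placeholders, not output of the code.

    # Illustrative only: what gets forwarded to the training machine in each case.
    jdata = {"one_h5": True}  # hypothetical parsed param.json

    if jdata.get("one_h5", False):
        trans_comm_data = ["data.hdf5"]  # a single merged file
    else:
        trans_comm_data = [
            "data.init/deepmd/set.000",                       # per-system set dirs
            "data.init/deepmd/type.raw",
            "data.iters/iter.000000/02.fp/data.000/set.000",  # plus every fp iteration
        ]
    print(trans_comm_data)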
60 changes: 60 additions & 0 deletions dpgen/util.py
@@ -1,9 +1,12 @@
#!/usr/bin/env python
# coding: utf-8
import json
import os
from typing import Union, List
from pathlib import Path

import h5py
import dpdata
from dargs import Argument

from dpgen import dlog
@@ -83,3 +86,60 @@ def normalize(arginfo: Argument, data: dict, strict_check: bool = True) -> dict:
data = arginfo.normalize_value(data, trim_pattern="_*")
arginfo.check_value(data, strict=strict_check)
return data


def convert_training_data_to_hdf5(input_files: List[str], h5_file: str):
"""Convert training data to HDF5 format and update the input files.

Parameters
----------
input_files : list of str
DeePMD-kit input file names
h5_file : str
HDF5 file name
"""
systems = []
h5_dir = Path(h5_file).parent.absolute()
cwd = Path.cwd().absolute()
for ii in input_files:
ii = Path(ii)
dd = ii.parent.absolute()
with open(ii, 'r+') as f:
jinput = json.load(f)
if 'training_data' in jinput['training']:
# v2.0
p_sys = jinput['training']['training_data']['systems']
else:
# v1.x
p_sys = jinput['training']['systems']
for ii, pp in enumerate(p_sys):
if "#" in pp:
# HDF5 file
p1, p2 = pp.split("#")
ff = os.path.normpath(str((dd / p1).absolute().relative_to(cwd)))
pp = ff + "#" + p2
new_pp = os.path.normpath(os.path.relpath(ff, h5_dir)) + "/" + p2
else:
pp = os.path.normpath(str((dd / pp).absolute().relative_to(cwd)))
new_pp = os.path.normpath(os.path.relpath(pp, h5_dir))
p_sys[ii] = os.path.normpath(os.path.relpath(h5_file, dd)) + "#/" + str(new_pp)
systems.append(pp)
f.seek(0)
json.dump(jinput, f, indent=4)
systems = list(set(systems))

dlog.info("Combining %d training systems to %s...", len(systems), h5_file)

with h5py.File(h5_file, 'w') as f:
for ii in systems:
if "#" in ii:
p1, p2 = ii.split("#")
p1 = os.path.normpath(os.path.relpath(p1, h5_dir))
group = f.create_group(str(p1) + "/" + p2)
s = dpdata.LabeledSystem(ii, fmt="deepmd/hdf5")
s.to("deepmd/hdf5", group)
else:
pp = os.path.normpath(os.path.relpath(ii, h5_dir))
group = f.create_group(str(pp))
s = dpdata.LabeledSystem(ii, fmt="deepmd/npy")
s.to("deepmd/hdf5", group)
52 changes: 52 additions & 0 deletions tests/generator/test_make_train.py
@@ -362,6 +362,58 @@ def test_1_data_v1_h5(self) :
shutil.rmtree('iter.000000')
os.remove('data/deepmd.hdf5')

def test_1_data_v1_one_h5(self) :
"""Test `one_h5` option."""
dpdata.LabeledSystem("data/deepmd", fmt='deepmd/npy').to_deepmd_hdf5('data/deepmd.hdf5')
with open (param_file_v1, 'r') as fp :
jdata = json.load (fp)
jdata.pop('use_ele_temp', None)
jdata['init_data_sys'].append('deepmd.hdf5')
jdata['init_batch_size'].append('auto')
jdata['one_h5'] = True
with open (machine_file_v1, 'r') as fp:
mdata = json.load (fp)
make_train(0, jdata, mdata)
# make fake fp results #data == fp_task_min
_make_fake_fp(0, 0, jdata['fp_task_min'])
# make iter1 train
make_train(1, jdata, mdata)
# check data is linked
self.assertTrue(os.path.isdir(os.path.join('iter.000001', '00.train', 'data.iters', 'iter.000000', '02.fp')))
# check models inputs
with open(os.path.join('iter.%06d' % 1,
'00.train',
'%03d' % 0,
"input.json")) as fp:
jdata0 = json.load(fp)
self.assertEqual(jdata0['training']['systems'], [
'../data.hdf5#/data.init/deepmd',
'../data.hdf5#/data.init/deepmd.hdf5/',
'../data.hdf5#/data.iters/iter.000000/02.fp/data.000',
])
# test run_train -- confirm transferred files are correct
with tempfile.TemporaryDirectory() as remote_root:
run_train(1, jdata, {
"api_version": "1.0",
"train_command": (
"test -f ../data.hdf5"
"&& touch frozen_model.pb lcurve.out model.ckpt.meta model.ckpt.index model.ckpt.data-00000-of-00001 checkpoint"
"&& echo dp"
),
"train_machine": {
"batch_type": "shell",
"local_root": "./",
"remote_root": remote_root,
"context_type": "local",
},
"train_resources": {
"group_size": 1,
},
})

# remove testing dirs
shutil.rmtree('iter.000001')
shutil.rmtree('iter.000000')

if __name__ == '__main__':
unittest.main()
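As a follow-up to the systems-path assertions in the test above, a small sketch of reading one merged system back out of the HDF5 file via dpdata's file#/group convention; the path reuses the hypothetical example from the util.py section, not a value from the PR.

    import dpdata

    # Hypothetical read-back of one system from the merged file, using the same
    # deepmd/hdf5 format that convert_training_data_to_hdf5 writes.
    s = dpdata.LabeledSystem("iter.000001/00.train/data.hdf5#/data.init/deepmd",
                             fmt="deepmd/hdf5")
    print(s.get_nframes())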