From 32d925aabe080a6f06b9ffb3d2cc88205ddd8d44 Mon Sep 17 00:00:00 2001 From: Thomas Michelat Date: Fri, 24 Nov 2023 12:26:00 +0100 Subject: [PATCH 1/9] add script to generate small data files by cloning the data structure of real data files. The resulting files do not hold any of the original data, only the group structure and dataset type/shape. --- extra_data/tests/utils/__init__.py | 0 extra_data/tests/utils/gendata.py | 69 ++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 extra_data/tests/utils/__init__.py create mode 100644 extra_data/tests/utils/gendata.py diff --git a/extra_data/tests/utils/__init__.py b/extra_data/tests/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/extra_data/tests/utils/gendata.py b/extra_data/tests/utils/gendata.py new file mode 100644 index 00000000..1179266e --- /dev/null +++ b/extra_data/tests/utils/gendata.py @@ -0,0 +1,69 @@ +import sys +from argparse import ArgumentParser +from pathlib import Path + +import h5py + + +def clone_file_structure(h5file: Path, output: Path) -> None: + clone = h5py.File(output / h5file.name, "w") + + def visitor(name, obj): + if isinstance(obj, h5py.Group): + clone.create_group(name) + elif isinstance(obj, h5py.Dataset): + if ( + name.startswith("INSTRUMENT") + or name.startswith("CONTROL") + or name.startswith("RUN") + ): + clone.create_dataset_like(obj) + else: + clone.create_dataset_like(obj, data=obj[()]) + + original = h5py.File(h5file) + original.visititems(visitor) + + +def clone(input: Path, output: Path) -> None: + """Clone EuXFEL HDF5 file structure without any of its data. + + Clone the input file or files present the input directory. + The cloned files will be writen to output. + """ + if not output.is_dir(): + raise ValueError(f"The given output directory does not exist: {output}") + + if h5py.is_hdf5(input): + if output == input.parent: + raise ValueError("Input and output must be different directories.") + clone_file_structure(input, output) + elif input.is_dir(): + if output == input: + raise ValueError("Input and output must be different directories.") + # clone all hdf5 file present in the given directory + for file_ in input.glob("*"): + if not h5py.is_hdf5(file_): + continue + clone_file_structure(file_, output) + else: + raise ValueError(f"invalid input: {input}") + + +def main(argv=None): + ap = ArgumentParser("Clone EuXFEL HDF5 files but with empty datasets.") + ap.add_argument("input", type=str, help="Path to an HDF5 file or a directory.") + ap.add_argument( + "output", type=str, help="Output directory to write the cloned files." + ) + + args = ap.parse_args() + + path_in = Path(args.input).expanduser() + path_out = Path(args.output).expanduser() + + clone(path_in, path_out) + + +if __name__ == "__main__": + main(sys.argv[1:]) From 8ec2f264fc95eb05d2d72702f7bc796d31c5691d Mon Sep 17 00:00:00 2001 From: Thomas Michelat Date: Fri, 24 Nov 2023 13:08:07 +0100 Subject: [PATCH 2/9] add progress bar in terminal --- extra_data/tests/utils/gendata.py | 34 +++++++++++++++++++++++-------- extra_data/utils.py | 11 +++++++++- extra_data/validation.py | 11 +--------- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/extra_data/tests/utils/gendata.py b/extra_data/tests/utils/gendata.py index 1179266e..0f8f59f1 100644 --- a/extra_data/tests/utils/gendata.py +++ b/extra_data/tests/utils/gendata.py @@ -4,6 +4,20 @@ import h5py +from ..utils import progress_bar + + +def progress(processed, total, *, show=True): + """Show progress information""" + if not show: + return + + pbar = progress_bar(processed, total) + if sys.stderr.isatty(): + # "\x1b[2K": delete whole line, "\x1b[1A": move up cursor + print("\x1b[2K\x1b[1A\x1b[2K\x1b[1A", file=sys.stderr) + print(pbar, file=sys.stderr) + def clone_file_structure(h5file: Path, output: Path) -> None: clone = h5py.File(output / h5file.name, "w") @@ -17,17 +31,17 @@ def visitor(name, obj): or name.startswith("CONTROL") or name.startswith("RUN") ): - clone.create_dataset_like(obj) + clone.create_dataset_like(name, obj) else: - clone.create_dataset_like(obj, data=obj[()]) + clone.create_dataset_like(name, obj, data=obj[()]) original = h5py.File(h5file) original.visititems(visitor) -def clone(input: Path, output: Path) -> None: +def clone(input: Path, output: Path, *, term_progress=False) -> None: """Clone EuXFEL HDF5 file structure without any of its data. - + Clone the input file or files present the input directory. The cloned files will be writen to output. """ @@ -42,10 +56,12 @@ def clone(input: Path, output: Path) -> None: if output == input: raise ValueError("Input and output must be different directories.") # clone all hdf5 file present in the given directory - for file_ in input.glob("*"): - if not h5py.is_hdf5(file_): - continue + h5files = [f for f in input.glob("*") if h5py.is_hdf5(f)] + + progress(0, len(h5files), show=term_progress) + for n, file_ in enumerate(h5files, start=1): clone_file_structure(file_, output) + progress(n, len(h5files), show=term_progress) else: raise ValueError(f"invalid input: {input}") @@ -62,7 +78,9 @@ def main(argv=None): path_in = Path(args.input).expanduser() path_out = Path(args.output).expanduser() - clone(path_in, path_out) + print(f"Cloning file(s) structure:\ninput: {path_in}\nOutput: {path_out}\n") + clone(path_in, path_out, term_progress=True) + print("Done.") if __name__ == "__main__": diff --git a/extra_data/utils.py b/extra_data/utils.py index 8ccc39d2..592c08f3 100644 --- a/extra_data/utils.py +++ b/extra_data/utils.py @@ -9,14 +9,23 @@ """ import os +from shutil import get_terminal_size def available_cpu_cores(): # This process may be restricted to a subset of the cores on the machine; # sched_getaffinity() tells us which on some Unix flavours (inc Linux) - if hasattr(os, 'sched_getaffinity'): + if hasattr(os, "sched_getaffinity"): return len(os.sched_getaffinity(0)) else: # Fallback, inc on Windows ncpu = os.cpu_count() or 2 return min(ncpu, 8) + + +def progress_bar(done, total, suffix=" "): + line = f"Progress: {done}/{total}{suffix}[{{}}]" + length = min(get_terminal_size().columns - len(line), 50) + filled = int(length * done // total) + bar = "#" * filled + " " * (length - filled) + return line.format(bar) diff --git a/extra_data/validation.py b/extra_data/validation.py index 3cf02c44..caf2b3a0 100644 --- a/extra_data/validation.py +++ b/extra_data/validation.py @@ -4,13 +4,12 @@ import numpy as np import os import os.path as osp -from shutil import get_terminal_size from signal import signal, SIGINT, SIG_IGN import sys from .reader import H5File, FileAccess from .run_files_map import RunFilesMap - +from .utils import progress_bar class ValidationError(Exception): def __init__(self, problems): @@ -212,14 +211,6 @@ def check_index_contiguous(firsts, counts, record): )) -def progress_bar(done, total, suffix=' '): - line = f'Progress: {done}/{total}{suffix}[{{}}]' - length = min(get_terminal_size().columns - len(line), 50) - filled = int(length * done // total) - bar = '#' * filled + ' ' * (length - filled) - return line.format(bar) - - def _check_file(args): runpath, filename = args filepath = osp.join(runpath, filename) From 3f42eef0155a5fdff177e72f0d0a5c7da2adc2f4 Mon Sep 17 00:00:00 2001 From: Thomas Michelat Date: Fri, 24 Nov 2023 13:32:40 +0100 Subject: [PATCH 3/9] add the option to copy RUN and CONTROL data to the output files --- extra_data/tests/utils/gendata.py | 56 ++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/extra_data/tests/utils/gendata.py b/extra_data/tests/utils/gendata.py index 0f8f59f1..e076b414 100644 --- a/extra_data/tests/utils/gendata.py +++ b/extra_data/tests/utils/gendata.py @@ -6,6 +6,8 @@ from ..utils import progress_bar +__all__ = ["clone"] + def progress(processed, total, *, show=True): """Show progress information""" @@ -19,7 +21,9 @@ def progress(processed, total, *, show=True): print(pbar, file=sys.stderr) -def clone_file_structure(h5file: Path, output: Path) -> None: +def _clone_file_structure( + h5file: Path, output: Path, *, run_data=False, control_data=False +) -> None: clone = h5py.File(output / h5file.name, "w") def visitor(name, obj): @@ -28,8 +32,8 @@ def visitor(name, obj): elif isinstance(obj, h5py.Dataset): if ( name.startswith("INSTRUMENT") - or name.startswith("CONTROL") - or name.startswith("RUN") + or (name.startswith("CONTROL") and not control_data) + or (name.startswith("RUN") and not run_data) ): clone.create_dataset_like(name, obj) else: @@ -39,11 +43,23 @@ def visitor(name, obj): original.visititems(visitor) -def clone(input: Path, output: Path, *, term_progress=False) -> None: +def clone( + input: Path, + output: Path, + *, + run_data=False, + control_data=False, + term_progress=False, +) -> None: """Clone EuXFEL HDF5 file structure without any of its data. Clone the input file or files present the input directory. - The cloned files will be writen to output. + The cloned files will be written to output. + + args: + run_data: Copy data in RUN group if set to True + control_data: Copy data in CONTROL group if set to True + term_progress: show progress in terminal if set to True """ if not output.is_dir(): raise ValueError(f"The given output directory does not exist: {output}") @@ -51,7 +67,9 @@ def clone(input: Path, output: Path, *, term_progress=False) -> None: if h5py.is_hdf5(input): if output == input.parent: raise ValueError("Input and output must be different directories.") - clone_file_structure(input, output) + _clone_file_structure( + input, output, run_data=run_data, control_data=control_data + ) elif input.is_dir(): if output == input: raise ValueError("Input and output must be different directories.") @@ -60,7 +78,9 @@ def clone(input: Path, output: Path, *, term_progress=False) -> None: progress(0, len(h5files), show=term_progress) for n, file_ in enumerate(h5files, start=1): - clone_file_structure(file_, output) + _clone_file_structure( + file_, output, run_data=run_data, control_data=control_data + ) progress(n, len(h5files), show=term_progress) else: raise ValueError(f"invalid input: {input}") @@ -72,6 +92,20 @@ def main(argv=None): ap.add_argument( "output", type=str, help="Output directory to write the cloned files." ) + ap.add_argument( + "--copy-run-data", + "-cr", + action="store_true", + default=False, + help="Copy data present in the RUN group.", + ) + ap.add_argument( + "--copy-control-data", + "-cc", + action="store_true", + default=False, + help="Copy dara present in the CONTROL group.", + ) args = ap.parse_args() @@ -79,7 +113,13 @@ def main(argv=None): path_out = Path(args.output).expanduser() print(f"Cloning file(s) structure:\ninput: {path_in}\nOutput: {path_out}\n") - clone(path_in, path_out, term_progress=True) + clone( + path_in, + path_out, + run_data=args.copy_run_data, + control_data=args.copy_control_data, + term_progress=True, + ) print("Done.") From 343fce3b0605f38274ddbe438ca0e683aa3b9beb Mon Sep 17 00:00:00 2001 From: Thomas Michelat Date: Mon, 27 Nov 2023 22:28:09 +0100 Subject: [PATCH 4/9] copy attributes to new file replicate softlinks to new file ... --- extra_data/tests/utils/gendata.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/extra_data/tests/utils/gendata.py b/extra_data/tests/utils/gendata.py index e076b414..6ca49f9d 100644 --- a/extra_data/tests/utils/gendata.py +++ b/extra_data/tests/utils/gendata.py @@ -24,22 +24,36 @@ def progress(processed, total, *, show=True): def _clone_file_structure( h5file: Path, output: Path, *, run_data=False, control_data=False ) -> None: + original = h5py.File(h5file) clone = h5py.File(output / h5file.name, "w") def visitor(name, obj): + link = original.get(name, getlink=True) + if isinstance(link, h5py.SoftLink): + clone[name] = h5py.SoftLink(link.path) + return + if isinstance(obj, h5py.Group): - clone.create_group(name) + clone_obj = clone.create_group(name) elif isinstance(obj, h5py.Dataset): if ( name.startswith("INSTRUMENT") or (name.startswith("CONTROL") and not control_data) or (name.startswith("RUN") and not run_data) ): - clone.create_dataset_like(name, obj) + clone_obj = clone.create_dataset_like(name, obj) else: - clone.create_dataset_like(name, obj, data=obj[()]) + # note: consider using h5py.File.copy once a bug causing + # segfault for dataset with attributes is fixed, + # see: https://github.com/HDFGroup/hdf5/issues/2414 + clone_obj = clone.create_dataset_like(name, obj, data=obj[()]) + else: + return + + # copy attributes + for key, value in obj.attrs.items(): + clone_obj.attrs.create(key, value) - original = h5py.File(h5file) original.visititems(visitor) @@ -104,7 +118,7 @@ def main(argv=None): "-cc", action="store_true", default=False, - help="Copy dara present in the CONTROL group.", + help="Copy data present in the CONTROL group.", ) args = ap.parse_args() From bd284cd0e63da9647c0ba10adca41c5d5cb7c4db Mon Sep 17 00:00:00 2001 From: Thomas Michelat Date: Wed, 29 Nov 2023 22:09:36 +0100 Subject: [PATCH 5/9] Rewrite visitor to handle hard and soft links --- extra_data/copy.py | 182 +++++++++++++++++++++++++++++ extra_data/tests/utils/__init__.py | 0 extra_data/tests/utils/gendata.py | 141 ---------------------- 3 files changed, 182 insertions(+), 141 deletions(-) create mode 100644 extra_data/copy.py delete mode 100644 extra_data/tests/utils/__init__.py delete mode 100644 extra_data/tests/utils/gendata.py diff --git a/extra_data/copy.py b/extra_data/copy.py new file mode 100644 index 00000000..3cc496e7 --- /dev/null +++ b/extra_data/copy.py @@ -0,0 +1,182 @@ +import sys +from argparse import ArgumentParser +from pathlib import Path +from typing import Union + +import h5py + +from .utils import progress_bar + +__all__ = ["copy_structure"] + + +def progress(processed, total, *, show=True): + """Show progress information""" + if not show: + return + + pbar = progress_bar(processed, total) + if sys.stderr.isatty(): + # "\x1b[2K": delete whole line, "\x1b[1A": move up cursor + print("\x1b[2K\x1b[1A\x1b[2K\x1b[1A", file=sys.stderr) + print(pbar, file=sys.stderr) + + +class Cloner: + def __init__(self, input, output, *, run_data=False, control_data=False): + self.run_data = run_data + self.control_data = control_data + self.visited = {} + + if output.file.mode == "r": + raise ValueError("Output file must be writeable.") + self.visit(input, output) + + @staticmethod + def _copy_attrs(input, output): + for key, value in input.attrs.items(): + output.attrs.create(key, value) + + def visit(self, obj, output): + if obj.name != "/": + link = obj.file.get(obj.name, getlink=True) + if isinstance(link, h5py.SoftLink): + output[obj.name] = h5py.SoftLink(link.path) + return + elif isinstance(link, h5py.ExternalLink): + # TODO do we want to support external links? + # this *might* work, but external softlinks may point to non reacheable data + # with h5py.File(link.filename) as ext: + # Cloner(ext[link.path], output[obj.name], run_data=self.run_data, control_data=self.control_data) + return + + obj_id = h5py.h5o.get_info(obj.id).addr + + if obj_id in self.visited: + # Hardlink to an object we've already seen + output[obj.name] = output[self.visited[obj_id]] + return + + self.visited[obj_id] = obj.name + + if isinstance(obj, h5py.Dataset): + if ( + obj.name.startswith("/INSTRUMENT") + or (obj.name.startswith("/CONTROL") and not self.control_data) + or (obj.name.startswith("/RUN") and not self.run_data) + ): + output_obj = output.create_dataset_like(obj.name, obj) + else: + # note: consider using h5py.File.copy once a bug causing + # segfault for dataset with attributes is fixed, + # see: https://github.com/HDFGroup/hdf5/issues/2414 + output_obj = output.create_dataset_like(obj.name, obj, data=obj[()]) + self._copy_attrs(obj, output_obj) + elif isinstance(obj, h5py.Group): + if obj == obj.file: + # root object + output_obj = output["/"] + else: + output_obj = output.create_group(obj.name) + self._copy_attrs(obj, output_obj) + + for child in obj.values(): + self.visit(child, output) + else: + # unknown type + return + + +def copy_structure( + input: Union[Path, str], + output: Union[Path, str], + *, + run_data=False, + control_data=False, + term_progress=False, +) -> None: + """Clone EuXFEL HDF5 file structure without any of its data. + + Clone the input file or files present the input directory. + The cloned files will be written to output. + + args: + run_data: Copy data in RUN group if set to True + control_data: Copy data in CONTROL group if set to True + term_progress: show progress in terminal if set to True + """ + if isinstance(input, str): + input = Path(input) + input = input.expanduser() + + if isinstance(output, str): + output = Path(output) + output = output.expanduser() + + if not output.is_dir(): + raise ValueError(f"The given output directory does not exist: {output}") + + if h5py.is_hdf5(input): + if output == input.parent: + raise ValueError("Input and output must be different directories.") + Cloner( + h5py.File(input), + h5py.File(output / input.name, "w"), + run_data=run_data, + control_data=control_data, + ) + elif input.is_dir(): + if output == input: + raise ValueError("Input and output must be different directories.") + # clone all hdf5 file present in the given directory + h5files = [f for f in input.glob("*") if h5py.is_hdf5(f)] + + progress(0, len(h5files), show=term_progress) + for n, file_ in enumerate(h5files, start=1): + Cloner( + h5py.File(file_), + h5py.File(output / file_.name, "w"), + run_data=run_data, + control_data=control_data, + ) + progress(n, len(h5files), show=term_progress) + else: + raise ValueError(f"invalid input: {input}") + + +def main(argv=None): + ap = ArgumentParser("Clone EuXFEL HDF5 files but with empty datasets.") + ap.add_argument("input", type=str, help="Path to an HDF5 file or a directory.") + ap.add_argument( + "output", type=str, help="Output directory to write the cloned files." + ) + ap.add_argument( + "--copy-run-data", + "-cr", + action="store_true", + default=False, + help="Copy data present in the RUN group.", + ) + ap.add_argument( + "--copy-control-data", + "-cc", + action="store_true", + default=False, + help="Copy data present in the CONTROL group.", + ) + + args = ap.parse_args() + + print(f"Cloning file(s) structure:\ninput: {args.input}\nOutput: {args.output}\n") + copy_structure( + args.input, + args.output, + run_data=args.copy_run_data, + control_data=args.copy_control_data, + term_progress=True, + ) + print("Done.") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/extra_data/tests/utils/__init__.py b/extra_data/tests/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/extra_data/tests/utils/gendata.py b/extra_data/tests/utils/gendata.py deleted file mode 100644 index 6ca49f9d..00000000 --- a/extra_data/tests/utils/gendata.py +++ /dev/null @@ -1,141 +0,0 @@ -import sys -from argparse import ArgumentParser -from pathlib import Path - -import h5py - -from ..utils import progress_bar - -__all__ = ["clone"] - - -def progress(processed, total, *, show=True): - """Show progress information""" - if not show: - return - - pbar = progress_bar(processed, total) - if sys.stderr.isatty(): - # "\x1b[2K": delete whole line, "\x1b[1A": move up cursor - print("\x1b[2K\x1b[1A\x1b[2K\x1b[1A", file=sys.stderr) - print(pbar, file=sys.stderr) - - -def _clone_file_structure( - h5file: Path, output: Path, *, run_data=False, control_data=False -) -> None: - original = h5py.File(h5file) - clone = h5py.File(output / h5file.name, "w") - - def visitor(name, obj): - link = original.get(name, getlink=True) - if isinstance(link, h5py.SoftLink): - clone[name] = h5py.SoftLink(link.path) - return - - if isinstance(obj, h5py.Group): - clone_obj = clone.create_group(name) - elif isinstance(obj, h5py.Dataset): - if ( - name.startswith("INSTRUMENT") - or (name.startswith("CONTROL") and not control_data) - or (name.startswith("RUN") and not run_data) - ): - clone_obj = clone.create_dataset_like(name, obj) - else: - # note: consider using h5py.File.copy once a bug causing - # segfault for dataset with attributes is fixed, - # see: https://github.com/HDFGroup/hdf5/issues/2414 - clone_obj = clone.create_dataset_like(name, obj, data=obj[()]) - else: - return - - # copy attributes - for key, value in obj.attrs.items(): - clone_obj.attrs.create(key, value) - - original.visititems(visitor) - - -def clone( - input: Path, - output: Path, - *, - run_data=False, - control_data=False, - term_progress=False, -) -> None: - """Clone EuXFEL HDF5 file structure without any of its data. - - Clone the input file or files present the input directory. - The cloned files will be written to output. - - args: - run_data: Copy data in RUN group if set to True - control_data: Copy data in CONTROL group if set to True - term_progress: show progress in terminal if set to True - """ - if not output.is_dir(): - raise ValueError(f"The given output directory does not exist: {output}") - - if h5py.is_hdf5(input): - if output == input.parent: - raise ValueError("Input and output must be different directories.") - _clone_file_structure( - input, output, run_data=run_data, control_data=control_data - ) - elif input.is_dir(): - if output == input: - raise ValueError("Input and output must be different directories.") - # clone all hdf5 file present in the given directory - h5files = [f for f in input.glob("*") if h5py.is_hdf5(f)] - - progress(0, len(h5files), show=term_progress) - for n, file_ in enumerate(h5files, start=1): - _clone_file_structure( - file_, output, run_data=run_data, control_data=control_data - ) - progress(n, len(h5files), show=term_progress) - else: - raise ValueError(f"invalid input: {input}") - - -def main(argv=None): - ap = ArgumentParser("Clone EuXFEL HDF5 files but with empty datasets.") - ap.add_argument("input", type=str, help="Path to an HDF5 file or a directory.") - ap.add_argument( - "output", type=str, help="Output directory to write the cloned files." - ) - ap.add_argument( - "--copy-run-data", - "-cr", - action="store_true", - default=False, - help="Copy data present in the RUN group.", - ) - ap.add_argument( - "--copy-control-data", - "-cc", - action="store_true", - default=False, - help="Copy data present in the CONTROL group.", - ) - - args = ap.parse_args() - - path_in = Path(args.input).expanduser() - path_out = Path(args.output).expanduser() - - print(f"Cloning file(s) structure:\ninput: {path_in}\nOutput: {path_out}\n") - clone( - path_in, - path_out, - run_data=args.copy_run_data, - control_data=args.copy_control_data, - term_progress=True, - ) - print("Done.") - - -if __name__ == "__main__": - main(sys.argv[1:]) From 5df5aeddf596cd7c08c0fa135c47c3b96deedfd3 Mon Sep 17 00:00:00 2001 From: Thomas Michelat Date: Wed, 29 Nov 2023 22:09:51 +0100 Subject: [PATCH 6/9] unit test --- extra_data/tests/test_copy.py | 47 +++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 extra_data/tests/test_copy.py diff --git a/extra_data/tests/test_copy.py b/extra_data/tests/test_copy.py new file mode 100644 index 00000000..2bb6482c --- /dev/null +++ b/extra_data/tests/test_copy.py @@ -0,0 +1,47 @@ +import h5py +import numpy as np + +from extra_data.copy import copy_structure + + +def test_copy_structure(tmp_path, mock_sa3_control_data): + xgm = "SA3_XTD10_XGM/XGM/DOOCS" + xgm_intensity = f"INSTRUMENT/{xgm}:output/data/intensityTD" + xgm_flux = f"CONTROL/{xgm}/pulseEnergy/photonFlux/value" + with h5py.File(mock_sa3_control_data, "a") as f: + # add softlink + f[f"LINKED/{xgm_intensity}"] = h5py.SoftLink(f"/{xgm_intensity}") + # add some data + ds = f[xgm_intensity] + ds[:] = np.ones(ds.shape, ds.dtype) + ds = f[xgm_flux] + ds[:] = np.ones(ds.shape, ds.dtype) + + copy_structure(mock_sa3_control_data, tmp_path, control_data=True) + + inp = h5py.File(mock_sa3_control_data) + out = h5py.File(tmp_path / mock_sa3_control_data.rpartition("/")[-1]) + slink = out.get(f"LINKED/{xgm_intensity}", getlink=True) + + # softlinks are copied + assert isinstance(slink, h5py.SoftLink) + assert slink.path == f"/{xgm_intensity}" + # data is not copied + assert out[xgm_intensity].shape == inp[xgm_intensity].shape + assert out[xgm_intensity].dtype == inp[xgm_intensity].dtype + assert (out[xgm_intensity][()] == 0).all() + # attributes are copied + assert out[xgm_intensity].attrs["unitName"] == "joule" + # control data is copied + assert out[xgm_flux].shape == inp[xgm_flux].shape + assert out[xgm_flux].dtype == inp[xgm_flux].dtype + assert (out[xgm_flux][()] == 1).all() + # run data is not copied + assert out[f"RUN/{xgm}/classId/value"].dtype == h5py.string_dtype() + assert out[f"RUN/{xgm}/classId/value"][()] == [b""] + + # TODO test hardlinks + + +def test_copy_run(): + ... From addfd1a8996896db366184c4e2b6c909f09547c7 Mon Sep 17 00:00:00 2001 From: Thomas Michelat Date: Thu, 30 Nov 2023 14:20:52 +0100 Subject: [PATCH 7/9] more unit tests --- extra_data/copy.py | 2 +- extra_data/tests/test_copy.py | 27 ++++++++++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/extra_data/copy.py b/extra_data/copy.py index 3cc496e7..a8ea79c7 100644 --- a/extra_data/copy.py +++ b/extra_data/copy.py @@ -165,7 +165,7 @@ def main(argv=None): help="Copy data present in the CONTROL group.", ) - args = ap.parse_args() + args = ap.parse_args(argv) print(f"Cloning file(s) structure:\ninput: {args.input}\nOutput: {args.output}\n") copy_structure( diff --git a/extra_data/tests/test_copy.py b/extra_data/tests/test_copy.py index 2bb6482c..2e3ca24e 100644 --- a/extra_data/tests/test_copy.py +++ b/extra_data/tests/test_copy.py @@ -1,7 +1,9 @@ +from pathlib import Path + import h5py import numpy as np -from extra_data.copy import copy_structure +from extra_data.copy import copy_structure, main def test_copy_structure(tmp_path, mock_sa3_control_data): @@ -9,23 +11,27 @@ def test_copy_structure(tmp_path, mock_sa3_control_data): xgm_intensity = f"INSTRUMENT/{xgm}:output/data/intensityTD" xgm_flux = f"CONTROL/{xgm}/pulseEnergy/photonFlux/value" with h5py.File(mock_sa3_control_data, "a") as f: - # add softlink - f[f"LINKED/{xgm_intensity}"] = h5py.SoftLink(f"/{xgm_intensity}") # add some data ds = f[xgm_intensity] ds[:] = np.ones(ds.shape, ds.dtype) ds = f[xgm_flux] ds[:] = np.ones(ds.shape, ds.dtype) + # add softlink + f["SOFTLINKED"] = h5py.SoftLink(f"/{xgm_intensity}") + # add hardlink + f['HARDLINKED'] = ds copy_structure(mock_sa3_control_data, tmp_path, control_data=True) inp = h5py.File(mock_sa3_control_data) out = h5py.File(tmp_path / mock_sa3_control_data.rpartition("/")[-1]) - slink = out.get(f"LINKED/{xgm_intensity}", getlink=True) + slink = out.get("SOFTLINKED", getlink=True) # softlinks are copied assert isinstance(slink, h5py.SoftLink) assert slink.path == f"/{xgm_intensity}" + # hardlink + assert out['HARDLINKED'] == out[xgm_flux] # data is not copied assert out[xgm_intensity].shape == inp[xgm_intensity].shape assert out[xgm_intensity].dtype == inp[xgm_intensity].dtype @@ -40,8 +46,15 @@ def test_copy_structure(tmp_path, mock_sa3_control_data): assert out[f"RUN/{xgm}/classId/value"].dtype == h5py.string_dtype() assert out[f"RUN/{xgm}/classId/value"][()] == [b""] - # TODO test hardlinks + +def test_copy_run(tmp_path, mock_spb_proc_run): + copy_structure(mock_spb_proc_run, tmp_path) + + inp_files = list(Path(mock_spb_proc_run).glob('*.h5')) + out_files = list(tmp_path.glob('*.h5')) + assert len(inp_files) == len(out_files) -def test_copy_run(): - ... +def test_cli(tmp_path, mock_scs_run): + # smoke test + main([mock_scs_run, str(tmp_path)]) From e1680091f78793898eccd63c7530b7b085030dac Mon Sep 17 00:00:00 2001 From: Thomas Michelat Date: Mon, 4 Dec 2023 15:30:34 +0100 Subject: [PATCH 8/9] handle external links --- extra_data/copy.py | 18 +++++++++--------- extra_data/tests/test_copy.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/extra_data/copy.py b/extra_data/copy.py index a8ea79c7..1628617c 100644 --- a/extra_data/copy.py +++ b/extra_data/copy.py @@ -41,13 +41,9 @@ def visit(self, obj, output): if obj.name != "/": link = obj.file.get(obj.name, getlink=True) if isinstance(link, h5py.SoftLink): - output[obj.name] = h5py.SoftLink(link.path) - return - elif isinstance(link, h5py.ExternalLink): - # TODO do we want to support external links? - # this *might* work, but external softlinks may point to non reacheable data - # with h5py.File(link.filename) as ext: - # Cloner(ext[link.path], output[obj.name], run_data=self.run_data, control_data=self.control_data) + # note this works only for SoftLinks. ExternalLink object's + # name is not the name of the path, but the targeted file's path + output[obj.name] = link return obj_id = h5py.h5o.get_info(obj.id).addr @@ -80,8 +76,12 @@ def visit(self, obj, output): output_obj = output.create_group(obj.name) self._copy_attrs(obj, output_obj) - for child in obj.values(): - self.visit(child, output) + for name, child in obj.items(): + if child.file != obj.file: + # external link + output[name] = obj.get(name, getlink=True) + else: + self.visit(child, output) else: # unknown type return diff --git a/extra_data/tests/test_copy.py b/extra_data/tests/test_copy.py index 2e3ca24e..d725c0c0 100644 --- a/extra_data/tests/test_copy.py +++ b/extra_data/tests/test_copy.py @@ -10,6 +10,9 @@ def test_copy_structure(tmp_path, mock_sa3_control_data): xgm = "SA3_XTD10_XGM/XGM/DOOCS" xgm_intensity = f"INSTRUMENT/{xgm}:output/data/intensityTD" xgm_flux = f"CONTROL/{xgm}/pulseEnergy/photonFlux/value" + + ext_file = 'ext-data.h5' + ext_path = 'some/data' with h5py.File(mock_sa3_control_data, "a") as f: # add some data ds = f[xgm_intensity] @@ -20,18 +23,26 @@ def test_copy_structure(tmp_path, mock_sa3_control_data): f["SOFTLINKED"] = h5py.SoftLink(f"/{xgm_intensity}") # add hardlink f['HARDLINKED'] = ds + # add external link + with h5py.File(Path(mock_sa3_control_data).parent / ext_file, 'w') as g: + g[ext_path] = [1] + f['EXTLINK'] = h5py.ExternalLink(ext_file, ext_path) copy_structure(mock_sa3_control_data, tmp_path, control_data=True) inp = h5py.File(mock_sa3_control_data) out = h5py.File(tmp_path / mock_sa3_control_data.rpartition("/")[-1]) slink = out.get("SOFTLINKED", getlink=True) + extlink = out.get('EXTLINK', getlink=True) # softlinks are copied assert isinstance(slink, h5py.SoftLink) assert slink.path == f"/{xgm_intensity}" # hardlink assert out['HARDLINKED'] == out[xgm_flux] + # external link + assert extlink.filename == ext_file + assert extlink.path == ext_path # data is not copied assert out[xgm_intensity].shape == inp[xgm_intensity].shape assert out[xgm_intensity].dtype == inp[xgm_intensity].dtype From c4951a8d17a94c232f35dcf3989cdd2fa4e5595d Mon Sep 17 00:00:00 2001 From: Thomas Michelat Date: Mon, 4 Dec 2023 15:40:29 +0100 Subject: [PATCH 9/9] fix: absolute path for external link --- extra_data/copy.py | 2 +- extra_data/tests/test_copy.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/extra_data/copy.py b/extra_data/copy.py index 1628617c..9a8de783 100644 --- a/extra_data/copy.py +++ b/extra_data/copy.py @@ -79,7 +79,7 @@ def visit(self, obj, output): for name, child in obj.items(): if child.file != obj.file: # external link - output[name] = obj.get(name, getlink=True) + output[f'{obj.name}/{name}'] = obj.get(name, getlink=True) else: self.visit(child, output) else: diff --git a/extra_data/tests/test_copy.py b/extra_data/tests/test_copy.py index d725c0c0..65ab4bde 100644 --- a/extra_data/tests/test_copy.py +++ b/extra_data/tests/test_copy.py @@ -20,26 +20,26 @@ def test_copy_structure(tmp_path, mock_sa3_control_data): ds = f[xgm_flux] ds[:] = np.ones(ds.shape, ds.dtype) # add softlink - f["SOFTLINKED"] = h5py.SoftLink(f"/{xgm_intensity}") + f["group/SOFTLINKED"] = h5py.SoftLink(f"/{xgm_intensity}") # add hardlink - f['HARDLINKED'] = ds + f['group/HARDLINKED'] = ds # add external link with h5py.File(Path(mock_sa3_control_data).parent / ext_file, 'w') as g: g[ext_path] = [1] - f['EXTLINK'] = h5py.ExternalLink(ext_file, ext_path) + f['group/EXTLINK'] = h5py.ExternalLink(ext_file, ext_path) copy_structure(mock_sa3_control_data, tmp_path, control_data=True) inp = h5py.File(mock_sa3_control_data) out = h5py.File(tmp_path / mock_sa3_control_data.rpartition("/")[-1]) - slink = out.get("SOFTLINKED", getlink=True) - extlink = out.get('EXTLINK', getlink=True) + slink = out.get("group/SOFTLINKED", getlink=True) + extlink = out.get('group/EXTLINK', getlink=True) # softlinks are copied assert isinstance(slink, h5py.SoftLink) assert slink.path == f"/{xgm_intensity}" # hardlink - assert out['HARDLINKED'] == out[xgm_flux] + assert out['group/HARDLINKED'] == out[xgm_flux] # external link assert extlink.filename == ext_file assert extlink.path == ext_path