From b40706bb1e90a1e5beadecfb09a37479536ddfa3 Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Thu, 12 Nov 2020 22:09:01 +0100 Subject: [PATCH 1/5] Add script to convert from n5.bdv to zarr.ome format --- data-conversion/check_result.py | 17 +++++ data-conversion/to_ome_zarr.py | 115 ++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 data-conversion/check_result.py create mode 100644 data-conversion/to_ome_zarr.py diff --git a/data-conversion/check_result.py b/data-conversion/check_result.py new file mode 100644 index 0000000..294f97c --- /dev/null +++ b/data-conversion/check_result.py @@ -0,0 +1,17 @@ +import sys +import zarr + + +def check_result(path): + with zarr.open(path, mode='r') as f: + for name, ds in f.items(): + shape = ds.shape + chunks = ds.chunks + assert len(shape) == len(chunks) == 5 + print(name, shape, chunks) + + print("All tests passed") + + +path = sys.argv[1] +check_result(path) diff --git a/data-conversion/to_ome_zarr.py b/data-conversion/to_ome_zarr.py new file mode 100644 index 0000000..eb29aef --- /dev/null +++ b/data-conversion/to_ome_zarr.py @@ -0,0 +1,115 @@ +import argparse +import os +import json +import shutil +from concurrent import futures + +import zarr +import z5py +from tqdm import tqdm +from z5py.util import blocking + + +def copy_dataset(ds_in, ds_out, n_threads): + """ Copy input to output dataset in parallel. + + Arguments: + ds_in [dataset] - input dataset (h5py, z5py or zarr dataset) + ds_out [dataset] - output dataset (h5py, z5py or zarr dataset) + n_threads [int] - number of threads, by default all are used (default: None) + Returns: + array_like - output + """ + + assert ds_in.shape == ds_out.shape + # only thread-safe for same chunk sizes ! + assert ds_in.chunks == ds_out.chunks + + blocks = blocking(ds_in.shape, ds_in.chunks) + blocks = [block for block in blocks] + n_blocks = len(blocks) + + def _copy_chunk(block): + ds_out[block] = ds_in[block] + + with futures.ThreadPoolExecutor(n_threads) as tp: + list(tqdm(tp.map(_copy_chunk, blocks), total=n_blocks)) + + +# expand the 2 leading dimensions of the zarr dataset +def expand_dims(ds_path): + attrs_file = os.path.join(ds_path, '.zarray') + assert os.path.exists(attrs_file) + + def is_int(some_string): + try: + int(some_string) + return True + except ValueError: + return False + + chunk_files = os.listdir(ds_path) + chunk_files = [cf for cf in chunk_files if is_int(cf)] + + dim0 = os.path.join(ds_path, 'tmp0') + dim1 = os.path.join(dim0, '0') + + os.makedirs(dim1) + for cf in chunk_files: + shutil.move(os.path.join(ds_path, cf), os.path.join(dim1, cf)) + + shutil.move(dim0, os.path.join(ds_path, '0')) + + with open(attrs_file) as f: + attrs = json.load(f) + + shape = attrs['shape'] + shape = [1, 1] + shape + attrs['shape'] = shape + + chunks = attrs['chunks'] + chunks = [1, 1] + chunks + attrs['chunks'] = chunks + + with open(attrs_file, 'w') as f: + json.dump(attrs, f, indent=2, sort_keys=True) + + +def convert_bdv_n5(in_path, out_path, n_threads): + with z5py.File(in_path, mode='r') as f_in, zarr.open(out_path, mode='w') as f_out: + # we assume bdv.n5 file format and only a single channel + scale_group = f_in['setup0/timepoint0'] + scale_names = [elem for elem in scale_group] + scale_names.sort() + + for name in scale_names: + ds_in = scale_group[name] + + store = zarr.NestedDirectoryStore(os.path.join(out_path, name)) + ds_out = zarr.zeros(store=store, + shape=ds_in.shape, + chunks=ds_in.chunks, + dtype=ds_in.dtype) + + copy_dataset(ds_in, ds_out, 
n_threads) + + # this invalidates the shape and chunk attributes of our dataset, + # so we can't use it after that (but we also don't need to) + expand_dims(os.path.join(out_path, name)) + + f_out.attrs['multiscalles'] = [ + { + "version": "0.1", + "datasets": [{"path": name} for name in scale_names] + } + ] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('inp', type=str) + parser.add_argument('outp', type=str) + parser.add_argument('--n_threads', type=int, default=8) + + args = parser.parse_args() + convert_bdv_n5(args.inp, args.outp, args.n_threads) From c7b6782f3d03fdd7303bb230b35e6a105476ccad Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Thu, 12 Nov 2020 22:26:56 +0100 Subject: [PATCH 2/5] Update ome zarr script to support flat chunk hierarchy --- data-conversion/check_result.py | 9 ++++-- data-conversion/to_ome_zarr.py | 55 ++++++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/data-conversion/check_result.py b/data-conversion/check_result.py index 294f97c..1924db7 100644 --- a/data-conversion/check_result.py +++ b/data-conversion/check_result.py @@ -2,7 +2,7 @@ import zarr -def check_result(path): +def check_result(path, check_data): with zarr.open(path, mode='r') as f: for name, ds in f.items(): shape = ds.shape @@ -10,8 +10,13 @@ def check_result(path): assert len(shape) == len(chunks) == 5 print(name, shape, chunks) + if check_data: + data = ds[:] + # print(data[0, 0, :10, :10, :10]) + assert data.shape == shape + print("All tests passed") path = sys.argv[1] -check_result(path) +check_result(path, True) diff --git a/data-conversion/to_ome_zarr.py b/data-conversion/to_ome_zarr.py index eb29aef..3f36ef4 100644 --- a/data-conversion/to_ome_zarr.py +++ b/data-conversion/to_ome_zarr.py @@ -36,17 +36,15 @@ def _copy_chunk(block): list(tqdm(tp.map(_copy_chunk, blocks), total=n_blocks)) -# expand the 2 leading dimensions of the zarr dataset -def expand_dims(ds_path): - attrs_file = os.path.join(ds_path, '.zarray') - assert os.path.exists(attrs_file) +def is_int(some_string): + try: + int(some_string) + return True + except ValueError: + return False + - def is_int(some_string): - try: - int(some_string) - return True - except ValueError: - return False +def expand_chunks_nested(ds_path): chunk_files = os.listdir(ds_path) chunk_files = [cf for cf in chunk_files if is_int(cf)] @@ -60,6 +58,31 @@ def is_int(some_string): shutil.move(dim0, os.path.join(ds_path, '0')) + +def expand_chunks_flat(ds_path): + def is_chunk(some_name): + chunk_idx = some_name.split('.') + return all(map(is_int, chunk_idx)) and len(chunk_idx) > 0 + + chunk_files = os.listdir(ds_path) + chunk_files = [cf for cf in chunk_files if is_chunk(cf)] + + for cf in chunk_files: + shutil.move(os.path.join(ds_path, cf), + os.path.join(ds_path, '0.0.' 
+ cf)) + + +# NOTE this works because zarr doesn't have a chunk header +# expand the 2 leading dimensions of the zarr dataset +def expand_dims(ds_path, use_nested_store): + attrs_file = os.path.join(ds_path, '.zarray') + assert os.path.exists(attrs_file) + + if use_nested_store: + expand_chunks_nested(ds_path) + else: + expand_chunks_flat(ds_path) + with open(attrs_file) as f: attrs = json.load(f) @@ -75,7 +98,7 @@ def is_int(some_string): json.dump(attrs, f, indent=2, sort_keys=True) -def convert_bdv_n5(in_path, out_path, n_threads): +def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): with z5py.File(in_path, mode='r') as f_in, zarr.open(out_path, mode='w') as f_out: # we assume bdv.n5 file format and only a single channel scale_group = f_in['setup0/timepoint0'] @@ -85,7 +108,10 @@ def convert_bdv_n5(in_path, out_path, n_threads): for name in scale_names: ds_in = scale_group[name] - store = zarr.NestedDirectoryStore(os.path.join(out_path, name)) + if use_nested_store: + store = zarr.NestedDirectoryStore(os.path.join(out_path, name)) + else: + store = zarr.DirectoryStore(os.path.join(out_path, name)) ds_out = zarr.zeros(store=store, shape=ds_in.shape, chunks=ds_in.chunks, @@ -95,7 +121,7 @@ def convert_bdv_n5(in_path, out_path, n_threads): # this invalidates the shape and chunk attributes of our dataset, # so we can't use it after that (but we also don't need to) - expand_dims(os.path.join(out_path, name)) + expand_dims(os.path.join(out_path, name), use_nested_store) f_out.attrs['multiscalles'] = [ { @@ -109,7 +135,8 @@ def convert_bdv_n5(in_path, out_path, n_threads): parser = argparse.ArgumentParser() parser.add_argument('inp', type=str) parser.add_argument('outp', type=str) + parser.add_argument('--use_nested_store', type=int, default=0) parser.add_argument('--n_threads', type=int, default=8) args = parser.parse_args() - convert_bdv_n5(args.inp, args.outp, args.n_threads) + convert_bdv_n5(args.inp, args.outp, bool(args.use_nested_store), args.n_threads) From 5cddf277da06bb90c43b4f22bf27872158e9fbe6 Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Fri, 13 Nov 2020 09:21:00 +0100 Subject: [PATCH 3/5] Avoid writing empty chunks --- data-conversion/to_ome_zarr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/data-conversion/to_ome_zarr.py b/data-conversion/to_ome_zarr.py index 3f36ef4..4727309 100644 --- a/data-conversion/to_ome_zarr.py +++ b/data-conversion/to_ome_zarr.py @@ -30,7 +30,12 @@ def copy_dataset(ds_in, ds_out, n_threads): n_blocks = len(blocks) def _copy_chunk(block): - ds_out[block] = ds_in[block] + # make sure we don't copy empty blocks; I don't know + # if zarr makes sure not to write them out + data_in = ds_in[block] + if data_in.sum() == 0: + return + ds_out[block] = data_in with futures.ThreadPoolExecutor(n_threads) as tp: list(tqdm(tp.map(_copy_chunk, blocks), total=n_blocks)) From 208300c9ee60290cab20a6dee3dd805248f42fe5 Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Wed, 18 Nov 2020 21:16:14 +0100 Subject: [PATCH 4/5] Add platy data conversion script --- .gitignore | 1 + create_data.py | 28 ++++++++ data_conversion/__init__.py | 1 + .../check_result.py | 0 data_conversion/joshs_script.py | 69 +++++++++++++++++++ .../to_ome_zarr.py | 16 +++-- upload_to_s3.py | 1 + 7 files changed, 109 insertions(+), 7 deletions(-) create mode 100644 .gitignore create mode 100644 create_data.py create mode 100644 data_conversion/__init__.py rename {data-conversion => data_conversion}/check_result.py (100%) create mode 100644 
data_conversion/joshs_script.py rename {data-conversion => data_conversion}/to_ome_zarr.py (88%) create mode 100644 upload_to_s3.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c18dd8d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff --git a/create_data.py b/create_data.py new file mode 100644 index 0000000..23b9462 --- /dev/null +++ b/create_data.py @@ -0,0 +1,28 @@ +import os +from data_conversion import convert_bdv_n5 + + +# def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): +# add the myosin prospr data +def add_myosin(): + in_path = os.path.join('/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/0.6.3', + 'images/local/prospr-6dpf-1-whole-non-muscle-mhc.n5') + convert_bdv_n5(in_path=in_path, + out_path='platy.ome.zarr', + out_key='prospr-myosin', + use_nested_store=False, + n_threads=4) + + +# add the em raw data +def add_raw(): + pass + + +# add the em cell segmentation +def add_seg(): + pass + + +if __name__ == '__main__': + add_myosin() diff --git a/data_conversion/__init__.py b/data_conversion/__init__.py new file mode 100644 index 0000000..e28a454 --- /dev/null +++ b/data_conversion/__init__.py @@ -0,0 +1 @@ +from .to_ome_zarr import convert_bdv_n5 diff --git a/data-conversion/check_result.py b/data_conversion/check_result.py similarity index 100% rename from data-conversion/check_result.py rename to data_conversion/check_result.py diff --git a/data_conversion/joshs_script.py b/data_conversion/joshs_script.py new file mode 100644 index 0000000..32cebe9 --- /dev/null +++ b/data_conversion/joshs_script.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python + +# This assumes that n5-copy has already been used + +import argparse +import zarr + +parser = argparse.ArgumentParser() +parser.add_argument("input") +parser.add_argument("output") +ns = parser.parse_args() + +zin = zarr.open(ns.input) + +sizes = [] + +def groups(z): + rv = sorted(list(z.groups())) + assert rv + assert not list(z.arrays()) + return rv + +def arrays(z): + rv = sorted(list(z.arrays())) + assert rv + assert not list(z.groups()) + return rv + +setups = groups(zin) +assert len(setups) == 1 # TODO: multiple channels? 
+for sname, setup in setups: + timepoints = groups(setup) + for tname, timepoint in timepoints: + resolutions = arrays(timepoint) + for idx, rtuple in enumerate(resolutions): + rname, resolution = rtuple + try: + expected = sizes[idx] + assert expected[0] == rname + assert expected[1] == resolution.shape + assert expected[2] == resolution.chunks + assert expected[3] == resolution.dtype + except: + sizes.append((rname, + resolution.shape, + resolution.chunks, + resolution.dtype)) + + +datasets = [] +out = zarr.open(ns.output, mode="w") + +for idx, size in enumerate(sizes): + name, shape, chunks, dtype = size + shape = tuple([len(timepoints), len(setups)] + list(shape)) + chunks = tuple([1, 1] + list(chunks)) + a = out.create_dataset(name, shape=shape, chunks=chunks, dtype=dtype) + datasets.append({"path": name}) + for sidx, stuple in enumerate(groups(zin)): + for tidx, ttuple in enumerate(groups(stuple[1])): + resolutions = arrays(ttuple[1]) + a[tidx, sidx, :, :, :] = resolutions[idx][1] +out.attrs["multiscales"] = [ + { + "version": "0.1", + "datasets": datasets, + } +] + diff --git a/data-conversion/to_ome_zarr.py b/data_conversion/to_ome_zarr.py similarity index 88% rename from data-conversion/to_ome_zarr.py rename to data_conversion/to_ome_zarr.py index 4727309..3466178 100644 --- a/data-conversion/to_ome_zarr.py +++ b/data_conversion/to_ome_zarr.py @@ -5,7 +5,7 @@ from concurrent import futures import zarr -import z5py +import z5py # NOTE: once the issue with zarr opening n5 groups is resolved, we can also use zarr for reading the n5s from tqdm import tqdm from z5py.util import blocking @@ -81,7 +81,7 @@ def is_chunk(some_name): # expand the 2 leading dimensions of the zarr dataset def expand_dims(ds_path, use_nested_store): attrs_file = os.path.join(ds_path, '.zarray') - assert os.path.exists(attrs_file) + assert os.path.exists(attrs_file), attrs_file if use_nested_store: expand_chunks_nested(ds_path) @@ -103,7 +103,8 @@ def expand_dims(ds_path, use_nested_store): json.dump(attrs, f, indent=2, sort_keys=True) -def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): +def convert_bdv_n5(in_path, out_path, out_key, + use_nested_store, n_threads): with z5py.File(in_path, mode='r') as f_in, zarr.open(out_path, mode='w') as f_out: # we assume bdv.n5 file format and only a single channel scale_group = f_in['setup0/timepoint0'] @@ -114,9 +115,9 @@ def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): ds_in = scale_group[name] if use_nested_store: - store = zarr.NestedDirectoryStore(os.path.join(out_path, name)) + store = zarr.NestedDirectoryStore(os.path.join(out_path, out_key, name)) else: - store = zarr.DirectoryStore(os.path.join(out_path, name)) + store = zarr.DirectoryStore(os.path.join(out_path, out_key, name)) ds_out = zarr.zeros(store=store, shape=ds_in.shape, chunks=ds_in.chunks, @@ -126,7 +127,7 @@ def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): # this invalidates the shape and chunk attributes of our dataset, # so we can't use it after that (but we also don't need to) - expand_dims(os.path.join(out_path, name), use_nested_store) + expand_dims(os.path.join(out_path, out_key, name), use_nested_store) f_out.attrs['multiscalles'] = [ { @@ -140,8 +141,9 @@ def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): parser = argparse.ArgumentParser() parser.add_argument('inp', type=str) parser.add_argument('outp', type=str) + parser.add_argument('outk', type=str) parser.add_argument('--use_nested_store', type=int, default=0) 
parser.add_argument('--n_threads', type=int, default=8) args = parser.parse_args() - convert_bdv_n5(args.inp, args.outp, bool(args.use_nested_store), args.n_threads) + convert_bdv_n5(args.inp, args.outp, args.outk, bool(args.use_nested_store), args.n_threads) diff --git a/upload_to_s3.py b/upload_to_s3.py new file mode 100644 index 0000000..4640904 --- /dev/null +++ b/upload_to_s3.py @@ -0,0 +1 @@ +# TODO From 429c1d2c5771aada97eb23519e3a7d86859e1033 Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Fri, 20 Nov 2020 13:09:56 +0100 Subject: [PATCH 5/5] Convert initial data to ome.zarr format --- .gitignore | 2 + create_data.py | 110 ++++++++++++++++++++++++++++++--- data/em-cells.xml | 43 +++++++++++++ data/em-raw.xml | 43 +++++++++++++ data/images.json | 35 +++++++++++ data/prospr-myosin.xml | 43 +++++++++++++ data_conversion/to_ome_zarr.py | 57 +++++++++++++---- upload_to_s3.py | 1 - 8 files changed, 313 insertions(+), 21 deletions(-) create mode 100644 data/em-cells.xml create mode 100644 data/em-raw.xml create mode 100644 data/images.json create mode 100644 data/prospr-myosin.xml delete mode 100644 upload_to_s3.py diff --git a/.gitignore b/.gitignore index c18dd8d..7df2e81 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ __pycache__/ +*.n5 +*.ome.zarr diff --git a/create_data.py b/create_data.py index 23b9462..051e126 100644 --- a/create_data.py +++ b/create_data.py @@ -1,28 +1,120 @@ +import json import os + +import z5py from data_conversion import convert_bdv_n5 +from pybdv.metadata import (get_size, get_resolution, + write_size_and_resolution, + write_affine) + +from mobie.xml_utils import copy_xml_as_n5_s3 +from mobie.metadata.image_dict import default_layer_setting + +IMAGE_DICT = './data/images.json' + + +def write_metadata(in_xml, out_xml, out_path): + bucket_name = 'i2k-2020' + path_in_bucket = os.path.split(out_path)[1] + copy_xml_as_n5_s3(in_xml, out_xml, + service_endpoint='https://s3.embl.de', + bucket_name=bucket_name, + path_in_bucket=path_in_bucket, + authentication='Anonymous', + bdv_type='bdv.zarr.s3') + + with z5py.File(out_path, 'r') as f: + shape = f['setup0/timepoint0/s0'].shape[2:] + + # check if we need to update the shape and resolution + exp_shape = get_size(out_xml, setup_id=0) + if shape != exp_shape: + resolution = get_resolution(out_xml, setup_id=0) + scale_factor = [float(esh) / sh for sh, esh in zip(shape, exp_shape)] + resolution = [round(res * sf, 2) for res, sf in zip(resolution, scale_factor)] + print("Updating shape and resolution to:") + print(shape) + print(resolution) + + write_size_and_resolution(out_xml, setup_id=0, + size=shape, resolution=resolution) + + # make transformation the hacky way ... + dz, dy, dx = resolution + oz, oy, ox = 0., 0., 0. 
+ trafo = '{} 0.0 0.0 {} 0.0 {} 0.0 {} 0.0 0.0 {} {}'.format(dx, ox, + dy, oy, + dz, oz) + trafo = list(map(float, trafo.split(' '))) + write_affine(out_xml, setup_id=0, affine=trafo, overwrite=True) + + +def add_to_image_dict(name, layer_type, xml_path): + settings = default_layer_setting(layer_type) + storage = {"remote": os.path.split(xml_path)[1]} + settings.update({"storage": storage}) + + if os.path.exists(IMAGE_DICT): + with open(IMAGE_DICT) as f: + image_dict = json.load(f) + else: + image_dict = {} + + image_dict[name] = settings + with open(IMAGE_DICT, 'w') as f: + json.dump(image_dict, f, indent=2, sort_keys=True) + + +def add_volume(in_path, vol_name, layer_type, start_scale=0): + out_path = os.path.join('data', f'{vol_name}.ome.zarr') + + # convert to ome zarr + convert_bdv_n5(in_path=in_path, + out_path=out_path, + out_key='setup0/timepoint0', + vol_name=vol_name, + use_nested_store=False, + n_threads=8, + start_scale=start_scale) + + # create the bdv.xml + in_xml = in_path.replace('.n5', '.xml') + out_xml = os.path.join('data', f'{vol_name}.xml') + write_metadata(in_xml, out_xml, out_path) + + add_to_image_dict(vol_name, layer_type, out_xml) # def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): # add the myosin prospr data def add_myosin(): + print("Add myosin") in_path = os.path.join('/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/0.6.3', - 'images/local/prospr-6dpf-1-whole-non-muscle-mhc.n5') - convert_bdv_n5(in_path=in_path, - out_path='platy.ome.zarr', - out_key='prospr-myosin', - use_nested_store=False, - n_threads=4) + 'images/local/prospr-6dpf-1-whole-mhcl4.n5') + add_volume(in_path, vol_name='prospr-myosin', layer_type='image') # add the em raw data def add_raw(): - pass + print("Add raw") + in_path = '/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/rawdata/sbem-6dpf-1-whole-raw.n5' + add_volume(in_path, vol_name='em-raw', layer_type='image', start_scale=3) # add the em cell segmentation def add_seg(): - pass + print("Add cells") + in_path = os.path.join('/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/1.0.1', + 'images/local/sbem-6dpf-1-whole-segmented-cells.n5') + add_volume(in_path, vol_name='em-cells', layer_type='segmentation', start_scale=2) -if __name__ == '__main__': +def add_all_volumes(): + os.makedirs('./data', exist_ok=True) add_myosin() + add_raw() + add_seg() + + +if __name__ == '__main__': + add_all_volumes() diff --git a/data/em-cells.xml b/data/em-cells.xml new file mode 100644 index 0000000..c6cdff1 --- /dev/null +++ b/data/em-cells.xml @@ -0,0 +1,43 @@ + + . + + + + + 0 + 0 + + + + 0 + Setup0 + 3438 3240 2854 + + micrometer + 0.08 0.08 0.1 + + + 0 + + + + + 0 + 0 + + + em-cells.ome.zarr + us-west-2 + https://s3.embl.de + i2k-2020 + Anonymous + + + + + + 0.08 0.0 0.0 0.0 0.0 0.08 0.0 0.0 0.0 0.0 0.1 0.0 + + + + diff --git a/data/em-raw.xml b/data/em-raw.xml new file mode 100644 index 0000000..572e728 --- /dev/null +++ b/data/em-raw.xml @@ -0,0 +1,43 @@ + + . 
+ + + + + 1 + 1 + + + + 0 + channel 1 + 3438 3240 2854 + + micrometer + 0.08 0.08 0.1 + + + 1 + + + + + 0 + 0 + + + em-raw.ome.zarr + us-west-2 + https://s3.embl.de + i2k-2020 + Anonymous + + + + + + 0.08 0.0 0.0 0.0 0.0 0.08 0.0 0.0 0.0 0.0 0.1 0.0 + + + + diff --git a/data/images.json b/data/images.json new file mode 100644 index 0000000..6254a0b --- /dev/null +++ b/data/images.json @@ -0,0 +1,35 @@ +{ + "em-cells": { + "color": "randomFromGlasbey", + "contrastLimits": [ + 0.0, + 1000.0 + ], + "storage": { + "remote": "em-cells.xml" + }, + "type": "segmentation" + }, + "em-raw": { + "color": "white", + "contrastLimits": [ + 0.0, + 255.0 + ], + "storage": { + "remote": "em-raw.xml" + }, + "type": "image" + }, + "prospr-myosin": { + "color": "white", + "contrastLimits": [ + 0.0, + 255.0 + ], + "storage": { + "remote": "prospr-myosin.xml" + }, + "type": "image" + } +} \ No newline at end of file diff --git a/data/prospr-myosin.xml b/data/prospr-myosin.xml new file mode 100644 index 0000000..94a5cd0 --- /dev/null +++ b/data/prospr-myosin.xml @@ -0,0 +1,43 @@ + + . + + + + + 0 + 0 + + + + 0 + Setup0 + 500 471 519 + + micrometer + 0.55 0.55 0.55 + + + 0 + + + + + 0 + 0 + + + prospr-myosin.ome.zarr + us-west-2 + https://s3.embl.de + i2k-2020 + Anonymous + + + + + + 0.55 0.0 0.0 0.0 0.0 0.55 0.0 0.0 0.0 0.0 0.55 0.0 + + + + diff --git a/data_conversion/to_ome_zarr.py b/data_conversion/to_ome_zarr.py index 3466178..9e07117 100644 --- a/data_conversion/to_ome_zarr.py +++ b/data_conversion/to_ome_zarr.py @@ -8,9 +8,10 @@ import z5py # NOTE: once the issue with zarr opening n5 groups is resolved, we can also use zarr for reading the n5s from tqdm import tqdm from z5py.util import blocking +from pybdv.util import get_scale_factors, relative_to_absolute_scale_factors -def copy_dataset(ds_in, ds_out, n_threads): +def copy_dataset(ds_in, ds_out, n_threads, desc): """ Copy input to output dataset in parallel. 
Arguments: @@ -38,7 +39,7 @@ def _copy_chunk(block): ds_out[block] = data_in with futures.ThreadPoolExecutor(n_threads) as tp: - list(tqdm(tp.map(_copy_chunk, blocks), total=n_blocks)) + list(tqdm(tp.map(_copy_chunk, blocks), total=n_blocks, desc=desc)) def is_int(some_string): @@ -103,36 +104,70 @@ def expand_dims(ds_path, use_nested_store): json.dump(attrs, f, indent=2, sort_keys=True) -def convert_bdv_n5(in_path, out_path, out_key, - use_nested_store, n_threads): +def normalize_scales(scales, start_scale): + scales = scales[start_scale:] + normalized_scales = [] + + # to relative scale factors + s_prev = scales[0] + for scale in scales: + norm_scale = [s / sp for s, sp in zip(scale, s_prev)] + normalized_scales.append(norm_scale) + s_prev = scale + + # to absolute scale factors + normalized_scales = relative_to_absolute_scale_factors(normalized_scales) + return normalized_scales + + +def convert_bdv_n5(in_path, out_path, out_key, vol_name, + use_nested_store, n_threads, start_scale=0): + + scales = get_scale_factors(in_path, setup_id=0) + if start_scale > 0: + scales = normalize_scales(scales, start_scale) + with z5py.File(in_path, mode='r') as f_in, zarr.open(out_path, mode='w') as f_out: # we assume bdv.n5 file format and only a single channel scale_group = f_in['setup0/timepoint0'] scale_names = [elem for elem in scale_group] scale_names.sort() - for name in scale_names: + if start_scale > 0: + scale_names = scale_names[start_scale:] + + g_out = f_out.create_group(out_key) + out_names = [] + + for sid, name in enumerate(scale_names): ds_in = scale_group[name] + out_name = f"s{sid}" if use_nested_store: - store = zarr.NestedDirectoryStore(os.path.join(out_path, out_key, name)) + store = zarr.NestedDirectoryStore(os.path.join(out_path, out_key, out_name)) else: - store = zarr.DirectoryStore(os.path.join(out_path, out_key, name)) + store = zarr.DirectoryStore(os.path.join(out_path, out_key, out_name)) ds_out = zarr.zeros(store=store, shape=ds_in.shape, chunks=ds_in.chunks, dtype=ds_in.dtype) - copy_dataset(ds_in, ds_out, n_threads) + desc = f"Copy {name} to {out_name}" + copy_dataset(ds_in, ds_out, n_threads, desc) # this invalidates the shape and chunk attributes of our dataset, # so we can't use it after that (but we also don't need to) - expand_dims(os.path.join(out_path, out_key, name), use_nested_store) + expand_dims(os.path.join(out_path, out_key, out_name), use_nested_store) + out_names.append(out_name) + + assert len(out_names) == len(scales) - f_out.attrs['multiscalles'] = [ + g_out.attrs['multiscales'] = [ { + "name": vol_name, "version": "0.1", - "datasets": [{"path": name} for name in scale_names] + "datasets": [{"path": name} for name in out_names], + "scales": [scale[::-1] for scale in scales] } ] diff --git a/upload_to_s3.py b/upload_to_s3.py deleted file mode 100644 index 4640904..0000000 --- a/upload_to_s3.py +++ /dev/null @@ -1 +0,0 @@ -# TODO
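
Usage note (not part of the patch series): a minimal sketch of how the converter, as it stands after PATCH 5/5, might be driven and its output verified. The input path and volume name below are hypothetical placeholders; the keyword arguments follow the final signature of convert_bdv_n5 in data_conversion/to_ome_zarr.py, and the 5D shape check mirrors data_conversion/check_result.py.

import zarr
from data_conversion import convert_bdv_n5

# hypothetical bdv.n5 input; any single-setup, single-timepoint bdv.n5 volume should work
convert_bdv_n5(in_path='/path/to/example-volume.n5',
               out_path='example-volume.ome.zarr',
               out_key='setup0/timepoint0',
               vol_name='example-volume',
               use_nested_store=False,
               n_threads=8,
               start_scale=0)

# the converter writes OME "multiscales" metadata on the output group and
# expands every scale dataset to 5 dimensions (t, c, z, y, x)
g = zarr.open('example-volume.ome.zarr', mode='r')['setup0/timepoint0']
print(g.attrs['multiscales'][0]['datasets'])  # e.g. [{'path': 's0'}, {'path': 's1'}, ...]
for name, ds in g.items():
    assert ds.ndim == 5, (name, ds.shape)

With use_nested_store=False the chunks are laid out flat ("0.0.z.y.x" files), matching the default zarr DirectoryStore; passing use_nested_store=True instead produces the nested layout via zarr.NestedDirectoryStore, as handled by expand_chunks_flat/expand_chunks_nested in the patches above.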