diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7df2e81
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+*.n5
+*.ome.zarr
diff --git a/create_data.py b/create_data.py
new file mode 100644
index 0000000..051e126
--- /dev/null
+++ b/create_data.py
@@ -0,0 +1,120 @@
+import json
+import os
+
+import z5py
+from data_conversion import convert_bdv_n5
+from pybdv.metadata import (get_size, get_resolution,
+                            write_size_and_resolution,
+                            write_affine)
+
+from mobie.xml_utils import copy_xml_as_n5_s3
+from mobie.metadata.image_dict import default_layer_setting
+
+IMAGE_DICT = './data/images.json'
+
+
+def write_metadata(in_xml, out_xml, out_path):
+    bucket_name = 'i2k-2020'
+    path_in_bucket = os.path.split(out_path)[1]
+    copy_xml_as_n5_s3(in_xml, out_xml,
+                      service_endpoint='https://s3.embl.de',
+                      bucket_name=bucket_name,
+                      path_in_bucket=path_in_bucket,
+                      authentication='Anonymous',
+                      bdv_type='bdv.zarr.s3')
+
+    # shape of the converted volume without the leading (t, c) axes
+    with z5py.File(out_path, 'r') as f:
+        shape = f['setup0/timepoint0/s0'].shape[2:]
+
+    # check if we need to update the shape and resolution
+    # (compare as lists: shape is a tuple, the xml size is a list)
+    exp_shape = get_size(out_xml, setup_id=0)
+    if list(shape) != list(exp_shape):
+        resolution = get_resolution(out_xml, setup_id=0)
+        scale_factor = [float(esh) / sh for sh, esh in zip(shape, exp_shape)]
+        resolution = [round(res * sf, 2) for res, sf in zip(resolution, scale_factor)]
+        print("Updating shape and resolution to:")
+        print(shape)
+        print(resolution)
+
+        write_size_and_resolution(out_xml, setup_id=0,
+                                  size=shape, resolution=resolution)
+
+        # make the affine transformation the hacky way: flatten the
+        # scale-and-offset matrix row by row in x, y, z order
+        dz, dy, dx = resolution
+        oz, oy, ox = 0., 0., 0.
+        trafo = '{} 0.0 0.0 {} 0.0 {} 0.0 {} 0.0 0.0 {} {}'.format(dx, ox,
+                                                                   dy, oy,
+                                                                   dz, oz)
+        trafo = list(map(float, trafo.split(' ')))
+        write_affine(out_xml, setup_id=0, affine=trafo, overwrite=True)
+
+
+def add_to_image_dict(name, layer_type, xml_path):
+    settings = default_layer_setting(layer_type)
+    storage = {"remote": os.path.split(xml_path)[1]}
+    settings.update({"storage": storage})
+
+    if os.path.exists(IMAGE_DICT):
+        with open(IMAGE_DICT) as f:
+            image_dict = json.load(f)
+    else:
+        image_dict = {}
+
+    image_dict[name] = settings
+    with open(IMAGE_DICT, 'w') as f:
+        json.dump(image_dict, f, indent=2, sort_keys=True)
+
+
+def add_volume(in_path, vol_name, layer_type, start_scale=0):
+    out_path = os.path.join('data', f'{vol_name}.ome.zarr')
+
+    # convert to ome.zarr
+    convert_bdv_n5(in_path=in_path,
+                   out_path=out_path,
+                   out_key='setup0/timepoint0',
+                   vol_name=vol_name,
+                   use_nested_store=False,
+                   n_threads=8,
+                   start_scale=start_scale)
+
+    # create the bdv.xml
+    in_xml = in_path.replace('.n5', '.xml')
+    out_xml = os.path.join('data', f'{vol_name}.xml')
+    write_metadata(in_xml, out_xml, out_path)
+
+    add_to_image_dict(vol_name, layer_type, out_xml)
+
+
+# add the myosin prospr data
+def add_myosin():
+    print("Add myosin")
+    in_path = os.path.join('/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/0.6.3',
+                           'images/local/prospr-6dpf-1-whole-mhcl4.n5')
+    add_volume(in_path, vol_name='prospr-myosin', layer_type='image')
+
+
+# add the em raw data
+def add_raw():
+    print("Add raw")
+    in_path = '/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/rawdata/sbem-6dpf-1-whole-raw.n5'
+    add_volume(in_path, vol_name='em-raw', layer_type='image', start_scale=3)
+
+
+# add the em cell segmentation
+def add_seg():
+    print("Add cells")
+    in_path = os.path.join('/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/1.0.1',
+                           'images/local/sbem-6dpf-1-whole-segmented-cells.n5')
+    add_volume(in_path, vol_name='em-cells', layer_type='segmentation', start_scale=2)
+
+
+def add_all_volumes():
+    os.makedirs('./data', exist_ok=True)
+    add_myosin()
+    add_raw()
+    add_seg()
+
+
+if __name__ == '__main__':
+    add_all_volumes()
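The `trafo` string assembled in `write_metadata` above is the row-major flattening of a 3x4 scale-and-offset matrix in x, y, z order, which is how BDV stores affine transformations. A small standalone sketch of the same construction (the voxel sizes are example values taken from the EM volumes above):

import numpy as np

# voxel size in z, y, x order plus a zero offset
dz, dy, dx = 0.1, 0.08, 0.08
oz, oy, ox = 0.0, 0.0, 0.0

# first three rows of the 4x4 affine matrix, flattened row by row
affine = np.array([
    [dx, 0.0, 0.0, ox],
    [0.0, dy, 0.0, oy],
    [0.0, 0.0, dz, oz],
])
trafo = affine.flatten().tolist()
print(trafo)  # [0.08, 0.0, 0.0, 0.0, 0.0, 0.08, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0]
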
diff --git a/data/em-cells.xml b/data/em-cells.xml
new file mode 100644
index 0000000..c6cdff1
--- /dev/null
+++ b/data/em-cells.xml
@@ -0,0 +1,43 @@
+<SpimData version="0.2">
+  <BasePath type="relative">.</BasePath>
+  <SequenceDescription>
+    <ViewSetups>
+      <Attributes name="channel">
+        <Channel>
+          <id>0</id>
+          <name>0</name>
+        </Channel>
+      </Attributes>
+      <ViewSetup>
+        <id>0</id>
+        <name>Setup0</name>
+        <size>3438 3240 2854</size>
+        <voxelSize>
+          <unit>micrometer</unit>
+          <size>0.08 0.08 0.1</size>
+        </voxelSize>
+        <attributes>
+          <channel>0</channel>
+        </attributes>
+      </ViewSetup>
+    </ViewSetups>
+    <Timepoints type="range">
+      <first>0</first>
+      <last>0</last>
+    </Timepoints>
+    <ImageLoader format="bdv.zarr.s3">
+      <Key>em-cells.ome.zarr</Key>
+      <SigningRegion>us-west-2</SigningRegion>
+      <ServiceEndpoint>https://s3.embl.de</ServiceEndpoint>
+      <BucketName>i2k-2020</BucketName>
+      <Authentication>Anonymous</Authentication>
+    </ImageLoader>
+  </SequenceDescription>
+  <ViewRegistrations>
+    <ViewRegistration timepoint="0" setup="0">
+      <ViewTransform type="affine">
+        <affine>0.08 0.0 0.0 0.0 0.0 0.08 0.0 0.0 0.0 0.0 0.1 0.0</affine>
+      </ViewTransform>
+    </ViewRegistration>
+  </ViewRegistrations>
+</SpimData>
diff --git a/data/em-raw.xml b/data/em-raw.xml
new file mode 100644
index 0000000..572e728
--- /dev/null
+++ b/data/em-raw.xml
@@ -0,0 +1,43 @@
+<SpimData version="0.2">
+  <BasePath type="relative">.</BasePath>
+  <SequenceDescription>
+    <ViewSetups>
+      <Attributes name="channel">
+        <Channel>
+          <id>1</id>
+          <name>1</name>
+        </Channel>
+      </Attributes>
+      <ViewSetup>
+        <id>0</id>
+        <name>channel 1</name>
+        <size>3438 3240 2854</size>
+        <voxelSize>
+          <unit>micrometer</unit>
+          <size>0.08 0.08 0.1</size>
+        </voxelSize>
+        <attributes>
+          <channel>1</channel>
+        </attributes>
+      </ViewSetup>
+    </ViewSetups>
+    <Timepoints type="range">
+      <first>0</first>
+      <last>0</last>
+    </Timepoints>
+    <ImageLoader format="bdv.zarr.s3">
+      <Key>em-raw.ome.zarr</Key>
+      <SigningRegion>us-west-2</SigningRegion>
+      <ServiceEndpoint>https://s3.embl.de</ServiceEndpoint>
+      <BucketName>i2k-2020</BucketName>
+      <Authentication>Anonymous</Authentication>
+    </ImageLoader>
+  </SequenceDescription>
+  <ViewRegistrations>
+    <ViewRegistration timepoint="0" setup="0">
+      <ViewTransform type="affine">
+        <affine>0.08 0.0 0.0 0.0 0.0 0.08 0.0 0.0 0.0 0.0 0.1 0.0</affine>
+      </ViewTransform>
+    </ViewRegistration>
+  </ViewRegistrations>
+</SpimData>
diff --git a/data/images.json b/data/images.json
new file mode 100644
index 0000000..6254a0b
--- /dev/null
+++ b/data/images.json
@@ -0,0 +1,35 @@
+{
+  "em-cells": {
+    "color": "randomFromGlasbey",
+    "contrastLimits": [
+      0.0,
+      1000.0
+    ],
+    "storage": {
+      "remote": "em-cells.xml"
+    },
+    "type": "segmentation"
+  },
+  "em-raw": {
+    "color": "white",
+    "contrastLimits": [
+      0.0,
+      255.0
+    ],
+    "storage": {
+      "remote": "em-raw.xml"
+    },
+    "type": "image"
+  },
+  "prospr-myosin": {
+    "color": "white",
+    "contrastLimits": [
+      0.0,
+      255.0
+    ],
+    "storage": {
+      "remote": "prospr-myosin.xml"
+    },
+    "type": "image"
+  }
+}
\ No newline at end of file
diff --git a/data/prospr-myosin.xml b/data/prospr-myosin.xml
new file mode 100644
index 0000000..94a5cd0
--- /dev/null
+++ b/data/prospr-myosin.xml
@@ -0,0 +1,43 @@
+<SpimData version="0.2">
+  <BasePath type="relative">.</BasePath>
+  <SequenceDescription>
+    <ViewSetups>
+      <Attributes name="channel">
+        <Channel>
+          <id>0</id>
+          <name>0</name>
+        </Channel>
+      </Attributes>
+      <ViewSetup>
+        <id>0</id>
+        <name>Setup0</name>
+        <size>500 471 519</size>
+        <voxelSize>
+          <unit>micrometer</unit>
+          <size>0.55 0.55 0.55</size>
+        </voxelSize>
+        <attributes>
+          <channel>0</channel>
+        </attributes>
+      </ViewSetup>
+    </ViewSetups>
+    <Timepoints type="range">
+      <first>0</first>
+      <last>0</last>
+    </Timepoints>
+    <ImageLoader format="bdv.zarr.s3">
+      <Key>prospr-myosin.ome.zarr</Key>
+      <SigningRegion>us-west-2</SigningRegion>
+      <ServiceEndpoint>https://s3.embl.de</ServiceEndpoint>
+      <BucketName>i2k-2020</BucketName>
+      <Authentication>Anonymous</Authentication>
+    </ImageLoader>
+  </SequenceDescription>
+  <ViewRegistrations>
+    <ViewRegistration timepoint="0" setup="0">
+      <ViewTransform type="affine">
+        <affine>0.55 0.0 0.0 0.0 0.0 0.55 0.0 0.0 0.0 0.0 0.55 0.0</affine>
+      </ViewTransform>
+    </ViewRegistration>
+  </ViewRegistrations>
+</SpimData>
diff --git a/data_conversion/__init__.py b/data_conversion/__init__.py
new file mode 100644
index 0000000..e28a454
--- /dev/null
+++ b/data_conversion/__init__.py
@@ -0,0 +1 @@
+from .to_ome_zarr import convert_bdv_n5
diff --git a/data_conversion/check_result.py b/data_conversion/check_result.py
new file mode 100644
index 0000000..1924db7
--- /dev/null
+++ b/data_conversion/check_result.py
@@ -0,0 +1,22 @@
+import sys
+import zarr
+
+
+def check_result(path, check_data):
+    with zarr.open(path, mode='r') as f:
+        for name, ds in f.items():
+            shape = ds.shape
+            chunks = ds.chunks
+            assert len(shape) == len(chunks) == 5
+            print(name, shape, chunks)
+
+            if check_data:
+                data = ds[:]
+                # print(data[0, 0, :10, :10, :10])
+                assert data.shape == shape
+
+    print("All tests passed")
+
+
+path = sys.argv[1]
+check_result(path, True)
diff --git a/data_conversion/joshs_script.py b/data_conversion/joshs_script.py
new file mode 100644
index 0000000..32cebe9
--- /dev/null
+++ b/data_conversion/joshs_script.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+# This assumes that n5-copy has already been used
+
+import argparse
+import zarr
+
+parser = argparse.ArgumentParser()
+parser.add_argument("input")
+parser.add_argument("output")
+ns = parser.parse_args()
+
+zin = zarr.open(ns.input)
+
+sizes = []
+
+def groups(z):
+    rv = sorted(list(z.groups()))
+    assert rv
+    assert not list(z.arrays())
+    return rv
+
+def arrays(z):
+    rv = sorted(list(z.arrays()))
+    assert rv
+    assert not list(z.groups())
+    return rv
+
+setups = groups(zin)
+assert len(setups) == 1  # TODO: multiple channels?
+
+for sname, setup in setups:
+    timepoints = groups(setup)
+    for tname, timepoint in timepoints:
+        resolutions = arrays(timepoint)
+        for idx, rtuple in enumerate(resolutions):
+            rname, resolution = rtuple
+            try:
+                expected = sizes[idx]
+                assert expected[0] == rname
+                assert expected[1] == resolution.shape
+                assert expected[2] == resolution.chunks
+                assert expected[3] == resolution.dtype
+            except (IndexError, AssertionError):
+                sizes.append((rname,
+                              resolution.shape,
+                              resolution.chunks,
+                              resolution.dtype))
+
+
+datasets = []
+out = zarr.open(ns.output, mode="w")
+
+for idx, size in enumerate(sizes):
+    name, shape, chunks, dtype = size
+    shape = tuple([len(timepoints), len(setups)] + list(shape))
+    chunks = tuple([1, 1] + list(chunks))
+    a = out.create_dataset(name, shape=shape, chunks=chunks, dtype=dtype)
+    datasets.append({"path": name})
+    for sidx, stuple in enumerate(groups(zin)):
+        for tidx, ttuple in enumerate(groups(stuple[1])):
+            resolutions = arrays(ttuple[1])
+            a[tidx, sidx, :, :, :] = resolutions[idx][1]
+out.attrs["multiscales"] = [
+    {
+        "version": "0.1",
+        "datasets": datasets,
+    }
+]
+
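Both converters produce 5D (t, c, z, y, x) arrays together with a `multiscales` entry in the zarr attributes (written at the output root by `joshs_script.py`, and at the `setup0/timepoint0` group by `to_ome_zarr.py` below). A minimal sketch of inspecting one of the volumes written by `create_data.py` above (adjust the path for other volumes):

import zarr

# open the converted volume read-only and go to the multiscale group
root = zarr.open('data/em-raw.ome.zarr', mode='r')
group = root['setup0/timepoint0']

# each multiscales entry lists its scale levels under 'datasets'
for entry in group.attrs['multiscales']:
    for ds in entry['datasets']:
        arr = group[ds['path']]
        print(ds['path'], arr.shape, arr.chunks)  # 5d: (1, 1, z, y, x)
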
diff --git a/data_conversion/to_ome_zarr.py b/data_conversion/to_ome_zarr.py
new file mode 100644
index 0000000..9e07117
--- /dev/null
+++ b/data_conversion/to_ome_zarr.py
@@ -0,0 +1,184 @@
+import argparse
+import os
+import json
+import shutil
+from concurrent import futures
+
+import zarr
+import z5py  # NOTE: once the issue with zarr opening n5 groups is resolved, we can also use zarr for reading the n5s
+from tqdm import tqdm
+from z5py.util import blocking
+from pybdv.util import get_scale_factors, relative_to_absolute_scale_factors
+
+
+def copy_dataset(ds_in, ds_out, n_threads, desc):
+    """ Copy input to output dataset in parallel.
+
+    Arguments:
+        ds_in [dataset] - input dataset (h5py, z5py or zarr dataset)
+        ds_out [dataset] - output dataset (h5py, z5py or zarr dataset)
+        n_threads [int] - number of threads used for copying
+        desc [str] - description shown in the progress bar
+    """
+    assert ds_in.shape == ds_out.shape
+    # this is only thread-safe for identical chunk sizes!
+    assert ds_in.chunks == ds_out.chunks
+
+    blocks = blocking(ds_in.shape, ds_in.chunks)
+    blocks = list(blocks)
+    n_blocks = len(blocks)
+
+    def _copy_chunk(block):
+        # make sure we don't copy empty blocks; it is not clear
+        # whether zarr avoids writing them out by itself
+        data_in = ds_in[block]
+        if data_in.sum() == 0:
+            return
+        ds_out[block] = data_in
+
+    with futures.ThreadPoolExecutor(n_threads) as tp:
+        list(tqdm(tp.map(_copy_chunk, blocks), total=n_blocks, desc=desc))
+
+
+def is_int(some_string):
+    try:
+        int(some_string)
+        return True
+    except ValueError:
+        return False
+
+
+def expand_chunks_nested(ds_path):
+    chunk_files = os.listdir(ds_path)
+    chunk_files = [cf for cf in chunk_files if is_int(cf)]
+
+    dim0 = os.path.join(ds_path, 'tmp0')
+    dim1 = os.path.join(dim0, '0')
+
+    os.makedirs(dim1)
+    for cf in chunk_files:
+        shutil.move(os.path.join(ds_path, cf), os.path.join(dim1, cf))
+
+    shutil.move(dim0, os.path.join(ds_path, '0'))
+
+
+def expand_chunks_flat(ds_path):
+    def is_chunk(some_name):
+        chunk_idx = some_name.split('.')
+        return all(map(is_int, chunk_idx)) and len(chunk_idx) > 0
+
+    chunk_files = os.listdir(ds_path)
+    chunk_files = [cf for cf in chunk_files if is_chunk(cf)]
+
+    for cf in chunk_files:
+        shutil.move(os.path.join(ds_path, cf),
+                    os.path.join(ds_path, '0.0.' + cf))
+
+
+# NOTE this works because zarr chunks don't have a header
+# expand the 2 leading dimensions of the zarr dataset
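+# e.g. a flat chunk file "3.7.1" is renamed to "0.0.3.7.1" and a nested
+# chunk directory "3/7/1" ends up at "0/0/3/7/1", so the existing data sits
+# at index (0, 0) of the two new singleton axes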
+def expand_dims(ds_path, use_nested_store):
+    attrs_file = os.path.join(ds_path, '.zarray')
+    assert os.path.exists(attrs_file), attrs_file
+
+    if use_nested_store:
+        expand_chunks_nested(ds_path)
+    else:
+        expand_chunks_flat(ds_path)
+
+    with open(attrs_file) as f:
+        attrs = json.load(f)
+
+    shape = attrs['shape']
+    shape = [1, 1] + shape
+    attrs['shape'] = shape
+
+    chunks = attrs['chunks']
+    chunks = [1, 1] + chunks
+    attrs['chunks'] = chunks
+
+    with open(attrs_file, 'w') as f:
+        json.dump(attrs, f, indent=2, sort_keys=True)
+
+
+def normalize_scales(scales, start_scale):
+    scales = scales[start_scale:]
+    normalized_scales = []
+
+    # to relative scale factors
+    s_prev = scales[0]
+    for scale in scales:
+        norm_scale = [s / sp for s, sp in zip(scale, s_prev)]
+        normalized_scales.append(norm_scale)
+        s_prev = scale
+
+    # to absolute scale factors
+    normalized_scales = relative_to_absolute_scale_factors(normalized_scales)
+    return normalized_scales
+
+
+def convert_bdv_n5(in_path, out_path, out_key, vol_name,
+                   use_nested_store, n_threads, start_scale=0):
+
+    scales = get_scale_factors(in_path, setup_id=0)
+    if start_scale > 0:
+        scales = normalize_scales(scales, start_scale)
+
+    with z5py.File(in_path, mode='r') as f_in, zarr.open(out_path, mode='w') as f_out:
+        # we assume bdv.n5 file format and only a single channel
+        scale_group = f_in['setup0/timepoint0']
+        scale_names = [elem for elem in scale_group]
+        # sort by scale level, not lexicographically, so that s10 follows s9
+        scale_names.sort(key=lambda name: int(name.lstrip('s')))
+
+        if start_scale > 0:
+            scale_names = scale_names[start_scale:]
+
+        g_out = f_out.create_group(out_key)
+        out_names = []
+
+        for sid, name in enumerate(scale_names):
+            ds_in = scale_group[name]
+            out_name = f"s{sid}"
+
+            if use_nested_store:
+                store = zarr.NestedDirectoryStore(os.path.join(out_path, out_key, out_name))
+            else:
+                store = zarr.DirectoryStore(os.path.join(out_path, out_key, out_name))
+            ds_out = zarr.zeros(store=store,
+                                shape=ds_in.shape,
+                                chunks=ds_in.chunks,
+                                dtype=ds_in.dtype)
+
+            desc = f"Copy {name} to {out_name}"
+            copy_dataset(ds_in, ds_out, n_threads, desc)
+
+            # this invalidates the shape and chunk attributes of our dataset,
+            # so we can't use it after that (but we also don't need to)
+            expand_dims(os.path.join(out_path, out_key, out_name), use_nested_store)
+            out_names.append(out_name)
+
+        assert len(out_names) == len(scales)
+
+        g_out.attrs['multiscales'] = [
+            {
+                "name": vol_name,
+                "version": "0.1",
+                "datasets": [{"path": name} for name in out_names],
+                "scales": [scale[::-1] for scale in scales]
+            }
+        ]
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('inp', type=str)
+    parser.add_argument('outp', type=str)
+    parser.add_argument('outk', type=str)
+    parser.add_argument('name', type=str)
+    parser.add_argument('--use_nested_store', type=int, default=0)
+    parser.add_argument('--n_threads', type=int, default=8)
+
+    args = parser.parse_args()
+    convert_bdv_n5(args.inp, args.outp, args.outk, args.name,
+                   bool(args.use_nested_store), args.n_threads)
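A minimal usage sketch for the converter (the input path and volume name are placeholders; `create_data.py` above drives `convert_bdv_n5` with the same arguments):

from data_conversion import convert_bdv_n5

# convert a single-channel bdv.n5 volume to a 5d (t, c, z, y, x) ome.zarr
convert_bdv_n5(in_path='/path/to/volume.n5',
               out_path='volume.ome.zarr',
               out_key='setup0/timepoint0',
               vol_name='volume',
               use_nested_store=False,
               n_threads=8,
               start_scale=0)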