"""Build the platy OME-Zarr example container from the platy-browser n5 data.

Provenance: this is ``create_data.py`` as introduced by commit
208300c9ee60290cab20a6dee3dd805248f42fe5 ("Add platy data conversion script",
Constantin Pape, Wed, 18 Nov 2020). The same patch also adds a ``.gitignore``
containing ``__pycache__/`` and a ``data_conversion/__init__.py`` that
re-exports ``convert_bdv_n5`` from ``to_ome_zarr``.
"""
import os

from data_conversion import convert_bdv_n5

# Default location of the platy-browser data release used as input.
PLATY_DATA_ROOT = '/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/0.6.3'
# Path of the myosin prospr image, relative to the data root.
MYOSIN_N5 = 'images/local/prospr-6dpf-1-whole-non-muscle-mhc.n5'


def add_myosin(root=PLATY_DATA_ROOT, out_path='platy.ome.zarr', n_threads=4):
    """Add the myosin prospr data to the output container.

    Args:
        root: folder holding the platy-browser data release.
        out_path: path of the zarr container that is written.
        n_threads: number of threads handed to ``convert_bdv_n5``.

    The data is stored under the key ``'prospr-myosin'`` using a flat
    (non-nested) directory store. Called without arguments this behaves
    exactly like the original hard-coded version.
    """
    convert_bdv_n5(in_path=os.path.join(root, MYOSIN_N5),
                   out_path=out_path,
                   out_key='prospr-myosin',
                   use_nested_store=False,
                   n_threads=n_threads)


def add_raw():
    """Add the em raw data (placeholder, not implemented yet)."""
    pass


def add_seg():
    """Add the em cell segmentation (placeholder, not implemented yet)."""
    pass


if __name__ == '__main__':
    add_myosin()
#!/usr/bin/env python
"""Repack a ``setup/timepoint/resolution`` zarr hierarchy into 5D datasets.

Provenance: this is ``data_conversion/joshs_script.py`` as added by the
"Add platy data conversion script" patch (which also renames
``data-conversion/check_result.py`` to ``data_conversion/check_result.py``).

For every resolution level found below ``<setup>/<timepoint>/`` one dataset
is created in the output; its shape is ``(n_timepoints, n_setups) + shape``
and its chunks are ``(1, 1) + chunks`` of the source arrays. The dataset
names are recorded in the ``multiscales`` attribute (version "0.1").
"""

# This assumes that n5-copy has already been used

import argparse
from operator import itemgetter


def groups(z):
    """Return the ``(name, group)`` children of ``z`` sorted by name.

    Raises:
        ValueError: if ``z`` has no sub-groups or also contains arrays.
    """
    rv = sorted(z.groups(), key=itemgetter(0))
    if not rv:
        raise ValueError("expected at least one sub-group")
    if list(z.arrays()):
        raise ValueError("expected only sub-groups, but found arrays as well")
    return rv


def arrays(z):
    """Return the ``(name, array)`` children of ``z`` sorted by name.

    Raises:
        ValueError: if ``z`` has no arrays or also contains sub-groups.
    """
    rv = sorted(z.arrays(), key=itemgetter(0))
    if not rv:
        raise ValueError("expected at least one array")
    if list(z.groups()):
        raise ValueError("expected only arrays, but found sub-groups as well")
    return rv


def collect_sizes(zin):
    """Return one ``(name, shape, chunks, dtype)`` tuple per resolution level.

    Every timepoint has to expose the same levels. The original script
    compared them inside a bare ``except`` block, so a mismatch was silently
    appended as an additional level; here it is reported instead.

    Raises:
        ValueError: if there is not exactly one setup, or if a timepoint's
            levels differ from those of the first timepoint.
    """
    setups = groups(zin)
    if len(setups) != 1:  # TODO: multiple channels?
        raise ValueError(
            "expected exactly one setup, found {}".format(len(setups)))

    sizes = []
    for _, setup in setups:
        for tname, timepoint in groups(setup):
            resolutions = arrays(timepoint)
            for idx, (rname, resolution) in enumerate(resolutions):
                found = (rname, resolution.shape,
                         resolution.chunks, resolution.dtype)
                if idx == len(sizes):
                    # first time this level is seen: it becomes the reference
                    sizes.append(found)
                elif sizes[idx] != found:
                    raise ValueError(
                        "level {} of {} is {}, expected {}".format(
                            idx, tname, found, sizes[idx]))
            if len(resolutions) != len(sizes):
                # fewer levels than the reference would break the copy below
                raise ValueError(
                    "{} has {} levels, expected {}".format(
                        tname, len(resolutions), len(sizes)))
    return sizes


def main(argv=None):
    """Parse ``input`` / ``output`` from the command line and run the copy."""
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("output")
    ns = parser.parse_args(argv)

    # imported here so that the helpers above stay importable without zarr
    import zarr

    # read-only: a mistyped input path must fail instead of creating a store
    zin = zarr.open(ns.input, mode="r")
    sizes = collect_sizes(zin)
    setups = groups(zin)
    timepoints = groups(setups[0][1])

    datasets = []
    out = zarr.open(ns.output, mode="w")
    for idx, (name, shape, chunks, dtype) in enumerate(sizes):
        a = out.create_dataset(
            name,
            shape=(len(timepoints), len(setups)) + tuple(shape),
            chunks=(1, 1) + tuple(chunks),
            dtype=dtype,
        )
        datasets.append({"path": name})
        for sidx, (_, setup) in enumerate(setups):
            for tidx, (_, timepoint) in enumerate(groups(setup)):
                a[tidx, sidx, :, :, :] = arrays(timepoint)[idx][1]

    out.attrs["multiscales"] = [
        {
            "version": "0.1",
            "datasets": datasets,
        }
    ]


if __name__ == "__main__":
    main()
similarity index 88% rename from data-conversion/to_ome_zarr.py rename to data_conversion/to_ome_zarr.py index 4727309..3466178 100644 --- a/data-conversion/to_ome_zarr.py +++ b/data_conversion/to_ome_zarr.py @@ -5,7 +5,7 @@ from concurrent import futures import zarr -import z5py +import z5py # NOTE: once the issue with zarr opening n5 groups is resolved, we can also use zarr for reading the n5s from tqdm import tqdm from z5py.util import blocking @@ -81,7 +81,7 @@ def is_chunk(some_name): # expand the 2 leading dimensions of the zarr dataset def expand_dims(ds_path, use_nested_store): attrs_file = os.path.join(ds_path, '.zarray') - assert os.path.exists(attrs_file) + assert os.path.exists(attrs_file), attrs_file if use_nested_store: expand_chunks_nested(ds_path) @@ -103,7 +103,8 @@ def expand_dims(ds_path, use_nested_store): json.dump(attrs, f, indent=2, sort_keys=True) -def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): +def convert_bdv_n5(in_path, out_path, out_key, + use_nested_store, n_threads): with z5py.File(in_path, mode='r') as f_in, zarr.open(out_path, mode='w') as f_out: # we assume bdv.n5 file format and only a single channel scale_group = f_in['setup0/timepoint0'] @@ -114,9 +115,9 @@ def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): ds_in = scale_group[name] if use_nested_store: - store = zarr.NestedDirectoryStore(os.path.join(out_path, name)) + store = zarr.NestedDirectoryStore(os.path.join(out_path, out_key, name)) else: - store = zarr.DirectoryStore(os.path.join(out_path, name)) + store = zarr.DirectoryStore(os.path.join(out_path, out_key, name)) ds_out = zarr.zeros(store=store, shape=ds_in.shape, chunks=ds_in.chunks, @@ -126,7 +127,7 @@ def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): # this invalidates the shape and chunk attributes of our dataset, # so we can't use it after that (but we also don't need to) - expand_dims(os.path.join(out_path, name), use_nested_store) + 
expand_dims(os.path.join(out_path, out_key, name), use_nested_store) f_out.attrs['multiscalles'] = [ { @@ -140,8 +141,9 @@ def convert_bdv_n5(in_path, out_path, use_nested_store, n_threads): parser = argparse.ArgumentParser() parser.add_argument('inp', type=str) parser.add_argument('outp', type=str) + parser.add_argument('outk', type=str) parser.add_argument('--use_nested_store', type=int, default=0) parser.add_argument('--n_threads', type=int, default=8) args = parser.parse_args() - convert_bdv_n5(args.inp, args.outp, bool(args.use_nested_store), args.n_threads) + convert_bdv_n5(args.inp, args.outp, args.outk, bool(args.use_nested_store), args.n_threads) diff --git a/upload_to_s3.py b/upload_to_s3.py new file mode 100644 index 0000000..4640904 --- /dev/null +++ b/upload_to_s3.py @@ -0,0 +1 @@ +# TODO