
Merge pull request #10555 from mtekman/anndatafix
[20.09] AnnData Peek Fix
mvdbeek authored Nov 1, 2020
2 parents 3358585 + 69fe83a commit ab5e616
Showing 2 changed files with 77 additions and 76 deletions.
lib/galaxy/datatypes/binary.py (153 changes: 77 additions & 76 deletions)
@@ -938,6 +938,8 @@ class Anndata(H5):
     True
     >>> Anndata().sniff(get_test_fname('test.mz5'))
     False
+    >>> Anndata().sniff(get_test_fname('import.loom.krumsiek11.h5ad'))
+    True
     """
     file_ext = 'h5ad'
 
@@ -953,20 +955,20 @@ class Anndata(H5):
     MetadataElement(name="obs_names", desc="obs_names", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None)
     MetadataElement(name="obs_layers", desc="obs_layers", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None)
     MetadataElement(name="obs_count", default=0, desc="obs_count", readonly=True, visible=True, no_value=0)
-    MetadataElement(name="obs_size", default=0, desc="obs_size", readonly=True, visible=True, no_value=0)
+    MetadataElement(name="obs_size", default=-1, desc="obs_size", readonly=True, visible=True, no_value=0)
     MetadataElement(name="obsm_layers", desc="obsm_layers", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None)
     MetadataElement(name="obsm_count", default=0, desc="obsm_count", readonly=True, visible=True, no_value=0)
     MetadataElement(name="raw_var_layers", desc="raw_var_layers", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None)
     MetadataElement(name="raw_var_count", default=0, desc="raw_var_count", readonly=True, visible=True, no_value=0)
     MetadataElement(name="raw_var_size", default=0, desc="raw_var_size", readonly=True, visible=True, no_value=0)
     MetadataElement(name="var_layers", desc="var_layers", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None)
     MetadataElement(name="var_count", default=0, desc="var_count", readonly=True, visible=True, no_value=0)
-    MetadataElement(name="var_size", default=0, desc="var_size", readonly=True, visible=True, no_value=0)
+    MetadataElement(name="var_size", default=-1, desc="var_size", readonly=True, visible=True, no_value=0)
     MetadataElement(name="varm_layers", desc="varm_layers", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None)
     MetadataElement(name="varm_count", default=0, desc="varm_count", readonly=True, visible=True, no_value=0)
     MetadataElement(name="uns_layers", desc="uns_layers", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None)
     MetadataElement(name="uns_count", default=0, desc="uns_count", readonly=True, visible=True, no_value=0)
-    MetadataElement(name="shape", default=(), desc="shape", param=metadata.ListParameter, readonly=True, visible=True, no_value=(0, 0))
+    MetadataElement(name="shape", default=(-1, -1), desc="shape", param=metadata.ListParameter, readonly=True, visible=True, no_value=(0, 0))
 
     def sniff(self, filename):
         if super().sniff(filename):
@@ -979,79 +981,78 @@ def sniff(self, filename):
 
     def set_meta(self, dataset, overwrite=True, **kwd):
         super(Anndata, self).set_meta(dataset, overwrite=overwrite, **kwd)
-        try:
-            with h5py.File(dataset.file_name, 'r') as anndata_file:
-                dataset.metadata.title = util.unicodify(anndata_file.attrs.get('title'))
-                dataset.metadata.description = util.unicodify(anndata_file.attrs.get('description'))
-                dataset.metadata.url = util.unicodify(anndata_file.attrs.get('url'))
-                dataset.metadata.doi = util.unicodify(anndata_file.attrs.get('doi'))
-                dataset.creation_date = util.unicodify(anndata_file.attrs.get('creation_date'))
-                # none of the above appear to work in any dataset tested, but could be useful for future
-                # AnnData datasets
-
-                # all possible keys
-                dataset.metadata.layers_count = len(anndata_file)
-                dataset.metadata.layers_names = list(anndata_file.keys())
-
-                def _layercountsize(tmp, lennames=0):
-                    "From TMP and LENNAMES, return layers, their number, and the length of one of the layers (all equal)."
-                    if hasattr(tmp, 'dtype'):
-                        layers = [util.unicodify(x) for x in tmp.dtype.names]
-                        count = len(tmp.dtype)
-                        size = int(tmp.size)
-                    else:
-                        layers = [util.unicodify(x) for x in list(tmp.keys())]
-                        count = len(layers)
-                        size = lennames
-                    return (layers, count, size)
-
-                if 'obs' in dataset.metadata.layers_names:
-                    tmp = anndata_file["obs"]
-                    dataset.metadata.obs_names = [util.unicodify(x) for x in tmp["index"]]
-                    dataset.metadata.obs_layers, \
-                        dataset.metadata.obs_count, \
-                        dataset.metadata.obs_size = _layercountsize(tmp, len(dataset.metadata.obs_names))
-
-                if 'obsm' in dataset.metadata.layers_names:
-                    tmp = anndata_file["obsm"]
-                    dataset.metadata.obsm_layers, dataset.metadata.obsm_count, _ = _layercountsize(tmp)
-
-                if 'raw.var' in dataset.metadata.layers_names:
-                    tmp = anndata_file["raw.var"]
-                    # full set of genes would never need to be previewed
-                    # dataset.metadata.raw_var_names = tmp["index"]
-                    dataset.metadata.raw_var_layers, \
-                        dataset.metadata.raw_var_count, \
-                        dataset.metadata.raw_var_size = _layercountsize(tmp, len(tmp["index"]))
-
-                if 'var' in dataset.metadata.layers_names:
-                    tmp = anndata_file["var"]
-                    # row names are never used in preview windows
-                    # dataset.metadata.var_names = tmp["index"]
-                    dataset.metadata.var_layers, \
-                        dataset.metadata.var_count, \
-                        dataset.metadata.var_size = _layercountsize(tmp, len(tmp["index"]))
-
-                if 'varm' in dataset.metadata.layers_names:
-                    tmp = anndata_file["varm"]
-                    dataset.metadata.varm_layers, dataset.metadata.varm_count, _ = _layercountsize(tmp)
-
-                if 'uns' in dataset.metadata.layers_names:
-                    tmp = anndata_file["uns"]
-                    dataset.metadata.uns_layers, dataset.metadata.uns_count, _ = _layercountsize(tmp)
-
-                if 'X' in dataset.metadata.layers_names:
-                    # Shape we determine here due to the non-standard representation of 'X' dimensions
-                    shape = anndata_file['X'].attrs.get("shape")
-                    if shape is not None:
-                        dataset.metadata.shape = tuple(shape)
-                    elif hasattr(anndata_file['X'], 'shape'):
-                        dataset.metadata.shape = tuple(anndata_file['X'].shape)
-                    else:
-                        dataset.metadata.shape = (int(dataset.metadata.obs_size), int(dataset.metadata.var_size))
-
-        except Exception as e:
-            log.warning('%s, set_meta Exception: %s', self, e)
+        with h5py.File(dataset.file_name, 'r') as anndata_file:
+            dataset.metadata.title = util.unicodify(anndata_file.attrs.get('title'))
+            dataset.metadata.description = util.unicodify(anndata_file.attrs.get('description'))
+            dataset.metadata.url = util.unicodify(anndata_file.attrs.get('url'))
+            dataset.metadata.doi = util.unicodify(anndata_file.attrs.get('doi'))
+            dataset.creation_date = util.unicodify(anndata_file.attrs.get('creation_date'))
+            dataset.metadata.shape = anndata_file.attrs.get('shape') or dataset.metadata.shape
+            # none of the above appear to work in any dataset tested, but could be useful for future
+            # AnnData datasets
+
+            # all possible keys
+            dataset.metadata.layers_count = len(anndata_file)
+            dataset.metadata.layers_names = list(anndata_file.keys())
+
+            def _layercountsize(tmp, lennames=0):
+                "From TMP and LENNAMES, return layers, their number, and the length of one of the layers (all equal)."
+                if hasattr(tmp, 'dtype'):
+                    layers = [util.unicodify(x) for x in tmp.dtype.names]
+                    count = len(tmp.dtype)
+                    size = int(tmp.size)
+                else:
+                    layers = [util.unicodify(x) for x in list(tmp.keys())]
+                    count = len(layers)
+                    size = lennames
+                return (layers, count, size)
+
+            if 'obs' in dataset.metadata.layers_names:
+                tmp = anndata_file["obs"]
+                dataset.metadata.obs_names = [util.unicodify(x) for x in tmp["index"]]
+                dataset.metadata.obs_layers, \
+                    dataset.metadata.obs_count, \
+                    dataset.metadata.obs_size = _layercountsize(tmp, len(dataset.metadata.obs_names))
+
+            if 'obsm' in dataset.metadata.layers_names:
+                tmp = anndata_file["obsm"]
+                dataset.metadata.obsm_layers, dataset.metadata.obsm_count, _ = _layercountsize(tmp)
+
+            if 'raw.var' in dataset.metadata.layers_names:
+                tmp = anndata_file["raw.var"]
+                # full set of genes would never need to be previewed
+                # dataset.metadata.raw_var_names = tmp["index"]
+                dataset.metadata.raw_var_layers, \
+                    dataset.metadata.raw_var_count, \
+                    dataset.metadata.raw_var_size = _layercountsize(tmp, len(tmp["index"]))
+
+            if 'var' in dataset.metadata.layers_names:
+                tmp = anndata_file["var"]
+                # row names are never used in preview windows
+                # dataset.metadata.var_names = tmp["index"]
+                dataset.metadata.var_layers, \
+                    dataset.metadata.var_count, \
+                    dataset.metadata.var_size = _layercountsize(tmp, len(tmp["index"]))
+
+            if 'varm' in dataset.metadata.layers_names:
+                tmp = anndata_file["varm"]
+                dataset.metadata.varm_layers, dataset.metadata.varm_count, _ = _layercountsize(tmp)
+
+            if 'uns' in dataset.metadata.layers_names:
+                tmp = anndata_file["uns"]
+                dataset.metadata.uns_layers, dataset.metadata.uns_count, _ = _layercountsize(tmp)
+
+            ## Resolving the problematic shape parameter
+            if 'X' in dataset.metadata.layers_names:
+                # Shape we determine here due to the non-standard representation of 'X' dimensions
+                shape = anndata_file['X'].attrs.get("shape")
+                if shape is not None:
+                    dataset.metadata.shape = tuple(shape)
+                elif hasattr(anndata_file['X'], 'shape'):
+                    dataset.metadata.shape = tuple(anndata_file['X'].shape)
+
+            if dataset.metadata.shape is None:
+                dataset.metadata.shape = (int(dataset.metadata.obs_size), int(dataset.metadata.var_size))
 
     def set_peek(self, dataset, is_multi_byte=False):
         if not dataset.dataset.purged:
Binary file not shown.
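For readers skimming the diff, the heart of the fix is the shape fallback at the end of set_meta: prefer 'X'.attrs['shape'], then the shape of the 'X' dataset itself, and only then the obs_size/var_size metadata, whose defaults now become the -1 sentinel (presumably so an unresolved axis shows up as -1 in the peek rather than a misleading 0). Below is a minimal standalone sketch of that order, reading an .h5ad with h5py directly rather than through Galaxy's metadata machinery; the function name and file path are placeholders, not part of this PR.

import h5py

def peek_shape(path, obs_size=-1, var_size=-1):
    # Mirror the fallback order from set_meta above:
    # 1) X.attrs['shape'], 2) the shape of the X dataset, 3) the sentinels.
    with h5py.File(path, 'r') as f:
        if 'X' in f:
            shape = f['X'].attrs.get('shape')
            if shape is not None:
                return tuple(shape)
            if hasattr(f['X'], 'shape'):
                return tuple(f['X'].shape)
        return (int(obs_size), int(var_size))

# Example (placeholder path to the test file added by this PR):
# print(peek_shape('import.loom.krumsiek11.h5ad'))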
