Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AnnData as format #2974

Merged
merged 31 commits into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
3cb9de1
Add AnnData as format
flying-sheep Aug 8, 2022
a36e1c1
Merge branch 'master' into format-anndata
flying-sheep Aug 12, 2022
9cf89fe
Add anndata to extras
flying-sheep Aug 12, 2022
cf6533a
Merge branch 'master' into format-anndata
flying-sheep Aug 16, 2022
5980212
Merge branch 'master' into format-anndata
flying-sheep Aug 18, 2022
e78122b
Add test skeleton
flying-sheep Aug 19, 2022
9f480da
Improve tests a bit
flying-sheep Aug 19, 2022
a1786e9
Simplify test dependencies
flying-sheep Aug 19, 2022
37b6652
Add changelog entry
flying-sheep Aug 19, 2022
ed5ec66
Merge branch 'master' into format-anndata
sir-sigurd Aug 22, 2022
6795474
remove unrelated changes
flying-sheep Sep 9, 2022
299f868
Merge branch 'master' into format-anndata
flying-sheep Sep 9, 2022
bdde81a
Fix linter complaints
flying-sheep Sep 9, 2022
a1c87f0
Support current AnnData versions
flying-sheep Sep 9, 2022
9733c2a
Merge branch 'master' into format-anndata
flying-sheep Sep 9, 2022
cc6f417
Windows support
flying-sheep Sep 9, 2022
1fac547
Simplify FormatHandlers registration
flying-sheep Sep 9, 2022
07421ac
increase coverage
flying-sheep Sep 9, 2022
d9ff0aa
Merge branch 'master' into format-anndata
flying-sheep Sep 12, 2022
9387b92
Merge branch 'master' into format-anndata
sir-sigurd Sep 13, 2022
02e41cb
Use recommended APIs
flying-sheep Sep 15, 2022
96c2a10
Merge branch 'master' into format-anndata
flying-sheep Sep 15, 2022
aa9a8ec
Skip coverage on ImportError fallback code branches
flying-sheep Sep 15, 2022
c22ce4e
last ones
flying-sheep Sep 15, 2022
b8a8ff6
dep phrasing
flying-sheep Feb 9, 2023
ec0e023
Use `tempfile` import
flying-sheep Feb 9, 2023
90b322d
Merge branch 'master' into format-anndata
sir-sigurd Apr 5, 2023
6e65be6
isort
sir-sigurd Apr 5, 2023
5aa44d9
do not use experimental anndata features
sir-sigurd Apr 5, 2023
345cf15
revert unrelated changes
sir-sigurd Apr 6, 2023
ab5c024
cleanup
sir-sigurd Apr 6, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 65 additions & 13 deletions api/python/quilt3/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,15 @@
import copy
import csv
import gzip
import importlib
import io
import json
import sys
import tempfile
import warnings
from abc import ABC, abstractmethod
from collections import defaultdict
from pathlib import Path

try:
from importlib import metadata as importlib_metadata
Expand Down Expand Up @@ -323,6 +326,7 @@ def all_supported_formats(cls):

Python Object Type Serialization Formats
<class 'pandas.core.frame.DataFrame'> [ssv, csv, tsv, parquet]
<class 'anndata.AnnData'> [h5ad]
<class 'numpy.ndarray'> [npy, npz]
<class 'str'> [md, json, rst, txt]
<class 'dict'> [json]
Expand All @@ -333,19 +337,18 @@ def all_supported_formats(cls):
<class 'float'> [json]
<class 'bytes'> [bin]
"""
try:
import numpy as np
except ImportError:
pass
else:
cls.search(np.ndarray) # Force FormatHandlers to register np.ndarray as a supported object type

try:
import pandas as pd
except ImportError:
pass
else:
cls.search(pd.DataFrame) # Force FormatHandlers to register pd.DataFrame as a supported object type
# Force FormatHandlers to register these classes as supported object types
for mod_name, cls_name in [
('numpy', 'ndarray'),
('pandas', 'DataFrame'),
('anndata', 'AnnData'),
]:
try:
mod = importlib.import_module(mod_name)
except ImportError:
pass
else:
cls.search(getattr(mod, cls_name))

type_map = defaultdict(set)
for handler in cls.registered_handlers:
Expand Down Expand Up @@ -1033,6 +1036,55 @@ def deserialize(self, bytes_obj, meta=None, ext=None, **format_opts):
ParquetFormatHandler().register() # latest is preferred


class AnnDataFormatHandler(BaseFormatHandler):
    """Format for AnnData <--> .h5ad

    Format Opts:
        The following options may be used anywhere format opts are accepted,
        or directly in metadata under `{'format': {'opts': {...: ...}}}`.

        compression('gzip', 'lzf', None): applies during serialization only.
            Defaults to 'lzf'.
    """
    name = 'h5ad'
    handled_extensions = ['h5ad']
    opts = ('compression',)
    defaults = dict(
        compression='lzf',
    )

    def handles_type(self, typ: type) -> bool:
        """Report whether this handler accepts objects of type `typ`.

        `anndata` is only consulted when it is already present in
        `sys.modules`; otherwise this returns False without importing it.
        When it is present, `AnnData` is added to `self.handled_types`
        before deferring to the base class check.
        """
        # don't load module unless we actually have to use it.
        if 'anndata' not in sys.modules:
            return False
        import anndata as ad
        self.handled_types.add(ad.AnnData)
        return super().handles_type(typ)

    def serialize(self, obj, meta=None, ext=None, **format_opts):
        """Serialize `obj` to h5ad bytes.

        Args:
            obj: object to serialize; its `write()` method is called with a
                file path and the resolved format options.
            meta: metadata that may carry format options.
            ext: unused by this handler.
            **format_opts: format options; override the class defaults.

        Returns:
            A `(bytes, meta)` tuple: the file contents and the metadata
            updated with the options that were actually used.
        """
        opts = self.get_opts(meta, format_opts)
        opts_with_defaults = copy.deepcopy(self.defaults)
        opts_with_defaults.update(opts)

        # `obj.write()` is given a filesystem path, so go through a
        # temporary file and read the bytes back before it is removed.
        with tempfile.TemporaryDirectory() as td:
            path = Path(td) / 'data.h5ad'
            obj.write(path, **opts_with_defaults)
            data = path.read_bytes()

        return data, self._update_meta(meta, additions=opts_with_defaults)

    def deserialize(self, bytes_obj, meta=None, ext=None, **format_opts):
        """Deserialize h5ad bytes via `anndata.read_h5ad`.

        Args:
            bytes_obj: raw contents of an h5ad file.
            meta: unused by this handler.
            ext: unused by this handler.
            **format_opts: unused by this handler.

        Returns:
            The object returned by `anndata.read_h5ad`.

        Raises:
            QuiltException: if `anndata` cannot be imported.
        """
        try:
            import anndata as ad
        except ImportError as err:
            # Chain the original error so the root cause of the failed
            # import stays visible in the traceback.
            raise QuiltException("Please install quilt3[anndata]") from err

        buf = io.BytesIO(bytes_obj)
        return ad.read_h5ad(buf)


# Register a handler instance when this module is imported, as is done for the other handlers.
AnnDataFormatHandler().register()


class CompressionRegistry:
"""A collection for organizing `CompressionHandler` objects."""
registered_handlers = []
Expand Down
5 changes: 2 additions & 3 deletions api/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,9 @@ def run(self):
'pandas>=0.19.2',
'pyarrow>=0.14.1', # as of 7/5/19: linux/circleci bugs on 0.14.0
],
'anndata': ['anndata>=0.8.0'],
'tests': [
'numpy>=1.14.0', # required by pandas, but missing from its dependencies.
'pandas>=0.19.2',
'pyarrow>=0.14.1', # as of 7/5/19: linux/circleci bugs on 0.14.0
'quilt3[pyarrow,anndata]',
'pytest==6.*',
'pytest-cov',
'coverage==6.4',
Expand Down
Binary file added api/python/tests/data/test.h5ad
Binary file not shown.
32 changes: 32 additions & 0 deletions api/python/tests/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
import numpy as np
import pandas as pd
import pytest
from anndata import AnnData

from quilt3.formats import FormatRegistry
from quilt3.util import QuiltException

# Constants
data_dir = pathlib.Path(__file__).parent / 'data'


# Code
Expand Down Expand Up @@ -146,6 +148,36 @@ def test_formats_csv_roundtrip():
assert df1.equals(df2)


def test_formats_anndata_roundtrip():
    """Deserialize the h5ad fixture, re-serialize it, and check the reloaded copy matches."""
    meta = {'format': {'name': 'h5ad'}}
    ad_file = data_dir / 'test.h5ad'
    ad: AnnData = FormatRegistry.deserialize(ad_file.read_bytes(), meta)
    assert isinstance(ad, AnnData)

    # Named `data` rather than `bin` to avoid shadowing the builtin.
    data, format_meta = FormatRegistry.serialize(ad, meta)
    meta2 = {**meta, **format_meta}
    ad2: AnnData = FormatRegistry.deserialize(data, meta2)
    assert isinstance(ad2, AnnData)

    # These comparisons only return booleans; without `assert` a lossy
    # round trip would go unnoticed.
    assert np.allclose(ad.X, ad2.X)
    assert ad.obs.equals(ad2.obs)
    assert ad.var.equals(ad2.var)


def test_all_supported_formats():
    """Check the type -> format-name mapping reported by the registry."""
    # Plain Python types that are only handled by the JSON format.
    json_only_types = (tuple, type(None), dict, int, list, float)

    expected = {typ: {'json'} for typ in json_only_types}
    expected[AnnData] = {'h5ad'}
    expected[pd.DataFrame] = {'csv', 'parquet', 'ssv', 'tsv'}
    expected[np.ndarray] = {'npy', 'npz'}
    expected[str] = {'json', 'md', 'rst', 'txt'}
    expected[bytes] = {'bin'}

    actual = FormatRegistry.all_supported_formats()
    assert actual == expected


def test_formats_search_fail_notfound():
# a search that finds nothing should raise with an explanation.
class Foo:
Expand Down
4 changes: 4 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ Entries inside each section should be ordered by type:

## Catalog, Lambdas
!-->
# unreleased - YYYY-MM-DD
## Python API
* [Added] Support [AnnData](https://anndata.readthedocs.io/en/latest/) format ([#2974](https://github.com/quiltdata/quilt/pull/2974))

# 5.2.1 - 2023-04-05
## Python API
* [Fixed] Fixed CSV serialization with pandas 2 ([#3395](https://github.com/quiltdata/quilt/pull/3395))
Expand Down