Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AnnData as format #2974

Merged
merged 31 commits into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
3cb9de1
Add AnnData as format
flying-sheep Aug 8, 2022
a36e1c1
Merge branch 'master' into format-anndata
flying-sheep Aug 12, 2022
9cf89fe
Add anndata to extras
flying-sheep Aug 12, 2022
cf6533a
Merge branch 'master' into format-anndata
flying-sheep Aug 16, 2022
5980212
Merge branch 'master' into format-anndata
flying-sheep Aug 18, 2022
e78122b
Add test skeleton
flying-sheep Aug 19, 2022
9f480da
Improve tests a bit
flying-sheep Aug 19, 2022
a1786e9
Simplify test dependencies
flying-sheep Aug 19, 2022
37b6652
Add changelog entry
flying-sheep Aug 19, 2022
ed5ec66
Merge branch 'master' into format-anndata
sir-sigurd Aug 22, 2022
6795474
remove unrelated changes
flying-sheep Sep 9, 2022
299f868
Merge branch 'master' into format-anndata
flying-sheep Sep 9, 2022
bdde81a
Fix linter complaints
flying-sheep Sep 9, 2022
a1c87f0
Support current AnnData versions
flying-sheep Sep 9, 2022
9733c2a
Merge branch 'master' into format-anndata
flying-sheep Sep 9, 2022
cc6f417
Windows support
flying-sheep Sep 9, 2022
1fac547
Simplify FormatHandlers registration
flying-sheep Sep 9, 2022
07421ac
increase coverage
flying-sheep Sep 9, 2022
d9ff0aa
Merge branch 'master' into format-anndata
flying-sheep Sep 12, 2022
9387b92
Merge branch 'master' into format-anndata
sir-sigurd Sep 13, 2022
02e41cb
Use recommended APIs
flying-sheep Sep 15, 2022
96c2a10
Merge branch 'master' into format-anndata
flying-sheep Sep 15, 2022
aa9a8ec
Skip coverage on ImportError fallback code branches
flying-sheep Sep 15, 2022
c22ce4e
last ones
flying-sheep Sep 15, 2022
b8a8ff6
dep phrasing
flying-sheep Feb 9, 2023
ec0e023
Use `tempfile` import
flying-sheep Feb 9, 2023
90b322d
Merge branch 'master' into format-anndata
sir-sigurd Apr 5, 2023
6e65be6
isort
sir-sigurd Apr 5, 2023
5aa44d9
do not use experimental anndata features
sir-sigurd Apr 5, 2023
345cf15
revert unrelated changes
sir-sigurd Apr 6, 2023
ab5c024
cleanup
sir-sigurd Apr 6, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions api/python/quilt3/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ def all_supported_formats(cls):

Python Object Type Serialization Formats
<class 'pandas.core.frame.DataFrame'> [ssv, csv, tsv, parquet]
<class 'anndata.AnnData'> [h5ad]
<class 'numpy.ndarray'> [npy, npz]
<class 'str'> [md, json, rst, txt]
<class 'dict'> [json]
Expand All @@ -342,6 +343,13 @@ def all_supported_formats(cls):
else:
cls.search(pd.DataFrame) # Force FormatHandlers to register pd.DataFrame as a supported object type

try:
import anndata as ad
except ImportError:
pass
else:
cls.search(ad.AnnData) # Force FormatHandlers to register ad.AnnData as a supported object type

type_map = defaultdict(set)
for handler in cls.registered_handlers:
for t in handler.handled_types:
Expand Down Expand Up @@ -1024,6 +1032,56 @@ def deserialize(self, bytes_obj, meta=None, ext=None, **format_opts):
ParquetFormatHandler().register() # latest is preferred


# noinspection PyPackageRequirements
class AnnDataFormatHandler(BaseFormatHandler):
    """Format for AnnData <--> .h5ad

    Format Opts:
        The following options may be used anywhere format opts are accepted,
        or directly in metadata under `{'format': {'opts': {...: ...}}}`.

        compression('gzip', 'lzf', None): applies during serialization only.

    """
    name = 'h5ad'
    handled_extensions = ['h5ad']
    opts = ('compression',)
    defaults = dict(
        compression='lzf',
    )

    def handles_type(self, typ: type) -> bool:
        """Report whether `typ` is handled, registering AnnData lazily.

        Returns False without importing anything when `anndata` has not
        already been imported by the caller.
        """
        # Don't load the module unless we actually have to use it: an object
        # can only be an AnnData instance if `anndata` is already imported.
        if 'anndata' not in sys.modules:
            return False
        import anndata as ad
        self.handled_types.add(ad.AnnData)
        return super().handles_type(typ)

    def serialize(self, obj, meta=None, ext=None, **format_opts):
        """Serialize `obj` to .h5ad bytes.

        Args:
            obj: object to serialize; its `write(path, **opts)` method is
                used to produce the file.
            meta: metadata passed to `get_opts()` and `_update_meta()`.
            ext: accepted for interface compatibility; not used here.
            **format_opts: per-call format options, layered over `defaults`.

        Returns:
            A `(bytes, meta)` tuple, where `meta` is the result of
            `_update_meta()` with the effective options added.
        """
        import pathlib
        import tempfile

        opts = self.get_opts(meta, format_opts)
        opts_with_defaults = copy.deepcopy(self.defaults)
        opts_with_defaults.update(opts)

        # AnnData.write() takes a filesystem path rather than a file object,
        # so write into a throwaway directory and read the bytes back.
        # A directory (not NamedTemporaryFile) is used so the file is not
        # held open while it is being written.
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = pathlib.Path(tmp_dir) / 'data.h5ad'
            obj.write(path, **opts_with_defaults)
            data = path.read_bytes()

        return data, self._update_meta(meta, additions=opts_with_defaults)

    def deserialize(self, bytes_obj, meta=None, ext=None, **format_opts):
        """Deserialize .h5ad bytes via `anndata.read_h5ad()`.

        Args:
            bytes_obj: raw contents of an .h5ad file.
            meta, ext, **format_opts: accepted for interface compatibility;
                not used here.

        Raises:
            QuiltException: if `anndata` is not installed.
        """
        try:
            import anndata as ad
        except ImportError as err:
            # Keep the original ImportError attached as the cause.
            raise QuiltException("Please install anndata") from err

        buf = io.BytesIO(bytes_obj)
        return ad.read_h5ad(buf)


AnnDataFormatHandler().register()


class CompressionRegistry:
"""A collection for organizing `CompressionHandler` objects."""
registered_handlers = []
Expand Down
5 changes: 2 additions & 3 deletions api/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,9 @@ def run(self):
'pandas>=0.19.2',
'pyarrow>=0.14.1', # as of 7/5/19: linux/circleci bugs on 0.14.0
],
'anndata': ['anndata>=0.8.0'],
'tests': [
'numpy>=1.14.0', # required by pandas, but missing from its dependencies.
'pandas>=0.19.2',
'pyarrow>=0.14.1', # as of 7/5/19: linux/circleci bugs on 0.14.0
'quilt3[pyarrow,anndata]',
'pytest==6.*',
'pytest-cov',
'coverage==6.4',
Expand Down
Binary file added api/python/tests/data/test.h5ad
Binary file not shown.
54 changes: 30 additions & 24 deletions api/python/tests/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,33 @@
import numpy as np
import pandas as pd
import pytest
from anndata import AnnData

from quilt3.formats import FormatRegistry
from quilt3.util import QuiltException

# Constants

data_dir = pathlib.Path(__file__).parent / 'data'


# Code
def test_buggy_parquet():
@pytest.mark.parametrize('parquet_handler', FormatRegistry.for_format('parquet'))
def test_buggy_parquet(parquet_handler):
"""
Test that Quilt avoids crashing on bad Pandas metadata from
old pyarrow libraries.
"""
path = pathlib.Path(__file__).parent
for parquet_handler in FormatRegistry.for_format('parquet'):
with open(path / 'data' / 'buggy_parquet.parquet', 'rb') as bad_parq:
# Make sure this doesn't crash.
parquet_handler.deserialize(bad_parq.read())
path = data_dir / 'buggy_parquet.parquet'
data = path.read_bytes()
# Make sure this doesn't crash.
parquet_handler.deserialize(data)


def test_formats_for_obj():
arr = np.ndarray(3)

fmt = FormatRegistry.for_obj(arr)[0]
[fmt] = FormatRegistry.for_obj(arr)
flying-sheep marked this conversation as resolved.
Show resolved Hide resolved

assert 'npz' in fmt.handled_extensions
assert FormatRegistry.for_ext('npy')[0] is fmt
Expand All @@ -46,17 +49,17 @@ def test_formats_for_ext():


def test_formats_for_meta():
bytes_fmt = FormatRegistry.for_meta({'target': 'bytes'})[0]
json_fmt = FormatRegistry.for_meta({'target': 'json'})[0]
[bytes_fmt] = FormatRegistry.for_meta({'target': 'bytes'})
[json_fmt] = FormatRegistry.for_meta({'target': 'json'})

some_bytes = b'["phlipper", "piglet"]'
assert bytes_fmt.serialize(some_bytes)[0] == some_bytes
assert json_fmt.deserialize(some_bytes) == ['phlipper', 'piglet']


def test_formats_for_format():
bytes_fmt = FormatRegistry.for_format('bytes')[0]
json_fmt = FormatRegistry.for_format('json')[0]
[bytes_fmt] = FormatRegistry.for_format('bytes')
[json_fmt] = FormatRegistry.for_format('json')

some_bytes = b'["phlipper", "piglet"]'
assert bytes_fmt.serialize(some_bytes)[0] == some_bytes
Expand Down Expand Up @@ -91,7 +94,7 @@ def test_formats_serdes():


def test_formats_csv_read():
csv_file = pathlib.Path(__file__).parent / 'data' / 'csv.csv'
csv_file = data_dir / 'csv.csv'

meta = {'format': {'name': 'csv'}}
expected_bytes = b'a,b,c,d\n1,2,3,4\n5,6,7,8\n'
Expand Down Expand Up @@ -146,20 +149,23 @@ def test_formats_csv_roundtrip():
assert df1.equals(df2)


def test_formats_search_fail_notfound():
# a search that finds nothing should raise with an explanation.
class Foo:
pass
def test_formats_anndata():
    """Deserializing the test.h5ad fixture by format name yields an AnnData."""
    h5ad_bytes = (data_dir / 'test.h5ad').read_bytes()
    adata = FormatRegistry.deserialize(h5ad_bytes, {'format': {'name': 'h5ad'}})

    assert isinstance(adata, AnnData)

bad_kwargs = [
dict(obj_type=Foo, meta=None, ext=None),
dict(obj_type=None, meta={}, ext=None),
dict(obj_type=None, meta=None, ext='.fizz'),
]

for args in bad_kwargs:
with pytest.raises(QuiltException):
FormatRegistry.search(**args)
@pytest.mark.parametrize('args', [
    # an object type that no handler knows about
    {'obj_type': type('Foo', (), {}), 'meta': None, 'ext': None},
    # empty metadata
    {'obj_type': None, 'meta': {}, 'ext': None},
    # an extension that is expected to match no handler
    {'obj_type': None, 'meta': None, 'ext': '.fizz'},
])
def test_formats_search_fail_notfound(args):
    """A search that finds nothing should raise with an explanation."""
    with pytest.raises(QuiltException):
        FormatRegistry.search(**args)


def test_formats_search_order():
Expand Down
1 change: 1 addition & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
## Catalog, Lambdas
!-->
## Python API
* [Added] Support [AnnData](https://anndata.readthedocs.io/en/latest/) format ([#2974](https://github.com/quiltdata/quilt/pull/2974))
* [Fixed] Fix check to determine if a file is a tempfile in Windows with Python 3.8+ ([#2900](https://github.com/quiltdata/quilt/pull/2900))
* [Changed] Disable upload optimization for objects with SSE-KMS ([#2790](https://github.com/quiltdata/quilt/pull/2790))

Expand Down