Skip to content

Commit

Permalink
Merge pull request #153 from sfiligoi/h5unifrac_multi
Browse files Browse the repository at this point in the history
Add h5unifrac_all method
  • Loading branch information
sfiligoi authored Apr 24, 2023
2 parents 9fc61fc + 5898295 commit 6a9cca8
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 14 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ jobs:
python -c "import h5py,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); f_u=h5py.File('ci/test2.dm.h5','r'); dm_u=skbio.stats.distance.DistanceMatrix(f_u['matrix'][:,:],f_u['order'][:]); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1"
python -c "import h5py; f_u=h5py.File('ci/test2.dm.h5','r'); print(f_u.keys()); assert len(f_u['stat_methods'][:]) == 1"
python -c "import h5py; f_u=h5py.File('ci/test3.dm.h5','r'); print(f_u.keys()); assert len(f_u['pcoa_eigvals'][:]) == 2"
# repeat using unifrac's h5 interfaces
python -c "import unifrac; dm_u=unifrac.h5unifrac('ci/test.dm.h5'); dm_l=unifrac.h5unifrac_all('ci/test.dm.h5')"
python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac('ci/test.dm.h5'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1"
python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac_all('ci/test.dm.h5')[0]; t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1"
python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac('ci/test2.dm.h5'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1"
python -c "import unifrac; st_l=unifrac.h5permanova_dict('ci/test2.dm.h5'); assert len(st_l) == 1"
python -c "import unifrac; pc=unifrac.h5pcoa('ci/test3.dm.h5'); print(pc); assert len(pc.eigvals) == 2"
if [[ "$(uname -s)" == "Linux" ]];
then
MD5=md5sum
Expand Down
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,14 @@ The library can be accessed directly from within Python. If operating in this mo
>>> import unifrac
>>> dir(unifrac)
['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__',
'__package__', '__path__', '__spec__', '__version__', '_api', '_meta', '_methods', 'set_random_seed', 'faith_pd',
'generalized', 'generalized_fp32', 'generalized_fp32_to_file', 'generalized_fp64', 'generalized_fp64_to_file', 'generalized_to_file',
'h5pcoa', 'h5pcoa_all', 'h5permanova', 'h5permanova_dict', 'h5unifrac', 'meta', 'pkg_resources', 'ssu', 'ssu_fast', 'ssu_inmem', 'ssu_to_file', 'ssu_to_file_v2',
'unweighted', 'unweighted_fp32', 'unweighted_fp32_to_file', 'unweighted_fp64', 'unweighted_fp64_to_file', 'unweighted_to_file',
'weighted_normalized', 'weighted_normalized_fp32', 'weighted_normalized_fp32_to_file', 'weighted_normalized_fp64', 'weighted_normalized_fp64_to_file',
'weighted_normalized_to_file', 'weighted_unnormalized', 'weighted_unnormalized_fp32', 'weighted_unnormalized_fp32_to_file', 'weighted_unnormalized_fp64',
'weighted_unnormalized_fp64_to_file', 'weighted_unnormalized_to_file']
'__package__', '__path__', '__spec__', '__version__', '_api', '_meta', '_methods',
'faith_pd', 'generalized', 'generalized_fp32', 'generalized_fp32_to_file', 'generalized_fp64', 'generalized_fp64_to_file', 'generalized_to_file',
'h5pcoa', 'h5pcoa_all', 'h5permanova', 'h5permanova_dict', 'h5unifrac', 'h5unifrac_all', 'meta', 'pkg_resources',
'set_random_seed', 'ssu', 'ssu_fast', 'ssu_inmem', 'ssu_to_file', 'ssu_to_file_v2', 'unweighted', 'unweighted_fp32',
'unweighted_fp32_to_file', 'unweighted_fp64', 'unweighted_fp64_to_file', 'unweighted_to_file', 'weighted_normalized',
'weighted_normalized_fp32', 'weighted_normalized_fp32_to_file', 'weighted_normalized_fp64', 'weighted_normalized_fp64_to_file',
'weighted_normalized_to_file', 'weighted_unnormalized', 'weighted_unnormalized_fp32', 'weighted_unnormalized_fp32_to_file',
'weighted_unnormalized_fp64', 'weighted_unnormalized_fp64_to_file', 'weighted_unnormalized_to_file']
>>> print(unifrac.unweighted.__doc__)
Compute Unweighted UniFrac

Expand Down
4 changes: 2 additions & 2 deletions unifrac/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
weighted_unnormalized_fp32_to_file,
generalized_fp32_to_file,
meta,
h5unifrac,
h5unifrac, h5unifrac_all,
h5pcoa, h5pcoa_all,
h5permanova, h5permanova_dict)
from unifrac._api import ssu, ssu_fast, faith_pd, set_random_seed
Expand All @@ -57,7 +57,7 @@
'weighted_normalized_fp32_to_file',
'weighted_unnormalized_fp32_to_file',
'generalized_fp32_to_file',
'h5unifrac', 'h5pcoa', 'h5pcoa_all',
'h5unifrac', 'h5unifrac_all', 'h5pcoa', 'h5pcoa_all',
'h5permanova', 'h5permanova_dict',
'ssu', 'ssu_fast', 'faith_pd',
'ssu_to_file', 'ssu_to_file_v2',
Expand Down
72 changes: 67 additions & 5 deletions unifrac/_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -2508,7 +2508,7 @@ def generalized_fp32_to_file(table: str,


def h5unifrac(h5file: str) -> skbio.DistanceMatrix:
"""Read UniFrac from a hdf5 file
"""Read UniFrac distance matrix from a hdf5 file
Parameters
----------
Expand Down Expand Up @@ -2538,13 +2538,68 @@ def h5unifrac(h5file: str) -> skbio.DistanceMatrix:
"""

with h5py.File(h5file, "r") as f_u:
dm = skbio.DistanceMatrix(
if 'matrix:0' in f_u.keys():
# multi format
dm = skbio.DistanceMatrix(
f_u['matrix:0'][:, :],
[c.decode('ascii') for c in f_u['order'][:]])
else:
# single format
dm = skbio.DistanceMatrix(
f_u['matrix'][:, :],
[c.decode('ascii') for c in f_u['order'][:]])

return dm


def h5unifrac_all(h5file: str) -> skbio.DistanceMatrix:
"""Read all UniFrac distance matrices from a hdf5 file
Parameters
----------
h5file : str
A filepath to a hdf5 file.
Returns
-------
tuple(skbio.DistanceMatrix)
The distance matrices.
Raises
------
OSError
If the hdf5 file is not found
KeyError
If the hdf5 does not have the necessary fields
References
----------
.. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for
comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235
(2005).
.. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a
powerful beta diversity measure for comparing communities based on
phylogeny. BMC Bioinformatics 12:118 (2011).
"""

with h5py.File(h5file, "r") as f_u:
order = [c.decode('ascii') for c in f_u['order'][:]]
if 'matrix' in f_u.keys():
# single format
dms = [skbio.DistanceMatrix(
f_u['matrix'][:, :], order)]
else:
# multi format
dms = []
i = 0
while 'matrix:%i' % i in f_u.keys():
dms.append(skbio.DistanceMatrix(
f_u['matrix:%i' % i][:, :], order))
i = i + 1

return dms


def _build_pcoa(f_u, long_method_name, order_index,
eigval_key, samples_key, prop_key):
axis_labels = ["PC%d" % i for i in
Expand Down Expand Up @@ -2597,9 +2652,16 @@ def h5pcoa(h5file: str) -> skbio.OrdinationResults:
order_index = [c.decode('ascii')
for c in f_u['order'][:]]

pc = _build_pcoa(f_u, long_method_name, order_index,
'pcoa_eigvals', 'pcoa_samples',
'pcoa_proportion_explained')
if 'pcoa_eigvals:0' in f_u.keys():
# multi interface
pc = _build_pcoa(f_u, long_method_name, order_index,
'pcoa_eigvals:0', 'pcoa_samples:0',
'pcoa_proportion_explained:0')
else:
# single interface
pc = _build_pcoa(f_u, long_method_name, order_index,
'pcoa_eigvals', 'pcoa_samples',
'pcoa_proportion_explained')

return pc

Expand Down

0 comments on commit 6a9cca8

Please sign in to comment.