diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a8bcfbe0..4b5c7655 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -114,6 +114,13 @@ jobs: python -c "import h5py,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); f_u=h5py.File('ci/test2.dm.h5','r'); dm_u=skbio.stats.distance.DistanceMatrix(f_u['matrix'][:,:],f_u['order'][:]); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" python -c "import h5py; f_u=h5py.File('ci/test2.dm.h5','r'); print(f_u.keys()); assert len(f_u['stat_methods'][:]) == 1" python -c "import h5py; f_u=h5py.File('ci/test3.dm.h5','r'); print(f_u.keys()); assert len(f_u['pcoa_eigvals'][:]) == 2" + # repeat using unifrac's h5 interfaces + python -c "import unifrac; dm_u=unifrac.h5unifrac('ci/test.dm.h5'); dm_l=unifrac.h5unifrac_all('ci/test.dm.h5')" + python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac('ci/test.dm.h5'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" + python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac_all('ci/test.dm.h5')[0]; t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" + python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac('ci/test2.dm.h5'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" + python -c "import unifrac; st_l=unifrac.h5permanova_dict('ci/test2.dm.h5'); assert len(st_l) == 1" + python -c "import unifrac; pc=unifrac.h5pcoa('ci/test3.dm.h5'); print(pc); assert len(pc.eigvals) == 2" if [[ "$(uname -s)" == "Linux" ]]; then MD5=md5sum diff --git a/README.md b/README.md index 46685b9c..926ca2ca 100644 --- a/README.md +++ b/README.md @@ -140,13 +140,14 @@ The library can be accessed directly from within Python. If operating in this mo >>> import unifrac >>> dir(unifrac) ['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', - '__package__', '__path__', '__spec__', '__version__', '_api', '_meta', '_methods', 'set_random_seed', 'faith_pd', - 'generalized', 'generalized_fp32', 'generalized_fp32_to_file', 'generalized_fp64', 'generalized_fp64_to_file', 'generalized_to_file', - 'h5pcoa', 'h5pcoa_all', 'h5permanova', 'h5permanova_dict', 'h5unifrac', 'meta', 'pkg_resources', 'ssu', 'ssu_fast', 'ssu_inmem', 'ssu_to_file', 'ssu_to_file_v2', - 'unweighted', 'unweighted_fp32', 'unweighted_fp32_to_file', 'unweighted_fp64', 'unweighted_fp64_to_file', 'unweighted_to_file', - 'weighted_normalized', 'weighted_normalized_fp32', 'weighted_normalized_fp32_to_file', 'weighted_normalized_fp64', 'weighted_normalized_fp64_to_file', - 'weighted_normalized_to_file', 'weighted_unnormalized', 'weighted_unnormalized_fp32', 'weighted_unnormalized_fp32_to_file', 'weighted_unnormalized_fp64', - 'weighted_unnormalized_fp64_to_file', 'weighted_unnormalized_to_file'] + '__package__', '__path__', '__spec__', '__version__', '_api', '_meta', '_methods', + 'faith_pd', 'generalized', 'generalized_fp32', 'generalized_fp32_to_file', 'generalized_fp64', 'generalized_fp64_to_file', 'generalized_to_file', + 'h5pcoa', 'h5pcoa_all', 'h5permanova', 'h5permanova_dict', 'h5unifrac', 'h5unifrac_all', 'meta', 'pkg_resources', + 'set_random_seed', 'ssu', 'ssu_fast', 'ssu_inmem', 'ssu_to_file', 'ssu_to_file_v2', 'unweighted', 'unweighted_fp32', + 'unweighted_fp32_to_file', 'unweighted_fp64', 'unweighted_fp64_to_file', 'unweighted_to_file', 'weighted_normalized', + 'weighted_normalized_fp32', 'weighted_normalized_fp32_to_file', 'weighted_normalized_fp64', 'weighted_normalized_fp64_to_file', + 'weighted_normalized_to_file', 'weighted_unnormalized', 'weighted_unnormalized_fp32', 'weighted_unnormalized_fp32_to_file', + 'weighted_unnormalized_fp64', 'weighted_unnormalized_fp64_to_file', 'weighted_unnormalized_to_file'] >>> print(unifrac.unweighted.__doc__) Compute Unweighted UniFrac diff --git a/unifrac/__init__.py b/unifrac/__init__.py index 51258246..69e65a9f 100644 --- a/unifrac/__init__.py +++ b/unifrac/__init__.py @@ -33,7 +33,7 @@ weighted_unnormalized_fp32_to_file, generalized_fp32_to_file, meta, - h5unifrac, + h5unifrac, h5unifrac_all, h5pcoa, h5pcoa_all, h5permanova, h5permanova_dict) from unifrac._api import ssu, ssu_fast, faith_pd, set_random_seed @@ -57,7 +57,7 @@ 'weighted_normalized_fp32_to_file', 'weighted_unnormalized_fp32_to_file', 'generalized_fp32_to_file', - 'h5unifrac', 'h5pcoa', 'h5pcoa_all', + 'h5unifrac', 'h5unifrac_all', 'h5pcoa', 'h5pcoa_all', 'h5permanova', 'h5permanova_dict', 'ssu', 'ssu_fast', 'faith_pd', 'ssu_to_file', 'ssu_to_file_v2', diff --git a/unifrac/_methods.py b/unifrac/_methods.py index a3e55499..9c1e616e 100644 --- a/unifrac/_methods.py +++ b/unifrac/_methods.py @@ -2508,7 +2508,7 @@ def generalized_fp32_to_file(table: str, def h5unifrac(h5file: str) -> skbio.DistanceMatrix: - """Read UniFrac from a hdf5 file + """Read UniFrac distance matrix from a hdf5 file Parameters ---------- @@ -2538,13 +2538,68 @@ def h5unifrac(h5file: str) -> skbio.DistanceMatrix: """ with h5py.File(h5file, "r") as f_u: - dm = skbio.DistanceMatrix( + if 'matrix:0' in f_u.keys(): + # multi format + dm = skbio.DistanceMatrix( + f_u['matrix:0'][:, :], + [c.decode('ascii') for c in f_u['order'][:]]) + else: + # single format + dm = skbio.DistanceMatrix( f_u['matrix'][:, :], [c.decode('ascii') for c in f_u['order'][:]]) return dm +def h5unifrac_all(h5file: str) -> skbio.DistanceMatrix: + """Read all UniFrac distance matrices from a hdf5 file + + Parameters + ---------- + h5file : str + A filepath to a hdf5 file. + + Returns + ------- + tuple(skbio.DistanceMatrix) + The distance matrices. + + Raises + ------ + OSError + If the hdf5 file is not found + KeyError + If the hdf5 does not have the necessary fields + + References + ---------- + .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for + comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 + (2005). + .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a + powerful beta diversity measure for comparing communities based on + phylogeny. BMC Bioinformatics 12:118 (2011). + """ + + with h5py.File(h5file, "r") as f_u: + order = [c.decode('ascii') for c in f_u['order'][:]] + if 'matrix' in f_u.keys(): + # single format + dms = [skbio.DistanceMatrix( + f_u['matrix'][:, :], order)] + else: + # multi format + dms = [] + i = 0 + while 'matrix:%i' % i in f_u.keys(): + dms.append(skbio.DistanceMatrix( + f_u['matrix:%i' % i][:, :], order)) + i = i + 1 + + return dms + + def _build_pcoa(f_u, long_method_name, order_index, eigval_key, samples_key, prop_key): axis_labels = ["PC%d" % i for i in @@ -2597,9 +2652,16 @@ def h5pcoa(h5file: str) -> skbio.OrdinationResults: order_index = [c.decode('ascii') for c in f_u['order'][:]] - pc = _build_pcoa(f_u, long_method_name, order_index, - 'pcoa_eigvals', 'pcoa_samples', - 'pcoa_proportion_explained') + if 'pcoa_eigvals:0' in f_u.keys(): + # multi interface + pc = _build_pcoa(f_u, long_method_name, order_index, + 'pcoa_eigvals:0', 'pcoa_samples:0', + 'pcoa_proportion_explained:0') + else: + # single interface + pc = _build_pcoa(f_u, long_method_name, order_index, + 'pcoa_eigvals', 'pcoa_samples', + 'pcoa_proportion_explained') return pc