Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for hdf5_nodist #27

Merged
merged 3 commits into from
Jan 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,39 @@ jobs:
time ssu -f -m unweighted_fp64 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_fp64 -o t1.h5
./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
ls -l t1.h5
rm -f t1.h5
# hdf5 without distance matrix, just PCoA
echo "hdf5_nodist"
time ssu -f -m unweighted_fp64 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_nodist -o t1.h5
./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
ls -l t1.h5
rm -f t1.h5
time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --pcoa 4 -r hdf5_nodist -o t1.h5
./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
ls -l t1.h5
rm -f t1.h5
# partials
ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode partial-report --n-partials 2
time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode partial --start 0 --stop 125 -o t1.partial.1
time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode partial --start 125 --stop 250 -o t1.partial.2
ls -l t1.partial*
ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --mode check-partial --partial-pattern 't1.partial.*'
time ssu -f -m unweighted_fp64 -i test500.biom -t test500.tre --pcoa 4 --mode merge-partial --partial-pattern 't1.partial.*' -r hdf5_fp64 -o t1.h5
./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
ls -l t1.h5
rm -f t1.h5
time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --pcoa 4 --mode merge-partial --partial-pattern 't1.partial.*' -r hdf5_fp32 -o t1.h5
./compare_unifrac_matrix.py test500.unweighted_fp32.f.h5 t1.h5 1.e-5
./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
ls -l t1.h5
rm -f t1.h5
time ssu -f -m unweighted_fp32 -i test500.biom -t test500.tre --pcoa 4 --mode merge-partial --partial-pattern 't1.partial.*' -r hdf5_nodist -o t1.h5
./compare_unifrac_pcoa.py test500.unweighted_fp32.f.h5 t1.h5 3 0.1
ls -l t1.h5
rm -f t1.h5
rm -f t1.partial.*
popd
- name: Sanity checks
shell: bash -l {0}
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ The methods can be used directly through the command line after install:
hdf5 : HDF5 format. May be fp32 or fp64, depending on method.
hdf5_fp32 : HDF5 format, using fp32 precision.
hdf5_fp64 : HDF5 format, using fp64 precision.
hdf5_nodist : HDF5 format, no distance matrix, just PCoA.
--pcoa [OPTIONAL] Number of PCoA dimensions to compute (default: 10, do not compute if 0)
--diskbuf [OPTIONAL] Use a disk buffer to reduce memory footprint. Provide path to a fast partition (ideally NVMe).
-n [OPTIONAL] DEPRECATED, no-op.
Expand Down
39 changes: 23 additions & 16 deletions src/api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,12 +188,18 @@ inline compute_status is_fp64_method(const std::string &method_string, bool &fp6
}


inline compute_status is_fp64(const std::string &method_string, const std::string &format_string, bool &fp64) {
inline compute_status is_fp64(const std::string &method_string, const std::string &format_string, bool &fp64, bool &save_dist) {
if (format_string == "hdf5_fp32") {
fp64 = false;
save_dist = true;
} else if (format_string == "hdf5_fp64") {
fp64 = true;
save_dist = true;
} else if (format_string == "hdf5") {
save_dist = true;
return is_fp64_method(method_string, fp64);
} else if (format_string == "hdf5_nodist") {
save_dist = false;
return is_fp64_method(method_string, fp64);
} else {
return unknown_method;
Expand Down Expand Up @@ -690,7 +696,8 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen
unsigned int pcoa_dims, const char *mmap_dir)
{
bool fp64;
compute_status rc = is_fp64(unifrac_method, format, fp64);
bool save_dist;
compute_status rc = is_fp64(unifrac_method, format, fp64, save_dist);

if (rc==okay) {
if (fp64) {
Expand All @@ -702,7 +709,7 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen

if (rc==okay) {
// we have no alternative to hdf5 right now
IOStatus iostatus = write_mat_from_matrix_hdf5(out_filename, result, pcoa_dims);
IOStatus iostatus = write_mat_from_matrix_hdf5_fp64(out_filename, result, pcoa_dims, save_dist);
destroy_mat_full_fp64(&result);

if (iostatus!=write_okay) rc=output_error;
Expand All @@ -716,7 +723,7 @@ compute_status unifrac_to_file(const char* biom_filename, const char* tree_filen

if (rc==okay) {
// we have no alternative to hdf5 right now
IOStatus iostatus = write_mat_from_matrix_hdf5_fp32(out_filename, result, pcoa_dims);
IOStatus iostatus = write_mat_from_matrix_hdf5_fp32(out_filename, result, pcoa_dims, save_dist);
destroy_mat_full_fp32(&result);

if (iostatus!=write_okay) rc=output_error;
Expand Down Expand Up @@ -812,7 +819,7 @@ herr_t write_hdf5_string(hid_t output_file_id,const char *dname, const char *str

// Internal: Make sure TReal and real_id match
template<class TReal, class TMat>
IOStatus write_mat_from_matrix_hdf5_T(const char* output_filename, TMat * result, hid_t real_id, unsigned int pcoa_dims) {
IOStatus write_mat_from_matrix_hdf5_T(const char* output_filename, TMat * result, hid_t real_id, unsigned int pcoa_dims, bool save_dist) {
/* Create a new file using default properties. */
hid_t output_file_id = H5Fcreate(output_filename, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
if (output_file_id<0) return write_error;
Expand Down Expand Up @@ -857,7 +864,7 @@ IOStatus write_mat_from_matrix_hdf5_T(const char* output_filename, TMat * result
}

// save the matrix
{
if (save_dist) {
hsize_t dims[2];
dims[0] = result->n_samples;
dims[1] = result->n_samples;
Expand Down Expand Up @@ -994,7 +1001,7 @@ IOStatus write_mat_from_matrix_hdf5_T(const char* output_filename, TMat * result

// Internal: Make sure TReal and real_id match
template<class TReal, class TMat>
IOStatus write_mat_hdf5_T(const char* output_filename, mat_t* result,hid_t real_id, unsigned int pcoa_dims) {
IOStatus write_mat_hdf5_T(const char* output_filename, mat_t* result,hid_t real_id, unsigned int pcoa_dims, bool save_dist) {
// compute the matrix
TMat mat_full;
mat_full.n_samples = result->n_samples;
Expand All @@ -1009,26 +1016,26 @@ IOStatus write_mat_hdf5_T(const char* output_filename, mat_t* result,hid_t real_
mat_full.sample_ids = result->sample_ids; // just link

condensed_form_to_matrix_T(result->condensed_form, n_samples, mat_full.matrix);
IOStatus err = write_mat_from_matrix_hdf5_T<TReal,TMat>(output_filename, &mat_full, real_id, pcoa_dims);
IOStatus err = write_mat_from_matrix_hdf5_T<TReal,TMat>(output_filename, &mat_full, real_id, pcoa_dims, save_dist);

free(mat_full.matrix);
return err;
}

IOStatus write_mat_hdf5(const char* output_filename, mat_t* result, unsigned int pcoa_dims) {
return write_mat_hdf5_T<double,mat_full_fp64_t>(output_filename,result,H5T_IEEE_F64LE,pcoa_dims);
IOStatus write_mat_hdf5_fp64(const char* output_filename, mat_t* result, unsigned int pcoa_dims, int save_dist) {
return write_mat_hdf5_T<double,mat_full_fp64_t>(output_filename,result,H5T_IEEE_F64LE,pcoa_dims,save_dist);
}

IOStatus write_mat_hdf5_fp32(const char* output_filename, mat_t* result, unsigned int pcoa_dims) {
return write_mat_hdf5_T<float,mat_full_fp32_t>(output_filename,result,H5T_IEEE_F32LE,pcoa_dims);
IOStatus write_mat_hdf5_fp32(const char* output_filename, mat_t* result, unsigned int pcoa_dims, int save_dist) {
return write_mat_hdf5_T<float,mat_full_fp32_t>(output_filename,result,H5T_IEEE_F32LE,pcoa_dims,save_dist);
}

IOStatus write_mat_from_matrix_hdf5(const char* output_filename, mat_full_fp64_t* result, unsigned int pcoa_dims) {
return write_mat_from_matrix_hdf5_T<double,mat_full_fp64_t>(output_filename,result,H5T_IEEE_F64LE,pcoa_dims);
IOStatus write_mat_from_matrix_hdf5_fp64(const char* output_filename, mat_full_fp64_t* result, unsigned int pcoa_dims, int save_dist) {
return write_mat_from_matrix_hdf5_T<double,mat_full_fp64_t>(output_filename,result,H5T_IEEE_F64LE,pcoa_dims,save_dist);
}

IOStatus write_mat_from_matrix_hdf5_fp32(const char* output_filename, mat_full_fp32_t* result, unsigned int pcoa_dims) {
return write_mat_from_matrix_hdf5_T<float,mat_full_fp32_t>(output_filename,result,H5T_IEEE_F32LE,pcoa_dims);
IOStatus write_mat_from_matrix_hdf5_fp32(const char* output_filename, mat_full_fp32_t* result, unsigned int pcoa_dims, int save_dist) {
return write_mat_from_matrix_hdf5_T<float,mat_full_fp32_t>(output_filename,result,H5T_IEEE_F32LE,pcoa_dims,save_dist);
}

IOStatus write_vec(const char* output_filename, r_vec* result) {
Expand Down
16 changes: 10 additions & 6 deletions src/api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,29 +336,31 @@ EXTERN ComputeStatus unifrac_to_file(const char* biom_filename, const char* tree
*/
EXTERN IOStatus write_mat(const char* filename, mat_t* result);

/* Write a matrix object using hdf5 format
/* Write a matrix object using hdf5 format, using fp64 precision
*
* filename <const char*> the file to write into
* result <mat_t*> the results object
* pcoa_dims <uint> PCoA dimensions to compute, if >0
* save_dist <bool> If false, do not save the distance matrix data
*
* The following error codes are returned:
*
* write_okay : no problems
*/
EXTERN IOStatus write_mat_hdf5(const char* filename, mat_t* result, unsigned int pcoa_dims);
EXTERN IOStatus write_mat_hdf5_fp64(const char* filename, mat_t* result, unsigned int pcoa_dims, int save_dist);

/* Write a matrix object using hdf5 format, using fp32 precision
*
* filename <const char*> the file to write into
* result <mat_t*> the results object
* pcoa_dims <uint> PCoA dimensions to compute, if >0
* save_dist <bool> If false, do not save the distance matrix data
*
* The following error codes are returned:
*
* write_okay : no problems
*/
EXTERN IOStatus write_mat_hdf5_fp32(const char* filename, mat_t* result, unsigned int pcoa_dims);
EXTERN IOStatus write_mat_hdf5_fp32(const char* filename, mat_t* result, unsigned int pcoa_dims, int save_dist);

/* Write a matrix object
*
Expand All @@ -372,29 +374,31 @@ EXTERN IOStatus write_mat_hdf5_fp32(const char* filename, mat_t* result, unsigne
EXTERN IOStatus write_mat_from_matrix(const char* filename, mat_full_fp64_t* result);


/* Write a matrix object from buffer using hdf5 format
/* Write a matrix object from buffer using hdf5 format, using fp64 precision
*
* filename <const char*> the file to write into
* result <mat_full_fp64_t*> the results object
* pcoa_dims <uint> PCoA dimensions to compute, if >0
* save_dist <bool> If false, do not save the distance matrix data
*
* The following error codes are returned:
*
* write_okay : no problems
*/
EXTERN IOStatus write_mat_from_matrix_hdf5(const char* filename, mat_full_fp64_t* result, unsigned int pcoa_dims);
EXTERN IOStatus write_mat_from_matrix_hdf5_fp64(const char* filename, mat_full_fp64_t* result, unsigned int pcoa_dims, int save_dist);

/* Write a matrix object from buffer using hdf5 format, using fp32 precision
*
* filename <const char*> the file to write into
* result <mat_full_fp32_t*> the results object
* pcoa_dims <uint> PCoA dimensions to compute, if >0
* save_dist <bool> If false, do not save the distance matrix data
*
* The following error codes are returned:
*
* write_okay : no problems
*/
EXTERN IOStatus write_mat_from_matrix_hdf5_fp32(const char* filename, mat_full_fp32_t* result, unsigned int pcoa_dims);
EXTERN IOStatus write_mat_from_matrix_hdf5_fp32(const char* filename, mat_full_fp32_t* result, unsigned int pcoa_dims, int save_dist);

/* Write a series
*
Expand Down
13 changes: 8 additions & 5 deletions src/su.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include "biom.hpp"
#include "unifrac.hpp"

enum Format {format_invalid,format_ascii, format_hdf5_fp32, format_hdf5_fp64};
enum Format {format_invalid,format_ascii, format_hdf5_fp32, format_hdf5_fp64, format_hdf5_nodist};

void usage() {
std::cout << "usage: ssu -i <biom> -o <out.dm> -m [METHOD] -t <newick> [-a alpha] [-f] [--vaw]" << std::endl;
Expand Down Expand Up @@ -44,6 +44,7 @@ void usage() {
std::cout << " \t\t hfd5 : HFD5 format. May be fp32 or fp64, depending on method." << std::endl;
std::cout << " \t\t hdf5_fp32 : HFD5 format, using fp32 precision." << std::endl;
std::cout << " \t\t hdf5_fp64 : HFD5 format, using fp64 precision." << std::endl;
std::cout << " \t\t hfd5_nodist : HFD5 format, no distance matrix, just PCoA." << std::endl;
std::cout << " --pcoa\t[OPTIONAL] Number of PCoA dimensions to compute (default: 10, do not compute if 0)" << std::endl;
std::cout << " --diskbuf\t[OPTIONAL] Use a disk buffer to reduce memory footprint. Provide path to a fast partition (ideally NVMe)." << std::endl;
std::cout << " -n\t\t[OPTIONAL] DEPRECATED, no-op." << std::endl;
Expand Down Expand Up @@ -168,7 +169,7 @@ int mode_merge_partial_fp32(const char * output_filename, Format format_val, uns
}

IOStatus iostatus;
iostatus = write_mat_from_matrix_hdf5_fp32(output_filename, result, pcoa_dims);
iostatus = write_mat_from_matrix_hdf5_fp32(output_filename, result, pcoa_dims, format_val!=format_hdf5_nodist);
destroy_mat_full_fp32(&result);

if(iostatus != write_okay) {
Expand Down Expand Up @@ -196,8 +197,8 @@ int mode_merge_partial_fp64(const char * output_filename, Format format_val, uns
}

IOStatus iostatus;
if (format_val==format_hdf5_fp64) {
iostatus = write_mat_from_matrix_hdf5(output_filename, result, pcoa_dims);
if (format_val!=format_ascii) {
iostatus = write_mat_from_matrix_hdf5_fp64(output_filename, result, pcoa_dims, format_val!=format_hdf5_nodist);
} else {
iostatus = write_mat_from_matrix(output_filename, result);
}
Expand Down Expand Up @@ -431,6 +432,8 @@ Format get_format(const std::string &format_string, const std::string &method_st
format_val = format_hdf5_fp32;
} else if (format_string == "hdf5_fp64") {
format_val = format_hdf5_fp64;
} else if (format_string == "hdf5_nodist") {
format_val = format_hdf5_nodist;
} else if (format_string == "hdf5") {
if ((method_string=="unweighted_fp64") || (method_string=="weighted_normalized_fp64") || (method_string=="weighted_unnormalized_fp64") || (method_string=="generalized_fp64"))
format_val = format_hdf5_fp64;
Expand Down Expand Up @@ -521,7 +524,7 @@ int main(int argc, char **argv){
format_arg=sformat_arg; // easier to use a single variable
}
if(format_val==format_invalid) {
err("Invalid format, must be one of ascii|hdf5|hdf5_fp32|hdf5_fp64");
err("Invalid format, must be one of ascii|hdf5|hdf5_fp32|hdf5_fp64|hdf5_nodist");
return EXIT_FAILURE;
}

Expand Down