#!/usr/bin/env python
# Copyright 2022-2023 John Lees, Nick Croucher and Samuel Horsfield
"""Generate distributable directories and archives from a PopPUNK fit.

Files are collected from a database directory and a model-fit directory into
two output directories, ``<outpref>_full`` and ``<outpref>_refs``. Unless
``--no-compress`` is given, each output directory is also packed into a
``.tar.bz2`` archive written next to it.
"""

import shutil
import os
import sys
import argparse
import tarfile

# Substring that routes a file to the "refs" output instead of the "full" one.
REFS_TAG = ".refs"

# File-name substrings selected from the database directory.
DB_PATTERNS = (".dists.npy", ".dists.pkl", ".h5", ".png", "_qcreport.txt")

# File-name substrings selected from the fit directory for each output.
FULL_FIT_PATTERNS = ("_fit.npz", "_fit.pkl", "_graph.gt", ".csv", ".png")
REFS_FIT_PATTERNS = ("_fit.npz", "_fit.pkl", ".csv", ".png", "_qcreport.txt")

# Extra pattern added when --lineage is given.
# NOTE: any name containing this also contains "_fit.npz", so with substring
# matching it selects nothing beyond the patterns above.
LINEAGE_PATTERN = "rank_k_fit.npz"


def get_options(args=None):
    """Parse the command-line options.

    Args:
        args: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``dbdir``, ``fitdir``,
        ``outpref``, ``lineage`` and ``no_compress``.
    """
    description = 'Generates distributable fits from PopPUNK'
    parser = argparse.ArgumentParser(description=description,
                                     prog='python poppunk_distribute_fit.py')

    IO = parser.add_argument_group('Input/Output options')
    IO.add_argument('--dbdir', default=None,
                    help='PopPUNK Database Directory. ')
    IO.add_argument('--fitdir', default=None,
                    help='PopPUNK fit Directory. ')
    IO.add_argument('--outpref', default="PopPUNK",
                    help='Output file prefix. [Default = "PopPUNK"]')
    IO.add_argument('--lineage', default=False, action="store_true",
                    help='Specify if lineage used for fit. [Default = False]')
    IO.add_argument('--no-compress', default=False, action="store_true",
                    help='No compression of fits. [Default = False] ')

    return parser.parse_args(args)


def _list_files(directory):
    """Return the sorted names of regular files directly inside *directory*."""
    return sorted(
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )


def _is_wanted(name, patterns=None, refs=None):
    """Decide whether the file called *name* should be copied.

    Args:
        name: File name without any directory part.
        patterns: Tuple of substrings, at least one of which must occur in
            *name*; ``None`` disables this filter.
        refs: ``True`` keeps only names containing ``REFS_TAG``, ``False``
            keeps only names without it, ``None`` disables this filter.
    """
    if refs is not None and (REFS_TAG in name) != refs:
        return False
    return patterns is None or any(pattern in name for pattern in patterns)


def _copy_matching(src_dir, dest_dir, patterns=None, refs=None):
    """Copy every regular file in *src_dir* accepted by ``_is_wanted``."""
    for name in _list_files(src_dir):
        # Match on the file name only: testing the full path would let a
        # parent directory called e.g. "my.refs" change which files qualify.
        if _is_wanted(name, patterns, refs):
            shutil.copy(os.path.join(src_dir, name), dest_dir)


def _compress_dir(out_dir):
    """Pack the regular files in *out_dir* into ``<out_dir>.tar.bz2``.

    Members are stored as ``<directory name>/<file name>``. The archive is
    written beside *out_dir* and its path is returned.
    """
    parent, dir_name = os.path.split(out_dir)
    tar_path = os.path.join(parent, dir_name + ".tar.bz2")
    with tarfile.open(tar_path, "w:bz2") as tar:
        for name in _list_files(out_dir):
            # arcname yields the relative member name without having to
            # change the process working directory.
            tar.add(os.path.join(out_dir, name),
                    arcname=os.path.join(dir_name, name))
    return tar_path


def _report_missing(directories):
    """Print a message for each path that is not a directory.

    Returns:
        ``True`` if at least one path was reported, ``False`` otherwise.
    """
    missing = [path for path in directories if not os.path.isdir(path)]
    for path in missing:
        print("Directory {} not found".format(path))
    return bool(missing)


def main(argv=None):
    """Build the distributable directories and, optionally, their archives.

    Args:
        argv: Optional list of argument strings, passed to ``get_options``.

    Returns:
        Process exit status: ``0`` on success, ``1`` when a required option
        is missing or a directory cannot be found.
    """
    options = get_options(argv)

    if options.dbdir is None or options.fitdir is None:
        print("All input directories must be specified")
        return 1

    db_dir = os.path.abspath(options.dbdir)
    fit_dir = os.path.abspath(options.fitdir)
    out_full = os.path.abspath(options.outpref + "_full")
    out_refs = os.path.abspath(options.outpref + "_refs")

    # Validate the inputs before touching the file system, so a typo in an
    # input path does not leave empty output directories behind.
    if _report_missing((db_dir, fit_dir)):
        return 1

    for out_dir in (out_full, out_refs):
        if not os.path.exists(out_dir):
            # makedirs also creates missing parents of the output prefix.
            os.makedirs(out_dir)

    # An output path that already exists as a regular file is caught here.
    if _report_missing((out_full, out_refs)):
        return 1

    lineage_extra = (LINEAGE_PATTERN,) if options.lineage else ()

    # Full dataset: database files plus fit files, excluding anything
    # tagged as refs.
    _copy_matching(db_dir, out_full, DB_PATTERNS, refs=False)
    _copy_matching(fit_dir, out_full, FULL_FIT_PATTERNS + lineage_extra,
                   refs=False)

    # Refs dataset: both passes read the fit directory. The first takes every
    # refs-tagged file, the second takes the fit files whether tagged or not.
    _copy_matching(fit_dir, out_refs, refs=True)
    _copy_matching(fit_dir, out_refs, REFS_FIT_PATTERNS + lineage_extra)

    if not options.no_compress:
        _compress_dir(out_refs)
        _compress_dir(out_full)

    return 0


if __name__ == "__main__":
    sys.exit(main())
deletion(-) diff --git a/scripts/poppunk_distribute_fit.py b/scripts/poppunk_distribute_fit.py index bb73aaab..1f55f4f1 100644 --- a/scripts/poppunk_distribute_fit.py +++ b/scripts/poppunk_distribute_fit.py @@ -9,7 +9,7 @@ def get_options(): description = 'Generates distributable fits from PopPUNK' - parser = argparse.ArgumentParser(description=description, prog='python PopPUNK_distribution.py') + parser = argparse.ArgumentParser(description=description, prog='python poppunk_distribute_fit.py') IO = parser.add_argument_group('Input/Output options') IO.add_argument('--dbdir', default=None, help='PopPUNK Database Directory. ') From df84138858256427f18258876f80cad3c622e99f Mon Sep 17 00:00:00 2001 From: Sam Horsfield Date: Wed, 28 Sep 2022 14:46:00 +0100 Subject: [PATCH 3/3] Updates documentation for poppunk_distribute_fit.py --- PopPUNK/__init__.py | 2 +- docs/model_distribution.rst | 3 +++ docs/scripts.rst | 20 +++++++++++++++++++- setup.py | 3 ++- 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py index 2a12e3a5..aa0182e9 100644 --- a/PopPUNK/__init__.py +++ b/PopPUNK/__init__.py @@ -3,7 +3,7 @@ '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)''' -__version__ = '2.5.1' +__version__ = '2.5.2' # Minimum sketchlib version SKETCHLIB_MAJOR = 2 diff --git a/docs/model_distribution.rst b/docs/model_distribution.rst index 4d6ff0cf..ddbbc285 100644 --- a/docs/model_distribution.rst +++ b/docs/model_distribution.rst @@ -32,6 +32,9 @@ Lineage models do not use references. enable query assignment, but visualisation and subclustering within strains will no longer be possible, as full information within each strain will be missing. +These databases can be automatically generated using the ``poppunk_distribute_fit.py`` +script after model fitting. See the :doc:`scripts` page for more information. 
+ Picking references ------------------ PopPUNK automatically prunes redundant sequence information from databases by removing diff --git a/docs/scripts.rst b/docs/scripts.rst index 2ab1474c..b1139e2b 100644 --- a/docs/scripts.rst +++ b/docs/scripts.rst @@ -108,8 +108,26 @@ To run:: poppunk_calculate_silhouette.py --distances strain_db.dists --cluster-csv strain_db_clusters.csv -The following additonal options are available for use with external clusterings (e.g. from hierBAPS): +The following additional options are available for use with external clusterings (e.g. from hierBAPS): - ``--cluster-col`` the (1-indexed) column index containing the cluster assignment - ``--id-col`` the (1-indexed) column index containing the sample names - ``--sub`` a string to remove from sample names to match them to those in ``--distances`` + +Distributing PopPUNK models +------------------------------ +This script automatically generates compressed and uncompressed directories containing all files +required for distribution and reuse of PopPUNK model fits. + +To run:: + + python poppunk_distribute_fit.py --dbdir database_directory --fitdir model_fit_directory --outpref output_prefix + +The following additional arguments are available: + +- ``--lineage`` specify only if lineage fit was used. +- ``--no-compress`` will not generate tar.bz2 archives + +``--dbdir`` and ``--fitdir`` can be the same directory, however both must still be specified. +The output of this script is a directory and a compressed tar.bz2 archive for each of the +full dataset and representative genomes dataset. 
diff --git a/setup.py b/setup.py index 3dad2af6..1f54053a 100755 --- a/setup.py +++ b/setup.py @@ -136,7 +136,8 @@ def build_extension(self, ext): 'scripts/poppunk_extract_distances.py', 'scripts/poppunk_iterate.py', 'scripts/poppunk_add_weights.py', - 'scripts/poppunk_easy_run.py'], + 'scripts/poppunk_easy_run.py', + 'scripts/poppunk_distribute_fit.py'], ext_modules=[CMakeExtension('poppunk_refine')], test_suite="test", cmdclass=dict(build_ext=CMakeBuild),