Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds poppunk_distribute_fit.py #226

Merged
merged 3 commits into from
Sep 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion PopPUNK/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.5.1'
__version__ = '2.5.2'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 2
Expand Down
3 changes: 3 additions & 0 deletions docs/model_distribution.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ Lineage models do not use references.
enable query assignment, but visualisation and subclustering within strains will no longer be
possible, as full information within each strain will be missing.

These databases can be automatically generated using the ``poppunk_distribute_fit.py``
script after model fitting. See the :doc:`scripts` page for more information.

Picking references
------------------
PopPUNK automatically prunes redundant sequence information from databases by removing
Expand Down
20 changes: 19 additions & 1 deletion docs/scripts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,26 @@ To run::

poppunk_calculate_silhouette.py --distances strain_db.dists --cluster-csv strain_db_clusters.csv

The following additonal options are available for use with external clusterings (e.g. from hierBAPS):
The following additional options are available for use with external clusterings (e.g. from hierBAPS):

- ``--cluster-col`` the (1-indexed) column index containing the cluster assignment
- ``--id-col`` the (1-indexed) column index containing the sample names
- ``--sub`` a string to remove from sample names to match them to those in ``--distances``

Distributing PopPUNK models
------------------------------
This script automatically generates compressed and uncompressed directories containing all files
required for distribution and reuse of PopPUNK model fits.

To run::

python poppunk_distribute_fit.py --dbdir database_directory --fitdir model_fit_directory --outpref output_prefix

The following additional arguments are available:

- ``--lineage`` specify only if lineage fit was used.
- ``--no-compress`` will not generate tar.bz2 archives

``--dbdir`` and ``--fitdir`` can be the same directory; however, both must still be specified.
The output of this script is a directory and a compressed tar.bz2 archive for each of the
full dataset and representative genomes dataset.
146 changes: 146 additions & 0 deletions scripts/poppunk_distribute_fit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#!/usr/bin/env python
# Copyright 2022-2023 John Lees, Nick Croucher and Samuel Horsfield

import shutil
import os
import sys
import argparse
import tarfile

def get_options(args=None):
    """Parse the command-line options of ``poppunk_distribute_fit.py``.

    Args:
        args: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the arguments from ``sys.argv``,
            which is what a plain ``get_options()`` call has always done.

    Returns:
        The ``argparse.Namespace`` holding ``dbdir``, ``fitdir``,
        ``outpref``, ``lineage`` and ``no_compress``.
    """
    description = 'Generates distributable fits from PopPUNK'
    parser = argparse.ArgumentParser(description=description, prog='python poppunk_distribute_fit.py')

    io_group = parser.add_argument_group('Input/Output options')
    # Both directories default to None so the caller can detect a missing
    # value and report it with its own message.
    io_group.add_argument('--dbdir', default=None, help='PopPUNK Database Directory. ')
    io_group.add_argument('--fitdir', default=None, help='PopPUNK fit Directory. ')
    io_group.add_argument('--outpref', default="PopPUNK", help='Output file prefix. [Default = "PopPUNK"]')
    io_group.add_argument('--lineage', default=False, action="store_true", help='Specify if lineage used for fit. [Default = False]')
    io_group.add_argument('--no-compress', default=False, action="store_true", help='No compression of fits. [Default = False] ')

    return parser.parse_args(args)

def _list_files(directory):
    """Return the names of the regular files located directly in *directory*."""
    return [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]


def _has_any(name, patterns):
    """Return True when *name* contains at least one of the *patterns*."""
    return any(pattern in name for pattern in patterns)


def _copy_selected(src_dir, dest_dir, keep):
    """Copy each regular file in *src_dir* whose name satisfies *keep* into *dest_dir*."""
    for name in _list_files(src_dir):
        if keep(name):
            shutil.copy(os.path.join(src_dir, name), dest_dir)


def _compress_dir(directory):
    """Write ``<directory>.tar.bz2`` next to *directory* and return its path.

    Only the regular files directly inside *directory* are archived; each
    member is stored as ``<directory name>/<file name>``.
    """
    parent, base = os.path.split(directory)
    tar_path = os.path.join(parent, base + ".tar.bz2")
    with tarfile.open(tar_path, "w:bz2") as tar:
        for name in _list_files(directory):
            # arcname gives the relative member name without having to
            # change the process working directory.
            tar.add(os.path.join(directory, name), arcname=os.path.join(base, name))
    return tar_path


def distribute_fit(db_dir, fit_dir, outpref="PopPUNK", lineage=False, compress=True):
    """Collect the files of a PopPUNK database and model fit for distribution.

    Two directories are filled, ``<outpref>_full`` and ``<outpref>_refs``, and,
    unless *compress* is false, a ``.tar.bz2`` archive is written beside each.
    File selection works on substrings of the file *name* (never the directory
    part of the path), so the names of parent directories cannot influence
    which files are picked.

    Args:
        db_dir: Directory holding the database files.
        fit_dir: Directory holding the model fit (may equal *db_dir*).
        outpref: Prefix of the two output directories.
        lineage: Add the lineage-specific pattern to the fit patterns.
        compress: Also create the ``.tar.bz2`` archives.

    Returns:
        Tuple ``(out_full, out_refs)`` of absolute output directory paths.

    Raises:
        NotADirectoryError: An input directory does not exist, or an output
            path exists but is not a directory. The message has one
            ``Directory ... not found`` line per offending path.
    """
    db_dir = os.path.abspath(db_dir)
    fit_dir = os.path.abspath(fit_dir)
    out_full = os.path.abspath(outpref + "_full")
    out_refs = os.path.abspath(outpref + "_refs")

    # Validate the inputs before touching the file system, so that a typo in
    # an input path does not leave empty output directories behind.
    missing = [path for path in (db_dir, fit_dir) if not os.path.isdir(path)]
    if not missing:
        for out_dir in (out_full, out_refs):
            try:
                os.makedirs(out_dir, exist_ok=True)
            except FileExistsError:
                # The path exists but is not a directory.
                missing.append(out_dir)
    if missing:
        raise NotADirectoryError(
            "\n".join("Directory {} not found".format(path) for path in missing)
        )

    db_exts = (".dists.npy", ".dists.pkl", ".h5", ".png", "_qcreport.txt")
    full_fit_exts = ("_fit.npz", "_fit.pkl", "_graph.gt", ".csv", ".png")
    refs_fit_exts = ("_fit.npz", "_fit.pkl", ".csv", ".png", "_qcreport.txt")
    if lineage:
        # NOTE(review): any name containing "rank_k_fit.npz" also contains
        # "_fit.npz", so this extra pattern currently selects nothing new;
        # kept as in the original script.
        full_fit_exts += ("rank_k_fit.npz",)
        refs_fit_exts += ("rank_k_fit.npz",)

    # Full dataset: database files first, then fit files, both without the
    # reference-only (".refs") files. Later copies overwrite earlier ones.
    _copy_selected(
        db_dir, out_full,
        lambda name: _has_any(name, db_exts) and ".refs" not in name,
    )
    _copy_selected(
        fit_dir, out_full,
        lambda name: _has_any(name, full_fit_exts) and ".refs" not in name,
    )

    # Reference dataset: everything marked ".refs" plus the fit files, all
    # taken from the fit directory.
    _copy_selected(
        fit_dir, out_refs,
        lambda name: ".refs" in name or _has_any(name, refs_fit_exts),
    )

    if compress:
        _compress_dir(out_refs)
        _compress_dir(out_full)

    return out_full, out_refs


def main():
    """Run the script from the command line and return its exit status."""
    options = get_options()

    if options.dbdir is None or options.fitdir is None:
        print("All input directories must be specified")
        return 1

    try:
        distribute_fit(
            options.dbdir,
            options.fitdir,
            options.outpref,
            lineage=options.lineage,
            compress=not options.no_compress,
        )
    except NotADirectoryError as err:
        print(err)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ def build_extension(self, ext):
'scripts/poppunk_extract_distances.py',
'scripts/poppunk_iterate.py',
'scripts/poppunk_add_weights.py',
'scripts/poppunk_easy_run.py'],
'scripts/poppunk_easy_run.py',
'scripts/poppunk_distribute_fit.py'],
ext_modules=[CMakeExtension('poppunk_refine')],
test_suite="test",
cmdclass=dict(build_ext=CMakeBuild),
Expand Down