diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py
index 79da77c1..ee6b5324 100644
--- a/PopPUNK/__init__.py
+++ b/PopPUNK/__init__.py
@@ -3,7 +3,7 @@
 
 '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''
 
-__version__ = '2.4.4'
+__version__ = '2.4.5'
 
 # Minimum sketchlib version
 SKETCHLIB_MAJOR = 1
diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
index 733f0163..00760ade 100644
--- a/PopPUNK/__main__.py
+++ b/PopPUNK/__main__.py
@@ -10,15 +10,10 @@ import subprocess
 from collections import defaultdict
 
-# Try to import sketchlib
-try:
-    import pp_sketchlib
-except ImportError as e:
-    sys.stderr.write("Sketchlib backend not available\n")
-    sys.exit(1)
+import h5py
+import pp_sketchlib
 
 import poppunk_refine
-import h5py
 
 # import poppunk package
 from .__init__ import __version__
@@ -127,7 +122,7 @@ def get_options():
     refinementGroup.add_argument('--manual-start', help='A file containing information for a start point. '
                                                         'See documentation for help.', default=None)
     refinementGroup.add_argument('--model-dir', help='Directory containing model to use for assigning queries '
-                                                     'to clusters [default = reference database directory]', type = str)
+                                                     'to clusters [default = reference database directory]', type = str)
     refinementGroup.add_argument('--score-idx',
                                  help='Index of score to use [default = 0]',
                                  type=int, default = 0, choices=[0, 1, 2])
@@ -138,6 +133,10 @@ def get_options():
     refineMode.add_argument('--unconstrained',
                             help='Optimise both boundary gradient and intercept',
                             default=False, action='store_true')
+    refineMode.add_argument('--multi-boundary',
+                            help='Produce multiple sets of clusters at different boundary positions. This argument sets the '
+                                 'number of boundary positions between n-1 clusters and the refine optimum.',
+                            type=int, default=0)
     refineMode.add_argument('--indiv-refine',
                             help='Also run refinement for core and accessory individually',
                             choices=['both', 'core', 'accessory'], default=None)
@@ -218,7 +217,7 @@ def main():
         sys.exit(0)
 
     # Imports are here because graph tool is very slow to load
-    from .models import loadClusterFit, ClusterFit, BGMMFit, DBSCANFit, RefineFit, LineageFit
+    from .models import loadClusterFit, BGMMFit, DBSCANFit, RefineFit, LineageFit
 
     from .sketchlib import checkSketchlibLibrary
     from .sketchlib import removeFromDB
@@ -226,7 +225,6 @@
     from .network import construct_network_from_assignments
     from .network import extractReferences
     from .network import printClusters
-    from .network import get_vertex_list
     from .network import save_network
     from .network import checkNetworkVertexCount
@@ -237,7 +235,6 @@
 
     from .utils import setGtThreads
     from .utils import setupDBFuncs
-    from .utils import storePickle
     from .utils import readPickle
     from .utils import qcDistMat
     from .utils import createOverallLineage
@@ -269,7 +266,7 @@ def main():
     }
 
     # Dict of DB access functions
-    dbFuncs = setupDBFuncs(args, args.min_kmer_count, qc_dict)
+    dbFuncs = setupDBFuncs(args, qc_dict)
     createDatabaseDir = dbFuncs['createDatabaseDir']
     constructDatabase = dbFuncs['constructDatabase']
     queryDatabase = dbFuncs['queryDatabase']
@@ -434,6 +431,7 @@ def main():
                       args.manual_start,
                       args.indiv_refine,
                       args.unconstrained,
+                      args.multi_boundary,
                       args.score_idx,
                       args.no_local,
                       args.betweenness_sample,
@@ -464,7 +462,7 @@ def main():
 
             # save model
             model.save()
-            
+
             # plot model
             if not args.no_plot:
                 model.plot(distMat, assignments)
diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
index b70c1ddd..2823d8c4 100644
--- a/PopPUNK/assign.py
+++ b/PopPUNK/assign.py
@@ -564,7 +564,7 @@ def main():
     }
 
     # Dict of DB access functions for assign_query (which is out of scope)
-    dbFuncs = setupDBFuncs(args, args.min_kmer_count, qc_dict)
+    dbFuncs = setupDBFuncs(args, qc_dict)
 
     # run according to mode
     sys.stderr.write("PopPUNK: assign\n")
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index ed59d037..5c51c0df 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -45,7 +45,6 @@
 except ImportError as e:
     gpu_lib = False
 
-# GPU support
 import pp_sketchlib
 import poppunk_refine
@@ -68,8 +67,7 @@
 from .plot import plot_dbscan_results
 
 # refine
-from .refine import refineFit
-from .refine import likelihoodBoundary
+from .refine import refineFit, multi_refine
 from .refine import readManualStart
 from .plot import plot_refined_results
@@ -713,7 +711,7 @@ def __init__(self, outPrefix):
         self.unconstrained = False
 
     def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False,
-            unconstrained = False, score_idx = 0, no_local = False,
+            unconstrained = False, multi_boundary = 0, score_idx = 0, no_local = False,
             betweenness_sample = betweenness_sample_default, use_gpu = False):
         '''Extends :func:`~ClusterFit.fit`
@@ -741,6 +739,10 @@
             indiv_refine (str)
                 Run refinement for core or accessory distances separately
                 (default = None).
+            multi_boundary (int)
+                Produce cluster output at multiple boundary positions downward
+                from the optimum.
+                (default = 0).
             unconstrained (bool)
                 If True, search in 2D and change the slope of the boundary
             score_idx (int)
@@ -784,7 +786,7 @@
             raise RuntimeError("Unrecognised model type")
 
         # Main refinement in 2D
-        self.optimal_x, self.optimal_y = \
+        self.optimal_x, self.optimal_y, optimal_s = \
             refineFit(X/self.scale,
                       sample_names,
                       self.mean0,
@@ -801,6 +803,18 @@
                       use_gpu = use_gpu)
         self.fitted = True
 
+        # Output clusters at more positions if requested
+        if multi_boundary > 1:
+            multi_refine(X/self.scale,
+                         sample_names,
+                         self.mean0,
+                         self.mean1,
+                         optimal_s,
+                         multi_boundary,
+                         self.outPrefix,
+                         num_processes = self.threads,
+                         use_gpu = use_gpu)
+
         # Try and do a 1D refinement for both core and accessory
         self.core_boundary = self.optimal_x
         self.accessory_boundary = self.optimal_y
@@ -810,7 +824,7 @@
                 if indiv_refine == 'both' or indiv_refine == dist_type:
                     sys.stderr.write("Refining " + dist_type + " distances separately\n")
                     # optimise core distance boundary
-                    core_boundary, accessory_boundary = \
+                    core_boundary, accessory_boundary, s = \
                         refineFit(X/self.scale,
                                   sample_names,
                                   self.mean0,
@@ -825,9 +839,9 @@
                                   betweenness_sample = betweenness_sample,
                                   use_gpu = use_gpu)
                     if dist_type == "core":
-                        self.core_boundary = core_boundary
+                        self.core_boundary = core_boundary
                     if dist_type == "accessory":
-                        self.accessory_boundary = accessory_boundary
+                        self.accessory_boundary = accessory_boundary
                     self.indiv_fitted = True
             except RuntimeError as e:
                 print(e)
diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index 6a1daa38..7065ba4c 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -22,7 +22,6 @@
 import pickle
 import graph_tool.all as gt
 import dendropy
-import poppunk_refine
 
 # Load GPU libraries
 try:
@@ -36,6 +35,8 @@ except ImportError as e:
     gpu_lib = False
 
+import poppunk_refine
+
 from .__main__ import accepted_weights_types
 from .__main__ import betweenness_sample_default
diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py
index 327afef1..516ff690 100644
--- a/PopPUNK/refine.py
+++ b/PopPUNK/refine.py
@@ -20,8 +20,6 @@
 except ImportError as e:
     sys.stderr.write("This version of PopPUNK requires python v3.8 or higher\n")
     sys.exit(0)
 
-import pp_sketchlib
-import poppunk_refine
 import graph_tool.all as gt
 import pandas as pd
@@ -37,9 +35,11 @@
 except ImportError as e:
     gpu_lib = False
 
+import poppunk_refine
+
 from .__main__ import betweenness_sample_default
 
-from .network import construct_network_from_df
+from .network import construct_network_from_df, printClusters
 from .network import construct_network_from_edge_list
 from .network import networkSummary
 from .network import add_self_loop
@@ -96,6 +96,8 @@ def refineFit(distMat, sample_names, mean0, mean1, scale,
             x-coordinate of refined fit
         optimal_y (float)
             y-coordinate of refined fit
+        optimised_s (float)
+            Position along search range of refined fit
     """
     # Optimize boundary - grid search for global minimum
     sys.stderr.write("Trying to optimise score globally\n")
@@ -165,6 +167,7 @@
     min_idx = np.argmin(global_s)
     optimal_x = x_max[min_idx % global_grid_resolution]
     optimal_y = y_max[min_idx // global_grid_resolution]
+    optimised_s = global_s[min_idx]
 
     if not (optimal_x > x_max_start and optimal_x < x_max_end and \
             optimal_y > y_max_start and optimal_y < y_max_end):
@@ -249,7 +252,62 @@
     if optimal_x < 0 or optimal_y < 0:
         raise RuntimeError("Optimisation failed: produced a boundary outside of allowed range\n")
 
-    return optimal_x, optimal_y
+    return optimal_x, optimal_y, optimised_s
+
+def multi_refine(distMat, sample_names, mean0, mean1, s_max,
+                 n_boundary_points, output_prefix,
+                 num_processes = 1, use_gpu = False):
+    """Move the refinement boundary between the optimum and where it meets an
+    axis, in discrete steps, outputting the clusters at each step
+
+    Args:
+        distMat (numpy.array)
+            n x 2 array of core and accessory distances for n samples
+        sample_names (list)
+            List of query sequence labels
+        mean0 (numpy.array)
+            Start point to define search line
+        mean1 (numpy.array)
+            End point to define search line
+        s_max (float)
+            The optimal s position from refinement (:func:`~PopPUNK.refine.refineFit`)
+        n_boundary_points (int)
+            Number of positions to try drawing the boundary at
+        num_processes (int)
+            Number of threads to use in the global optimisation step.
+            (default = 1)
+        use_gpu (bool)
+            Whether to use cugraph for graph analyses
+    """
+    # load CUDA libraries
+    use_gpu = check_and_set_gpu(use_gpu, gpu_lib)
+
+    # Set the range
+    # Between optimised s and where line meets an axis
+    gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0])
+    # Equation of normal passing through origin y = -1/m * x
+    # Where this meets line y - y1 = m(x - x1) is at:
+    x = (gradient * mean0[0] - mean0[1]) / (gradient + 1 / gradient)
+    y = mean0[1] + gradient * (x - mean0[0])
+
+    s_min = -((mean0[0] - x)**2 + (mean0[1] - y)**2)**0.5
+    s_range = np.linspace(s_min, s_max, num = n_boundary_points)[1:]
+
+    i_vec, j_vec, idx_vec = \
+        poppunk_refine.thresholdIterate1D(distMat, s_range, 2,
+                                          mean0[0], mean0[1],
+                                          mean1[0], mean1[1],
+                                          num_processes)
+
+    growNetwork(sample_names,
+                i_vec,
+                j_vec,
+                idx_vec,
+                s_range,
+                0,
+                write_clusters = output_prefix,
+                use_gpu = use_gpu)
+
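The ``s_min`` calculation above can be verified with a little coordinate geometry. Writing $(x_0, y_0)$ for ``mean0`` and $m$ for ``gradient``, the search line and its normal through the origin are

$$y - y_0 = m (x - x_0), \qquad y = -\frac{x}{m}.$$

Equating the two and rearranging,

$$x \left( m + \frac{1}{m} \right) = m x_0 - y_0 \quad\Longrightarrow\quad x = \frac{m x_0 - y_0}{m + 1/m},$$

which is the expression in the code; ``s_min`` is then minus the Euclidean distance from ``mean0`` to this intersection, i.e. the signed position at which the boundary search line reaches the origin.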
 def expand_cugraph_network(G, G_extra_df):
     """Reconstruct a cugraph network with additional edges.
@@ -276,9 +334,9 @@
     G = add_self_loop(G_df, G_vertex_count, weights = False, renumber = False)
 
     return G
 
-def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
+def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx = 0,
                 thread_idx = 0, betweenness_sample = betweenness_sample_default,
-                use_gpu = False):
+                write_clusters = None, use_gpu = False):
     """Construct a network, then add edges to it iteratively. Input is from
     ``pp_sketchlib.iterateBoundary1D`` or ``pp_sketchlib.iterateBoundary2D``
@@ -302,6 +360,9 @@
         betweenness_sample (int)
             Number of sequences per component used to estimate betweenness using
             a GPU. Smaller numbers are faster but less precise [default = 100]
+        write_clusters (str)
+            Set to a prefix to write the clusters from each position to files
+            [default = None]
         use_gpu (bool)
             Whether to use cugraph for graph analyses
 
@@ -310,7 +371,6 @@
            -1 * network score for each of x_range.
            Where network score is from :func:`~PopPUNK.network.networkSummary`
     """
-
     scores = []
     prev_idx = -1
@@ -338,9 +398,9 @@
         # At first offset, make a new network, otherwise just add the new edges
         if prev_idx == -1:
             G = construct_network_from_df(sample_names, sample_names,
-                                            edge_df,
-                                            summarise = False,
-                                            use_gpu = use_gpu)
+                                          edge_df,
+                                          summarise = False,
+                                          use_gpu = use_gpu)
         else:
             if use_gpu:
                 G = expand_cugraph_network(G, edge_df)
             else:
                 G.add_edge_list(edge_list)
                 edge_list = []
         # Add score into vector for any offsets passed (should usually just be one)
-        latest_score = -networkSummary(G,
+        G_summary = networkSummary(G,
                                    score_idx > 0,
                                    betweenness_sample = betweenness_sample,
-                                   use_gpu = use_gpu)[1][score_idx]
+                                   use_gpu = use_gpu)
+        latest_score = -G_summary[1][score_idx]
         for s in range(prev_idx, idx):
             scores.append(latest_score)
             pbar.update(1)
+        # Write the cluster output as long as there is at least one
+        # non-trivial cluster
+        if write_clusters and G_summary[0][0] < len(sample_names):
+            o_prefix = write_clusters + "/" + \
+                       os.path.basename(write_clusters) + \
+                       "_boundary" + str(s)
+            printClusters(G,
+                          sample_names,
+                          outPrefix=o_prefix,
+                          write_unwords=False,
+                          use_gpu=use_gpu)
+
         prev_idx = idx
 
     return(scores)
diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py
index bb015634..63fa697b 100644
--- a/PopPUNK/sketchlib.py
+++ b/PopPUNK/sketchlib.py
@@ -584,7 +584,7 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num
     query_db = queryPrefix + "/" + os.path.basename(queryPrefix)
     distMat = pp_sketchlib.queryDatabase(ref_db, query_db, rNames, qNames, klist,
                                          True, False, threads, use_gpu, deviceid)
-    
+
     # option to plot core/accessory fits. Choose a random number from cmd line option
     if number_plot_fits > 0:
         jacobian = -np.hstack((np.ones((klist.shape[0], 1)), klist.reshape(-1, 1)))
diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py
index f823defe..c8f39d7d 100644
--- a/PopPUNK/utils.py
+++ b/PopPUNK/utils.py
@@ -14,7 +14,6 @@
 from tempfile import mkstemp
 from functools import partial
 import contextlib
-import poppunk_refine
 
 import numpy as np
 import pandas as pd
@@ -29,6 +28,7 @@
 except ImportError as e:
     gpu_lib = False
 
+import poppunk_refine
 import pp_sketchlib
 
 def setGtThreads(threads):
@@ -60,15 +60,13 @@
 
 # Use partials to set up slightly different function calls between
 # both possible backends
-def setupDBFuncs(args, min_count, qc_dict):
+def setupDBFuncs(args, qc_dict):
     """Wraps common database access functions from sketchlib and mash,
     to try and make their API more similar
 
     Args:
         args (argparse.opts)
             Parsed command lines options
-        min_count (int)
-            Minimum k-mer count for reads
         qc_dict (dict)
             Table of parameters for QC function
 
diff --git a/PopPUNK/web.py b/PopPUNK/web.py
index 6b013f61..37d70229 100644
--- a/PopPUNK/web.py
+++ b/PopPUNK/web.py
@@ -59,7 +59,7 @@ def sketchAssign():
         os.mkdir(outdir)
 
     qc_dict = {'run_qc': False }
-    dbFuncs = setupDBFuncs(args.assign, args.assign.min_kmer_count, qc_dict)
+    dbFuncs = setupDBFuncs(args.assign, qc_dict)
 
     # assign query to strain
     ClusterResult = assign_query(dbFuncs,
diff --git a/docs/model_fitting.rst b/docs/model_fitting.rst
index 4428d829..fc5f6eb6 100644
--- a/docs/model_fitting.rst
+++ b/docs/model_fitting.rst
@@ -684,6 +684,20 @@ also be shown on the _refined_fit.png plot as dashed gray lines:
 To use one of these for your saved model, rerun, but instead setting
 ``--indiv-refine core`` or ``--indiv-refine accessory``.
 
+.. _multi-boundary:
+
+Running with multiple boundary positions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+To create clusters at equally spaced positions across the refinement range, add
+the ``--multi-boundary <n>`` argument, with the number of positions specified by
+``<n>``. This will create up to ``<n>`` sets of clusters, with boundaries equally spaced
+between the origin and the refined boundary position.
+
+Trivial cluster sets, where every sample is in its own cluster, will be excluded, so
+the final number of clusters may be less than ``<n>``.
+
+For one use of these cluster sets, see the :ref:`poppunk-iterate` section.
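As a usage sketch, mirroring the refine test added to test/run_test.py in this diff, ten boundary positions could be requested with::

    poppunk --fit-model refine --ref-db example_db --output example_refine \
            --neg-shift 0.2 --multi-boundary 10

Cluster assignments for each boundary position ``<i>`` are then written to ``example_refine/example_refine_boundary<i>_clusters.csv``, following the naming used by ``growNetwork`` above.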
+
 threshold
 ---------
 In this mode no model is fitted. You provide the threshold at which within- and
diff --git a/docs/query_assignment.rst b/docs/query_assignment.rst
index 55002244..ca73dce1 100644
--- a/docs/query_assignment.rst
+++ b/docs/query_assignment.rst
@@ -3,12 +3,6 @@ Query assignment
 This is the recommended mode to use PopPUNK, as long as a database is available for
 your species. If there is no database available, you can fit your own (:doc:`model_fitting`).
 
-.. danger::
-    Most reference databases available below are for a previous version of PopPUNK.
-    Please bear with us as we update this, more reference databases which work
-    with version 2 of PopPUNK will be released shortly. You can still fit your
-    own model to your data.
-
 Briefly, `download your reference database <https://poppunk.net/pages/databases.html>`__ and run::
 
     poppunk_assign --db database --query qfile.txt \
        --output poppunk_clusters --threads 8
@@ -21,6 +15,12 @@
 Downloading a database
 ----------------------
 Current PopPUNK databases can be found here: https://poppunk.net/pages/databases.html
 
+.. note::
+    We currently only have three reference databases available for public use.
+    Please bear with us as we update this; more reference databases which work
+    with version 2 of PopPUNK will be released shortly. You can still fit your
+    own model to your data.
+
 We refer to sequences in the database as references, and those being added as queries.
 The clusters assigned by PopPUNK are variable-length-k-mer clusters (VLKCs).
@@ -240,4 +240,4 @@ depending on whether they were a query or reference input.
     each time it is called. If your query set is large and you want repeated
     visualisations, run ``poppunk_assign`` with ``--update-db``.
 
-See :doc:`visualisation` for more details.
\ No newline at end of file
+See :doc:`visualisation` for more details.
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 4bfc4281..e6784bb2 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1 +1,2 @@
-Cython>=0.26.1
\ No newline at end of file
+Cython>=0.26.1
+docutils<0.18
\ No newline at end of file
diff --git a/docs/scripts.rst b/docs/scripts.rst
index 3c3c7d5e..a53be0e8 100644
--- a/docs/scripts.rst
+++ b/docs/scripts.rst
@@ -22,6 +22,38 @@ Previous versions of the software had an ``--easy-run`` mode which would run a p
 
 This is now available as ``poppunk_easy_run.py`` which will chain calls to ``poppunk``
 and ``poppunk_visualise`` to replicate this functionality.
 
+.. _poppunk-iterate:
+
+Iterative PopPUNK
+-----------------
+You can combine the output from multiple boundary positions to produce further analysis. For an easy
+way to create multiple clusters, try the ``--multi-boundary`` option (:ref:`multi-boundary`).
+
+The script to analyse these is ``poppunk_iterate.py``. Basic use is to provide the
+output directory as ``--db``, but run ``--help`` for other common options. This relies on
+finding files named ``<db>/<db>_boundary<n>_clusters.csv``, where ``<n>`` is the boundary
+iteration number (consecutive integers increasing from zero). Clusters must contain at least
+two samples.
+
+This script will do the following:
+
+1. Starting from the most specific clusters (nearest the origin), it will iteratively
+   add new clusters which are either:
+
+   a) totally new clusters
+
+   b) subsets of existing clusters
+
+   c) supersets of existing clusters.
+
+2. Remove duplicate clusters.
+
+3. Calculate average core distance within this cluster set.
+
+4. Create a tree by nesting smaller clusters within larger clusters they are subsets of.
+
+5. Output the combined clusters, average core distances, and tree.
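A usage sketch, following the invocation added to test/run_test.py in this diff::

    poppunk_iterate.py --db example_refine --h5 example_db/example_db

Here ``example_refine`` is a ``--multi-boundary`` output directory and ``example_db/example_db`` is the sketch database prefix. With the default ``--output``, results are written to ``example_refine/example_refine_iterate.clusters.csv`` and ``example_refine/example_refine_iterate.tree.nwk``.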
+
 Adding weights to the network
 -----------------------------
 Converts binary within-cluster edge weights to the Euclidean core-accessory distance.
diff --git a/environment.yml b/environment.yml
index 0431d683..6b5b4e27 100644
--- a/environment.yml
+++ b/environment.yml
@@ -13,6 +13,7 @@ dependencies:
   - scikit-learn >=0.24
   - matplotlib-base
   - dendropy >=4.4.0
+  - treeswift
   - matplotlib
   - hdbscan
   - rapidnj
diff --git a/scripts/poppunk_iterate.py b/scripts/poppunk_iterate.py
new file mode 100644
index 00000000..b2cc21fc
--- /dev/null
+++ b/scripts/poppunk_iterate.py
@@ -0,0 +1,218 @@
+#!/usr/bin/env python
+# vim: set fileencoding=<encoding name> :
+# Copyright 2021 John Lees, Nick Croucher and Bin Zhao
+
+from collections import defaultdict
+import sys
+import os
+import argparse
+import re
+from copy import deepcopy
+import numpy as np
+from treeswift import Tree, Node
+import h5py
+
+import pp_sketchlib
+
+# command line parsing
+def get_options():
+
+    parser = argparse.ArgumentParser(
+        description="Cluster QC and analysis from multi-boundary method", prog="iterate"
+    )
+
+    # input options
+    parser.add_argument(
+        "--db", required=True, help="Output directory with results of --multi-boundary"
+    )
+    parser.add_argument(
+        "--h5",
+        default=None,
+        help="Location of .h5 DB file [default = <--db>/<--db>.h5]",
+    )
+    parser.add_argument(
+        "--output",
+        default=None,
+        help="Prefix for output files [default = <--db>/<--db>_iterate]",
+    )
+    parser.add_argument(
+        "--cpus", default=1, type=int, help="Number of CPUs to use [default = 1]"
+    )
+
+    return parser.parse_args()
+
+
+def read_next_cluster_file(db_prefix):
+    """Iterator over clusters with decreasing resolution
+
+    Input:
+        db_prefix:
+            Prefix of the output directory with results of --multi-boundary
+
+    Returns:
+        all_clusters:
+            dictionary of clusters (keys are cluster ids; values sets of members)
+        no_singletons:
+            all_clusters with singletons removed
+        cluster_idx:
+            The iterated ID of the file read
+    """
+    cluster_idx = 0
+    while True:
+        all_clusters = defaultdict(set)
+        no_singletons = defaultdict(set)
+        cluster_file = db_prefix + "_boundary" + str(cluster_idx) + "_clusters.csv"
+        if os.path.isfile(cluster_file):
+            with open(cluster_file) as f:
+                f.readline()  # skip header
+                for line in f:
+                    name, cluster = line.rstrip().split(",")
+                    all_clusters[int(cluster)].add(name)
+
+            for cluster in all_clusters:
+                if len(all_clusters[cluster]) > 1:
+                    no_singletons[cluster] = all_clusters[cluster]
+            yield (all_clusters, no_singletons, cluster_idx)
+
+            cluster_idx += 1
+        else:
+            break
+
+
+def is_nested(cluster_dict, child_members, node_list):
+    """Check if a cluster is nested within another cluster that has
+    already been added to the tree
+
+    Input:
+        cluster_dict:
+            Dictionary of clusters (keys are cluster ids; values sets of members)
+        child_members:
+            Set of members of the child cluster
+        node_list:
+            List of cluster IDs already in the tree
+
+    Returns:
+        node:
+            The node in the tree that contains the cluster
+    """
+    parent = None
+    for node in node_list:
+        if child_members.issubset(cluster_dict[node]) and \
+           (parent == None or len(cluster_dict[node]) < len(cluster_dict[parent])):
+            parent = node
+    return parent
+
+
+# main code
+if __name__ == "__main__":
+
+    # Check input ok
+    args = get_options()
+    if args.output is None:
+        args.output = args.db + "/" + args.db + "_iterate"
+    if args.h5 is None:
+        args.h5 = args.db + "/" + args.db
+    else:
+        # Remove the .h5 suffix if present
+        h5_prefix = re.match(r"^(.+)\.h5$", args.h5)
+        if h5_prefix != None:
+            args.h5 = h5_prefix.group(1)
+
+    # Set up reading clusters
+    db_name = args.db + "/" + os.path.basename(args.db)
+    cluster_it = read_next_cluster_file(db_name)
+    all_clusters, iterated_clusters, first_idx = next(cluster_it)
+    all_samples = set()
+    for cluster_samples in all_clusters.values():
+        all_samples.update(cluster_samples)
+    cluster_idx = max(iterated_clusters.keys())
+
+    # Run cluster QC
+    for (all_clusters, no_singletons, refine_idx) in cluster_it:
+        for new_cluster in no_singletons.values():
+            valid = True
+            for old_cluster in iterated_clusters.values():
+                if new_cluster == old_cluster or not (
+                    new_cluster.issubset(old_cluster)
+                    or old_cluster.issubset(new_cluster)
+                    or len(new_cluster.intersection(old_cluster)) == 0
+                ):
+                    valid = False
+                    break
+            if valid:
+                cluster_idx += 1
+                iterated_clusters[cluster_idx] = new_cluster
+    sorted_clusters = sorted(
+        iterated_clusters, key=lambda k: len(iterated_clusters[k]), reverse=True
+    )
+
+    # Calculate core distances
+    # Set up sketchlib args
+    ref_db_handle = h5py.File(args.h5 + ".h5", "r")
+    first_sample_name = list(ref_db_handle["sketches"].keys())[0]
+    kmers = ref_db_handle["sketches/" + first_sample_name].attrs["kmers"]
+    kmers = np.asarray(sorted(kmers))
+    random_correct = True
+    jaccard = False
+    use_gpu = False
+    deviceid = 0
+
+    # Run a query for each cluster
+    pi_values = {}
+    for cluster in sorted_clusters:
+        rNames = list(iterated_clusters[cluster])
+        distMat = pp_sketchlib.queryDatabase(
+            args.h5,
+            args.h5,
+            rNames,
+            rNames,
+            kmers,
+            random_correct,
+            jaccard,
+            args.cpus,
+            use_gpu,
+            deviceid,
+        )
+        pi_values[cluster] = np.mean(distMat[:, 0])
+
+    # Nest the clusters
+    tree = Tree()
+    root_node = Node(label="root")
+    tree.root = root_node
+
+    tree_clusters = deepcopy(iterated_clusters)
+    node_list = {"root": root_node}
+    tree_clusters["root"] = all_samples
+    for cluster in sorted_clusters:
+        new_node = Node(label="cluster" + str(cluster))
+        parent_cluster = is_nested(
+            tree_clusters, tree_clusters[cluster], node_list.keys()
+        )
+        if parent_cluster:
+            node_list[parent_cluster].add_child(new_node)
+            # Remove nested samples from the parent
+            tree_clusters[parent_cluster] -= tree_clusters[cluster]
+        node_list[cluster] = new_node
+
+    # Add all the samples to the tree by looking through the list where children
+    # have been removed
+    for cluster in tree_clusters:
+        for sample in tree_clusters[cluster]:
+            node_list[cluster].add_child(Node(label=sample))
+
+    # Write output
+    tree.write_tree_newick(args.output + ".tree.nwk", hide_rooted_prefix=True)
+    with open(args.output + ".clusters.csv", "w") as f:
+        f.write("Cluster,Avg_Pi,Taxa\n")
+        for cluster in sorted_clusters:
+            f.write(
+                str(cluster)
+                + ","
+                + str(pi_values[cluster])
+                + ","
+                + ";".join(iterated_clusters[cluster])
+                + "\n"
+            )
+
+    sys.exit(0)
diff --git a/setup.py b/setup.py
index ca76070a..f74b973b 100644
--- a/setup.py
+++ b/setup.py
@@ -122,6 +122,7 @@ def build_extension(self, ext):
                'scripts/poppunk_calculate_silhouette.py',
                'scripts/poppunk_batch_mst.py',
                'scripts/poppunk_extract_distances.py',
+               'scripts/poppunk_iterate.py',
                'scripts/poppunk_add_weights.py',
                'scripts/poppunk_easy_run.py',
                'scripts/poppunk_pickle_fix.py'],
diff --git a/test/run_test.py b/test/run_test.py
index 4f5438ab..c0bc35bf 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -48,6 +48,10 @@
 subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --score-idx 2", shell=True, check=True)
 subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold", shell=True, check=True)
example_threshold", shell=True, check=True) +sys.stderr.write("Running multi boundary refinement (--multi-boundary and poppunk_iterate.py)\n") +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --multi-boundary 10", shell=True, check=True) +subprocess.run(python_cmd + " ../scripts/poppunk_iterate.py --db example_refine --h5 example_db/example_db", shell=True, check=True) + # lineage clustering sys.stderr.write("Running lineage clustering test (--fit-model lineage)\n") subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --output example_lineages --ranks 1,2,3,5 --ref-db example_db --overwrite", shell=True, check=True) @@ -105,8 +109,8 @@ subprocess.run(python_cmd + " ../poppunk_assign-runner.py --citation --query some_queries.txt --db example_db --output example_query", shell=True, check=True) # web API -sys.stderr.write("Running API tests\n") -subprocess.run(python_cmd + " test-web.py", shell=True, check=True) +# sys.stderr.write("Running API tests\n") +# subprocess.run(python_cmd + " test-web.py", shell=True, check=True) sys.stderr.write("Tests completed\n") diff --git a/test/test-web.py b/test/test-web.py index a12ba280..1b2f57ee 100644 --- a/test/test-web.py +++ b/test/test-web.py @@ -27,7 +27,7 @@ def main(): args = default_options(species_db) qc_dict = {'run_qc': False } print("Weights: " + str(args.assign.graph_weights)) - dbFuncs = setupDBFuncs(args.assign, args.assign.min_kmer_count, qc_dict) + dbFuncs = setupDBFuncs(args.assign, qc_dict) ClusterResult = assign_query(dbFuncs, args.assign.ref_db, args.assign.q_files,