Merge pull request #195 from johnlees/master

Update parsing fixes branch with new master
bacpop · Jan 21, 2022 · fc4fcf5 · fc4fcf5
2 parents e553ec6 + 46aff5d
commit fc4fcf5
Show file tree

Hide file tree

Showing 18 changed files with 408 additions and 53 deletions.
diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py
@@ -3,7 +3,7 @@
 
 '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''
 
-__version__ = '2.4.4'
+__version__ = '2.4.5'
 
 # Minimum sketchlib version
 SKETCHLIB_MAJOR = 1

diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
@@ -10,15 +10,10 @@
 import subprocess
 from collections import defaultdict
 
-# Try to import sketchlib
-try:
-    import pp_sketchlib
-except ImportError as e:
-    sys.stderr.write("Sketchlib backend not available\n")
-    sys.exit(1)
+import h5py
 
+import pp_sketchlib
 import poppunk_refine
-import h5py
 
 # import poppunk package
 from .__init__ import __version__
@@ -127,7 +122,7 @@ def get_options():
     refinementGroup.add_argument('--manual-start', help='A file containing information for a start point. '
             'See documentation for help.', default=None)
     refinementGroup.add_argument('--model-dir', help='Directory containing model to use for assigning queries '
-                                                   'to clusters [default = reference database directory]', type = str)
+                                                     'to clusters [default = reference database directory]', type = str)
     refinementGroup.add_argument('--score-idx',
             help='Index of score to use [default = 0]',
             type=int, default = 0, choices=[0, 1, 2])
@@ -138,6 +133,10 @@ def get_options():
     refineMode.add_argument('--unconstrained',
             help='Optimise both boundary gradient and intercept',
             default=False, action='store_true')
+    refineMode.add_argument('--multi-boundary',
+            help='Produce multiple sets of clusters at different boundary positions. This argument sets the'
+                 'number of boundary positions between n-1 clusters and the refine optimum.',
+            type=int, default=0)
     refineMode.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually',
             choices=['both', 'core', 'accessory'], default=None)
 
@@ -218,15 +217,14 @@ def main():
         sys.exit(0)
 
     # Imports are here because graph tool is very slow to load
-    from .models import loadClusterFit, ClusterFit, BGMMFit, DBSCANFit, RefineFit, LineageFit
+    from .models import loadClusterFit, BGMMFit, DBSCANFit, RefineFit, LineageFit
     from .sketchlib import checkSketchlibLibrary
     from .sketchlib import removeFromDB
 
     from .network import construct_network_from_edge_list
     from .network import construct_network_from_assignments
     from .network import extractReferences
     from .network import printClusters
-    from .network import get_vertex_list
     from .network import save_network
     from .network import checkNetworkVertexCount
 
@@ -237,7 +235,6 @@ def main():
 
     from .utils import setGtThreads
     from .utils import setupDBFuncs
-    from .utils import storePickle
     from .utils import readPickle
     from .utils import qcDistMat
     from .utils import createOverallLineage
@@ -269,7 +266,7 @@ def main():
     }
 
     # Dict of DB access functions
-    dbFuncs = setupDBFuncs(args, args.min_kmer_count, qc_dict)
+    dbFuncs = setupDBFuncs(args, qc_dict)
     createDatabaseDir = dbFuncs['createDatabaseDir']
     constructDatabase = dbFuncs['constructDatabase']
     queryDatabase = dbFuncs['queryDatabase']
@@ -434,6 +431,7 @@ def main():
                                             args.manual_start,
                                             args.indiv_refine,
                                             args.unconstrained,
+                                            args.multi_boundary,
                                             args.score_idx,
                                             args.no_local,
                                             args.betweenness_sample,
@@ -464,7 +462,7 @@ def main():
 
             # save model
             model.save()
-            
+
             # plot model
             if not args.no_plot:
                 model.plot(distMat, assignments)

diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
@@ -564,7 +564,7 @@ def main():
         }
 
     # Dict of DB access functions for assign_query (which is out of scope)
-    dbFuncs = setupDBFuncs(args, args.min_kmer_count, qc_dict)
+    dbFuncs = setupDBFuncs(args, qc_dict)
 
     # run according to mode
     sys.stderr.write("PopPUNK: assign\n")

diff --git a/PopPUNK/models.py b/PopPUNK/models.py
@@ -45,7 +45,6 @@
 except ImportError as e:
     gpu_lib = False
 
-# GPU support
 import pp_sketchlib
 import poppunk_refine
 
@@ -68,8 +67,7 @@
 from .plot import plot_dbscan_results
 
 # refine
-from .refine import refineFit
-from .refine import likelihoodBoundary
+from .refine import refineFit, multi_refine
 from .refine import readManualStart
 from .plot import plot_refined_results
 
@@ -713,7 +711,7 @@ def __init__(self, outPrefix):
         self.unconstrained = False
 
     def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False,
-            unconstrained = False, score_idx = 0, no_local = False,
+            unconstrained = False, multi_boundary = 0, score_idx = 0, no_local = False,
             betweenness_sample = betweenness_sample_default, use_gpu = False):
         '''Extends :func:`~ClusterFit.fit`
 
@@ -741,6 +739,10 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
             indiv_refine (str)
                 Run refinement for core or accessory distances separately
                 (default = None).
+            multi_boundary (int)
+                Produce cluster output at multiple boundary positions downward
+                from the optimum.
+                (default = 0).
             unconstrained (bool)
                 If True, search in 2D and change the slope of the boundary
             score_idx (int)
@@ -784,7 +786,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
             raise RuntimeError("Unrecognised model type")
 
         # Main refinement in 2D
-        self.optimal_x, self.optimal_y = \
+        self.optimal_x, self.optimal_y, optimal_s = \
           refineFit(X/self.scale,
                     sample_names,
                     self.mean0,
@@ -801,6 +803,18 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
                     use_gpu = use_gpu)
         self.fitted = True
 
+        # Output clusters at more positions if requested
+        if multi_boundary > 1:
+            multi_refine(X/self.scale,
+                        sample_names,
+                        self.mean0,
+                        self.mean1,
+                        optimal_s,
+                        multi_boundary,
+                        self.outPrefix,
+                        num_processes = self.threads,
+                        use_gpu = use_gpu)
+
         # Try and do a 1D refinement for both core and accessory
         self.core_boundary = self.optimal_x
         self.accessory_boundary = self.optimal_y
@@ -810,7 +824,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
                     if indiv_refine == 'both' or indiv_refine == dist_type:
                         sys.stderr.write("Refining " + dist_type + " distances separately\n")
                         # optimise core distance boundary
-                        core_boundary, accessory_boundary = \
+                        core_boundary, accessory_boundary, s = \
                           refineFit(X/self.scale,
                                     sample_names,
                                     self.mean0,
@@ -825,9 +839,9 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
                                     betweenness_sample = betweenness_sample,
                                     use_gpu = use_gpu)
                         if dist_type == "core":
-                          self.core_boundary = core_boundary
+                            self.core_boundary = core_boundary
                         if dist_type == "accessory":
-                          self.accessory_boundary = accessory_boundary
+                            self.accessory_boundary = accessory_boundary
                 self.indiv_fitted = True
             except RuntimeError as e:
                 print(e)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
@@ -22,7 +22,6 @@
 import pickle
 import graph_tool.all as gt
 import dendropy
-import poppunk_refine
 
 # Load GPU libraries
 try:
@@ -36,6 +35,8 @@
 except ImportError as e:
     gpu_lib = False
 
+import poppunk_refine
+
 from .__main__ import accepted_weights_types
 from .__main__ import betweenness_sample_default
 

diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py
@@ -20,8 +20,6 @@
 except ImportError as e:
     sys.stderr.write("This version of PopPUNK requires python v3.8 or higher\n")
     sys.exit(0)
-import pp_sketchlib
-import poppunk_refine
 import graph_tool.all as gt
 import pandas as pd
 
@@ -37,9 +35,11 @@
 except ImportError as e:
     gpu_lib = False
 
+import poppunk_refine
+
 from .__main__ import betweenness_sample_default
 
-from .network import construct_network_from_df
+from .network import construct_network_from_df, printClusters
 from .network import construct_network_from_edge_list
 from .network import networkSummary
 from .network import add_self_loop
@@ -96,6 +96,8 @@ def refineFit(distMat, sample_names, mean0, mean1, scale,
             x-coordinate of refined fit
         optimal_y (float)
             y-coordinate of refined fit
+        optimised_s (float)
+            Position along search range of refined fit
     """
     # Optimize boundary - grid search for global minimum
     sys.stderr.write("Trying to optimise score globally\n")
@@ -165,6 +167,7 @@ def refineFit(distMat, sample_names, mean0, mean1, scale,
         min_idx = np.argmin(global_s)
         optimal_x = x_max[min_idx % global_grid_resolution]
         optimal_y = y_max[min_idx // global_grid_resolution]
+        optimised_s = global_s[min_idx]
 
         if not (optimal_x > x_max_start and optimal_x < x_max_end and \
                 optimal_y > y_max_start and optimal_y < y_max_end):
@@ -249,7 +252,62 @@ def refineFit(distMat, sample_names, mean0, mean1, scale,
     if optimal_x < 0 or optimal_y < 0:
         raise RuntimeError("Optimisation failed: produced a boundary outside of allowed range\n")
 
-    return optimal_x, optimal_y
+    return optimal_x, optimal_y, optimised_s
+
+def multi_refine(distMat, sample_names, mean0, mean1, s_max,
+                 n_boundary_points, output_prefix,
+                 num_processes = 1, use_gpu = False):
+    """Move the refinement boundary between the optimum and where it meets an
+    axis. Discrete steps, output the clusers at each step
+
+    Args:
+        distMat (numpy.array)
+            n x 2 array of core and accessory distances for n samples
+        sample_names (list)
+            List of query sequence labels
+        mean0 (numpy.array)
+            Start point to define search line
+        mean1 (numpy.array)
+            End point to define search line
+        s_max (float)
+            The optimal s position from refinement (:func:`~PopPUNK.refine.refineFit`)
+        n_boundary_points (int)
+            Number of positions to try drawing the boundary at
+        num_processes (int)
+            Number of threads to use in the global optimisation step.
+            (default = 1)
+        use_gpu (bool)
+            Whether to use cugraph for graph analyses
+    """
+    # load CUDA libraries
+    use_gpu = check_and_set_gpu(use_gpu, gpu_lib)
+
+    # Set the range
+    # Between optimised s and where line meets an axis
+    gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0])
+    # Equation of normal passing through origin y = -1/m * x
+    # Where this meets line y - y1 = m(x - x1) is at:
+    x = (gradient * mean0[0] - mean0[1]) / (gradient + 1 / gradient)
+    y = mean0[1] + gradient * (x - mean0[0])
+
+    s_min = -((mean0[0] - x)**2 + (mean0[1] - y)**2)**0.5
+    s_range = np.linspace(s_min, s_max, num = n_boundary_points)[1:]
+
+    i_vec, j_vec, idx_vec = \
+        poppunk_refine.thresholdIterate1D(distMat, s_range, 2,
+                                          mean0[0], mean0[1],
+                                          mean1[0], mean1[1],
+                                          num_processes)
+
+    growNetwork(sample_names,
+                i_vec,
+                j_vec,
+                idx_vec,
+                s_range,
+                0,
+                write_clusters = output_prefix,
+                use_gpu = use_gpu)
+
 
 def expand_cugraph_network(G, G_extra_df):
     """Reconstruct a cugraph network with additional edges.
@@ -276,9 +334,9 @@ def expand_cugraph_network(G, G_extra_df):
     G = add_self_loop(G_df, G_vertex_count, weights = False, renumber = False)
     return G
 
-def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
+def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx = 0,
                 thread_idx = 0, betweenness_sample = betweenness_sample_default,
-                use_gpu = False):
+                write_clusters = None, use_gpu = False):
     """Construct a network, then add edges to it iteratively.
     Input is from ``pp_sketchlib.iterateBoundary1D`` or``pp_sketchlib.iterateBoundary2D``
 
@@ -302,6 +360,9 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
         betweenness_sample (int)
             Number of sequences per component used to estimate betweenness using
             a GPU. Smaller numbers are faster but less precise [default = 100]
+        write_clusters (str)
+            Set to a prefix to write the clusters from each position to files
+            [default = None]
         use_gpu (bool)
             Whether to use cugraph for graph analyses
 
@@ -310,7 +371,6 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
             -1 * network score for each of x_range.
             Where network score is from :func:`~PopPUNK.network.networkSummary`
     """
-
     scores = []
     prev_idx = -1
 
@@ -338,9 +398,9 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
             # At first offset, make a new network, otherwise just add the new edges
             if prev_idx == -1:
                 G = construct_network_from_df(sample_names, sample_names,
-                                                 edge_df,
-                                                 summarise = False,
-                                                 use_gpu = use_gpu)
+                                              edge_df,
+                                              summarise = False,
+                                              use_gpu = use_gpu)
             else:
                 if use_gpu:
                     G = expand_cugraph_network(G, edge_df)
@@ -349,13 +409,26 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
                     G.add_edge_list(edge_list)
                     edge_list = []
             # Add score into vector for any offsets passed (should usually just be one)
-            latest_score = -networkSummary(G,
+            G_summary = networkSummary(G,
                                 score_idx > 0,
                                 betweenness_sample = betweenness_sample,
-                                use_gpu = use_gpu)[1][score_idx]
+                                use_gpu = use_gpu)
+            latest_score = -G_summary[1][score_idx]
             for s in range(prev_idx, idx):
                 scores.append(latest_score)
                 pbar.update(1)
+                # Write the cluster output as long as there is at least one
+                # non-trivial cluster
+                if write_clusters and G_summary[0][0] < len(sample_names):
+                    o_prefix = write_clusters + "/" + \
+                        os.path.basename(write_clusters) + \
+                        "_boundary" + str(s)
+                    printClusters(G,
+                                  sample_names,
+                                  outPrefix=o_prefix,
+                                  write_unwords=False,
+                                  use_gpu=use_gpu)
+
             prev_idx = idx
 
     return(scores)

diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py
@@ -584,7 +584,7 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num
         query_db = queryPrefix + "/" + os.path.basename(queryPrefix)
         distMat = pp_sketchlib.queryDatabase(ref_db, query_db, rNames, qNames, klist,
                                              True, False, threads, use_gpu, deviceid)
-                                             
+
         # option to plot core/accessory fits. Choose a random number from cmd line option
         if number_plot_fits > 0:
             jacobian = -np.hstack((np.ones((klist.shape[0], 1)), klist.reshape(-1, 1)))