Skip to content

Commit

Permalink
Merge pull request #180 from johnlees/enhanced
Browse files Browse the repository at this point in the history
Multi-boundary method
  • Loading branch information
johnlees authored Nov 10, 2021
2 parents 1b4c788 + f065207 commit 9067f4c
Show file tree
Hide file tree
Showing 17 changed files with 418 additions and 63 deletions.
2 changes: 1 addition & 1 deletion PopPUNK/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.4.4'
__version__ = '2.4.5'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 1
Expand Down
24 changes: 11 additions & 13 deletions PopPUNK/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,10 @@
import subprocess
from collections import defaultdict

# Try to import sketchlib
try:
import pp_sketchlib
except ImportError as e:
sys.stderr.write("Sketchlib backend not available\n")
sys.exit(1)
import h5py

import pp_sketchlib
import poppunk_refine
import h5py

# import poppunk package
from .__init__ import __version__
Expand Down Expand Up @@ -127,7 +122,7 @@ def get_options():
refinementGroup.add_argument('--manual-start', help='A file containing information for a start point. '
'See documentation for help.', default=None)
refinementGroup.add_argument('--model-dir', help='Directory containing model to use for assigning queries '
'to clusters [default = reference database directory]', type = str)
'to clusters [default = reference database directory]', type = str)
refinementGroup.add_argument('--score-idx',
help='Index of score to use [default = 0]',
type=int, default = 0, choices=[0, 1, 2])
Expand All @@ -138,6 +133,10 @@ def get_options():
refineMode.add_argument('--unconstrained',
help='Optimise both boundary gradient and intercept',
default=False, action='store_true')
refineMode.add_argument('--multi-boundary',
help='Produce multiple sets of clusters at different boundary positions. This argument sets the '
'number of boundary positions between n-1 clusters and the refine optimum.',
type=int, default=0)
refineMode.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually',
choices=['both', 'core', 'accessory'], default=None)

Expand Down Expand Up @@ -204,15 +203,14 @@ def main():
sys.exit(0)

# Imports are here because graph tool is very slow to load
from .models import loadClusterFit, ClusterFit, BGMMFit, DBSCANFit, RefineFit, LineageFit
from .models import loadClusterFit, BGMMFit, DBSCANFit, RefineFit, LineageFit
from .sketchlib import checkSketchlibLibrary
from .sketchlib import removeFromDB

from .network import construct_network_from_edge_list
from .network import construct_network_from_assignments
from .network import extractReferences
from .network import printClusters
from .network import get_vertex_list
from .network import save_network
from .network import checkNetworkVertexCount

Expand All @@ -223,7 +221,6 @@ def main():

from .utils import setGtThreads
from .utils import setupDBFuncs
from .utils import storePickle
from .utils import readPickle
from .utils import qcDistMat
from .utils import createOverallLineage
Expand Down Expand Up @@ -255,7 +252,7 @@ def main():
}

# Dict of DB access functions
dbFuncs = setupDBFuncs(args, args.min_kmer_count, qc_dict)
dbFuncs = setupDBFuncs(args, qc_dict)
createDatabaseDir = dbFuncs['createDatabaseDir']
constructDatabase = dbFuncs['constructDatabase']
queryDatabase = dbFuncs['queryDatabase']
Expand Down Expand Up @@ -419,6 +416,7 @@ def main():
args.manual_start,
args.indiv_refine,
args.unconstrained,
args.multi_boundary,
args.score_idx,
args.no_local,
args.betweenness_sample,
Expand All @@ -444,7 +442,7 @@ def main():

# save model
model.save()

# plot model
if not args.no_plot:
model.plot(distMat, assignments)
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ def main():
}

# Dict of DB access functions for assign_query (which is out of scope)
dbFuncs = setupDBFuncs(args, args.min_kmer_count, qc_dict)
dbFuncs = setupDBFuncs(args, qc_dict)

# run according to mode
sys.stderr.write("PopPUNK: assign\n")
Expand Down
30 changes: 22 additions & 8 deletions PopPUNK/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
except ImportError as e:
gpu_lib = False

# GPU support
import pp_sketchlib
import poppunk_refine

Expand All @@ -68,8 +67,7 @@
from .plot import plot_dbscan_results

# refine
from .refine import refineFit
from .refine import likelihoodBoundary
from .refine import refineFit, multi_refine
from .refine import readManualStart
from .plot import plot_refined_results

Expand Down Expand Up @@ -710,7 +708,7 @@ def __init__(self, outPrefix):
self.unconstrained = False

def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False,
unconstrained = False, score_idx = 0, no_local = False,
unconstrained = False, multi_boundary = 0, score_idx = 0, no_local = False,
betweenness_sample = betweenness_sample_default, use_gpu = False):
'''Extends :func:`~ClusterFit.fit`
Expand Down Expand Up @@ -738,6 +736,10 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
indiv_refine (str)
Run refinement for core or accessory distances separately
(default = None).
multi_boundary (int)
Produce cluster output at multiple boundary positions downward
from the optimum.
(default = 0).
unconstrained (bool)
If True, search in 2D and change the slope of the boundary
score_idx (int)
Expand Down Expand Up @@ -781,7 +783,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
raise RuntimeError("Unrecognised model type")

# Main refinement in 2D
self.optimal_x, self.optimal_y = \
self.optimal_x, self.optimal_y, optimal_s = \
refineFit(X/self.scale,
sample_names,
self.mean0,
Expand All @@ -798,6 +800,18 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
use_gpu = use_gpu)
self.fitted = True

# Output clusters at more positions if requested
if multi_boundary > 1:
multi_refine(X/self.scale,
sample_names,
self.mean0,
self.mean1,
optimal_s,
multi_boundary,
self.outPrefix,
num_processes = self.threads,
use_gpu = use_gpu)

# Try and do a 1D refinement for both core and accessory
self.core_boundary = self.optimal_x
self.accessory_boundary = self.optimal_y
Expand All @@ -807,7 +821,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
if indiv_refine == 'both' or indiv_refine == dist_type:
sys.stderr.write("Refining " + dist_type + " distances separately\n")
# optimise core distance boundary
core_boundary, accessory_boundary = \
core_boundary, accessory_boundary, s = \
refineFit(X/self.scale,
sample_names,
self.mean0,
Expand All @@ -822,9 +836,9 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
betweenness_sample = betweenness_sample,
use_gpu = use_gpu)
if dist_type == "core":
self.core_boundary = core_boundary
self.core_boundary = core_boundary
if dist_type == "accessory":
self.accessory_boundary = accessory_boundary
self.accessory_boundary = accessory_boundary
self.indiv_fitted = True
except RuntimeError as e:
print(e)
Expand Down
37 changes: 19 additions & 18 deletions PopPUNK/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import pickle
import graph_tool.all as gt
import dendropy
import poppunk_refine

# Load GPU libraries
try:
Expand All @@ -36,6 +35,8 @@
except ImportError as e:
gpu_lib = False

import poppunk_refine

from .__main__ import accepted_weights_types
from .__main__ import betweenness_sample_default

Expand Down Expand Up @@ -640,7 +641,7 @@ def process_weights(distMat, weights_type):
else:
sys.stderr.write('Require distance matrix to calculate distances\n')
return processed_weights

def process_previous_network(previous_network = None, adding_qq_dists = False, old_ids = None,
previous_pkl = None, vertex_labels = None, weights = False, use_gpu = False):
"""Extract edge types from an existing network
Expand Down Expand Up @@ -696,7 +697,7 @@ def process_previous_network(previous_network = None, adding_qq_dists = False, o
else:
sys.stderr.write('A distance pkl corresponding to ' + previous_pkl + ' is required for loading\n')
sys.exit(1)

return extra_sources, extra_targets, extra_weights

def construct_network_from_edge_list(rlist,
Expand Down Expand Up @@ -751,10 +752,10 @@ def construct_network_from_edge_list(rlist,
G (graph)
The resulting network
"""

# Check GPU library use
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

# data structures
vertex_labels, self_comparison = initial_graph_properties(rlist, qlist)

Expand Down Expand Up @@ -872,10 +873,10 @@ def construct_network_from_df(rlist,
G (graph)
The resulting network
"""

# Check GPU library use
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

# data structures
vertex_labels, self_comparison = initial_graph_properties(rlist, qlist)

Expand Down Expand Up @@ -976,10 +977,10 @@ def construct_network_from_sparse_matrix(rlist,
G (graph)
The resulting network
"""

# Check GPU library use
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

if use_gpu:
G_df = cudf.DataFrame()
else:
Expand Down Expand Up @@ -1058,7 +1059,7 @@ def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gp
# Could alternatively assign weights through eweight.a = weights
G.add_edge_list(weighted_edges, eprops = [eweight])
G.edge_properties["weight"] = eweight

return G


Expand Down Expand Up @@ -1114,7 +1115,7 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label =
G (graph)
The resulting network
"""

# Check GPU library use
use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True)

Expand Down Expand Up @@ -1157,7 +1158,7 @@ def get_cugraph_triangles(G):
Args:
G (cugraph network)
Network to be analysed
Returns:
triangle_count (int)
Count of triangles in graph
Expand Down Expand Up @@ -1308,7 +1309,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers,
"""
# initalise functions
queryDatabase = dbFuncs['queryDatabase']

# do not calculate weights unless specified
if weights is None:
distance_type = None
Expand Down Expand Up @@ -1391,7 +1392,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers,
self = True,
number_plot_fits = 0,
threads = threads)

if distance_type == 'core':
queryAssignation = model.assign(qqDistMat, slope = 0)
elif distance_type == 'accessory':
Expand All @@ -1415,7 +1416,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers,
else:
edge_tuple = (query_indices[query1], query_indices[query2])
new_edges.append(edge_tuple)

G = construct_network_from_assignments(qList,
qList,
queryAssignation,
Expand Down Expand Up @@ -1763,11 +1764,11 @@ def generate_minimum_spanning_tree(G, from_cugraph = False):
out_degrees = component.get_out_degrees(component_vertices)
seed_vertex = list(component_vertices[np.where(out_degrees == np.amax(out_degrees))])
seed_vertices.add(seed_vertex[0]) # Can only add one otherwise not MST


# If multiple components, add distances between seed nodes
if num_components > 1:

# Extract edges and maximum edge length - as DF for cugraph
# list of tuples for graph-tool
if from_cugraph:
Expand Down Expand Up @@ -1881,7 +1882,7 @@ def cugraph_to_graph_tool(G, rlist):
Cugraph network
rlist (list)
List of sequence names
Returns:
G (graph-tool network)
Graph tool network
Expand Down
Loading

0 comments on commit 9067f4c

Please sign in to comment.