Skip to content

Commit

Permalink
Merge pull request #195 from johnlees/master
Browse files Browse the repository at this point in the history
Update parsing fixes branch with new master
  • Loading branch information
nickjcroucher authored Jan 21, 2022
2 parents e553ec6 + 46aff5d commit fc4fcf5
Show file tree
Hide file tree
Showing 18 changed files with 408 additions and 53 deletions.
2 changes: 1 addition & 1 deletion PopPUNK/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.4.4'
__version__ = '2.4.5'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 1
Expand Down
24 changes: 11 additions & 13 deletions PopPUNK/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,10 @@
import subprocess
from collections import defaultdict

# Try to import sketchlib
try:
import pp_sketchlib
except ImportError as e:
sys.stderr.write("Sketchlib backend not available\n")
sys.exit(1)
import h5py

import pp_sketchlib
import poppunk_refine
import h5py

# import poppunk package
from .__init__ import __version__
Expand Down Expand Up @@ -127,7 +122,7 @@ def get_options():
refinementGroup.add_argument('--manual-start', help='A file containing information for a start point. '
'See documentation for help.', default=None)
refinementGroup.add_argument('--model-dir', help='Directory containing model to use for assigning queries '
'to clusters [default = reference database directory]', type = str)
'to clusters [default = reference database directory]', type = str)
refinementGroup.add_argument('--score-idx',
help='Index of score to use [default = 0]',
type=int, default = 0, choices=[0, 1, 2])
Expand All @@ -138,6 +133,10 @@ def get_options():
refineMode.add_argument('--unconstrained',
help='Optimise both boundary gradient and intercept',
default=False, action='store_true')
refineMode.add_argument('--multi-boundary',
help='Produce multiple sets of clusters at different boundary positions. This argument sets the'
'number of boundary positions between n-1 clusters and the refine optimum.',
type=int, default=0)
refineMode.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually',
choices=['both', 'core', 'accessory'], default=None)

Expand Down Expand Up @@ -218,15 +217,14 @@ def main():
sys.exit(0)

# Imports are here because graph tool is very slow to load
from .models import loadClusterFit, ClusterFit, BGMMFit, DBSCANFit, RefineFit, LineageFit
from .models import loadClusterFit, BGMMFit, DBSCANFit, RefineFit, LineageFit
from .sketchlib import checkSketchlibLibrary
from .sketchlib import removeFromDB

from .network import construct_network_from_edge_list
from .network import construct_network_from_assignments
from .network import extractReferences
from .network import printClusters
from .network import get_vertex_list
from .network import save_network
from .network import checkNetworkVertexCount

Expand All @@ -237,7 +235,6 @@ def main():

from .utils import setGtThreads
from .utils import setupDBFuncs
from .utils import storePickle
from .utils import readPickle
from .utils import qcDistMat
from .utils import createOverallLineage
Expand Down Expand Up @@ -269,7 +266,7 @@ def main():
}

# Dict of DB access functions
dbFuncs = setupDBFuncs(args, args.min_kmer_count, qc_dict)
dbFuncs = setupDBFuncs(args, qc_dict)
createDatabaseDir = dbFuncs['createDatabaseDir']
constructDatabase = dbFuncs['constructDatabase']
queryDatabase = dbFuncs['queryDatabase']
Expand Down Expand Up @@ -434,6 +431,7 @@ def main():
args.manual_start,
args.indiv_refine,
args.unconstrained,
args.multi_boundary,
args.score_idx,
args.no_local,
args.betweenness_sample,
Expand Down Expand Up @@ -464,7 +462,7 @@ def main():

# save model
model.save()

# plot model
if not args.no_plot:
model.plot(distMat, assignments)
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ def main():
}

# Dict of DB access functions for assign_query (which is out of scope)
dbFuncs = setupDBFuncs(args, args.min_kmer_count, qc_dict)
dbFuncs = setupDBFuncs(args, qc_dict)

# run according to mode
sys.stderr.write("PopPUNK: assign\n")
Expand Down
30 changes: 22 additions & 8 deletions PopPUNK/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
except ImportError as e:
gpu_lib = False

# GPU support
import pp_sketchlib
import poppunk_refine

Expand All @@ -68,8 +67,7 @@
from .plot import plot_dbscan_results

# refine
from .refine import refineFit
from .refine import likelihoodBoundary
from .refine import refineFit, multi_refine
from .refine import readManualStart
from .plot import plot_refined_results

Expand Down Expand Up @@ -713,7 +711,7 @@ def __init__(self, outPrefix):
self.unconstrained = False

def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False,
unconstrained = False, score_idx = 0, no_local = False,
unconstrained = False, multi_boundary = 0, score_idx = 0, no_local = False,
betweenness_sample = betweenness_sample_default, use_gpu = False):
'''Extends :func:`~ClusterFit.fit`
Expand Down Expand Up @@ -741,6 +739,10 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
indiv_refine (str)
Run refinement for core or accessory distances separately
(default = None).
multi_boundary (int)
Produce cluster output at multiple boundary positions downward
from the optimum.
(default = 0).
unconstrained (bool)
If True, search in 2D and change the slope of the boundary
score_idx (int)
Expand Down Expand Up @@ -784,7 +786,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
raise RuntimeError("Unrecognised model type")

# Main refinement in 2D
self.optimal_x, self.optimal_y = \
self.optimal_x, self.optimal_y, optimal_s = \
refineFit(X/self.scale,
sample_names,
self.mean0,
Expand All @@ -801,6 +803,18 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
use_gpu = use_gpu)
self.fitted = True

# Output clusters at more positions if requested
if multi_boundary > 1:
multi_refine(X/self.scale,
sample_names,
self.mean0,
self.mean1,
optimal_s,
multi_boundary,
self.outPrefix,
num_processes = self.threads,
use_gpu = use_gpu)

# Try and do a 1D refinement for both core and accessory
self.core_boundary = self.optimal_x
self.accessory_boundary = self.optimal_y
Expand All @@ -810,7 +824,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
if indiv_refine == 'both' or indiv_refine == dist_type:
sys.stderr.write("Refining " + dist_type + " distances separately\n")
# optimise core distance boundary
core_boundary, accessory_boundary = \
core_boundary, accessory_boundary, s = \
refineFit(X/self.scale,
sample_names,
self.mean0,
Expand All @@ -825,9 +839,9 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
betweenness_sample = betweenness_sample,
use_gpu = use_gpu)
if dist_type == "core":
self.core_boundary = core_boundary
self.core_boundary = core_boundary
if dist_type == "accessory":
self.accessory_boundary = accessory_boundary
self.accessory_boundary = accessory_boundary
self.indiv_fitted = True
except RuntimeError as e:
print(e)
Expand Down
3 changes: 2 additions & 1 deletion PopPUNK/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import pickle
import graph_tool.all as gt
import dendropy
import poppunk_refine

# Load GPU libraries
try:
Expand All @@ -36,6 +35,8 @@
except ImportError as e:
gpu_lib = False

import poppunk_refine

from .__main__ import accepted_weights_types
from .__main__ import betweenness_sample_default

Expand Down
97 changes: 85 additions & 12 deletions PopPUNK/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@
except ImportError as e:
sys.stderr.write("This version of PopPUNK requires python v3.8 or higher\n")
sys.exit(0)
import pp_sketchlib
import poppunk_refine
import graph_tool.all as gt
import pandas as pd

Expand All @@ -37,9 +35,11 @@
except ImportError as e:
gpu_lib = False

import poppunk_refine

from .__main__ import betweenness_sample_default

from .network import construct_network_from_df
from .network import construct_network_from_df, printClusters
from .network import construct_network_from_edge_list
from .network import networkSummary
from .network import add_self_loop
Expand Down Expand Up @@ -96,6 +96,8 @@ def refineFit(distMat, sample_names, mean0, mean1, scale,
x-coordinate of refined fit
optimal_y (float)
y-coordinate of refined fit
optimised_s (float)
Position along search range of refined fit
"""
# Optimize boundary - grid search for global minimum
sys.stderr.write("Trying to optimise score globally\n")
Expand Down Expand Up @@ -165,6 +167,7 @@ def refineFit(distMat, sample_names, mean0, mean1, scale,
min_idx = np.argmin(global_s)
optimal_x = x_max[min_idx % global_grid_resolution]
optimal_y = y_max[min_idx // global_grid_resolution]
optimised_s = global_s[min_idx]

if not (optimal_x > x_max_start and optimal_x < x_max_end and \
optimal_y > y_max_start and optimal_y < y_max_end):
Expand Down Expand Up @@ -249,7 +252,62 @@ def refineFit(distMat, sample_names, mean0, mean1, scale,
if optimal_x < 0 or optimal_y < 0:
raise RuntimeError("Optimisation failed: produced a boundary outside of allowed range\n")

return optimal_x, optimal_y
return optimal_x, optimal_y, optimised_s

def multi_refine(distMat, sample_names, mean0, mean1, s_max,
n_boundary_points, output_prefix,
num_processes = 1, use_gpu = False):
"""Move the refinement boundary between the optimum and where it meets an
axis. Discrete steps, output the clusers at each step
Args:
distMat (numpy.array)
n x 2 array of core and accessory distances for n samples
sample_names (list)
List of query sequence labels
mean0 (numpy.array)
Start point to define search line
mean1 (numpy.array)
End point to define search line
s_max (float)
The optimal s position from refinement (:func:`~PopPUNK.refine.refineFit`)
n_boundary_points (int)
Number of positions to try drawing the boundary at
num_processes (int)
Number of threads to use in the global optimisation step.
(default = 1)
use_gpu (bool)
Whether to use cugraph for graph analyses
"""
# load CUDA libraries
use_gpu = check_and_set_gpu(use_gpu, gpu_lib)

# Set the range
# Between optimised s and where line meets an axis
gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0])
# Equation of normal passing through origin y = -1/m * x
# Where this meets line y - y1 = m(x - x1) is at:
x = (gradient * mean0[0] - mean0[1]) / (gradient + 1 / gradient)
y = mean0[1] + gradient * (x - mean0[0])

s_min = -((mean0[0] - x)**2 + (mean0[1] - y)**2)**0.5
s_range = np.linspace(s_min, s_max, num = n_boundary_points)[1:]

i_vec, j_vec, idx_vec = \
poppunk_refine.thresholdIterate1D(distMat, s_range, 2,
mean0[0], mean0[1],
mean1[0], mean1[1],
num_processes)

growNetwork(sample_names,
i_vec,
j_vec,
idx_vec,
s_range,
0,
write_clusters = output_prefix,
use_gpu = use_gpu)


def expand_cugraph_network(G, G_extra_df):
"""Reconstruct a cugraph network with additional edges.
Expand All @@ -276,9 +334,9 @@ def expand_cugraph_network(G, G_extra_df):
G = add_self_loop(G_df, G_vertex_count, weights = False, renumber = False)
return G

def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx = 0,
thread_idx = 0, betweenness_sample = betweenness_sample_default,
use_gpu = False):
write_clusters = None, use_gpu = False):
"""Construct a network, then add edges to it iteratively.
Input is from ``pp_sketchlib.iterateBoundary1D`` or``pp_sketchlib.iterateBoundary2D``
Expand All @@ -302,6 +360,9 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
betweenness_sample (int)
Number of sequences per component used to estimate betweenness using
a GPU. Smaller numbers are faster but less precise [default = 100]
write_clusters (str)
Set to a prefix to write the clusters from each position to files
[default = None]
use_gpu (bool)
Whether to use cugraph for graph analyses
Expand All @@ -310,7 +371,6 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
-1 * network score for each of x_range.
Where network score is from :func:`~PopPUNK.network.networkSummary`
"""

scores = []
prev_idx = -1

Expand Down Expand Up @@ -338,9 +398,9 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
# At first offset, make a new network, otherwise just add the new edges
if prev_idx == -1:
G = construct_network_from_df(sample_names, sample_names,
edge_df,
summarise = False,
use_gpu = use_gpu)
edge_df,
summarise = False,
use_gpu = use_gpu)
else:
if use_gpu:
G = expand_cugraph_network(G, edge_df)
Expand All @@ -349,13 +409,26 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx,
G.add_edge_list(edge_list)
edge_list = []
# Add score into vector for any offsets passed (should usually just be one)
latest_score = -networkSummary(G,
G_summary = networkSummary(G,
score_idx > 0,
betweenness_sample = betweenness_sample,
use_gpu = use_gpu)[1][score_idx]
use_gpu = use_gpu)
latest_score = -G_summary[1][score_idx]
for s in range(prev_idx, idx):
scores.append(latest_score)
pbar.update(1)
# Write the cluster output as long as there is at least one
# non-trivial cluster
if write_clusters and G_summary[0][0] < len(sample_names):
o_prefix = write_clusters + "/" + \
os.path.basename(write_clusters) + \
"_boundary" + str(s)
printClusters(G,
sample_names,
outPrefix=o_prefix,
write_unwords=False,
use_gpu=use_gpu)

prev_idx = idx

return(scores)
Expand Down
2 changes: 1 addition & 1 deletion PopPUNK/sketchlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,7 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num
query_db = queryPrefix + "/" + os.path.basename(queryPrefix)
distMat = pp_sketchlib.queryDatabase(ref_db, query_db, rNames, qNames, klist,
True, False, threads, use_gpu, deviceid)

# option to plot core/accessory fits. Choose a random number from cmd line option
if number_plot_fits > 0:
jacobian = -np.hstack((np.ones((klist.shape[0], 1)), klist.reshape(-1, 1)))
Expand Down
Loading

0 comments on commit fc4fcf5

Please sign in to comment.