diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..88ddd57e --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,51 @@ +cmake_minimum_required(VERSION 3.18) +project(poppunk_refine) +set(CMAKE_CXX_STANDARD 14) + +# Variable definitions +set(TARGET_NAME poppunk_refine) +add_compile_definitions(PYTHON_EXT) + +# Add -O0 to remove optimizations when using gcc +IF(CMAKE_COMPILER_IS_GNUCC) + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0") +ENDIF(CMAKE_COMPILER_IS_GNUCC) + +if(UNIX AND NOT APPLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS") + set(CMAKE_LD_FLAGS "${CMAKE_LDFLAGS} -Wl,--as-needed") +endif() + +# Set paths for non standard lib/ and include/ locations +if(DEFINED ENV{CONDA_PREFIX}) + include_directories($ENV{CONDA_PREFIX}/include) + link_directories($ENV{CONDA_PREFIX}/lib) + link_directories($ENV{CONDA_PREFIX}/lib/intel64) +endif() + +# Add libraries +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) +find_package(pybind11 REQUIRED) +find_package (Eigen3 3.3 REQUIRED NO_MODULE) +#find_package(OpenMP) # This links system openmp if present - conda sorts out rpath but take care + +# Define python library target +add_library("${TARGET_NAME}" MODULE) + +# Compile CPU library +target_sources("${TARGET_NAME}" PRIVATE src/python_bindings.cpp + src/boundary.cpp) + +set_target_properties("${TARGET_NAME}" PROPERTIES + CXX_VISIBILITY_PRESET "hidden" + INTERPROCEDURAL_OPTIMIZATION TRUE + PREFIX "${PYTHON_MODULE_PREFIX}" + SUFFIX "${PYTHON_MODULE_EXTENSION}" +) + +target_link_libraries("${TARGET_NAME}" PRIVATE pybind11::module Eigen3::Eigen + z gomp openblas gfortran m dl) +#if(OpenMP_CXX_FOUND) +# target_link_libraries("${TARGET_NAME}" PRIVATE OpenMP::OpenMP_CXX) +#endif() diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py index 0d38638d..1022c21d 100644 --- a/PopPUNK/__init__.py +++ b/PopPUNK/__init__.py @@ -3,9 +3,9 @@ '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)''' -__version__ = '2.3.0' +__version__ = '2.4.0' # Minimum sketchlib version SKETCHLIB_MAJOR = 1 SKETCHLIB_MINOR = 6 -SKETCHLIB_PATCH = 0 +SKETCHLIB_PATCH = 2 diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 4c59b145..63924906 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -10,8 +10,15 @@ import subprocess from collections import defaultdict -# required from v2.1.1 onwards (no mash support) -import pp_sketchlib +# Try to import sketchlib +try: + import pp_sketchlib +except ImportError as e: + sys.stderr.write("Sketchlib backend not available\n") + sys.exit(1) + +import poppunk_refine +import h5py # import poppunk package from .__init__ import __version__ @@ -114,13 +121,19 @@ def get_options(): type=float, default = None) refinementGroup.add_argument('--manual-start', help='A file containing information for a start point. ' 'See documentation for help.', default=None) - refinementGroup.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually', - choices=['both', 'core', 'accessory'], default = False) refinementGroup.add_argument('--no-local', help='Do not perform the local optimization step (speed up on very large datasets)', default=False, action='store_true') refinementGroup.add_argument('--model-dir', help='Directory containing model to use for assigning queries ' 'to clusters [default = reference database directory]', type = str) - refinementGroup.add_argument('--core-only', help='Save the core distance fit (with ') + refinementGroup.add_argument('--score-idx', + help='Index of score to use [default = 0]', + type=int, default = 0, choices=[0, 1, 2]) + refineMode = refinementGroup.add_mutually_exclusive_group() + refineMode.add_argument('--unconstrained', + help='Optimise both boundary gradient and intercept', + default=False, action='store_true') + refineMode.add_argument('--indiv-refine', help='Also run refinement for core and accessory individually', + choices=['both', 'core', 'accessory'], default=False) # lineage clustering within strains lineagesGroup = parser.add_argument_group('Lineage analysis options') @@ -334,7 +347,8 @@ def main(): model_prefix + "/" + os.path.basename(model_prefix) + '_fit.npz', output) sys.stderr.write("Loaded previous model of type: " + model.type + "\n") - if args.fit_model == "refine" and (model.type != 'bgmm' and model.type != 'dbscan'): + if args.fit_model == "refine" and args.manual_start == None \ + and model.type != 'bgmm' and model.type != 'dbscan': sys.stderr.write("Model needs to be from BGMM or DBSCAN to refine\n") sys.exit(1) @@ -368,6 +382,8 @@ def main(): args.pos_shift, args.neg_shift, args.manual_start, args.indiv_refine, + args.unconstrained, + args.score_idx, args.no_local, args.threads) new_model.plot(distMat) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 422b13f1..902cf738 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -20,6 +20,7 @@ from scipy.sparse import coo_matrix, bmat, find import pp_sketchlib +import poppunk_refine # BGMM from .bgmm import fit2dMultiGaussian @@ -526,9 +527,10 @@ def __init__(self, outPrefix): self.within_label = -1 self.slope = 2 self.threshold = False + self.unconstrained = False def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False, - no_local = False, threads = 1): + unconstrained = False, score_idx = 0, no_local = False, threads = 1): '''Extends :func:`~ClusterFit.fit` Fits the distances by optimising network score, by calling @@ -557,6 +559,11 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi Run refinement for core and accessory distances separately (default = False). + unconstrained (bool) + If True, search in 2D and change the slope of the boundary + score_idx (int) + Index of score from :func:`~PopPUNK.network.networkSummary` to use + [default = 0] no_local (bool) Turn off the local optimisation step. Quicker, but may be less well refined. @@ -572,9 +579,9 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi self.scale = np.copy(model.scale) self.max_move = max_move self.min_move = min_move + self.unconstrained = unconstrained # Get starting point - assignment = model.assign(X) model.no_scale() if startFile: self.mean0, self.mean1, self.start_s = readManualStart(startFile) @@ -607,9 +614,11 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi raise RuntimeError("Unrecognised model type") # Main refinement in 2D - self.start_point, self.optimal_x, self.optimal_y, self.min_move, self.max_move = refineFit(X/self.scale, - sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move, - slope = 2, no_local = no_local, num_processes = threads) + self.start_point, self.optimal_x, self.optimal_y, self.min_move, self.max_move = \ + refineFit(X/self.scale, + sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move, + slope = 2, score_idx = score_idx, unconstrained = unconstrained, + no_local = no_local, num_processes = threads) self.fitted = True # Try and do a 1D refinement for both core and accessory @@ -619,13 +628,15 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi try: sys.stderr.write("Refining core and accessory separately\n") # optimise core distance boundary - start_point, self.core_boundary, core_acc, self.min_move, self.max_move = refineFit(X/self.scale, - sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move, - slope = 0, no_local = no_local,num_processes = threads) + start_point, self.core_boundary, core_acc, self.min_move, self.max_move = \ + refineFit(X/self.scale, + sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move, + slope = 0, score_idx = score_idx, no_local = no_local,num_processes = threads) # optimise accessory distance boundary - start_point, acc_core, self.accessory_boundary, self.min_move, self.max_move = refineFit(X/self.scale, - sample_names, self.start_s,self.mean0, self.mean1, self.max_move, self.min_move, slope = 1, - no_local = no_local, num_processes = threads) + start_point, acc_core, self.accessory_boundary, self.min_move, self.max_move = \ + refineFit(X/self.scale, + sample_names, self.start_s,self.mean0, self.mean1, self.max_move, self.min_move, + slope = 1, score_idx = score_idx, no_local = no_local, num_processes = threads) self.indiv_fitted = True except RuntimeError as e: sys.stderr.write("Could not separately refine core and accessory boundaries. " @@ -670,6 +681,7 @@ def apply_threshold(self, X, threshold): self.fitted = True self.threshold = True self.indiv_fitted = False + self.unconstrained = False y = self.assign(X) return y @@ -740,8 +752,8 @@ def plot(self, X, y=None): plot_refined_results(plot_X, self.assign(plot_X), self.optimal_x, self.optimal_y, self.core_boundary, self.accessory_boundary, self.mean0, self.mean1, self.start_point, self.min_move, - self.max_move, self.scale, self.threshold, self.indiv_fitted, "Refined fit boundary", - self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_refined_fit") + self.max_move, self.scale, self.threshold, self.indiv_fitted, self.unconstrained, + "Refined fit boundary", self.outPrefix + "/" + os.path.basename(self.outPrefix) + "_refined_fit") def assign(self, X, slope=None, cpus=1): @@ -765,11 +777,11 @@ def assign(self, X, slope=None, cpus=1): raise RuntimeError("Trying to assign using an unfitted model") else: if slope == 2 or (slope == None and self.slope == 2): - y = pp_sketchlib.assignThreshold(X/self.scale, 2, self.optimal_x, self.optimal_y, cpus) + y = poppunk_refine.assignThreshold(X/self.scale, 2, self.optimal_x, self.optimal_y, cpus) elif slope == 0 or (slope == None and self.slope == 0): - y = pp_sketchlib.assignThreshold(X/self.scale, 0, self.core_boundary, 0, cpus) + y = poppunk_refine.assignThreshold(X/self.scale, 0, self.core_boundary, 0, cpus) elif slope == 1 or (slope == None and self.slope == 1): - y = pp_sketchlib.assignThreshold(X/self.scale, 1, 0, self.accessory_boundary, cpus) + y = poppunk_refine.assignThreshold(X/self.scale, 1, 0, self.accessory_boundary, cpus) return y diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3e993600..b5c9cadd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -127,6 +127,8 @@ def cliquePrune(component, graph, reference_indices, components_list): """Wrapper function around :func:`~getCliqueRefs` so it can be called by a multiprocessing pool """ + if gt.openmp_enabled(): + gt.openmp_set_num_threads(1) subgraph = gt.GraphView(graph, vfilt=components_list == component) refs = reference_indices.copy() if subgraph.num_vertices() <= 2: @@ -364,39 +366,60 @@ def constructNetwork(rlist, qlist, assignments, within_label, # print some summaries if summarise: - (components, density, transitivity, score) = networkSummary(G) - sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t" + str(components), - "\tDensity\t" + "{:.4f}".format(density), - "\tTransitivity\t" + "{:.4f}".format(transitivity), - "\tScore\t" + "{:.4f}".format(score)]) + (metrics, scores) = networkSummary(G) + sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]), + "\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]), + "\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]), + "\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]), + "\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]), + "\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]), + "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), + "\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])]) + "\n") return G -def networkSummary(G): +def networkSummary(G, calc_betweenness=True): """Provides summary values about the network Args: G (graph) The network of strains from :func:`~constructNetwork` + calc_betweenness (bool) + Whether to calculate betweenness stats Returns: - components (int) - The number of connected components (and clusters) - density (float) - The proportion of possible edges used - transitivity (float) - Network transitivity (triads/triangles) - score (float) - A score of network fit, given by :math:`\mathrm{transitivity} * (1-\mathrm{density})` + metrics (list) + List with # components, density, transitivity, mean betweenness + and weighted mean betweenness + scores (list) + List of scores """ component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] - score = transitivity * (1-density) - return(components, density, transitivity, score) + mean_bt = 0 + weighted_mean_bt = 0 + if calc_betweenness: + betweenness = [] + sizes = [] + for component, size in enumerate(component_frequencies): + if size > 3: + vfilt = component_assignments.a == component + subgraph = gt.GraphView(G, vfilt=vfilt) + betweenness.append(max(gt.betweenness(subgraph, norm = True)[0].a)) + sizes.append(size) + + if len(betweenness) > 1: + mean_bt = np.mean(betweenness) + weighted_mean_bt = np.average(betweenness, weights=sizes) + + metrics = [components, density, transitivity, mean_bt, weighted_mean_bt] + base_score = transitivity * (1 - density) + scores = [base_score, base_score * (1 - metrics[3]), base_score * (1 - metrics[4])] + return(metrics, scores) def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, assignments, model, queryDB, queryQuery = False, diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 2f0f23dc..521b41ae 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -28,6 +28,7 @@ from .trees import write_tree, mst_to_phylogeny from .utils import isolateNameToLabel +from .utils import decisionBoundary def plot_scatter(X, out_prefix, title, kde = True): """Draws a 2D scatter plot (png) of the core and accessory distances @@ -225,7 +226,7 @@ def plot_dbscan_results(X, y, n_clusters, out_prefix): def plot_refined_results(X, Y, x_boundary, y_boundary, core_boundary, accessory_boundary, mean0, mean1, start_point, min_move, max_move, scale, threshold, indiv_boundaries, - title, out_prefix): + unconstrained, title, out_prefix): """Draw a scatter plot (png) to show the refined model fit A scatter plot of core and accessory distances, coloured by component @@ -285,12 +286,21 @@ def plot_refined_results(X, Y, x_boundary, y_boundary, core_boundary, accessory_ # Draw boundary search range if mean0 is not None and mean1 is not None and min_move is not None and max_move is not None and start_point is not None: - minimum_xy = transformLine(-min_move, start_point, mean1) * scale - maximum_xy = transformLine(max_move, start_point, mean1) * scale - plt.plot([minimum_xy[0], maximum_xy[0]], [minimum_xy[1], maximum_xy[1]], - color='k', linewidth=1, linestyle=':', label='Search range') - start_point *= scale - plt.plot(start_point[0], start_point[1], 'ro', label='Initial boundary') + if unconstrained: + gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0]) + opt_start = decisionBoundary(mean0, gradient) * scale + opt_end = decisionBoundary(mean1, gradient) * scale + plt.fill([opt_start[0], opt_end[0], 0, 0], + [0, 0, opt_end[1], opt_start[1]], + fill=True, facecolor='lightcoral', alpha = 0.2, + label='Search range') + else: + minimum_xy = transformLine(-min_move, start_point, mean1) * scale + maximum_xy = transformLine(max_move, start_point, mean1) * scale + plt.plot([minimum_xy[0], maximum_xy[0]], [minimum_xy[1], maximum_xy[1]], + color='k', linewidth=1, linestyle=':', label='Search range') + start_point *= scale + plt.plot(start_point[0], start_point[1], 'ro', label='Initial boundary') mean0 *= scale mean1 *= scale diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 7bb5aa9b..4dab0523 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -7,10 +7,12 @@ import os import sys # additional +from itertools import chain from functools import partial import numpy as np import scipy.optimize import collections +from tqdm import tqdm try: from multiprocessing import Pool, shared_memory from multiprocessing.managers import SharedMemoryManager @@ -19,13 +21,18 @@ sys.stderr.write("This version of PopPUNK requires python v3.8 or higher\n") sys.exit(0) import pp_sketchlib +import poppunk_refine import graph_tool.all as gt from .network import constructNetwork from .network import networkSummary +from .utils import transformLine +from .utils import decisionBoundary + def refineFit(distMat, sample_names, start_s, mean0, mean1, - max_move, min_move, slope = 2, no_local = False, num_processes = 1): + max_move, min_move, slope = 2, score_idx = 0, + unconstrained = False, no_local = False, num_processes = 1): """Try to refine a fit by maximising a network score based on transitivity and density. Iteratively move the decision boundary to do this, using starting point from existing model. @@ -48,6 +55,11 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, slope (int) Set to 0 for a vertical line, 1 for a horizontal line, or 2 to use a slope + score_idx (int) + Index of score from :func:`~PopPUNK.network.networkSummary` to use + [default = 0] + unconstrained (bool) + If True, search in 2D and change the slope of the boundary no_local (bool) Turn off the local optimisation step. Quicker, but may be less well refined. @@ -77,53 +89,90 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, # Boundary is left of line normal to this point and first line gradient = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0]) - # ALTERNATIVE - use a single network - # Move boundary along in steps, and find those samples which have changed - # Use remove_edges/add_edges with index k lookup (n total) to find sample IDs - # https://stackoverflow.com/questions/27086195/linear-index-upper-triangular-matrix - # i = n - 2 - int(sqrt(-8*k + 4*n*(n-1)-7)/2.0 - 0.5) - # j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2 - # Optimize boundary - grid search for global minimum sys.stderr.write("Trying to optimise score globally\n") - global_grid_resolution = 40 # Seems to work - s_range = np.linspace(-min_move, max_move, num = global_grid_resolution) - - # Move distMat into shared memory - with SharedMemoryManager() as smm: - shm_distMat = smm.SharedMemory(size = distMat.nbytes) - distances_shared_array = np.ndarray(distMat.shape, dtype = distMat.dtype, buffer = shm_distMat.buf) - distances_shared_array[:] = distMat[:] - distances_shared = NumpyShared(name = shm_distMat.name, shape = distMat.shape, dtype = distMat.dtype) - - with Pool(processes = num_processes) as pool: - global_s = pool.map(partial(newNetwork, - sample_names = sample_names, - distMat = distances_shared, - start_point = start_point, - mean1 = mean1, - gradient = gradient, - slope = slope), - s_range) + + if unconstrained: + if slope != 2: + raise RuntimeError("Unconstrained optimization and indiv-refine incompatible") + + global_grid_resolution = 20 + x_max_start, y_max_start = decisionBoundary(mean0, gradient) + x_max_end, y_max_end = decisionBoundary(mean1, gradient) + x_max = np.linspace(x_max_start, x_max_end, global_grid_resolution, dtype=np.float32) + y_max = np.linspace(y_max_start, y_max_end, global_grid_resolution, dtype=np.float32) + + if gt.openmp_enabled(): + gt.openmp_set_num_threads(1) + + with SharedMemoryManager() as smm: + shm_distMat = smm.SharedMemory(size = distMat.nbytes) + distances_shared_array = np.ndarray(distMat.shape, dtype = distMat.dtype, buffer = shm_distMat.buf) + distances_shared_array[:] = distMat[:] + distances_shared = NumpyShared(name = shm_distMat.name, shape = distMat.shape, dtype = distMat.dtype) + + with Pool(processes = num_processes) as pool: + global_s = pool.map(partial(newNetwork2D, + sample_names = sample_names, + distMat = distances_shared, + x_range = x_max, + y_range = y_max, + score_idx = score_idx), + range(global_grid_resolution)) + + if gt.openmp_enabled(): + gt.openmp_set_num_threads(num_processes) + + global_s = list(chain.from_iterable(global_s)) + min_idx = np.argmin(np.array(global_s)) + optimal_x = x_max[min_idx % global_grid_resolution] + optimal_y = y_max[min_idx // global_grid_resolution] + + if not (optimal_x > x_max_start and optimal_x < x_max_end and \ + optimal_y > y_max_start and optimal_y < y_max_end): + no_local = True + elif not no_local: + # We have a fixed gradient and want to optimised the intercept + # This parameterisation is a little awkward to match the 1D case: + # Make two points along the right slope + gradient = optimal_x / optimal_y # of 1D search + delta = x_max[1] - x_max[0] + bounds = [-delta, delta] + start_point = (optimal_x, 0) + mean1 = (optimal_x + delta, delta * gradient) + + else: + global_grid_resolution = 40 # Seems to work + s_range = np.linspace(-min_move, max_move, num = global_grid_resolution) + i_vec, j_vec, idx_vec = \ + poppunk_refine.thresholdIterate1D(distMat, s_range, slope, + start_point[0], start_point[1], + mean1[0], mean1[1], num_processes) + global_s = growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx) + min_idx = np.argmin(np.array(global_s)) + if min_idx > 0 and min_idx < len(s_range) - 1: + bounds = [s_range[min_idx-1], s_range[min_idx+1]] + else: + no_local = True + optimised_s = s_range[min_idx] # Local optimisation around global optimum - min_idx = np.argmin(np.array(global_s)) - if min_idx > 0 and min_idx < len(s_range) - 1 and not no_local: + if not no_local: sys.stderr.write("Trying to optimise score locally\n") local_s = scipy.optimize.minimize_scalar(newNetwork, - bounds=[s_range[min_idx-1], s_range[min_idx+1]], + bounds=bounds, method='Bounded', options={'disp': True}, - args = (sample_names, distMat, start_point, mean1, gradient, slope)) + args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx)) optimised_s = local_s.x - else: - optimised_s = s_range[min_idx] - optimised_coor = transformLine(optimised_s, start_point, mean1) - if slope == 2: - optimal_x, optimal_y = decisionBoundary(optimised_coor, gradient) - else: - optimal_x = optimised_coor[0] - optimal_y = optimised_coor[1] + # Convert to x_max, y_max if needed + if not unconstrained or not no_local: + optimised_coor = transformLine(optimised_s, start_point, mean1) + if slope == 2: + optimal_x, optimal_y = decisionBoundary(optimised_coor, gradient) + else: + optimal_x = optimised_coor[0] + optimal_y = optimised_coor[1] if optimal_x < 0 or optimal_y < 0: raise RuntimeError("Optimisation failed: produced a boundary outside of allowed range\n") @@ -131,7 +180,71 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, return start_point, optimal_x, optimal_y, min_move, max_move -def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, slope=2, cpus = 1): +def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_idx = 0): + """Construct a network, then add edges to it iteratively. + Input is from ``pp_sketchlib.iterateBoundary1D`` or``pp_sketchlib.iterateBoundary2D`` + + Args: + sample_names (list) + Sample names corresponding to distMat (accessed by iterator) + i_vec (list) + Ordered ref vertex index to add + j_vec (list) + Ordered query (==ref) vertex index to add + idx_vec (list) + For each i, j tuple, the index of the intercept at which these enter + the network. These are sorted and increasing + s_range (list) + Offsets which correspond to idx_vec entries + score_idx (int) + Index of score from :func:`~PopPUNK.network.networkSummary` to use + [default = 0] + thread_idx (int) + Optional thread idx (if multithreaded) to offset progress bar by + Returns: + scores (list) + -1 * network score for each of x_range. + Where network score is from :func:`~PopPUNK.network.networkSummary` + """ + scores = [] + edge_list = [] + prev_idx = 0 + # Grow a network + with tqdm(total=(idx_vec[-1] + 1), + bar_format="{bar}| {n_fmt}/{total_fmt}", + ncols=40, + position=thread_idx) as pbar: + for i, j, idx in zip(i_vec, j_vec, idx_vec): + if idx > prev_idx: + # At first offset, make a new network, otherwise just add the new edges + if prev_idx == 0: + G = constructNetwork(sample_names, sample_names, edge_list, -1, + summarise=False, edge_list=True) + else: + G.add_edge_list(edge_list) + # Add score into vector for any offsets passed (should usually just be one) + for s in range(prev_idx, idx): + scores.append(-networkSummary(G, score_idx > 0)[1][score_idx]) + pbar.update(1) + prev_idx = idx + edge_list = [] + edge_list.append((i, j)) + + # Add score for final offset(s) at end of loop + if prev_idx == 0: + G = constructNetwork(sample_names, sample_names, edge_list, -1, + summarise=False, edge_list=True) + else: + G.add_edge_list(edge_list) + for s in range(prev_idx, len(s_range)): + scores.append(-networkSummary(G, score_idx > 0)[1][score_idx]) + pbar.update(1) + + return(scores) + + +def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, + slope=2, score_idx=0, cpus=1): """Wrapper function for :func:`~PopPUNK.network.constructNetwork` which is called by optimisation functions moving a triangular decision boundary. @@ -154,6 +267,10 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, slope=2, slope (int) Set to 0 for a vertical line, 1 for a horizontal line, or 2 to use a slope + [default = 2] + score_idx (int) + Index of score from :func:`~PopPUNK.network.networkSummary` to use + [default = 0] cpus (int) Number of CPUs to use for calculating assignment Returns: @@ -176,13 +293,50 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, slope=2, y_max = new_intercept[1] # Make network - boundary_assignments = pp_sketchlib.assignThreshold(distMat, slope, x_max, y_max, cpus) + boundary_assignments = poppunk_refine.assignThreshold(distMat, slope, x_max, y_max, cpus) G = constructNetwork(sample_names, sample_names, boundary_assignments, -1, summarise = False) # Return score - score = networkSummary(G)[3] + score = networkSummary(G, score_idx > 0)[1][score_idx] return(-score) +def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0): + """Wrapper function for thresholdIterate2D and :func:`growNetwork`. + + For a given y_max, constructs networks across x_range and returns a list + of scores + + Args: + y_idx (float) + Maximum y-intercept of boundary, as index into y_range + sample_names (list) + Sample names corresponding to distMat (accessed by iterator) + distMat (numpy.array or NumpyShared) + Core and accessory distances or NumpyShared describing these in sharedmem + x_range (list) + Sorted list of x-intercepts to search + y_range (list) + Sorted list of y-intercepts to search + score_idx (int) + Index of score from :func:`~PopPUNK.network.networkSummary` to use + [default = 0] + Returns: + scores (list) + -1 * network score for each of x_range. + Where network score is from :func:`~PopPUNK.network.networkSummary` + """ + if gt.openmp_enabled(): + gt.openmp_set_num_threads(1) + if isinstance(distMat, NumpyShared): + distMat_shm = shared_memory.SharedMemory(name = distMat.name) + distMat = np.ndarray(distMat.shape, dtype = distMat.dtype, buffer = distMat_shm.buf) + + y_max = y_range[y_idx] + i_vec, j_vec, idx_vec = \ + poppunk_refine.thresholdIterate2D(distMat, x_range, y_max) + scores = growNetwork(sample_names, i_vec, j_vec, idx_vec, x_range, score_idx, y_idx) + return(scores) + def readManualStart(startFile): """Reads a file to define a manual start point, rather than using ``--fit-model`` @@ -261,48 +415,3 @@ def likelihoodBoundary(s, model, start, end, within, between): responsibilities = model.assign(X, values = True) return(responsibilities[0, within] - responsibilities[0, between]) - -def transformLine(s, mean0, mean1): - """Return x and y co-ordinates for traversing along a line between mean0 and mean1, parameterised by - a single scalar distance s from the start point mean0. - - Args: - s (float) - Distance along line from mean0 - mean0 (numpy.array) - Start position of line (x0, y0) - mean1 (numpy.array) - End position of line (x1, y1) - Returns: - x (float) - The Cartesian x-coordinate - y (float) - The Cartesian y-coordinate - """ - tan_theta = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0]) - x = mean0[0] + s * (1/np.sqrt(1+tan_theta)) - y = mean0[1] + s * (tan_theta/np.sqrt(1+tan_theta)) - - return np.array([x, y]) - - -def decisionBoundary(intercept, gradient): - """Returns the co-ordinates where the triangle the decision boundary forms - meets the x- and y-axes. - - Args: - intercept (numpy.array) - Cartesian co-ordinates of point along line (:func:`~transformLine`) - which intercepts the boundary - gradient (float) - Gradient of the line - Returns: - x (float) - The x-axis intercept - y (float) - The y-axis intercept - """ - x = intercept[0] + intercept[1] * gradient - y = intercept[1] + intercept[0] / gradient - return(x, y) - diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 1ef1bb3e..0f33e196 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -20,13 +20,8 @@ import numpy as np from scipy import optimize -# Try to import sketchlib -try: - import pp_sketchlib - import h5py -except ImportError as e: - sys.stderr.write("Sketchlib backend not available") - sys.exit(1) +import pp_sketchlib +import h5py from .__init__ import SKETCHLIB_MAJOR, SKETCHLIB_MINOR, SKETCHLIB_PATCH from .utils import iterDistRows @@ -610,6 +605,7 @@ def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): hdf_in = h5py.File(db_name, 'r+') # try/except structure to prevent h5 corruption + failed_samples = False try: # process data structures read_grp = hdf_in['sketches'] @@ -643,7 +639,6 @@ def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): # open file to report QC failures with open(prefix + '/' + os.path.basename(prefix) + '_qcreport.txt', 'a+') as qc_file: # iterate through and filter - failed_sample = False for dataset in seq_length.keys(): # determine if sequence passes filters remove = False @@ -662,6 +657,7 @@ def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): if remove: sys.stderr.write(dataset + ' failed QC\n') + failed_samples = True failed.append(dataset) else: retained.append(dataset) @@ -690,7 +686,7 @@ def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): raise # stop if at least one sample fails QC and option is not continue/prune - if failed_sample and qc_dict['qc_filter'] == 'stop': + if failed_samples and qc_dict['qc_filter'] == 'stop': sys.stderr.write('Sequences failed QC filters - details in ' + \ prefix + '/' + os.path.basename(prefix) + \ '_qcreport.txt\n') diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index aec9b05e..bd3d2995 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -445,3 +445,48 @@ def createOverallLineage(rank_list, lineage_clusters): overall_lineages['overall'][isolate] = overall_lineage return overall_lineages + + +def transformLine(s, mean0, mean1): + """Return x and y co-ordinates for traversing along a line between mean0 and mean1, parameterised by + a single scalar distance s from the start point mean0. + + Args: + s (float) + Distance along line from mean0 + mean0 (numpy.array) + Start position of line (x0, y0) + mean1 (numpy.array) + End position of line (x1, y1) + Returns: + x (float) + The Cartesian x-coordinate + y (float) + The Cartesian y-coordinate + """ + tan_theta = (mean1[1] - mean0[1]) / (mean1[0] - mean0[0]) + x = mean0[0] + s * (1/np.sqrt(1+tan_theta)) + y = mean0[1] + s * (tan_theta/np.sqrt(1+tan_theta)) + + return np.array([x, y]) + + +def decisionBoundary(intercept, gradient): + """Returns the co-ordinates where the triangle the decision boundary forms + meets the x- and y-axes. + + Args: + intercept (numpy.array) + Cartesian co-ordinates of point along line (:func:`~transformLine`) + which intercepts the boundary + gradient (float) + Gradient of the line + Returns: + x (float) + The x-axis intercept + y (float) + The y-axis intercept + """ + x = intercept[0] + intercept[1] * gradient + y = intercept[1] + intercept[0] / gradient + return(x, y) \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 61baf7f4..b6376398 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -46,6 +46,7 @@ "matplotlib", "dendropy", "pp-sketchlib", + "poppunk_refine", "h5py", "flask", "flask-cors", diff --git a/docs/images/ecoli_refine_constrained.png b/docs/images/ecoli_refine_constrained.png new file mode 100644 index 00000000..00bec09a Binary files /dev/null and b/docs/images/ecoli_refine_constrained.png differ diff --git a/docs/images/ecoli_refine_unconstrained.png b/docs/images/ecoli_refine_unconstrained.png new file mode 100644 index 00000000..8a6af138 Binary files /dev/null and b/docs/images/ecoli_refine_unconstrained.png differ diff --git a/docs/images/unconstrained_refine.png b/docs/images/unconstrained_refine.png new file mode 100644 index 00000000..f2626545 Binary files /dev/null and b/docs/images/unconstrained_refine.png differ diff --git a/docs/installation.rst b/docs/installation.rst index 1cfe6f56..7233a7b6 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -67,10 +67,10 @@ conda or pip for this purpose). Dependencies ------------ -We tested PopPUNK with the following packages: +This documentation refers to a conda installation with the following packages: * python3 (3.8.2) -* ``pp-sketchlib`` (1.5.1) +* ``pp-sketchlib`` (1.6.2) * ``DendroPy`` (4.3.0) * ``hdbscan`` (0.8.13) * ``matplotlib`` (2.1.2) diff --git a/docs/model_fitting.rst b/docs/model_fitting.rst index d1239b04..1b0fde9c 100644 --- a/docs/model_fitting.rst +++ b/docs/model_fitting.rst @@ -99,10 +99,14 @@ Interpreting the network summary All fits will output a network summary which looks similar to this:: Network summary: - Components 31 - Density 0.0897 - Transitivity 1.0000 - Score 0.9103 + Components 59 + Density 0.0531 + Transitivity 0.9966 + Mean betweenness 0.0331 + Weighted-mean betweenness 0.0454 + Score 0.9438 + Score (w/ betweenness) 0.9126 + Score (w/ weighted-betweenness) 0.9009 - Components are the number of strains (clusters) found using this model. - Density is the proportion of distances assigned as 'within-strain'. Generally @@ -113,6 +117,8 @@ All fits will output a network summary which looks similar to this:: - Score synthesises the above as :math:`(1 - \mathrm{density}) * \mathrm{transitivity}`, which gives a single number between 0 (bad) and 1 (good) which in many cases is at a maximum when it accurately describes strains in the data. +- Two further scores for larger networks. See :ref:`alt-scores` for more information + on these. .. _bgmm: @@ -490,6 +496,101 @@ Which, looking at the `microreact output =1.6.0 + - pp-sketchlib >=1.6.2 - graph-tool - requests - flask - flask-cors - networkx + - pybind11 + - eigen + - openblas + - cmake >= 3.18 + - libgfortran-ng + - boost-cpp + - libgomp + - tqdm diff --git a/setup.py b/setup.py index 455167fc..a0bea06d 100644 --- a/setup.py +++ b/setup.py @@ -4,10 +4,16 @@ from setuptools import setup, find_packages from codecs import open from os import path -import os +import os, sys import re import io +import platform +import subprocess +from setuptools import setup, find_packages, Extension +from setuptools.command.build_ext import build_ext +from setuptools.command.install import install +from distutils.version import LooseVersion def read(*names, **kwargs): with io.open( @@ -25,6 +31,53 @@ def find_version(*file_paths): return version_match.group(1) raise RuntimeError("Unable to find version string.") +class CMakeExtension(Extension): + def __init__(self, name, sourcedir=''): + Extension.__init__(self, name, sources=[]) + self.sourcedir = os.path.abspath(sourcedir) + +class CMakeBuild(build_ext): + def run(self): + try: + out = subprocess.check_output(['cmake', '--version']) + except OSError: + raise RuntimeError("CMake must be installed to build the following extensions: " + + ", ".join(e.name for e in self.extensions)) + + if platform.system() == "Windows": + cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1)) + if cmake_version < '3.1.0': + raise RuntimeError("CMake >= 3.1.0 is required on Windows") + + for ext in self.extensions: + self.build_extension(ext) + + def build_extension(self, ext): + extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, + '-DPYTHON_EXECUTABLE=' + sys.executable, + '-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON'] + + cfg = 'Debug' if self.debug else 'Release' + build_args = ['--config', cfg] + + if platform.system() == "Windows": + cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)] + if sys.maxsize > 2**32: + cmake_args += ['-A', 'x64'] + build_args += ['--', '/m'] + else: + cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] + build_args += ['--', '-j2'] + + env = os.environ.copy() + env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''), + self.distribution.get_version()) + + if not os.path.exists(self.build_temp): + os.makedirs(self.build_temp) + subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env) + subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) here = path.abspath(path.dirname(__file__)) @@ -70,5 +123,8 @@ def find_version(*file_paths): 'scripts/poppunk_add_weights.py', 'scripts/poppunk_easy_run.py', 'scripts/poppunk_pickle_fix.py'], + ext_modules=[CMakeExtension('poppunk_refine')], test_suite="test", + cmdclass=dict(build_ext=CMakeBuild), + zip_safe=False ) diff --git a/src/boundary.cpp b/src/boundary.cpp new file mode 100644 index 00000000..8f38ed67 --- /dev/null +++ b/src/boundary.cpp @@ -0,0 +1,215 @@ +/* + * + * boundary.cpp + * Functions to move a network boundary + * + */ +#include // size_t +#include +#include // floor/sqrt +#include // assert +#include +#include +#include +#include + +// Parallel sort +#include + +#include "boundary.hpp" + +const float epsilon = 1E-10; + +template +inline size_t rows_to_samples(const T &longMat) +{ + return 0.5 * (1 + sqrt(1 + 8 * (longMat.rows()))); +} + +inline size_t calc_row_idx(const uint64_t k, const size_t n) +{ + return n - 2 - std::floor(std::sqrt(static_cast(-8 * k + 4 * n * (n - 1) - 7)) / 2 - 0.5); +} + +inline size_t calc_col_idx(const uint64_t k, const size_t i, const size_t n) +{ + return k + i + 1 - n * (n - 1) / 2 + (n - i) * ((n - i) - 1) / 2; +} + +inline size_t square_to_condensed(const size_t i, const size_t j, const size_t n) +{ + assert(j > i); + return (n * i - ((i * (i + 1)) >> 1) + j - 1 - i); +} + +//https://stackoverflow.com/a/12399290 +template +std::vector sort_indexes(const T &v, const uint32_t n_threads) +{ + // initialize original index locations + std::vector idx(v.size()); + std::iota(idx.begin(), idx.end(), 0); + + boost::sort::parallel_stable_sort( + idx.begin(), idx.end(), + [&v](long i1, long i2) { return v[i1] < v[i2]; }, + n_threads); + + return idx; +} + +// Unnormalised (signed_ distance between a point (x0, y0) and a line defined +// by the two points (xmax, 0) and (0, ymax) +// Divide by 1/sqrt(xmax^2 + ymax^2) to get distance +inline float line_dist(const float x0, + const float y0, + const float x_max, + const float y_max, + const int slope) +{ + float boundary_side = 0; + if (slope == 2) + { + boundary_side = y0 * x_max + x0 * y_max - x_max * y_max; + } + else if (slope == 0) + { + boundary_side = x0 - x_max; + } + else if (slope == 1) + { + boundary_side = y0 - y_max; + } + + return boundary_side; +} + +Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat, + const int slope, + const float x_max, + const float y_max, + unsigned int num_threads) +{ + Eigen::VectorXf boundary_test(distMat.rows()); + +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (long row_idx = 0; row_idx < distMat.rows(); row_idx++) + { + float in_tri = line_dist(distMat(row_idx, 0), distMat(row_idx, 1), + x_max, y_max, slope); + float boundary_side; + if (in_tri == 0) + { + boundary_side = 0; + } + else if (in_tri > 0) + { + boundary_side = 1; + } + else + { + boundary_side = -1; + } + boundary_test[row_idx] = boundary_side; + } + return (boundary_test); +} + +// Line defined between (x0, y0) and (x1, y1) +// Offset is distance along this line, starting at (x0, y0) +network_coo threshold_iterate_1D(const NumpyMatrix &distMat, + const std::vector &offsets, + const int slope, + const float x0, + const float y0, + const float x1, + const float y1, + const int num_threads) +{ + std::vector i_vec; + std::vector j_vec; + std::vector offset_idx; + const float gradient = (y1 - y0) / (x1 - x0); // == tan(theta) + const size_t n_samples = rows_to_samples(distMat); + + std::vector boundary_dist(distMat.rows()); + std::vector boundary_order; + long sorted_idx = 0; + for (int offset_nr = 0; offset_nr < offsets.size(); ++offset_nr) + { + float x_intercept = x0 + offsets[offset_nr] * (1 / std::sqrt(1 + gradient)); + float y_intercept = y0 + offsets[offset_nr] * (gradient / std::sqrt(1 + gradient)); + float x_max, y_max; + if (slope == 2) + { + x_max = x_intercept + y_intercept * gradient; + y_max = y_intercept + x_intercept / gradient; + } + else if (slope == 0) + { + x_max = x_intercept; + y_max = 0; + } + else + { + x_max = 0; + y_max = y_intercept; + } + + // Calculate the distances and sort them on the first loop entry + if (offset_nr == 0) + { +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (long row_idx = 0; row_idx < distMat.rows(); row_idx++) + { + boundary_dist[row_idx] = line_dist(distMat(row_idx, 0), distMat(row_idx, 1), x_max, y_max, slope); + } + boundary_order = sort_indexes(boundary_dist, num_threads); + } + + long row_idx = boundary_order[sorted_idx]; + while (sorted_idx < boundary_order.size() && + line_dist(distMat(row_idx, 0), + distMat(row_idx, 1), + x_max, y_max, slope) <= 0) + { + long i = calc_row_idx(row_idx, n_samples); + long j = calc_col_idx(row_idx, i, n_samples); + i_vec.push_back(i); + j_vec.push_back(j); + offset_idx.push_back(offset_nr); + sorted_idx++; + row_idx = boundary_order[sorted_idx]; + } + } + return (std::make_tuple(i_vec, j_vec, offset_idx)); +} + +network_coo threshold_iterate_2D(const NumpyMatrix &distMat, + const std::vector &x_max, + const float y_max) +{ + std::vector i_vec; + std::vector j_vec; + std::vector offset_idx; + const size_t n_samples = rows_to_samples(distMat); + + for (int offset_nr = 0; offset_nr < x_max.size(); ++offset_nr) + { + for (long row_idx = 0; row_idx < distMat.rows(); row_idx++) + { + if (line_dist(distMat(row_idx, 0), distMat(row_idx, 1), x_max[offset_nr], y_max, 2) <= 0) + { + if (offset_nr == 0 || (line_dist(distMat(row_idx, 0), distMat(row_idx, 1), x_max[offset_nr - 1], y_max, 2) > 0)) + { + long i = calc_row_idx(row_idx, n_samples); + long j = calc_col_idx(row_idx, i, n_samples); + i_vec.push_back(i); + j_vec.push_back(j); + offset_idx.push_back(offset_nr); + } + } + } + } + return (std::make_tuple(i_vec, j_vec, offset_idx)); +} diff --git a/src/boundary.hpp b/src/boundary.hpp new file mode 100644 index 00000000..820daff4 --- /dev/null +++ b/src/boundary.hpp @@ -0,0 +1,36 @@ +/* + * + * matrix.hpp + * functions in matrix_ops.cpp + * + */ +#pragma once + +#include +#include +#include +#include + +#include + +typedef Eigen::Matrix NumpyMatrix; +typedef std::tuple, std::vector, std::vector> network_coo; + +Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat, + const int slope, + const float x_max, + const float y_max, + unsigned int num_threads); + +network_coo threshold_iterate_1D(const NumpyMatrix &distMat, + const std::vector &offsets, + const int slope, + const float x0, + const float y0, + const float x1, + const float y1, + const int num_threads = 1); + +network_coo threshold_iterate_2D(const NumpyMatrix &distMat, + const std::vector &x_max, + const float y_max); diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp new file mode 100644 index 00000000..0171ad5e --- /dev/null +++ b/src/python_bindings.cpp @@ -0,0 +1,88 @@ +/* + * sketchlib_bindings.cpp + * Python bindings for PopPUNK C++ functions + * + */ + +// pybind11 headers +#include +#include +#include + +namespace py = pybind11; + +#include "boundary.hpp" + +// Wrapper which makes a ref to the python/numpy array +Eigen::VectorXf assignThreshold(const Eigen::Ref &distMat, + const int slope, + const double x_max, + const double y_max, + const unsigned int num_threads = 1) +{ + Eigen::VectorXf assigned = assign_threshold(distMat, + slope, + x_max, + y_max, + num_threads); + return (assigned); +} + +network_coo thresholdIterate1D(const Eigen::Ref &distMat, + const std::vector &offsets, + const int slope, + const double x0, + const double y0, + const double x1, + const double y1, + const int num_threads) +{ + if (!std::is_sorted(offsets.begin(), offsets.end())) + { + throw std::runtime_error("Offsets to thresholdIterate1D must be sorted"); + } + std::tuple, std::vector, std::vector> add_idx = + threshold_iterate_1D(distMat, offsets, slope, x0, y0, x1, y1, num_threads); + return (add_idx); +} + +network_coo thresholdIterate2D(const Eigen::Ref &distMat, + const std::vector &x_max, + const float y_max) +{ + if (!std::is_sorted(x_max.begin(), x_max.end())) + { + throw std::runtime_error("x_max range to thresholdIterate2D must be sorted"); + } + std::tuple, std::vector, std::vector> add_idx = + threshold_iterate_2D(distMat, x_max, y_max); + return (add_idx); +} + +PYBIND11_MODULE(poppunk_refine, m) +{ + m.doc() = "Network refine helper functions"; + + // Exported functions + m.def("assignThreshold", &assignThreshold, py::return_value_policy::reference_internal, "Assign samples based on their relation to a 2D boundary", + py::arg("distMat").noconvert(), + py::arg("slope"), + py::arg("x_max"), + py::arg("y_max"), + py::arg("num_threads") = 1); + + m.def("thresholdIterate1D", &thresholdIterate1D, py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset", + py::arg("distMat").noconvert(), + py::arg("offsets"), + py::arg("slope"), + py::arg("x0"), + py::arg("y0"), + py::arg("x1"), + py::arg("y1"), + py::arg("num_threads") = 1); + + m.def("thresholdIterate2D", &thresholdIterate2D, py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset", + py::arg("distMat").noconvert(), + py::arg("x_max"), + py::arg("y_max")); +} diff --git a/test/run_test.py b/test/run_test.py index 03ab8942..c3f11051 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -31,6 +31,8 @@ sys.stderr.write("Running model refinement (--fit-model refine)\n") subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both", shell=True, check=True) +subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) +subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2", shell=True, check=True) subprocess.run("python ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold", shell=True, check=True) # lineage clustering @@ -42,6 +44,8 @@ subprocess.run("python ../poppunk-runner.py --use-model --ref-db example_db --model-dir example_db --output example_use --overwrite", shell=True, check=True) # tests of other command line programs +sys.stderr.write("Testing C++ extension\n") +subprocess.run("python test-refine.py", shell=True, check=True) #assign query sys.stderr.write("Running query assignment\n") diff --git a/test/test-refine.py b/test/test-refine.py new file mode 100644 index 00000000..0a80447b --- /dev/null +++ b/test/test-refine.py @@ -0,0 +1,106 @@ +import os, sys +import numpy as np +from math import sqrt + +# testing without install +#sys.path.insert(0, '../build/lib.macosx-10.9-x86_64-3.8') +import poppunk_refine + +# Original PopPUNK function (with some improvements) +def withinBoundary(dists, x_max, y_max, slope=2): + boundary_test = np.ones((dists.shape[0])) + for row in range(boundary_test.size): + if slope == 2: + in_tri = dists[row, 1] * x_max + dists[row, 0] * y_max - x_max * y_max + elif slope == 0: + in_tri = dists[row, 0] - x_max + elif slope == 1: + in_tri = dists[row, 1] - y_max + if abs(in_tri) < np.finfo(np.float32).eps: + boundary_test[row] = 0 + elif in_tri < 0: + boundary_test[row] = -1 + return(boundary_test) + +def check_res(res, expected): + if (not np.all(res == expected)): + print(res) + print(expected) + raise RuntimeError("Results don't match") + +# assigning +x = np.arange(0, 1, 0.1, dtype=np.float32) +y = np.arange(0, 1, 0.1, dtype=np.float32) +xv, yv = np.meshgrid(x, y) +distMat = np.hstack((xv.reshape(-1,1), yv.reshape(-1,1))) +assign0 = poppunk_refine.assignThreshold(distMat, 0, 0.5, 0.5, 2) +assign1 = poppunk_refine.assignThreshold(distMat, 1, 0.5, 0.5, 2) +assign2 = poppunk_refine.assignThreshold(distMat, 2, 0.5, 0.5, 2) + +assign0_res = withinBoundary(distMat, 0.5, 0.5, 0) +assign1_res = withinBoundary(distMat, 0.5, 0.5, 1) +assign2_res = withinBoundary(distMat, 0.5, 0.5, 2) + +check_res(assign0, assign0_res) +check_res(assign1, assign1_res) +check_res(assign2, assign2_res) + +# move boundary 1D +# example is symmetrical at points (0.1, 0.1); (0.2, 0.2); (0.3, 0.3) +samples = 100 +distMat = np.random.rand(int(0.5 * samples * (samples - 1)), 2) +distMat = np.array(distMat, dtype = np.float32) +offsets = [x * sqrt(2) for x in [-0.1, 0.0, 0.1]] +i_vec, j_vec, idx_vec = poppunk_refine.thresholdIterate1D(distMat, offsets, 2, 0.2, 0.2, 0.3, 0.3) +sketchlib_i = [] +sketchlib_j = [] +for offset_idx, offset in enumerate(offsets): + for i, j, idx in zip(i_vec, j_vec, idx_vec): + if idx > offset_idx: + break + elif idx == offset_idx: + sketchlib_i.append(i) + sketchlib_j.append(j) + + py_i = [] + py_j = [] + xmax = 0.4 + (2 * (offset/sqrt(2))) + assign = poppunk_refine.assignThreshold(distMat, 2, xmax, xmax, 1) + dist_idx = 0 + for i in range(samples): + for j in range(i + 1, samples): + if assign[dist_idx] <= 0: + py_i.append(i) + py_j.append(j) + dist_idx += 1 + if set(zip(py_i, py_j)) != set(zip(sketchlib_i, sketchlib_j)): + raise RuntimeError("Threshold 1D iterate mismatch at offset " + str(offset)) + +# move boundary 2D +# example is for boundaries (0.1, 0.2); (0.2, 0.2); (0.3, 0.2) +offsets = [0.1, 0.2, 0.3] +y_max = 0.2 +i_vec, j_vec, idx_vec = poppunk_refine.thresholdIterate2D(distMat, offsets, y_max) +sketchlib_i = [] +sketchlib_j = [] +for offset_idx, offset in enumerate(offsets): + for i, j, idx in zip(i_vec, j_vec, idx_vec): + if idx > offset_idx: + break + elif idx == offset_idx: + sketchlib_i.append(i) + sketchlib_j.append(j) + + py_i = [] + py_j = [] + assign = poppunk_refine.assignThreshold(distMat, 2, offset, y_max, 1) + dist_idx = 0 + for i in range(samples): + for j in range(i + 1, samples): + if assign[dist_idx] <= 0: + py_i.append(i) + py_j.append(j) + dist_idx += 1 + if set(zip(py_i, py_j)) != set(zip(sketchlib_i, sketchlib_j)): + raise RuntimeError("Threshold 2D iterate mismatch at offset " + str(offset)) +