From 932075aebc047c362167e14cddb77f05b6f5aa98 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 24 May 2021 20:51:41 +0100 Subject: [PATCH 001/175] Change individual boundary refinement --- PopPUNK/models.py | 53 +++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 68e14c14..cf15ce6a 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -40,6 +40,7 @@ import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False @@ -810,24 +811,40 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi self.accessory_boundary = self.optimal_y if indiv_refine is not None: try: - for dist_type, slope in zip(['core', 'accessory'], [0, 1]): - if indiv_refine == 'both' or indiv_refine == dist_type: - sys.stderr.write("Refining " + dist_type + " distances separately\n") - # optimise core distance boundary - start_point, self.core_boundary, core_acc, self.min_move, self.max_move = \ - refineFit(X/self.scale, - sample_names, - self.start_s, - self.mean0, - self.mean1, - self.max_move, - self.min_move, - slope = slope, - score_idx = score_idx, - no_local = no_local, - num_processes = self.threads, - betweenness_sample = betweenness_sample, - use_gpu = use_gpu) + if indiv_refine == 'both' or indiv_refine == 'core': + sys.stderr.write("Refining core distances separately\n") + # optimise core distance boundary + start_point, self.core_boundary, unused_accessory_boundary, self.min_move, self.max_move = \ + refineFit(X/self.scale, + sample_names, + self.start_s, + self.mean0, + self.mean1, + self.max_move, + self.min_move, + slope = 0, + score_idx = score_idx, + no_local = no_local, + num_processes = self.threads, + betweenness_sample = betweenness_sample, + use_gpu = use_gpu) + if indiv_refine == 'both' or indiv_refine == 'accessory': + sys.stderr.write("Refining accessory distances separately\n") 
+ # optimise core distance boundary + start_point, unused_core_boundary, self.accessory_boundary, self.min_move, self.max_move = \ + refineFit(X/self.scale, + sample_names, + self.start_s, + self.mean0, + self.mean1, + self.max_move, + self.min_move, + slope = 1, + score_idx = score_idx, + no_local = no_local, + num_processes = self.threads, + betweenness_sample = betweenness_sample, + use_gpu = use_gpu) self.indiv_fitted = True except RuntimeError as e: print(e) From 8acd320cbd6853efa7389630cf4147928bcfd607 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 24 May 2021 20:56:26 +0100 Subject: [PATCH 002/175] Change GPU shared memory management --- PopPUNK/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index e2da1fc2..1f73048e 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -22,6 +22,9 @@ try: import cudf + import rmm + import cupy + from numba import cuda gpu_lib = True except ImportError as e: gpu_lib = False @@ -624,6 +627,12 @@ def check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = False): # Set memory management for large networks if use_gpu: + rmm.reinitialize(managed_memory=True) cudf.set_allocator("managed") - + if "cupy" in sys.modules: + cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) + if "cuda" in sys.modules: + cuda.set_memory_manager(rmm.RMMNumbaManager) + assert(rmm.is_initialized()) + return use_gpu From 5df8ee9174d2e665e685a99ad69540f02f7d2109 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 24 May 2021 20:57:00 +0100 Subject: [PATCH 003/175] Import rmm package --- PopPUNK/network.py | 1 + PopPUNK/refine.py | 1 + PopPUNK/sparse_mst.py | 1 + PopPUNK/trees.py | 1 + PopPUNK/tsne.py | 1 + 5 files changed, 5 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ec2b5d53..2dbb6092 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -31,6 +31,7 @@ import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except 
ImportError as e: gpu_lib = False diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 430372a9..a3bb5777 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -32,6 +32,7 @@ import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index a8233465..1a2f86d0 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -19,6 +19,7 @@ import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False diff --git a/PopPUNK/trees.py b/PopPUNK/trees.py index f6683b5e..712170f5 100644 --- a/PopPUNK/trees.py +++ b/PopPUNK/trees.py @@ -19,6 +19,7 @@ import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False diff --git a/PopPUNK/tsne.py b/PopPUNK/tsne.py index 7068449f..0b171328 100644 --- a/PopPUNK/tsne.py +++ b/PopPUNK/tsne.py @@ -17,6 +17,7 @@ import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False From e7767cda6824e2d281a7808d352484233365eb50 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 24 May 2021 21:27:57 +0100 Subject: [PATCH 004/175] Update refinement model assignment --- PopPUNK/__main__.py | 2 +- PopPUNK/models.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index d259603b..f166cfa5 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -526,7 +526,7 @@ def main(): indivNetworks = {} for dist_type, slope in zip(['core', 'accessory'], [0, 1]): if args.indiv_refine == 'both' or args.indiv_refine == dist_type: - indivAssignments = model.assign(distMat, slope) + indivAssignments = model.assign(distMat, slope = slope) indivNetworks[dist_type] = \ construct_network_from_assignments(refList, queryList, diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 
cf15ce6a..a153d8a7 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -973,11 +973,8 @@ def assign(self, X, slope=None): Core and accessory distances slope (int) Override self.slope. Default - use self.slope - Set to 0 for a vertical line, 1 for a horizontal line, or 2 to use a slope - cpus (int) - Number of threads to use Returns: y (numpy.array) Cluster assignments by samples @@ -985,11 +982,13 @@ def assign(self, X, slope=None): if not self.fitted: raise RuntimeError("Trying to assign using an unfitted model") else: - if slope == 2 or (slope == None and self.slope == 2): + if slope == None: + slope = self.slope + if slope == 2: y = poppunk_refine.assignThreshold(X/self.scale, 2, self.optimal_x, self.optimal_y, self.threads) - elif slope == 0 or (slope == None and self.slope == 0): + elif slope == 0: y = poppunk_refine.assignThreshold(X/self.scale, 0, self.core_boundary, 0, self.threads) - elif slope == 1 or (slope == None and self.slope == 1): + elif slope == 1: y = poppunk_refine.assignThreshold(X/self.scale, 1, 0, self.accessory_boundary, self.threads) return y From 2e2ea8736fa2de2b73cc3bc7271a2ef4848bfd6b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 25 May 2021 08:25:51 +0100 Subject: [PATCH 005/175] Fix network weights processing --- PopPUNK/network.py | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2dbb6092..25304958 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -600,9 +600,6 @@ def process_previous_network(previous_network = None, previous_pkl = None, verte Whether weights in the G_df data frame should be included in the network distMat (2 column ndarray) Numpy array of pairwise distances - weights_type (str) - Measure to calculate from the distMat to use as edge weights in network - - options are core, accessory or euclidean distance previous_network (str) Name of file containing a previous network to be integrated into 
this new network @@ -647,7 +644,7 @@ def process_previous_network(previous_network = None, previous_pkl = None, verte return extra_sources, extra_targets, extra_weights def construct_network_from_edge_list(rlist, qlist, edge_list, - weights = None, distMat = None, weights_type = None, previous_network = None, previous_pkl = None, + weights = None, distMat = None, previous_network = None, previous_pkl = None, betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): """Construct an undirected network using a data frame of edges. Nodes are samples and edges where samples are within the same cluster @@ -665,9 +662,6 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, Whether weights in the G_df data frame should be included in the network distMat (2 column ndarray) Numpy array of pairwise distances - weights_type (str) - Measure to calculate from the distMat to use as edge weights in network - - options are core, accessory or euclidean distance previous_network (str) Name of file containing a previous network to be integrated into this new network @@ -692,8 +686,6 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, # data structures vertex_labels, self_comparison = initial_graph_properties(rlist, qlist) - if weights_type is not None: - weights = process_weights(distMat, weights_type) # Load previous network if previous_network is not None: @@ -720,7 +712,6 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, G = construct_network_from_df(rlist, qlist, G_df, weights = (weights is not None), distMat = distMat, - weights_type = weights_type, previous_network = previous_network, previous_pkl = previous_pkl, summarise = False, @@ -752,7 +743,7 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, return G def construct_network_from_df(rlist, qlist, G_df, - weights = False, distMat = None, weights_type = None, previous_network = None, previous_pkl = None, + weights = False, distMat = None, 
previous_network = None, previous_pkl = None, betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): """Construct an undirected network using a data frame of edges. Nodes are samples and edges where samples are within the same cluster @@ -770,9 +761,6 @@ def construct_network_from_df(rlist, qlist, G_df, Whether weights in the G_df data frame should be included in the network distMat (2 column ndarray) Numpy array of pairwise distances - weights_type (str) - Measure to calculate from the distMat to use as edge weights in network - - options are core, accessory or euclidean distance previous_network (str) Name of file containing a previous network to be integrated into this new network @@ -797,8 +785,6 @@ def construct_network_from_df(rlist, qlist, G_df, # data structures vertex_labels, self_comparison = initial_graph_properties(rlist, qlist) - if weights_type is not None: - G_df['weights'] = process_weights(distMat, weights_type) # Check df format is correct if weights: @@ -844,7 +830,6 @@ def construct_network_from_df(rlist, qlist, G_df, G = construct_network_from_edge_list(rlist, qlist, connections, weights = weights, distMat = distMat, - weights_type = weights_type, previous_network = previous_network, previous_pkl = previous_pkl, summarise = False, @@ -854,7 +839,7 @@ def construct_network_from_df(rlist, qlist, G_df, return G def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, - weights = None, weights_type = None, previous_network = None, previous_pkl = None, + weights = None, previous_network = None, previous_pkl = None, betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): """Construct an undirected network using a sparse matrix. 
Nodes are samples and edges where samples are within the same cluster @@ -872,9 +857,6 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, List of weights for each edge in the network distMat (2 column ndarray) Numpy array of pairwise distances - weights_type (str) - Measure to calculate from the distMat to use as edge weights in network - - options are core, accessory or euclidean distance previous_network (str) Name of file containing a previous network to be integrated into this new network @@ -906,7 +888,6 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, G_df['weights'] = sparse_input.data G = construct_network_from_df(rlist, qlist, G_df, weights = True, - weights_type = weights_type, previous_network = previous_network, previous_pkl = previous_pkl, betweenness_sample = betweenness_sample, @@ -962,18 +943,21 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = # Check GPU library use use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) - - # Convert edge indices to tuples - connections = poppunk_refine.generateTuples(assignments, within_label) + # Filter weights to only the relevant edges if weights is not None: weights = weights[assignments == within_label] - elif distMat is not None: + elif distMat is not None and weights_type is not None: distMat = distMat[assignments == within_label,:] + weights = process_weights(distMat, weights_type) + + # Convert edge indices to tuples + connections = poppunk_refine.generateTuples(assignments, within_label) + + # Construct network using edge list G = construct_network_from_edge_list(rlist, qlist, connections, weights = weights, distMat = distMat, - weights_type = weights_type, previous_network = previous_network, previous_pkl = previous_pkl, summarise = False, From fb8dce9bcccd6dfe8f1e82bece62ed6d3e5387be Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 25 May 2021 09:32:44 +0100 Subject: [PATCH 006/175] Fix distmat subsetting 
--- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 25304958..2f58e740 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -948,6 +948,8 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = if weights is not None: weights = weights[assignments == within_label] elif distMat is not None and weights_type is not None: + if type(assignments) == 'list': + assignments = np.array(assignments) distMat = distMat[assignments == within_label,:] weights = process_weights(distMat, weights_type) From d5ab4f0fe1e0e1859a4e8a7ffdabae53ba4c43d0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 25 May 2021 09:42:46 +0100 Subject: [PATCH 007/175] Fix list class check --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2f58e740..11713583 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -948,7 +948,7 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = if weights is not None: weights = weights[assignments == within_label] elif distMat is not None and weights_type is not None: - if type(assignments) == 'list': + if isinstance(assignments, list): assignments = np.array(assignments) distMat = distMat[assignments == within_label,:] weights = process_weights(distMat, weights_type) From 746f23624d0e6bdcaf8fe7e940bd9c147fc4e422 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 25 May 2021 09:48:46 +0100 Subject: [PATCH 008/175] Load GPU packages for visualisation --- PopPUNK/visualise.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 800af930..a3be2781 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -9,6 +9,15 @@ import numpy as np import scipy.sparse +try: + import cudf + import rmm + import cupy + from numba import cuda + gpu_lib = True +except ImportError as e: + gpu_lib = 
False + # required from v2.1.1 onwards (no mash support) import pp_sketchlib From 609249c38375f3064e60559916dba55f6678710d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 25 May 2021 09:50:23 +0100 Subject: [PATCH 009/175] Load cugraph --- PopPUNK/visualise.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index a3be2781..e0b53ef4 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -13,6 +13,7 @@ import cudf import rmm import cupy + import cugraph from numba import cuda gpu_lib = True except ImportError as e: From f03b3ee4db5368e69937fdc5f2648c571615f257 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 25 May 2021 09:57:43 +0100 Subject: [PATCH 010/175] Convert cupy array to list --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 11713583..6e1f800c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1694,7 +1694,7 @@ def cugraph_to_graph_tool(G, rlist): Graph tool network """ edge_df = G.view_edge_list() - edge_tuple = edge_df[['src', 'dst']].values + edge_tuple = edge_df[['src', 'dst']].values.to_list() edge_weights = None if 'weights' in edge_df.columns: edge_weights = edge_df['weights'].values_host From 3feecf6a92706b382d7c92f14184f10a5e80a786 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 25 May 2021 09:59:12 +0100 Subject: [PATCH 011/175] Fix function name --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6e1f800c..622f3352 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1694,7 +1694,7 @@ def cugraph_to_graph_tool(G, rlist): Graph tool network """ edge_df = G.view_edge_list() - edge_tuple = edge_df[['src', 'dst']].values.to_list() + edge_tuple = edge_df[['src', 'dst']].values.tolist() edge_weights = None if 'weights' in edge_df.columns: edge_weights = edge_df['weights'].values_host From 
5f549411221492ee95be6d2d492bfad8793ef5f0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 26 May 2021 08:28:37 +0100 Subject: [PATCH 012/175] Make sequence name processing consistent --- PopPUNK/plot.py | 2 +- PopPUNK/utils.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index d794ff98..fcdb03e8 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -594,7 +594,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, if queryNames is not None: colnames.append('Status') else: - sys.stderr.write("Do not recognise format for CSV writing") + sys.stderr.write("Do not recognise format for CSV writing\n") exit(1) # process epidemiological data diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 1f73048e..03c261e4 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -498,7 +498,7 @@ def readRfile(rFile, oneSeq=False): for sequence in rFields[1:]: sample_files.append(sequence) - # Take first of sequence list if using mash + # Take first of sequence list if oneSeq: if len(sample_files) > 1: sys.stderr.write("Multiple sequence found for " + rFields[0] + @@ -507,6 +507,9 @@ def readRfile(rFile, oneSeq=False): else: sequences.append(sample_files) + # Process names to ensure compatibility with downstream software + names = isolateNameToLabel(names) + if len(set(names)) != len(names): seen = set() dupes = set(x for x in names if x in seen or seen.add(x)) From 300a851064fb0df3f8d8989b819171dbc447d1ee Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 26 May 2021 08:32:48 +0100 Subject: [PATCH 013/175] Replace dots with underscores in names --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 03c261e4..0b5da31a 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -540,7 +540,7 @@ def isolateNameToLabel(names): """ # useful to have as a function in case we # want to remove certain characters - labels = 
[name.split('/')[-1].split('.')[0].replace(':','') for name in names] + labels = [name.split('/')[-1].replace('.','_')[0].replace(':','') for name in names] return labels From 7f7bb88da3cc8bd7e35677117b8d44bcd9b526d9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 26 May 2021 08:34:24 +0100 Subject: [PATCH 014/175] Replace dots with underscores in names correctly --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 0b5da31a..da495cc4 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -540,7 +540,7 @@ def isolateNameToLabel(names): """ # useful to have as a function in case we # want to remove certain characters - labels = [name.split('/')[-1].replace('.','_')[0].replace(':','') for name in names] + labels = [name.split('/')[-1].replace('.','_').replace(':','') for name in names] return labels From 91fe22a7f5528b70db5867434ddb14d2b0452769 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 7 Jun 2021 11:47:20 +0100 Subject: [PATCH 015/175] Enable visualisation of individuall-refined clusters --- PopPUNK/visualise.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index e0b53ef4..38418ed1 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -332,14 +332,28 @@ def generate_visualisations(query_db, if model.type == "lineage": mode = "lineages" suffix = "_lineages.csv" - if model.indiv_fitted: - sys.stderr.write("Note: Individual (core/accessory) fits found, but " - "visualisation only supports combined boundary fit\n") prev_clustering = os.path.basename(model_file) + '/' + os.path.basename(model_file) + suffix isolateClustering = readIsolateTypeFromCsv(prev_clustering, mode = mode, return_dict = True) + # Add individual refinement clusters if they exist + if model.indiv_fitted: + core_suffix = '_core_clusters.csv' + core_clustering = os.path.basename(model_file) + '/' + 
os.path.basename(model_file) + core_suffix + if os.path.isfile(core_clustering): + core_isolateClustering = readIsolateTypeFromCsv(core_clustering, + mode = mode, + return_dict = True) + isolateClustering['Core'] = core_isolateClustering['Cluster'] + acc_suffix = '_accessory_clusters.csv' + accessory_clustering = os.path.basename(model_file) + '/' + os.path.basename(model_file) + acc_suffix + if os.path.isfile(accessory_clustering): + accessory_isolateClustering = readIsolateTypeFromCsv(accessory_clustering, + mode = mode, + return_dict = True) + isolateClustering['Accessory'] = accessory_isolateClustering['Cluster'] + # Join clusters with query clusters if required if not self: if previous_query_clustering is not None: From 7445d4c95c50b42f550b6520f5f03b4bc1ac935b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 7 Jun 2021 11:53:37 +0100 Subject: [PATCH 016/175] Simplify indiv-refine reading loop --- PopPUNK/visualise.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 38418ed1..2c90252c 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -339,20 +339,13 @@ def generate_visualisations(query_db, # Add individual refinement clusters if they exist if model.indiv_fitted: - core_suffix = '_core_clusters.csv' - core_clustering = os.path.basename(model_file) + '/' + os.path.basename(model_file) + core_suffix - if os.path.isfile(core_clustering): - core_isolateClustering = readIsolateTypeFromCsv(core_clustering, - mode = mode, - return_dict = True) - isolateClustering['Core'] = core_isolateClustering['Cluster'] - acc_suffix = '_accessory_clusters.csv' - accessory_clustering = os.path.basename(model_file) + '/' + os.path.basename(model_file) + acc_suffix - if os.path.isfile(accessory_clustering): - accessory_isolateClustering = readIsolateTypeFromCsv(accessory_clustering, + for type, suffix in zip(['Core','Accessory'],['_core_clusters.csv','_accessory_clusters.csv']): + 
indiv_clustering = os.path.basename(model_file) + '/' + os.path.basename(model_file) + suffix + if os.path.isfile(indiv_clustering): + indiv_isolateClustering = readIsolateTypeFromCsv(indiv_clustering, mode = mode, return_dict = True) - isolateClustering['Accessory'] = accessory_isolateClustering['Cluster'] + isolateClustering[type] = indiv_isolateClustering['Cluster'] # Join clusters with query clusters if required if not self: From a0a0dadc30260b204243b290508f7a26208af1db Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 7 Jun 2021 21:36:43 +0100 Subject: [PATCH 017/175] Save each refined graph to a different file --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index f166cfa5..21a37c00 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -542,7 +542,7 @@ def main(): use_gpu = args.gpu_graph) save_network(indivNetworks[dist_type], prefix = output, - suffix = '_graph', + suffix = '_' + dist_type + '_graph', use_gpu = args.gpu_graph) #******************************# From e3d3b3c45ef90981455bcdcc6914814bd8364e87 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 7 Jun 2021 22:36:32 +0100 Subject: [PATCH 018/175] Enable querying with indiv-refine --- PopPUNK/assign.py | 54 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index fb933f9d..e36e45a6 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -238,9 +238,17 @@ def assign_query(dbFuncs, strand_preserved, weights = weights, threads = threads, use_gpu = gpu_graph) + if core_only: + output_fn = output + "/" + os.path.basename(output) + '_core' + elif accessory_only: + output_fn = output + "/" + os.path.basename(output) + '_accessory' + else: + output_fn = output + "/" + os.path.basename(output) + isolateClustering = \ - {'combined': printClusters(genomeNetwork, rNames + qNames, - output + "/" + 
os.path.basename(output), + {'combined': printClusters(genomeNetwork, + rNames + qNames, + output_fn, old_cluster_file, external_clustering, write_references or update_db, @@ -265,7 +273,16 @@ def assign_query(dbFuncs, model.outPrefix = os.path.basename(output) model.save() else: - save_network(genomeNetwork, prefix = output, suffix = '_graph', use_gpu = gpu_graph) + if core_only: + graph_suffix = '_core_graph' + elif accessory_only: + graph_suffix = '_accessory_graph' + else: + graph_suffix = '_graph' + save_network(genomeNetwork, + prefix = output, + suffix = graph_suffix, + use_gpu = gpu_graph) # Load the previous distances refList_loaded, refList_copy, self, rrDistMat = \ @@ -313,11 +330,32 @@ def assign_query(dbFuncs, if (len(names_to_remove) > 0): # This function also writes out the new ref distance matrix + if core_only: + db_suffix = "_core.refs.dists" + elif accessory_only: + db_suffix = "_accessory.refs.dists" + else: + graph_suffix = ".refs.dists" postpruning_combined_seq, newDistMat = \ prune_distance_matrix(combined_seq, names_to_remove, complete_distMat, output + "/" + os.path.basename(output) + ".refs.dists") - save_network(genomeNetwork, prefix = output, suffix = 'refs_graph', use_gpu = gpu_graph) + if core_only: + graph_suffix = '_core_refs_graph' + elif accessory_only: + graph_suffix = '_accessory_refs_graph' + else: + graph_suffix = '_refs_graph' + save_network(genomeNetwork, + prefix = output, + suffix = graph_suffix, + use_gpu = gpu_graph) removeFromDB(output, output, names_to_remove) + if core_only: + db_suffix = "_core.refs.h5" + elif accessory_only: + db_suffix = "_accessory.refs.h5" + else: + graph_suffix = ".refs.h5" os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", output + "/" + os.path.basename(output) + ".refs.h5") @@ -329,7 +367,13 @@ def assign_query(dbFuncs, if model.type == 'lineage': save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) else: - 
save_network(genomeNetwork, prefix = output, suffix = '_graph', use_gpu = gpu_graph) + if core_only: + graph_suffix = '_core_graph' + elif accessory_only: + graph_suffix = '_accessory_graph' + else: + graph_suffix = '_graph' + save_network(genomeNetwork, prefix = output, suffix = graph_suffix, use_gpu = gpu_graph) return(isolateClustering) From c15b6ba93e0c20473ec6b9c512241a6a599fca84 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 8 Jun 2021 13:50:00 +0100 Subject: [PATCH 019/175] Enable querying of indiv-refine models --- PopPUNK/assign.py | 512 +++++++++++++++++++++++---------------------- PopPUNK/network.py | 59 ++++-- 2 files changed, 305 insertions(+), 266 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index e36e45a6..bcd4fd6b 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -43,8 +43,8 @@ def assign_query(dbFuncs, strand_preserved, previous_clustering, external_clustering, - core_only, - accessory_only, + core, + accessory, gpu_sketch, gpu_dist, gpu_graph, @@ -116,264 +116,272 @@ def assign_query(dbFuncs, # Find distances to reference db kmers, sketch_sizes, codon_phased = readDBParams(ref_db) - # Find distances vs ref seqs - rNames = [] - use_ref_graph = \ - os.path.isfile(ref_db + "/" + os.path.basename(ref_db) + ".refs") \ - and not update_db and model.type != 'lineage' - if use_ref_graph: - with open(ref_db + "/" + os.path.basename(ref_db) + ".refs") as refFile: - for reference in refFile: - rNames.append(reference.rstrip()) - else: - if os.path.isfile(distances + ".pkl"): - rNames = readPickle(distances, enforce_self = True, distances=False)[0] - elif update_db: - sys.stderr.write("Reference distances missing, cannot use --update-db\n") - sys.exit(1) + # Iterate through different types of model fit with a refined model when specified + # Core and accessory assignments use the same model and same overall set of distances + # but have different networks, references, reference distances and assignments + fit_type_list = 
['original'] + fit_string_list = [''] + if core: + fit_type_list.append('core') + fit_string_list.append('_core') + if accessory: + fit_type_list.append('accessory') + fit_string_list.append('_accessory') + + for fit_type, fit_string in zip(fit_type_list, fit_string_list): + # Find distances vs ref seqs + rNames = [] + use_ref_graph = \ + os.path.isfile(ref_db + "/" + os.path.basename(ref_db) + fit_string + ".refs") \ + and not update_db and model.type != 'lineage' + if use_ref_graph: + with open(ref_db + "/" + os.path.basename(ref_db) + fit_string + ".refs") as refFile: + for reference in refFile: + rNames.append(reference.rstrip()) else: - rNames = getSeqsInDb(ref_db + "/" + os.path.basename(ref_db) + ".h5") - # construct database - if (web and json_sketch): - qNames = sketch_to_hdf5(json_sketch, output) - else: - # construct database - createDatabaseDir(output, kmers) - qNames = constructDatabase(q_files, - kmers, - sketch_sizes, - output, - threads, - overwrite, - codon_phased = codon_phased, - calc_random = False, - use_gpu = gpu_sketch, - deviceid = deviceid) - # run query - qrDistMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = ref_db, - queryPrefix = output, - klist = kmers, - self = False, - number_plot_fits = plot_fit, - threads = threads, - use_gpu = gpu_dist) - # QC distance matrix - if qc_dict['run_qc']: - seq_names_passing = qcDistMat(qrDistMat, rNames, qNames, ref_db, output, qc_dict)[0] - else: - seq_names_passing = rNames + qNames - - # Load the network based on supplied options - genomeNetwork, old_cluster_file = \ - fetchNetwork(prev_clustering, - model, - rNames, - ref_graph = use_ref_graph, - core_only = core_only, - accessory_only = accessory_only, - use_gpu = gpu_graph) - - if model.type == 'lineage': - # Assign lineages by calculating query-query information - addRandom(output, qNames, kmers, strand_preserved, overwrite, threads) - qqDistMat = queryDatabase(rNames = qNames, - qNames = qNames, - dbPrefix = output, - 
queryPrefix = output, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads, - use_gpu = gpu_dist) - model.extend(qqDistMat, qrDistMat) - - genomeNetwork = {} - isolateClustering = defaultdict(dict) - for rank in model.ranks: - assignment = model.assign(rank) - # Overwrite the network loaded above - if graph_weights: - weights = model.edge_weights(rank) + if os.path.isfile(distances + ".pkl"): + rNames = readPickle(distances, enforce_self = True, distances=False)[0] + elif update_db: + sys.stderr.write("Reference distances missing, cannot use --update-db\n") + sys.exit(1) else: - weights = None - genomeNetwork[rank] = construct_network_from_edge_list(rNames + qNames, - rNames + qNames, - edge_list = assignment, - weights = weights, - use_gpu = gpu_graph) - - isolateClustering[rank] = \ - printClusters(genomeNetwork[rank], - rNames + qNames, - printCSV = False, - use_gpu = gpu_graph) - - overall_lineage = createOverallLineage(model.ranks, isolateClustering) - writeClusterCsv( - output + "/" + os.path.basename(output) + '_lineages.csv', - rNames + qNames, - rNames + qNames, - overall_lineage, - output_format = 'phandango', - epiCsv = None, - queryNames = qNames, - suffix = '_Lineage') - - else: - # Assign these distances as within or between strain - queryAssignments = model.assign(qrDistMat) - - # Assign clustering by adding to network - if graph_weights: - weights = qrDistMat - else: - weights = None - - genomeNetwork, qqDistMat = \ - addQueryToNetwork(dbFuncs, rNames, qNames, - genomeNetwork, kmers, - queryAssignments, model, output, update_db, - strand_preserved, - weights = weights, threads = threads, use_gpu = gpu_graph) - - if core_only: - output_fn = output + "/" + os.path.basename(output) + '_core' - elif accessory_only: - output_fn = output + "/" + os.path.basename(output) + '_accessory' - else: - output_fn = output + "/" + os.path.basename(output) - - isolateClustering = \ - {'combined': printClusters(genomeNetwork, - rNames + qNames, 
- output_fn, - old_cluster_file, - external_clustering, - write_references or update_db, - use_gpu = gpu_graph)} - - # Update DB as requested - dists_out = output + "/" + os.path.basename(output) + ".dists" - if update_db: - # Check new sequences pass QC before adding them - if len(set(seq_names_passing).difference(rNames + qNames)) > 0: - sys.stderr.write("Queries contained outlier distances, " - "not updating database\n") + rNames = getSeqsInDb(ref_db + "/" + os.path.basename(ref_db) + ".h5") + # construct database - use a single database directory for all query outputs + if (web and json_sketch): + qNames = sketch_to_hdf5(json_sketch, output) + elif (fit_type == 'original'): + # construct database + createDatabaseDir(output, kmers) + qNames = constructDatabase(q_files, + kmers, + sketch_sizes, + output, + threads, + overwrite, + codon_phased = codon_phased, + calc_random = False, + use_gpu = gpu_sketch, + deviceid = deviceid) + if (fit_type == 'original'): + # run query + qrDistMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = ref_db, + queryPrefix = output, + klist = kmers, + self = False, + number_plot_fits = plot_fit, + threads = threads, + use_gpu = gpu_dist) + elif (fit_type != 'original' and use_ref_graph == False): + # Only re-run query if references are being used + qrDistMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = ref_db, + queryPrefix = output, + klist = kmers, + self = False, + number_plot_fits = plot_fit, + threads = threads, + use_gpu = gpu_dist) + + # QC distance matrix + if qc_dict['run_qc']: + seq_names_passing = qcDistMat(qrDistMat, rNames, qNames, ref_db, output, qc_dict)[0] else: - sys.stderr.write("Updating reference database to " + output + "\n") + seq_names_passing = rNames + qNames + + # Load the network based on supplied options + genomeNetwork, old_cluster_file = \ + fetchNetwork(prev_clustering, + model, + rNames, + ref_graph = use_ref_graph, + core_only = (fit_type == 'core'), + 
accessory_only = (fit_type == 'accessory'), + use_gpu = gpu_graph) - # Update the network + ref list (everything) - joinDBs(ref_db, output, output, - {"threads": threads, "strand_preserved": strand_preserved}) if model.type == 'lineage': - save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) - # Save sparse distance matrices and updated model - model.outPrefix = os.path.basename(output) - model.save() + # Assign lineages by calculating query-query information + addRandom(output, qNames, kmers, strand_preserved, overwrite, threads) + qqDistMat = queryDatabase(rNames = qNames, + qNames = qNames, + dbPrefix = output, + queryPrefix = output, + klist = kmers, + self = True, + number_plot_fits = 0, + threads = threads, + use_gpu = gpu_dist) + model.extend(qqDistMat, qrDistMat) + + genomeNetwork = {} + isolateClustering = defaultdict(dict) + for rank in model.ranks: + assignment = model.assign(rank) + # Overwrite the network loaded above + if graph_weights: + weights = model.edge_weights(rank) + else: + weights = None + genomeNetwork[rank] = construct_network_from_edge_list(rNames + qNames, + rNames + qNames, + edge_list = assignment, + weights = weights, + use_gpu = gpu_graph) + + isolateClustering[rank] = \ + printClusters(genomeNetwork[rank], + rNames + qNames, + printCSV = False, + use_gpu = gpu_graph) + + overall_lineage = createOverallLineage(model.ranks, isolateClustering) + writeClusterCsv( + output + "/" + os.path.basename(output) + '_lineages.csv', + rNames + qNames, + rNames + qNames, + overall_lineage, + output_format = 'phandango', + epiCsv = None, + queryNames = qNames, + suffix = '_Lineage') + else: - if core_only: - graph_suffix = '_core_graph' - elif accessory_only: - graph_suffix = '_accessory_graph' + # Assign these distances as within or between strain + if fit_type == 'original': + queryAssignments = model.assign(qrDistMat) + dist_type = 'euclidean' + elif fit_type == 'core': + queryAssignments = 
model.assign(qrDistMat, slope = 0) + dist_type = 'core' + elif fit_type == 'accessory': + queryAssignments = model.assign(qrDistMat, slope = 1) + dist_type = 'accessory' + + # Assign clustering by adding to network + if graph_weights: + weights = qrDistMat else: - graph_suffix = '_graph' - save_network(genomeNetwork, - prefix = output, - suffix = graph_suffix, - use_gpu = gpu_graph) - - # Load the previous distances - refList_loaded, refList_copy, self, rrDistMat = \ - readPickle(distances, - enforce_self = True) - # This should now always be true, otherwise both qrDistMat and sparse matrix - # may need reordering - assert(refList_loaded == rNames) - - combined_seq, core_distMat, acc_distMat = \ - update_distance_matrices(rNames, rrDistMat, - qNames, qrDistMat, - qqDistMat, threads = threads) - assert combined_seq == rNames + qNames - - # Get full distance matrix and save - complete_distMat = \ - np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), - pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) - storePickle(combined_seq, combined_seq, True, complete_distMat, dists_out) - - # Copy model if needed - if output != model.outPrefix: - model.copy(output) - - # Clique pruning - if model.type != 'lineage': - dbOrder = rNames + qNames - newRepresentativesIndices, newRepresentativesNames, \ - newRepresentativesFile, genomeNetwork = \ - extractReferences(genomeNetwork, - dbOrder, - output, - existingRefs = rNames, - type_isolate = qc_dict['type_isolate'], - threads = threads, - use_gpu = gpu_graph) - # intersection that maintains order - newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] - - # could also have newRepresentativesNames in this diff (should be the same) - but want - # to ensure consistency with the network in case of bad input/bugs - nodes_to_remove = set(range(len(dbOrder))).difference(newRepresentativesIndices) - names_to_remove = [dbOrder[n] for n in nodes_to_remove] - - if 
(len(names_to_remove) > 0): - # This function also writes out the new ref distance matrix - if core_only: - db_suffix = "_core.refs.dists" - elif accessory_only: - db_suffix = "_accessory.refs.dists" - else: - graph_suffix = ".refs.dists" - postpruning_combined_seq, newDistMat = \ - prune_distance_matrix(combined_seq, names_to_remove, complete_distMat, - output + "/" + os.path.basename(output) + ".refs.dists") - if core_only: - graph_suffix = '_core_refs_graph' - elif accessory_only: - graph_suffix = '_accessory_refs_graph' - else: - graph_suffix = '_refs_graph' + weights = None + genomeNetwork, qqDistMat = \ + addQueryToNetwork(dbFuncs, + rNames, + qNames, + genomeNetwork, + kmers, + queryAssignments, + model, + output, + distance_type = dist_type, + queryQuery = update_db, + strand_preserved = strand_preserved, + weights = weights, + threads = threads, + use_gpu = gpu_graph) + output_fn = output + "/" + os.path.basename(output) + fit_string + isolateClustering = \ + {'combined': printClusters(genomeNetwork, + rNames + qNames, + output_fn, + old_cluster_file, + external_clustering, + write_references or update_db, + use_gpu = gpu_graph)} + # Update DB as requested + dists_out = output + "/" + os.path.basename(output) + ".dists" + if update_db: + # Check new sequences pass QC before adding them + if len(set(seq_names_passing).difference(rNames + qNames)) > 0: + sys.stderr.write("Queries contained outlier distances, " + "not updating database\n") + else: + sys.stderr.write("Updating reference database to " + output + "\n") + # Update the network + ref list (everything) - no need to duplicate for core/accessory + if fit_type == 'original': + joinDBs(ref_db, output, output, + {"threads": threads, "strand_preserved": strand_preserved}) + if model.type == 'lineage': + save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) + # Save sparse distance matrices and updated model + model.outPrefix = os.path.basename(output) + 
model.save() + else: + graph_suffix = fit_string + '_graph' save_network(genomeNetwork, prefix = output, suffix = graph_suffix, use_gpu = gpu_graph) - removeFromDB(output, output, names_to_remove) - if core_only: - db_suffix = "_core.refs.h5" - elif accessory_only: - db_suffix = "_accessory.refs.h5" - else: - graph_suffix = ".refs.h5" - os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", - output + "/" + os.path.basename(output) + ".refs.h5") - - # ensure sketch and distMat order match - assert postpruning_combined_seq == rNames + newQueries - else: - storePickle(rNames, qNames, False, qrDistMat, dists_out) - if save_partial_query_graph: - if model.type == 'lineage': - save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) - else: - if core_only: - graph_suffix = '_core_graph' - elif accessory_only: - graph_suffix = '_accessory_graph' + # Load the previous distances + refList_loaded, refList_copy, self, rrDistMat = \ + readPickle(distances, + enforce_self = True) + # This should now always be true, otherwise both qrDistMat and sparse matrix + # may need reordering + assert(refList_loaded == rNames) + combined_seq, core_distMat, acc_distMat = \ + update_distance_matrices(rNames, rrDistMat, + qNames, qrDistMat, + qqDistMat, threads = threads) + assert combined_seq == rNames + qNames + + # Get full distance matrix and save + complete_distMat = \ + np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), + pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) + storePickle(combined_seq, combined_seq, True, complete_distMat, dists_out) + + # Copy model if needed + if output != model.outPrefix and fit_type == 'original': + model.copy(output) + + # Clique pruning + if model.type != 'lineage': + dbOrder = rNames + qNames + newRepresentativesIndices, newRepresentativesNames, \ + newRepresentativesFile, genomeNetwork = \ + extractReferences(genomeNetwork, + dbOrder, + output, + outSuffix 
= fit_string, + existingRefs = rNames, + type_isolate = qc_dict['type_isolate'], + threads = threads, + use_gpu = gpu_graph) + # intersection that maintains order + newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] + + # could also have newRepresentativesNames in this diff (should be the same) - but want + # to ensure consistency with the network in case of bad input/bugs + nodes_to_remove = set(range(len(dbOrder))).difference(newRepresentativesIndices) + names_to_remove = [dbOrder[n] for n in nodes_to_remove] + + if (len(names_to_remove) > 0): + # This function also writes out the new ref distance matrix + dists_suffix = fit_string + '.refs.dists' + postpruning_combined_seq, newDistMat = \ + prune_distance_matrix(combined_seq, names_to_remove, complete_distMat, + output + "/" + os.path.basename(output) + dists_suffix) + graph_suffix = fit_string + '_refs_graph' + save_network(genomeNetwork, + prefix = output, + suffix = graph_suffix, + use_gpu = gpu_graph) + removeFromDB(output, output, names_to_remove) + db_suffix = fit_string + '.refs.h5' + os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", + output + "/" + os.path.basename(output) + db_suffix) + + # ensure sketch and distMat order match + assert postpruning_combined_seq == rNames + newQueries + else: + storePickle(rNames, qNames, False, qrDistMat, dists_out) + if save_partial_query_graph: + if model.type == 'lineage': + save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) else: - graph_suffix = '_graph' - save_network(genomeNetwork, prefix = output, suffix = graph_suffix, use_gpu = gpu_graph) + graph_suffix = fit_string + '_graph' + save_network(genomeNetwork, prefix = output, suffix = graph_suffix, use_gpu = gpu_graph) return(isolateClustering) @@ -449,10 +457,10 @@ def get_options(): queryingGroup.add_argument('--previous-clustering', help='Directory containing previous cluster definitions ' 'and network [default = use 
that in the directory ' 'containing the model]', type = str) - queryingGroup.add_argument('--core-only', help='(with a \'refine\' model) ' + queryingGroup.add_argument('--core', help='(with a \'refine\' model) ' 'Use a core-distance only model for assigning queries ' '[default = False]', default=False, action='store_true') - queryingGroup.add_argument('--accessory-only', help='(with a \'refine\' or \'lineage\' model) ' + queryingGroup.add_argument('--accessory', help='(with a \'refine\' or \'lineage\' model) ' 'Use an accessory-distance only model for assigning queries ' '[default = False]', default=False, action='store_true') @@ -577,8 +585,8 @@ def main(): args.strand_preserved, args.previous_clustering, args.external_clustering, - args.core_only, - args.accessory_only, + args.core, + args.accessory, args.gpu_sketch, args.gpu_dist, args.gpu_graph, diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 622f3352..4ac9dd84 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -215,7 +215,7 @@ def cliquePrune(component, graph, reference_indices, components_list): ref_list = getCliqueRefs(subgraph, refs) return(list(ref_list)) -def extractReferences(G, dbOrder, outPrefix, type_isolate = None, +def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None, existingRefs = None, threads = 1, use_gpu = False): """Extract references for each cluster based on cliques @@ -227,7 +227,9 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, dbOrder (list) The order of files in the sketches, so returned references are in the same order outPrefix (str) - Prefix for output file (.refs will be appended) + Prefix for output file + outSuffix (str) + Suffix for output file (.refs will be appended) type_isolate (str) Isolate to be included in set of references existingRefs (list) @@ -409,24 +411,26 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # Order found references as in sketch files reference_names = 
[dbOrder[int(x)] for x in sorted(reference_indices)] - refFileName = writeReferences(reference_names, outPrefix) + refFileName = writeReferences(reference_names, outPrefix, outSuffix = outSuffix) return reference_indices, reference_names, refFileName, G_ref -def writeReferences(refList, outPrefix): +def writeReferences(refList, outPrefix, outSuffix = ""): """Writes chosen references to file Args: refList (list) Reference names to write outPrefix (str) - Prefix for output file (.refs will be appended) + Prefix for output file + outSuffix (str) + Suffix for output file (.refs will be appended) Returns: refFileName (str) The name of the file references were written to """ # write references to file - refFileName = outPrefix + "/" + os.path.basename(outPrefix) + ".refs" + refFileName = outPrefix + "/" + os.path.basename(outPrefix) + outSuffix + ".refs" with open(refFileName, 'w') as rFile: for ref in refList: rFile.write(ref + '\n') @@ -1078,8 +1082,8 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa return(metrics, scores) def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, - assignments, model, queryDB, queryQuery = False, - strand_preserved = False, weights = None, threads = 1, + assignments, model, queryDB, distance_type = 'euclidean', + queryQuery = False, strand_preserved = False, weights = None, threads = 1, use_gpu = False): """Finds edges between queries and items in the reference database, and modifies the network to include them. 
@@ -1101,6 +1105,8 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, Model fitted to reference database queryDB (str) Query database location + distance_type (str) + Distance type to use as weights in network queryQuery (bool) Add in all query-query distances (default = False) @@ -1137,7 +1143,12 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, if assignment == model.within_label: # query index needs to be adjusted for existing vertices in network if weights is not None: - dist = np.linalg.norm(weights[row_idx, :]) + if distance_type == 'core': + dist = weights[row_idx, 0] + elif distance_type == 'accessory': + dist = weights[row_idx, 1] + else: + dist = np.linalg.norm(weights[row_idx, :]) edge_tuple = (ref, query + ref_count, dist) else: edge_tuple = (ref, query + ref_count) @@ -1160,11 +1171,21 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, number_plot_fits = 0, threads = threads) - queryAssignation = model.assign(qqDistMat) + if distance_type == 'core': + queryAssignation = model.assign(qqDistMat, slope = 0) + elif distance_type == 'accessory': + queryAssignation = model.assign(qqDistMat, slope = 1) + else: + queryAssignation = model.assign(qqDistMat) for row_idx, (assignment, (ref, query)) in enumerate(zip(queryAssignation, listDistInts(qList, qList, self = True))): if assignment == model.within_label: if weights is not None: - dist = np.linalg.norm(qqDistMat[row_idx, :]) + if distance_type == 'core': + dist = weights[row_idx, 0] + elif distance_type == 'accessory': + dist = weights[row_idx, 1] + else: + dist = np.linalg.norm(weights[row_idx, :]) edge_tuple = (ref + ref_count, query + ref_count, dist) else: edge_tuple = (ref + ref_count, query + ref_count) @@ -1189,8 +1210,13 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, self = True, number_plot_fits = 0, threads = threads) - - queryAssignation = model.assign(qqDistMat) + + if distance_type == 'core': + queryAssignation = model.assign(qqDistMat, slope = 0) + elif 
distance_type == 'accessory': + queryAssignation = model.assign(qqDistMat, slope = 1) + else: + queryAssignation = model.assign(qqDistMat) # identify any links between queries and store in the same links dict # links dict now contains lists of links both to original database and new queries @@ -1198,7 +1224,12 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, for row_idx, (assignment, (query1, query2)) in enumerate(zip(queryAssignation, iterDistRows(qList, qList, self = True))): if assignment == model.within_label: if weights is not None: - dist = np.linalg.norm(qqDistMat[row_idx, :]) + if distance_type == 'core': + dist = weights[row_idx, 0] + elif distance_type == 'accessory': + dist = weights[row_idx, 1] + else: + dist = np.linalg.norm(weights[row_idx, :]) edge_tuple = (query_indices[query1], query_indices[query2], dist) else: edge_tuple = (query_indices[query1], query_indices[query2]) From a05b92cce46c73e9be21f1f1aa1ebfad7d4794a6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 8 Jun 2021 14:16:21 +0100 Subject: [PATCH 020/175] Enable output of separate networks with indiv-refine models --- PopPUNK/__main__.py | 60 +++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 21a37c00..b11e10ec 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -553,26 +553,48 @@ def main(): # extract limited references from clique by default # (this no longer loses information and should generally be kept on) if model.type != "lineage": - newReferencesIndices, newReferencesNames, newReferencesFile, genomeNetwork = \ - extractReferences(genomeNetwork, - refList, - output, - type_isolate = qc_dict['type_isolate'], - threads = args.threads, + dist_type_list = ['original'] + dist_string_list = [''] + if args.indiv_refine == 'both' or args.indiv_refine == 'core': + dist_type_list.append('core') + dist_string_list.append('_core') + if args.indiv_refine == 'both' 
or args.indiv_refine == 'accessory': + dist_type_list.append('accessory') + dist_string_list.append('_accessory') + # Iterate through different network types + for dist_type, dist_string in zip(dist_type_list, dist_string_list): + if dist_type == 'original': + network_for_refs = genomeNetwork + elif dist_type == 'core': + network_for_refs = indivNetworks[dist_type] + elif dist_type == 'accessory': + network_for_refs = indivNetworks[dist_type] + newReferencesIndices, newReferencesNames, newReferencesFile, genomeNetwork = \ + extractReferences(network_for_refs, + refList, + output, + outSuffix = dist_string, + type_isolate = qc_dict['type_isolate'], + threads = args.threads, + use_gpu = args.gpu_graph) + nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices) + names_to_remove = [refList[n] for n in nodes_to_remove] + + if (len(names_to_remove) > 0): + # Save reference distances + dists_suffix = dist_string + '.refs.dists' + prune_distance_matrix(refList, names_to_remove, distMat, + output + "/" + os.path.basename(output) + dists_suffix) + # Save reference network + graphs_suffix = dist_string + '.refs_graph' + save_network(genomeNetwork, + prefix = output, + suffix = graphs_suffix, use_gpu = args.gpu_graph) - nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices) - names_to_remove = [refList[n] for n in nodes_to_remove] - - if (len(names_to_remove) > 0): - # Save reference distances - prune_distance_matrix(refList, names_to_remove, distMat, - output + "/" + os.path.basename(output) + ".refs.dists") - # Save reference network - save_network(genomeNetwork, prefix = output, suffix = ".refs_graph", - use_gpu = args.gpu_graph) - removeFromDB(args.ref_db, output, names_to_remove) - os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", - output + "/" + os.path.basename(output) + ".refs.h5") + db_suffix = dist_string + '.refs.h5' + removeFromDB(args.ref_db, output, names_to_remove) + os.rename(output + "/" + 
os.path.basename(output) + '.tmp.h5', + output + "/" + os.path.basename(output) + db_suffix) sys.stderr.write("\nDone\n") From a7406f3b2aeaab0254c89595ce6e02eb067696f6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 8 Jun 2021 16:58:17 +0100 Subject: [PATCH 021/175] Change processing of reference networks --- PopPUNK/assign.py | 8 ++++++-- PopPUNK/network.py | 14 +++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index bcd4fd6b..825d0a7f 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -132,10 +132,10 @@ def assign_query(dbFuncs, # Find distances vs ref seqs rNames = [] use_ref_graph = \ - os.path.isfile(ref_db + "/" + os.path.basename(ref_db) + fit_string + ".refs") \ + os.path.isfile(model_prefix + "/" + os.path.basename(model_prefix) + fit_string + ".refs") \ and not update_db and model.type != 'lineage' if use_ref_graph: - with open(ref_db + "/" + os.path.basename(ref_db) + fit_string + ".refs") as refFile: + with open(model_prefix + "/" + os.path.basename(model_prefix) + fit_string + ".refs") as refFile: for reference in refFile: rNames.append(reference.rstrip()) else: @@ -337,7 +337,10 @@ def assign_query(dbFuncs, # Clique pruning if model.type != 'lineage': + dbOrder = rNames + qNames + + # Extract references from graph newRepresentativesIndices, newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ extractReferences(genomeNetwork, @@ -348,6 +351,7 @@ def assign_query(dbFuncs, type_isolate = qc_dict['type_isolate'], threads = threads, use_gpu = gpu_graph) + # intersection that maintains order newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4ac9dd84..f14fb314 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -96,12 +96,16 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, graph_suffix = '.gt' if core_only and model.type == 'refine': - model.slope = 0 - 
network_file = dir_prefix + '_core_graph' + graph_suffix + if ref_graph: + network_file = dir_prefix + '_core.refs_graph' + graph_suffix + else: + network_file = dir_prefix + '_core_graph' + graph_suffix cluster_file = dir_prefix + '_core_clusters.csv' elif accessory_only and model.type == 'refine': - model.slope = 1 - network_file = dir_prefix + '_accessory_graph' + graph_suffix + if ref_graph: + network_file = dir_prefix + '_accessory.refs_graph' + graph_suffix + else: + network_file = dir_prefix + '_accessory_graph' + graph_suffix cluster_file = dir_prefix + '_accessory_clusters.csv' else: if ref_graph and os.path.isfile(dir_prefix + '.refs_graph' + graph_suffix): @@ -114,6 +118,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, "a refined fit. Using the combined distances.\n") # Load network file + print("Loading network from file " + network_file) genomeNetwork = load_network_file(network_file, use_gpu = use_gpu) # Ensure all in dists are in final network @@ -434,7 +439,6 @@ def writeReferences(refList, outPrefix, outSuffix = ""): with open(refFileName, 'w') as rFile: for ref in refList: rFile.write(ref + '\n') - return refFileName def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, From d6477a3d1938766829502a6d2ff99725f79a3328 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 8 Jun 2021 19:19:30 +0100 Subject: [PATCH 022/175] Fix reference expansion on query --- PopPUNK/assign.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 825d0a7f..330b937c 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -340,6 +340,11 @@ def assign_query(dbFuncs, dbOrder = rNames + qNames + existing_ref_list = [] + with open(model_prefix + "/" + os.path.basename(model_prefix) + fit_string + ".refs") as refFile: + for reference in refFile: + existing_ref_list.append(reference.rstrip()) + # Extract references from graph newRepresentativesIndices, 
newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ @@ -347,7 +352,7 @@ assign_query(dbFuncs, dbOrder, output, outSuffix = fit_string, - existingRefs = rNames, + existingRefs = existing_ref_list, type_isolate = qc_dict['type_isolate'], threads = threads, use_gpu = gpu_graph) @@ -377,7 +382,7 @@ assign_query(dbFuncs, output + "/" + os.path.basename(output) + db_suffix) # ensure sketch and distMat order match - assert postpruning_combined_seq == rNames + newQueries + assert postpruning_combined_seq == existing_ref_list + newQueries else: storePickle(rNames, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: From ac2b17a445287f28667e805192fbbc70fbcfe488 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 8 Jun 2021 22:09:52 +0100 Subject: [PATCH 023/175] Store indiv_refine status correctly --- PopPUNK/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index a153d8a7..a04923ef 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -845,12 +845,11 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi num_processes = self.threads, betweenness_sample = betweenness_sample, use_gpu = use_gpu) - self.indiv_fitted = True except RuntimeError as e: print(e) sys.stderr.write("Could not separately refine core and accessory boundaries. 
" "Using joint 2D refinement only.\n") - + self.indiv_fitted = True y = self.assign(X) return y From 219c788b5e12acfb8779676b2a6503e1f4a98881 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 9 Jun 2021 12:14:18 +0100 Subject: [PATCH 024/175] Change reference selection check --- PopPUNK/assign.py | 6 ++++-- PopPUNK/network.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 330b937c..a5d7ba09 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -381,8 +381,10 @@ def assign_query(dbFuncs, os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", output + "/" + os.path.basename(output) + db_suffix) - # ensure sketch and distMat order match - assert postpruning_combined_seq == existing_ref_list + newQueries + # Check that the updated set of references includes all old references, and references added from + # queries; there may be further new references, even from the original database, where paths are + # added between reference isolates in the same component, or new cliques formed + assert set(postpruning_combined_seq).issuperset(set(existing_ref_list).union(set(newQueries))) else: storePickle(rNames, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f14fb314..a6c20470 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -255,7 +255,7 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None references = set(existingRefs) index_lookup = {v:k for k,v in enumerate(dbOrder)} reference_indices = set([index_lookup[r] for r in references]) - + # Add type isolate, if necessary type_isolate_index = None if type_isolate is not None: From 82a9a3114e65c16deecdca9aea15675953b6013c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 9 Jun 2021 12:19:14 +0100 Subject: [PATCH 025/175] Remove redundant dborder variable --- PopPUNK/assign.py | 8 +++----- 1 file changed, 3 insertions(+), 5 
deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index a5d7ba09..15d52035 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -337,8 +337,6 @@ def assign_query(dbFuncs, # Clique pruning if model.type != 'lineage': - - dbOrder = rNames + qNames existing_ref_list = [] with open(model_prefix + "/" + os.path.basename(model_prefix) + fit_string + ".refs") as refFile: @@ -349,7 +347,7 @@ def assign_query(dbFuncs, newRepresentativesIndices, newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ extractReferences(genomeNetwork, - dbOrder, + combined_seq, output, outSuffix = fit_string, existingRefs = existing_ref_list, @@ -362,8 +360,8 @@ def assign_query(dbFuncs, # could also have newRepresentativesNames in this diff (should be the same) - but want # to ensure consistency with the network in case of bad input/bugs - nodes_to_remove = set(range(len(dbOrder))).difference(newRepresentativesIndices) - names_to_remove = [dbOrder[n] for n in nodes_to_remove] + nodes_to_remove = set(range(len(combined_seq))).difference(newRepresentativesIndices) + names_to_remove = [combined_seq[n] for n in nodes_to_remove] if (len(names_to_remove) > 0): # This function also writes out the new ref distance matrix From d75842c30d2c531a6406b03fabda1bf65731b378 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 9 Jun 2021 17:19:36 +0100 Subject: [PATCH 026/175] Use new network construction functions for assignment --- PopPUNK/network.py | 99 +++++++++++++++-------------------------- src/boundary.cpp | 8 ++-- src/boundary.hpp | 5 ++- src/python_bindings.cpp | 10 +++-- 4 files changed, 51 insertions(+), 71 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index a6c20470..c3823c0a 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -905,7 +905,7 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G -def 
construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, +def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, int_offset = 0, weights = None, distMat = None, weights_type = None, previous_network = None, previous_pkl = None, betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): """Construct an undirected network using sequence lists, assignments of pairwise distances @@ -923,6 +923,8 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = Labels of most likely cluster assignment within_label (int) The label for the cluster representing within-strain distances + int_offset (int) + Constant integer to add to each node index weights (list) List of weights for each edge in the network distMat (2 column ndarray) @@ -1143,21 +1145,17 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, # store links for each query in a list of edge tuples ref_count = len(rList) - for row_idx, (assignment, (ref, query)) in enumerate(zip(assignments, listDistInts(rList, qList, self = False))): - if assignment == model.within_label: - # query index needs to be adjusted for existing vertices in network - if weights is not None: - if distance_type == 'core': - dist = weights[row_idx, 0] - elif distance_type == 'accessory': - dist = weights[row_idx, 1] - else: - dist = np.linalg.norm(weights[row_idx, :]) - edge_tuple = (ref, query + ref_count, dist) - else: - edge_tuple = (ref, query + ref_count) - new_edges.append(edge_tuple) - assigned.add(qList[query]) + + # Add queries to network + G = construct_network_from_assignments(rList, + qList, + assignments, + within_label = model.within_label, + previous_network = G, + distMat = weights, + weights_type = distance_type, + summarise = False, + use_gpu = use_gpu) # Calculate all query-query distances too, if updating database if queryQuery: @@ -1181,19 +1179,18 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, 
queryAssignation = model.assign(qqDistMat, slope = 1) else: queryAssignation = model.assign(qqDistMat) - for row_idx, (assignment, (ref, query)) in enumerate(zip(queryAssignation, listDistInts(qList, qList, self = True))): - if assignment == model.within_label: - if weights is not None: - if distance_type == 'core': - dist = weights[row_idx, 0] - elif distance_type == 'accessory': - dist = weights[row_idx, 1] - else: - dist = np.linalg.norm(weights[row_idx, :]) - edge_tuple = (ref + ref_count, query + ref_count, dist) - else: - edge_tuple = (ref + ref_count, query + ref_count) - new_edges.append(edge_tuple) + + # Add queries to network + G = construct_network_from_assignments(qList, + qList, + queryAssignation, + int_offset = ref_count, + within_label = model.within_label, + previous_network = G, + distMat = weights, + weights_type = distance_type, + summarise = False, + use_gpu = use_gpu) # Otherwise only calculate query-query distances for new clusters else: @@ -1238,39 +1235,17 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, else: edge_tuple = (query_indices[query1], query_indices[query2]) new_edges.append(edge_tuple) - - # finish by updating the network - if use_gpu: - - use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) - - # construct updated graph - G_current_df = G.view_edge_list() - if weights is not None: - G_current_df.columns = ['source','destination','weights'] - G_extra_df = cudf.DataFrame(new_edges, columns = ['source','destination','weights']) - G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) - else: - G_current_df.columns = ['source','destination'] - G_extra_df = cudf.DataFrame(new_edges, columns = ['source','destination']) - G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) - - # use self-loop to ensure all nodes are present - max_in_vertex_labels = ref_count + len(qList) - 1 - include_weights = False - if weights is not None: - include_weights = True - G = add_self_loop(G_df, 
max_in_vertex_labels, weights = include_weights) - - else: - G.add_vertex(len(qList)) - - if weights is not None: - eweight = G.new_ep("float") - G.add_edge_list(new_edges, eprops = [eweight]) - G.edge_properties["weight"] = eweight - else: - G.add_edge_list(new_edges) + + G = construct_network_from_assignments(qList, + qList, + queryAssignation, + int_offset = ref_count, + within_label = model.within_label, + previous_network = G, + distMat = weights, + weights_type = distance_type, + summarise = False, + use_gpu = use_gpu) return G, qqDistMat diff --git a/src/boundary.cpp b/src/boundary.cpp index 1de89271..c0cb9af0 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -109,14 +109,16 @@ edge_tuple edge_iterate(const NumpyMatrix &distMat, const int slope, return edge_vec; } -edge_tuple generate_tuples(const std::vector &assignments, const int within_label) { +edge_tuple generate_tuples(const std::vector &assignments, + const int within_label, + const int int_offset) { const size_t n_rows = assignments.size(); const size_t n_samples = 0.5 * (1 + sqrt(1 + 8 * (n_rows))); edge_tuple edge_vec; for (long row_idx = 0; row_idx < n_rows; row_idx++) { if (assignments[row_idx] == within_label) { - long i = calc_row_idx(row_idx, n_samples); - long j = calc_col_idx(row_idx, i, n_samples); + long i = calc_row_idx(row_idx, n_samples) + int_offset; + long j = calc_col_idx(row_idx, i, n_samples) + int_offset; edge_vec.push_back(std::make_tuple(i, j)); } } diff --git a/src/boundary.hpp b/src/boundary.hpp index 3b361146..cf46d715 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -21,13 +21,14 @@ typedef std::vector> edge_tuple; Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat, const int slope, const float x_max, const float y_max, - unsigned int num_threads); + unsigned int num_threads = 1); edge_tuple edge_iterate(const NumpyMatrix &distMat, const int slope, const float x_max, const float y_max); edge_tuple generate_tuples(const std::vector &assignments, - const 
int within_label); + const int within_label, + const int int_offset = 0); network_coo threshold_iterate_1D(const NumpyMatrix &distMat, const std::vector &offsets, diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp index 31ec3f9e..853d9251 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -31,8 +31,9 @@ edge_tuple edgeThreshold(const Eigen::Ref &distMat, } edge_tuple generateTuples(const std::vector &assignments, - const int within_label) { - edge_tuple edges = generate_tuples(assignments, within_label); + const int within_label, + const int int_offset = 0) { + edge_tuple edges = generate_tuples(assignments, within_label, int_offset); return (edges); } @@ -40,7 +41,7 @@ network_coo thresholdIterate1D(const Eigen::Ref &distMat, const std::vector &offsets, const int slope, const double x0, const double y0, const double x1, - const double y1, const int num_threads) { + const double y1, const int num_threads = 1) { if (!std::is_sorted(offsets.begin(), offsets.end())) { throw std::runtime_error("Offsets to thresholdIterate1D must be sorted"); } @@ -82,7 +83,8 @@ PYBIND11_MODULE(poppunk_refine, m) { m.def("generateTuples", &generateTuples, py::return_value_policy::reference_internal, "Return edge tuples based on assigned groups", - py::arg("assignments"), py::arg("within_label")); + py::arg("assignments"), py::arg("within_label"), + py::arg("int_offset") = 0); m.def("thresholdIterate1D", &thresholdIterate1D, py::return_value_policy::reference_internal, From 4e921c6a31a8783cef573d0889e1de6475d3214c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 10 Jun 2021 16:29:17 +0100 Subject: [PATCH 027/175] Improve ref-query graph construction --- PopPUNK/assign.py | 9 +++ PopPUNK/network.py | 158 ++++++++++++++++++++++++++-------------- src/boundary.cpp | 23 ++++-- src/boundary.hpp | 2 + src/python_bindings.cpp | 6 +- 5 files changed, 139 insertions(+), 59 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 15d52035..817026f8 
100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -66,6 +66,7 @@ def assign_query(dbFuncs, from .network import addQueryToNetwork from .network import printClusters from .network import save_network + from .network import get_vertex_list from .plot import writeClusterCsv @@ -200,6 +201,11 @@ def assign_query(dbFuncs, core_only = (fit_type == 'core'), accessory_only = (fit_type == 'accessory'), use_gpu = gpu_graph) + + if max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) != (len(rNames) - 1): + sys.stderr.write("There are " + str(max(get_vertex_list(genomeNetwork, use_gpu = use_gpu)) + 1) + \ + " vertices in the network but " + str(len(rNames)) + "reference names supplied; " + \ + "please check the '--model-dir' variable is pointing to the correct directory\n") if model.type == 'lineage': # Assign lineages by calculating query-query information @@ -264,6 +270,7 @@ def assign_query(dbFuncs, weights = qrDistMat else: weights = None + genomeNetwork, qqDistMat = \ addQueryToNetwork(dbFuncs, rNames, @@ -273,12 +280,14 @@ def assign_query(dbFuncs, queryAssignments, model, output, + distances = distances, distance_type = dist_type, queryQuery = update_db, strand_preserved = strand_preserved, weights = weights, threads = threads, use_gpu = gpu_graph) + output_fn = output + "/" + os.path.basename(output) + fit_string isolateClustering = \ {'combined': printClusters(genomeNetwork, diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c3823c0a..32caf261 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -118,7 +118,6 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, "a refined fit. 
Using the combined distances.\n") # Load network file - print("Loading network from file " + network_file) genomeNetwork = load_network_file(network_file, use_gpu = use_gpu) # Ensure all in dists are in final network @@ -441,16 +440,22 @@ def writeReferences(refList, outPrefix, outSuffix = ""): rFile.write(ref + '\n') return refFileName -def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, - use_gpu = False): +def network_to_edges(prev_G_fn, rlist, adding_queries_to_network = False, + old_ids = None, previous_pkl = None, weights = False, + use_gpu = False): """Load previous network, extract the edges to match the vertex order specified in rlist, and also return weights if specified. Args: - prev_G_fn (str) - Path of file containing existing network. + prev_G_fn (str or graph object) + Path of file containing existing network, or already-loaded + graph object + adding_queries_to_network (bool) + False rlist (list) List of reference sequence labels in new network + old_ids (list) + List of IDs of vertices in existing network previous_pkl (str) Path of pkl file containing names of sequences in previous network @@ -468,10 +473,14 @@ def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, edge_weights (list) Weights for each new edge """ - # get list for translating node IDs to rlist - prev_G = load_network_file(prev_G_fn, use_gpu = use_gpu) + # Load graph from file if passed string; else use graph object passed in + # as argument + if isinstance(prev_G_fn, str): + prev_G = load_network_file(prev_G_fn, use_gpu = use_gpu) + else: + prev_G = prev_G_fn - # load list of names in previous network + # load list of names in previous network if pkl name supplied if previous_pkl is not None: with open(previous_pkl, 'rb') as pickle_file: old_rlist, old_qlist, self = pickle.load(pickle_file) @@ -479,7 +488,7 @@ def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, old_ids = old_rlist else: old_ids = old_rlist + 
old_qlist - else: + elif old_ids is None: sys.stderr.write('Missing .pkl file containing names of sequences in ' 'previous network\n') sys.exit(1) @@ -502,11 +511,18 @@ def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, if weights: edge_weights = list(prev_G.ep['weight']) - # Update IDs to new versions - old_id_indices = [rlist.index(x) for x in old_ids] - # translate to indices - source_ids = [old_id_indices[x] for x in old_source_ids] - target_ids = [old_id_indices[x] for x in old_target_ids] + # If appending queries to an existing network, then the recovered links can be left + # unchanged, as the new IDs are the queries, and the existing sequences will not be found + # in the list of IDs + if adding_queries_to_network: + source_ids = old_source_ids + target_ids = old_target_ids + else: + # Update IDs to new versions + old_id_indices = [rlist.index(x) for x in old_ids] + # translate to indices + source_ids = [old_id_indices[x] for x in old_source_ids] + target_ids = [old_id_indices[x] for x in old_target_ids] # return values if weights: @@ -555,11 +571,12 @@ def initial_graph_properties(rlist, qlist): Whether the network is being constructed from all-v-all distances or reference-v-query information """ - self_comparison = True - vertex_labels = rlist - if rlist != qlist: + if rlist == qlist: + self_comparison = True + vertex_labels = rlist + else: self_comparison = False - vertex_labels.append(qlist) + vertex_labels = rlist + qlist return vertex_labels, self_comparison def process_weights(distMat, weights_type): @@ -591,26 +608,18 @@ def process_weights(distMat, weights_type): sys.stderr.write('Require distance matrix to calculate distances\n') return processed_weights -def process_previous_network(previous_network = None, previous_pkl = None, vertex_labels = None, - weights = False, use_gpu = False): +def process_previous_network(previous_network = None, adding_queries_to_network = False, old_ids = None, + previous_pkl = None, 
vertex_labels = None, weights = False, use_gpu = False): """Extract edge types from an existing network - Will print summary statistics about the network to ``STDERR`` - Args: - rlist (list) - List of reference sequence labels - qlist (list) - List of query sequence labels - G_df (cudf or pandas data frame) - Data frame in which the first two columns are the nodes linked by edges - weights (bool) - Whether weights in the G_df data frame should be included in the network - distMat (2 column ndarray) - Numpy array of pairwise distances - previous_network (str) + previous_network (str or graph object) Name of file containing a previous network to be integrated into this new - network + network, or already-loaded graph object + adding_queries_to_network (bool) + Blah + old_ids (list) + Ordered list of vertex names in previous network previous_pkl (str) Name of file containing the names of the sequences in the previous_network ordered based on the original network construction @@ -629,11 +638,13 @@ def process_previous_network(previous_network = None, previous_pkl = None, verte extra_weights (list or None) List of edge weights """ - if previous_pkl is not None: - if weights is not None: + if previous_pkl is not None or old_ids is not None: + if weights: # Extract from network extra_sources, extra_targets, extra_weights = network_to_edges(previous_network, vertex_labels, + adding_queries_to_network = adding_queries_to_network, + old_ids = old_ids, previous_pkl = previous_pkl, weights = True, use_gpu = use_gpu) @@ -641,6 +652,8 @@ def process_previous_network(previous_network = None, previous_pkl = None, verte # Extract from network extra_sources, extra_targets = network_to_edges(previous_network, vertex_labels, + adding_queries_to_network = adding_queries_to_network, + old_ids = old_ids, previous_pkl = previous_pkl, weights = False, use_gpu = use_gpu) @@ -652,8 +665,9 @@ def process_previous_network(previous_network = None, previous_pkl = None, verte return 
extra_sources, extra_targets, extra_weights def construct_network_from_edge_list(rlist, qlist, edge_list, - weights = None, distMat = None, previous_network = None, previous_pkl = None, - betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): + weights = None, distMat = None, previous_network = None, adding_queries_to_network = False, + old_ids = None, previous_pkl = None, betweenness_sample = betweenness_sample_default, + summarise = True, use_gpu = False): """Construct an undirected network using a data frame of edges. Nodes are samples and edges where samples are within the same cluster @@ -670,9 +684,13 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, Whether weights in the G_df data frame should be included in the network distMat (2 column ndarray) Numpy array of pairwise distances - previous_network (str) + previous_network (str or graph object) Name of file containing a previous network to be integrated into this new - network + network, or the already-loaded graph object + adding_queries_to_network (bool) + Blah + old_ids (list) + Ordered list previous_pkl (str) Name of file containing the names of the sequences in the previous_network betweenness_sample (int) @@ -694,10 +712,12 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, # data structures vertex_labels, self_comparison = initial_graph_properties(rlist, qlist) - + # Load previous network if previous_network is not None: extra_sources, extra_targets, extra_weights = process_previous_network(previous_network = previous_network, + adding_queries_to_network = adding_queries_to_network, + old_ids = old_ids, previous_pkl = previous_pkl, vertex_labels = vertex_labels, weights = (weights is not None), @@ -748,10 +768,11 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, G.add_edge_list(edge_list) if summarise: print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) + return G def 
construct_network_from_df(rlist, qlist, G_df, - weights = False, distMat = None, previous_network = None, previous_pkl = None, + weights = False, distMat = None, previous_network = None, old_ids = None, previous_pkl = None, betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): """Construct an undirected network using a data frame of edges. Nodes are samples and edges where samples are within the same cluster @@ -769,9 +790,13 @@ def construct_network_from_df(rlist, qlist, G_df, Whether weights in the G_df data frame should be included in the network distMat (2 column ndarray) Numpy array of pairwise distances - previous_network (str) + previous_network (str or graph object) Name of file containing a previous network to be integrated into this new - network + network, or the already-loaded graph object + adding_queries_to_network (bool) + Blah + old_ids (list) + Blah previous_pkl (str) Name of file containing the names of the sequences in the previous_network betweenness_sample (int) @@ -803,9 +828,11 @@ def construct_network_from_df(rlist, qlist, G_df, # Load previous network if previous_network is not None: extra_sources, extra_targets, extra_weights = process_previous_network(previous_network = previous_network, + adding_queries_to_network = adding_queries_to_network, + old_ids = old_ids, previous_pkl = previous_pkl, vertex_labels = vertex_labels, - weights = weights, + weights = (weights is not None), use_gpu = use_gpu) if use_gpu: G_extra_df = cudf.DataFrame() @@ -839,6 +866,7 @@ def construct_network_from_df(rlist, qlist, G_df, weights = weights, distMat = distMat, previous_network = previous_network, + old_ids = old_ids, previous_pkl = previous_pkl, summarise = False, use_gpu = use_gpu) @@ -906,8 +934,9 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, return G def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, int_offset = 0, - weights = None, distMat = None, weights_type = 
None, previous_network = None, previous_pkl = None, - betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): + weights = None, distMat = None, weights_type = None, previous_network = None, old_ids = None, + adding_queries_to_network = False, previous_pkl = None, betweenness_sample = betweenness_sample_default, + summarise = True, use_gpu = False): """Construct an undirected network using sequence lists, assignments of pairwise distances to clusters, and the identifier of the cluster assigned to within-strain distances. Nodes are samples and edges where samples are within the same cluster @@ -935,6 +964,10 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = previous_network (str) Name of file containing a previous network to be integrated into this new network + old_ids (list) + Blah + adding_queries_to_network (bool) + Blah previous_pkl (str) Name of file containing the names of the sequences in the previous_network betweenness_sample (int) @@ -953,7 +986,7 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = # Check GPU library use use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) - + # Filter weights to only the relevant edges if weights is not None: weights = weights[assignments == within_label] @@ -962,20 +995,27 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = assignments = np.array(assignments) distMat = distMat[assignments == within_label,:] weights = process_weights(distMat, weights_type) - + # Convert edge indices to tuples - connections = poppunk_refine.generateTuples(assignments, within_label) - + connections = poppunk_refine.generateTuples(assignments, + within_label, + self = (rlist == qlist), + num_ref = len(rlist), + int_offset = int_offset) + # Construct network using edge list G = construct_network_from_edge_list(rlist, qlist, connections, weights = weights, distMat = distMat, previous_network = 
previous_network, + adding_queries_to_network = adding_queries_to_network, + old_ids = old_ids, previous_pkl = previous_pkl, summarise = False, use_gpu = use_gpu) if summarise: print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) + return G def get_cugraph_triangles(G): @@ -1088,7 +1128,7 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa return(metrics, scores) def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, - assignments, model, queryDB, distance_type = 'euclidean', + assignments, model, queryDB, distances = None, distance_type = 'euclidean', queryQuery = False, strand_preserved = False, weights = None, threads = 1, use_gpu = False): """Finds edges between queries and items in the reference database, @@ -1111,6 +1151,8 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, Model fitted to reference database queryDB (str) Query database location + distances (str) + Prefix of distance files for extending network distance_type (str) Distance type to use as weights in network queryQuery (bool) @@ -1135,6 +1177,10 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, """ # initalise functions queryDatabase = dbFuncs['queryDatabase'] + + # do not calculate weights unless specified + if weights is None: + distance_type = None # initialise links data structure new_edges = [] @@ -1152,6 +1198,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, assignments, within_label = model.within_label, previous_network = G, + old_ids = rList, distMat = weights, weights_type = distance_type, summarise = False, @@ -1179,7 +1226,6 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, queryAssignation = model.assign(qqDistMat, slope = 1) else: queryAssignation = model.assign(qqDistMat) - # Add queries to network G = construct_network_from_assignments(qList, qList, @@ -1187,6 +1233,9 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, int_offset = ref_count, within_label = model.within_label, 
previous_network = G, + weights = weights, + old_ids = rList, + adding_queries_to_network = True, distMat = weights, weights_type = distance_type, summarise = False, @@ -1242,6 +1291,9 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, int_offset = ref_count, within_label = model.within_label, previous_network = G, + weights = weights, + old_ids = rList + qList, + adding_queries_to_network = True, distMat = weights, weights_type = distance_type, summarise = False, diff --git a/src/boundary.cpp b/src/boundary.cpp index c0cb9af0..7e9cec4f 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -111,15 +111,28 @@ edge_tuple edge_iterate(const NumpyMatrix &distMat, const int slope, edge_tuple generate_tuples(const std::vector &assignments, const int within_label, + bool self, + const int num_ref, const int int_offset) { const size_t n_rows = assignments.size(); const size_t n_samples = 0.5 * (1 + sqrt(1 + 8 * (n_rows))); edge_tuple edge_vec; - for (long row_idx = 0; row_idx < n_rows; row_idx++) { - if (assignments[row_idx] == within_label) { - long i = calc_row_idx(row_idx, n_samples) + int_offset; - long j = calc_col_idx(row_idx, i, n_samples) + int_offset; - edge_vec.push_back(std::make_tuple(i, j)); + if (self) { + for (long row_idx = 0; row_idx < n_rows; row_idx++) { + if (assignments[row_idx] == within_label) { + long i = calc_row_idx(row_idx, n_samples); + long j = calc_col_idx(row_idx, i, n_samples) + int_offset; + i = i + int_offset; + edge_vec.push_back(std::make_tuple(i, j)); + } + } + } else { + for (long row_idx = 0; row_idx < n_rows; row_idx++) { + if (assignments[row_idx] == within_label) { + unsigned long i = row_idx % num_ref + int_offset; + unsigned long j = static_cast(row_idx / (float)num_ref + 0.001f) + int_offset; + edge_vec.push_back(std::make_tuple(i, j)); + } } } return edge_vec; diff --git a/src/boundary.hpp b/src/boundary.hpp index cf46d715..fa6fab11 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -28,6 +28,8 @@ edge_tuple 
edge_iterate(const NumpyMatrix &distMat, const int slope, edge_tuple generate_tuples(const std::vector &assignments, const int within_label, + bool self = true, + const int num_ref = 0, const int int_offset = 0); network_coo threshold_iterate_1D(const NumpyMatrix &distMat, diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp index 853d9251..39d7ca5e 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -32,8 +32,11 @@ edge_tuple edgeThreshold(const Eigen::Ref &distMat, edge_tuple generateTuples(const std::vector &assignments, const int within_label, + bool self = true, + const int num_ref = 0, const int int_offset = 0) { - edge_tuple edges = generate_tuples(assignments, within_label, int_offset); + edge_tuple edges = generate_tuples(assignments, within_label, self, num_ref, + int_offset); return (edges); } @@ -84,6 +87,7 @@ PYBIND11_MODULE(poppunk_refine, m) { py::return_value_policy::reference_internal, "Return edge tuples based on assigned groups", py::arg("assignments"), py::arg("within_label"), + py::arg("self") = true, py::arg("num_ref") = 0, py::arg("int_offset") = 0); m.def("thresholdIterate1D", &thresholdIterate1D, From adfe2aa8f380d387a18e2665e203b11f69122a2c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 10 Jun 2021 17:12:27 +0100 Subject: [PATCH 028/175] Fix ref-query edge additions --- PopPUNK/network.py | 1 + src/boundary.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 32caf261..10a1e2d9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1226,6 +1226,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, queryAssignation = model.assign(qqDistMat, slope = 1) else: queryAssignation = model.assign(qqDistMat) + # Add queries to network G = construct_network_from_assignments(qList, qList, diff --git a/src/boundary.cpp b/src/boundary.cpp index 7e9cec4f..3f7aa82f 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -130,7 +130,7 @@ 
edge_tuple generate_tuples(const std::vector &assignments, for (long row_idx = 0; row_idx < n_rows; row_idx++) { if (assignments[row_idx] == within_label) { unsigned long i = row_idx % num_ref + int_offset; - unsigned long j = static_cast(row_idx / (float)num_ref + 0.001f) + int_offset; + unsigned long j = static_cast(row_idx / (float)num_ref + 0.001f) + num_ref + int_offset; edge_vec.push_back(std::make_tuple(i, j)); } } From af827ada29a0ab398819a76ec59b0b20ac26c230 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 10 Jun 2021 19:55:31 +0100 Subject: [PATCH 029/175] Fix query testing --- PopPUNK/network.py | 10 +++++----- test/run_test.py | 8 ++++---- test/test-gpu.py | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 10a1e2d9..268c6698 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -751,7 +751,7 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, for ((src, dest), weight) in zip(edge_list, weights): edge_list.append((src, dest, weight)) if previous_network is not None: - for ((src, dest), weight) in zip(extra_sources, extra_targets, extra_weights): + for (src, dest, weight) in zip(extra_sources, extra_targets, extra_weights): edge_list.append((src, dest, weight)) else: if previous_network is not None: @@ -989,6 +989,8 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = # Filter weights to only the relevant edges if weights is not None: + print("Weights: " + str(weights)) + print("Assignments: " + str(assignments)) weights = weights[assignments == within_label] elif distMat is not None and weights_type is not None: if isinstance(assignments, list): @@ -1234,10 +1236,9 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, int_offset = ref_count, within_label = model.within_label, previous_network = G, - weights = weights, old_ids = rList, adding_queries_to_network = True, - distMat = weights, + distMat = qqDistMat, weights_type = 
distance_type, summarise = False, use_gpu = use_gpu) @@ -1292,10 +1293,9 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, int_offset = ref_count, within_label = model.within_label, previous_network = G, - weights = weights, old_ids = rList + qList, adding_queries_to_network = True, - distMat = weights, + distMat = qqDistMat, weights_type = distance_type, summarise = False, use_gpu = use_gpu) diff --git a/test/run_test.py b/test/run_test.py index b48f7d34..6846cb83 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -61,10 +61,10 @@ #assign query sys.stderr.write("Running query assignment\n") -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --output example_single_query --update-db --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_dbscan --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) # uses graph weights +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --model-dir example_refine --output example_single_query --update-db --overwrite", shell=True, check=True) 
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True) # viz sys.stderr.write("Running visualisations (poppunk_visualise)\n") diff --git a/test/test-gpu.py b/test/test-gpu.py index 60111ffe..0d56cb6d 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -61,9 +61,9 @@ #assign query sys.stderr.write("Running query assignment\n") -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query --overwrite --gpu-dist --gpu-graph", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query_update --update-db --graph-weights --overwrite --gpu-dist --gpu-graph", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --output example_single_query --update-db --overwrite --gpu-dist --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --gpu-dist --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_dbscan --output example_query_update --update-db --graph-weights --overwrite --gpu-dist --gpu-graph", shell=True, check=True) # uses graph weights +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --model-dir example_refine --output example_single_query --update-db --overwrite --gpu-dist --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_lineages --output example_lineage_query --overwrite --gpu-graph 
--gpu-dist", shell=True, check=True) # viz From 2f07f0b4808a50980eadd8b2d9a7fe4897c2ecae Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 10 Jun 2021 21:08:33 +0100 Subject: [PATCH 030/175] Check for graph weights --- PopPUNK/network.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 268c6698..edc97beb 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -497,6 +497,10 @@ def network_to_edges(prev_G_fn, rlist, adding_queries_to_network = False, if use_gpu: G_df = prev_G.view_edge_list() if weights: + if len(G_df.columns) < 3: + sys.stderr.write('Loaded network does not have edge weights; try a different ' + 'network or turn off graph weights\n') + exit(1) G_df.columns = ['source','destination','weight'] edge_weights = G_df['weight'].to_arrow().to_pylist() else: @@ -509,6 +513,10 @@ def network_to_edges(prev_G_fn, rlist, adding_queries_to_network = False, old_target_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target") # get the weights if weights: + if prev_G.edge_properties.keys() is None or 'weight' not in prev_G.edge_properties.keys(): + sys.stderr.write('Loaded network does not have edge weights; try a different ' + 'network or turn off graph weights\n') + exit(1) edge_weights = list(prev_G.ep['weight']) # If appending queries to an existing network, then the recovered links can be left From e0add4c4ff9b6dc3e2365688d79b8a177b1dc515 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 10 Jun 2021 21:36:39 +0100 Subject: [PATCH 031/175] Remove cupyx function --- PopPUNK/models.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index a04923ef..5d9e58ed 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1075,16 +1075,10 @@ def save(self): raise RuntimeError("Trying to save unfitted model") else: for rank in self.ranks: - if self.use_gpu: - cupyx.scipy.sparse.save_npz( - self.outPrefix + "/" + 
os.path.basename(self.outPrefix) + \ - rankFile(rank), - self.nn_dists[rank]) - else: - scipy.sparse.save_npz( - self.outPrefix + "/" + os.path.basename(self.outPrefix) + \ - rankFile(rank), - self.nn_dists[rank]) + scipy.sparse.save_npz( + self.outPrefix + "/" + os.path.basename(self.outPrefix) + \ + rankFile(rank), + self.nn_dists[rank]) with open(self.outPrefix + "/" + os.path.basename(self.outPrefix) + \ '_fit.pkl', 'wb') as pickle_file: pickle.dump([[self.ranks, self.dist_col], self.type], pickle_file) From 9aedb5c6cddbc4eb1a76e5e0c92a3f9d226b7871 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 10 Jun 2021 21:43:50 +0100 Subject: [PATCH 032/175] Change GPU flag --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 817026f8..a622ea72 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -203,7 +203,7 @@ def assign_query(dbFuncs, use_gpu = gpu_graph) if max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) != (len(rNames) - 1): - sys.stderr.write("There are " + str(max(get_vertex_list(genomeNetwork, use_gpu = use_gpu)) + 1) + \ + sys.stderr.write("There are " + str(max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) + 1) + \ " vertices in the network but " + str(len(rNames)) + "reference names supplied; " + \ "please check the '--model-dir' variable is pointing to the correct directory\n") From e955ff0ac10600fdd7b800e21a696579e4516956 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 10 Jun 2021 22:33:31 +0100 Subject: [PATCH 033/175] Fix GPU network reference graph construction --- PopPUNK/assign.py | 2 +- PopPUNK/network.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index a622ea72..fd9fce67 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -204,7 +204,7 @@ def assign_query(dbFuncs, if max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) != (len(rNames) - 1): 
sys.stderr.write("There are " + str(max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) + 1) + \ - " vertices in the network but " + str(len(rNames)) + "reference names supplied; " + \ + " vertices in the network but " + str(len(rNames)) + " reference names supplied; " + \ "please check the '--model-dir' variable is pointing to the correct directory\n") if model.type == 'lineage': diff --git a/PopPUNK/network.py b/PopPUNK/network.py index edc97beb..32accc86 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -290,8 +290,7 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] # Add self-loop if needed - max_in_vertex_labels = max(reference_indices) - G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) + G_ref = add_self_loop(G_ref_df, len(reference_indices) - 1, renumber = True) # Check references in same component in overall graph are connected in the reference graph # First get components of original reference graph @@ -333,7 +332,7 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None reference_indices = list(reference_index_set) # Create new reference graph G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - G_ref = add_self_loop(G_ref_df, max_in_vertex_labels, renumber = False) + G_ref = add_self_loop(G_ref_df, len(reference_indices) - 1, renumber = True) else: # Each component is independent, so can be multithreaded From 91ff53ce9d4a7cf1389e06d05518e7b1e4a82a11 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 06:32:02 +0100 Subject: [PATCH 034/175] Edit messages when fetching network --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 
32accc86..81b1265c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -114,10 +114,11 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, network_file = dir_prefix + '_graph' + graph_suffix cluster_file = dir_prefix + '_clusters.csv' if core_only or accessory_only: - sys.stderr.write("Can only do --core-only or --accessory-only fits from " + sys.stderr.write("Can only do --core or --accessory fits from " "a refined fit. Using the combined distances.\n") # Load network file + sys.stderr.write("Loading network from " + network_file + "\n") genomeNetwork = load_network_file(network_file, use_gpu = use_gpu) # Ensure all in dists are in final network From a5757fdea6e9c37a8322d71d8130e936eabd357f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 08:08:26 +0100 Subject: [PATCH 035/175] Fix selection of references with GPUs --- PopPUNK/network.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 81b1265c..9646716e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -288,10 +288,10 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None # Extract reference edges G_df = G.view_edge_list() if 'src' in G_df.columns: - G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) - G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - # Add self-loop if needed - G_ref = add_self_loop(G_ref_df, len(reference_indices) - 1, renumber = True) + G_df.rename(columns={'src': 'old_source','dst': 'old_destination'}, inplace=True) + else: + G_df.rename(columns={'source': 'old_source','destination': 'old_destination'}, inplace=True) + G_ref_df = G_df[G_df['old_source'].isin(reference_indices) & G_df['old_destination'].isin(reference_indices)] # Check references in same component in overall graph are connected in the reference graph # First get components of original 
reference graph @@ -332,8 +332,12 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None # Add expanded reference set to the overall list reference_indices = list(reference_index_set) # Create new reference graph - G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - G_ref = add_self_loop(G_ref_df, len(reference_indices) - 1, renumber = True) + G_ref_df = G_df[G_df['old_source'].isin(reference_indices) & G_df['old_destination'].isin(reference_indices)] + + # Translate network indices to match name order + G_ref_df['source'] = [reference_indices.index(x) for x in G_ref_df['old_source'].to_arrow().to_pylist()] + G_ref_df['destination'] = [reference_indices.index(x) for x in G_ref_df['old_destination'].to_arrow().to_pylist()] + G_ref = add_self_loop(G_ref_df, len(reference_indices) - 1, renumber = True) else: # Each component is independent, so can be multithreaded @@ -780,8 +784,9 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, return G def construct_network_from_df(rlist, qlist, G_df, - weights = False, distMat = None, previous_network = None, old_ids = None, previous_pkl = None, - betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): + weights = False, distMat = None, previous_network = None, adding_queries_to_network = False, + old_ids = None, previous_pkl = None, betweenness_sample = betweenness_sample_default, + summarise = True, use_gpu = False): """Construct an undirected network using a data frame of edges. 
Nodes are samples and edges where samples are within the same cluster From cde0c911cd769eb5b299d8b8fd4be9b5ad3831fc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 08:20:10 +0100 Subject: [PATCH 036/175] Alter GPU reference graph construction --- PopPUNK/network.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9646716e..db7e9152 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -220,6 +220,27 @@ def cliquePrune(component, graph, reference_indices, components_list): ref_list = getCliqueRefs(subgraph, refs) return(list(ref_list)) +def translate_network_indices(G_ref_df, reference_indices): + """Extract references for each cluster based on cliques + + Writes chosen references to file by calling :func:`~writeReferences` + + Args: + G_ref_df (graph) + A network used to define clusters + reference_indices (list) + The order of files in the sketches, so returned references are in the same order + + Returns: + G_ref (str) + The name of the file references were written tos + """ + # Translate network indices to match name order + G_ref_df['source'] = [reference_indices.index(x) for x in G_ref_df['old_source'].to_arrow().to_pylist()] + G_ref_df['destination'] = [reference_indices.index(x) for x in G_ref_df['old_destination'].to_arrow().to_pylist()] + G_ref = add_self_loop(G_ref_df, len(reference_indices) - 1, renumber = True) + return(G_ref) + def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None, existingRefs = None, threads = 1, use_gpu = False): """Extract references for each cluster based on cliques @@ -292,6 +313,8 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None else: G_df.rename(columns={'source': 'old_source','destination': 'old_destination'}, inplace=True) G_ref_df = G_df[G_df['old_source'].isin(reference_indices) & G_df['old_destination'].isin(reference_indices)] + # Translate 
network indices to match name order + G_ref = translate_network_indices(G_ref_df, reference_indices) # Check references in same component in overall graph are connected in the reference graph # First get components of original reference graph @@ -333,11 +356,7 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None reference_indices = list(reference_index_set) # Create new reference graph G_ref_df = G_df[G_df['old_source'].isin(reference_indices) & G_df['old_destination'].isin(reference_indices)] - - # Translate network indices to match name order - G_ref_df['source'] = [reference_indices.index(x) for x in G_ref_df['old_source'].to_arrow().to_pylist()] - G_ref_df['destination'] = [reference_indices.index(x) for x in G_ref_df['old_destination'].to_arrow().to_pylist()] - G_ref = add_self_loop(G_ref_df, len(reference_indices) - 1, renumber = True) + G_ref = translate_network_indices(G_ref_df, reference_indices) else: # Each component is independent, so can be multithreaded From ce0ca1d62ffeb5187721437b6999537881d675bc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 08:28:06 +0100 Subject: [PATCH 037/175] Update GPU graph loading --- PopPUNK/network.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index db7e9152..182233b1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -226,14 +226,14 @@ def translate_network_indices(G_ref_df, reference_indices): Writes chosen references to file by calling :func:`~writeReferences` Args: - G_ref_df (graph) - A network used to define clusters + G_ref_df (cudf data frame) + List of edges in reference network reference_indices (list) - The order of files in the sketches, so returned references are in the same order + The ordered list of reference indices in the original network Returns: - G_ref (str) - The name of the file references were written tos + G_ref (cugraph network) + Network of reference sequences 
""" # Translate network indices to match name order G_ref_df['source'] = [reference_indices.index(x) for x in G_ref_df['old_source'].to_arrow().to_pylist()] @@ -771,6 +771,8 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, G = construct_network_from_df(rlist, qlist, G_df, weights = (weights is not None), distMat = distMat, + adding_queries_to_network = adding_queries_to_network, + old_ids = old_ids, previous_network = previous_network, previous_pkl = previous_pkl, summarise = False, From 0f070274dbbada6b98c66b5d780c9d28ebf1329a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 09:33:58 +0100 Subject: [PATCH 038/175] Fix boolean argument --- PopPUNK/network.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 182233b1..eead81cf 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -474,7 +474,9 @@ def network_to_edges(prev_G_fn, rlist, adding_queries_to_network = False, Path of file containing existing network, or already-loaded graph object adding_queries_to_network (bool) - False + Boolean specifying whether query-query edges are being added + to an existing network, such that not all the sequence IDs will + be found in the old IDs, which should already be correctly ordered rlist (list) List of reference sequence labels in new network old_ids (list) @@ -866,7 +868,7 @@ def construct_network_from_df(rlist, qlist, G_df, old_ids = old_ids, previous_pkl = previous_pkl, vertex_labels = vertex_labels, - weights = (weights is not None), + weights = weights, use_gpu = use_gpu) if use_gpu: G_extra_df = cudf.DataFrame() From d7b67527ca5b441a8590730c8b9351f07cbd0521 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 10:18:11 +0100 Subject: [PATCH 039/175] Update tests --- test/run_test.py | 4 ++-- test/test-gpu.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/run_test.py b/test/run_test.py index 6846cb83..4dca5363 100755 
--- a/test/run_test.py +++ b/test/run_test.py @@ -40,12 +40,12 @@ #refine model with GMM sys.stderr.write("Running model refinement (--fit-model refine)\n") subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both --no-local", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --unconstrained", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both", shell=True, check=True) # lineage clustering sys.stderr.write("Running lineage clustering test (--fit-model lineage)\n") @@ -61,7 +61,7 @@ #assign query sys.stderr.write("Running query assignment\n") -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite", shell=True, check=True) 
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --core --accessory", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_dbscan --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) # uses graph weights subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --model-dir example_refine --output example_single_query --update-db --overwrite", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True) diff --git a/test/test-gpu.py b/test/test-gpu.py index 0d56cb6d..201b7f6e 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -40,12 +40,12 @@ #refine model with GMM sys.stderr.write("Running model refinement (--fit-model refine)\n") subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --gpu-graph", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both --no-local --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --unconstrained --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db 
--output example_refine --neg-shift 0.8 --overwrite --score-idx 1 --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2 --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both --gpu-graph", shell=True, check=True) # lineage clustering sys.stderr.write("Running lineage clustering test (--fit-model lineage)\n") @@ -61,7 +61,7 @@ #assign query sys.stderr.write("Running query assignment\n") -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --gpu-dist --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --gpu-dist --gpu-graph --core --accessory", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_dbscan --output example_query_update --update-db --graph-weights --overwrite --gpu-dist --gpu-graph", shell=True, check=True) # uses graph weights subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --model-dir example_refine --output example_single_query --update-db --overwrite --gpu-dist --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_lineages --output example_lineage_query --overwrite --gpu-graph --gpu-dist", shell=True, 
check=True) From 1d5f12d99bd1952b3ffb4febda8da746891ccbeb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 10:50:16 +0100 Subject: [PATCH 040/175] Change qq_dist update --- PopPUNK/network.py | 60 ++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index eead81cf..daf288e7 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -463,7 +463,7 @@ def writeReferences(refList, outPrefix, outSuffix = ""): rFile.write(ref + '\n') return refFileName -def network_to_edges(prev_G_fn, rlist, adding_queries_to_network = False, +def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, old_ids = None, previous_pkl = None, weights = False, use_gpu = False): """Load previous network, extract the edges to match the @@ -473,7 +473,7 @@ def network_to_edges(prev_G_fn, rlist, adding_queries_to_network = False, prev_G_fn (str or graph object) Path of file containing existing network, or already-loaded graph object - adding_queries_to_network (bool) + adding_qq_dists (bool) Boolean specifying whether query-query edges are being added to an existing network, such that not all the sequence IDs will be found in the old IDs, which should already be correctly ordered @@ -547,7 +547,7 @@ def network_to_edges(prev_G_fn, rlist, adding_queries_to_network = False, # If appending queries to an existing network, then the recovered links can be left # unchanged, as the new IDs are the queries, and the existing sequences will not be found # in the list of IDs - if adding_queries_to_network: + if adding_qq_dists: source_ids = old_source_ids target_ids = old_target_ids else: @@ -641,7 +641,7 @@ def process_weights(distMat, weights_type): sys.stderr.write('Require distance matrix to calculate distances\n') return processed_weights -def process_previous_network(previous_network = None, adding_queries_to_network = False, old_ids = None, +def 
process_previous_network(previous_network = None, adding_qq_dists = False, old_ids = None, previous_pkl = None, vertex_labels = None, weights = False, use_gpu = False): """Extract edge types from an existing network @@ -649,8 +649,10 @@ def process_previous_network(previous_network = None, adding_queries_to_network previous_network (str or graph object) Name of file containing a previous network to be integrated into this new network, or already-loaded graph object - adding_queries_to_network (bool) - Blah + adding_qq_dists (bool) + Boolean specifying whether query-query edges are being added + to an existing network, such that not all the sequence IDs will + be found in the old IDs, which should already be correctly ordered old_ids (list) Ordered list of vertex names in previous network previous_pkl (str) @@ -676,7 +678,7 @@ def process_previous_network(previous_network = None, adding_queries_to_network # Extract from network extra_sources, extra_targets, extra_weights = network_to_edges(previous_network, vertex_labels, - adding_queries_to_network = adding_queries_to_network, + adding_qq_dists = adding_qq_dists, old_ids = old_ids, previous_pkl = previous_pkl, weights = True, @@ -685,7 +687,7 @@ def process_previous_network(previous_network = None, adding_queries_to_network # Extract from network extra_sources, extra_targets = network_to_edges(previous_network, vertex_labels, - adding_queries_to_network = adding_queries_to_network, + adding_qq_dists = adding_qq_dists, old_ids = old_ids, previous_pkl = previous_pkl, weights = False, @@ -698,7 +700,7 @@ def process_previous_network(previous_network = None, adding_queries_to_network return extra_sources, extra_targets, extra_weights def construct_network_from_edge_list(rlist, qlist, edge_list, - weights = None, distMat = None, previous_network = None, adding_queries_to_network = False, + weights = None, distMat = None, previous_network = None, adding_qq_dists = False, old_ids = None, previous_pkl = None, 
betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): """Construct an undirected network using a data frame of edges. Nodes are samples and @@ -720,10 +722,12 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, previous_network (str or graph object) Name of file containing a previous network to be integrated into this new network, or the already-loaded graph object - adding_queries_to_network (bool) - Blah + adding_qq_dists (bool) + Boolean specifying whether query-query edges are being added + to an existing network, such that not all the sequence IDs will + be found in the old IDs, which should already be correctly ordered old_ids (list) - Ordered list + Ordered list of vertex names in previous network previous_pkl (str) Name of file containing the names of the sequences in the previous_network betweenness_sample (int) @@ -749,7 +753,7 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, # Load previous network if previous_network is not None: extra_sources, extra_targets, extra_weights = process_previous_network(previous_network = previous_network, - adding_queries_to_network = adding_queries_to_network, + adding_qq_dists = adding_qq_dists, old_ids = old_ids, previous_pkl = previous_pkl, vertex_labels = vertex_labels, @@ -773,7 +777,7 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, G = construct_network_from_df(rlist, qlist, G_df, weights = (weights is not None), distMat = distMat, - adding_queries_to_network = adding_queries_to_network, + adding_qq_dists = adding_qq_dists, old_ids = old_ids, previous_network = previous_network, previous_pkl = previous_pkl, @@ -807,7 +811,7 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, return G def construct_network_from_df(rlist, qlist, G_df, - weights = False, distMat = None, previous_network = None, adding_queries_to_network = False, + weights = False, distMat = None, previous_network = None, adding_qq_dists = False, old_ids = None, 
previous_pkl = None, betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): """Construct an undirected network using a data frame of edges. Nodes are samples and @@ -829,10 +833,12 @@ def construct_network_from_df(rlist, qlist, G_df, previous_network (str or graph object) Name of file containing a previous network to be integrated into this new network, or the already-loaded graph object - adding_queries_to_network (bool) - Blah + adding_qq_dists (bool) + Boolean specifying whether query-query edges are being added + to an existing network, such that not all the sequence IDs will + be found in the old IDs, which should already be correctly ordered old_ids (list) - Blah + Ordered list of vertex names in previous network previous_pkl (str) Name of file containing the names of the sequences in the previous_network betweenness_sample (int) @@ -864,7 +870,7 @@ def construct_network_from_df(rlist, qlist, G_df, # Load previous network if previous_network is not None: extra_sources, extra_targets, extra_weights = process_previous_network(previous_network = previous_network, - adding_queries_to_network = adding_queries_to_network, + adding_qq_dists = adding_qq_dists, old_ids = old_ids, previous_pkl = previous_pkl, vertex_labels = vertex_labels, @@ -971,7 +977,7 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, int_offset = 0, weights = None, distMat = None, weights_type = None, previous_network = None, old_ids = None, - adding_queries_to_network = False, previous_pkl = None, betweenness_sample = betweenness_sample_default, + adding_qq_dists = False, previous_pkl = None, betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): """Construct an undirected network using sequence lists, assignments of pairwise distances to clusters, and the identifier of the cluster assigned to within-strain distances. 
@@ -1001,9 +1007,11 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = Name of file containing a previous network to be integrated into this new network old_ids (list) - Blah - adding_queries_to_network (bool) - Blah + Ordered list of vertex names in previous network + adding_qq_dists (bool) + Boolean specifying whether query-query edges are being added + to an existing network, such that not all the sequence IDs will + be found in the old IDs, which should already be correctly ordered previous_pkl (str) Name of file containing the names of the sequences in the previous_network betweenness_sample (int) @@ -1046,7 +1054,7 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = weights = weights, distMat = distMat, previous_network = previous_network, - adding_queries_to_network = adding_queries_to_network, + adding_qq_dists = adding_qq_dists, old_ids = old_ids, previous_pkl = previous_pkl, summarise = False, @@ -1273,7 +1281,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, within_label = model.within_label, previous_network = G, old_ids = rList, - adding_queries_to_network = True, + adding_qq_dists = True, distMat = qqDistMat, weights_type = distance_type, summarise = False, @@ -1330,7 +1338,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, within_label = model.within_label, previous_network = G, old_ids = rList + qList, - adding_queries_to_network = True, + adding_qq_dists = True, distMat = qqDistMat, weights_type = distance_type, summarise = False, From 212dec5bf2d6ee4b20745cbc6906cda8f6e82ed0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 12:33:04 +0100 Subject: [PATCH 041/175] Fix reference file writing --- PopPUNK/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index daf288e7..7eb77d70 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -304,7 +304,6 @@ def extractReferences(G, dbOrder, outPrefix, 
outSuffix = '', type_isolate = None # Order found references as in sketchlib database reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] - refFileName = writeReferences(reference_names, outPrefix) # Extract reference edges G_df = G.view_edge_list() From 71e1b0125fd5099abf47a0531fbd20090d23574e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 16:05:47 +0100 Subject: [PATCH 042/175] Enable deletion of random matches in HDF --- PopPUNK/sketchlib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index de584180..f946de31 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -738,6 +738,8 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads # remove random matches if already present if 'random' in hdf_in: + hdf_in.close() + hdf_in = h5py.File(db_name, 'r+') del hdf_in['random'] hdf_in.close() From c8bfb6e4fb3bfc6d86e6ecb0a20718e0d65451f4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 22:41:34 +0100 Subject: [PATCH 043/175] Fix edge weight calculation format --- PopPUNK/network.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7eb77d70..fb5b9b82 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -631,11 +631,11 @@ def process_weights(distMat, weights_type): sys.stderr.write("Unable to calculate distance type " + str(weights_type) + "; " "accepted types are " + str(accepted_weights_types) + "\n") if weights_type == 'euclidean': - processed_weights = np.linalg.norm(distMat, axis = 1) + processed_weights = np.linalg.norm(distMat, axis = 1).tolist() elif weights_type == 'core': - processed_weights = distMat[:, 0] + processed_weights = distMat[:, 0].tolist() elif weights_type == 'accessory': - processed_weights = distMat[:, 1] + processed_weights = distMat[:, 1].tolist() else: sys.stderr.write('Require distance matrix to calculate distances\n') 
return processed_weights @@ -714,8 +714,8 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, List of query sequence labels G_df (cudf or pandas data frame) Data frame in which the first two columns are the nodes linked by edges - weights (bool) - Whether weights in the G_df data frame should be included in the network + weights (list) + List of edge weights distMat (2 column ndarray) Numpy array of pairwise distances previous_network (str or graph object) From c0a27a9119dd4e4321572d2b2422663013e3dbb1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 22:59:44 +0100 Subject: [PATCH 044/175] Attempt to resolve GPU bus error --- PopPUNK/sketchlib.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index f946de31..60ffb741 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -713,7 +713,7 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads failed, full_names = True) os.rename(filtered_db_name, db_name) - + hdf_in.close() # if failure still close files to avoid corruption except: hdf_in.close() @@ -738,10 +738,9 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads # remove random matches if already present if 'random' in hdf_in: - hdf_in.close() hdf_in = h5py.File(db_name, 'r+') del hdf_in['random'] - hdf_in.close() + hdf_in.close() # This gives back retained in the same order as names retained = [x for x in names if x in frozenset(retained)] From 8492787854cf34e73420d791d0f1c5e48a87d771 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 11 Jun 2021 23:09:41 +0100 Subject: [PATCH 045/175] Remove attempt to resolve GPU bus error --- PopPUNK/sketchlib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 60ffb741..71afc4b4 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -738,9 +738,10 @@ def 
sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads # remove random matches if already present if 'random' in hdf_in: + hdf_in.close() hdf_in = h5py.File(db_name, 'r+') del hdf_in['random'] - hdf_in.close() + hdf_in.close() # This gives back retained in the same order as names retained = [x for x in names if x in frozenset(retained)] From 4a5797109db773d6c0523eb0b2bdb87a4d7d5dee Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 13 Jun 2021 22:40:41 +0100 Subject: [PATCH 046/175] Change processing of previous networks with cugraph --- PopPUNK/network.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index fb5b9b82..c07ffdfe 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -749,24 +749,8 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, # data structures vertex_labels, self_comparison = initial_graph_properties(rlist, qlist) - # Load previous network - if previous_network is not None: - extra_sources, extra_targets, extra_weights = process_previous_network(previous_network = previous_network, - adding_qq_dists = adding_qq_dists, - old_ids = old_ids, - previous_pkl = previous_pkl, - vertex_labels = vertex_labels, - weights = (weights is not None), - use_gpu = use_gpu) - # Create new network if use_gpu: - # Add extra information from previous network - if previous_network is not None: - for (src, dest) in zip(extra_sources, extra_targets): - edge_list.append((src, dest)) - if weights is not None: - weights.extend(extra_weights) # benchmarking concurs with https://stackoverflow.com/questions/55922162/recommended-cudf-dataframe-construction edge_array = cp.array(edge_list, dtype = np.int32) edge_gpu_matrix = cuda.to_device(edge_array) @@ -783,6 +767,16 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, summarise = False, use_gpu = use_gpu) else: + # Load previous network + if previous_network is not None: + 
extra_sources, extra_targets, extra_weights = \ + process_previous_network(previous_network = previous_network, + adding_qq_dists = adding_qq_dists, + old_ids = old_ids, + previous_pkl = previous_pkl, + vertex_labels = vertex_labels, + weights = (weights is not None), + use_gpu = use_gpu) # Construct list of tuples for graph-tool # Include information from previous graph if supplied if weights is not None: From afdde8c769c060e8825ae273ac78797011f17ec7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 13 Jun 2021 22:57:32 +0100 Subject: [PATCH 047/175] Make edge tuples consistent --- src/boundary.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/boundary.cpp b/src/boundary.cpp index 3f7aa82f..261a1318 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -123,7 +123,9 @@ edge_tuple generate_tuples(const std::vector &assignments, long i = calc_row_idx(row_idx, n_samples); long j = calc_col_idx(row_idx, i, n_samples) + int_offset; i = i + int_offset; - edge_vec.push_back(std::make_tuple(i, j)); + long min_node = std::min(i,j); + long max_node = std::max(i,j); + edge_vec.push_back(std::make_tuple(min_node, max_node)); } } } else { @@ -131,7 +133,9 @@ edge_tuple generate_tuples(const std::vector &assignments, if (assignments[row_idx] == within_label) { unsigned long i = row_idx % num_ref + int_offset; unsigned long j = static_cast(row_idx / (float)num_ref + 0.001f) + num_ref + int_offset; - edge_vec.push_back(std::make_tuple(i, j)); + long min_node = std::min(i,j); + long max_node = std::max(i,j); + edge_vec.push_back(std::make_tuple(min_node, max_node)); } } } From d220f22c1c78b1e5532687493925edde4e5e5d59 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 14 Jun 2021 11:09:15 +0100 Subject: [PATCH 048/175] Fix GPU tests --- test/test-gpu.py | 2 +- test/test-update-gpu.py | 124 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 1 deletion(-) create mode 100755 test/test-update-gpu.py diff --git 
a/test/test-gpu.py b/test/test-gpu.py index 201b7f6e..ae917065 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -27,7 +27,7 @@ # test updating order is correct sys.stderr.write("Running distance matrix order check (--update-db)\n") -subprocess.run(python_cmd + " test-update.py", shell=True, check=True) +subprocess.run(python_cmd + " test-update-gpu.py", shell=True, check=True) #fit GMM sys.stderr.write("Running GMM model fit (--fit-model gmm)\n") diff --git a/test/test-update-gpu.py b/test/test-update-gpu.py new file mode 100755 index 00000000..20887750 --- /dev/null +++ b/test/test-update-gpu.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python +# Copyright 2018-2021 John Lees and Nick Croucher + +"""Tests for PopPUNK --update-db order""" + +import subprocess +import os, sys +import sys +import shutil +import pickle + +import numpy as np +from scipy import stats +import h5py +import scipy.sparse + +import pp_sketchlib + +if os.environ.get("POPPUNK_PYTHON"): + python_cmd = os.environ.get("POPPUNK_PYTHON") +else: + python_cmd = "python" + +def run_regression(x, y, threshold = 0.99): + res = stats.linregress(x, y) + print("R^2: " + str(res.rvalue**2)) + if res.rvalue**2 < threshold: + sys.stderr.write("Distance matrix order failed!\n") + sys.exit(1) + +def compare_sparse_matrices(d1,d2,r1,r2): + d1_pairs = get_seq_tuples(d1.row,d1.col,r1) + d2_pairs = get_seq_tuples(d2.row,d2.col,r2) + d1_dists = [] + d2_dists = [] + + for (pair1,dist1) in zip(d1_pairs,d1.data): + for (pair2,dist2) in zip(d2_pairs,d2.data): + if pair1 == pair2: + d1_dists.append(dist1) + d2_dists.append(dist2) + break + + run_regression(np.asarray(d1_dists),np.asarray(d2_dists)) + +def get_seq_tuples(rows,cols,names): + tuple_list = [] + for (i,j) in zip(rows,cols): + sorted_pair = tuple(sorted((names[i],names[j]))) + tuple_list.append(sorted_pair) + return tuple_list + +def old_get_seq_tuples(rows,cols): + max_seqs = np.maximum(rows,cols) + min_seqs = np.minimum(rows,cols) + concat_seqs = 
np.vstack((max_seqs,min_seqs)) + seq_pairs = concat_seqs.T + seq_tuples = [tuple(row) for row in seq_pairs] + return seq_tuples + +# Check distances after one query + +# Check that order is the same after doing 1 + 2 with --update-db, as doing all of 1 + 2 together +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile12.txt --output batch12 --overwrite --gpu-dist", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch12 --ranks 1,2 --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile1.txt --output batch1 --overwrite --gpu-dist", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1,2 --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch1 --query rfile2.txt --output batch2 --update-db --overwrite --gpu-graph --gpu-dist", shell=True, check=True) + +# Load updated distances +X2 = np.load("batch2/batch2.dists.npy") +with open("batch2/batch2.dists.pkl", 'rb') as pickle_file: + rlist2, qlist, self = pickle.load(pickle_file) + +# Get same distances from the full database +ref_db = "batch12/batch12" +ref_h5 = h5py.File(ref_db + ".h5", 'r') +db_kmers = sorted(ref_h5['sketches/' + rlist2[0]].attrs['kmers']) +ref_h5.close() +X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist2, rlist2, db_kmers, + True, False, 1, False, 0) + +# Check distances match +run_regression(X1[:, 0], X2[:, 0]) +run_regression(X1[:, 1], X2[:, 1]) + +# Check sparse distances after one query +with open("batch12/batch12.dists.pkl", 'rb') as pickle_file: + rlist1, qlist1, self = pickle.load(pickle_file) +S1 = scipy.sparse.load_npz("batch12/batch12_rank2_fit.npz") +S2 = scipy.sparse.load_npz("batch2/batch2_rank2_fit.npz") +compare_sparse_matrices(S1,S2,rlist1,rlist2) + +# Check distances after second query + +# Check that order is the same 
after doing 1 + 2 + 3 with --update-db, as doing all of 1 + 2 + 3 together +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile123.txt --output batch123 --overwrite --gpu-dist", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch123 --ranks 1,2 --gpu-graph --gpu-dist", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch2 --query rfile3.txt --output batch3 --update-db --overwrite --gpu-graph --gpu-dist", shell=True, check=True) + +# Load updated distances +X2 = np.load("batch3/batch3.dists.npy") +with open("batch3/batch3.dists.pkl", 'rb') as pickle_file: + rlist4, qlist, self = pickle.load(pickle_file) + +# Get same distances from the full database +ref_db = "batch123/batch123" +ref_h5 = h5py.File(ref_db + ".h5", 'r') +db_kmers = sorted(ref_h5['sketches/' + rlist4[0]].attrs['kmers']) +ref_h5.close() +X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist4, rlist4, db_kmers, + True, False, 1, False, 0) + +# Check distances match +run_regression(X1[:, 0], X2[:, 0]) +run_regression(X1[:, 1], X2[:, 1]) + +# Check sparse distances after second query +with open("batch123/batch123.dists.pkl", 'rb') as pickle_file: + rlist3, qlist, self = pickle.load(pickle_file) +S3 = scipy.sparse.load_npz("batch123/batch123_rank2_fit.npz") +S4 = scipy.sparse.load_npz("batch3/batch3_rank2_fit.npz") + +compare_sparse_matrices(S3,S4,rlist3,rlist4) From cc2591f329172efd4ebcfeacdb220b29eb04417b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 14 Jun 2021 21:40:23 +0100 Subject: [PATCH 049/175] Fix GPU construction for single-edge graphs --- PopPUNK/network.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c07ffdfe..e12c6328 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -752,9 +752,15 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, # Create new network if 
use_gpu: # benchmarking concurs with https://stackoverflow.com/questions/55922162/recommended-cudf-dataframe-construction - edge_array = cp.array(edge_list, dtype = np.int32) - edge_gpu_matrix = cuda.to_device(edge_array) - G_df = cudf.DataFrame(edge_gpu_matrix, columns = ['source','destination']) + if len(edge_list) > 1: + edge_array = cp.array(edge_list, dtype = np.int32) + edge_gpu_matrix = cuda.to_device(edge_array) + G_df = cudf.DataFrame(edge_gpu_matrix, columns = ['source','destination']) + else: + # Cannot generate an array when one edge + G_df = cudf.DataFrame(columns = ['source','destination']) + G_df['source'] = [edge_list[0][0]] + G_df['destination'] = [edge_list[0][1]] if weights is not None: G_df['weights'] = weights G = construct_network_from_df(rlist, qlist, G_df, From ed26be36045dd62ae0eeb226271c60990b1a470d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 15 Jun 2021 06:39:44 +0100 Subject: [PATCH 050/175] Change GPU package import --- PopPUNK/models.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 5d9e58ed..438286d4 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -35,7 +35,7 @@ # Load GPU libraries try: - import cupyx.scipy.sparse + import cupyx import cugraph import cudf import cupy as cp @@ -117,7 +117,7 @@ def loadClusterFit(pkl_file, npz_file, outPrefix = "", max_samples = 100000, rank_file = os.path.dirname(pkl_file) + "/" + \ prefix.group(1) + rankFile(rank) if use_gpu: - fit_data[rank] = cupyx.scipy.sparse.load_npz(rank_file) + fit_data[rank] = scipy.sparse.load_npz(rank_file) else: fit_data[rank] = scipy.sparse.load_npz(rank_file) else: @@ -1035,6 +1035,9 @@ def fit(self, X, accessory): y (numpy.array) Cluster assignments of samples in X ''' + # Check if model requires GPU + check_and_set_gpu(self.use_gpu, gpu_lib, quit_on_fail = True) + ClusterFit.fit(self, X) sample_size = int(round(0.5 * (1 + np.sqrt(1 + 8 * X.shape[0])))) if (max(self.ranks) 
>= sample_size): @@ -1168,7 +1171,7 @@ def extend(self, qqDists, qrDists): for rank in self.ranks: # Add the matrices together to make a large square matrix if self.use_gpu: - full_mat = cupyx.scipy.sparse.bmat([[self.nn_dists[rank], + full_mat = scipy.sparse.bmat([[self.nn_dists[rank], qrRect.transpose()], [qrRect,qqSquare]], format = 'csr', @@ -1187,7 +1190,7 @@ def extend(self, qqDists, qrDists): for row_idx in range(full_mat.shape[0]): sample_row = full_mat.getrow(row_idx) if self.use_gpu: - dist_row, dist_col, dist = cupyx.scipy.sparse.find(sample_row) + dist_row, dist_col, dist = scipy.sparse.find(sample_row) else: dist_row, dist_col, dist = scipy.sparse.find(sample_row) dist = [epsilon if d < epsilon else d for d in dist] From 83f3438bbf7f6494c2918a488b689e24c1678abb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 16 Jun 2021 10:06:04 +0100 Subject: [PATCH 051/175] Update pruning of sequences on model fitting --- docs/qc.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/qc.rst b/docs/qc.rst index 74fed79f..97541c24 100644 --- a/docs/qc.rst +++ b/docs/qc.rst @@ -59,8 +59,11 @@ and run:: poppunk_prune --remove remove.txt --distances strain_db/strain_db.dists --output pruned_db -This will remove the samples from the ``strain_db.dists`` files, from which -``--model-fit`` can be run again. +This will remove the samples from the ``strain_db.dists`` files. This can instead be done +simultaneously as the model is fitted - problematic sequences can be pruned, and the model fitted +to the remaining high-quality samples by modifying the model fitting command to include QC options:: + + poppunk --fit-model dbscan --ref-db example_db --output example_dbscan --max-a-dist 0.4 --max-pi-dist 0.2 Dealing with poor quality data ------------------------------ @@ -123,4 +126,3 @@ cytoscape directly, though removal from the PopPUNK database is best. 
The second largest cluster is also suspicious, where there are few triangles (low transitivity) and the nodes involved have high Stress. This is indicative of a bad fit overall, rather than a single problem sample. - From 2261e36ef5a7000e4a09ec9d68372aaf4e5d9965 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 2 Jul 2021 16:13:57 +0100 Subject: [PATCH 052/175] Avoid system recursion limit --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e12c6328..ce1920e0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -366,12 +366,14 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None gt.openmp_set_num_threads(1) # Cliques are pruned, taking one reference from each, until none remain + sys.setrecursionlimit(5000) with Pool(processes=threads) as pool: ref_lists = pool.map(partial(cliquePrune, graph=G, reference_indices=reference_indices, components_list=components), set(components)) + sys.setrecursionlimit(1000) # Returns nested lists, which need to be flattened reference_indices = set([entry for sublist in ref_lists for entry in sublist]) From 69454e27dcdab6475086a0075281871123942610 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 2 Jul 2021 16:21:10 +0100 Subject: [PATCH 053/175] Change legend position --- PopPUNK/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 7b38da49..d346d208 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -307,7 +307,7 @@ def plot_refined_results(X, Y, x_boundary, y_boundary, core_boundary, accessory_ plt.plot([core_boundary*scale[0], core_boundary*scale[0]], [0, np.amax(X[:,1])], color='red', linewidth=2, linestyle='--', label='Threshold boundary') - plt.legend() + plt.legend(loc='lower right') plt.title(title) plt.xlabel('Core distance (' + r'$\pi$' + ')') plt.ylabel('Accessory distance (' + r'$a$' + ')') From 1f0c78c188c5bb9bcf691ff61f19031033c7f323 Mon
Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 4 Jul 2021 06:37:30 +0100 Subject: [PATCH 054/175] Update GPU tests --- test/test-gpu.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/test/test-gpu.py b/test/test-gpu.py index ae917065..7f17ee4a 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -39,13 +39,14 @@ #refine model with GMM sys.stderr.write("Running model refinement (--fit-model refine)\n") -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --gpu-graph", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both --no-local --gpu-graph", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --unconstrained --gpu-graph", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1 --gpu-graph", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2 --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --manual-start manual.txt --overwrite --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --indiv-refine both --gpu-graph", shell=True, check=True) 
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --indiv-refine both --no-local --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --unconstrained --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --score-idx 1 --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --score-idx 2 --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold --gpu-graph", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both --gpu-graph", shell=True, check=True) # lineage clustering sys.stderr.write("Running lineage clustering test (--fit-model lineage)\n") From 0b26c310fe50035e58e4af8440621868f7b2039b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 5 Jul 2021 12:41:33 +0100 Subject: [PATCH 055/175] Fix import of GPU libraries --- PopPUNK/tsne.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PopPUNK/tsne.py b/PopPUNK/tsne.py index 0b171328..75234037 100644 --- a/PopPUNK/tsne.py +++ b/PopPUNK/tsne.py @@ -16,6 +16,7 @@ import cugraph import cudf import cupy as cp + from cuml import manifold as manifold_gpu from numba import cuda import rmm gpu_lib = True @@ -81,6 +82,7 @@ def get_options(): parser.add_argument('--output', required=True, help='Name of output file') parser.add_argument('--perplexity', help='Perplexity used to generate t-SNE 
projection [default = 30]', type=int, default=30) parser.add_argument('--verbosity', help='Verbosity level for t-SNE (0-3) [default = 0]', type=int, default=0) + parser.add_argument('--use-gpu', help='Whether to use GPU libraries for t-SNE calculation', default = False, action='store_true') return parser.parse_args() @@ -126,7 +128,7 @@ def main(): j += 1 # generate accessory genome distance representation - generate_tsne(seqLabels, accMat, args.perplexity, args.output, overwrite = True, verbosity = verbosity) + generate_tsne(seqLabels, accMat, args.perplexity, args.output, overwrite = True, use_gpu = args.use_gpu, verbosity = verbosity) if __name__ == "__main__": From 7fec15107d49ef00dcc287799f1ec6cac3a4ba94 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 5 Jul 2021 12:41:46 +0100 Subject: [PATCH 056/175] Update GPU command lines --- test/test-gpu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test-gpu.py b/test/test-gpu.py index 7f17ee4a..3e44a115 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -70,7 +70,7 @@ # viz sys.stderr.write("Running visualisations (poppunk_visualise)\n") subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --microreact --gpu-graph", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape --network-file example_db/example_db_graph.gt --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape --network-file example_db/example_db_graph.csv.gz --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --phandango --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree --gpu-graph", shell=True, 
check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt --gpu-graph", shell=True, check=True) @@ -85,11 +85,11 @@ # t-sne sys.stderr.write("Running tsne viz\n") -subprocess.run(python_cmd + " ../poppunk_tsne-runner.py --distances example_db/example_db.dists --output example_tsne --perplexity 5 --verbosity 1 --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_tsne-runner.py --distances example_db/example_db.dists --output example_tsne --perplexity 5 --verbosity 1 --use-gpu", shell=True, check=True) # prune sys.stderr.write("Running poppunk_prune\n") -subprocess.run(python_cmd + " ../poppunk_prune-runner.py --distances example_db/example_db.dists --ref-db example_db --remove subset.txt --output example_prune --gpu-dist", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_prune-runner.py --distances example_db/example_db.dists --ref-db example_db --remove subset.txt --output example_prune", shell=True, check=True) # references sys.stderr.write("Running poppunk_references\n") From 41e7821ad2f5105f54fcff1d6ca366467e8ab067 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 5 Jul 2021 13:03:48 +0100 Subject: [PATCH 057/175] Update reference pick to use GPU --- PopPUNK/reference_pick.py | 14 +++++++++++--- test/test-gpu.py | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/PopPUNK/reference_pick.py b/PopPUNK/reference_pick.py index 6e4bcd68..cec9e6a0 100755 --- a/PopPUNK/reference_pick.py +++ b/PopPUNK/reference_pick.py @@ -44,6 +44,7 @@ def get_options(): # processing other = parser.add_argument_group('Other options') other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]') + other.add_argument('--use-gpu', default=False, action='store_true', help='Whether to use GPUs') other.add_argument('--version', action='version', version='%(prog)s '+__version__) @@ -70,14 
+71,21 @@ def main(): refList, queryList, self, distMat = readPickle(args.distances, enforce_self=True) # Read in full network - genomeNetwork = gt.load_graph(args.network) + genomeNetwork = load_network_file(args.network, use_gpu = use_gpu) sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # This is the same set of function calls for --fit-model when no --full-db in __main__.py # Find refs and prune network reference_indices, reference_names, refFileName, G_ref = \ - extractReferences(genomeNetwork, refList, args.output, threads = args.threads) - G_ref.save(args.output + "/" + os.path.basename(args.output) + '_graph.gt', fmt = 'gt') + extractReferences(genomeNetwork, + refList, + args.output, + threads = args.threads, + use_gpu = args.use_gpu) + save_network(G_ref, + prefix = args.output, + suffix = ".refs_graph", + use_gpu = args.use_gpu) # Prune distances nodes_to_remove = set(range(len(refList))).difference(reference_indices) diff --git a/test/test-gpu.py b/test/test-gpu.py index 3e44a115..f9e997d2 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -93,7 +93,7 @@ # references sys.stderr.write("Running poppunk_references\n") -subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.gt --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.gt --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db --use-gpu", shell=True, check=True) # citations sys.stderr.write("Printing citations\n") From 53464cb76072ec0ab737d10890ed15c5f0f81379 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 5 Jul 2021 13:12:51 +0100 Subject: [PATCH 058/175] Fix function load --- PopPUNK/reference_pick.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) 
diff --git a/PopPUNK/reference_pick.py b/PopPUNK/reference_pick.py index cec9e6a0..d8b2be5f 100755 --- a/PopPUNK/reference_pick.py +++ b/PopPUNK/reference_pick.py @@ -14,6 +14,8 @@ from .sketchlib import removeFromDB from .network import extractReferences +from .network import load_network_file +from .network import save_network from .prune_db import prune_distance_matrix @@ -71,8 +73,11 @@ def main(): refList, queryList, self, distMat = readPickle(args.distances, enforce_self=True) # Read in full network - genomeNetwork = load_network_file(args.network, use_gpu = use_gpu) - sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") + genomeNetwork = load_network_file(args.network, use_gpu = args.use_gpu) + if args.use_gpu: + sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") + else: + sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # This is the same set of function calls for --fit-model when no --full-db in __main__.py # Find refs and prune network From e70b02ae994305c1dcb6b9ff2cfb4439e5dbd8ce Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 5 Jul 2021 13:27:01 +0100 Subject: [PATCH 059/175] Fix graph file name --- test/test-gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-gpu.py b/test/test-gpu.py index f9e997d2..e5b09183 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -93,7 +93,7 @@ # references sys.stderr.write("Running poppunk_references\n") -subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.gt --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db --use-gpu", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.csv.gz --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db 
--use-gpu", shell=True, check=True) # citations sys.stderr.write("Printing citations\n") From cf33f65f417cb40c3b5b2042db1243edf0a11f10 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 6 Jul 2021 12:23:22 +0100 Subject: [PATCH 060/175] Save MST network --- PopPUNK/visualise.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 2c90252c..2d7a4eef 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -179,6 +179,7 @@ def generate_visualisations(query_db, from .network import generate_minimum_spanning_tree from .network import load_network_file from .network import cugraph_to_graph_tool + from .network import save_network from .plot import drawMST from .plot import outputsForMicroreact @@ -410,6 +411,11 @@ def generate_visualisations(query_db, vals = isolateNameToLabel(combined_seq)) mst_graph.vp.id = vid drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite) + save_network(mst_graph, + prefix = output, + suffix = '_mst', + use_graphml = False, + use_gpu = gpu_graph) else: mst_tree = existing_tree From 52d71e0e5f594a11497f0933bb776db672072f64 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 6 Jul 2021 12:42:15 +0100 Subject: [PATCH 061/175] Move MST network save --- PopPUNK/visualise.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 2d7a4eef..fad33934 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -401,6 +401,12 @@ def generate_visualisations(query_db, G = cugraph.minimum_spanning_tree(G, weight='weights') mst_graph = generate_minimum_spanning_tree(G, gpu_graph) del G + # save outputs + save_network(mst_graph, + prefix = output, + suffix = '_mst', + use_graphml = False, + use_gpu = gpu_graph) mst_as_tree = mst_to_phylogeny(mst_graph, isolateNameToLabel(combined_seq), use_gpu = gpu_graph) @@ -411,11 +417,6 @@ def generate_visualisations(query_db, vals = 
isolateNameToLabel(combined_seq)) mst_graph.vp.id = vid drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite) - save_network(mst_graph, - prefix = output, - suffix = '_mst', - use_graphml = False, - use_gpu = gpu_graph) else: mst_tree = existing_tree From 70c14cca9d0076aedfabca7313521a585b7804a5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 6 Jul 2021 12:59:32 +0100 Subject: [PATCH 062/175] Save Newick MST --- PopPUNK/visualise.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index fad33934..c314d640 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -407,15 +407,17 @@ def generate_visualisations(query_db, suffix = '_mst', use_graphml = False, use_gpu = gpu_graph) - mst_as_tree = mst_to_phylogeny(mst_graph, - isolateNameToLabel(combined_seq), - use_gpu = gpu_graph) if gpu_graph: mst_graph = cugraph_to_graph_tool(mst_graph, isolateNameToLabel(combined_seq)) else: vid = mst_graph.new_vertex_property('string', vals = isolateNameToLabel(combined_seq)) mst_graph.vp.id = vid + mst_as_tree = mst_to_phylogeny(mst_graph, + isolateNameToLabel(combined_seq), + use_gpu = False) + with open(os.path.join(output,os.path.basename(output) + '_mst.nwk')) as tree_out: + tree_out.write(mst_as_tree) drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite) else: mst_tree = existing_tree From 58ad996832863733321bbba6e5bcc16dabee206e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 6 Jul 2021 13:02:04 +0100 Subject: [PATCH 063/175] Change file mode --- PopPUNK/visualise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index c314d640..d6e589b2 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -416,7 +416,7 @@ def generate_visualisations(query_db, mst_as_tree = mst_to_phylogeny(mst_graph, isolateNameToLabel(combined_seq), use_gpu = False) - with 
open(os.path.join(output,os.path.basename(output) + '_mst.nwk')) as tree_out: + with open(os.path.join(output,os.path.basename(output) + '_mst.nwk'),'w') as tree_out: tree_out.write(mst_as_tree) drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite) else: From f25752de7ac06fa53efd3c8fe3d91a18893d8047 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 06:30:58 +0100 Subject: [PATCH 064/175] Overwrite not append for weighted networks --- PopPUNK/network.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ce1920e0..794d715e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -787,16 +787,18 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, use_gpu = use_gpu) # Construct list of tuples for graph-tool # Include information from previous graph if supplied + weighted_edges = [] if weights is not None: for ((src, dest), weight) in zip(edge_list, weights): edge_list.append((src, dest, weight)) if previous_network is not None: for (src, dest, weight) in zip(extra_sources, extra_targets, extra_weights): - edge_list.append((src, dest, weight)) + weighted_edges.append((src, dest, weight)) else: if previous_network is not None: for (src, dest) in zip(extra_sources, extra_targets): - edge_list.append((src, dest)) + weighted_edges.append((src, dest)) + edge_list = weighted_edges # build the graph G = gt.Graph(directed = False) G.add_vertex(len(vertex_labels)) From 5ae2eab94e99846c963edb06edce6f7f769997f5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 08:35:44 +0100 Subject: [PATCH 065/175] Remove quotations from dendropy newick strings --- PopPUNK/trees.py | 2 ++ PopPUNK/visualise.py | 1 + 2 files changed, 3 insertions(+) diff --git a/PopPUNK/trees.py b/PopPUNK/trees.py index 712170f5..e240037d 100644 --- a/PopPUNK/trees.py +++ b/PopPUNK/trees.py @@ -174,6 +174,8 @@ def generate_nj_tree(coreMat, seqLabels, outPrefix, rapidnj, threads): 
tree_string = tree.as_string(schema="newick", suppress_rooting=True, unquoted_underscores=True) + tree_string = tree_string.replace("'","") + return tree_string def mst_to_phylogeny(mst_network, names, use_gpu = False): diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index d6e589b2..b748cfee 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -416,6 +416,7 @@ def generate_visualisations(query_db, mst_as_tree = mst_to_phylogeny(mst_graph, isolateNameToLabel(combined_seq), use_gpu = False) + mst_as_tree.replace("'","") with open(os.path.join(output,os.path.basename(output) + '_mst.nwk'),'w') as tree_out: tree_out.write(mst_as_tree) drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite) From 0e1f552627a11bb6d17cc46a712efe830bd048ca Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 08:37:19 +0100 Subject: [PATCH 066/175] Remove quotations from dendropy newick strings --- PopPUNK/visualise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index b748cfee..38d866d4 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -416,7 +416,7 @@ def generate_visualisations(query_db, mst_as_tree = mst_to_phylogeny(mst_graph, isolateNameToLabel(combined_seq), use_gpu = False) - mst_as_tree.replace("'","") + mst_as_tree = mst_as_tree.replace("'","") with open(os.path.join(output,os.path.basename(output) + '_mst.nwk'),'w') as tree_out: tree_out.write(mst_as_tree) drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite) From abecf50db7e39427d98d511a0981d45a10df5398 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 09:12:50 +0100 Subject: [PATCH 067/175] Fix weighted network error --- PopPUNK/network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 794d715e..0c8144cf 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -787,18 +787,18 @@ def 
construct_network_from_edge_list(rlist, qlist, edge_list, use_gpu = use_gpu) # Construct list of tuples for graph-tool # Include information from previous graph if supplied - weighted_edges = [] if weights is not None: + weighted_edges = [] for ((src, dest), weight) in zip(edge_list, weights): - edge_list.append((src, dest, weight)) + weighted_edges.append((src, dest, weight)) if previous_network is not None: for (src, dest, weight) in zip(extra_sources, extra_targets, extra_weights): weighted_edges.append((src, dest, weight)) + edge_list = weighted_edges else: if previous_network is not None: for (src, dest) in zip(extra_sources, extra_targets): - weighted_edges.append((src, dest)) - edge_list = weighted_edges + edge_list.append((src, dest)) # build the graph G = gt.Graph(directed = False) G.add_vertex(len(vertex_labels)) From b88e05af3593887886114f6212345460c162bb5e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 10:08:24 +0100 Subject: [PATCH 068/175] Change dense network construction --- PopPUNK/network.py | 63 ++++++++++++++++++++++++++++++++++++++++++-- PopPUNK/visualise.py | 13 +++------ 2 files changed, 65 insertions(+), 11 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0c8144cf..7e35f8d1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -978,6 +978,67 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G +def construct_dense_weighted_network(rlist, weights = None, use_gpu = False): + """Construct an undirected network using sequence lists, assignments of pairwise distances + to clusters, and the identifier of the cluster assigned to within-strain distances. 
+ Nodes are samples and edges where samples are within the same cluster + + Will print summary statistics about the network to ``STDERR`` + + Args: + rlist (list) + List of reference sequence labels + weights (list) + List of weights for each edge in the network + use_gpu (bool) + Whether to use GPUs for network construction + + Returns: + G (graph) + The resulting network + """ + # Check GPU library use + use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) + + # data structures + vertex_labels, self_comparison = initial_graph_properties(rlist, qlist) + + # Filter weights to only the relevant edges + if weights is None: + sys.stderr.write("Need weights to construct weighted network\n") + sys.exit(1) + + # Convert edge indices to tuples + edge_list = poppunk_refine.generateTuples(assignments, + within_label, + self = (rlist == qlist), + num_ref = len(rlist), + int_offset = int_offset) + + if use_gpu: + # Construct network with GPU via data frame + G_df = cudf.DataFrame(columns = ['source','destination','weights']) + G_df['source'] = [edge_list[0][0]] + G_df['destination'] = [edge_list[0][1]] + G_df['weights'] = weights + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = len(vertex_labels)-1 + G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) + else: + # Construct network with CPU via edge list + weighted_edges = [] + for ((src, dest), weight) in zip(edge_list, weights): + weighted_edges.append((src, dest, weight)) + # build the graph + G = gt.Graph(directed = False) + G.add_vertex(len(vertex_labels)) + eweight = G.new_ep("float") + G.add_edge_list(edge_list, eprops = [eweight]) + G.edge_properties["weight"] = eweight + + return G + + def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, int_offset = 0, weights = None, distMat = None, weights_type = None, previous_network = None, old_ids = None, adding_qq_dists = False, previous_pkl = None, 
betweenness_sample = betweenness_sample_default, @@ -1036,8 +1097,6 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = # Filter weights to only the relevant edges if weights is not None: - print("Weights: " + str(weights)) - print("Assignments: " + str(assignments)) weights = weights[assignments == within_label] elif distMat is not None and weights_type is not None: if isinstance(assignments, list): diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 38d866d4..d89c3205 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -174,7 +174,7 @@ def generate_visualisations(query_db, from .models import loadClusterFit - from .network import construct_network_from_assignments + from .network import construct_dense_weighted_network from .network import fetchNetwork from .network import generate_minimum_spanning_tree from .network import load_network_file @@ -389,14 +389,9 @@ def generate_visualisations(query_db, pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) # Dense network may be slow sys.stderr.write("Generating MST from dense distances (may be slow)\n") - G = construct_network_from_assignments(combined_seq, - combined_seq, - [0]*complete_distMat.shape[0], - within_label = 0, - distMat = complete_distMat, - weights_type = mst_distances, - use_gpu = gpu_graph, - summarise = False) + G = construct_dense_weighted_network(combined_seq, + weights = mst_distances, + use_gpu = gpu_graph) if gpu_graph: G = cugraph.minimum_spanning_tree(G, weight='weights') mst_graph = generate_minimum_spanning_tree(G, gpu_graph) From 6fb579dff31dba86171887d6c3b86e614691dffc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 10:14:44 +0100 Subject: [PATCH 069/175] Fix tuple generation --- PopPUNK/network.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7e35f8d1..7cf81670 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1001,7 
+1001,7 @@ def construct_dense_weighted_network(rlist, weights = None, use_gpu = False): use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) # data structures - vertex_labels, self_comparison = initial_graph_properties(rlist, qlist) + vertex_labels, self_comparison = initial_graph_properties(rlist, rlist) # Filter weights to only the relevant edges if weights is None: @@ -1009,8 +1009,8 @@ def construct_dense_weighted_network(rlist, weights = None, use_gpu = False): sys.exit(1) # Convert edge indices to tuples - edge_list = poppunk_refine.generateTuples(assignments, - within_label, + edge_list = poppunk_refine.generateTuples([0] * len(weights), + 0, self = (rlist == qlist), num_ref = len(rlist), int_offset = int_offset) From 2379cde5b65c4975d5d399f728092eb8fcdedc7e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 10:15:37 +0100 Subject: [PATCH 070/175] Fix tuple generation --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7cf81670..24eb7ab8 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1011,7 +1011,7 @@ def construct_dense_weighted_network(rlist, weights = None, use_gpu = False): # Convert edge indices to tuples edge_list = poppunk_refine.generateTuples([0] * len(weights), 0, - self = (rlist == qlist), + self = True, num_ref = len(rlist), int_offset = int_offset) From 6bb281ebcb7b1dee64d0ebb631ab2340f94f977c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 10:16:25 +0100 Subject: [PATCH 071/175] Fix tuple generation --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 24eb7ab8..28a2fea4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1013,7 +1013,7 @@ def construct_dense_weighted_network(rlist, weights = None, use_gpu = False): 0, self = True, num_ref = len(rlist), - int_offset = int_offset) + int_offset = 0) if use_gpu: # 
Construct network with GPU via data frame From 149110fbe6e6958d5a0508d8cd637e349a78c2ce Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 10:27:11 +0100 Subject: [PATCH 072/175] All dtype flexibility --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 28a2fea4..09d4e03a 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1017,7 +1017,7 @@ def construct_dense_weighted_network(rlist, weights = None, use_gpu = False): if use_gpu: # Construct network with GPU via data frame - G_df = cudf.DataFrame(columns = ['source','destination','weights']) + G_df = cudf.DataFrame(columns = ['source','destination']) G_df['source'] = [edge_list[0][0]] G_df['destination'] = [edge_list[0][1]] G_df['weights'] = weights From 0124b66effb1561b9c0cb48cca42f2929c4cfdb4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 10:32:09 +0100 Subject: [PATCH 073/175] Change distance processing --- PopPUNK/network.py | 11 ++++++++--- PopPUNK/visualise.py | 3 ++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 09d4e03a..41229ab2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -978,7 +978,7 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G -def construct_dense_weighted_network(rlist, weights = None, use_gpu = False): +def construct_dense_weighted_network(rlist, distMat = None, weights_type = None, use_gpu = False): """Construct an undirected network using sequence lists, assignments of pairwise distances to clusters, and the identifier of the cluster assigned to within-strain distances. 
Nodes are samples and edges where samples are within the same cluster @@ -988,8 +988,10 @@ def construct_dense_weighted_network(rlist, weights = None, use_gpu = False): Args: rlist (list) List of reference sequence labels - weights (list) - List of weights for each edge in the network + distMat (2 column ndarray) + Numpy array of pairwise distances + weights_type (str) + Type of weight to use for network use_gpu (bool) Whether to use GPUs for network construction @@ -1008,6 +1010,9 @@ def construct_dense_weighted_network(rlist, weights = None, use_gpu = False): sys.stderr.write("Need weights to construct weighted network\n") sys.exit(1) + # Process weights + weights = process_weights(distMat, weights_type) + # Convert edge indices to tuples edge_list = poppunk_refine.generateTuples([0] * len(weights), 0, diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index d89c3205..45d75b3c 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -390,7 +390,8 @@ def generate_visualisations(query_db, # Dense network may be slow sys.stderr.write("Generating MST from dense distances (may be slow)\n") G = construct_dense_weighted_network(combined_seq, - weights = mst_distances, + distMat = distMat, + weights_type = mst_distances, use_gpu = gpu_graph) if gpu_graph: G = cugraph.minimum_spanning_tree(G, weight='weights') From 0fcde0d0496a58a458a318ceb3d64015f2dfc928 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 10:33:37 +0100 Subject: [PATCH 074/175] Fix distmat name --- PopPUNK/visualise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 45d75b3c..cb04fb00 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -390,7 +390,7 @@ def generate_visualisations(query_db, # Dense network may be slow sys.stderr.write("Generating MST from dense distances (may be slow)\n") G = construct_dense_weighted_network(combined_seq, - distMat = distMat, + distMat = complete_distMat, 
weights_type = mst_distances, use_gpu = gpu_graph) if gpu_graph: From 1c853e60c7e77d585444a604cb7a96fcf417a281 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 10:38:56 +0100 Subject: [PATCH 075/175] Change network df construction --- PopPUNK/network.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 41229ab2..abcb0d4c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1022,9 +1022,9 @@ def construct_dense_weighted_network(rlist, distMat = None, weights_type = None, if use_gpu: # Construct network with GPU via data frame - G_df = cudf.DataFrame(columns = ['source','destination']) - G_df['source'] = [edge_list[0][0]] - G_df['destination'] = [edge_list[0][1]] + edge_array = cp.array(edge_list, dtype = np.int32) + edge_gpu_matrix = cuda.to_device(edge_array) + G_df = cudf.DataFrame(edge_gpu_matrix, columns = ['source','destination']) G_df['weights'] = weights max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(vertex_labels)-1 From 32b24d4fecd6d29d37731ab7e95a976b1c62ca59 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 10:46:26 +0100 Subject: [PATCH 076/175] Remove custom function as unnecessary --- PopPUNK/network.py | 66 -------------------------------------------- PopPUNK/visualise.py | 10 +++++-- 2 files changed, 7 insertions(+), 69 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index abcb0d4c..bbcaf195 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -978,72 +978,6 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G -def construct_dense_weighted_network(rlist, distMat = None, weights_type = None, use_gpu = False): - """Construct an undirected network using sequence lists, assignments of pairwise distances - to clusters, and the identifier of the cluster assigned 
to within-strain distances. - Nodes are samples and edges where samples are within the same cluster - - Will print summary statistics about the network to ``STDERR`` - - Args: - rlist (list) - List of reference sequence labels - distMat (2 column ndarray) - Numpy array of pairwise distances - weights_type (str) - Type of weight to use for network - use_gpu (bool) - Whether to use GPUs for network construction - - Returns: - G (graph) - The resulting network - """ - # Check GPU library use - use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) - - # data structures - vertex_labels, self_comparison = initial_graph_properties(rlist, rlist) - - # Filter weights to only the relevant edges - if weights is None: - sys.stderr.write("Need weights to construct weighted network\n") - sys.exit(1) - - # Process weights - weights = process_weights(distMat, weights_type) - - # Convert edge indices to tuples - edge_list = poppunk_refine.generateTuples([0] * len(weights), - 0, - self = True, - num_ref = len(rlist), - int_offset = 0) - - if use_gpu: - # Construct network with GPU via data frame - edge_array = cp.array(edge_list, dtype = np.int32) - edge_gpu_matrix = cuda.to_device(edge_array) - G_df = cudf.DataFrame(edge_gpu_matrix, columns = ['source','destination']) - G_df['weights'] = weights - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - max_in_vertex_labels = len(vertex_labels)-1 - G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) - else: - # Construct network with CPU via edge list - weighted_edges = [] - for ((src, dest), weight) in zip(edge_list, weights): - weighted_edges.append((src, dest, weight)) - # build the graph - G = gt.Graph(directed = False) - G.add_vertex(len(vertex_labels)) - eweight = G.new_ep("float") - G.add_edge_list(edge_list, eprops = [eweight]) - G.edge_properties["weight"] = eweight - - return G - - def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, 
int_offset = 0, weights = None, distMat = None, weights_type = None, previous_network = None, old_ids = None, adding_qq_dists = False, previous_pkl = None, betweenness_sample = betweenness_sample_default, diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index cb04fb00..38d866d4 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -174,7 +174,7 @@ def generate_visualisations(query_db, from .models import loadClusterFit - from .network import construct_dense_weighted_network + from .network import construct_network_from_assignments from .network import fetchNetwork from .network import generate_minimum_spanning_tree from .network import load_network_file @@ -389,10 +389,14 @@ def generate_visualisations(query_db, pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) # Dense network may be slow sys.stderr.write("Generating MST from dense distances (may be slow)\n") - G = construct_dense_weighted_network(combined_seq, + G = construct_network_from_assignments(combined_seq, + combined_seq, + [0]*complete_distMat.shape[0], + within_label = 0, distMat = complete_distMat, weights_type = mst_distances, - use_gpu = gpu_graph) + use_gpu = gpu_graph, + summarise = False) if gpu_graph: G = cugraph.minimum_spanning_tree(G, weight='weights') mst_graph = generate_minimum_spanning_tree(G, gpu_graph) From dcf8afbec605a870ff8076d8220af22c2267482a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 10:59:28 +0100 Subject: [PATCH 077/175] Restore function for modification --- PopPUNK/network.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bbcaf195..41229ab2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -978,6 +978,72 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G +def construct_dense_weighted_network(rlist, distMat 
= None, weights_type = None, use_gpu = False): + """Construct an undirected network using sequence lists, assignments of pairwise distances + to clusters, and the identifier of the cluster assigned to within-strain distances. + Nodes are samples and edges where samples are within the same cluster + + Will print summary statistics about the network to ``STDERR`` + + Args: + rlist (list) + List of reference sequence labels + distMat (2 column ndarray) + Numpy array of pairwise distances + weights_type (str) + Type of weight to use for network + use_gpu (bool) + Whether to use GPUs for network construction + + Returns: + G (graph) + The resulting network + """ + # Check GPU library use + use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) + + # data structures + vertex_labels, self_comparison = initial_graph_properties(rlist, rlist) + + # Filter weights to only the relevant edges + if weights is None: + sys.stderr.write("Need weights to construct weighted network\n") + sys.exit(1) + + # Process weights + weights = process_weights(distMat, weights_type) + + # Convert edge indices to tuples + edge_list = poppunk_refine.generateTuples([0] * len(weights), + 0, + self = True, + num_ref = len(rlist), + int_offset = 0) + + if use_gpu: + # Construct network with GPU via data frame + G_df = cudf.DataFrame(columns = ['source','destination']) + G_df['source'] = [edge_list[0][0]] + G_df['destination'] = [edge_list[0][1]] + G_df['weights'] = weights + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = len(vertex_labels)-1 + G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) + else: + # Construct network with CPU via edge list + weighted_edges = [] + for ((src, dest), weight) in zip(edge_list, weights): + weighted_edges.append((src, dest, weight)) + # build the graph + G = gt.Graph(directed = False) + G.add_vertex(len(vertex_labels)) + eweight = G.new_ep("float") + G.add_edge_list(edge_list, 
eprops = [eweight]) + G.edge_properties["weight"] = eweight + + return G + + def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, int_offset = 0, weights = None, distMat = None, weights_type = None, previous_network = None, old_ids = None, adding_qq_dists = False, previous_pkl = None, betweenness_sample = betweenness_sample_default, From 80c763f9b6280dca3a59fb782ee87f746c455fa9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 11:20:15 +0100 Subject: [PATCH 078/175] Remove unneeded function after benchmarking --- PopPUNK/network.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bbcaf195..7fab7128 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -978,6 +978,73 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G +def construct_dense_weighted_network(rlist, distMat = None, weights_type = None, use_gpu = False): + """Construct an undirected network using sequence lists, assignments of pairwise distances + to clusters, and the identifier of the cluster assigned to within-strain distances. 
+ Nodes are samples and edges where samples are within the same cluster + + Will print summary statistics about the network to ``STDERR`` + + Args: + rlist (list) + List of reference sequence labels + distMat (2 column ndarray) + Numpy array of pairwise distances + weights_type (str) + Type of weight to use for network + use_gpu (bool) + Whether to use GPUs for network construction + + Returns: + G (graph) + The resulting network + """ + # Check GPU library use + use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) + + # data structures + vertex_labels, self_comparison = initial_graph_properties(rlist, rlist) + + # Filter weights to only the relevant edges + if weights is None: + sys.stderr.write("Need weights to construct weighted network\n") + sys.exit(1) + + # Process weights + weights = process_weights(distMat, weights_type) + + # Convert edge indices to tuples + edge_list = poppunk_refine.generateTuples([0] * len(weights), + 0, + self = True, + num_ref = len(rlist), + int_offset = 0) + + if use_gpu: + # Construct network with GPU via data frame + G_df = cudf.DataFrame(columns = ['source','destination']) + G_df['source'] = [edge_list[0][0]] + G_df['destination'] = [edge_list[0][1]] + G_df['weights'] = weights + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = len(vertex_labels)-1 + G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) + else: + # Construct network with CPU via edge list + #weighted_edges = [] + for ((src, dest), weight) in zip(edge_list, weights): + weighted_edges.append((src, dest, weight)) + # build the graph + G = gt.Graph(directed = False) + G.add_vertex(len(vertex_labels)) + eweight = G.new_ep("float") + # Could alternatively assign weights through eweight.a = weights + G.add_edge_list(weighted_edges, eprops = [eweight]) + G.edge_properties["weight"] = eweight + + return G + + def construct_network_from_assignments(rlist, qlist, assignments, within_label 
= 1, int_offset = 0, weights = None, distMat = None, weights_type = None, previous_network = None, old_ids = None, adding_qq_dists = False, previous_pkl = None, betweenness_sample = betweenness_sample_default, From e8c978cd4594c838dafe50f782bff064b81f4517 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 11:21:19 +0100 Subject: [PATCH 079/175] Remove function after further benchmarking --- PopPUNK/network.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bbcaf195..41229ab2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -978,6 +978,72 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G +def construct_dense_weighted_network(rlist, distMat = None, weights_type = None, use_gpu = False): + """Construct an undirected network using sequence lists, assignments of pairwise distances + to clusters, and the identifier of the cluster assigned to within-strain distances. 
+ Nodes are samples and edges where samples are within the same cluster + + Will print summary statistics about the network to ``STDERR`` + + Args: + rlist (list) + List of reference sequence labels + distMat (2 column ndarray) + Numpy array of pairwise distances + weights_type (str) + Type of weight to use for network + use_gpu (bool) + Whether to use GPUs for network construction + + Returns: + G (graph) + The resulting network + """ + # Check GPU library use + use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) + + # data structures + vertex_labels, self_comparison = initial_graph_properties(rlist, rlist) + + # Filter weights to only the relevant edges + if weights is None: + sys.stderr.write("Need weights to construct weighted network\n") + sys.exit(1) + + # Process weights + weights = process_weights(distMat, weights_type) + + # Convert edge indices to tuples + edge_list = poppunk_refine.generateTuples([0] * len(weights), + 0, + self = True, + num_ref = len(rlist), + int_offset = 0) + + if use_gpu: + # Construct network with GPU via data frame + G_df = cudf.DataFrame(columns = ['source','destination']) + G_df['source'] = [edge_list[0][0]] + G_df['destination'] = [edge_list[0][1]] + G_df['weights'] = weights + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = len(vertex_labels)-1 + G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) + else: + # Construct network with CPU via edge list + weighted_edges = [] + for ((src, dest), weight) in zip(edge_list, weights): + weighted_edges.append((src, dest, weight)) + # build the graph + G = gt.Graph(directed = False) + G.add_vertex(len(vertex_labels)) + eweight = G.new_ep("float") + G.add_edge_list(edge_list, eprops = [eweight]) + G.edge_properties["weight"] = eweight + + return G + + def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, int_offset = 0, weights = None, distMat = None, weights_type = None, 
previous_network = None, old_ids = None, adding_qq_dists = False, previous_pkl = None, betweenness_sample = betweenness_sample_default, From 4c500d15a243ebe6d9421d943edd353a1092cce0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 11:22:29 +0100 Subject: [PATCH 080/175] Remove function after further benchmarking --- PopPUNK/network.py | 66 ---------------------------------------------- 1 file changed, 66 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 41229ab2..bbcaf195 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -978,72 +978,6 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G -def construct_dense_weighted_network(rlist, distMat = None, weights_type = None, use_gpu = False): - """Construct an undirected network using sequence lists, assignments of pairwise distances - to clusters, and the identifier of the cluster assigned to within-strain distances. 
- Nodes are samples and edges where samples are within the same cluster - - Will print summary statistics about the network to ``STDERR`` - - Args: - rlist (list) - List of reference sequence labels - distMat (2 column ndarray) - Numpy array of pairwise distances - weights_type (str) - Type of weight to use for network - use_gpu (bool) - Whether to use GPUs for network construction - - Returns: - G (graph) - The resulting network - """ - # Check GPU library use - use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) - - # data structures - vertex_labels, self_comparison = initial_graph_properties(rlist, rlist) - - # Filter weights to only the relevant edges - if weights is None: - sys.stderr.write("Need weights to construct weighted network\n") - sys.exit(1) - - # Process weights - weights = process_weights(distMat, weights_type) - - # Convert edge indices to tuples - edge_list = poppunk_refine.generateTuples([0] * len(weights), - 0, - self = True, - num_ref = len(rlist), - int_offset = 0) - - if use_gpu: - # Construct network with GPU via data frame - G_df = cudf.DataFrame(columns = ['source','destination']) - G_df['source'] = [edge_list[0][0]] - G_df['destination'] = [edge_list[0][1]] - G_df['weights'] = weights - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - max_in_vertex_labels = len(vertex_labels)-1 - G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) - else: - # Construct network with CPU via edge list - weighted_edges = [] - for ((src, dest), weight) in zip(edge_list, weights): - weighted_edges.append((src, dest, weight)) - # build the graph - G = gt.Graph(directed = False) - G.add_vertex(len(vertex_labels)) - eweight = G.new_ep("float") - G.add_edge_list(edge_list, eprops = [eweight]) - G.edge_properties["weight"] = eweight - - return G - - def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, int_offset = 0, weights = None, distMat = None, weights_type = None, 
previous_network = None, old_ids = None, adding_qq_dists = False, previous_pkl = None, betweenness_sample = betweenness_sample_default, From d819829b774d02088ed93dee5e9d055fd64a566c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 11:41:14 +0100 Subject: [PATCH 081/175] Update QC of query distances --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 52ec1968..1ac36121 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -267,7 +267,7 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): from .sketchlib import pickTypeIsolate # Create overall list of sequences - if refList == refList: + if refList == queryList: seq_names_passing = refList else: seq_names_passing = refList + queryList From 3031454f315d3f18044107b5a2afdd7f99d8a7e9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 7 Jul 2021 13:11:21 +0100 Subject: [PATCH 082/175] Update QC routines --- PopPUNK/models.py | 2 +- PopPUNK/utils.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index cca24f11..9ec55d38 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1172,7 +1172,7 @@ def extend(self, qqDists, qrDists): dist_row, dist_col, dist = scipy.sparse.find(sample_row) else: dist_row, dist_col, dist = scipy.sparse.find(sample_row) - dist = [epsilon if d < epsilon else d for d in dist] + dist[dist < epsilon] = epsilon dist_idx_sort = np.argsort(dist) # Identical to C++ code in matrix_ops.cpp:sparsify_dists diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 1ac36121..4f81e33d 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -291,7 +291,11 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): # First check with numpy, which is quicker than iterating over everything #long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist() 
long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])],0,1)[0].tolist() - long_edges = poppunk_refine.generateTuples(long_distance_rows, 0) + long_edges = poppunk_refine.generateTuples(long_distance_rows, + 0, + self = False, + num_ref = len(refList), + int_offset = 0) if len(long_edges) > 0: # Prune sequences based on reference sequence for (s,t) in long_edges: From 8940556623fd82285f0d707d61da66ca5c15693b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 06:48:46 +0100 Subject: [PATCH 083/175] Fix indexing of query values --- src/boundary.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boundary.cpp b/src/boundary.cpp index d7368600..1ec62fdd 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -136,7 +136,7 @@ edge_tuple generate_tuples(const std::vector &assignments, for (long row_idx = 0; row_idx < n_rows; row_idx++) { if (assignments[row_idx] == within_label) { unsigned long i = row_idx % num_ref + int_offset; - unsigned long j = static_cast(row_idx / (float)num_ref + 0.001f) + num_ref + int_offset; + unsigned long j = row_idx / num_ref + num_ref + int_offset; long min_node = std::min(i,j); long max_node = std::max(i,j); edge_vec.push_back(std::make_tuple(min_node, max_node)); From a2dd161ffe382d1070588dc246fb6fa1bd2ac6fa Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 08:55:30 +0100 Subject: [PATCH 084/175] Make query functions consistent with arguments --- PopPUNK/assign.py | 3 ++- PopPUNK/sketchlib.py | 21 +++++++++++++++++++++ PopPUNK/utils.py | 2 +- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index fd9fce67..970b75b4 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -234,7 +234,8 @@ def assign_query(dbFuncs, rNames + qNames, edge_list = assignment, weights = weights, - use_gpu = gpu_graph) + use_gpu = gpu_graph, + summarise = False) isolateClustering[rank] = \ 
printClusters(genomeNetwork[rank], diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 71afc4b4..28207fb6 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -568,6 +568,27 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num query_db = queryPrefix + "/" + os.path.basename(queryPrefix) distMat = pp_sketchlib.queryDatabase(ref_db, query_db, rNames, qNames, klist, True, False, threads, use_gpu, deviceid) + + # option to plot core/accessory fits. Choose a random number from cmd line option + if number_plot_fits > 0: + jacobian = -np.hstack((np.ones((klist.shape[0], 1)), klist.reshape(-1, 1))) + for plot_idx in range(number_plot_fits): + ref_example = sample(rNames, k=1) + query_example = sample(qNames, k=1) + raw = np.zeros(len(klist)) + corrected = np.zeros(len(klist)) + for kidx, kmer in enumerate(klist): + raw[kidx] = pp_sketchlib.jaccardDist(ref_db, ref_example, query_example, kmer, False) + corrected[kidx] = pp_sketchlib.jaccardDist(ref_db, ref_example, query_example, kmer, True) + raw_fit = fitKmerCurve(raw, klist, jacobian) + corrected_fit = fitKmerCurve(corrected, klist, jacobian) + plot_fit(klist, + raw, + raw_fit, + corrected, + corrected_fit, + dbPrefix + "/" + dbPrefix + "_fit_example_" + str(plot_idx + 1), + "Example fit " + str(plot_idx + 1) + " - " + example[0] + " vs. 
" + example[1]) return distMat diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 4f81e33d..8d160bac 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -293,7 +293,7 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])],0,1)[0].tolist() long_edges = poppunk_refine.generateTuples(long_distance_rows, 0, - self = False, + self = (refList == queryList), num_ref = len(refList), int_offset = 0) if len(long_edges) > 0: From 121cdd3bfcdd0bb11d7216f99745459ceed08a71 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 11:26:58 +0100 Subject: [PATCH 085/175] Remove jaccard wrapped from queries --- PopPUNK/sketchlib.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 28207fb6..4b964de9 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -578,8 +578,24 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num raw = np.zeros(len(klist)) corrected = np.zeros(len(klist)) for kidx, kmer in enumerate(klist): - raw[kidx] = pp_sketchlib.jaccardDist(ref_db, ref_example, query_example, kmer, False) - corrected[kidx] = pp_sketchlib.jaccardDist(ref_db, ref_example, query_example, kmer, True) + raw[kidx] = pp_sketchlib.queryDatabase(ref_db, + query_db, + ref_example[0], + query_example[0], + kmer, + random_correct = False, + jaccard = True, + num_threads = threads, + use_gpu = use_gpu) + corrected[kidx] = pp_sketchlib.jaccardDist(ref_db, + query_db, + ref_example[0], + query_example[0], + kmer, + random_correct = True, + jaccard = True, + num_threads = threads, + use_gpu = use_gpu) raw_fit = fitKmerCurve(raw, klist, jacobian) corrected_fit = fitKmerCurve(corrected, klist, jacobian) plot_fit(klist, From c6b5cac43e47541ed9be111ca14699f37f99fc9c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 
11:28:55 +0100 Subject: [PATCH 086/175] Change string to list --- PopPUNK/sketchlib.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 4b964de9..dbaa2d6e 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -580,8 +580,8 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num for kidx, kmer in enumerate(klist): raw[kidx] = pp_sketchlib.queryDatabase(ref_db, query_db, - ref_example[0], - query_example[0], + ref_example, + query_example, kmer, random_correct = False, jaccard = True, @@ -589,8 +589,8 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num use_gpu = use_gpu) corrected[kidx] = pp_sketchlib.jaccardDist(ref_db, query_db, - ref_example[0], - query_example[0], + ref_example, + query_example, kmer, random_correct = True, jaccard = True, From e70812ddfdc32a044678f15b323df54a0c6a9c52 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 11:31:53 +0100 Subject: [PATCH 087/175] Change to kmer list --- PopPUNK/sketchlib.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index dbaa2d6e..82acc4cc 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -577,25 +577,24 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num query_example = sample(qNames, k=1) raw = np.zeros(len(klist)) corrected = np.zeros(len(klist)) - for kidx, kmer in enumerate(klist): - raw[kidx] = pp_sketchlib.queryDatabase(ref_db, - query_db, - ref_example, - query_example, - kmer, - random_correct = False, - jaccard = True, - num_threads = threads, - use_gpu = use_gpu) - corrected[kidx] = pp_sketchlib.jaccardDist(ref_db, - query_db, - ref_example, - query_example, - kmer, - random_correct = True, - jaccard = True, - num_threads = threads, - use_gpu = use_gpu) + raw = pp_sketchlib.queryDatabase(ref_db, + 
query_db, + ref_example, + query_example, + klist, + random_correct = False, + jaccard = True, + num_threads = threads, + use_gpu = use_gpu) + corrected = pp_sketchlib.jaccardDist(ref_db, + query_db, + ref_example, + query_example, + klist, + random_correct = True, + jaccard = True, + num_threads = threads, + use_gpu = use_gpu) raw_fit = fitKmerCurve(raw, klist, jacobian) corrected_fit = fitKmerCurve(corrected, klist, jacobian) plot_fit(klist, From 46c1187ac0d8464bb477fd1de2467d9b233115ed Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 11:32:58 +0100 Subject: [PATCH 088/175] Switch off GPUs for Jaccard calculation --- PopPUNK/sketchlib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 82acc4cc..974f32c5 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -585,7 +585,7 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num random_correct = False, jaccard = True, num_threads = threads, - use_gpu = use_gpu) + use_gpu = False) corrected = pp_sketchlib.jaccardDist(ref_db, query_db, ref_example, @@ -594,7 +594,7 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num random_correct = True, jaccard = True, num_threads = threads, - use_gpu = use_gpu) + use_gpu = False) raw_fit = fitKmerCurve(raw, klist, jacobian) corrected_fit = fitKmerCurve(corrected, klist, jacobian) plot_fit(klist, From 401271f52c71b073ad04f1e6fed7071ddc749b10 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 11:34:25 +0100 Subject: [PATCH 089/175] Change sketchlib function name --- PopPUNK/sketchlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 974f32c5..f47060ac 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -586,7 +586,7 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num jaccard = True, num_threads = 
threads, use_gpu = False) - corrected = pp_sketchlib.jaccardDist(ref_db, + corrected = pp_sketchlib.queryDatabase(ref_db, query_db, ref_example, query_example, From 30f98532a8c299a3b281e950fb5e7b00b1326324 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 11:54:47 +0100 Subject: [PATCH 090/175] Fix fit example plotting for queries --- PopPUNK/sketchlib.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index f47060ac..b17f82f6 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -575,8 +575,6 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num for plot_idx in range(number_plot_fits): ref_example = sample(rNames, k=1) query_example = sample(qNames, k=1) - raw = np.zeros(len(klist)) - corrected = np.zeros(len(klist)) raw = pp_sketchlib.queryDatabase(ref_db, query_db, ref_example, @@ -595,15 +593,15 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num jaccard = True, num_threads = threads, use_gpu = False) - raw_fit = fitKmerCurve(raw, klist, jacobian) - corrected_fit = fitKmerCurve(corrected, klist, jacobian) + raw_fit = fitKmerCurve(raw[0], klist, jacobian) + corrected_fit = fitKmerCurve(corrected[0], klist, jacobian) plot_fit(klist, - raw, + raw[0], raw_fit, - corrected, + corrected[0], corrected_fit, - dbPrefix + "/" + dbPrefix + "_fit_example_" + str(plot_idx + 1), - "Example fit " + str(plot_idx + 1) + " - " + example[0] + " vs. " + example[1]) + queryPrefix + "/" + queryPrefix + "_fit_example_" + str(plot_idx + 1), + "Example fit " + str(plot_idx + 1) + " - " + ref_example[0] + " vs. 
" + query_example[0]) return distMat From 346c98c4bf791018645acdd9283bdabdba02315a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 12:31:01 +0100 Subject: [PATCH 091/175] Remove jaccardDist function --- PopPUNK/sketchlib.py | 72 +++++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index b17f82f6..8de3591d 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -544,15 +544,30 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num example = sample(rNames, k=2) raw = np.zeros(len(klist)) corrected = np.zeros(len(klist)) - for kidx, kmer in enumerate(klist): - raw[kidx] = pp_sketchlib.jaccardDist(ref_db, example[0], example[1], kmer, False) - corrected[kidx] = pp_sketchlib.jaccardDist(ref_db, example[0], example[1], kmer, True) - raw_fit = fitKmerCurve(raw, klist, jacobian) - corrected_fit = fitKmerCurve(corrected, klist, jacobian) + raw = pp_sketchlib.queryDatabase(ref_db, + query_db, + example[0], + example[1], + klist, + random_correct = False, + jaccard = True, + num_threads = threads, + use_gpu = False) + corrected = pp_sketchlib.queryDatabase(ref_db, + query_db, + example[0], + example[1], + klist, + random_correct = True, + jaccard = True, + num_threads = threads, + use_gpu = False) + raw_fit = fitKmerCurve(raw[0], klist, jacobian) + corrected_fit = fitKmerCurve(corrected[0], klist, jacobian) plot_fit(klist, - raw, + raw[0], raw_fit, - corrected, + corrected[0], corrected_fit, dbPrefix + "/" + dbPrefix + "_fit_example_" + str(plot_idx + 1), "Example fit " + str(plot_idx + 1) + " - " + example[0] + " vs. " + example[1]) @@ -572,36 +587,37 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num # option to plot core/accessory fits. 
Choose a random number from cmd line option if number_plot_fits > 0: jacobian = -np.hstack((np.ones((klist.shape[0], 1)), klist.reshape(-1, 1))) - for plot_idx in range(number_plot_fits): - ref_example = sample(rNames, k=1) - query_example = sample(qNames, k=1) - raw = pp_sketchlib.queryDatabase(ref_db, + ref_examples = sample(rNames, k = number_plot_fits) + query_examples = sample(qNames, k = number_plot_fits) + raw = pp_sketchlib.queryDatabase(ref_db, + query_db, + ref_examples, + query_examples, + klist, + random_correct = False, + jaccard = True, + num_threads = threads, + use_gpu = False) + corrected = pp_sketchlib.queryDatabase(ref_db, query_db, - ref_example, - query_example, + ref_examples, + query_examples, klist, - random_correct = False, + random_correct = True, jaccard = True, num_threads = threads, use_gpu = False) - corrected = pp_sketchlib.queryDatabase(ref_db, - query_db, - ref_example, - query_example, - klist, - random_correct = True, - jaccard = True, - num_threads = threads, - use_gpu = False) - raw_fit = fitKmerCurve(raw[0], klist, jacobian) - corrected_fit = fitKmerCurve(corrected[0], klist, jacobian) + for plot_idx in range(number_plot_fits): + raw_fit = fitKmerCurve(raw[plot_idx], klist, jacobian) + corrected_fit = fitKmerCurve(corrected[plot_idx], klist, jacobian) plot_fit(klist, - raw[0], + raw[plot_idx], raw_fit, - corrected[0], + corrected[plot_idx], corrected_fit, queryPrefix + "/" + queryPrefix + "_fit_example_" + str(plot_idx + 1), - "Example fit " + str(plot_idx + 1) + " - " + ref_example[0] + " vs. " + query_example[0]) + "Example fit " + str(plot_idx + 1) + " - " + ref_examples[plot_idx] + \ + " vs. 
" + query_examples[plot_idx]) return distMat From 678ac8af3b2255c024ddd703786bfb783cfd355f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 19:48:45 +0100 Subject: [PATCH 092/175] Use cupyx functions --- PopPUNK/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 9ec55d38..fdc4501f 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1036,7 +1036,7 @@ def fit(self, X, accessory): 0, rank ) - data = [epsilon if d < epsilon else d for d in data] + data[data < epsilon] = epsilon if self.use_gpu: self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix((cp.array(data),(cp.array(row),cp.array(col))), shape=(sample_size, sample_size), @@ -1150,7 +1150,7 @@ def extend(self, qqDists, qrDists): for rank in self.ranks: # Add the matrices together to make a large square matrix if self.use_gpu: - full_mat = scipy.sparse.bmat([[self.nn_dists[rank], + full_mat = cupyx.scipy.sparse.bmat([[self.nn_dists[rank], qrRect.transpose()], [qrRect,qqSquare]], format = 'csr', @@ -1169,7 +1169,7 @@ def extend(self, qqDists, qrDists): for row_idx in range(full_mat.shape[0]): sample_row = full_mat.getrow(row_idx) if self.use_gpu: - dist_row, dist_col, dist = scipy.sparse.find(sample_row) + dist_row, dist_col, dist = cupyx.scipy.sparse.find(sample_row) else: dist_row, dist_col, dist = scipy.sparse.find(sample_row) dist[dist < epsilon] = epsilon From 08c7a02c703abf96ee94bb8ab357b670a549a702 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 8 Jul 2021 22:21:24 +0100 Subject: [PATCH 093/175] Update and document lineage assignment --- PopPUNK/models.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index fdc4501f..5a0d6ba1 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1137,7 +1137,27 @@ def edge_weights(self, rank): else: return (self.nn_dists[rank].data) - def extend(self, qqDists, qrDists): + def 
extend(self, qqDists, qrDists) + '''Update the sparse distance matrix of nearest neighbours after querying + + Args: + qqDists (numpy or cupy ndarray) + Two column array of query-query distances + qqDists (numpy or cupy ndarray) + Two column array of reference-query distances + Returns: + y (list of tuples) + Edges to include in network + ''' + + # Check if model requires GPU + check_and_set_gpu(self.use_gpu, gpu_lib, quit_on_fail = True) + + # Convert data structures if using GPU + if self.use_gpu: + qqDists = cp.array(qqDists) + qrDists = cp.array(qrDists) + # Reshape qq and qr dist matrices qqSquare = pp_sketchlib.longToSquare(qqDists[:, [self.dist_col]], self.threads) qqSquare[qqSquare < epsilon] = epsilon From 0dc0745fa220b269f2afd2cc4b6800e73071781c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 12:10:39 +0100 Subject: [PATCH 094/175] Visualisation using sparse distance matrix --- PopPUNK/sparse_mst.py | 88 +++++++++--------- PopPUNK/visualise.py | 201 +++++++++++++++++++++++++++++------------- 2 files changed, 188 insertions(+), 101 deletions(-) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 1a2f86d0..755c185f 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -73,6 +73,50 @@ def get_options(): return parser.parse_args() +def generate_mst_from_sparse_input(sparse_mat, distance_pkl, previous_mst = None, gpu_graph = False): + if gpu_graph: + # Load previous MST if specified + if previous_mst is not None: + extra_sources, extra_targets, extra_weights = network_to_edges(previous_mst, + rlist, + previous_pkl = distance_pkl, + weights = True, + use_gpu = use_gpu) + sources = np.append(sparse_mat.row, np.asarray(extra_sources)) + targets = np.append(sparse_mat.col, np.asarray(extra_targets)) + weights = np.append(sparse_mat.data, np.asarray(extra_weights)) + else: + sources = sparse_mat.row + targets = sparse_mat.col + weights = sparse_mat.data + G_df = cudf.DataFrame({'source': sources, + 'destination': 
targets, + 'weights': weights}) + G_cu = cugraph.Graph() + G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) + + # Generate minimum spanning tree + G = cugraph.minimum_spanning_tree(G_cu, weight='weights') + else: + # Load previous MST if specified + if previous_mst is not None: + G = construct_network_from_sparse_matrix(rlist, + rlist, + sparse_mat, + summarise=False, + previous_network = args.previous_mst) + else: + G = construct_network_from_sparse_matrix(rlist, + rlist, + sparse_mat, + summarise=False) + sys.stderr.write("Calculating MST (CPU)\n") + + G = generate_minimum_spanning_tree(G, args.gpu_graph) + + return(G) + + def main(): # Check input args ok @@ -109,46 +153,10 @@ def main(): # Create network with sparse dists sys.stderr.write("Loading distances into graph\n") sparse_mat = sparse.load_npz(args.rank_fit) - if args.gpu_graph: - # Load previous MST if specified - if args.previous_mst is not None: - print("Previous: " + str(args.previous_mst)) - extra_sources, extra_targets, extra_weights = network_to_edges(args.previous_mst, - rlist, - previous_pkl = args.distance_pkl, - weights = True, - use_gpu = use_gpu) - sources = np.append(sparse_mat.row, np.asarray(extra_sources)) - targets = np.append(sparse_mat.col, np.asarray(extra_targets)) - weights = np.append(sparse_mat.data, np.asarray(extra_weights)) - else: - sources = sparse_mat.row - targets = sparse_mat.col - weights = sparse_mat.data - G_df = cudf.DataFrame({'source': sources, - 'destination': targets, - 'weights': weights}) - G_cu = cugraph.Graph() - G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) - - # Generate minimum spanning tree - G = cugraph.minimum_spanning_tree(G_cu, weight='weights') - else: - # Load previous MST if specified - if args.previous_mst is not None: - G = construct_network_from_sparse_matrix(rlist, - rlist, - sparse_mat, - summarise=False, - previous_network = args.previous_mst) - else: - G = construct_network_from_sparse_matrix(rlist, - 
rlist, - sparse_mat, - summarise=False) - sys.stderr.write("Calculating MST (CPU)\n") - - G = generate_minimum_spanning_tree(G, args.gpu_graph) + G = generate_mst_from_sparse_input(sparse_mat, + distance_pkl, + previous_mst = args.previous_mst, + gpu_graph = args.gpu_graph) # Save output sys.stderr.write("Generating output\n") diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 38d866d4..57c03253 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -49,7 +49,10 @@ def get_options(): help='Location of query database, if distances ' 'are from ref-query') iGroup.add_argument('--distances', - help='Prefix of input pickle of pre-calculated distances') + help='Prefix of input pickle of pre-calculated distances', + default=None) + iGroup.add_argument('--rank-fit', + help='Location of rank fit, a sparse matrix (*_rank*_fit.npz)') iGroup.add_argument('--include-files', help='File with list of sequences to include in visualisation. ' 'Default is to use all sequences in database.', @@ -72,6 +75,10 @@ def get_options(): 'from poppunk_assign [default = use that in the directory ' 'of the query database]', type = str) + iGroup.add_argument('--previous-mst', + help='File containing previous minimum spanning tree', + default=None, + type = str) iGroup.add_argument('--network-file', help='Specify a file to use for any graph visualisations', type = str) @@ -145,6 +152,7 @@ def get_options(): def generate_visualisations(query_db, ref_db, distances, + rank_fit, threads, output, gpu_dist, @@ -160,6 +168,7 @@ def generate_visualisations(query_db, model_dir, previous_clustering, previous_query_clustering, + previous_mst, network_file, gpu_graph, info_csv, @@ -193,6 +202,8 @@ def generate_visualisations(query_db, from .sketchlib import readDBParams from .sketchlib import getKmersFromReferenceDatabase from .sketchlib import addRandom + + from .sparse_mst import generate_mst_from_sparse_input from .trees import load_tree, generate_nj_tree, mst_to_phylogeny @@ -204,6 
+215,12 @@ def generate_visualisations(query_db, from .utils import joinClusterDicts from .utils import listDistInts + #******************************# + #* *# + #* Initial checks and set up *# + #* *# + #******************************# + # Check on parallelisation of graph-tools setGtThreads(threads) @@ -220,6 +237,12 @@ def generate_visualisations(query_db, sys.stderr.write("Cannot create output directory\n") sys.exit(1) + #******************************# + #* *# + #* Process dense or sparse distances *# + #* *# + #******************************# + if distances is None: if query_db is None: distances = ref_db + "/" + os.path.basename(ref_db) + ".dists" @@ -228,53 +251,77 @@ def generate_visualisations(query_db, else: distances = distances - rlist, qlist, self, complete_distMat = readPickle(distances) - if not self: - qr_distMat = complete_distMat - else: - rr_distMat = complete_distMat - - # Fill in qq-distances if required - if self == False: - sys.stderr.write("Note: Distances in " + distances + " are from assign mode\n" - "Note: Distance will be extended to full all-vs-all distances\n" - "Note: Re-run poppunk_assign with --update-db to avoid this\n") - ref_db_loc = ref_db + "/" + os.path.basename(ref_db) - rlist_original, qlist_original, self_ref, rr_distMat = readPickle(ref_db_loc + ".dists") - if not self_ref: - sys.stderr.write("Distances in " + ref_db + " not self all-vs-all either\n") - sys.exit(1) - kmers, sketch_sizes, codon_phased = readDBParams(query_db) - addRandom(query_db, qlist, kmers, - strand_preserved = strand_preserved, threads = threads) - query_db_loc = query_db + "/" + os.path.basename(query_db) - qq_distMat = pp_sketchlib.queryDatabase(query_db_loc, query_db_loc, - qlist, qlist, kmers, - True, False, - threads, - gpu_dist, - deviceid) - - # If the assignment was run with references, qrDistMat will be incomplete - if rlist != rlist_original: - rlist = rlist_original - qr_distMat = pp_sketchlib.queryDatabase(ref_db_loc, query_db_loc, - 
rlist, qlist, kmers, + # Determine whether to use sparse distances + use_sparse = False + use_dense = False + if args.tree == 'mst' and args.rank_fit is not None: + # Set flag + use_sparse = True + # Process only sparse distances + with open(distances + '.pkl', 'rb') as pickle_file: + rlist, qlist, self = pickle.load(pickle_file) + if not self: + sys.stderr.write("Visualisation with a sparse matrix requires an all-v-all" + " dataset\n") + sys.exit(1) + sparse_mat = sparse.load_npz(args.rank_fit) + combined_seq = rlist + if args.tree == 'nj' or args.tree == 'both': + use_dense = True + # Process dense distance matrix + rlist, qlist, self, complete_distMat = readPickle(distances) + if not self: + qr_distMat = complete_distMat + else: + rr_distMat = complete_distMat + + # Fill in qq-distances if required + if self == False: + sys.stderr.write("Note: Distances in " + distances + " are from assign mode\n" + "Note: Distance will be extended to full all-vs-all distances\n" + "Note: Re-run poppunk_assign with --update-db to avoid this\n") + ref_db_loc = ref_db + "/" + os.path.basename(ref_db) + rlist_original, qlist_original, self_ref, rr_distMat = readPickle(ref_db_loc + ".dists") + if not self_ref: + sys.stderr.write("Distances in " + ref_db + " not self all-vs-all either\n") + sys.exit(1) + kmers, sketch_sizes, codon_phased = readDBParams(query_db) + addRandom(query_db, qlist, kmers, + strand_preserved = strand_preserved, threads = threads) + query_db_loc = query_db + "/" + os.path.basename(query_db) + qq_distMat = pp_sketchlib.queryDatabase(query_db_loc, query_db_loc, + qlist, qlist, kmers, True, False, threads, gpu_dist, deviceid) - else: - qlist = None - qr_distMat = None - qq_distMat = None + # If the assignment was run with references, qrDistMat will be incomplete + if rlist != rlist_original: + rlist = rlist_original + qr_distMat = pp_sketchlib.queryDatabase(ref_db_loc, query_db_loc, + rlist, qlist, kmers, + True, False, + threads, + gpu_dist, + deviceid) - # Turn 
long form matrices into square form - combined_seq, core_distMat, acc_distMat = \ - update_distance_matrices(rlist, rr_distMat, - qlist, qr_distMat, qq_distMat, - threads = threads) + else: + qlist = None + qr_distMat = None + qq_distMat = None + + # Turn long form matrices into square form + combined_seq, core_distMat, acc_distMat = \ + update_distance_matrices(rlist, rr_distMat, + qlist, qr_distMat, qq_distMat, + threads = threads) + + #******************************# + #* *# + #* Extract subset of sequences *# + #* *# + #******************************# # extract subset of distances if requested if include_files is not None: @@ -288,13 +335,22 @@ def generate_visualisations(query_db, # Only keep found rows row_slice = [True if name in viz_subset else False for name in combined_seq] combined_seq = [name for name in combined_seq if name in viz_subset] - if qlist != None: - qlist = list(viz_subset.intersection(qlist)) - core_distMat = core_distMat[np.ix_(row_slice, row_slice)] - acc_distMat = acc_distMat[np.ix_(row_slice, row_slice)] + if use_sparse: + sparse_mat = sparse_mat[np.ix_(row_slice, row_slice)] + if use_dense: + if qlist != None: + qlist = list(viz_subset.intersection(qlist)) + core_distMat = core_distMat[np.ix_(row_slice, row_slice)] + acc_distMat = acc_distMat[np.ix_(row_slice, row_slice)] else: viz_subset = None + #******************************# + #* *# + #* Process clustering information *# + #* *# + #******************************# + # Either use strain definitions, lineage assignments or external clustering isolateClustering = {} # Use external clustering if specified @@ -361,7 +417,13 @@ def generate_visualisations(query_db, return_dict = True) isolateClustering = joinClusterDicts(isolateClustering, queryIsolateClustering) - # Generate MST + #******************************# + #* *# + #* Generate trees *# + #* *# + #******************************# + + # Generate trees mst_tree = None mst_graph = None nj_tree = None @@ -383,20 +445,29 @@ def 
generate_visualisations(query_db, clustering_name = display_cluster else: clustering_name = list(isolateClustering.keys())[0] - # Get distance matrix - complete_distMat = \ - np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), - pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) - # Dense network may be slow - sys.stderr.write("Generating MST from dense distances (may be slow)\n") - G = construct_network_from_assignments(combined_seq, - combined_seq, - [0]*complete_distMat.shape[0], - within_label = 0, - distMat = complete_distMat, - weights_type = mst_distances, - use_gpu = gpu_graph, - summarise = False) + if use_sparse: + G = generate_mst_from_sparse_input(sparse_mat, + distances + '.pkl', + previous_mst = previous_mst, + gpu_graph = gpu_graph) + elif use_dense: + # Get distance matrix + complete_distMat = \ + np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), + pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) + # Dense network may be slow + sys.stderr.write("Generating MST from dense distances (may be slow)\n") + G = construct_network_from_assignments(combined_seq, + combined_seq, + [0]*complete_distMat.shape[0], + within_label = 0, + distMat = complete_distMat, + weights_type = mst_distances, + use_gpu = gpu_graph, + summarise = False) + else: + sys.stderr.write("Need either sparse or dense distances matrix to construct MST\n") + exit(1) if gpu_graph: G = cugraph.minimum_spanning_tree(G, weight='weights') mst_graph = generate_minimum_spanning_tree(G, gpu_graph) @@ -439,6 +510,12 @@ def generate_visualisations(query_db, else: sys.stderr.write("Fewer than three sequences, not drawing trees\n") + #******************************# + #* *# + #* Write output *# + #* *# + #******************************# + # Now have all the objects needed to generate selected visualisations if microreact: sys.stderr.write("Writing microreact output\n") @@ -504,6 +581,7 @@ def main(): 
generate_visualisations(args.query_db, args.ref_db, args.distances, + args.rank_fit, args.threads, args.output, args.gpu_dist, @@ -519,6 +597,7 @@ def main(): args.model_dir, args.previous_clustering, args.previous_query_clustering, + args.previous_mst, args.network_file, args.gpu_graph, args.info_csv, From b76159906af69a2ed6e6ac1d646d0cde06c6b4f6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 13:26:11 +0100 Subject: [PATCH 095/175] Update sparse MST arguments --- PopPUNK/models.py | 2 +- PopPUNK/sparse_mst.py | 3 ++- PopPUNK/visualise.py | 11 +++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 5a0d6ba1..c03fa9ba 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1137,7 +1137,7 @@ def edge_weights(self, rank): else: return (self.nn_dists[rank].data) - def extend(self, qqDists, qrDists) + def extend(self, qqDists, qrDists): '''Update the sparse distance matrix of nearest neighbours after querying Args: diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 755c185f..d6405f1f 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -73,7 +73,7 @@ def get_options(): return parser.parse_args() -def generate_mst_from_sparse_input(sparse_mat, distance_pkl, previous_mst = None, gpu_graph = False): +def generate_mst_from_sparse_input(sparse_mat, rlist, distance_pkl, previous_mst = None, gpu_graph = False): if gpu_graph: # Load previous MST if specified if previous_mst is not None: @@ -154,6 +154,7 @@ def main(): sys.stderr.write("Loading distances into graph\n") sparse_mat = sparse.load_npz(args.rank_fit) G = generate_mst_from_sparse_input(sparse_mat, + rlist, distance_pkl, previous_mst = args.previous_mst, gpu_graph = args.gpu_graph) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 57c03253..706b76b8 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -5,9 +5,10 @@ # universal import os import sys +import pickle # additional import 
numpy as np -import scipy.sparse +from scipy import sparse try: import cudf @@ -254,7 +255,7 @@ def generate_visualisations(query_db, # Determine whether to use sparse distances use_sparse = False use_dense = False - if args.tree == 'mst' and args.rank_fit is not None: + if tree == 'mst' and rank_fit is not None: # Set flag use_sparse = True # Process only sparse distances @@ -264,9 +265,9 @@ def generate_visualisations(query_db, sys.stderr.write("Visualisation with a sparse matrix requires an all-v-all" " dataset\n") sys.exit(1) - sparse_mat = sparse.load_npz(args.rank_fit) + sparse_mat = sparse.load_npz(rank_fit) combined_seq = rlist - if args.tree == 'nj' or args.tree == 'both': + if tree == 'nj' or tree == 'both': use_dense = True # Process dense distance matrix rlist, qlist, self, complete_distMat = readPickle(distances) @@ -447,6 +448,7 @@ def generate_visualisations(query_db, clustering_name = list(isolateClustering.keys())[0] if use_sparse: G = generate_mst_from_sparse_input(sparse_mat, + rlist, distances + '.pkl', previous_mst = previous_mst, gpu_graph = gpu_graph) @@ -598,6 +600,7 @@ def main(): args.previous_clustering, args.previous_query_clustering, args.previous_mst, + args.previous_distances, args.network_file, args.gpu_graph, args.info_csv, From 1b1b525299eac0b3c45e9c6e472aa77d5d8ddb92 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 13:33:34 +0100 Subject: [PATCH 096/175] Fix visualisation rlist reading --- PopPUNK/visualise.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 706b76b8..555812f9 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -150,6 +150,15 @@ def get_options(): return args +def read_rlist_from_distance_pickle(fn): + with open(fn, 'rb') as pickle_file: + rlist, qlist, self = pickle.load(pickle_file) + if not self: + sys.stderr.write("Visualisation with a sparse matrix requires an all-v-all" + " dataset\n") + 
sys.exit(1) + return rlist + def generate_visualisations(query_db, ref_db, distances, @@ -170,6 +179,7 @@ def generate_visualisations(query_db, previous_clustering, previous_query_clustering, previous_mst, + previous_distances network_file, gpu_graph, info_csv, @@ -258,13 +268,9 @@ def generate_visualisations(query_db, if tree == 'mst' and rank_fit is not None: # Set flag use_sparse = True - # Process only sparse distances - with open(distances + '.pkl', 'rb') as pickle_file: - rlist, qlist, self = pickle.load(pickle_file) - if not self: - sys.stderr.write("Visualisation with a sparse matrix requires an all-v-all" - " dataset\n") - sys.exit(1) + # Process recent and old distance matrix + rlist = read_rlist_from_distance_file(distances + '.pkl') + old_rlist = read_rlist_from_distance_file(previous_distances + '.pkl') sparse_mat = sparse.load_npz(rank_fit) combined_seq = rlist if tree == 'nj' or tree == 'both': @@ -448,7 +454,7 @@ def generate_visualisations(query_db, clustering_name = list(isolateClustering.keys())[0] if use_sparse: G = generate_mst_from_sparse_input(sparse_mat, - rlist, + old_rlist, distances + '.pkl', previous_mst = previous_mst, gpu_graph = gpu_graph) From dd812081f14221c83778e4c4ca29efd5877b4b93 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 13:48:24 +0100 Subject: [PATCH 097/175] Fix use of previous data --- PopPUNK/visualise.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 555812f9..9bbaad02 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -179,7 +179,7 @@ def generate_visualisations(query_db, previous_clustering, previous_query_clustering, previous_mst, - previous_distances + previous_distances, network_file, gpu_graph, info_csv, @@ -268,11 +268,16 @@ def generate_visualisations(query_db, if tree == 'mst' and rank_fit is not None: # Set flag use_sparse = True - # Process recent and old distance matrix + # Read list of sequence 
names and sparse distance matrix rlist = read_rlist_from_distance_file(distances + '.pkl') - old_rlist = read_rlist_from_distance_file(previous_distances + '.pkl') sparse_mat = sparse.load_npz(rank_fit) combined_seq = rlist + # Check previous distances have been supplied if building on a previous MST + if args.previous_distances is not None: + old_rlist = read_rlist_from_distance_file(previous_distances + '.pkl') + elif args.previous_mst is not None: + sys.stderr.write('The prefix of the distance files used to create the previous MST' + ' is needed to use the network') if tree == 'nj' or tree == 'both': use_dense = True # Process dense distance matrix From b504f1f207bf4d1f3aa59c788e10d6cd9f06ce77 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 13:51:07 +0100 Subject: [PATCH 098/175] Update command line arguments --- PopPUNK/visualise.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 9bbaad02..c79513f6 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -80,6 +80,11 @@ def get_options(): help='File containing previous minimum spanning tree', default=None, type = str) + iGroup.add_argument('--previous-distances', + help='Prefix of distance files used to generate the previous ' + 'minimum spanning tree', + default=None, + type = str) iGroup.add_argument('--network-file', help='Specify a file to use for any graph visualisations', type = str) From 809d4ff3955c4feef7f7df465f4ae40300b4a9c1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 13:54:16 +0100 Subject: [PATCH 099/175] Fix function name in call --- PopPUNK/visualise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index c79513f6..f8a06b25 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -274,12 +274,12 @@ def generate_visualisations(query_db, # Set flag use_sparse = True # Read list of sequence names and sparse distance 
matrix - rlist = read_rlist_from_distance_file(distances + '.pkl') + rlist = read_rlist_from_distance_pickle(distances + '.pkl') sparse_mat = sparse.load_npz(rank_fit) combined_seq = rlist # Check previous distances have been supplied if building on a previous MST if args.previous_distances is not None: - old_rlist = read_rlist_from_distance_file(previous_distances + '.pkl') + old_rlist = read_rlist_from_distance_pickle(previous_distances + '.pkl') elif args.previous_mst is not None: sys.stderr.write('The prefix of the distance files used to create the previous MST' ' is needed to use the network') From 09daef0c4bc1ecb959e26328656f7356b3f627fc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 13:55:10 +0100 Subject: [PATCH 100/175] Fix variable names --- PopPUNK/visualise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index f8a06b25..8d1e45de 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -278,9 +278,9 @@ def generate_visualisations(query_db, sparse_mat = sparse.load_npz(rank_fit) combined_seq = rlist # Check previous distances have been supplied if building on a previous MST - if args.previous_distances is not None: + if previous_distances is not None: old_rlist = read_rlist_from_distance_pickle(previous_distances + '.pkl') - elif args.previous_mst is not None: + elif previous_mst is not None: sys.stderr.write('The prefix of the distance files used to create the previous MST' ' is needed to use the network') if tree == 'nj' or tree == 'both': From a75b64111f9a520479a82ee85b69d2e7681980c1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 13:59:52 +0100 Subject: [PATCH 101/175] Remove check of undefined variable --- PopPUNK/visualise.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 8d1e45de..793bcfa3 100644 --- a/PopPUNK/visualise.py +++ 
b/PopPUNK/visualise.py @@ -422,17 +422,18 @@ def generate_visualisations(query_db, isolateClustering[type] = indiv_isolateClustering['Cluster'] # Join clusters with query clusters if required - if not self: - if previous_query_clustering is not None: - prev_query_clustering = previous_query_clustering - else: - prev_query_clustering = os.path.basename(query_db) + '/' + os.path.basename(query_db) + suffix + if use_dense: + if not self: + if previous_query_clustering is not None: + prev_query_clustering = previous_query_clustering + else: + prev_query_clustering = os.path.basename(query_db) + '/' + os.path.basename(query_db) + suffix - queryIsolateClustering = readIsolateTypeFromCsv( - prev_query_clustering, - mode = mode, - return_dict = True) - isolateClustering = joinClusterDicts(isolateClustering, queryIsolateClustering) + queryIsolateClustering = readIsolateTypeFromCsv( + prev_query_clustering, + mode = mode, + return_dict = True) + isolateClustering = joinClusterDicts(isolateClustering, queryIsolateClustering) #******************************# #* *# From 69ee541cafd3aec60c0a09cb3f197d20dec97e2b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 14:33:23 +0100 Subject: [PATCH 102/175] Edit undefined variables --- PopPUNK/sparse_mst.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index d6405f1f..cfd71b6c 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -81,7 +81,7 @@ def generate_mst_from_sparse_input(sparse_mat, rlist, distance_pkl, previous_mst rlist, previous_pkl = distance_pkl, weights = True, - use_gpu = use_gpu) + use_gpu = gpu_graph) sources = np.append(sparse_mat.row, np.asarray(extra_sources)) targets = np.append(sparse_mat.col, np.asarray(extra_targets)) weights = np.append(sparse_mat.data, np.asarray(extra_weights)) @@ -104,7 +104,7 @@ def generate_mst_from_sparse_input(sparse_mat, rlist, distance_pkl, previous_mst rlist, sparse_mat, summarise=False, 
- previous_network = args.previous_mst) + previous_network = previous_mst) else: G = construct_network_from_sparse_matrix(rlist, rlist, From ab6e7fb55379393a010b73fde1dd30c9a56138c9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 16:32:10 +0100 Subject: [PATCH 103/175] Change loading of previous MST --- PopPUNK/sparse_mst.py | 19 +++++++++++-------- PopPUNK/utils.py | 20 ++++++++++++++++++++ PopPUNK/visualise.py | 14 +++----------- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index cfd71b6c..aa1f5024 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -53,6 +53,7 @@ def get_options(): iGroup.add_argument('--previous-mst', help='Graph tool file from which previous MST can be loaded', default=None) iGroup.add_argument('--distance-pkl', help='Input pickle from distances, which contains sample names') + iGroup.add_argument('--previous-distance-pkl', help='Input pickle from distances, which contains sample names') iGroup.add_argument('--display-cluster', default=None, help='Column of clustering CSV to use for plotting') # output options @@ -73,13 +74,13 @@ def get_options(): return parser.parse_args() -def generate_mst_from_sparse_input(sparse_mat, rlist, distance_pkl, previous_mst = None, gpu_graph = False): +def generate_mst_from_sparse_input(sparse_mat, rlist, old_rlist = None, previous_mst = None, gpu_graph = False): if gpu_graph: # Load previous MST if specified if previous_mst is not None: extra_sources, extra_targets, extra_weights = network_to_edges(previous_mst, rlist, - previous_pkl = distance_pkl, + old_ids = old_rlist, weights = True, use_gpu = gpu_graph) sources = np.append(sparse_mat.row, np.asarray(extra_sources)) @@ -132,15 +133,17 @@ def main(): " must be provided\n") sys.exit(1) elif os.path.exists(args.distance_pkl): - with open(args.distance_pkl, 'rb') as pickle_file: - rlist, qlist, self = pickle.load(pickle_file) - if not self: - 
sys.stderr.write("This script must be run on a full all-v-all model\n") - sys.exit(1) + rlist = read_rlist_from_distance_pickle(args.distance_pkl, + allow_non_self = False) else: sys.stderr.write("Cannot find file " + args.distance_pkl + "\n") sys.exit(1) + # Read in old sequence names + if args.previous_distance_pkl is not None and os.path.exists(args.previous_distance_pkl): + old_rlist = read_rlist_from_distance_pickle(args.previous_distance_pkl, + allow_non_self = False) + # Check output path ok if not os.path.isdir(args.output): try: @@ -155,7 +158,7 @@ def main(): sparse_mat = sparse.load_npz(args.rank_fit) G = generate_mst_from_sparse_input(sparse_mat, rlist, - distance_pkl, + old_rlist = old_rlist, previous_mst = args.previous_mst, gpu_graph = args.gpu_graph) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 8d160bac..c4da1a0d 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -645,3 +645,23 @@ def check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = False): assert(rmm.is_initialized()) return use_gpu + +def read_rlist_from_distance_pickle(fn, allow_non_self = True): + """Return the list of reference sequences from a distance pickle. 
+ + Args: + fn (str) + Name of distance pickle + allow_non_self (bool) + Whether non-self distance datasets are permissible + Returns: + rlist (list) + List of reference sequence names + """ + with open(fn, 'rb') as pickle_file: + rlist, qlist, self = pickle.load(pickle_file) + if not allow_non_self and not self: + sys.stderr.write("Thi analysis requires an all-v-all" + " distance dataset\n") + sys.exit(1) + return rlist diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 793bcfa3..105a6de9 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -155,15 +155,6 @@ def get_options(): return args -def read_rlist_from_distance_pickle(fn): - with open(fn, 'rb') as pickle_file: - rlist, qlist, self = pickle.load(pickle_file) - if not self: - sys.stderr.write("Visualisation with a sparse matrix requires an all-v-all" - " dataset\n") - sys.exit(1) - return rlist - def generate_visualisations(query_db, ref_db, distances, @@ -230,6 +221,7 @@ def generate_visualisations(query_db, from .utils import readIsolateTypeFromCsv from .utils import joinClusterDicts from .utils import listDistInts + from .utils import read_rlist_from_distance_pickle #******************************# #* *# @@ -465,8 +457,8 @@ def generate_visualisations(query_db, clustering_name = list(isolateClustering.keys())[0] if use_sparse: G = generate_mst_from_sparse_input(sparse_mat, - old_rlist, - distances + '.pkl', + rlist, + old_ids = old_rlist, previous_mst = previous_mst, gpu_graph = gpu_graph) elif use_dense: From dfb663957efe1bcb3a4e70d54ae967d26422ce35 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 16:34:34 +0100 Subject: [PATCH 104/175] Fix MST function call --- PopPUNK/visualise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 105a6de9..818cf560 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -458,7 +458,7 @@ def generate_visualisations(query_db, if use_sparse: G = 
generate_mst_from_sparse_input(sparse_mat, rlist, - old_ids = old_rlist, + old_rlist = old_rlist, previous_mst = previous_mst, gpu_graph = gpu_graph) elif use_dense: From 7702ba5f76ca266fe413285ecea21e7f7e238cec Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 17:15:27 +0100 Subject: [PATCH 105/175] Change edges DF --- PopPUNK/network.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7fab7128..02b5e6bf 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -527,10 +527,8 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, sys.stderr.write('Loaded network does not have edge weights; try a different ' 'network or turn off graph weights\n') exit(1) - G_df.columns = ['source','destination','weight'] edge_weights = G_df['weight'].to_arrow().to_pylist() - else: - G_df.columns = ['source','destination'] + G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) old_source_ids = G_df['source'].to_arrow().to_pylist() old_target_ids = G_df['destination'].to_arrow().to_pylist() else: From 0ec7fb61f6a888a0860b7bcf23a2ffe72c9495f3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 17:22:29 +0100 Subject: [PATCH 106/175] Address possible bug in graph extraction --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 02b5e6bf..f4b8e97c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -527,7 +527,8 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, sys.stderr.write('Loaded network does not have edge weights; try a different ' 'network or turn off graph weights\n') exit(1) - edge_weights = G_df['weight'].to_arrow().to_pylist() + G_df.columns = ['weights','src','dst'] # This appears to be a bug in cugraph v0.19 + edge_weights = G_df['weights'].to_arrow().to_pylist() G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) 
old_source_ids = G_df['source'].to_arrow().to_pylist() old_target_ids = G_df['destination'].to_arrow().to_pylist() From adbcda43f62ddea98d1f97b7c93148e97955c04e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 17:25:46 +0100 Subject: [PATCH 107/175] Convert values to integers --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f4b8e97c..03cd4b21 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -530,8 +530,8 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, G_df.columns = ['weights','src','dst'] # This appears to be a bug in cugraph v0.19 edge_weights = G_df['weights'].to_arrow().to_pylist() G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) - old_source_ids = G_df['source'].to_arrow().to_pylist() - old_target_ids = G_df['destination'].to_arrow().to_pylist() + old_source_ids = G_df['source'].astype('int32').to_arrow().to_pylist() + old_target_ids = G_df['destination'].astype('int32').to_arrow().to_pylist() else: # get the source and target nodes old_source_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source") From ad531ab671342ca631f63a83ca90c69c45da3cd5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 17:27:17 +0100 Subject: [PATCH 108/175] Fix variable name --- PopPUNK/sparse_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index aa1f5024..3929f649 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -113,7 +113,7 @@ def generate_mst_from_sparse_input(sparse_mat, rlist, old_rlist = None, previous summarise=False) sys.stderr.write("Calculating MST (CPU)\n") - G = generate_minimum_spanning_tree(G, args.gpu_graph) + G = generate_minimum_spanning_tree(G, gpu_graph) return(G) From 83305e2c2cacf73ea138a73cb386be9e9bebb761 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 20:32:08 
+0100 Subject: [PATCH 109/175] Process distmat for t-sne --- PopPUNK/visualise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 818cf560..1c22fb3a 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -275,7 +275,7 @@ def generate_visualisations(query_db, elif previous_mst is not None: sys.stderr.write('The prefix of the distance files used to create the previous MST' ' is needed to use the network') - if tree == 'nj' or tree == 'both': + if tree == 'nj' or tree == 'both' or microreact: use_dense = True # Process dense distance matrix rlist, qlist, self, complete_distMat = readPickle(distances) From fe8761a840d0040712fd02be135a2baf0f297f97 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 21:12:16 +0100 Subject: [PATCH 110/175] Fix plotting for reference databases --- PopPUNK/sketchlib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 8de3591d..6e24a2d6 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -545,7 +545,7 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num raw = np.zeros(len(klist)) corrected = np.zeros(len(klist)) raw = pp_sketchlib.queryDatabase(ref_db, - query_db, + ref_db, example[0], example[1], klist, @@ -554,7 +554,7 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num num_threads = threads, use_gpu = False) corrected = pp_sketchlib.queryDatabase(ref_db, - query_db, + ref_db, example[0], example[1], klist, From 3241a0e7c66e3777278803521b8d7d48e66db9d7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 21:16:18 +0100 Subject: [PATCH 111/175] Fix plot fit isolate names --- PopPUNK/sketchlib.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 6e24a2d6..7e1adab8 100644 --- a/PopPUNK/sketchlib.py +++ 
b/PopPUNK/sketchlib.py @@ -546,8 +546,8 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num corrected = np.zeros(len(klist)) raw = pp_sketchlib.queryDatabase(ref_db, ref_db, - example[0], - example[1], + [example[0]], + [example[1]], klist, random_correct = False, jaccard = True, @@ -555,8 +555,8 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num use_gpu = False) corrected = pp_sketchlib.queryDatabase(ref_db, ref_db, - example[0], - example[1], + [example[0]], + [example[1]], klist, random_correct = True, jaccard = True, From ad8918d2870250725123cf888f2032be98ab7baf Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 12 Jul 2021 21:51:20 +0100 Subject: [PATCH 112/175] Convert lists to arrays --- PopPUNK/models.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index c03fa9ba..40e32260 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1036,12 +1036,15 @@ def fit(self, X, accessory): 0, rank ) - data[data < epsilon] = epsilon if self.use_gpu: - self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix((cp.array(data),(cp.array(row),cp.array(col))), + data = cp.array(data) + data[data < epsilon] = epsilon + self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix((data,(cp.array(row),cp.array(col))), shape=(sample_size, sample_size), dtype = X.dtype) else: + data = np.array(data) + data[data < epsilon] = epsilon self.nn_dists[rank] = scipy.sparse.coo_matrix((data, (row, col)), shape=(sample_size, sample_size), dtype = X.dtype) From f00f77d5cd8f6dca982f8bf20675ed0187662066 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 10:12:17 +0100 Subject: [PATCH 113/175] Generate networks from sparse matrices --- PopPUNK/network.py | 22 ++++++++++++++++++++++ PopPUNK/visualise.py | 12 ++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 03cd4b21..ea6a6a62 100644 --- 
a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1880,3 +1880,25 @@ def cugraph_to_graph_tool(G, rlist): vals = rlist) G.vp.id = vid return G + +def sparse_mat_to_network(sparse_mat, rlist, use_gpu = False): + + if use_gpu: + G_df = cudf.DataFrame(columns = ['source','destination','weights']) + G_df['source'] = sparse_mat.row + G_df['destination'] = sparse_mat.col + G_df['weights'] = sparse_mat.data + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = len(vertex_labels)-1 + G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) + else: + connections = [] + for (src,dst) in zip(sparse_mat.row,sparse_mat.col): + connections.append(src,dst) + G = construct_network_from_edge_list(rlist, + rlist, + connections, + weights = sparse_mat.data, + summarise=False) + + return G diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 1c22fb3a..d718c046 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -196,6 +196,7 @@ def generate_visualisations(query_db, from .network import load_network_file from .network import cugraph_to_graph_tool from .network import save_network + from .network import sparse_mat_to_network from .plot import drawMST from .plot import outputsForMicroreact @@ -262,7 +263,7 @@ def generate_visualisations(query_db, # Determine whether to use sparse distances use_sparse = False use_dense = False - if tree == 'mst' and rank_fit is not None: + if (tree == 'mst' or cytoscape) and rank_fit is not None: # Set flag use_sparse = True # Read list of sequence names and sparse distance matrix @@ -566,10 +567,13 @@ def generate_visualisations(query_db, if cytoscape: sys.stderr.write("Writing cytoscape output\n") - if network_file is None: - sys.stderr.write('Cytoscape output requires a network file is provided\n') + if network_file is not None: + genomeNetwork = load_network_file(network_file, use_gpu = gpu_graph) + elif: + genomeNetwork = sparse_mat_to_network(sparse_mat, 
combined_seq, use_gpu = gpu_graph) + else: + sys.stderr.write('Cytoscape output requires a network file or lineage rank fit is provided\n') sys.exit(1) - genomeNetwork = load_network_file(network_file, use_gpu = gpu_graph) if gpu_graph: genomeNetwork = cugraph_to_graph_tool(genomeNetwork, isolateNameToLabel(combined_seq)) outputsForCytoscape(genomeNetwork, From c017d4c238bfe7f465458d5bbb93c2ae412b25eb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 10:32:36 +0100 Subject: [PATCH 114/175] Ensure variable defined --- PopPUNK/visualise.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index d718c046..d3838ee7 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -263,7 +263,7 @@ def generate_visualisations(query_db, # Determine whether to use sparse distances use_sparse = False use_dense = False - if (tree == 'mst' or cytoscape) and rank_fit is not None: + if (tree == 'mst' or tree == 'both' or cytoscape) and rank_fit is not None: # Set flag use_sparse = True # Read list of sequence names and sparse distance matrix @@ -271,6 +271,7 @@ def generate_visualisations(query_db, sparse_mat = sparse.load_npz(rank_fit) combined_seq = rlist # Check previous distances have been supplied if building on a previous MST + old_rlist = None if previous_distances is not None: old_rlist = read_rlist_from_distance_pickle(previous_distances + '.pkl') elif previous_mst is not None: From 9f389d44b465b45f1d2191d785ccfccc3ade8701 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 10:33:46 +0100 Subject: [PATCH 115/175] Test if rank_fit supplied --- PopPUNK/visualise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index d3838ee7..7e2b1959 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -570,7 +570,7 @@ def generate_visualisations(query_db, sys.stderr.write("Writing cytoscape output\n") if network_file is 
not None: genomeNetwork = load_network_file(network_file, use_gpu = gpu_graph) - elif: + elif rank_fit is not None: genomeNetwork = sparse_mat_to_network(sparse_mat, combined_seq, use_gpu = gpu_graph) else: sys.stderr.write('Cytoscape output requires a network file or lineage rank fit is provided\n') From be84f95d47b932e8c3120405412a6dfa19e2ba53 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 11:03:54 +0100 Subject: [PATCH 116/175] Fix network construction from sparse matrices --- PopPUNK/network.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ea6a6a62..ffa17572 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -892,7 +892,6 @@ def construct_network_from_df(rlist, qlist, G_df, # direct conversion # ensure the highest-integer node is included in the edge list # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(vertex_labels)-1 use_weights = False if weights: @@ -1025,7 +1024,6 @@ def construct_dense_weighted_network(rlist, distMat = None, weights_type = None, G_df['source'] = [edge_list[0][0]] G_df['destination'] = [edge_list[0][1]] G_df['weights'] = weights - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(vertex_labels)-1 G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) else: @@ -1734,7 +1732,6 @@ def generate_minimum_spanning_tree(G, from_cugraph = False): # MST - check cuDF implementation is the same max_indices = mst_df.groupby(['labels'])['degree'].idxmax() seed_vertices = mst_df.iloc[max_indices]['vertex'] - num_components = seed_vertices.size() else: component_assignments, component_frequencies = gt.label_components(mst_network) num_components = len(component_frequencies) @@ -1758,9 +1755,9 @@ def 
generate_minimum_spanning_tree(G, from_cugraph = False): # so no extra edges can be retrieved from the graph G_df = G.view_edge_list() max_weight = G_df['weights'].max() - first_seed = seed_vertices[0] + first_seed = seed_vertices.iloc[0] G_seed_link_df = cudf.DataFrame() - G_seed_link_df['dst'] = seed_vertices.iloc[1:seed_vertices.size()] + G_seed_link_df['dst'] = seed_vertices.iloc[1:seed_vertices.size] G_seed_link_df['src'] = seed_vertices.iloc[0] G_seed_link_df['weights'] = seed_vertices.iloc[0] G_df = G_df.append(G_seed_link_df) @@ -1785,7 +1782,9 @@ def generate_minimum_spanning_tree(G, from_cugraph = False): # Construct graph if from_cugraph: - mst_network = G_df.from_cudf_edgelist(edge_attr='weights', renumber=False) + mst_network = cugraph.Graph() + G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) + mst_network.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: seed_G = gt.Graph(directed = False) seed_G.add_vertex(len(seed_vertex)) @@ -1888,8 +1887,7 @@ def sparse_mat_to_network(sparse_mat, rlist, use_gpu = False): G_df['source'] = sparse_mat.row G_df['destination'] = sparse_mat.col G_df['weights'] = sparse_mat.data - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - max_in_vertex_labels = len(vertex_labels)-1 + max_in_vertex_labels = len(rlist)-1 G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) else: connections = [] From f8dbdba9ed9661902ff558fcaded315c8ac771f2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 11:16:46 +0100 Subject: [PATCH 117/175] Import function --- PopPUNK/sparse_mst.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 3929f649..13806437 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -34,9 +34,12 @@ from .network import construct_network_from_sparse_matrix from .plot import drawMST + from .trees import mst_to_phylogeny, write_tree + from .utils 
import setGtThreads, readIsolateTypeFromCsv from .utils import check_and_set_gpu +from .utils import read_rlist_from_distance_pickle # command line parsing def get_options(): From badfba0f75cc85fe0f11f2527bd4c50027d8de09 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 11:19:00 +0100 Subject: [PATCH 118/175] Ensure variable always defined --- PopPUNK/sparse_mst.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 13806437..3ecfa86f 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -143,6 +143,7 @@ def main(): sys.exit(1) # Read in old sequence names + old_rlist = None if args.previous_distance_pkl is not None and os.path.exists(args.previous_distance_pkl): old_rlist = read_rlist_from_distance_pickle(args.previous_distance_pkl, allow_non_self = False) From 8cce9a830742d7bd252c43e74a12adc8b038f874 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 11:55:13 +0100 Subject: [PATCH 119/175] Fix network output function --- PopPUNK/plot.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index d346d208..c380c0a2 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -502,19 +502,16 @@ def outputsForCytoscape(G, G_mst, isolate_names, clustering, outPrefix, epiCsv, # write graph file if suffix is None: - graph_file_name = os.path.basename(outPrefix) + "_cytoscape.graphml" + suffix = '_cytoscape' else: - graph_file_name = os.path.basename(outPrefix) + "_" + suffix + "_cytoscape.graphml" - G.save(outPrefix + "/" + graph_file_name, fmt = 'graphml') + suffix = suffix + '_cytoscape' + save_network(G, prefix = outPrefix, suffix = suffix + '_cytoscape', use_graphml = True) if G_mst != None: isolate_labels = isolateNameToLabel(G_mst.vp.id) for n,v in enumerate(G_mst.vertices()): G_mst.vp.id[v] = isolate_labels[n] - if suffix is not None: - graph_suffix = '_' + suffix + '_cytoscape_mst' - else: - graph_suffix = 
'_cytoscape_mst' + suffix = suffix + '_mst' save_network(G_mst, prefix = outPrefix, suffix = graph_suffix, use_graphml = True) # Write CSV of metadata From eac9eea1ff5b0572cae72c83f7ae9af6f8ef4d51 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 11:55:25 +0100 Subject: [PATCH 120/175] Only calculate MST once --- PopPUNK/visualise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 7e2b1959..cedb8315 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -478,11 +478,11 @@ def generate_visualisations(query_db, weights_type = mst_distances, use_gpu = gpu_graph, summarise = False) + if gpu_graph: + G = cugraph.minimum_spanning_tree(G, weight='weights') else: sys.stderr.write("Need either sparse or dense distances matrix to construct MST\n") exit(1) - if gpu_graph: - G = cugraph.minimum_spanning_tree(G, weight='weights') mst_graph = generate_minimum_spanning_tree(G, gpu_graph) del G # save outputs From 9436228cc142cd0d122852e2ef505517e61cc94f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 12:01:18 +0100 Subject: [PATCH 121/175] Add docstring for new function --- PopPUNK/network.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ffa17572..751b621e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1881,7 +1881,20 @@ def cugraph_to_graph_tool(G, rlist): return G def sparse_mat_to_network(sparse_mat, rlist, use_gpu = False): + """Generate a network from a lineage rank fit + Args: + sparse_mat (scipy or cupyx sparse matrix) + Sparse matrix of kNN from lineage fit + rlist (list) + List of sequence names + use_gpu (bool) + Whether GPU libraries should be used + + Returns: + G (network) + Graph tool or cugraph network + """ if use_gpu: G_df = cudf.DataFrame(columns = ['source','destination','weights']) G_df['source'] = sparse_mat.row From 105db6cf4ca6c9b7c80f62ba5c2e8699897c2c3a Mon Sep 
17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 14:49:27 +0100 Subject: [PATCH 122/175] Add script for comparing network properties and sequence properties --- scripts/poppunk_sample_info.py | 225 +++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100755 scripts/poppunk_sample_info.py diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py new file mode 100755 index 00000000..2513e54f --- /dev/null +++ b/scripts/poppunk_sample_info.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python +# vim: set fileencoding= : +# Copyright 2018-2020 John Lees and Nick Croucher + +import sys +import argparse +import h5py +import pandas as pd +from scipy import sparse + +# Load GPU libraries +try: + import cupyx + import cugraph + import cudf + import cupy as cp + from numba import cuda + import rmm + gpu_lib = True +except ImportError as e: + gpu_lib = False + +def setGtThreads(threads): + import graph_tool.all as gt + # Check on parallelisation of graph-tools + if gt.openmp_enabled(): + gt.openmp_set_num_threads(threads) + sys.stderr.write('\nGraph-tools OpenMP parallelisation enabled:') + sys.stderr.write(' with ' + str(gt.openmp_get_num_threads()) + ' threads\n') + +def load_network_file(fn, use_gpu = False): + """Load the network based on input options + + Returns the network as a graph-tool format graph, and sets + the slope parameter of the passed model object. 
+ + Args: + fn (str) + Network file name + use_gpu (bool) + Use cugraph library to load graph + + Returns: + genomeNetwork (graph) + The loaded network + """ + # Load the network from the specified file + if use_gpu: + G_df = cudf.read_csv(fn, compression = 'gzip') + genomeNetwork = cugraph.Graph() + if 'weights' in G_df.columns: + G_df.columns = ['source','destination','weights'] + genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) + else: + G_df.columns = ['source','destination'] + genomeNetwork.from_cudf_edgelist(G_df,renumber=False) + sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") + else: + genomeNetwork = gt.load_graph(fn) + sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") + + return genomeNetwork + +def check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = False): + """Check GPU libraries can be loaded and set managed memory. + + Args: + use_gpu (bool) + Whether GPU packages have been requested + gpu_lib (bool) + Whether GPU packages are available + Returns: + use_gpu (bool) + Whether GPU packages can be used + """ + # load CUDA libraries + if use_gpu and not gpu_lib: + if quit_on_fail: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) + else: + sys.stderr.write('Unable to load GPU libraries; using CPU libraries ' + 'instead\n') + use_gpu = False + + # Set memory management for large networks + if use_gpu: + rmm.reinitialize(managed_memory=True) + cudf.set_allocator("managed") + if "cupy" in sys.modules: + cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) + if "cuda" in sys.modules: + cuda.set_memory_manager(rmm.RMMNumbaManager) + assert(rmm.is_initialized()) + + return use_gpu + +def sparse_mat_to_network(sparse_mat, rlist, use_gpu = False): + """Generate a network from a lineage rank fit + + Args: + sparse_mat (scipy or cupyx sparse matrix) + Sparse matrix of kNN from lineage fit + rlist (list) + List of sequence names + 
use_gpu (bool)
+           Whether GPU libraries should be used
+
+    Returns:
+        G (network)
+            Graph tool or cugraph network
+    """
+    if use_gpu:
+        G_df = cudf.DataFrame(columns = ['source','destination','weights'])
+        G_df['source'] = sparse_mat.row
+        G_df['destination'] = sparse_mat.col
+        G_df['weights'] = sparse_mat.data
+        max_in_vertex_labels = len(rlist)-1
+        G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False)
+    else:
+        connections = []
+        for (src,dst) in zip(sparse_mat.row,sparse_mat.col):
+            connections.append((src,dst))
+        G = construct_network_from_edge_list(rlist,
+                                              rlist,
+                                              connections,
+                                              weights = sparse_mat.data,
+                                              summarise=False)
+
+    return G
+
+# command line parsing
+def get_options():
+
+    parser = argparse.ArgumentParser(description='Get information about a PopPUNK database',
+                                     prog='poppunk_db_info')
+
+    # input options
+    parser.add_argument('--db',
+                        required = True,
+                        help='Database file (.h5)')
+    parser.add_argument('--network',
+                        required = True,
+                        help='Network or lineage fit file for analysis')
+    parser.add_argument('--threads',
+                        default = 1,
+                        help='Number of cores to use in analysis')
+    parser.add_argument('--use-gpu',
+                        default = False,
+                        action = 'store_true',
+                        help='Whether GPU libraries should be used in analysis')
+    parser.add_argument('--output',
+                        required = True,
+                        help='Prefix for output files')
+
+    return parser.parse_args()
+
+# main code
+if __name__ == "__main__":
+
+    # Check input ok
+    args = get_options()
+
+    # Check whether GPU libraries can be loaded
+    use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = False)
+
+    # Set threads for graph-tool
+    setGtThreads(args.threads)
+
+    # Open and process sequence database
+    ref_db = h5py.File(args.db, 'r')
+    sample_names = list(ref_db['sketches'].keys())
+
+    sample_sequence_length = {}
+    sample_missing_bases = {}
+    sample_base_frequencies = {name: [] for name in sample_names}
+
+    for sample_name in sample_names:
+        sample_base_frequencies[sample_name] = 
ref_db['sketches/' + sample_name].attrs['base_freq'] + sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length'] + sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases'] + + # Open network file + if args.network.endswith('.gt'): + G = load_network_file(args.network, use_gpu = False) + elif args.network.endswith('.csv.gz'): + if use_gpu: + G = load_network_file(args.network, use_gpu = True) + else: + sys.stderr.write('Unable to load necessary GPU libraries\n') + exit(1) + elif args.network.endswith('.npz'): + sparse_mat = sparse.load_npz(args.network) + G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu) + else: + sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n') + exit(1) + + # Analyse network + if use_gpu: + component_assignments_df = cugraph.components.connectivity.connected_components(G) + component_assignments_df['component_count'] = component_assignments.groupby('partition')['vertex'].transform('count') + outdegree_df = cugraph.out_degree(G) + graph_properties_df = component_assignments_df.merge(outdegree_df, on = ['vertex']) + else: + graph_properties_df = pd.DataFrame() + graph_properties_df['vertex'] = np.arange(len(rlist)) + graph_properties_df['partition'] = gt.label_components(G)[0].a + graph_properties_df['outdegree'] = G.get_out_degrees(G.get_vertices()) + graph_properties_df['component_count'] = component_assignments.groupby('partition')['vertex'].transform('count') + graph_properties_df['vertex'] = sample_names + + # Merge data and print output + with open(args.output,'w') as out_file: + out_file.write('Sample,Length,Missing_bases,Frequency_A,Frequency_C,Frequency_G,Frequency_T,Component,Component_size,Node_degree\n') + for i,sample_name in enumerate(sample_names): + out_file.write(sample_name + ',' + str(sample_sequence_length[sample_name]) + ',' + str(sample_missing_bases[sample_name]) + ',' + for frequency in 
sample_base_frequencies[sample_name]: + out_file.write(str(frequency) + ',' + out_file.write(str(graph_properties_df['partition'].where(graph_properties_df['vertex']==sample_name))) + out_file.write(str(graph_properties_df['partition'].where(graph_properties_df['vertex']==sample_name))) + out_file.write(str(graph_properties_df['outdegree'].where(graph_properties_df['vertex']==sample_name))) + out_file.write("\n") + + sys.exit(0) From b5644cd9311ac8fea3b3f440c0a104c883bae2a1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 15:54:14 +0100 Subject: [PATCH 123/175] Fix data frame properties --- scripts/poppunk_sample_info.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py index 2513e54f..fe8ea459 100755 --- a/scripts/poppunk_sample_info.py +++ b/scripts/poppunk_sample_info.py @@ -13,7 +13,7 @@ import cupyx import cugraph import cudf - import cupy as cp + import cupy from numba import cuda import rmm gpu_lib = True @@ -162,7 +162,7 @@ def get_options(): args = get_options() # Check whether GPU libraries can be loaded - use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = False) + use_gpu = check_and_set_gpu(args.use_gpu, gpu_lib, quit_on_fail = False) # Set threads for graph-tool setGtThreads(args.threads) @@ -199,14 +199,16 @@ def get_options(): # Analyse network if use_gpu: component_assignments_df = cugraph.components.connectivity.connected_components(G) - component_assignments_df['component_count'] = component_assignments.groupby('partition')['vertex'].transform('count') - outdegree_df = cugraph.out_degree(G) - graph_properties_df = component_assignments_df.merge(outdegree_df, on = ['vertex']) + component_counts_df = component_assignments_df.groupby('labels')['labels'].count() + component_counts_df.name = 'component_count' + component_information_df = component_assignments_df.merge(component_counts_df, on = ['labels'], how = 
'left') + outdegree_df = G.out_degree() + graph_properties_df = component_information_df.merge(outdegree_df, on = ['vertex']) else: graph_properties_df = pd.DataFrame() graph_properties_df['vertex'] = np.arange(len(rlist)) - graph_properties_df['partition'] = gt.label_components(G)[0].a - graph_properties_df['outdegree'] = G.get_out_degrees(G.get_vertices()) + graph_properties_df['labels'] = gt.label_components(G)[0].a + graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices()) graph_properties_df['component_count'] = component_assignments.groupby('partition')['vertex'].transform('count') graph_properties_df['vertex'] = sample_names @@ -214,12 +216,13 @@ def get_options(): with open(args.output,'w') as out_file: out_file.write('Sample,Length,Missing_bases,Frequency_A,Frequency_C,Frequency_G,Frequency_T,Component,Component_size,Node_degree\n') for i,sample_name in enumerate(sample_names): - out_file.write(sample_name + ',' + str(sample_sequence_length[sample_name]) + ',' + str(sample_missing_bases[sample_name]) + ',' + out_file.write(sample_name + ',' + str(sample_sequence_length[sample_name]) + ',' + str(sample_missing_bases[sample_name]) + ',') for frequency in sample_base_frequencies[sample_name]: - out_file.write(str(frequency) + ',' - out_file.write(str(graph_properties_df['partition'].where(graph_properties_df['vertex']==sample_name))) - out_file.write(str(graph_properties_df['partition'].where(graph_properties_df['vertex']==sample_name))) - out_file.write(str(graph_properties_df['outdegree'].where(graph_properties_df['vertex']==sample_name))) + out_file.write(str(frequency) + ',') + graph_properties_row = graph_properties_df.iloc[graph_properties_df['vertex']==sample_name,:] + out_file.write(str(graph_properties_row['labels'].values[0]) + ',') + out_file.write(str(graph_properties_row['component_count'].values[0]) + ',') + out_file.write(str(graph_properties_row['degree'].values[0])) out_file.write("\n") sys.exit(0) From 
52c4e35aca8f25af2063e1005718736a3e2db2fd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 16:00:13 +0100 Subject: [PATCH 124/175] Change sample ordering --- scripts/poppunk_sample_info.py | 40 +++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py index fe8ea459..920bc94d 100755 --- a/scripts/poppunk_sample_info.py +++ b/scripts/poppunk_sample_info.py @@ -28,6 +28,26 @@ def setGtThreads(threads): sys.stderr.write('\nGraph-tools OpenMP parallelisation enabled:') sys.stderr.write(' with ' + str(gt.openmp_get_num_threads()) + ' threads\n') +def read_rlist_from_distance_pickle(fn, allow_non_self = True): + """Return the list of reference sequences from a distance pickle. + + Args: + fn (str) + Name of distance pickle + allow_non_self (bool) + Whether non-self distance datasets are permissible + Returns: + rlist (list) + List of reference sequence names + """ + with open(fn, 'rb') as pickle_file: + rlist, qlist, self = pickle.load(pickle_file) + if not allow_non_self and not self: + sys.stderr.write("Thi analysis requires an all-v-all" + " distance dataset\n") + sys.exit(1) + return rlist + def load_network_file(fn, use_gpu = False): """Load the network based on input options @@ -136,12 +156,15 @@ def get_options(): prog='poppunk_db_info') # input options - parser.add_argument('--db', + parser.add_argument('--ref-db', required = True, - help='Database file (.h5)') + help='PopPUNK database directory') parser.add_argument('--network', required = True, help='Network or lineage fit file for analysis') + parser.add_argument('--distances', + default = None, + help='Prefix of distance files') parser.add_argument('--threads', default = 1, help='Number of cores to use in analysis') @@ -168,7 +191,8 @@ def get_options(): setGtThreads(args.threads) # Open and process sequence database - ref_db = h5py.File(args.db, 'r') + h5_fn = 
os.join(args.ref_db,os.path.basename(args.ref_db) + '.h5') + ref_db = h5py.File(h5_fn, 'r') sample_names = list(ref_db['sketches'].keys()) sample_sequence_length = {} @@ -179,7 +203,13 @@ def get_options(): sample_base_frequencies[sample_name] = ref_db['sketches/' + sample_name].attrs['base_freq'] sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length'] sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases'] - + + # Process distance file + distance_pkl = os.join(args.ref_db,os.path.basename(args.ref_db) + '.dists.pkl') + if args.distances is not None: + distance_pkl = args.distances + '.dists.pkl' + rlist = read_rlist_from_distance_pickle(distance_pkl) + # Open network file if args.network.endswith('.gt'): G = load_network_file(args.network, use_gpu = False) @@ -210,7 +240,7 @@ def get_options(): graph_properties_df['labels'] = gt.label_components(G)[0].a graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices()) graph_properties_df['component_count'] = component_assignments.groupby('partition')['vertex'].transform('count') - graph_properties_df['vertex'] = sample_names + graph_properties_df['vertex'] = rlist # Merge data and print output with open(args.output,'w') as out_file: From 2796773424fe86576353e39f9c1cc90c65405693 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 16:06:05 +0100 Subject: [PATCH 125/175] Change file name processing --- scripts/poppunk_sample_info.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py index 920bc94d..9b79b4f6 100755 --- a/scripts/poppunk_sample_info.py +++ b/scripts/poppunk_sample_info.py @@ -2,8 +2,10 @@ # vim: set fileencoding= : # Copyright 2018-2020 John Lees and Nick Croucher +import os import sys import argparse +import pickle import h5py import pandas as pd from scipy import sparse @@ -191,7 +193,7 @@ def get_options(): 
setGtThreads(args.threads) # Open and process sequence database - h5_fn = os.join(args.ref_db,os.path.basename(args.ref_db) + '.h5') + h5_fn = os.path.join(args.ref_db,os.path.basename(args.ref_db) + '.h5') ref_db = h5py.File(h5_fn, 'r') sample_names = list(ref_db['sketches'].keys()) @@ -205,7 +207,7 @@ def get_options(): sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases'] # Process distance file - distance_pkl = os.join(args.ref_db,os.path.basename(args.ref_db) + '.dists.pkl') + distance_pkl = os.path.join(args.ref_db,os.path.basename(args.ref_db) + '.dists.pkl') if args.distances is not None: distance_pkl = args.distances + '.dists.pkl' rlist = read_rlist_from_distance_pickle(distance_pkl) From 886e6546cbe5df1926de9751b4b2245a387d839c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 16:07:10 +0100 Subject: [PATCH 126/175] Enable sparse matrix processing --- scripts/poppunk_sample_info.py | 42 ++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py index 9b79b4f6..4acbed8b 100755 --- a/scripts/poppunk_sample_info.py +++ b/scripts/poppunk_sample_info.py @@ -30,6 +30,48 @@ def setGtThreads(threads): sys.stderr.write('\nGraph-tools OpenMP parallelisation enabled:') sys.stderr.write(' with ' + str(gt.openmp_get_num_threads()) + ' threads\n') +def add_self_loop(G_df, seq_num, weights = False, renumber = True): + """Adds self-loop to cugraph graph to ensure all nodes are included in + the graph, even if singletons. 
+
+    Args:
+       G_df (cudf)
+           cudf data frame containing edge list
+       seq_num (int)
+           The expected number of nodes in the graph
+       renumber (bool)
+           Whether to renumber the vertices when added to the graph
+
+    Returns:
+       G_new (graph)
+           cugraph network with self loop(s) added so all nodes are present
+    """
+    # use self-loop to ensure all nodes are present
+    min_in_df = np.amin([G_df['source'].min(), G_df['destination'].min()])
+    if min_in_df.item() > 0:
+        G_self_loop = cudf.DataFrame()
+        G_self_loop['source'] = [0]
+        G_self_loop['destination'] = [0]
+        if weights:
+            G_self_loop['weights'] = 0.0
+        G_df = cudf.concat([G_df,G_self_loop], ignore_index = True)
+    max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()])
+    if max_in_df.item() != seq_num:
+        G_self_loop = cudf.DataFrame()
+        G_self_loop['source'] = [seq_num]
+        G_self_loop['destination'] = [seq_num]
+        if weights:
+            G_self_loop['weights'] = 0.0
+        G_df = cudf.concat([G_df,G_self_loop], ignore_index = True)
+    # Construct graph
+    G_new = cugraph.Graph()
+    if weights:
+        G_new.from_cudf_edgelist(G_df, edge_attr = 'weights', renumber = renumber)
+    else:
+        G_new.from_cudf_edgelist(G_df, renumber = renumber)
+    return G_new
+
+
 def read_rlist_from_distance_pickle(fn, allow_non_self = True):
     """Return the list of reference sequences from a distance pickle.
From adf7a621a18bdfb8142d4aa6fb864c492d4992ac Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 16:38:41 +0100 Subject: [PATCH 127/175] Fix column ordering --- PopPUNK/network.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 751b621e..01cb05a2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -145,12 +145,12 @@ def load_network_file(fn, use_gpu = False): # Load the network from the specified file if use_gpu: G_df = cudf.read_csv(fn, compression = 'gzip') + if 'src' in G_df.columns: + G_df.rename(columns={'source': 'src','destination': 'dst'}, inplace=True) genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: - G_df.columns = ['source','destination','weights'] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: - G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") else: @@ -527,7 +527,8 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, sys.stderr.write('Loaded network does not have edge weights; try a different ' 'network or turn off graph weights\n') exit(1) - G_df.columns = ['weights','src','dst'] # This appears to be a bug in cugraph v0.19 + if 'src' in G_df.columns: + G_df.rename(columns={'source': 'src','destination': 'dst'}, inplace=True) edge_weights = G_df['weights'].to_arrow().to_pylist() G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) old_source_ids = G_df['source'].astype('int32').to_arrow().to_pylist() From 449f220f7a0deb0dea24863c6d24abcddcc18db2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 16:39:08 +0100 Subject: [PATCH 128/175] Fix column ordering --- scripts/poppunk_sample_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py 
index 4acbed8b..0c0e071d 100755 --- a/scripts/poppunk_sample_info.py +++ b/scripts/poppunk_sample_info.py @@ -111,12 +111,12 @@ def load_network_file(fn, use_gpu = False): # Load the network from the specified file if use_gpu: G_df = cudf.read_csv(fn, compression = 'gzip') + if 'src' in G_df.columns: + G_df.rename(columns={'source': 'src','destination': 'dst'}, inplace=True) genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: - G_df.columns = ['source','destination','weights'] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: - G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") else: From 3249f053a5a237020f607b6e31a8deae4a8a2b4e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 16:46:01 +0100 Subject: [PATCH 129/175] Change column order on loading --- PopPUNK/network.py | 1 + scripts/poppunk_sample_info.py | 1 + 2 files changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 01cb05a2..54525924 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -149,6 +149,7 @@ def load_network_file(fn, use_gpu = False): G_df.rename(columns={'source': 'src','destination': 'dst'}, inplace=True) genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: + G_df = G_df['source','destination','weights'] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: genomeNetwork.from_cudf_edgelist(G_df,renumber=False) diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py index 0c0e071d..b8e90a09 100755 --- a/scripts/poppunk_sample_info.py +++ b/scripts/poppunk_sample_info.py @@ -115,6 +115,7 @@ def load_network_file(fn, use_gpu = False): G_df.rename(columns={'source': 'src','destination': 'dst'}, inplace=True) genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: + G_df = 
G_df['source','destination','weights'] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: genomeNetwork.from_cudf_edgelist(G_df,renumber=False) From a9b84a91b377ebbed7e5d9738d5ec8f0a5ecdce9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 16:55:04 +0100 Subject: [PATCH 130/175] Change column renaming --- PopPUNK/network.py | 2 +- scripts/poppunk_sample_info.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 54525924..f94e5f23 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -146,7 +146,7 @@ def load_network_file(fn, use_gpu = False): if use_gpu: G_df = cudf.read_csv(fn, compression = 'gzip') if 'src' in G_df.columns: - G_df.rename(columns={'source': 'src','destination': 'dst'}, inplace=True) + G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: G_df = G_df['source','destination','weights'] diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py index b8e90a09..18aa5ec0 100755 --- a/scripts/poppunk_sample_info.py +++ b/scripts/poppunk_sample_info.py @@ -7,6 +7,7 @@ import argparse import pickle import h5py +import numpy as np import pandas as pd from scipy import sparse @@ -112,7 +113,7 @@ def load_network_file(fn, use_gpu = False): if use_gpu: G_df = cudf.read_csv(fn, compression = 'gzip') if 'src' in G_df.columns: - G_df.rename(columns={'source': 'src','destination': 'dst'}, inplace=True) + G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: G_df = G_df['source','destination','weights'] From a4ddb976df5c402b9f1e7b7151a9e7fe16b75a85 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 13 Jul 2021 17:10:09 +0100 Subject: [PATCH 131/175] Change taxon ordering --- scripts/poppunk_sample_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py index 18aa5ec0..34f70d15 100755 --- a/scripts/poppunk_sample_info.py +++ b/scripts/poppunk_sample_info.py @@ -267,7 +267,7 @@ def get_options(): exit(1) elif args.network.endswith('.npz'): sparse_mat = sparse.load_npz(args.network) - G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu) + G = sparse_mat_to_network(sparse_mat, rlist, use_gpu = use_gpu) else: sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n') exit(1) @@ -275,7 +275,7 @@ def get_options(): # Analyse network if use_gpu: component_assignments_df = cugraph.components.connectivity.connected_components(G) - component_counts_df = component_assignments_df.groupby('labels')['labels'].count() + component_counts_df = component_assignments_df.groupby('labels')['vertex'].count() component_counts_df.name = 'component_count' component_information_df = component_assignments_df.merge(component_counts_df, on = ['labels'], how = 'left') outdegree_df = G.out_degree() From 6efdd843b0cd88766b3c1c8d71bf39eddb420595 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 14 Jul 2021 21:44:39 +0100 Subject: [PATCH 132/175] Sort vertex order --- scripts/poppunk_sample_info.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py index 34f70d15..d75f2bb5 100755 --- a/scripts/poppunk_sample_info.py +++ b/scripts/poppunk_sample_info.py @@ -286,11 +286,12 @@ def get_options(): graph_properties_df['labels'] = gt.label_components(G)[0].a graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices()) graph_properties_df['component_count'] = component_assignments.groupby('partition')['vertex'].transform('count') + graph_properties_df = graph_properties_df.sort_values('vertex', axis = 0) # inplace not implemented for cudf graph_properties_df['vertex'] = rlist # Merge data and print output with open(args.output,'w') as out_file: - 
out_file.write('Sample,Length,Missing_bases,Frequency_A,Frequency_C,Frequency_G,Frequency_T,Component,Component_size,Node_degree\n') + out_file.write('Sample,Length,Missing_bases,Frequency_A,Frequency_C,Frequency_G,Frequency_T,Component_label,Component_size,Node_degree\n') for i,sample_name in enumerate(sample_names): out_file.write(sample_name + ',' + str(sample_sequence_length[sample_name]) + ',' + str(sample_missing_bases[sample_name]) + ',') for frequency in sample_base_frequencies[sample_name]: From e9885618b4cc3855cafbcab4e4aa698da2be9bc6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 14 Jul 2021 22:08:49 +0100 Subject: [PATCH 133/175] Remove unnecessary distance argument --- scripts/poppunk_sample_info.py | 35 +++------------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py index d75f2bb5..064af984 100755 --- a/scripts/poppunk_sample_info.py +++ b/scripts/poppunk_sample_info.py @@ -73,26 +73,6 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True): return G_new -def read_rlist_from_distance_pickle(fn, allow_non_self = True): - """Return the list of reference sequences from a distance pickle. 
- - Args: - fn (str) - Name of distance pickle - allow_non_self (bool) - Whether non-self distance datasets are permissible - Returns: - rlist (list) - List of reference sequence names - """ - with open(fn, 'rb') as pickle_file: - rlist, qlist, self = pickle.load(pickle_file) - if not allow_non_self and not self: - sys.stderr.write("Thi analysis requires an all-v-all" - " distance dataset\n") - sys.exit(1) - return rlist - def load_network_file(fn, use_gpu = False): """Load the network based on input options @@ -208,9 +188,6 @@ def get_options(): parser.add_argument('--network', required = True, help='Network or lineage fit file for analysis') - parser.add_argument('--distances', - default = None, - help='Prefix of distance files') parser.add_argument('--threads', default = 1, help='Number of cores to use in analysis') @@ -250,12 +227,6 @@ def get_options(): sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length'] sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases'] - # Process distance file - distance_pkl = os.path.join(args.ref_db,os.path.basename(args.ref_db) + '.dists.pkl') - if args.distances is not None: - distance_pkl = args.distances + '.dists.pkl' - rlist = read_rlist_from_distance_pickle(distance_pkl) - # Open network file if args.network.endswith('.gt'): G = load_network_file(args.network, use_gpu = False) @@ -267,7 +238,7 @@ def get_options(): exit(1) elif args.network.endswith('.npz'): sparse_mat = sparse.load_npz(args.network) - G = sparse_mat_to_network(sparse_mat, rlist, use_gpu = use_gpu) + G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu) else: sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n') exit(1) @@ -282,12 +253,12 @@ def get_options(): graph_properties_df = component_information_df.merge(outdegree_df, on = ['vertex']) else: graph_properties_df = pd.DataFrame() - graph_properties_df['vertex'] = np.arange(len(rlist)) + 
graph_properties_df['vertex'] = np.arange(len(sample_names)) graph_properties_df['labels'] = gt.label_components(G)[0].a graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices()) graph_properties_df['component_count'] = component_assignments.groupby('partition')['vertex'].transform('count') graph_properties_df = graph_properties_df.sort_values('vertex', axis = 0) # inplace not implemented for cudf - graph_properties_df['vertex'] = rlist + graph_properties_df['vertex'] = sample_names # Merge data and print output with open(args.output,'w') as out_file: From f4b2cd13eaeccbd88f29fcc1abafb9f97345b6b8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 15 Jul 2021 06:08:59 +0100 Subject: [PATCH 134/175] Define suffix variable --- PopPUNK/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index c380c0a2..f6e6d4ed 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -512,7 +512,7 @@ def outputsForCytoscape(G, G_mst, isolate_names, clustering, outPrefix, epiCsv, for n,v in enumerate(G_mst.vertices()): G_mst.vp.id[v] = isolate_labels[n] suffix = suffix + '_mst' - save_network(G_mst, prefix = outPrefix, suffix = graph_suffix, use_graphml = True) + save_network(G_mst, prefix = outPrefix, suffix = suffix, use_graphml = True) # Write CSV of metadata if writeCsv: From 2fd9df21d0b0a4ad7affb7badf69501fe5b60073 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 15 Jul 2021 09:18:14 +0100 Subject: [PATCH 135/175] Re-enable joining of databases with lineages --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 970b75b4..36bee22c 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -308,7 +308,7 @@ def assign_query(dbFuncs, else: sys.stderr.write("Updating reference database to " + output + "\n") # Update the network + ref list (everything) - no need to duplicate for core/accessory - if fit_type == 'original': + if fit_type 
not in ['core','accessory']: joinDBs(ref_db, output, output, {"threads": threads, "strand_preserved": strand_preserved}) if model.type == 'lineage': From 2be55ddbca5964c0fddd5158db6adfdd48549b22 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 08:19:19 +0100 Subject: [PATCH 136/175] Remove unnecessary branch --- PopPUNK/assign.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 36bee22c..b3223800 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -163,7 +163,7 @@ def assign_query(dbFuncs, calc_random = False, use_gpu = gpu_sketch, deviceid = deviceid) - if (fit_type == 'original'): + if (fit_type == 'original' or (fit_type != 'original' and use_ref_graph)): # run query qrDistMat = queryDatabase(rNames = rNames, qNames = qNames, @@ -174,17 +174,6 @@ def assign_query(dbFuncs, number_plot_fits = plot_fit, threads = threads, use_gpu = gpu_dist) - elif (fit_type != 'original' and use_ref_graph == False): - # Only re-run query if references are being used - qrDistMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = ref_db, - queryPrefix = output, - klist = kmers, - self = False, - number_plot_fits = plot_fit, - threads = threads, - use_gpu = gpu_dist) # QC distance matrix if qc_dict['run_qc']: @@ -283,7 +272,11 @@ def assign_query(dbFuncs, output, distances = distances, distance_type = dist_type, - queryQuery = update_db, + queryQuery = (update_db and + (fit_type == 'original' or + (fit_type != 'original' and use_ref_graph) + ) + ), strand_preserved = strand_preserved, weights = weights, threads = threads, From 5054109712988396b8d62f72ac18c9bd4ff0a8d2 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 08:43:36 +0100 Subject: [PATCH 137/175] Check model type before assigning different modes --- PopPUNK/assign.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 
b3223800..c60bbf80 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -122,12 +122,13 @@ def assign_query(dbFuncs, # but have different networks, references, reference distances and assignments fit_type_list = ['original'] fit_string_list = [''] - if core: - fit_type_list.append('core') - fit_string_list.append('_core') - if accessory: - fit_type_list.append('accessory') - fit_string_list.append('_accessory') + if model.type == 'refine' and model.indiv_fitted: + if core: + fit_type_list.append('core') + fit_string_list.append('_core') + if accessory: + fit_type_list.append('accessory') + fit_string_list.append('_accessory') for fit_type, fit_string in zip(fit_type_list, fit_string_list): # Find distances vs ref seqs @@ -276,7 +277,7 @@ def assign_query(dbFuncs, (fit_type == 'original' or (fit_type != 'original' and use_ref_graph) ) - ), + ), strand_preserved = strand_preserved, weights = weights, threads = threads, From 3a5fb220c7d9bff41556edfd96e2977f26ae48d6 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 08:58:01 +0100 Subject: [PATCH 138/175] Fix file name processing --- PopPUNK/assign.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index c60bbf80..b5a3e781 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -121,23 +121,25 @@ def assign_query(dbFuncs, # Core and accessory assignments use the same model and same overall set of distances # but have different networks, references, reference distances and assignments fit_type_list = ['original'] - fit_string_list = [''] if model.type == 'refine' and model.indiv_fitted: if core: fit_type_list.append('core') - fit_string_list.append('_core') if accessory: fit_type_list.append('accessory') - fit_string_list.append('_accessory') - for fit_type, fit_string in zip(fit_type_list, fit_string_list): + for fit_type in fit_type_list: + # Define file name extension + file_extension_string = '' + if
fit_type != 'original': + file_extension_string = '_' + fit_type # Find distances vs ref seqs rNames = [] + ref_file_name = os.path.join(model_prefix, + os.path.basename(model_prefix) + file_extension_string + ".refs") use_ref_graph = \ - os.path.isfile(model_prefix + "/" + os.path.basename(model_prefix) + fit_string + ".refs") \ - and not update_db and model.type != 'lineage' + os.path.isfile(ref_file_name) and not update_db and model.type != 'lineage' if use_ref_graph: - with open(model_prefix + "/" + os.path.basename(model_prefix) + fit_string + ".refs") as refFile: + with open(ref_file_name) as refFile: for reference in refFile: rNames.append(reference.rstrip()) else: @@ -147,7 +149,7 @@ def assign_query(dbFuncs, sys.stderr.write("Reference distances missing, cannot use --update-db\n") sys.exit(1) else: - rNames = getSeqsInDb(ref_db + "/" + os.path.basename(ref_db) + ".h5") + rNames = getSeqsInDb(os.path.join(ref_db, os.path.basename(ref_db) + ".h5")) # construct database - use a single database directory for all query outputs if (web and json_sketch): qNames = sketch_to_hdf5(json_sketch, output) @@ -283,7 +285,7 @@ def assign_query(dbFuncs, threads = threads, use_gpu = gpu_graph) - output_fn = output + "/" + os.path.basename(output) + fit_string + output_fn = os.path.join(output, os.path.basename(output) + file_extension_string) isolateClustering = \ {'combined': printClusters(genomeNetwork, rNames + qNames, @@ -306,12 +308,15 @@ def assign_query(dbFuncs, joinDBs(ref_db, output, output, {"threads": threads, "strand_preserved": strand_preserved}) if model.type == 'lineage': - save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) + save_network(genomeNetwork[min(model.ranks)], + prefix = output, + suffix = '_graph', + use_gpu = gpu_graph) # Save sparse distance matrices and updated model model.outPrefix = os.path.basename(output) model.save() else: - graph_suffix = fit_string + '_graph' + graph_suffix =
file_extension_string + '_graph' save_network(genomeNetwork, prefix = output, suffix = graph_suffix, @@ -343,7 +348,7 @@ def assign_query(dbFuncs, if model.type != 'lineage': existing_ref_list = [] - with open(model_prefix + "/" + os.path.basename(model_prefix) + fit_string + ".refs") as refFile: + with open(ref_file_name) as refFile: for reference in refFile: existing_ref_list.append(reference.rstrip()) @@ -353,7 +358,7 @@ def assign_query(dbFuncs, extractReferences(genomeNetwork, combined_seq, output, - outSuffix = fit_string, + outSuffix = file_extension_string, existingRefs = existing_ref_list, type_isolate = qc_dict['type_isolate'], threads = threads, @@ -369,17 +374,17 @@ def assign_query(dbFuncs, if (len(names_to_remove) > 0): # This function also writes out the new ref distance matrix - dists_suffix = fit_string + '.refs.dists' + dists_suffix = file_extension_string + '.refs.dists' postpruning_combined_seq, newDistMat = \ prune_distance_matrix(combined_seq, names_to_remove, complete_distMat, output + "/" + os.path.basename(output) + dists_suffix) - graph_suffix = fit_string + '_refs_graph' + graph_suffix = file_extension_string + '_refs_graph' save_network(genomeNetwork, prefix = output, suffix = graph_suffix, use_gpu = gpu_graph) removeFromDB(output, output, names_to_remove) - db_suffix = fit_string + '.refs.h5' + db_suffix = file_extension_string + '.refs.h5' os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", output + "/" + os.path.basename(output) + db_suffix) @@ -393,7 +398,7 @@ def assign_query(dbFuncs, if model.type == 'lineage': save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) else: - graph_suffix = fit_string + '_graph' + graph_suffix = file_extension_string + '_graph' save_network(genomeNetwork, prefix = output, suffix = graph_suffix, use_gpu = gpu_graph) return(isolateClustering) From ec7088292016d3f58dcd5df162ccabc6f6d89ef2 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 
2021 09:02:06 +0100 Subject: [PATCH 139/175] Clarify conditional test --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index b5a3e781..807d488a 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -304,7 +304,7 @@ def assign_query(dbFuncs, else: sys.stderr.write("Updating reference database to " + output + "\n") # Update the network + ref list (everything) - no need to duplicate for core/accessory - if fit_type not in ['core','accessory']: + if fit_type == 'original': joinDBs(ref_db, output, output, {"threads": threads, "strand_preserved": strand_preserved}) if model.type == 'lineage': From aeeb3dc24a8dc3d5107e39b56644b9430f663c75 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 27 Jul 2021 09:03:21 +0100 Subject: [PATCH 140/175] Fix section title Co-authored-by: John Lees --- PopPUNK/visualise.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index cedb8315..91259ca7 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -248,7 +248,8 @@ def generate_visualisations(query_db, #******************************# #* *# - #* Process dense or sparse distances *# + #* Process dense or sparse *# + #* distances *# #* *# #******************************# From d16b5dad4afd3dc4c0496996686b6df4dbda66a7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 27 Jul 2021 09:03:45 +0100 Subject: [PATCH 141/175] Fix whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f94e5f23..c19c75b1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -277,7 +277,6 @@ def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None references = set(existingRefs) index_lookup = {v:k for k,v in enumerate(dbOrder)} reference_indices = set([index_lookup[r] for r in references]) - # Add type isolate, if necessary 
type_isolate_index = None if type_isolate is not None: From 9e2ad6226108b017a355c1fcafda39652a4b9d24 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 27 Jul 2021 09:04:16 +0100 Subject: [PATCH 142/175] Require necessary function argument Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c19c75b1..fb85ce83 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -977,7 +977,7 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G -def construct_dense_weighted_network(rlist, distMat = None, weights_type = None, use_gpu = False): +def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gpu = False): """Construct an undirected network using sequence lists, assignments of pairwise distances to clusters, and the identifier of the cluster assigned to within-strain distances. 
Nodes are samples and edges where samples are within the same cluster From 722d9a1ea29406b0ce9d3f40a2466917dae1ab29 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 27 Jul 2021 09:04:55 +0100 Subject: [PATCH 143/175] Fix whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index fb85ce83..79a10719 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1910,7 +1910,7 @@ def sparse_mat_to_network(sparse_mat, rlist, use_gpu = False): G = construct_network_from_edge_list(rlist, rlist, connections, - weights = sparse_mat.data, + weights=sparse_mat.data, summarise=False) return G From 8098aab71c4337853f8e1782a5a0db8c188bbf2f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 27 Jul 2021 09:07:29 +0100 Subject: [PATCH 144/175] Fix section title Co-authored-by: John Lees --- PopPUNK/visualise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 91259ca7..e78b7163 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -226,7 +226,7 @@ def generate_visualisations(query_db, #******************************# #* *# - #* Initial checks and set up *# + #* Initial checks and set up *# #* *# #******************************# From 7d1bd047f7b7b356c3d93875213104dc81b94c55 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 09:13:13 +0100 Subject: [PATCH 145/175] Tidy up function calls --- PopPUNK/network.py | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 79a10719..248951c1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -700,10 +700,18 @@ def process_previous_network(previous_network = None, adding_qq_dists = False, o return extra_sources, extra_targets, extra_weights -def construct_network_from_edge_list(rlist, qlist, edge_list, - weights = None, 
distMat = None, previous_network = None, adding_qq_dists = False, - old_ids = None, previous_pkl = None, betweenness_sample = betweenness_sample_default, - summarise = True, use_gpu = False): +def construct_network_from_edge_list(rlist, + qlist, + edge_list, + weights = None, + distMat = None, + previous_network = None, + adding_qq_dists = False, + old_ids = None, + previous_pkl = None, + betweenness_sample = betweenness_sample_default, + summarise = True, + use_gpu = False): """Construct an undirected network using a data frame of edges. Nodes are samples and edges where samples are within the same cluster @@ -813,10 +821,18 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, return G -def construct_network_from_df(rlist, qlist, G_df, - weights = False, distMat = None, previous_network = None, adding_qq_dists = False, - old_ids = None, previous_pkl = None, betweenness_sample = betweenness_sample_default, - summarise = True, use_gpu = False): +def construct_network_from_df(rlist, + qlist, + G_df, + weights = False, + distMat = None, + previous_network = None, + adding_qq_dists = False, + old_ids = None, + previous_pkl = None, + betweenness_sample = betweenness_sample_default, + summarise = True, + use_gpu = False): """Construct an undirected network using a data frame of edges. 
Nodes are samples and edges where samples are within the same cluster @@ -918,9 +934,15 @@ def construct_network_from_df(rlist, qlist, G_df, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G -def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, - weights = None, previous_network = None, previous_pkl = None, - betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): +def construct_network_from_sparse_matrix(rlist, + qlist, + sparse_input, + weights = None, + previous_network = None, + previous_pkl = None, + betweenness_sample = betweenness_sample_default, + summarise = True, + use_gpu = False): """Construct an undirected network using a sparse matrix. Nodes are samples and edges where samples are within the same cluster From a8ffbd4dd65125ccbe0a75de250b2bb050cd32ae Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 09:36:03 +0100 Subject: [PATCH 146/175] Remove parentheses from isolate names --- PopPUNK/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index c4da1a0d..f823defe 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -544,7 +544,8 @@ def isolateNameToLabel(names): """ # useful to have as a function in case we # want to remove certain characters - labels = [name.split('/')[-1].replace('.','_').replace(':','') for name in names] + labels = [name.split('/')[-1].replace('.','_').replace(':','').replace('(','_').replace(')','_') \ + for name in names] return labels From 2d70db89e6db4c69d63cc6aabdc3a4aff4534af4 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 09:58:34 +0100 Subject: [PATCH 147/175] Initialise weights list --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 248951c1..4a43438f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1051,7 +1051,7 @@ def 
construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gp G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) else: # Construct network with CPU via edge list - #weighted_edges = [] + weighted_edges = [] for ((src, dest), weight) in zip(edge_list, weights): weighted_edges.append((src, dest, weight)) # build the graph From efa5a7d06083fc1a99f1f875aa7843da98d0ffe2 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 10:10:40 +0100 Subject: [PATCH 148/175] Clarify assert statement for references --- PopPUNK/assign.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 807d488a..adfc3a9f 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -391,7 +391,8 @@ def assign_query(dbFuncs, # Check that the updated set of references includes all old references, and references added from # queries; there may be further new references, even from the original database, where paths are # added between reference isolates in the same component, or new cliques formed - assert set(postpruning_combined_seq).issuperset(set(existing_ref_list).union(set(newQueries))) + added_references = set(existing_ref_list).union(set(newQueries)) + assert set(postpruning_combined_seq).issuperset(added_references), "Error identifying references" else: storePickle(rNames, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: From cc0f023d91d8cfdd8e43df654e9c6f35f50a1ddf Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 10:14:00 +0100 Subject: [PATCH 149/175] Remove defaults --- src/python_bindings.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp index 39d7ca5e..56a92004 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -33,8 +33,8 @@ edge_tuple edgeThreshold(const Eigen::Ref &distMat, edge_tuple generateTuples(const std::vector &assignments, const int within_label, 
bool self = true, - const int num_ref = 0, - const int int_offset = 0) { + const int num_ref, + const int int_offset) { edge_tuple edges = generate_tuples(assignments, within_label, self, num_ref, int_offset); return (edges); @@ -44,7 +44,7 @@ network_coo thresholdIterate1D(const Eigen::Ref &distMat, const std::vector &offsets, const int slope, const double x0, const double y0, const double x1, - const double y1, const int num_threads = 1) { + const double y1, const int num_threads) { if (!std::is_sorted(offsets.begin(), offsets.end())) { throw std::runtime_error("Offsets to thresholdIterate1D must be sorted"); } From 08276f5f088c007c50870b4348373673adcc8c38 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 11:45:31 +0100 Subject: [PATCH 150/175] Consolidate info scripts into new code --- PopPUNK/info.py | 165 ++++++++++++++++++++ poppunk_info-runner.py | 11 ++ scripts/poppunk_db_info.py | 68 -------- scripts/poppunk_sample_info.py | 276 --------------------------------- src/boundary.cpp | 25 +++ src/boundary.hpp | 5 + src/python_bindings.cpp | 40 +++-- 7 files changed, 237 insertions(+), 353 deletions(-) create mode 100644 PopPUNK/info.py create mode 100755 poppunk_info-runner.py delete mode 100755 scripts/poppunk_db_info.py delete mode 100755 scripts/poppunk_sample_info.py diff --git a/PopPUNK/info.py b/PopPUNK/info.py new file mode 100644 index 00000000..03c634c2 --- /dev/null +++ b/PopPUNK/info.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python +# vim: set fileencoding= : +# Copyright 2018-2020 John Lees and Nick Croucher + +# universal +import os +import sys +import pickle +# additional +import h5py +import argparse +import numpy as np +import pandas as pd +from scipy import sparse + +# Load GPU libraries +try: + import cupyx + import cugraph + import cudf + import cupy + from numba import cuda + import rmm + gpu_lib = True +except ImportError as e: + gpu_lib = False + +# command line parsing +def get_options(): + + parser = 
argparse.ArgumentParser(description='Get information about a PopPUNK database', + prog='poppunk_db_info') + + # input options + parser.add_argument('--ref-db', + required = True, + help='PopPUNK database directory') + parser.add_argument('--network', + required = True, + help='Network or lineage fit file for analysis') + parser.add_argument('--threads', + default = 1, + help='Number of cores to use in analysis') + parser.add_argument('--use-gpu', + default = False, + action = 'store_true', + help='Whether GPU libraries should be used in analysis') + parser.add_argument('--output', + required = True, + help='Prefix for output files') + + return parser.parse_args() + +# main code +if __name__ == "__main__": + + # Import functions + from .network import add_self_loop + from .network import load_network_file + from .network import sparse_mat_to_network + from .utils import check_and_set_gpu + from .utils import setGtThreads + + # Check input ok + args = get_options() + + # Check whether GPU libraries can be loaded + use_gpu = check_and_set_gpu(args.use_gpu, gpu_lib, quit_on_fail = False) + + # Set threads for graph-tool + setGtThreads(args.threads) + + # Open and process sequence database + h5_fn = os.path.join(args.ref_db,os.path.basename(args.ref_db) + '.h5') + ref_db = h5py.File(h5_fn, 'r') + + # Print overall database information + ref_db = h5py.File(h5_fn, 'r') + print("PopPUNK database:\t\t" + args.ref_db) + + sketch_version = ref_db['sketches'].attrs['sketch_version'] + print("Sketch version:\t\t\t" + sketch_version) + + num_samples = len(ref_db['sketches'].keys()) + print("Number of samples:\t\t" + str(num_samples)) + + first_sample = list(ref_db['sketches'].keys())[0] + kmer_size = ref_db['sketches/' + first_sample].attrs['kmers'] + print("K-mer sizes:\t\t\t" + ",".join([str(x) for x in kmer_size])) + + sketch_size = int(ref_db['sketches/' + first_sample].attrs['sketchsize64']) * 64 + print("Sketch size:\t\t\t" + str(sketch_size)) + + if 'random' in
ref_db.keys(): + has_random = True + else: + has_random = False + print("Contains random matches:\t" + str(has_random)) + + try: + codon_phased = ref_db['sketches'].attrs['codon_phased'] == 1 + except KeyError: + codon_phased = False + print("Codon phased seeds:\t\t" + str(codon_phased)) + + # Print sample information + sample_names = list(ref_db['sketches'].keys()) + sample_sequence_length = {} + sample_missing_bases = {} + sample_base_frequencies = {name: [] for name in sample_names} + + for sample_name in sample_names: + sample_base_frequencies[sample_name] = ref_db['sketches/' + sample_name].attrs['base_freq'] + sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length'] + sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases'] + + # Open network file + if args.network.endswith('.gt'): + G = load_network_file(args.network, use_gpu = False) + elif args.network.endswith('.csv.gz'): + if use_gpu: + G = load_network_file(args.network, use_gpu = True) + else: + sys.stderr.write('Unable to load necessary GPU libraries\n') + exit(1) + elif args.network.endswith('.npz'): + sparse_mat = sparse.load_npz(args.network) + G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu) + else: + sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n') + exit(1) + + # Analyse network + if use_gpu: + component_assignments_df = cugraph.components.connectivity.connected_components(G) + component_counts_df = component_assignments_df.groupby('labels')['vertex'].count() + component_counts_df.name = 'component_count' + component_information_df = component_assignments_df.merge(component_counts_df, on = ['labels'], how = 'left') + outdegree_df = G.out_degree() + graph_properties_df = component_information_df.merge(outdegree_df, on = ['vertex']) + else: + graph_properties_df = pd.DataFrame() + graph_properties_df['vertex'] = np.arange(len(sample_names)) + graph_properties_df['labels'] = 
gt.label_components(G)[0].a + graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices()) + graph_properties_df['component_count'] = component_assignments.groupby('partition')['vertex'].transform('count') + graph_properties_df = graph_properties_df.sort_values('vertex', axis = 0) # inplace not implemented for cudf + graph_properties_df['vertex'] = sample_names + + # Merge data and print output + with open(args.output,'w') as out_file: + out_file.write( + 'Sample,Length,Missing_bases,Frequency_A,Frequency_C,Frequency_G,Frequency_T,Component_label,Component_size,Node_degree\n' + ) + for i,sample_name in enumerate(sample_names): + out_file.write(sample_name + ',' + str(sample_sequence_length[sample_name]) + ',' + str(sample_missing_bases[sample_name]) + ',') + for frequency in sample_base_frequencies[sample_name]: + out_file.write(str(frequency) + ',') + graph_properties_row = graph_properties_df.iloc[graph_properties_df['vertex']==sample_name,:] + out_file.write(str(graph_properties_row['labels'].values[0]) + ',') + out_file.write(str(graph_properties_row['component_count'].values[0]) + ',') + out_file.write(str(graph_properties_row['degree'].values[0])) + out_file.write("\n") + + sys.exit(0) diff --git a/poppunk_info-runner.py b/poppunk_info-runner.py new file mode 100755 index 00000000..764311f3 --- /dev/null +++ b/poppunk_info-runner.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# vim: set fileencoding= : +# Copyright 2018-2020 John Lees, Daniel Anderson and Nick Croucher + +"""Convenience wrapper for running poppunk_info directly from source tree.""" + +from PopPUNK.info import main + +if __name__ == '__main__': + main() + diff --git a/scripts/poppunk_db_info.py b/scripts/poppunk_db_info.py deleted file mode 100755 index 77ae20f9..00000000 --- a/scripts/poppunk_db_info.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python -# vim: set fileencoding= : -# Copyright 2018-2020 John Lees and Nick Croucher - -import sys -import argparse -import h5py - -# 
command line parsing -def get_options(): - - parser = argparse.ArgumentParser(description='Get information about a PopPUNK database', - prog='poppunk_db_info') - - # input options - parser.add_argument('db', help='Database file (.h5)') - parser.add_argument('--list-samples', action='store_true', default=False, - help='List every sample in the database') - - return parser.parse_args() - -# main code -if __name__ == "__main__": - - # Check input ok - args = get_options() - - ref_db = h5py.File(args.db, 'r') - print("PopPUNK database:\t\t" + args.db) - - sketch_version = ref_db['sketches'].attrs['sketch_version'] - print("Sketch version:\t\t\t" + sketch_version) - - num_samples = len(ref_db['sketches'].keys()) - print("Number of samples:\t\t" + str(num_samples)) - - first_sample = list(ref_db['sketches'].keys())[0] - kmer_size = ref_db['sketches/' + first_sample].attrs['kmers'] - print("K-mer sizes:\t\t\t" + ",".join([str(x) for x in kmer_size])) - - sketch_size = int(ref_db['sketches/' + first_sample].attrs['sketchsize64']) * 64 - print("Sketch size:\t\t\t" + str(sketch_size)) - - if 'random' in ref_db.keys(): - has_random = True - else: - has_random = False - print("Contains random matches:\t" + str(has_random)) - - try: - codon_phased = ref_db['sketches'].attrs['codon_phased'] == 1 - except KeyError: - codon_phased = False - print("Codon phased seeds:\t\t" + str(codon_phased)) - - if args.list_samples: - print("\n") - print("\t".join(["name", "base_frequencies", "length", "missing_bases"])) - for sample_name in list(ref_db['sketches'].keys()): - sample_string = [sample_name] - base_freq = ref_db['sketches/' + sample_name].attrs['base_freq'] - sample_string.append(",".join([base + ':' + "{:.3f}".format(x) for base, x in zip(['A', 'C', 'G', 'T'], base_freq)])) - sample_string.append(str(ref_db['sketches/' + sample_name].attrs['length'])) - sample_string.append(str(ref_db['sketches/' + sample_name].attrs['missing_bases'])) - print("\t".join(sample_string)) - - - 
sys.exit(0) diff --git a/scripts/poppunk_sample_info.py b/scripts/poppunk_sample_info.py deleted file mode 100755 index 064af984..00000000 --- a/scripts/poppunk_sample_info.py +++ /dev/null @@ -1,276 +0,0 @@ -#!/usr/bin/env python -# vim: set fileencoding= : -# Copyright 2018-2020 John Lees and Nick Croucher - -import os -import sys -import argparse -import pickle -import h5py -import numpy as np -import pandas as pd -from scipy import sparse - -# Load GPU libraries -try: - import cupyx - import cugraph - import cudf - import cupy - from numba import cuda - import rmm - gpu_lib = True -except ImportError as e: - gpu_lib = False - -def setGtThreads(threads): - import graph_tool.all as gt - # Check on parallelisation of graph-tools - if gt.openmp_enabled(): - gt.openmp_set_num_threads(threads) - sys.stderr.write('\nGraph-tools OpenMP parallelisation enabled:') - sys.stderr.write(' with ' + str(gt.openmp_get_num_threads()) + ' threads\n') - -def add_self_loop(G_df, seq_num, weights = False, renumber = True): - """Adds self-loop to cugraph graph to ensure all nodes are included in - the graph, even if singletons. 
- - Args: - G_df (cudf) - cudf data frame containing edge list - seq_num (int) - The expected number of nodes in the graph - renumber (bool) - Whether to renumber the vertices when added to the graph - - Returns: - G_new (graph) - Dictionary of cluster assignments (keys are sequence names) - """ - # use self-loop to ensure all nodes are present - min_in_df = np.amin([G_df['source'].min(), G_df['destination'].min()]) - if min_in_df.item() > 0: - G_self_loop = cudf.DataFrame() - G_self_loop['source'] = [0] - G_self_loop['destination'] = [0] - if weights: - G_self_loop['weights'] = 0.0 - G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - if max_in_df.item() != seq_num: - G_self_loop = cudf.DataFrame() - G_self_loop['source'] = [seq_num] - G_self_loop['destination'] = [seq_num] - if weights: - G_self_loop['weights'] = 0.0 - G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) - # Construct graph - G_new = cugraph.Graph() - if weights: - G_new.from_cudf_edgelist(G_df, edge_attr = 'weights', renumber = renumber) - else: - G_new.from_cudf_edgelist(G_df, renumber = renumber) - return G_new - - -def load_network_file(fn, use_gpu = False): - """Load the network based on input options - - Returns the network as a graph-tool format graph, and sets - the slope parameter of the passed model object. 
- - Args: - fn (str) - Network file name - use_gpu (bool) - Use cugraph library to load graph - - Returns: - genomeNetwork (graph) - The loaded network - """ - # Load the network from the specified file - if use_gpu: - G_df = cudf.read_csv(fn, compression = 'gzip') - if 'src' in G_df.columns: - G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) - genomeNetwork = cugraph.Graph() - if 'weights' in G_df.columns: - G_df = G_df['source','destination','weights'] - genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) - else: - genomeNetwork.from_cudf_edgelist(G_df,renumber=False) - sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") - else: - genomeNetwork = gt.load_graph(fn) - sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") - - return genomeNetwork - -def check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = False): - """Check GPU libraries can be loaded and set managed memory. 
- - Args: - use_gpu (bool) - Whether GPU packages have been requested - gpu_lib (bool) - Whether GPU packages are available - Returns: - use_gpu (bool) - Whether GPU packages can be used - """ - # load CUDA libraries - if use_gpu and not gpu_lib: - if quit_on_fail: - sys.stderr.write('Unable to load GPU libraries; exiting\n') - sys.exit(1) - else: - sys.stderr.write('Unable to load GPU libraries; using CPU libraries ' - 'instead\n') - use_gpu = False - - # Set memory management for large networks - if use_gpu: - rmm.reinitialize(managed_memory=True) - cudf.set_allocator("managed") - if "cupy" in sys.modules: - cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) - if "cuda" in sys.modules: - cuda.set_memory_manager(rmm.RMMNumbaManager) - assert(rmm.is_initialized()) - - return use_gpu - -def sparse_mat_to_network(sparse_mat, rlist, use_gpu = False): - """Generate a network from a lineage rank fit - - Args: - sparse_mat (scipy or cupyx sparse matrix) - Sparse matrix of kNN from lineage fit - rlist (list) - List of sequence names - use_gpu (bool) - Whether GPU libraries should be used - - Returns: - G (network) - Graph tool or cugraph network - """ - if use_gpu: - G_df = cudf.DataFrame(columns = ['source','destination','weights']) - G_df['source'] = sparse_mat.row - G_df['destination'] = sparse_mat.col - G_df['weights'] = sparse_mat.data - max_in_vertex_labels = len(rlist)-1 - G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) - else: - connections = [] - for (src,dst) in zip(sparse_mat.row,sparse_mat.col): - connections.append(src,dst) - G = construct_network_from_edge_list(rlist, - rlist, - connections, - weights = sparse_mat.data, - summarise=False) - - return G - -# command line parsing -def get_options(): - - parser = argparse.ArgumentParser(description='Get information about a PopPUNK database', - prog='poppunk_db_info') - - # input options - parser.add_argument('--ref-db', - required = True, - help='PopPUNK database directory') - 
parser.add_argument('--network', - required = True, - help='Network or lineage fit file for analysis') - parser.add_argument('--threads', - default = 1, - help='Number of cores to use in analysis') - parser.add_argument('--use-gpu', - default = False, - action = 'store_true', - help='Whether GPU libraries should be used in analysis') - parser.add_argument('--output', - required = True, - help='Prefix for output files') - - return parser.parse_args() - -# main code -if __name__ == "__main__": - - # Check input ok - args = get_options() - - # Check whether GPU libraries can be loaded - use_gpu = check_and_set_gpu(args.use_gpu, gpu_lib, quit_on_fail = False) - - # Set threads for graph-tool - setGtThreads(args.threads) - - # Open and process sequence database - h5_fn = os.path.join(args.ref_db,os.path.basename(args.ref_db) + '.h5') - ref_db = h5py.File(h5_fn, 'r') - sample_names = list(ref_db['sketches'].keys()) - - sample_sequence_length = {} - sample_missing_bases = {} - sample_base_frequencies = {name: [] for name in sample_names} - - for sample_name in sample_names: - sample_base_frequencies[sample_name] = ref_db['sketches/' + sample_name].attrs['base_freq'] - sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length'] - sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases'] - - # Open network file - if args.network.endswith('.gt'): - G = load_network_file(args.network, use_gpu = False) - elif args.network.endswith('.csv.gz'): - if use_gpu: - G = load_network_file(args.network, use_gpu = True) - else: - sys.stderr.write('Unable to load necessary GPU libraries\n') - exit(1) - elif args.network.endswith('.npz'): - sparse_mat = sparse.load_npz(args.network) - G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu) - else: - sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n') - exit(1) - - # Analyse network - if use_gpu: - component_assignments_df = 
cugraph.components.connectivity.connected_components(G) - component_counts_df = component_assignments_df.groupby('labels')['vertex'].count() - component_counts_df.name = 'component_count' - component_information_df = component_assignments_df.merge(component_counts_df, on = ['labels'], how = 'left') - outdegree_df = G.out_degree() - graph_properties_df = component_information_df.merge(outdegree_df, on = ['vertex']) - else: - graph_properties_df = pd.DataFrame() - graph_properties_df['vertex'] = np.arange(len(sample_names)) - graph_properties_df['labels'] = gt.label_components(G)[0].a - graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices()) - graph_properties_df['component_count'] = component_assignments.groupby('partition')['vertex'].transform('count') - graph_properties_df = graph_properties_df.sort_values('vertex', axis = 0) # inplace not implemented for cudf - graph_properties_df['vertex'] = sample_names - - # Merge data and print output - with open(args.output,'w') as out_file: - out_file.write('Sample,Length,Missing_bases,Frequency_A,Frequency_C,Frequency_G,Frequency_T,Component_label,Component_size,Node_degree\n') - for i,sample_name in enumerate(sample_names): - out_file.write(sample_name + ',' + str(sample_sequence_length[sample_name]) + ',' + str(sample_missing_bases[sample_name]) + ',') - for frequency in sample_base_frequencies[sample_name]: - out_file.write(str(frequency) + ',') - graph_properties_row = graph_properties_df.iloc[graph_properties_df['vertex']==sample_name,:] - out_file.write(str(graph_properties_row['labels'].values[0]) + ',') - out_file.write(str(graph_properties_row['component_count'].values[0]) + ',') - out_file.write(str(graph_properties_row['degree'].values[0])) - out_file.write("\n") - - sys.exit(0) diff --git a/src/boundary.cpp b/src/boundary.cpp index 1ec62fdd..bde772bf 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -146,6 +146,31 @@ edge_tuple generate_tuples(const std::vector &assignments, return edge_vec; } 
+edge_tuple generate_all_tuples(const int num_ref, + const int num_queries, + bool self, + const int int_offset) { + edge_tuple edge_vec; + if (self) { + const size_t n_rows = ((2 * num_ref - 1)**2 - 1) / 8; + for (long row_idx = 0; row_idx < n_rows; row_idx++) { + long i = calc_row_idx(row_idx, n_samples); + long j = calc_col_idx(row_idx, i, n_samples) + int_offset; + i = i + int_offset; + long min_node = std::min(i,j); + long max_node = std::max(i,j); + edge_vec.push_back(std::make_tuple(min_node, max_node)); + } + } else { + for (long i = 0; i < num_ref; i++) { + for (long j = 0; j < num_queries; j++) { + edge_vec.push_back(std::make_tuple(i, j + num_ref)); + } + } + } + return edge_vec; +} + // Line defined between (x0, y0) and (x1, y1) // Offset is distance along this line, starting at (x0, y0) network_coo threshold_iterate_1D(const NumpyMatrix &distMat, diff --git a/src/boundary.hpp b/src/boundary.hpp index fa6fab11..d987bd3e 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -32,6 +32,11 @@ edge_tuple generate_tuples(const std::vector &assignments, const int num_ref = 0, const int int_offset = 0); +edge_tuple generate_all_tuples(const int num_ref, + const int num_queries, + bool self = true, + const int int_offset = 0) + network_coo threshold_iterate_1D(const NumpyMatrix &distMat, const std::vector &offsets, const int slope, const float x0, diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp index 56a92004..ae893908 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -35,9 +35,23 @@ edge_tuple generateTuples(const std::vector &assignments, bool self = true, const int num_ref, const int int_offset) { - edge_tuple edges = generate_tuples(assignments, within_label, self, num_ref, - int_offset); - return (edges); + edge_tuple edges = generate_tuples(assignments, + within_label, + self, + num_ref, + int_offset); + return (edges); +} + +edge_tuple generateAllTuples(const int num_ref, + const int num_queries, + bool self = true, + 
const int int_offset = 0) { + edge_tuple edges = generate_all_tuples(const int num_ref, + const int num_queries, + bool self = true, + const int int_offset = 0); + return (edges); } network_coo thresholdIterate1D(const Eigen::Ref &distMat, @@ -84,18 +98,26 @@ PYBIND11_MODULE(poppunk_refine, m) { py::arg("y_max")); m.def("generateTuples", &generateTuples, - py::return_value_policy::reference_internal, - "Return edge tuples based on assigned groups", - py::arg("assignments"), py::arg("within_label"), - py::arg("self") = true, py::arg("num_ref") = 0, - py::arg("int_offset") = 0); + py::return_value_policy::reference_internal, + "Return edge tuples based on assigned groups", + py::arg("assignments"), py::arg("within_label"), + py::arg("self") = true, py::arg("num_ref"), + py::arg("int_offset")); + m.def("generateTuples", &generateTuples, + py::return_value_policy::reference_internal, + "Return all edge tuples", + py::arg("num_ref"), + py::arg("num_queries"), + py::arg("self") = true, + py::arg("int_offset")); + m.def("thresholdIterate1D", &thresholdIterate1D, py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset", py::arg("distMat").noconvert(), py::arg("offsets"), py::arg("slope"), py::arg("x0"), py::arg("y0"), py::arg("x1"), py::arg("y1"), - py::arg("num_threads") = 1); + py::arg("num_threads")); m.def("thresholdIterate2D", &thresholdIterate2D, py::return_value_policy::reference_internal, From 0613aa328f17769cc7ba6dccc2e95ef0f48df8f0 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 11:52:57 +0100 Subject: [PATCH 151/175] Update function docstring --- PopPUNK/network.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4a43438f..dd2a2380 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -222,9 +222,8 @@ def cliquePrune(component, graph, reference_indices, components_list): return(list(ref_list)) def 
translate_network_indices(G_ref_df, reference_indices): - """Extract references for each cluster based on cliques - - Writes chosen references to file by calling :func:`~writeReferences` + """Function for ensuring an updated reference network retains + numbering consistent with sample names Args: G_ref_df (cudf data frame) From 3ee56832946dfbfb34f6f5f0ed179a595a015b93 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 12:05:15 +0100 Subject: [PATCH 152/175] Add info scripts --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index fb3cd788..ca76070a 100644 --- a/setup.py +++ b/setup.py @@ -113,7 +113,8 @@ def build_extension(self, ext): 'poppunk_mst = PopPUNK.sparse_mst:main', 'poppunk_prune = PopPUNK.prune_db:main', 'poppunk_references = PopPUNK.reference_pick:main', - 'poppunk_tsne = PopPUNK.tsne:main' + 'poppunk_tsne = PopPUNK.tsne:main', + 'poppunk_info = PopPUNK.info:main' ] }, scripts=['scripts/poppunk_calculate_rand_indices.py', @@ -122,7 +123,6 @@ def build_extension(self, ext): 'scripts/poppunk_batch_mst.py', 'scripts/poppunk_extract_distances.py', 'scripts/poppunk_add_weights.py', - 'scripts/poppunk_db_info.py', 'scripts/poppunk_easy_run.py', 'scripts/poppunk_pickle_fix.py'], ext_modules=[CMakeExtension('poppunk_refine')], From ffac5dcf2921e84576ccbc6386a7dd9bb36158dc Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 12:25:22 +0100 Subject: [PATCH 153/175] Fix compilation issues --- src/boundary.cpp | 6 +++--- src/boundary.hpp | 6 +++--- src/python_bindings.cpp | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/boundary.cpp b/src/boundary.cpp index bde772bf..1a963a99 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -152,10 +152,10 @@ edge_tuple generate_all_tuples(const int num_ref, const int int_offset) { edge_tuple edge_vec; if (self) { - const size_t n_rows = ((2 * num_ref - 1)**2 - 1) / 8; + const size_t n_rows = (pow(2 * num_ref - 1, 
2) - 1) / 8; for (long row_idx = 0; row_idx < n_rows; row_idx++) { - long i = calc_row_idx(row_idx, n_samples); - long j = calc_col_idx(row_idx, i, n_samples) + int_offset; + long i = calc_row_idx(row_idx, num_ref); + long j = calc_col_idx(row_idx, i, num_ref) + int_offset; i = i + int_offset; long min_node = std::min(i,j); long max_node = std::max(i,j); diff --git a/src/boundary.hpp b/src/boundary.hpp index d987bd3e..68b4264b 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -33,9 +33,9 @@ edge_tuple generate_tuples(const std::vector &assignments, const int int_offset = 0); edge_tuple generate_all_tuples(const int num_ref, - const int num_queries, - bool self = true, - const int int_offset = 0) + const int num_queries, + bool self = true, + const int int_offset = 0); network_coo threshold_iterate_1D(const NumpyMatrix &distMat, const std::vector &offsets, diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp index ae893908..a88ea181 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -32,7 +32,7 @@ edge_tuple edgeThreshold(const Eigen::Ref &distMat, edge_tuple generateTuples(const std::vector &assignments, const int within_label, - bool self = true, + bool self, const int num_ref, const int int_offset) { edge_tuple edges = generate_tuples(assignments, @@ -47,10 +47,10 @@ edge_tuple generateAllTuples(const int num_ref, const int num_queries, bool self = true, const int int_offset = 0) { - edge_tuple edges = generate_all_tuples(const int num_ref, - const int num_queries, - bool self = true, - const int int_offset = 0); + edge_tuple edges = generate_all_tuples(num_ref, + num_queries, + self, + int_offset); return (edges); } @@ -104,7 +104,7 @@ PYBIND11_MODULE(poppunk_refine, m) { py::arg("self") = true, py::arg("num_ref"), py::arg("int_offset")); - m.def("generateTuples", &generateTuples, + m.def("generateAllTuples", &generateAllTuples, py::return_value_policy::reference_internal, "Return all edge tuples", py::arg("num_ref"), From 
ed17e8226ff1c7ae47b0d4fbc728b75e7111743e Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 13:09:27 +0100 Subject: [PATCH 154/175] Fix syntax error --- PopPUNK/assign.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index adfc3a9f..a7345e17 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -149,23 +149,23 @@ def assign_query(dbFuncs, sys.stderr.write("Reference distances missing, cannot use --update-db\n") sys.exit(1) else: - rNames = getSeqsInDb(os.path.join(ref_db, os.path.basename(ref_db) + ".h5") + rNames = getSeqsInDb(os.path.join(ref_db, os.path.basename(ref_db) + ".h5")) # construct database - use a single database directory for all query outputs - if (web and json_sketch): + if (web and json_sketch is not None): qNames = sketch_to_hdf5(json_sketch, output) elif (fit_type == 'original'): - # construct database - createDatabaseDir(output, kmers) - qNames = constructDatabase(q_files, - kmers, - sketch_sizes, - output, - threads, - overwrite, - codon_phased = codon_phased, - calc_random = False, - use_gpu = gpu_sketch, - deviceid = deviceid) + # construct database + createDatabaseDir(output, kmers) + qNames = constructDatabase(q_files, + kmers, + sketch_sizes, + output, + threads, + overwrite, + codon_phased = codon_phased, + calc_random = False, + use_gpu = gpu_sketch, + deviceid = deviceid) if (fit_type == 'original' or (fit_type != 'original' and use_ref_graph)): # run query qrDistMat = queryDatabase(rNames = rNames, From 723736b91eec193cb2346c40e39e33ed8db28159 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 14:59:36 +0100 Subject: [PATCH 155/175] Remove default values --- src/boundary.hpp | 2 +- src/python_bindings.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/boundary.hpp b/src/boundary.hpp index 68b4264b..d1385062 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -33,7 +33,7 @@ edge_tuple 
generate_tuples(const std::vector &assignments, const int int_offset = 0); edge_tuple generate_all_tuples(const int num_ref, - const int num_queries, + const int num_queries = 0, bool self = true, const int int_offset = 0); diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp index a88ea181..17886204 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -88,7 +88,7 @@ PYBIND11_MODULE(poppunk_refine, m) { py::return_value_policy::reference_internal, "Assign samples based on their relation to a 2D boundary", py::arg("distMat").noconvert(), py::arg("slope"), py::arg("x_max"), - py::arg("y_max"), py::arg("num_threads") = 1); + py::arg("y_max"), py::arg("num_threads")); m.def("edgeThreshold", &edgeThreshold, py::return_value_policy::reference_internal, @@ -101,7 +101,7 @@ PYBIND11_MODULE(poppunk_refine, m) { py::return_value_policy::reference_internal, "Return edge tuples based on assigned groups", py::arg("assignments"), py::arg("within_label"), - py::arg("self") = true, py::arg("num_ref"), + py::arg("self"), py::arg("num_ref"), py::arg("int_offset")); m.def("generateAllTuples", &generateAllTuples, @@ -109,7 +109,7 @@ PYBIND11_MODULE(poppunk_refine, m) { "Return all edge tuples", py::arg("num_ref"), py::arg("num_queries"), - py::arg("self") = true, + py::arg("self"), py::arg("int_offset")); m.def("thresholdIterate1D", &thresholdIterate1D, From e410974ca37c80a3f7180b3d8118da46aa1b53de Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 14:59:57 +0100 Subject: [PATCH 156/175] Restructure functions --- PopPUNK/info.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/PopPUNK/info.py b/PopPUNK/info.py index 03c634c2..32dff47b 100644 --- a/PopPUNK/info.py +++ b/PopPUNK/info.py @@ -52,7 +52,7 @@ def get_options(): return parser.parse_args() # main code -if __name__ == "__main__": +def main(): # Import functions from .network import add_self_loop @@ -163,3 +163,8 @@ def get_options(): out_file.write("\n") sys.exit(0) 
+ +if __name__ == '__main__': + main() + + sys.exit(0) From a67de1659e9b7e5d3229aa8519d8d99b3ee1581b Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 15:00:09 +0100 Subject: [PATCH 157/175] Bump version --- PopPUNK/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py index 012742f2..82bed522 100644 --- a/PopPUNK/__init__.py +++ b/PopPUNK/__init__.py @@ -3,7 +3,7 @@ '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)''' -__version__ = '2.4.2' +__version__ = '2.4.3' # Minimum sketchlib version SKETCHLIB_MAJOR = 1 From 1f0946e99bf4ebf60d8a400b095c30436652ae45 Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 15:18:32 +0100 Subject: [PATCH 158/175] Move default values --- src/boundary.hpp | 16 ++++++++-------- src/python_bindings.cpp | 18 ++++++++++-------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/boundary.hpp b/src/boundary.hpp index d1385062..d29b2448 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -21,27 +21,27 @@ typedef std::vector> edge_tuple; Eigen::VectorXf assign_threshold(const NumpyMatrix &distMat, const int slope, const float x_max, const float y_max, - unsigned int num_threads = 1); + unsigned int num_threads); edge_tuple edge_iterate(const NumpyMatrix &distMat, const int slope, const float x_max, const float y_max); edge_tuple generate_tuples(const std::vector &assignments, const int within_label, - bool self = true, - const int num_ref = 0, - const int int_offset = 0); + bool self, + const int num_ref, + const int int_offset); edge_tuple generate_all_tuples(const int num_ref, - const int num_queries = 0, - bool self = true, - const int int_offset = 0); + const int num_queries, + bool self, + const int int_offset); network_coo threshold_iterate_1D(const NumpyMatrix &distMat, const std::vector &offsets, const int slope, const float x0, const float y0, const float x1, const float y1, - const int num_threads = 1); + const 
int num_threads); network_coo threshold_iterate_2D(const NumpyMatrix &distMat, const std::vector &x_max, diff --git a/src/python_bindings.cpp b/src/python_bindings.cpp index 17886204..1fdf6634 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -88,7 +88,7 @@ PYBIND11_MODULE(poppunk_refine, m) { py::return_value_policy::reference_internal, "Assign samples based on their relation to a 2D boundary", py::arg("distMat").noconvert(), py::arg("slope"), py::arg("x_max"), - py::arg("y_max"), py::arg("num_threads")); + py::arg("y_max"), py::arg("num_threads") = 1); m.def("edgeThreshold", &edgeThreshold, py::return_value_policy::reference_internal, @@ -100,24 +100,26 @@ PYBIND11_MODULE(poppunk_refine, m) { m.def("generateTuples", &generateTuples, py::return_value_policy::reference_internal, "Return edge tuples based on assigned groups", - py::arg("assignments"), py::arg("within_label"), - py::arg("self"), py::arg("num_ref"), - py::arg("int_offset")); + py::arg("assignments"), + py::arg("within_label"), + py::arg("self") = true, + py::arg("num_ref") = 0, + py::arg("int_offset") = 0); m.def("generateAllTuples", &generateAllTuples, py::return_value_policy::reference_internal, "Return all edge tuples", py::arg("num_ref"), - py::arg("num_queries"), - py::arg("self"), - py::arg("int_offset")); + py::arg("num_queries") = 0, + py::arg("self") = true, + py::arg("int_offset") = 0); m.def("thresholdIterate1D", &thresholdIterate1D, py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset", py::arg("distMat").noconvert(), py::arg("offsets"), py::arg("slope"), py::arg("x0"), py::arg("y0"), py::arg("x1"), py::arg("y1"), - py::arg("num_threads")); + py::arg("num_threads") = 1); m.def("thresholdIterate2D", &thresholdIterate2D, py::return_value_policy::reference_internal, From dc59e91b9d1cfc245b2e417bc0d2652bcf4fd8fa Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 15:47:39 +0100 Subject: [PATCH 159/175] 
Fix model object reference --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index a7345e17..cc79d41f 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -121,7 +121,7 @@ def assign_query(dbFuncs, # Core and accessory assignments use the same model and same overall set of distances # but have different networks, references, reference distances and assignments fit_type_list = ['original'] - if model.type == 'refine' and self.indiv_fitted: + if model.type == 'refine' and model.indiv_fitted: if core: fit_type_list.append('core') if accessory: From 235f9b60a4671ccc8520da9d8356d393e97f77ea Mon Sep 17 00:00:00 2001 From: Croucher Date: Tue, 27 Jul 2021 15:48:05 +0100 Subject: [PATCH 160/175] Update tuple generation function --- PopPUNK/network.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index dd2a2380..3bdebef2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1034,10 +1034,8 @@ def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gp weights = process_weights(distMat, weights_type) # Convert edge indices to tuples - edge_list = poppunk_refine.generateTuples([0] * len(weights), - 0, + edge_list = poppunk_refine.generateAllTuples(num_ref = len(rlist), self = True, - num_ref = len(rlist), int_offset = 0) if use_gpu: From 90ae4dff17d9ae451325eba01b6f44c3b3259539 Mon Sep 17 00:00:00 2001 From: Croucher Date: Wed, 28 Jul 2021 06:32:14 +0100 Subject: [PATCH 161/175] Add missing brackets --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3bdebef2..c5a1e7e7 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -149,7 +149,7 @@ def load_network_file(fn, use_gpu = False): G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: - 
G_df = G_df['source','destination','weights'] + G_df = G_df[['source','destination','weights']] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: genomeNetwork.from_cudf_edgelist(G_df,renumber=False) From 90be56db38679b56523a07e4b66932389c8c6727 Mon Sep 17 00:00:00 2001 From: Croucher Date: Wed, 28 Jul 2021 06:35:47 +0100 Subject: [PATCH 162/175] Change args name --- PopPUNK/info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/info.py b/PopPUNK/info.py index 32dff47b..a8a0420a 100644 --- a/PopPUNK/info.py +++ b/PopPUNK/info.py @@ -32,7 +32,7 @@ def get_options(): prog='poppunk_db_info') # input options - parser.add_argument('--ref-db', + parser.add_argument('--db', required = True, help='PopPUNK database directory') parser.add_argument('--network', @@ -71,7 +71,7 @@ def main(): setGtThreads(args.threads) # Open and process sequence database - h5_fn = os.path.join(args.ref_db,os.path.basename(args.ref_db) + '.h5') + h5_fn = os.path.join(args.db,os.path.basename(args.db) + '.h5') ref_db = h5py.File(h5_fn, 'r') # Print overall database information From dc69a4791ebed3ac9770c9f56b41f29d10c26fcb Mon Sep 17 00:00:00 2001 From: Croucher Date: Wed, 28 Jul 2021 06:44:23 +0100 Subject: [PATCH 163/175] Change network loading --- PopPUNK/info.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/PopPUNK/info.py b/PopPUNK/info.py index a8a0420a..be628bd9 100644 --- a/PopPUNK/info.py +++ b/PopPUNK/info.py @@ -36,7 +36,8 @@ def get_options(): required = True, help='PopPUNK database directory') parser.add_argument('--network', - required = True, + required = False, + default = None, help='Network or lineage fit file for analysis') parser.add_argument('--threads', default = 1, @@ -48,6 +49,10 @@ def get_options(): parser.add_argument('--output', required = True, help='Prefix for output files') + parser.add_argument('--simple', + default = False, + action = 'store_true', + help='Do 
not print per sample information') return parser.parse_args() @@ -71,11 +76,10 @@ def main(): setGtThreads(args.threads) # Open and process sequence database - h5_fn = os.path.join(args.db,os.path.basename(args.db) + '.h5') + h5_fn = os.path.join(args.db, os.path.basename(args.db) + '.h5') ref_db = h5py.File(h5_fn, 'r') # Print overall database information - ref_db = h5py.File(args.db, 'r') print("PopPUNK database:\t\t" + args.db) sketch_version = ref_db['sketches'].attrs['sketch_version'] @@ -103,6 +107,10 @@ def main(): codon_phased = False print("Codon phased seeds:\t\t" + str(codon_phased)) + # Stop if requested + if args.simple: + sys.exit(0) + # Print sample information sample_names = list(ref_db['sketches'].keys()) sample_sequence_length = {} @@ -114,6 +122,14 @@ def main(): sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length'] sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases'] + # Select network file name + network_fn = args.network + if network_fn is None: + if use_gpu: + network_fn = os.path.join(args.db, os.path.basename(args.db) + '_graph.csv.gz') + else: + network_fn = os.path.join(args.db, os.path.basename(args.db) + '_graph.gt') + # Open network file if args.network.endswith('.gt'): G = load_network_file(args.network, use_gpu = False) From 2a524ebbe49f2a71acfdbc0f888637206a658db2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 28 Jul 2021 08:16:32 +0100 Subject: [PATCH 164/175] Correct network file name processing --- PopPUNK/info.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PopPUNK/info.py b/PopPUNK/info.py index be628bd9..5fdcb4f6 100644 --- a/PopPUNK/info.py +++ b/PopPUNK/info.py @@ -131,16 +131,16 @@ def main(): network_fn = os.path.join(args.db, os.path.basename(args.db) + '_graph.gt') # Open network file - if args.network.endswith('.gt'): - G = load_network_file(args.network, use_gpu = False) - elif 
args.network.endswith('.csv.gz'): + if network_fn.endswith('.gt'): + G = load_network_file(network_fn, use_gpu = False) + elif network_fn.endswith('.csv.gz'): if use_gpu: - G = load_network_file(args.network, use_gpu = True) + G = load_network_file(network_fn, use_gpu = True) else: sys.stderr.write('Unable to load necessary GPU libraries\n') exit(1) - elif args.network.endswith('.npz'): - sparse_mat = sparse.load_npz(args.network) + elif network_fn.endswith('.npz'): + sparse_mat = sparse.load_npz(network_fn) G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu) else: sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n') From 26f2f0447b3adba735cf184cc9ba005747d184fd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 28 Jul 2021 08:24:34 +0100 Subject: [PATCH 165/175] Update tests for info and trees --- test/run_test.py | 6 +++++- test/test-gpu.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/test/run_test.py b/test/run_test.py index 67ff51ee..31a01c91 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -80,7 +80,7 @@ # MST sys.stderr.write("Running MST\n") -subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree mst", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree both", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_mst-runner.py --distance-pkl example_db/example_db.dists.pkl --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot", shell=True, check=True) # t-sne @@ -95,6 +95,10 @@ sys.stderr.write("Running poppunk_references\n") subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.gt --distances example_db/example_db.dists --ref-db example_db --output 
example_refs --model example_db", shell=True, check=True) +# info +sys.stderr.write("Running poppunk_info\n") +subprocess.run(python_cmd + "../poppunk_info-runner.py --db example_db --output example_db.info.csv", shell=True, check=True) + # citations sys.stderr.write("Printing citations\n") subprocess.run(python_cmd + " ../poppunk-runner.py --citation --fit-model bgmm --ref-db example_db --K 4", shell=True, check=True) diff --git a/test/test-gpu.py b/test/test-gpu.py index e5b09183..a6ac2208 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -80,7 +80,7 @@ # MST sys.stderr.write("Running MST\n") -subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree mst --gpu-graph", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree both --gpu-graph", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_mst-runner.py --distance-pkl example_db/example_db.dists.pkl --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot --gpu-graph", shell=True, check=True) # t-sne @@ -95,6 +95,10 @@ sys.stderr.write("Running poppunk_references\n") subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.csv.gz --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db --use-gpu", shell=True, check=True) +# info +sys.stderr.write("Running poppunk_info\n") +subprocess.run(python_cmd + "../poppunk_info-runner.py --db example_db --output example_db.info.csv", shell=True, check=True) + # citations sys.stderr.write("Printing citations\n") subprocess.run(python_cmd + " ../poppunk-runner.py --citation --fit-model bgmm --ref-db example_db --K 4", shell=True, check=True) From 4af6078b09e1c4318bf5e0a967fa8c40846111be Mon Sep 17 00:00:00 2001 
From: nickjcroucher Date: Wed, 28 Jul 2021 09:01:40 +0100 Subject: [PATCH 166/175] Fix info test commands --- test/run_test.py | 2 +- test/test-gpu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/run_test.py b/test/run_test.py index 31a01c91..9d7fab26 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -97,7 +97,7 @@ # info sys.stderr.write("Running poppunk_info\n") -subprocess.run(python_cmd + "../poppunk_info-runner.py --db example_db --output example_db.info.csv", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_info-runner.py --db example_db --output example_db.info.csv", shell=True, check=True) # citations sys.stderr.write("Printing citations\n") diff --git a/test/test-gpu.py b/test/test-gpu.py index a6ac2208..9b932cbb 100755 --- a/test/test-gpu.py +++ b/test/test-gpu.py @@ -97,7 +97,7 @@ # info sys.stderr.write("Running poppunk_info\n") -subprocess.run(python_cmd + "../poppunk_info-runner.py --db example_db --output example_db.info.csv", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_info-runner.py --db example_db --output example_db.info.csv --use-gpu", shell=True, check=True) # citations sys.stderr.write("Printing citations\n") From cbfcdd615f84d89486171724be0d716c284d81dd Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 28 Jul 2021 09:24:54 +0100 Subject: [PATCH 167/175] Change component analysis --- PopPUNK/info.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/PopPUNK/info.py b/PopPUNK/info.py index 5fdcb4f6..449b4c68 100644 --- a/PopPUNK/info.py +++ b/PopPUNK/info.py @@ -12,6 +12,7 @@ import numpy as np import pandas as pd from scipy import sparse +import graph_tool.all as gt # Load GPU libraries try: @@ -159,7 +160,7 @@ def main(): graph_properties_df['vertex'] = np.arange(len(sample_names)) graph_properties_df['labels'] = gt.label_components(G)[0].a graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices()) - 
graph_properties_df['component_count'] = component_assignments.groupby('partition')['vertex'].transform('count') + graph_properties_df['component_count'] = graph_properties_df.groupby('labels')['vertex'].transform('count') graph_properties_df = graph_properties_df.sort_values('vertex', axis = 0) # inplace not implemented for cudf graph_properties_df['vertex'] = sample_names @@ -172,7 +173,7 @@ def main(): out_file.write(sample_name + ',' + str(sample_sequence_length[sample_name]) + ',' + str(sample_missing_bases[sample_name]) + ',') for frequency in sample_base_frequencies[sample_name]: out_file.write(str(frequency) + ',') - graph_properties_row = graph_properties_df.iloc[graph_properties_df['vertex']==sample_name,:] + graph_properties_row = graph_properties_df.loc[graph_properties_df['vertex']==sample_name,:] out_file.write(str(graph_properties_row['labels'].values[0]) + ',') out_file.write(str(graph_properties_row['component_count'].values[0]) + ',') out_file.write(str(graph_properties_row['degree'].values[0])) From 4d6c1a2f3c1bfeb9985cabd6a72ba10639537c4f Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 28 Jul 2021 11:46:32 +0100 Subject: [PATCH 168/175] Update tuple generation functions --- src/boundary.cpp | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/boundary.cpp b/src/boundary.cpp index 1a963a99..94638a43 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -121,26 +121,21 @@ edge_tuple generate_tuples(const std::vector &assignments, const size_t n_rows = assignments.size(); const size_t n_samples = 0.5 * (1 + sqrt(1 + 8 * (n_rows))); edge_tuple edge_vec; - if (self) { - for (long row_idx = 0; row_idx < n_rows; row_idx++) { - if (assignments[row_idx] == within_label) { - long i = calc_row_idx(row_idx, n_samples); - long j = calc_col_idx(row_idx, i, n_samples) + int_offset; + for (long row_idx = 0; row_idx < n_rows; row_idx++) { + unsigned long i, j; + if (assignments[row_idx] 
== within_label) { + if (self) { + i = calc_row_idx(row_idx, n_samples); + j = calc_col_idx(row_idx, i, n_samples) + int_offset; i = i + int_offset; - long min_node = std::min(i,j); - long max_node = std::max(i,j); - edge_vec.push_back(std::make_tuple(min_node, max_node)); + } else { + i = row_idx % num_ref + int_offset; + j = row_idx / num_ref + num_ref + int_offset; } - } - } else { - for (long row_idx = 0; row_idx < n_rows; row_idx++) { - if (assignments[row_idx] == within_label) { - unsigned long i = row_idx % num_ref + int_offset; - unsigned long j = row_idx / num_ref + num_ref + int_offset; - long min_node = std::min(i,j); - long max_node = std::max(i,j); - edge_vec.push_back(std::make_tuple(min_node, max_node)); + if (i > j) { + std::swap(i, j); } + edge_vec.push_back(std::make_tuple(i, j)); } } return edge_vec; @@ -154,16 +149,18 @@ edge_tuple generate_all_tuples(const int num_ref, if (self) { const size_t n_rows = (pow(2 * num_ref - 1, 2) - 1) / 8; for (long row_idx = 0; row_idx < n_rows; row_idx++) { - long i = calc_row_idx(row_idx, num_ref); - long j = calc_col_idx(row_idx, i, num_ref) + int_offset; + unsigned long i, j; + i = calc_row_idx(row_idx, num_ref); + j = calc_col_idx(row_idx, i, num_ref) + int_offset; i = i + int_offset; - long min_node = std::min(i,j); - long max_node = std::max(i,j); - edge_vec.push_back(std::make_tuple(min_node, max_node)); + if (i > j) { + std::swap(i, j); + } + edge_vec.push_back(std::make_tuple(i, j)); } } else { - for (long i = 0; i < num_ref; i++) { - for (long j = 0; j < num_queries; j++) { + for (unsigned long i = 0; i < num_ref; i++) { + for (unsigned long j = 0; j < num_queries; j++) { edge_vec.push_back(std::make_tuple(i, j + num_ref)); } } From 0ec1bcc2daaafc736d5fefb776b0839ec506d86d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 28 Jul 2021 12:21:01 +0100 Subject: [PATCH 169/175] Clarify model type variable names --- PopPUNK/assign.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 
15 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index cc79d41f..7517ac19 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -120,17 +120,17 @@ def assign_query(dbFuncs, # Iterate through different types of model fit with a refined model when specified # Core and accessory assignments use the same model and same overall set of distances # but have different networks, references, reference distances and assignments - fit_type_list = ['original'] + fit_type_list = ['default'] if model.type == 'refine' and model.indiv_fitted: if core: - fit_type_list.append('core') + fit_type_list.append('core_refined') if accessory: - fit_type_list.append('accessory') + fit_type_list.append('accessory_refined') for fit_type in fit_type_list: # Define file name extension file_extension_string = '' - if fit_type != 'original': + if fit_type != 'default': file_extension_string = '_' + fit_type # Find distances vs ref seqs rNames = [] @@ -153,7 +153,7 @@ def assign_query(dbFuncs, # construct database - use a single database directory for all query outputs if (web and json_sketch is not None): qNames = sketch_to_hdf5(json_sketch, output) - elif (fit_type == 'original'): + elif (fit_type == 'default'): # construct database createDatabaseDir(output, kmers) qNames = constructDatabase(q_files, @@ -166,7 +166,7 @@ def assign_query(dbFuncs, calc_random = False, use_gpu = gpu_sketch, deviceid = deviceid) - if (fit_type == 'original' or (fit_type != 'original' and use_ref_graph)): + if (fit_type == 'default' or (fit_type != 'default' and use_ref_graph)): # run query qrDistMat = queryDatabase(rNames = rNames, qNames = qNames, @@ -190,8 +190,8 @@ def assign_query(dbFuncs, model, rNames, ref_graph = use_ref_graph, - core_only = (fit_type == 'core'), - accessory_only = (fit_type == 'accessory'), + core_only = (fit_type == 'core_refined'), + accessory_only = (fit_type == 'accessory_refined'), use_gpu = gpu_graph) if max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) != 
(len(rNames) - 1): @@ -248,13 +248,13 @@ def assign_query(dbFuncs, else: # Assign these distances as within or between strain - if fit_type == 'original': + if fit_type == 'default': queryAssignments = model.assign(qrDistMat) dist_type = 'euclidean' - elif fit_type == 'core': + elif fit_type == 'core_refined': queryAssignments = model.assign(qrDistMat, slope = 0) dist_type = 'core' - elif fit_type == 'accessory': + elif fit_type == 'accessory_refined': queryAssignments = model.assign(qrDistMat, slope = 1) dist_type = 'accessory' @@ -276,8 +276,8 @@ def assign_query(dbFuncs, distances = distances, distance_type = dist_type, queryQuery = (update_db and - (fit_type == 'original' or - (fit_type != 'original' and use_ref_graph) + (fit_type == 'default' or + (fit_type != 'default' and use_ref_graph) ) ), strand_preserved = strand_preserved, @@ -304,7 +304,7 @@ def assign_query(dbFuncs, else: sys.stderr.write("Updating reference database to " + output + "\n") # Update the network + ref list (everything) - no need to duplicate for core/accessory - if fit_type == 'original': + if fit_type == 'default': joinDBs(ref_db, output, output, {"threads": threads, "strand_preserved": strand_preserved}) if model.type == 'lineage': @@ -341,7 +341,7 @@ def assign_query(dbFuncs, storePickle(combined_seq, combined_seq, True, complete_distMat, dists_out) # Copy model if needed - if output != model.outPrefix and fit_type == 'original': + if output != model.outPrefix and fit_type == 'default': model.copy(output) # Clique pruning From 6f74e520ff33dc49b3311c10292839ec97c4680d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 28 Jul 2021 20:13:19 +0100 Subject: [PATCH 170/175] Correct tuple generation order --- src/boundary.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/boundary.cpp b/src/boundary.cpp index 94638a43..d1991fa7 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -159,8 +159,8 @@ edge_tuple generate_all_tuples(const int num_ref, 
edge_vec.push_back(std::make_tuple(i, j)); } } else { - for (unsigned long i = 0; i < num_ref; i++) { - for (unsigned long j = 0; j < num_queries; j++) { + for (unsigned long j = 0; j < num_ref; j++) { + for (unsigned long i = 0; i < num_queries; i++) { edge_vec.push_back(std::make_tuple(i, j + num_ref)); } } From 6a90918d696b705fcfca50649da1bd6c56ad6ccd Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 28 Jul 2021 21:21:58 +0100 Subject: [PATCH 171/175] Tidy up section headings --- PopPUNK/visualise.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index e78b7163..f60333a2 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -329,11 +329,11 @@ def generate_visualisations(query_db, qlist, qr_distMat, qq_distMat, threads = threads) - #******************************# - #* *# - #* Extract subset of sequences *# - #* *# - #******************************# + #*******************************# + #* *# + #* Extract subset of sequences *# + #* *# + #*******************************# # extract subset of distances if requested if include_files is not None: @@ -357,11 +357,11 @@ def generate_visualisations(query_db, else: viz_subset = None - #******************************# - #* *# - #* Process clustering information *# - #* *# - #******************************# + #**********************************# + #* *# + #* Process clustering information *# + #* *# + #**********************************# # Either use strain definitions, lineage assignments or external clustering isolateClustering = {} @@ -430,11 +430,11 @@ def generate_visualisations(query_db, return_dict = True) isolateClustering = joinClusterDicts(isolateClustering, queryIsolateClustering) - #******************************# - #* *# - #* Generate trees *# - #* *# - #******************************# + #*******************# + #* *# + #* Generate trees *# + #* *# + #*******************# # Generate trees 
mst_tree = None @@ -524,11 +524,11 @@ def generate_visualisations(query_db, else: sys.stderr.write("Fewer than three sequences, not drawing trees\n") - #******************************# - #* *# - #* Write output *# - #* *# - #******************************# + #****************# + #* *# + #* Write output *# + #* *# + #****************# # Now have all the objects needed to generate selected visualisations if microreact: From e1a3ffd5de9eec702ec4b739d5eb59b6710ffd2f Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Wed, 28 Jul 2021 22:35:40 +0100 Subject: [PATCH 172/175] Update function arguments --- test/test-web.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/test-web.py b/test/test-web.py index a69505c1..d0045b6e 100644 --- a/test/test-web.py +++ b/test/test-web.py @@ -26,6 +26,7 @@ def main(): os.mkdir(outdir) args = default_options(species_db) qc_dict = {'run_qc': False } + print("Weights: " + str(args.assign.graph_weights)) dbFuncs = setupDBFuncs(args.assign, args.assign.min_kmer_count, qc_dict) ClusterResult = assign_query(dbFuncs, args.assign.ref_db, @@ -38,7 +39,7 @@ def main(): args.assign.threads, args.assign.overwrite, args.assign.plot_fit, - args.assign.graph_weights, + False, #args.assign.graph_weights, args.assign.max_a_dist, args.assign.max_pi_dist, args.assign.type_isolate, @@ -60,13 +61,14 @@ def main(): colours = get_colours(query, clusters) url = api(query, "example_viz") sys.stderr.write('PopPUNK-web assign test successful\n') - + print("Done clustering") # Test generate_visualisations() for PopPUNK-web sys.stderr.write('\nTesting visualisations for PopPUNK-web\n') if len(to_include) < 3: args.visualise.microreact = False generate_visualisations(outdir, species_db, + os.path.join(species_db, species_db + '.dists'), # distances, None, args.visualise.threads, outdir, @@ -83,7 +85,9 @@ def main(): species_db, species_db + "/" + os.path.basename(species_db) + "_clusters.csv", 
args.visualise.previous_query_clustering, - outdir + "/" + os.path.basename(outdir) + "_graph.gt", + None, # previous MST + None, # previous distances, + species_db + "/" + os.path.basename(species_db) + "_graph.gt", args.visualise.gpu_graph, args.visualise.info_csv, args.visualise.rapidnj, @@ -128,4 +132,4 @@ def main(): sys.stderr.write('\nAPI tests complete\n') if __name__ == "__main__": - main() \ No newline at end of file + main() From e81f6ff5df9378c5b6f6318cc2e97c30e054016b Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 29 Jul 2021 06:20:50 +0100 Subject: [PATCH 173/175] Fix graph suffix error --- PopPUNK/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index f6e6d4ed..99c733a1 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -505,7 +505,7 @@ def outputsForCytoscape(G, G_mst, isolate_names, clustering, outPrefix, epiCsv, suffix = '_cytoscape' else: suffix = suffix + '_cytoscape' - save_network(G, prefix = outPrefix, suffix = suffix + '_cytoscape', use_graphml = True) + save_network(G, prefix = outPrefix, suffix = suffix, use_graphml = True) if G_mst != None: isolate_labels = isolateNameToLabel(G_mst.vp.id) From 8dffc6150b8fcf8f72357517b5c2b6cd087b140d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 29 Jul 2021 06:33:22 +0100 Subject: [PATCH 174/175] Correct network input file --- test/test-web.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test-web.py b/test/test-web.py index d0045b6e..a12ba280 100644 --- a/test/test-web.py +++ b/test/test-web.py @@ -68,7 +68,7 @@ def main(): args.visualise.microreact = False generate_visualisations(outdir, species_db, - os.path.join(species_db, species_db + '.dists'), # distances, + os.path.join(outdir, outdir + '.dists'), # distances, None, args.visualise.threads, outdir, @@ -87,7 +87,7 @@ def main(): args.visualise.previous_query_clustering, None, # previous MST None, # previous distances, - species_db + "/" + 
os.path.basename(species_db) + "_graph.gt", + outdir + "/" + os.path.basename(outdir) + "_graph.gt", args.visualise.gpu_graph, args.visualise.info_csv, args.visualise.rapidnj, From 5facf69334e0517b63dff86406f72769cd86ae3e Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 29 Jul 2021 11:28:53 +0100 Subject: [PATCH 175/175] Check for strand-specific databases --- PopPUNK/info.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PopPUNK/info.py b/PopPUNK/info.py index 449b4c68..8a084d87 100644 --- a/PopPUNK/info.py +++ b/PopPUNK/info.py @@ -108,6 +108,10 @@ def main(): codon_phased = False print("Codon phased seeds:\t\t" + str(codon_phased)) + if 'use_rc' in ref_db.keys(): + use_rc = ref_db['sketches'].attrs['use_rc'] == 1 + print("Uses canonical k-mers:\t" + str(use_rc)) + # Stop if requested if args.simple: sys.exit(0)