From 04c8f0d13245527f0ba108d9ba3aeb8fb2cff1e1 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 1 Feb 2021 15:24:04 +0000
Subject: [PATCH 001/327] Change reference to clustering in MST plot

---
 PopPUNK/plot.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
index 521b41ae..92526178 100644
--- a/PopPUNK/plot.py
+++ b/PopPUNK/plot.py
@@ -446,12 +446,12 @@ def drawMST(mst, outPrefix, isolate_clustering, overwrite):
                   output=graph1_file_name, output_size=(3000, 3000))
     if overwrite or not os.path.isfile(graph2_file_name):
         cluster_fill = {}
-        for cluster in set(isolate_clustering['Cluster'].values()):
+        for cluster in set(isolate_clustering['Rank_50_Lineage'].values()):
             cluster_fill[cluster] = list(np.random.rand(3)) + [0.9]
         plot_color = mst.new_vertex_property('vector<double>')
         mst.vertex_properties['plot_color'] = plot_color
         for v in mst.vertices():
-            plot_color[v] = cluster_fill[isolate_clustering['Cluster'][mst.vp.id[v]]]
+            plot_color[v] = cluster_fill[isolate_clustering['Rank_50_Lineage'][mst.vp.id[v]]]
         gt.graph_draw(mst, pos=pos, vertex_fill_color=mst.vertex_properties['plot_color'],
                       output=graph2_file_name, output_size=(3000, 3000))

From 352a8aa558f317f8a259616e375ec41a850312fd Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 1 Feb 2021 21:39:50 +0000
Subject: [PATCH 002/327] Enable colouring by lineage with MSTs

---
 PopPUNK/plot.py       |  8 +++++---
 PopPUNK/sparse_mst.py | 18 ++++++++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
index 92526178..51e40e09 100644
--- a/PopPUNK/plot.py
+++ b/PopPUNK/plot.py
@@ -414,7 +414,7 @@ def distHistogram(dists, rank, outPrefix):
                 "_rank_" + str(rank) + "_histogram.png")
     plt.close()

-def drawMST(mst, outPrefix, isolate_clustering, overwrite):
+def drawMST(mst, outPrefix, isolate_clustering, clustering_name, overwrite):
     """Plot a layout of the minimum spanning tree

     Args:
@@ -424,6 +424,8 @@ def drawMST(mst, outPrefix, isolate_clustering, overwrite):
             Output prefix for save files
         isolate_clustering (dict)
             Dictionary of ID: cluster, used for colouring vertices
+        clustering_name (str)
+            Name of clustering scheme to be used for colouring
         overwrite (bool)
             Overwrite existing output files
     """
@@ -446,12 +448,12 @@ def drawMST(mst, outPrefix, isolate_clustering, overwrite):
                   output=graph1_file_name, output_size=(3000, 3000))
     if overwrite or not os.path.isfile(graph2_file_name):
         cluster_fill = {}
-        for cluster in set(isolate_clustering['Rank_50_Lineage'].values()):
+        for cluster in set(isolate_clustering[clustering_name].values()):
             cluster_fill[cluster] = list(np.random.rand(3)) + [0.9]
         plot_color = mst.new_vertex_property('vector<double>')
         mst.vertex_properties['plot_color'] = plot_color
         for v in mst.vertices():
-            plot_color[v] = cluster_fill[isolate_clustering['Rank_50_Lineage'][mst.vp.id[v]]]
+            plot_color[v] = cluster_fill[isolate_clustering[clustering_name][mst.vp.id[v]]]
         gt.graph_draw(mst, pos=pos, vertex_fill_color=mst.vertex_properties['plot_color'],
                       output=graph2_file_name, output_size=(3000, 3000))

diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index 50a9e4b1..a106e009 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -32,6 +32,7 @@ def get_options():
     iGroup.add_argument('--distances', required=True, help='Prefix of input pickle of pre-calculated distances (required)')
     iGroup.add_argument('--rank-fit', required=True, help='Location of rank fit, a sparse matrix (*_rank*_fit.npz)')
     iGroup.add_argument('--previous-clustering', help='CSV file with cluster definitions')
+    iGroup.add_argument('--display-cluster', default=None, help='Column of clustering CSV to use for plotting')

     # output options
     oGroup = parser.add_argument_group('Output options')
@@ -116,7 +117,7 @@ def main():
     if not args.no_plot:
         if args.previous_clustering != None:
             mode = "clusters"
-            if re.match(r"_lineages\.csv$", args.previous_clustering):
+            if args.previous_clustering.endswith('_lineages.csv'):
                 mode = "lineages"
             isolateClustering = readIsolateTypeFromCsv(args.previous_clustering,
                                                        mode = mode,
@@ -127,7 +128,20 @@
             for v in mst.vertices:
                 isolateClustering['Cluster'][mst.vp.id[v]] = '0'

-        drawMST(mst, args.output, isolateClustering, True)
+        # Check selecting clustering type is in CSV
+        clustering_name = 'Cluster'
+        if args.display_cluster != None and args.previous_clustering != None:
+            if args.display_cluster not in isolateClustering.keys():
+                sys.stderr.write('Unable to find clustering column ' + args.display_cluster + ' in file ' +
+                                 args.previous_clustering + '\n')
+                sys.exit()
+            else:
+                clustering_name = args.display_cluster
+        else:
+            args.display_cluster = list(isolateClustering.keys())[0]
+
+        # Draw MST
+        drawMST(mst, args.output, isolateClustering, args.display_cluster, True)

     sys.exit(0)

From 5bbc775f4ad7219a3b875db4b1a27b8944c547b9 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 1 Feb 2021 22:10:10 +0000
Subject: [PATCH 003/327] Fix name processing for display clustering

---
 PopPUNK/sparse_mst.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index a106e009..b53866fa 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -138,10 +138,10 @@ def main():
             else:
                 clustering_name = args.display_cluster
         else:
-            args.display_cluster = list(isolateClustering.keys())[0]
+            clustering_name = list(isolateClustering.keys())[0]

         # Draw MST
-        drawMST(mst, args.output, isolateClustering, args.display_cluster, True)
+        drawMST(mst, args.output, isolateClustering, clustering_name, True)

     sys.exit(0)
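Taken together, patches 002 and 003 make the MST colouring column selectable from the clustering CSV, falling back to the first column when --display-cluster is not given. A sketch of the resulting invocation, assuming an existing lineage fit; the paths and column name are illustrative, following the test data used elsewhere in this series:

    poppunk_mst --distances example_db/example_db.dists \
        --rank-fit example_lineages/example_lineages_rank5_fit.npz \
        --previous-clustering example_lineages/example_lineages_lineages.csv \
        --display-cluster Rank_5_Lineage \
        --output example_sparse_mst
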
From d70d65e2638a126d693daff8413287ed33142232 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 1 Feb 2021 22:25:03 +0000
Subject: [PATCH 004/327] Harmonise visualise.py with sparse_mst.py

---
 PopPUNK/visualise.py | 58 +++++++++++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 20 deletions(-)

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index d767a54d..ab9fdf73 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -53,15 +53,18 @@ def get_options():
                              'to clusters [default = reference database directory]',
                         type = str)
     iGroup.add_argument('--previous-clustering',
-                        help='Directory containing previous cluster definitions '
+                        help='File containing previous cluster definitions '
                              'and network [default = use that in the directory '
                              'containing the model]',
                         type = str)
     iGroup.add_argument('--previous-query-clustering',
-                        help='Directory containing previous cluster definitions '
+                        help='File containing previous cluster definitions '
                              'from poppunk_assign [default = use that in the directory '
-                             'containing the model]',
+                             'of the query database]',
                         type = str)
+    iGroup.add_argument('--display-cluster',
+                        help='Column of clustering CSV to use for plotting',
+                        default=None)

     # output options
     oGroup = parser.add_argument_group('Output options')
@@ -295,34 +298,37 @@ def generate_visualisations(query_db,
             sys.exit(1)

     # Load previous clusters
-    mode = "clusters"
-    suffix = "_clusters.csv"
-    if model.type == "lineage":
-        mode = "lineages"
-        suffix = "_lineages.csv"
-        if model.indiv_fitted:
-            sys.stderr.write("Note: Individual (core/accessory) fits found, but "
-                             "visualisation only supports combined boundary fit\n")
-
-    # Set directories of previous fit
     if previous_clustering is not None:
         prev_clustering = previous_clustering
+        mode = "clusters"
+        suffix = "_clusters.csv"
+        if prev_clustering.endswith('_lineages.csv'):
+            mode = "lineages"
+            suffix = "_lineages.csv"
     else:
-        prev_clustering = os.path.dirname(model_file)
-    cluster_file = prev_clustering + '/' + os.path.basename(prev_clustering) + suffix
-    isolateClustering = readIsolateTypeFromCsv(cluster_file,
+        # Identify type of clustering based on model
+        mode = "clusters"
+        suffix = "_clusters.csv"
+        if model.type == "lineage":
+            mode = "lineages"
+            suffix = "_lineages.csv"
+            if model.indiv_fitted:
+                sys.stderr.write("Note: Individual (core/accessory) fits found, but "
+                                 "visualisation only supports combined boundary fit\n")
+        prev_clustering = os.path.dirname(model_file) + '/' + os.path.basename(model_file) + suffix
+    isolateClustering = readIsolateTypeFromCsv(prev_clustering,
                                                mode = mode,
                                                return_dict = True)

     # Join clusters with query clusters if required
     if not self:
         if previous_query_clustering is not None:
-            prev_query_clustering = previous_query_clustering + '/' + os.path.basename(previous_query_clustering)
+            prev_query_clustering = previous_query_clustering
         else:
-            prev_query_clustering = query_db
+            prev_query_clustering = os.path.dirname(query_db) + '/' + os.path.basename(query_db) + suffix

         queryIsolateClustering = readIsolateTypeFromCsv(
-                prev_query_clustering + suffix,
+                prev_query_clustering,
                 mode = mode,
                 return_dict = True)
         isolateClustering = joinClusterDicts(isolateClustering, queryIsolateClustering)
@@ -348,7 +354,19 @@ def generate_visualisations(query_db,
                                  weights_type=mst_distances,
                                  summarise=False)
             mst_graph = generate_minimum_spanning_tree(G)
-            drawMST(mst_graph, output, isolateClustering, overwrite)
+            # Check selecting clustering type is in CSV
+            clustering_name = 'Cluster'
+            if args.display_cluster != None:
+                if args.display_cluster not in isolateClustering.keys():
+                    sys.stderr.write('Unable to find clustering column ' + args.display_cluster + ' in file ' +
+                                     prev_clustering + '\n')
+                    sys.exit()
+                else:
+                    clustering_name = args.display_cluster
+            else:
+                clustering_name = list(isolateClustering.keys())[0]
+            # Draw MST
+            drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite)
             mst_tree = mst_to_phylogeny(mst_graph, isolateNameToLabel(combined_seq))
         else:
             mst_tree = existing_tree

From 77bd637b31db75172da7f21e017830d44e369a9a Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 1 Feb 2021 22:27:42 +0000
Subject: [PATCH 005/327] Pass display cluster argument correctly

---
 PopPUNK/visualise.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index ab9fdf73..8bfeef25 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -153,6 +153,7 @@ def generate_visualisations(query_db,
                             overwrite,
                             core_only,
                             accessory_only,
+                            display_cluster,
                             web):

     from .models import loadClusterFit
@@ -356,13 +357,13 @@ def generate_visualisations(query_db,
             mst_graph = generate_minimum_spanning_tree(G)
             # Check selecting clustering type is in CSV
             clustering_name = 'Cluster'
-            if args.display_cluster != None:
-                if args.display_cluster not in isolateClustering.keys():
-                    sys.stderr.write('Unable to find clustering column ' + args.display_cluster + ' in file ' +
+            if display_cluster != None:
+                if display_cluster not in isolateClustering.keys():
+                    sys.stderr.write('Unable to find clustering column ' + display_cluster + ' in file ' +
                                      prev_clustering + '\n')
                     sys.exit()
                 else:
-                    clustering_name = args.display_cluster
+                    clustering_name = display_cluster
             else:
                 clustering_name = list(isolateClustering.keys())[0]
             # Draw MST
@@ -461,6 +462,7 @@ def main():
                             args.overwrite,
                             args.core_only,
                             args.accessory_only,
+                            args.display_cluster,
                             web = False)

 if __name__ == '__main__':

From 1aed21aabd0a20731bad031235a3690a1c9996ef Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 08:12:14 +0000
Subject: [PATCH 006/327] Update test for new argument parsing

---
 test/run_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/run_test.py b/test/run_test.py
index c3f11051..ad249492 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -62,7 +62,7 @@
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --query-db example_query --output example_viz_query --microreact", shell=True, check=True)
-subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True)
+subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages/example_dbscan_clusters.csv --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --distances example_query/example_query.dists --ref-db example_db --model-dir example_lineages --query-db example_lineage_query --output example_viz_query_lineages --microreact", shell=True, check=True)

 # MST
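With --display-cluster now passed through poppunk_visualise as well, the same column choice applies when an MST is drawn as part of a visualisation. An illustrative invocation (database names follow the test suite above; the column name and flag combination are assumptions at this point in the series):

    poppunk_visualise --ref-db example_db --output example_mst \
        --microreact --tree mst --display-cluster Rank_5_Lineage
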
From 282dc921a42cf7163f15528c56cae23e9561d79f Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 09:42:50 +0000
Subject: [PATCH 007/327] Fix reference to fetchNetwork

---
 PopPUNK/visualise.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index 8bfeef25..385294b3 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -425,7 +425,12 @@ def generate_visualisations(query_db,

     if cytoscape:
         sys.stderr.write("Writing cytoscape output\n")
-        genomeNetwork, cluster_file = fetchNetwork(prev_clustering, model, rlist, False, core_only, accessory_only)
+        genomeNetwork, cluster_file = fetchNetwork(os.path.dirname(prev_clustering),
+                                                   model,
+                                                   rlist,
+                                                   False,
+                                                   core_only,
+                                                   accessory_only)
         outputsForCytoscape(genomeNetwork, mst_graph, isolateClustering, output, info_csv, viz_subset = viz_subset)
         if model.type == 'lineage':
             sys.stderr.write("Note: Only support for output of cytoscape graph at lowest rank\n")

From 6aba143c0e15465499c24e824d842cc9c589b17f Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 10:32:39 +0000
Subject: [PATCH 008/327] Do not exit when display cluster is missing

---
 PopPUNK/visualise.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index 385294b3..75682248 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -359,9 +359,9 @@ def generate_visualisations(query_db,
             clustering_name = 'Cluster'
             if display_cluster != None:
                 if display_cluster not in isolateClustering.keys():
+                    clustering_name = list(isolateClustering.keys())[0]
                     sys.stderr.write('Unable to find clustering column ' + display_cluster + ' in file ' +
-                                     prev_clustering + '\n')
-                    sys.exit()
+                                     prev_clustering + '; instead using ' + clustering_name + '\n')
                 else:
                     clustering_name = display_cluster
             else:

From 51b939fdf1ddb1a8f259c1840a425a0efee57c9f Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 10:35:18 +0000
Subject: [PATCH 009/327] Update documents for --previous-clustering

---
 docs/visualisation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/visualisation.rst b/docs/visualisation.rst
index 666daa7a..43d655e8 100644
--- a/docs/visualisation.rst
+++ b/docs/visualisation.rst
@@ -44,7 +44,7 @@ Visualisation after query assignment::

 Visualisation when sketches and models are in different folders::

-    poppunk_visualise --ref-db example_db --previous-clustering example_lineages \
+    poppunk_visualise --ref-db example_db --previous-clustering example_lineages/example_lineages_lineages.csv \
        --model-dir example_lineages --output example_viz --microreact

 Visualisation with a lineage model, which has been queried (query-query distances must be provided)::

From 1f7d34fbd5a041413a520b68b7de718ee96068b0 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 10:38:10 +0000
Subject: [PATCH 010/327] Update tests for visualisation

---
 test/run_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/run_test.py b/test/run_test.py
index ad249492..0e028fd9 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -62,7 +62,7 @@
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --query-db example_query --output example_viz_query --microreact", shell=True, check=True)
-subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages/example_dbscan_clusters.csv --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True)
+subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages/example_lineages_lineages.csv --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --distances example_query/example_query.dists --ref-db example_db --model-dir example_lineages --query-db example_lineage_query --output example_viz_query_lineages --microreact", shell=True, check=True)

 # MST

From 9df22acc518bada0769a59431824e41c7785da52 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 11:14:06 +0000
Subject: [PATCH 011/327] Indent change to MST calculation

---
 PopPUNK/sparse_mst.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index b53866fa..e0ad75f1 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -92,6 +92,7 @@ def main():
         G_cu = cugraph.Graph()
         G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False)

+        # Generate minimum spanning tree
         sys.stderr.write("Calculating MST (GPU part)\n")
         G_mst = cugraph.minimum_spanning_tree(G_cu, weight='weights')
         edge_df = G_mst.view_edge_list()
@@ -106,8 +107,10 @@ def main():
         G = constructNetwork(rlist, rlist, None, 0, sparse_input=sparse_mat, summarise=False)
         sys.stderr.write("Calculating MST (CPU)\n")
-
-    mst = generate_minimum_spanning_tree(G, args.gpu_graph)
+        # Generate minimum spanning tree
+        mst = generate_minimum_spanning_tree(G, args.gpu_graph)
+
+    # Save output
     sys.stderr.write("Generating output\n")
     mst.save(args.output + "/" + os.path.basename(args.output) + ".graphml", fmt="graphml")
     mst_as_tree = mst_to_phylogeny(mst, rlist)

From 89ccbb0abb0b49956f43f52023beab1a5fa777d6 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 11:34:49 +0000
Subject: [PATCH 012/327] Correct grammar in help message

---
 scripts/poppunk_easy_run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/poppunk_easy_run.py b/scripts/poppunk_easy_run.py
index ccefca10..28d53e87 100755
--- a/scripts/poppunk_easy_run.py
+++ b/scripts/poppunk_easy_run.py
@@ -13,7 +13,7 @@ def get_options():
                                      prog='easy_run')

     # input options
-    parser.add_argument('--r-files', help='List of sequence names and files (as for --r-files')
+    parser.add_argument('--r-files', help='List of sequence names and files (as for --r-files)')
     parser.add_argument('--output', help='Prefix for output files')
     parser.add_argument('--analysis-args', help="Other arguments to pass to poppunk. e.g. "
From e5f4a229ef39558d666f1eb89cbf2d5bb1f1d13d Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 15:26:28 +0000
Subject: [PATCH 013/327] Add batch MST script

---
 scripts/poppunk_batch_mst.py | 158 +++++++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100755 scripts/poppunk_batch_mst.py

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
new file mode 100755
index 00000000..06c0e5ef
--- /dev/null
+++ b/scripts/poppunk_batch_mst.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+# vim: set fileencoding=<encoding name> :
+# Copyright 2018-2020 John Lees and Nick Croucher
+
+# universal
+import os
+import sys
+import argparse
+import subprocess
+import shutil
+import glob
+from collections import defaultdict
+
+def write_batch(batched_sequences, files, batch, output):
+    out_fn = output + '.' + batch + '.list'
+    with open(out_fn,'w') as out_file:
+        for seq in batched_sequences[batch]:
+            out_file.write(seq + "\t" + files[seq] + "\n")
+    return out_fn
+
+# command line parsing
+def get_options():
+
+    parser = argparse.ArgumentParser(description='Batch MST mode (create db + lineage model fit + assign)',
+                                     prog='batch_mst')
+
+    # input options
+    ioGroup = parser.add_argument_group('Input and output file options')
+    ioGroup.add_argument('--batch-file', help='Tab-separated list of sequence names, files '
+                                              'and batch assignments',
+                         required = True)
+    ioGroup.add_argument('--batch-order', help='File specifying order in which batches should '
+                                               'be processed')
+    ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch',
+                         default=False,
+                         action='store_true')
+    ioGroup.add_argument('--output', help='Prefix for output files', required=True)
+
+    # analysis options
+    aGroup = parser.add_argument_group('Analysis options')
+    aGroup.add_argument('--rank', help='Rank used to fit lineage model (int)',
+                        type = int,
+                        default = 1)
+    aGroup.add_argument('--threads', help='Number of threads for parallelisation (int)',
+                        type = int,
+                        default = 1)
+    aGroup.add_argument('--gpu', help='Use GPU for analysis',
+                        default=False,
+                        action='store_true')
+    aGroup.add_argument('--deviceid', help='GPU device ID (int)',
+                        type = int,
+                        default = 0)
+    aGroup.add_argument('--db-args', help="Other arguments to pass to poppunk. e.g. "
+                                          "'--min-k 13 --max-k 29'",
+                        default = "")
+    aGroup.add_argument('--model-args', help="Other arguments to pass to lineage model fit",
+                        default = "")
+    aGroup.add_argument('--assign-args', help="Other arguments to pass to poppunk_assign",
+                        default = "")
+
+    # Executable options
+    eGroup = parser.add_argument_group('Executable locations')
+    eGroup.add_argument('--poppunk-exe', help="Location of poppunk executable. Use "
+                                              "'python poppunk-runner.py' to run from source tree")
+    eGroup.add_argument('--assign-exe', help="Location of poppunk executable. Use "
+                                             "'python poppunk-runner.py' to run from source tree")
+    eGroup.add_argument('--mst-exe', help="Location of poppunk executable. Use "
+                                          "'python poppunk-runner.py' to run from source tree")
+
+    return parser.parse_args()
+
+# main code
+if __name__ == "__main__":
+
+    # Check input ok
+    args = get_options()
+
+    # Get poppunk executable
+    if args.poppunk_exe is None:
+        poppunk = "poppunk"
+    else:
+        poppunk = args.poppunk_exe
+    # Need to add poppunk_assign_exe
+
+    # Check input file and batching
+    batch_set = set()
+    files = {}
+    batched_sequences = defaultdict(list)
+    with open(args.batch_file,'r') as input_file:
+        for line in input_file.readlines():
+            info = line.rstrip().split()
+            files[info[0]] = info[1]
+            batch_set.add(info[2])
+            batched_sequences[info[2]].append(info[0])
+
+    # Check on batch order
+    batches = []
+    if args.batch_order is not None:
+        with open(args.batch_order,'r') as order_file:
+            batches = [line for line in input_file.readlines().rstrip()]
+        if set(batches) != batch_set:
+            batch_discrepancies = set(batches).difference(batch_set) + \
+                                  batch_set.difference(set(batches))
+            sys.stderr.write('Discrepancies between input file and batch '
+                             'ordering: ' + str(batch_discrepancies) + '\n')
+            sys.exit()
+    else:
+        batches = list(batch_set)
+
+    # Iterate through batches
+    first_batch = True
+    current_dir = args.output
+    for batch in batches:
+        # Write batch file
+        batch_fn = write_batch(batched_sequences, files, batch, args.output)
+        if first_batch:
+            # Initialise database
+            create_db_cmd = poppunk + " --create-db --r-files " + batch_fn + " --output " + args.output + " " + args.db_args + " --threads " + str(args.threads) + " " + args.db_args
+            if args.gpu:
+                create_db_cmd = create_db_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid)
+            sys.stderr.write(create_db_cmd + "\n")
+            subprocess.run(create_db_cmd, shell=True, check=True)
+            # Fit lineage model
+            fit_model_cmd = poppunk + " --fit-model lineage --ref-db " + args.output + " --rank " + str(args.rank) + " --threads " + str(args.threads) + " " + args.model_args
+            sys.stderr.write(fit_model_cmd + "\n")
+            subprocess.run(fit_model_cmd, shell=True, check=True)
+            # Completed first batch
+            first_batch = False
+        else:
+            # Define batch prefix
+            batch_prefix = args.output + "_" + batch
+            # Add to first batch through querying
+            assign_cmd = "poppunk_assign --db " + current_dir + " --query " + batch_fn + " --model-dir " + args.output + " --output " + batch_prefix + " --threads " + str(args.threads) + " --update-db " + args.assign_args
+            if args.gpu:
+                assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid)
+            sys.stderr.write(assign_cmd + "\n")
+            subprocess.run(assign_cmd, shell=True, check=True)
+            # Process output
+            if args.keep_intermediates:
+#                shutil.rmtree(batch_prefix)
+                current_dir = batch_prefix
+                print("Switch current dir to " + current_dir)
+            else:
+                for file in glob.glob(args.output + "_" + batch + "/*"):
+                    file_basename = os.path.basename(file)
+                    if file_basename.startswith(batch_prefix):
+                        print("Moving file " + args.output + "_" + batch + '/' + file_basename + " to " + current_dir + '/' + file_basename.replace(batch_prefix,args.output))
+                        os.rename(args.output + "_" + batch + '/' + file_basename,
+                                  current_dir + '/' + file_basename.replace(batch_prefix,args.output))
+                shutil.rmtree(args.output + "_" + batch)
+
+    # Remove npy dist file
+#    os.remove(args.output + "/" + args.output + ".dists.npy")
+
+    # Calculate MST
+    mst_command = "poppunk_mst --distances " + args.output + "/" + args.output + ".dists --rank-fit " + args.output + "/" + args.output + "_rank" + str(args.rank) + "_fit.npz --previous-clustering " + args.output + "/" + args.output + "_lineages.csv --output " + args.output + " --threads " + str(args.threads)
+    if args.gpu:
+        mst_command = mst_command + " --gpu-network"
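At this stage the new script drives the whole create-db / lineage-fit / repeated poppunk_assign --update-db cycle from a single three-column, tab-separated --batch-file. A sketch of that input and an invocation (sample names, file names and batch labels are invented for the example):

    sample1	assemblies/sample1.fasta	batch1
    sample2	assemblies/sample2.fasta	batch1
    sample3	assemblies/sample3.fasta	batch2

    python scripts/poppunk_batch_mst.py --batch-file batches.tsv \
        --rank 2 --threads 4 --output all_samples
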
+ "/" + args.output + "_lineages.csv --output " + args.output + " --threads " + str(args.threads) + if args.gpu: + mst_command = mst_command + " --gpu-network" From 4da168254545328e5d2fffdcca4629b32e2236ce Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 3 Feb 2021 18:15:18 +0000 Subject: [PATCH 014/327] Refactor batch script --- PopPUNK/sparse_mst.py | 27 ++-- scripts/poppunk_batch_mst.py | 233 +++++++++++++++++++---------------- setup.py | 1 + 3 files changed, 141 insertions(+), 120 deletions(-) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index e0ad75f1..71b78f10 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -29,9 +29,9 @@ def get_options(): # input options iGroup = parser.add_argument_group('Input files') - iGroup.add_argument('--distances', required=True, help='Prefix of input pickle of pre-calculated distances (required)') iGroup.add_argument('--rank-fit', required=True, help='Location of rank fit, a sparse matrix (*_rank*_fit.npz)') iGroup.add_argument('--previous-clustering', help='CSV file with cluster definitions') + iGroup.add_argument('--distance-pkl', help='Input pickle from distances, which contains sample names') iGroup.add_argument('--display-cluster', default=None, help='Column of clustering CSV to use for plotting') # output options @@ -66,6 +66,18 @@ def main(): sys.stderr.write("cugraph and cudf unavailable\n") raise ImportError(e) + # Read in sample names + if (args.distance_pkl is not None) ^ (args.previous_clustering is not None): + sys.stderr.write("To label strains, both --distance-pkl and --previous-clustering" + " must be provided\n") + sys.exit(1) + elif os.path.exists(args.distance_pkl): + with open(args.distances + ".pkl", 'rb') as pickle_file: + rlist, qlist, self = pickle.load(pickle_file) + if not self: + sys.stderr.write("This script must be run on a full all-v-all model\n") + sys.exit(1) + # Check output path ok if not os.path.isdir(args.output): try: @@ -75,13 +87,6 @@ def main(): sys.exit(1) setGtThreads(args.threads) - # Read in sample names - with open(args.distances + ".pkl", 'rb') as pickle_file: - rlist, qlist, self = pickle.load(pickle_file) - if not self: - sys.stderr.write("This script must be run on a full all-v-all model\n") - sys.exit(1) - # Create network with sparse dists sys.stderr.write("Loading distances into graph\n") sparse_mat = sparse.load_npz(args.rank_fit) @@ -107,9 +112,9 @@ def main(): G = constructNetwork(rlist, rlist, None, 0, sparse_input=sparse_mat, summarise=False) sys.stderr.write("Calculating MST (CPU)\n") - # Generate minimum spanning tree - mst = generate_minimum_spanning_tree(G, args.gpu_graph) - + + mst = generate_minimum_spanning_tree(G, args.gpu_graph) + # Save output sys.stderr.write("Generating output\n") mst.save(args.output + "/" + os.path.basename(args.output) + ".graphml", fmt="graphml") diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 06c0e5ef..77208d30 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # vim: set fileencoding= : -# Copyright 2018-2020 John Lees and Nick Croucher +# Copyright 2018-2021 John Lees and Nick Croucher # universal import os @@ -9,150 +9,165 @@ import subprocess import shutil import glob +import tempfile from collections import defaultdict -def write_batch(batched_sequences, files, batch, output): - out_fn = output + '.' 
+ batch + '.list' - with open(out_fn,'w') as out_file: - for seq in batched_sequences[batch]: - out_file.write(seq + "\t" + files[seq] + "\n") - return out_fn +rfile_names = "rlist.txt" # command line parsing def get_options(): - parser = argparse.ArgumentParser(description='Batch MST mode (create db + lineage model fit + assign)', - prog='batch_mst') + parser = argparse.ArgumentParser(description='Batch MST mode (create db + lineage model fit + assign + sparse_mst)', + prog='poppunk_batch_mst') # input options ioGroup = parser.add_argument_group('Input and output file options') - ioGroup.add_argument('--batch-file', help='Tab-separated list of sequence names, files ' - 'and batch assignments', - required = True) - ioGroup.add_argument('--batch-order', help='File specifying order in which batches should ' - 'be processed') + ioGroup.add_argument('--r-files', help="Sample names and locations (as for poppunk --r-files)", + required=True) + ioGroup.add_argument('--batch-file', help="Batches to process samples in --r-files in", + required = True) + ioGroup.add_argument('--output', help='Prefix for output files', required=True) + ioGroup.add_argument('--previous-clustering', help='CSV file with previous clusters in MST drawing', + default=None) ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch', default=False, action='store_true') - ioGroup.add_argument('--output', help='Prefix for output files', required=True) - + # analysis options aGroup = parser.add_argument_group('Analysis options') aGroup.add_argument('--rank', help='Rank used to fit lineage model (int)', - type = int, - default = 1) + type = int, + default = 10) aGroup.add_argument('--threads', help='Number of threads for parallelisation (int)', - type = int, - default = 1) - aGroup.add_argument('--gpu', help='Use GPU for analysis', - default=False, - action='store_true') + type = int, + default = 1) + aGroup.add_argument('--use-gpu', help='Use GPU for analysis', + default=False, + action='store_true') aGroup.add_argument('--deviceid', help='GPU device ID (int)', - type = int, - default = 0) + type = int, + default = 0) aGroup.add_argument('--db-args', help="Other arguments to pass to poppunk. e.g. " "'--min-k 13 --max-k 29'", - default = "") + default = "") aGroup.add_argument('--model-args', help="Other arguments to pass to lineage model fit", - default = "") + default = "") aGroup.add_argument('--assign-args', help="Other arguments to pass to poppunk_assign", - default = "") + default = "") # Executable options eGroup = parser.add_argument_group('Executable locations') eGroup.add_argument('--poppunk-exe', help="Location of poppunk executable. Use " - "'python poppunk-runner.py' to run from source tree") + "'python poppunk-runner.py' to run from source tree", + default="poppunk") eGroup.add_argument('--assign-exe', help="Location of poppunk executable. Use " - "'python poppunk-runner.py' to run from source tree") + "'python poppunk_assign-runner.py' to run from source tree", + default="poppunk_assign") eGroup.add_argument('--mst-exe', help="Location of poppunk executable. 
Use " - "'python poppunk-runner.py' to run from source tree") + "'python poppunk_mst-runner.py' to run from source tree", + default="poppunk_visulaise") return parser.parse_args() +def writeBatch(rlines, batches, batch_selected): + tmpdir = tempfile.mkdtemp(prefix="pp_mst", dir="./") + with open(tmpdir + "/" + rfile_names, 'w') as outfile: + for rline, batch in zip(rlines, batches): + if batch == batch_selected: + outfile.write(rline) + + return tmpdir + +def runCmd(cmd_string): + sys.stderr.write("Running command:\n") + sys.stderr.write(cmd_string) + subprocess.run(cmd_string, shell=True, check=True) + # main code if __name__ == "__main__": # Check input ok args = get_options() - - # Get poppunk executable - if args.poppunk_exe is None: - poppunk = "poppunk" - else: - poppunk = args.poppunk_exe - # Need to add poppunk_assign_exe + if args.previous_clustering is not None and \ + not os.path.isfile(args.previous_clustering): + sys.stderr.write("Provided --previous-clustering file cannot be found\n") + sys.exit(1) # Check input file and batching - batch_set = set() - files = {} - batched_sequences = defaultdict(list) - with open(args.batch_file,'r') as input_file: - for line in input_file.readlines(): - info = line.rstrip().split() - files[info[0]] = info[1] - batch_set.add(info[2]) - batched_sequences[info[2]].append(info[0]) - - # Check on batch order + rlines = [] batches = [] - if args.batch_order is not None: - with open(args.batch_order,'r') as order_file: - batches = [line for line in input_file.readlines().rstrip()] - if set(batches) != batch_set: - batch_discrepancies = set(batches).difference(batch_set) + \ - batch_set.difference(set(batches)) - sys.stderr.write('Discrepancies between input file and batch ' - 'ordering: ' + str(batch_discrepancies) + '\n') - sys.exit() - else: - batches = list(batch_set) - - # Iterate through batches - first_batch = True - current_dir = args.output - for batch in batches: - # Write batch file - batch_fn = write_batch(batched_sequences, files, batch, args.output) - if first_batch: - # Initialise database - create_db_cmd = poppunk + " --create-db --r-files " + batch_fn + " --output " + args.output + " " + args.db_args + " --threads " + str(args.threads) + " " + args.db_args - if args.gpu: - create_db_cmd = create_db_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) - sys.stderr.write(create_db_cmd + "\n") - subprocess.run(create_db_cmd, shell=True, check=True) - # Fit lineage model - fit_model_cmd = poppunk + " --fit-model lineage --ref-db " + args.output + " --rank " + str(args.rank) + " --threads " + str(args.threads) + " " + args.model_args - sys.stderr.write(fit_model_cmd + "\n") - subprocess.run(fit_model_cmd, shell=True, check=True) - # Completed first batch - first_batch = False - else: - # Define batch prefix - batch_prefix = args.output + "_" + batch - # Add to first batch through querying - assign_cmd = "poppunk_assign --db " + current_dir + " --query " + batch_fn + " --model-dir " + args.output + " --output " + batch_prefix + " --threads " + str(args.threads) + " --update-db " + args.assign_args + with open(args.r_file,'rU') as r_file, open(args.batch_file, 'rU') as batch_file: + for r_line, batch_line in zip(r_file, batch_file): + rlines.append(r_line) + batch_fields = batch_line.rstrip() + batches.append(batch_fields) + + batch_names = sorted(set(batches)) + if len(batch_names) < 2: + sys.stderr.write("You must supply multiple batches") + sys.exit(1) + first_batch = batch_names.pop(0) + + # try/except block to clean up 
tmp files + wd = writeBatch(rlines, batches, first_batch) + tmp_dirs = [] + try: + # First batch is create DB + lineage + create_db_cmd = args.poppunk_exe + " --create-db --r-files " + \ + wd + "/" + rfile_names + \ + " --output " + wd + " " + \ + args.db_args + " --threads " + \ + str(args.threads) + " " + \ + args.db_args + if args.gpu: + create_db_cmd += " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) + runCmd(create_db_cmd) + + # Fit lineage model + fit_model_cmd = args.poppunk_exe + " --fit-model lineage --ref-db " + \ + wd + " --rank " + \ + str(args.rank) + " --threads " + \ + str(args.threads) + " " + \ + args.model_args + runCmd(fit_model_cmd) + + for batch_idx, batch in enumerate(batch_names): + batch_wd = writeBatch(rlines, batches, batch) + tmp_dirs.append(batch_wd) + + assign_cmd = args.assign_exe + " --db " + wd + \ + " --query " + batch_wd + "/" + rfile_names + \ + " --model-dir " + wd + " --output " + batch_wd + \ + " --threads " + str(args.threads) + " --update-db " + \ + args.assign_args if args.gpu: - assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid) - sys.stderr.write(assign_cmd + "\n") - subprocess.run(assign_cmd, shell=True, check=True) - # Process output - if args.keep_intermediates: -# shutil.rmtree(batch_prefix) - current_dir = batch_prefix - print("Switch current dir to " + current_dir) - else: - for file in glob.glob(args.output + "_" + batch + "/*"): - file_basename = os.path.basename(file) - if file_basename.startswith(batch_prefix): - print("Moving file " + args.output + "_" + batch + '/' + file_basename + " to " + current_dir + '/' + file_basename.replace(batch_prefix,args.output)) - os.rename(args.output + "_" + batch + '/' + file_basename, - current_dir + '/' + file_basename.replace(batch_prefix,args.output)) - shutil.rmtree(args.output + "_" + batch) - - # Remove npy dist file -# os.remove(args.output + "/" + args.output + ".dists.npy") - - # Calculate MST - mst_command = "poppunk_mst --distances " + args.output + "/" + args.output + ".dists --rank-fit " + args.output + "/" + args.output + "_rank" + str(args.rank) + "_fit.npz --previous-clustering " + args.output + "/" + args.output + "_lineages.csv --output " + args.output + " --threads " + str(args.threads) - if args.gpu: - mst_command = mst_command + " --gpu-network" + assign_cmd = assign_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) + runCmd(assign_cmd) + + # Remove the previous batch + if batch_idx > 0 and args.keep_intermediates == False: + shutil.rmtree(tmp_dirs[batch_idx - 1]) + + # Calculate MST + output_dir = tmp_dirs[-1] + mst_command = args.mst_ext + " --distance-pkl " + output_dir + \ + "/" + output_dir + ".dists.pkl --rank-fit " + \ + output_dir + "/" + output_dir + "_rank" + \ + str(args.rank) + "_fit.npz " + \ + "--previous-clustering " + args.previous_clustering + \ + " --output " + args.output + \ + " --threads " + str(args.threads) + if args.gpu: + mst_command = mst_command + " --gpu-graph" + runCmd(mst_command) + except: + if args.keep_intermediates == False: + for tmpdir in tmp_dirs: + shutil.rmtree(wd) + shutil.rmtree(tmpdir) + print("Unexpected error:", sys.exc_info()[0]) + raise + + if args.keep_intermediates == False: + shutil.rmtree(wd) + shutil.rmtree(output_dir) \ No newline at end of file diff --git a/setup.py b/setup.py index a0bea06d..27166238 100644 --- a/setup.py +++ b/setup.py @@ -119,6 +119,7 @@ def build_extension(self, ext): scripts=['scripts/poppunk_calculate_rand_indices.py', 
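After this refactor the script takes two parallel inputs: a standard --r-files list and a --batch-file giving one batch label per corresponding line, with the first (sorted) batch used to build the database and the rest assigned to it. A sketch of the pair of files and an invocation (all names invented for illustration):

    # rfiles.txt
    sample1	assemblies/sample1.fasta
    sample2	assemblies/sample2.fasta
    sample3	assemblies/sample3.fasta

    # batches.txt
    batch1
    batch1
    batch2

    python scripts/poppunk_batch_mst.py --r-files rfiles.txt \
        --batch-file batches.txt --rank 5 --output all_samples
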
From fe4cb41e62fcfd5133e61a68b94d46a0cf38997c Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 05:00:16 +0000
Subject: [PATCH 015/327] Update sparse_mst distance file processing

---
 PopPUNK/sparse_mst.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index 71b78f10..6133881c 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -72,7 +72,7 @@ def main():
                          " must be provided\n")
         sys.exit(1)
     elif os.path.exists(args.distance_pkl):
-        with open(args.distances + ".pkl", 'rb') as pickle_file:
+        with open(args.distance_pkl, 'rb') as pickle_file:
             rlist, qlist, self = pickle.load(pickle_file)
         if not self:
             sys.stderr.write("This script must be run on a full all-v-all model\n")

From 3098aeaba985aeb7a8730810c2b7d3849fb59aa8 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 05:15:39 +0000
Subject: [PATCH 016/327] Consistent argument names

---
 scripts/poppunk_batch_mst.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 77208d30..939767f3 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -96,7 +96,7 @@ def runCmd(cmd_string):
     # Check input file and batching
     rlines = []
     batches = []
-    with open(args.r_file,'rU') as r_file, open(args.batch_file, 'rU') as batch_file:
+    with open(args.r_files,'r') as r_file, open(args.batch_file, 'r') as batch_file:
         for r_line, batch_line in zip(r_file, batch_file):
             rlines.append(r_line)
             batch_fields = batch_line.rstrip()
@@ -119,7 +119,7 @@
             args.db_args + " --threads " + \
             str(args.threads) + " " + \
             args.db_args
-        if args.gpu:
+        if args.use_gpu:
             create_db_cmd += " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid)
         runCmd(create_db_cmd)
@@ -140,7 +140,7 @@
                          " --model-dir " + wd + " --output " + batch_wd + \
                          " --threads " + str(args.threads) + " --update-db " + \
                          args.assign_args
-            if args.gpu:
+            if args.use_gpu:
                 assign_cmd = assign_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid)
             runCmd(assign_cmd)
@@ -157,7 +157,7 @@
                       "--previous-clustering " + args.previous_clustering + \
                       " --output " + args.output + \
                       " --threads " + str(args.threads)
-        if args.gpu:
+        if args.use_gpu:
             mst_command = mst_command + " --gpu-graph"
         runCmd(mst_command)
     except:

From 0eb0265ce01f2a6823e40cb228df69b5d1391c1e Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 05:45:52 +0000
Subject: [PATCH 017/327] Fix arguments to mst command

---
 scripts/poppunk_batch_mst.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 939767f3..9a8b77c7 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -60,12 +60,12 @@ def get_options():
     eGroup.add_argument('--poppunk-exe', help="Location of poppunk executable. Use "
                                               "'python poppunk-runner.py' to run from source tree",
                         default="poppunk")
-    eGroup.add_argument('--assign-exe', help="Location of poppunk executable. Use "
+    eGroup.add_argument('--assign-exe', help="Location of poppunk_assign executable. Use "
                                              "'python poppunk_assign-runner.py' to run from source tree",
                         default="poppunk_assign")
     eGroup.add_argument('--mst-exe', help="Location of poppunk executable. Use "
                                           "'python poppunk_mst-runner.py' to run from source tree",
-                        default="poppunk_visulaise")
+                        default="poppunk_mst")

     return parser.parse_args()

@@ -150,13 +150,16 @@

         # Calculate MST
         output_dir = tmp_dirs[-1]
-        mst_command = args.mst_ext + " --distance-pkl " + output_dir + \
-                      "/" + output_dir + ".dists.pkl --rank-fit " + \
-                      output_dir + "/" + output_dir + "_rank" + \
+        mst_command = args.mst_exe + " --distance-pkl " + output_dir + \
+                      "/" + os.path.basename(output_dir) + ".dists.pkl --rank-fit " + \
+                      output_dir + "/" + os.path.basename(output_dir) + "_rank" + \
                       str(args.rank) + "_fit.npz " + \
-                      "--previous-clustering " + args.previous_clustering + \
                       " --output " + args.output + \
                       " --threads " + str(args.threads)
+        if args.previous_clustering is not None:
+            mst_command = mst_command + " --previous-clustering " + args.previous_clustering
+        else:
+            mst_command = mst_command + " --previous-clustering " + os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")
         if args.use_gpu:
             mst_command = mst_command + " --gpu-graph"
         runCmd(mst_command)

From 6c723c662f7ef50693941f80a41f5d3093d8a78a Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 09:10:34 +0000
Subject: [PATCH 018/327] Update MST test

---
 test/run_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/run_test.py b/test/run_test.py
index 0e028fd9..aaf7eaa2 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -68,7 +68,7 @@
 # MST
 sys.stderr.write("Running MST\n")
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree mst", shell=True, check=True)
-subprocess.run("python ../poppunk_mst-runner.py --distances example_db/example_db.dists --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot", shell=True, check=True)
+subprocess.run("python ../poppunk_mst-runner.py --distance-pkl example_db/example_db.dists.pkl --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot", shell=True, check=True)

 # t-sne
 sys.stderr.write("Running tsne viz\n")
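These patches settle on pairing the sparse rank fit (*_rank*_fit.npz) with the .dists.pkl file that stores sample names. A minimal sketch of inspecting those two inputs directly, mirroring the loading code in sparse_mst.py (paths are illustrative, taken from the test suite):

    import pickle
    from scipy import sparse

    # Sparse distance matrix written by the lineage model fit
    sparse_mat = sparse.load_npz("example_lineages/example_lineages_rank5_fit.npz")

    # Sample names pickled alongside the dense distance matrix
    with open("example_db/example_db.dists.pkl", 'rb') as pickle_file:
        rlist, qlist, self_comparison = pickle.load(pickle_file)

    # For an all-v-all fit, rlist indexes both dimensions of the matrix
    print(sparse_mat.shape, len(rlist), self_comparison)
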
From a678080ab9813f885852cb18d1e33aa36118ebcb Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 09:47:15 +0000
Subject: [PATCH 019/327] Output updated sparse distance matrix with
 --update-db

---
 PopPUNK/assign.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
index c5a422bc..c7482f61 100644
--- a/PopPUNK/assign.py
+++ b/PopPUNK/assign.py
@@ -9,12 +9,15 @@
 import numpy as np
 import subprocess
 from collections import defaultdict
+import scipy.optimize
+from scipy.sparse import coo_matrix, bmat, find

 # required from v2.1.1 onwards (no mash support)
 import pp_sketchlib

 # import poppunk package
 from .__init__ import __version__
+from .models import rankFile

 #*******************************#
 #* *#
@@ -235,6 +238,14 @@ def assign_query(dbFuncs,
         else:
             genomeNetwork.save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt')

+        # Save sparse distance matrices
+        if model.type == 'lineage':
+            for rank in model.ranks:
+                scipy.sparse.save_npz(
+                    output + "/" + os.path.basename(output) + \
+                    rankFile(rank),
+                    self.nn_dists[rank])
+
         # Update distance matrices with all calculated distances
         if distances == None:
             distanceFiles = ref_db + "/" + os.path.basename(ref_db) + ".dists"

From ccae98e6ea2f1e00307281e072b6720fe47b35e7 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 10:18:49 +0000
Subject: [PATCH 020/327] Update lineage models for iterative MST generation

---
 PopPUNK/assign.py            | 11 +++--------
 scripts/poppunk_batch_mst.py |  3 ++-
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
index c7482f61..a2202108 100644
--- a/PopPUNK/assign.py
+++ b/PopPUNK/assign.py
@@ -235,17 +235,12 @@ def assign_query(dbFuncs,
             joinDBs(ref_db, output, output)
         if model.type == 'lineage':
             genomeNetwork[min(model.ranks)].save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt')
+            # Save sparse distance matrices and updated model
+            model.outPrefix = os.path.basename(output)
+            model.save()
         else:
             genomeNetwork.save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt')

-        # Save sparse distance matrices
-        if model.type == 'lineage':
-            for rank in model.ranks:
-                scipy.sparse.save_npz(
-                    output + "/" + os.path.basename(output) + \
-                    rankFile(rank),
-                    self.nn_dists[rank])
-
         # Update distance matrices with all calculated distances
         if distances == None:
             distanceFiles = ref_db + "/" + os.path.basename(ref_db) + ".dists"

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 9a8b77c7..9115945b 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -159,7 +159,8 @@
         if args.previous_clustering is not None:
             mst_command = mst_command + " --previous-clustering " + args.previous_clustering
         else:
-            mst_command = mst_command + " --previous-clustering " + os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")
+            mst_command = mst_command + " --previous-clustering " + \
+                          os.path.join(output_dir,output_dir + "_lineages.csv")
         if args.use_gpu:
             mst_command = mst_command + " --gpu-graph"
         runCmd(mst_command)

From b9fdf3a54f5f57ee298eb3e4f9efcfff6afab681 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 10:49:35 +0000
Subject: [PATCH 021/327] Save lineage definitions

---
 scripts/poppunk_batch_mst.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 9115945b..629b6e59 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -93,6 +93,9 @@ def runCmd(cmd_string):
         sys.stderr.write("Provided --previous-clustering file cannot be found\n")
         sys.exit(1)

+    # If no batch file is provided, generate one
+
+
     # Check input file and batching
     rlines = []
     batches = []
@@ -164,6 +167,11 @@ def runCmd(cmd_string):
         if args.use_gpu:
             mst_command = mst_command + " --gpu-graph"
         runCmd(mst_command)
+
+        # Retrieve lineages from previous round
+        os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv"),
+                  os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv"))
+
     except:

From 8192ba73eeb011d5ba27c9249efdbe603cd42b81 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 11:00:35 +0000
Subject: [PATCH 022/327] Retain named batches if requested

---
 scripts/poppunk_batch_mst.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 629b6e59..a822956a 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -32,7 +32,9 @@ def get_options():
     ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch',
                          default=False,
                          action='store_true')
-
+    ioGroup.add_argument('--use-batch-names', help='Name the stored outputs of each batch',
+                         default=False,
+                         action='store_true')
     # analysis options
     aGroup = parser.add_argument_group('Analysis options')
     aGroup.add_argument('--rank', help='Rank used to fit lineage model (int)',
@@ -69,8 +71,13 @@ def get_options():

     return parser.parse_args()

-def writeBatch(rlines, batches, batch_selected):
-    tmpdir = tempfile.mkdtemp(prefix="pp_mst", dir="./")
+def writeBatch(rlines, batches, batch_selected, use_names = False):
+    tmpdir = ""
+    if use_names:
+        tmpdir = "./pp_mst_" + batch_selected
+        os.mkdir(tmpdir)
+    else:
+        tmpdir = tempfile.mkdtemp(prefix="pp_mst", dir="./")
     with open(tmpdir + "/" + rfile_names, 'w') as outfile:
         for rline, batch in zip(rlines, batches):
             if batch == batch_selected:
@@ -116,7 +123,7 @@
     # try/except block to clean up tmp files
-    wd = writeBatch(rlines, batches, first_batch)
+    wd = writeBatch(rlines, batches, first_batch, args.use_batch_names)
     tmp_dirs = []
     try:
@@ -139,7 +146,7 @@
         for batch_idx, batch in enumerate(batch_names):
-            batch_wd = writeBatch(rlines, batches, batch)
+            batch_wd = writeBatch(rlines, batches, batch, args.use_batch_names)
             tmp_dirs.append(batch_wd)

From 1189b87ba42f499f5dc0390afd7984175de511bd Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 11:08:21 +0000
Subject: [PATCH 023/327] Allow for clustering with multiple lineage ranks

---
 scripts/poppunk_batch_mst.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index a822956a..d65f9e56 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -37,9 +37,9 @@ def get_options():
                          action='store_true')
     # analysis options
     aGroup = parser.add_argument_group('Analysis options')
-    aGroup.add_argument('--rank', help='Rank used to fit lineage model (int)',
-                        type = int,
-                        default = 10)
+    aGroup.add_argument('--rank', help='Comma separated ranks used to fit lineage model (list of ints)',
+                        type = str,
+                        default = "10")
     aGroup.add_argument('--threads', help='Number of threads for parallelisation (int)',
                         type = int,
                         default = 1)
@@ -100,6 +100,10 @@
         sys.stderr.write("Provided --previous-clustering file cannot be found\n")
         sys.exit(1)

+    # Check ranks
+    ranks = [int(rank) for rank in args.rank.split(',')]
+    max_rank = max(ranks)
+
     # Check input file and batching
     rlines = []
     batches = []
@@ -135,7 +139,7 @@
         # Fit lineage model
         fit_model_cmd = args.poppunk_exe + " --fit-model lineage --ref-db " + \
                         wd + " --rank " + \
-                        str(args.rank) + " --threads " + \
+                        args.rank + " --threads " + \
                         str(args.threads) + " " + \
                         args.model_args
         runCmd(fit_model_cmd)
@@ -162,7 +166,7 @@
         mst_command = args.mst_exe + " --distance-pkl " + output_dir + \
                       "/" + os.path.basename(output_dir) + ".dists.pkl --rank-fit " + \
                       output_dir + "/" + os.path.basename(output_dir) + "_rank" + \
-                      str(args.rank) + "_fit.npz " + \
+                      str(max_rank) + "_fit.npz " + \
                       " --output " + args.output + \
                       " --threads " + str(args.threads)
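With --rank now a comma-separated list, one database can carry several lineage rank fits, and the MST step uses the largest (max_rank). An illustrative invocation (file names invented, continuing the earlier example inputs):

    python scripts/poppunk_batch_mst.py --r-files rfiles.txt \
        --batch-file batches.txt --rank 1,5,10 --output all_samples

This would produce rank-specific fits named like <prefix>_rank1_fit.npz through <prefix>_rank10_fit.npz, with the rank 10 sparse matrix passed to poppunk_mst.
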
From 2d8813300cce59b06789f0088bc0a6b57d4b6771 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 11:21:48 +0000
Subject: [PATCH 024/327] Alter batch processing

---
 scripts/poppunk_batch_mst.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index d65f9e56..2cc78f39 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -100,25 +100,27 @@
         sys.stderr.write("Provided --previous-clustering file cannot be found\n")
         sys.exit(1)

-    # Check ranks
+    # Extract ranks
     ranks = [int(rank) for rank in args.rank.split(',')]
     max_rank = max(ranks)

-    # Check input file and batching
+    # Check batching
     rlines = []
     batches = []
-    with open(args.r_files,'r') as r_file, open(args.batch_file, 'r') as batch_file:
-        for r_line, batch_line in zip(r_file, batch_file):
-            rlines.append(r_line)
-            batch_fields = batch_line.rstrip()
-            batches.append(batch_fields)
-
+    with open(args.batch_file,'r') as batch_file:
+        batches = [batch_line.rstrip() for batch_line in batch_file.readlines()]
     batch_names = sorted(set(batches))
     if len(batch_names) < 2:
         sys.stderr.write("You must supply multiple batches")
         sys.exit(1)
     first_batch = batch_names.pop(0)

+    # Check input file
+    with open(args.r_files,'r') as r_file:
+        for r_line in r_file:
+            rlines.append(r_line)
+
+
     # try/except block to clean up tmp files
     wd = writeBatch(rlines, batches, first_batch, args.use_batch_names)
     tmp_dirs = []

From b9993ccda959bed26446347372fe332c581f771 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 11:39:08 +0000
Subject: [PATCH 025/327] Allow for automatic batching

---
 scripts/poppunk_batch_mst.py | 80 +++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 33 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 2cc78f39..8bb44acd 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -22,52 +22,55 @@ def get_options():

     # input options
     ioGroup = parser.add_argument_group('Input and output file options')
-    ioGroup.add_argument('--r-files', help="Sample names and locations (as for poppunk --r-files)",
-                         required=True)
-    ioGroup.add_argument('--batch-file', help="Batches to process samples in --r-files in",
-                         required = True)
-    ioGroup.add_argument('--output', help='Prefix for output files', required=True)
+    ioGroup.add_argument('--r-files', help='Sample names and locations (as for poppunk --r-files)',
+                         required=True)
+    ioGroup.add_argument('--batch-file', help='Single column list of batches to process samples in --r-files in')
+    ioGroup.add_argument('--n-batches', help='Number of batches for process if --batch-file is not specified',
+                         type=int,
+                         default=10)
+    ioGroup.add_argument('--output', help='Prefix for output files',
+                         required=True)
     ioGroup.add_argument('--previous-clustering', help='CSV file with previous clusters in MST drawing',
-                         default=None)
+                         default=None)
     ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch',
-                         default=False,
-                         action='store_true')
+                         default=False,
+                         action='store_true')
     ioGroup.add_argument('--use-batch-names', help='Name the stored outputs of each batch',
-                         default=False,
-                         action='store_true')
+                         default=False,
+                         action='store_true')
     # analysis options
     aGroup = parser.add_argument_group('Analysis options')
     aGroup.add_argument('--rank', help='Comma separated ranks used to fit lineage model (list of ints)',
-                        type = str,
-                        default = "10")
+                        type = str,
+                        default = "10")
     aGroup.add_argument('--threads', help='Number of threads for parallelisation (int)',
-                        type = int,
-                        default = 1)
+                        type = int,
+                        default = 1)
     aGroup.add_argument('--use-gpu', help='Use GPU for analysis',
-                        default=False,
-                        action='store_true')
+                        default=False,
+                        action='store_true')
     aGroup.add_argument('--deviceid', help='GPU device ID (int)',
-                        type = int,
-                        default = 0)
+                        type = int,
+                        default = 0)
     aGroup.add_argument('--db-args', help="Other arguments to pass to poppunk. e.g. "
                                           "'--min-k 13 --max-k 29'",
-                        default = "")
+                        default = "")
     aGroup.add_argument('--model-args', help="Other arguments to pass to lineage model fit",
-                        default = "")
+                        default = "")
     aGroup.add_argument('--assign-args', help="Other arguments to pass to poppunk_assign",
-                        default = "")
+                        default = "")

     # Executable options
     eGroup = parser.add_argument_group('Executable locations')
     eGroup.add_argument('--poppunk-exe', help="Location of poppunk executable. Use "
                                               "'python poppunk-runner.py' to run from source tree",
-                        default="poppunk")
+                        default="poppunk")
     eGroup.add_argument('--assign-exe', help="Location of poppunk_assign executable. Use "
                                              "'python poppunk_assign-runner.py' to run from source tree",
-                        default="poppunk_assign")
+                        default="poppunk_assign")
     eGroup.add_argument('--mst-exe', help="Location of poppunk executable. Use "
                                           "'python poppunk_mst-runner.py' to run from source tree",
-                        default="poppunk_mst")
+                        default="poppunk_mst")

     return parser.parse_args()
@@ -107,23 +110,34 @@
     # Check input file
     rlines = []
     with open(args.r_files,'r') as r_file:
         for r_line in r_file:
             rlines.append(r_line)

     # Check batching
     batches = []
-    with open(args.batch_file,'r') as batch_file:
-        batches = [batch_line.rstrip() for batch_line in batch_file.readlines()]
+    if args.batch_file:
+        # Read specified batches
+        with open(args.batch_file,'r') as batch_file:
+            batches = [batch_line.rstrip() for batch_line in batch_file.readlines()]
+    else:
+        # Generate arbitrary batches
+        n = 1
+        x = 0
+        while x < len(rlines):
+            if n > args.n_batches:
+                n = 1
+            batches.append(n)
+            n = n + 1
+            x = x + 1
+
+    # Validate batches
     batch_names = sorted(set(batches))
     if len(batch_names) < 2:
         sys.stderr.write("You must supply multiple batches")
         sys.exit(1)
     first_batch = batch_names.pop(0)

From 05c960940fe92eac565b7ef03b5f6dd806d36d53 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 14:44:23 +0000
Subject: [PATCH 026/327] Add parsing of additional information

---
 scripts/poppunk_batch_mst.py | 112 ++++++++++++++++++++++++++++++++---
 1 file changed, 105 insertions(+), 7 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 8bb44acd..f2bdaab4 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -11,6 +11,7 @@
 import glob
 import tempfile
 from collections import defaultdict
+import pandas as pd

 rfile_names = "rlist.txt"
@@ -29,6 +30,7 @@
     ioGroup.add_argument('--n-batches', help='Number of batches for process if --batch-file is not specified',
                          type=int,
                          default=10)
+    ioGroup.add_argument('--info-csv', help='CSV containing information about sequences', default=None)
     ioGroup.add_argument('--output', help='Prefix for output files',
                          required=True)
@@ -95,6 +97,83 @@
     sys.stderr.write(cmd_string)
     subprocess.run(cmd_string, shell=True, check=True)

+def readLineages(clustCSV):
+    clusters = defaultdict(dict)
+    # read CSV
+    clustersCsv = pd.read_csv(clustCSV, index_col = 0, quotechar='"')
+    # select relevant columns
+    type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('Rank_' in col or 'overall' in col)]
+    # read file
+    for row in clustersCsv.itertuples():
+        for cls_idx in type_columns:
+            cluster_name = clustersCsv.columns[cls_idx]
+            cluster_name = cluster_name.replace('__autocolour','')
+            clusters[cluster_name][row.Index] = str(row[cls_idx + 1])
+    # return data structure
+    return clusters
+
+def isolateNameToLabel(names):
+    labels = [name.split('/')[-1].split('.')[0] for name in names]
+    return labels
+
+def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
+                    epiCsv = None, suffix = '_Lineage'):
+    # set order of column names
+    colnames = ['ID']
+    for cluster_type in clustering:
+        col_name = cluster_type + suffix
+        colnames.append(col_name)
+    # process epidemiological data
+    d = defaultdict(list)
+    # process epidemiological data without duplicating names
+    # used by 
PopPUNK
+    columns_to_be_omitted = ['id', 'Id', 'ID', 'combined_Cluster__autocolour',
+                             'core_Cluster__autocolour', 'accessory_Cluster__autocolour',
+                             'overall_Lineage']
+    if epiCsv is not None:
+        epiData = pd.read_csv(epiCsv, index_col = False, quotechar='"')
+        epiData.index = isolateNameToLabel(epiData.iloc[:,0])
+        for e in epiData.columns.values:
+            if e not in columns_to_be_omitted:
+                colnames.append(str(e))
+    # get example clustering name for validation
+    example_cluster_title = list(clustering.keys())[0]
+    for name, label in zip(nodeNames, isolateNameToLabel(nodeLabels)):
+        print('Example: ' + example_cluster_title + '\nClustering: ' + str(clustering[example_cluster_title]))
+        if name in clustering[example_cluster_title]:
+            d['ID'].append(label)
+            for cluster_type in clustering:
+                col_name = cluster_type + suffix
+                d[col_name].append(clustering[cluster_type][name])
+            if epiCsv is not None:
+                if label in epiData.index:
+                    for col, value in zip(epiData.columns.values, epiData.loc[label].values):
+                        if col not in columns_to_be_omitted:
+                            d[col].append(str(value))
+                else:
+                    for col in epiData.columns.values:
+                        if col not in columns_to_be_omitted:
+                            d[col].append('nan')
+        else:
+            sys.stderr.write("Cannot find " + name + " in clustering\n")
+            sys.exit(1)
+    # print CSV
+    sys.stderr.write("Parsed data, now writing to CSV\n")
+    try:
+        pd.DataFrame(data=d).to_csv(outfile, columns = colnames, index = False)
+    except Exception as e:
+        sys.stderr.write("Problem with epidemiological data CSV: " + str(e) + "\n")
+    # check CSV
+    prev_col_items = -1
+    prev_col_name = "unknown"
+    for col in d:
+        this_col_items = len(d[col])
+        if prev_col_items > -1 and prev_col_items != this_col_items:
+            sys.stderr.write("Discrepant length between " + prev_col_name + \
+                             " (length of " + str(prev_col_items) + ") and " + \
+                             col + " (length of " + str(this_col_items) + ")\n")
+            sys.exit(1)
+
 # main code
 if __name__ == "__main__":
@@ -109,9 +188,14 @@ def runCmd(cmd_string):
 
     # Check input file
     rlines = []
+    nodeNames = []
+    nodeLabels = []
     with open(args.r_files,'r') as r_file:
         for r_line in r_file:
             rlines.append(r_line)
+            node_info = r_line.rstrip().split()
+            nodeNames.append(node_info[0])
+            nodeLabels.append(node_info[1])
 
     # Check batching
     batches = []
@@ -122,6 +206,7 @@ def runCmd(cmd_string):
     else:
         # Generate arbitrary batches
         x = 0
+        n = 1
         while x < len(rlines):
             if n > args.n_batches:
                 n = 1
@@ -137,7 +222,7 @@ def runCmd(cmd_string):
 
     # try/except block to clean up tmp files
     wd = writeBatch(rlines, batches, first_batch, args.use_batch_names)
-    tmp_dirs = []
+    tmp_dirs = [wd]
    try:
        # First batch is create DB + lineage
        create_db_cmd = args.poppunk_exe + " --create-db --r-files " + \
@@ -159,12 +244,13 @@ def runCmd(cmd_string):
        runCmd(fit_model_cmd)

        for batch_idx, batch in enumerate(batch_names):
+            prev_wd = tmp_dirs[-1]
            batch_wd = writeBatch(rlines, batches, batch, args.use_batch_names)
            tmp_dirs.append(batch_wd)

-            assign_cmd = args.assign_exe + " --db " + wd + \
+            assign_cmd = args.assign_exe + " --db " + prev_wd + \
                         " --query " + batch_wd + "/" + rfile_names + \
-                         " --model-dir " + wd + " --output " + batch_wd + \
+                         " --model-dir " + prev_wd + " --output " + batch_wd + \
                         " --threads " + str(args.threads) + " --update-db " + \
                         args.assign_args
            if args.use_gpu:
@@ -187,19 +273,31 @@ def runCmd(cmd_string):
                mst_command = mst_command + " --previous-clustering " + args.previous_clustering
            else:
                mst_command = mst_command + " --previous-clustering " + \
-                              os.path.join(output_dir,output_dir + "_lineages.csv")
+                              os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv")
            if args.use_gpu:
                mst_command = mst_command + " --gpu-graph"
            runCmd(mst_command)

-        # Retrieve lineages from previous round
+        # Retrieve isolate names and lineages from previous round
+        os.rename(os.path.join(output_dir,os.path.basename(output_dir) + ".dists.pkl"),
+                  os.path.join(args.output,os.path.basename(args.output) + ".dists.pkl"))
        os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv"),
                  os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv"))
-
+
+        # Merge with epidemiological data if requested
+        if args.info_csv is not None:
+            lineage_clustering = readLineages(os.path.join(args.output,
+                                 os.path.basename(args.output) + "_lineages.csv"))
+            writeClusterCsv(os.path.join(args.output,
+                            os.path.basename(args.output) + "_info.csv"),
+                            nodeNames,
+                            nodeLabels,
+                            lineage_clustering,
+                            epiCsv = args.info_csv)
+
    except:
        if args.keep_intermediates == False:
            for tmpdir in tmp_dirs:
-                shutil.rmtree(wd)
                shutil.rmtree(tmpdir)
        print("Unexpected error:", sys.exc_info()[0])
        raise
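The readLineages() helper added in the patch above returns a nested dict keyed first by clustering column and then by isolate name. A minimal, self-contained sketch of the same column-selection logic, using a toy in-memory table in place of a real *_lineages.csv (the sample names and values here are invented for illustration):

    from collections import defaultdict
    import pandas as pd

    # toy stand-in for a PopPUNK *_lineages.csv
    toy = pd.DataFrame({'id': ['s1', 's2'],
                        'Rank_1_Lineage': [1, 2],
                        'Rank_2_Lineage': [1, 1],
                        'overall_Lineage': ['1-1', '2-1']}).set_index('id')
    clusters = defaultdict(dict)
    # same selection rule as readLineages(): keep Rank_* and overall columns
    type_columns = [n for n, col in enumerate(toy.columns)
                    if ('Rank_' in col or 'overall' in col)]
    for row in toy.itertuples():
        for cls_idx in type_columns:
            cluster_name = toy.columns[cls_idx].replace('__autocolour', '')
            clusters[cluster_name][row.Index] = str(row[cls_idx + 1])
    print(clusters['Rank_1_Lineage'])  # {'s1': '1', 's2': '2'}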
"_lineages.csv") + os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv") if args.use_gpu: mst_command = mst_command + " --gpu-graph" runCmd(mst_command) - # Retrieve lineages from previous round + # Retrieve isolate names and lineages from previous round + os.rename(os.path.join(output_dir,os.path.basename(output_dir) + ".dists.pkl"), + os.path.join(args.output,os.path.basename(args.output) + ".dists.pkl")) os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv"), os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")) - + + # Merge with epidemiological data if requested + if args.info_csv is not None: + lineage_clustering = readLineages(os.path.join(args.output, + os.path.basename(args.output) + "_lineages.csv")) + writeClusterCsv(os.path.join(args.output, + os.path.basename(args.output) + "_info.csv"), + nodeNames, + nodeLabels, + lineage_clustering, + epiCsv = args.info_csv) + except: if args.keep_intermediates == False: for tmpdir in tmp_dirs: - shutil.rmtree(wd) shutil.rmtree(tmpdir) print("Unexpected error:", sys.exc_info()[0]) raise From 40238dd7e67a676f8e0a1cfd32cbc2ba5d487d59 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 5 Feb 2021 07:12:28 +0000 Subject: [PATCH 027/327] Enable expansion of previous MST --- PopPUNK/network.py | 61 +++++++++++++++++++++++++++++++++++- PopPUNK/sparse_mst.py | 12 +++++-- scripts/poppunk_batch_mst.py | 5 ++- 3 files changed, 74 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b5c9cadd..209b7af2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -266,9 +266,49 @@ def writeReferences(refList, outPrefix): return refFileName +def load_previous_network(prev_G, rlist, weights=False): + """Load previous network with graph-tool, extract the edges to match the + vertex order specified in rlist, and also return weights if specified. + + Args: + prev_G (str) + Path of file containing existing network. + rlist (list) + List of reference sequence labels in new network + weights (bool) + Whether to return edge weights + (default = False) + + Returns: + source_ids (list) + Source nodes for each edge + target_ids (list) + Target nodes for each edge + edge_weights (list) + Weights for each new edge + """ + # get list for translating node IDs to rlist + old_ids = prev_G.vp["id"] + old_id_indices = [rlist.index(x) for x in old_ids] + # get the source and target nods + source_old_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source") + target_old_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target") + # translate to indices + source_ids = [old_id_indices[x] for x in source_old_ids] + target_ids = [old_id_indices[x] for x in target_old_ids] + # convert to ndarray + # get the weights + if weights: + edge_weights = prev_G.ep['weight'] + # return values + return source_ids, target_ids, edge_weights + else: + return source_ids, target_ids + def constructNetwork(rlist, qlist, assignments, within_label, summarise = True, edge_list = False, weights = None, - weights_type = 'euclidean', sparse_input = None): + weights_type = 'euclidean', sparse_input = None, + previous_network = None): """Construct an unweighted, undirected network without self-loops. 
Nodes are samples and edges where samples are within the same cluster @@ -297,6 +337,9 @@ def constructNetwork(rlist, qlist, assignments, within_label, accessory or euclidean distance sparse_input (numpy.array) Sparse distance matrix from lineage fit + previous_network (str) + Name of file containing a previous network to be integrated into this new + network Returns: G (graph) @@ -348,6 +391,22 @@ def constructNetwork(rlist, qlist, assignments, within_label, edge_tuple = (ref, query) connections.append(edge_tuple) + # read previous graph + if previous_network is not None: + prev_G = gt.load_graph(previous_network) + if weights is not None: + extra_sources, extra_targets, extra_weights = load_previous_network(prev_G,rlist, + weights = True) + for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights): + edge_tuple = (ref, query, dist) + connections.append(edge_tuple) + else: + extra_sources, extra_targets = load_previous_network(prev_G,rlist, + weights = False) + for (ref, query) in zip(extra_sources, extra_targets): + edge_tuple = (ref, query) + connections.append(edge_tuple) + # build the graph G = gt.Graph(directed = False) G.add_vertex(len(vertex_labels)) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 6133881c..8c120b00 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -8,6 +8,7 @@ import pickle import re +import numpy as np import pandas as pd from scipy import sparse @@ -31,6 +32,8 @@ def get_options(): iGroup = parser.add_argument_group('Input files') iGroup.add_argument('--rank-fit', required=True, help='Location of rank fit, a sparse matrix (*_rank*_fit.npz)') iGroup.add_argument('--previous-clustering', help='CSV file with cluster definitions') + iGroup.add_argument('--previous-mst', help='Graph tool file from which previous MST can be loaded', + default=None) iGroup.add_argument('--distance-pkl', help='Input pickle from distances, which contains sample names') iGroup.add_argument('--display-cluster', default=None, help='Column of clustering CSV to use for plotting') @@ -109,8 +112,13 @@ def main(): weights=edge_df['weights'].values_host, summarise=False) else: - G = constructNetwork(rlist, rlist, None, 0, - sparse_input=sparse_mat, summarise=False) + if args.previous_mst is not None: + G = constructNetwork(rlist, rlist, None, 0, + sparse_input=sparse_mat, summarise=False, + previous_network = args.previous_mst) + else: + G = constructNetwork(rlist, rlist, None, 0, + sparse_input=sparse_mat, summarise=False) sys.stderr.write("Calculating MST (CPU)\n") mst = generate_minimum_spanning_tree(G, args.gpu_graph) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index f2bdaab4..8d87c080 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -34,6 +34,8 @@ def get_options(): required=True) ioGroup.add_argument('--previous-clustering', help='CSV file with previous clusters in MST drawing', default=None) + ioGroup.add_argument('--previous-mst', help='MST calculated from a subset of the data in graph tool format', + default=None) ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch', default=False, action='store_true') @@ -137,7 +139,6 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, # get example clustering name for validation example_cluster_title = list(clustering.keys())[0] for name, label in zip(nodeNames, isolateNameToLabel(nodeLabels)): - print('Example: ' + example_cluster_title + '\nClustering: ' + 
str(clustering[example_cluster_title]))
         if name in clustering[example_cluster_title]:
@@ -269,6 +270,8 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
                       str(max_rank) + "_fit.npz " + \
                       " --output " + args.output + \
                       " --threads " + str(args.threads)
+        if args.previous_mst is not None:
+            mst_command = mst_command + " --previous-mst " + args.previous_mst
         if args.previous_clustering is not None:
             mst_command = mst_command + " --previous-clustering " + args.previous_clustering
         else:
             mst_command = mst_command + " --previous-clustering " + \
                           os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv")

From 68f4fba2a294db6167bfa397e02fa8d29f5b94bd Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 5 Feb 2021 07:26:35 +0000
Subject: [PATCH 028/327] Add graph extension for GPUs

---
 PopPUNK/sparse_mst.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index 8c120b00..80c794ed 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -15,7 +15,7 @@
 
 # import poppunk package
 from .__init__ import __version__
-from .network import constructNetwork, generate_minimum_spanning_tree
+from .network import constructNetwork, generate_minimum_spanning_tree, load_previous_network
 from .plot import drawMST
 from .trees import mst_to_phylogeny, write_tree
 from .utils import setGtThreads, readIsolateTypeFromCsv
@@ -94,9 +94,21 @@ def main():
     sys.stderr.write("Loading distances into graph\n")
     sparse_mat = sparse.load_npz(args.rank_fit)
     if args.gpu_graph:
-        G_df = cudf.DataFrame({'source': sparse_mat.row,
-                               'destination': sparse_mat.col,
-                               'weights': sparse_mat.data})
+        # Load previous MST if specified
+        if args.previous_mst is not None:
+            extra_sources, extra_targets, extra_weights = load_previous_network(args.previous_mst,
+                                                                                rlist,
+                                                                                weights = True)
+            sources = np.append(sparse_mat.row, np.asarray(extra_sources))
+            targets = np.append(sparse_mat.col, np.asarray(extra_targets))
+            weights = np.append(sparse_mat.data, np.asarray(extra_weights))
+        else:
+            sources = sparse_mat.row
+            targets = sparse_mat.col
+            weights = sparse_mat.data
+        G_df = cudf.DataFrame({'source': sources,
+                               'destination': targets,
+                               'weights': weights})
         G_cu = cugraph.Graph()
         G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False)
@@ -112,6 +124,7 @@ def main():
                                  weights=edge_df['weights'].values_host,
                                  summarise=False)
     else:
+        # Load previous MST if specified
         if args.previous_mst is not None:
             G = constructNetwork(rlist, rlist, None, 0,
                                  sparse_input=sparse_mat, summarise=False,
                                  previous_network = args.previous_mst)
         else:
             G = constructNetwork(rlist, rlist, None, 0,
                                  sparse_input=sparse_mat, summarise=False)
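In the GPU branch added above, edges retained from a previous MST are simply concatenated onto the edges of the new sparse rank fit, so that a single combined edge list is handed to cuGraph. A CPU-only sketch of that merging step, with invented edges standing in for a real *_rank*_fit.npz and a previously saved MST:

    import numpy as np
    from scipy import sparse

    # invented sparse rank fit over three isolates: two new edges
    sparse_mat = sparse.coo_matrix(([0.01, 0.02], ([0, 1], [1, 2])), shape=(3, 3))
    # invented edge retained from a previous MST
    extra_sources, extra_targets, extra_weights = [0], [2], [0.05]

    # same pattern as the GPU branch: append the old edges to the new ones
    sources = np.append(sparse_mat.row, np.asarray(extra_sources))
    targets = np.append(sparse_mat.col, np.asarray(extra_targets))
    weights = np.append(sparse_mat.data, np.asarray(extra_weights))
    for edge in zip(sources, targets, weights):
        print(edge)  # (0, 1, 0.01), (1, 2, 0.02), then (0, 2, 0.05)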
From 1b710722e2cb6956565cdae0b7af6b47f5bcedda Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 5 Feb 2021 10:51:55 +0000
Subject: [PATCH 029/327] Fix iterative MST mode

---
 PopPUNK/network.py           |   5 +-
 scripts/poppunk_batch_mst.py | 101 ++++++++++++++++++++++++++++-------
 2 files changed, 85 insertions(+), 21 deletions(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index 209b7af2..4390e901 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -301,6 +301,7 @@ def load_previous_network(prev_G, rlist, weights=False):
     if weights:
         edge_weights = prev_G.ep['weight']
         # return values
+        print("Old weights: " + str(edge_weights))
         return source_ids, target_ids, edge_weights
     else:
         return source_ids, target_ids
@@ -394,11 +395,11 @@ def constructNetwork(rlist, qlist, assignments, within_label,
     # read previous graph
     if previous_network is not None:
         prev_G = gt.load_graph(previous_network)
-        if weights is not None:
+        if weights is not None or sparse_input is not None:
            extra_sources, extra_targets, extra_weights = load_previous_network(prev_G,rlist,
                                                                                weights = True)
            for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights):
-               edge_tuple = (ref, query, dist)
+               edge_tuple = (ref, query, weight)
                connections.append(edge_tuple)
        else:
            extra_sources, extra_targets = load_previous_network(prev_G,rlist,
diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 8d87c080..d3166fec 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -34,8 +34,9 @@ def get_options():
                          required=True)
     ioGroup.add_argument('--previous-clustering', help='CSV file with previous clusters in MST drawing',
                          default=None)
-    ioGroup.add_argument('--previous-mst', help='MST calculated from a subset of the data in graph tool format',
-                         default=None)
+    ioGroup.add_argument('--iterative-mst', help='Re-calculate the MST for each batch',
+                         default=False,
+                         action='store_true')
     ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch',
                          default=False,
                          action='store_true')
@@ -94,7 +95,7 @@ def writeBatch(rlines, batches, batch_selected, use_names = False):
 
 def runCmd(cmd_string):
     sys.stderr.write("Running command:\n")
-    sys.stderr.write(cmd_string)
+    sys.stderr.write(cmd_string + '\n')
     subprocess.run(cmd_string, shell=True, check=True)
 
 def readLineages(clustCSV):
@@ -176,6 +177,10 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
 # main code
 if __name__ == "__main__":
 
+    ###########
+    # Prepare #
+    ###########
+
     # Check input ok
     args = get_options()
     if args.previous_clustering is not None and \
@@ -225,6 +230,11 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
     wd = writeBatch(rlines, batches, first_batch, args.use_batch_names)
     tmp_dirs = [wd]
    try:
+
+        ###############
+        # First batch #
+        ###############
+
        # First batch is create DB + lineage
        create_db_cmd = args.poppunk_exe + " --create-db --r-files " + \
                        wd + "/" + rfile_names + \
@@ -243,7 +253,26 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
                        str(args.threads) + " " + \
                        args.model_args
        runCmd(fit_model_cmd)
-
+
+        # Calculate MST if operating iteratively
+        if args.iterative_mst:
+
+            mst_command = args.mst_exe + " --distance-pkl " + wd + \
+                          "/" + os.path.basename(wd) + ".dists.pkl --rank-fit " + \
+                          wd + "/" + os.path.basename(wd) + "_rank" + \
+                          str(max_rank) + "_fit.npz " + \
+                          " --output " + wd + \
+                          " --threads " + str(args.threads) + \
+                          " --previous-clustering " + wd + \
+                          "/" + os.path.basename(wd) + "_lineages.csv"
+            if args.use_gpu:
+                mst_command = mst_command + " --gpu-graph"
+            runCmd(mst_command)
+
+        ###########
+        # Iterate #
+        ###########
+
        for batch_idx, batch in enumerate(batch_names):
            prev_wd = tmp_dirs[-1]
            batch_wd = writeBatch(rlines, batches, batch, args.use_batch_names)
            tmp_dirs.append(batch_wd)
@@ -257,29 +286,63 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
                         " --model-dir " + prev_wd + " --output " + batch_wd + \
                         " --threads " + str(args.threads) + " --update-db " + \
                         args.assign_args
            if args.use_gpu:
                assign_cmd = assign_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid)
            runCmd(assign_cmd)
+
+            # Calculate MST if operating iteratively
+            if args.iterative_mst:
+
+                mst_command = args.mst_exe + " --distance-pkl " + batch_wd + \
+                              "/" + os.path.basename(batch_wd) + ".dists.pkl --rank-fit " + \
+                              batch_wd + "/" + os.path.basename(batch_wd) + "_rank" + \
+                              str(max_rank) + "_fit.npz " + \
+                              " --output " + batch_wd + \
+                              " --threads " + str(args.threads) + \
+                              " --previous-mst " + \
+                              prev_wd + "/" + os.path.basename(prev_wd) + ".graphml" + \
+                              " --previous-clustering " + batch_wd + \
+                              "/" + os.path.basename(batch_wd) + "_lineages.csv"
+                if args.use_gpu:
+ mst_command = mst_command + " --gpu-graph" + runCmd(mst_command) # Remove the previous batch if batch_idx > 0 and args.keep_intermediates == False: shutil.rmtree(tmp_dirs[batch_idx - 1]) + ########## + # Finish # + ########## + # Calculate MST output_dir = tmp_dirs[-1] - mst_command = args.mst_exe + " --distance-pkl " + output_dir + \ - "/" + os.path.basename(output_dir) + ".dists.pkl --rank-fit " + \ - output_dir + "/" + os.path.basename(output_dir) + "_rank" + \ - str(max_rank) + "_fit.npz " + \ - " --output " + args.output + \ - " --threads " + str(args.threads) - if args.previous_mst is not None: - mst_command = mst_command + " --previous-mst " + args.previous_mst - if args.previous_clustering is not None: - mst_command = mst_command + " --previous-clustering " + args.previous_clustering + if args.iterative_mst: + # Create directory + if os.path.exists(args.output): + if os.path.isdir(args.output): + shutil.rmtree(args.output) + else: + os.remove(args.output) + os.mkdir(args.output) + # Copy over final MST + shutil.copy(os.path.join(output_dir,os.path.basename(output_dir) + ".graphml"), + os.path.join(args.output,os.path.basename(args.output) + ".graphml")) + shutil.copy(os.path.join(output_dir,os.path.basename(output_dir) + "_MST.nwk"), + os.path.join(args.output,os.path.basename(args.output) + "_MST.nwk")) else: - mst_command = mst_command + " --previous-clustering " + \ - os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv") - if args.use_gpu: - mst_command = mst_command + " --gpu-graph" - runCmd(mst_command) + # Calculate MST + mst_command = args.mst_exe + " --distance-pkl " + output_dir + \ + "/" + os.path.basename(output_dir) + ".dists.pkl --rank-fit " + \ + output_dir + "/" + os.path.basename(output_dir) + "_rank" + \ + str(max_rank) + "_fit.npz " + \ + " --output " + args.output + \ + " --threads " + str(args.threads) + if args.previous_clustering is not None: + mst_command = mst_command + " --previous-clustering " + args.previous_clustering + else: + mst_command = mst_command + " --previous-clustering " + \ + os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv") + if args.use_gpu: + mst_command = mst_command + " --gpu-graph" + runCmd(mst_command) # Retrieve isolate names and lineages from previous round os.rename(os.path.join(output_dir,os.path.basename(output_dir) + ".dists.pkl"), From 05ffc813f272f91ba10d417b6b36e56e1df6e2f4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 5 Feb 2021 11:10:11 +0000 Subject: [PATCH 030/327] Add output to lineage model fitting --- scripts/poppunk_batch_mst.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index d3166fec..42fe6fcd 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -251,7 +251,8 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, wd + " --rank " + \ args.rank + " --threads " + \ str(args.threads) + " " + \ - args.model_args + args.model_args + \ + " --output " + args.output runCmd(fit_model_cmd) # Calculate MST if operating iteratively From fef9e0127dd09016be766c3d20b2e1ca958b0694 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 5 Feb 2021 11:26:34 +0000 Subject: [PATCH 031/327] Remove unnecessary flags/messages --- PopPUNK/network.py | 1 - scripts/poppunk_batch_mst.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4390e901..ad8206d5 100644 --- a/PopPUNK/network.py +++ 
b/PopPUNK/network.py
@@ -301,7 +301,6 @@ def load_previous_network(prev_G, rlist, weights=False):
     if weights:
         edge_weights = prev_G.ep['weight']
         # return values
-        print("Old weights: " + str(edge_weights))
         return source_ids, target_ids, edge_weights
     else:
         return source_ids, target_ids
diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 42fe6fcd..d3166fec 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -251,8 +251,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
                        wd + " --rank " + \
                        args.rank + " --threads " + \
                        str(args.threads) + " " + \
-                        args.model_args + \
-                        " --output " + args.output
+                        args.model_args
        runCmd(fit_model_cmd)

From 46ef3a1965d6b8055b5700dd8e6359b421345e1e Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 5 Feb 2021 22:09:46 +0000
Subject: [PATCH 032/327] Consistent behaviour between GPU and non-GPU
 processes

---
 PopPUNK/network.py    | 10 +++++-----
 PopPUNK/sparse_mst.py |  1 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index ad8206d5..d059d57d 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -266,12 +266,12 @@ def writeReferences(refList, outPrefix):
     return refFileName
 
-def load_previous_network(prev_G, rlist, weights=False):
+def load_previous_network(prev_G_fn, rlist, weights=False):
     """Load previous network with graph-tool, extract the edges to match the
     vertex order specified in rlist, and also return weights if specified.
 
     Args:
-        prev_G (str)
+        prev_G_fn (str)
             Path of file containing existing network.
         rlist (list)
             List of reference sequence labels in new network
         weights (bool)
             Whether to return edge weights
             (default = False)
 
     Returns:
         source_ids (list)
             Source nodes for each edge
         target_ids (list)
             Target nodes for each edge
         edge_weights (list)
             Weights for each new edge
     """
     # get list for translating node IDs to rlist
+    prev_G = gt.load_graph(prev_G_fn)
     old_ids = prev_G.vp["id"]
     old_id_indices = [rlist.index(x) for x in old_ids]
     # get the source and target nodes
@@ -299,7 +300,7 @@ def load_previous_network(prev_G_fn, rlist, weights=False):
     # convert to ndarray
     # get the weights
     if weights:
-        edge_weights = prev_G.ep['weight']
+        edge_weights = list(prev_G.ep['weight'])
         # return values
@@ -393,9 +394,8 @@ def constructNetwork(rlist, qlist, assignments, within_label,
     # read previous graph
     if previous_network is not None:
-        prev_G = gt.load_graph(previous_network)
         if weights is not None or sparse_input is not None:
-            extra_sources, extra_targets, extra_weights = load_previous_network(prev_G,rlist,
+            extra_sources, extra_targets, extra_weights = load_previous_network(previous_network,rlist,
                                                                                 weights = True)
diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index 80c794ed..c5d15757 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -96,6 +96,7 @@ def main():
     if args.gpu_graph:
         # Load previous MST if specified
         if args.previous_mst is not None:
+            sys.stderr.write("Loading previous MST from " + args.previous_mst + "\n")
             extra_sources, extra_targets, extra_weights = load_previous_network(args.previous_mst,
                                                                                 rlist,
                                                                                 weights = True)

From 2a04c21e27852820a42c41fea72a9442e41c2704 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 5 Feb 2021 22:14:40 +0000
Subject: [PATCH 033/327] Remove gpu-sketch flags

---
 scripts/poppunk_batch_mst.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py
b/scripts/poppunk_batch_mst.py index d3166fec..0d84c956 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -243,7 +243,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, str(args.threads) + " " + \ args.db_args if args.use_gpu: - create_db_cmd += " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) + create_db_cmd += " --gpu-dist --deviceid " + str(args.deviceid) runCmd(create_db_cmd) # Fit lineage model @@ -284,7 +284,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, " --threads " + str(args.threads) + " --update-db " + \ args.assign_args if args.use_gpu: - assign_cmd = assign_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) + assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid) runCmd(assign_cmd) # Calculate MST if operating iteratively From 7309dc24346a97f4329961dea642780427332b07 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 8 Feb 2021 10:22:50 +0000 Subject: [PATCH 034/327] Add QC options to assign --- PopPUNK/assign.py | 46 ++++++++++++++++++++++++++++++++++-- scripts/poppunk_batch_mst.py | 45 +++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index a2202108..2142731f 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -335,9 +335,24 @@ def get_options(): 'k-mers [default = use canonical k-mers]') # qc options - qcGroup = parser.add_argument_group('Quality control options') + qcGroup = parser.add_argument_group('Quality control options for distances') + qcGroup.add_argument('--qc-filter', help='Behaviour following sequence QC step: "stop" [default], "prune"' + ' (analyse data passing QC), or "continue" (analyse all data)', + default='stop', type = str, choices=['stop', 'prune', 'continue']) + qcGroup.add_argument('--retain-failures', help='Retain sketches of genomes that do not pass QC filters in ' + 'separate database [default = False]', default=False, action='store_true') qcGroup.add_argument('--max-a-dist', help='Maximum accessory distance to permit [default = 0.5]', default = 0.5, type = float) + qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' + 'which sequences will be excluded [default = 5]', default = None, type = int) + qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded ' + '[two values needed - lower and upper bounds]', default=[None,None], + type = int, nargs = 2) + qcGroup.add_argument('--prop-n', help='Threshold ambiguous base proportion above which sequences will be excluded' + ' [default = 0.1]', default = None, + type = float) + qcGroup.add_argument('--upper-n', help='Threshold ambiguous base count above which sequences will be excluded', + default=None, type = int) # sequence querying queryingGroup = parser.add_argument_group('Database querying options') @@ -389,7 +404,34 @@ def main(): from .utils import setupDBFuncs # Dict of QC options for passing to database construction and querying functions - qc_dict = {'run_qc': False } + if args.length_sigma is None and None in args.length_range and args.prop_n is None \ + and args.upper_n is None: + qc_dict = {'run_qc': False } + else: + # define defaults if one QC parameter given + # length_sigma + if args.length_sigma is not None: + length_sigma = args.length_sigma + elif None in args.length_range: + length_sigma = 5 # default used in __main__ + else: + length_sigma = None + # prop_n + if 
args.prop_n is not None: + prop_n = args.prop_n + elif args.upper_n is None: + prop_n = 0.1 # default used in __main__ + else: + prop_n = None + qc_dict = { + 'run_qc': True, + 'qc_filter': args.qc_filter, + 'retain_failures': args.retain_failures, + 'length_sigma': length_sigma, + 'length_range': args.length_range, + 'prop_n': prop_n, + 'upper_n': args.upper_n + } # Dict of DB access functions for assign_query (which is out of scope) dbFuncs = setupDBFuncs(args, args.min_kmer_count, qc_dict) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index d3166fec..e299c527 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -65,6 +65,26 @@ def get_options(): aGroup.add_argument('--assign-args', help="Other arguments to pass to poppunk_assign", default = "") + # QC options + qcGroup = parser.add_argument_group('Quality control options for distances') + qcGroup.add_argument('--qc-filter', help='Behaviour following sequence QC step: "stop" [default], "prune"' + ' (analyse data passing QC), or "continue" (analyse all data)', + default='stop', type = str, choices=['stop', 'prune', 'continue']) + qcGroup.add_argument('--retain-failures', help='Retain sketches of genomes that do not pass QC filters in ' + 'separate database [default = False]', default=False, action='store_true') + qcGroup.add_argument('--max-a-dist', help='Maximum accessory distance to permit [default = 0.5]', + default = 0.5, type = float) + qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' + 'which sequences will be excluded [default = 5]', default = None, type = int) + qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded ' + '[two values needed - lower and upper bounds]', default=[None,None], + type = int, nargs = 2) + qcGroup.add_argument('--prop-n', help='Threshold ambiguous base proportion above which sequences will be excluded' + ' [default = 0.1]', default = None, + type = float) + qcGroup.add_argument('--upper-n', help='Threshold ambiguous base count above which sequences will be excluded', + default=None, type = int) + # Executable options eGroup = parser.add_argument_group('Executable locations') eGroup.add_argument('--poppunk-exe', help="Location of poppunk executable. 
Use " @@ -242,6 +262,16 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, args.db_args + " --threads " + \ str(args.threads) + " " + \ args.db_args + # QC options + if None not in args.length_range: + create_db_cmd += " --length-range " + str(length_range[0]) + " " + str(length_range[1]) + elif args.length_sigma is not None: + create_db_cmd += " --length-sigma " + str(args.length_sigma) + if args.upper_n is not None: + create_db_cmd += " --upper-n " + str(args.upper_n) + elif args.prop_n is not None: + create_db_cmd += " --prop-n " + str(args.prop_n) + # GPU options if args.use_gpu: create_db_cmd += " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) runCmd(create_db_cmd) @@ -265,6 +295,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, " --threads " + str(args.threads) + \ " --previous-clustering " + wd + \ "/" + os.path.basename(wd) + "_lineages.csv" + # GPU options if args.use_gpu: mst_command = mst_command + " --gpu-graph" runCmd(mst_command) @@ -283,6 +314,20 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, " --model-dir " + prev_wd + " --output " + batch_wd + \ " --threads " + str(args.threads) + " --update-db " + \ args.assign_args + # QC options + if None not in args.length_range: + create_db_cmd += " --length-range " + str(length_range[0]) + " " + str(length_range[1]) + elif args.length_sigma is not None: + create_db_cmd += " --length-sigma " + str(args.length_sigma) + else: + create_db_cmd += " --length-sigma 5" # default from __main__ + if args.upper_n is not None: + create_db_cmd += " --upper-n " + str(args.upper_n) + elif args.prop_n is not None: + create_db_cmd += " --prop-n " + str(args.prop_n) + else: + create_db_cmd += " --prop-n 0.1" # default from __main__ + # GPU options if args.use_gpu: assign_cmd = assign_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) runCmd(assign_cmd) From 73515b365e592af4fdf0bb5460f47124fa3bec3b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 8 Feb 2021 10:36:55 +0000 Subject: [PATCH 035/327] Fix QC option parsing --- scripts/poppunk_batch_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index f8ccebf0..421dad8f 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -264,7 +264,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, args.db_args # QC options if None not in args.length_range: - create_db_cmd += " --length-range " + str(length_range[0]) + " " + str(length_range[1]) + create_db_cmd += " --length-range " + str(args.length_range[0]) + " " + str(args.length_range[1]) elif args.length_sigma is not None: create_db_cmd += " --length-sigma " + str(args.length_sigma) if args.upper_n is not None: From 8c86be812a64c8c023fb04d7935fc24636b485a0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 8 Feb 2021 12:24:36 +0000 Subject: [PATCH 036/327] Define qc filter behaviour --- scripts/poppunk_batch_mst.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 421dad8f..33979610 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -271,6 +271,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, create_db_cmd += " --upper-n " + str(args.upper_n) elif args.prop_n is not None: create_db_cmd += " --prop-n " + str(args.prop_n) + create_db_cmd += " --qc-filter " + args.qc_filter # GPU options if args.use_gpu: create_db_cmd += " --gpu-dist 
--deviceid " + str(args.deviceid) @@ -327,6 +328,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, create_db_cmd += " --prop-n " + str(args.prop_n) else: create_db_cmd += " --prop-n 0.1" # default from __main__ + create_db_cmd += " --qc-filter " + args.qc_filter # GPU options if args.use_gpu: assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid) From 10bd9fa7b6707b30ef29e77704630ca816c27f85 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 8 Feb 2021 13:10:54 +0000 Subject: [PATCH 037/327] Fix length range error --- scripts/poppunk_batch_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 33979610..6d831c43 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -317,7 +317,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, args.assign_args # QC options if None not in args.length_range: - create_db_cmd += " --length-range " + str(length_range[0]) + " " + str(length_range[1]) + create_db_cmd += " --length-range " + str(args.length_range[0]) + " " + str(args.length_range[1]) elif args.length_sigma is not None: create_db_cmd += " --length-sigma " + str(args.length_sigma) else: From 752772d6a92541c4917d83b93093cbb4573bf20f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 8 Feb 2021 13:54:51 +0000 Subject: [PATCH 038/327] Change GPU use options --- scripts/poppunk_batch_mst.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 6d831c43..87d9e112 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -51,7 +51,10 @@ def get_options(): aGroup.add_argument('--threads', help='Number of threads for parallelisation (int)', type = int, default = 1) - aGroup.add_argument('--use-gpu', help='Use GPU for analysis', + aGroup.add_argument('--gpu-dist', help='Use GPU for distance calculations', + default=False, + action='store_true') + aGroup.add_argument('--gpu-graph', help='Use GPU for network analysis', default=False, action='store_true') aGroup.add_argument('--deviceid', help='GPU device ID (int)', @@ -273,7 +276,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, create_db_cmd += " --prop-n " + str(args.prop_n) create_db_cmd += " --qc-filter " + args.qc_filter # GPU options - if args.use_gpu: + if args.gpu_dist: create_db_cmd += " --gpu-dist --deviceid " + str(args.deviceid) runCmd(create_db_cmd) @@ -297,7 +300,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, " --previous-clustering " + wd + \ "/" + os.path.basename(wd) + "_lineages.csv" # GPU options - if args.use_gpu: + if args.gpu_graph: mst_command = mst_command + " --gpu-graph" runCmd(mst_command) @@ -330,7 +333,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, create_db_cmd += " --prop-n 0.1" # default from __main__ create_db_cmd += " --qc-filter " + args.qc_filter # GPU options - if args.use_gpu: + if args.gpu_dist: assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid) runCmd(assign_cmd) @@ -347,7 +350,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, prev_wd + "/" + os.path.basename(prev_wd) + ".graphml" + \ " --previous-clustering " + batch_wd + \ "/" + os.path.basename(batch_wd) + "_lineages.csv" - if args.use_gpu: + if args.gpu_graph: mst_command = mst_command + " --gpu-graph" runCmd(mst_command) @@ -387,7 +390,7 @@ def writeClusterCsv(outfile, 
nodeNames, nodeLabels, clustering,
             else:
                 mst_command = mst_command + " --previous-clustering " + \
                               os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv")
-            if args.use_gpu:
+            if args.gpu_graph:
                 mst_command = mst_command + " --gpu-graph"
             runCmd(mst_command)

From fc549cc7418d7bc543fcf5a40195f79e2165136d Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 8 Feb 2021 16:02:56 +0000
Subject: [PATCH 039/327] Add QC to assign command

---
 scripts/poppunk_batch_mst.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 87d9e112..959e3c0a 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -320,18 +320,18 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
                          args.assign_args
             # QC options
             if None not in args.length_range:
-                create_db_cmd += " --length-range " + str(args.length_range[0]) + " " + str(args.length_range[1])
+                assign_cmd += " --length-range " + str(args.length_range[0]) + " " + str(args.length_range[1])
             elif args.length_sigma is not None:
-                create_db_cmd += " --length-sigma " + str(args.length_sigma)
+                assign_cmd += " --length-sigma " + str(args.length_sigma)
             else:
-                create_db_cmd += " --length-sigma 5" # default from __main__
+                assign_cmd += " --length-sigma 5" # default from __main__
             if args.upper_n is not None:
-                create_db_cmd += " --upper-n " + str(args.upper_n)
+                assign_cmd += " --upper-n " + str(args.upper_n)
             elif args.prop_n is not None:
-                create_db_cmd += " --prop-n " + str(args.prop_n)
+                assign_cmd += " --prop-n " + str(args.prop_n)
             else:
-                create_db_cmd += " --prop-n 0.1" # default from __main__
-            create_db_cmd += " --qc-filter " + args.qc_filter
+                assign_cmd += " --prop-n 0.1" # default from __main__
+            assign_cmd += " --qc-filter " + args.qc_filter
             # GPU options
             if args.gpu_dist:
                 assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid)
             runCmd(assign_cmd)

From 6836d0f578d965411c3ee6a5b1903a2e29a5ec76 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 9 Feb 2021 08:18:26 +0000
Subject: [PATCH 040/327] Manual updates from master

---
 PopPUNK/visualise.py | 30 ++++++++++++++++-------
 test/run_test.py     | 52 ++++++++++++++++++++++++------------------
 2 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index 75682248..31be5033 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -62,6 +62,9 @@ def get_options():
                          'from poppunk_assign [default = use that in the directory '
                          'of the query database]',
                          type = str)
+    iGroup.add_argument('--use-network',
+                        help='Specify a directory containing a .gt file to use for any graph visualisations',
+                        type = str)
     iGroup.add_argument('--display-cluster',
                          help='Column of clustering CSV to use for plotting',
                          default=None)
@@ -146,6 +149,7 @@ def generate_visualisations(query_db,
                             model_dir,
                             previous_clustering,
                             previous_query_clustering,
+                            use_network,
                             info_csv,
                             rapidnj,
                             tree,
@@ -203,9 +207,9 @@ def generate_visualisations(query_db,
     if distances is None:
         if query_db is None:
-            distances = os.path.basename(ref_db) + "/" + ref_db + ".dists"
+            distances = ref_db + "/" + os.path.basename(ref_db) + ".dists"
         else:
-            distances = os.path.basename(query_db) + "/" + query_db + ".dists"
+            distances = query_db + "/" + os.path.basename(query_db) + ".dists"
     else:
         distances = distances
@@ -220,16 +224,16 @@ def generate_visualisations(query_db,
         sys.stderr.write("Note: Distances in " + distances + " are from assign mode\n"
                          "Note: Distance will be extended to full all-vs-all distances\n"
                          "Note: 
Re-run poppunk_assign with --update-db to avoid this\n") - ref_db = os.path.basename(ref_db) + "/" + ref_db - rlist_original, qlist_original, self_ref, rr_distMat = readPickle(ref_db + ".dists") + ref_db_loc = ref_db + "/" + os.path.basename(ref_db) + rlist_original, qlist_original, self_ref, rr_distMat = readPickle(ref_db_loc + ".dists") if not self_ref: sys.stderr.write("Distances in " + ref_db + " not self all-vs-all either\n") sys.exit(1) kmers, sketch_sizes, codon_phased = readDBParams(query_db) addRandom(query_db, qlist, kmers, strand_preserved = strand_preserved, threads = threads) - query_db = os.path.basename(query_db) + "/" + query_db - qq_distMat = pp_sketchlib.queryDatabase(query_db, query_db, + query_db_loc = query_db + "/" + os.path.basename(query_db) + qq_distMat = pp_sketchlib.queryDatabase(query_db_loc, query_db_loc, qlist, qlist, kmers, True, False, threads, @@ -239,7 +243,7 @@ def generate_visualisations(query_db, # If the assignment was run with references, qrDistMat will be incomplete if rlist != rlist_original: rlist = rlist_original - qr_distMat = pp_sketchlib.queryDatabase(ref_db, query_db, + qr_distMat = pp_sketchlib.queryDatabase(ref_db_loc, query_db_loc, rlist, qlist, kmers, True, False, threads, @@ -291,7 +295,7 @@ def generate_visualisations(query_db, else: model_prefix = ref_db try: - model_file = os.path.basename(model_prefix) + "/" + os.path.basename(model_prefix) + model_file = model_prefix + "/" + os.path.basename(model_prefix) model = loadClusterFit(model_file + '_fit.pkl', model_file + '_fit.npz') except FileNotFoundError: @@ -321,6 +325,15 @@ def generate_visualisations(query_db, mode = mode, return_dict = True) + # Set graph location + if use_network is not None: + graph_dir = use_network + if graph_dir != prev_clustering: + sys.stderr.write("WARNING: Loading graph from a different directory to clusters\n") + sys.stderr.write("WARNING: Ensure that they are consistent\n") + else: + graph_dir = prev_clustering + # Join clusters with query clusters if required if not self: if previous_query_clustering is not None: @@ -460,6 +473,7 @@ def main(): args.model_dir, args.previous_clustering, args.previous_query_clustering, + args.use_network, args.info_csv, args.rapidnj, args.tree, diff --git a/test/run_test.py b/test/run_test.py index aaf7eaa2..9b0c2b75 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -12,20 +12,26 @@ sys.stderr.write("Extracting example dataset\n") subprocess.run("tar xf example_set.tar.bz2", shell=True, check=True) +if os.environ.get("POPPUNK_PYTHON"): + python_cmd = os.environ.get("POPPUNK_PYTHON") +else: + python_cmd = "python" + +#easy run sys.stderr.write("Running database creation (--create-db)\n") -subprocess.run("python ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_db --qc-filter prune --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_db --qc-filter prune --overwrite", shell=True, check=True) # create database with different QC options sys.stderr.write("Running database QC test (--create-db)\n") -subprocess.run("python ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_qc --qc-filter continue --length-range 2000000 3000000 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_qc --qc-filter continue 
--length-range 2000000 3000000 --overwrite", shell=True, check=True) #fit GMM sys.stderr.write("Running GMM model fit (--fit-model gmm)\n") -subprocess.run("python ../poppunk-runner.py --fit-model bgmm --ref-db example_db --K 4 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model bgmm --ref-db example_db --K 4 --overwrite", shell=True, check=True) #fit dbscan sys.stderr.write("Running DBSCAN model fit (--fit-model dbscan)\n") -subprocess.run("python ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --graph-weights", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --graph-weights", shell=True, check=True) #refine model with GMM sys.stderr.write("Running model refinement (--fit-model refine)\n") @@ -37,11 +43,11 @@ # lineage clustering sys.stderr.write("Running lineage clustering test (--fit-model lineage)\n") -subprocess.run("python ../poppunk-runner.py --fit-model lineage --output example_lineages --ranks 1,2,3,5 --ref-db example_db --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --output example_lineages --ranks 1,2,3,5 --ref-db example_db --overwrite", shell=True, check=True) #use model sys.stderr.write("Running with an existing model (--use-model)\n") -subprocess.run("python ../poppunk-runner.py --use-model --ref-db example_db --model-dir example_db --output example_use --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --use-model --ref-db example_db --model-dir example_db --output example_use --overwrite", shell=True, check=True) # tests of other command line programs sys.stderr.write("Testing C++ extension\n") @@ -49,21 +55,21 @@ #assign query sys.stderr.write("Running query assignment\n") -subprocess.run("python ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query --overwrite", shell=True, check=True) -subprocess.run("python ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) -subprocess.run("python ../poppunk_assign-runner.py --query single_query.txt --db example_db --output example_single_query --update-db --overwrite", shell=True, check=True) -subprocess.run("python ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --output example_single_query --update-db --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True) # viz sys.stderr.write("Running visualisations (poppunk_visualise)\n") -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --microreact", 
shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --phandango", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --query-db example_query --output example_viz_query --microreact", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages/example_lineages_lineages.csv --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --distances example_query/example_query.dists --ref-db example_db --model-dir example_lineages --query-db example_lineage_query --output example_viz_query_lineages --microreact", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --microreact", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --phandango", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --query-db example_query --output example_viz_query --microreact", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --distances example_query/example_query.dists --ref-db example_db --model-dir example_lineages --query-db example_lineage_query --output example_viz_query_lineages --microreact", shell=True, check=True) # MST sys.stderr.write("Running MST\n") @@ -72,15 +78,19 @@ # t-sne sys.stderr.write("Running tsne viz\n") -subprocess.run("python ../poppunk_tsne-runner.py --distances example_db/example_db.dists --output example_tsne --perplexity 5 --verbosity 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_tsne-runner.py --distances example_db/example_db.dists --output example_tsne --perplexity 5 --verbosity 1", shell=True, check=True) # prune sys.stderr.write("Running poppunk_prune\n") -subprocess.run("python ../poppunk_prune-runner.py --distances example_db/example_db.dists --ref-db example_db --remove subset.txt --output example_prune", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_prune-runner.py --distances example_db/example_db.dists --ref-db example_db --remove subset.txt --output example_prune", shell=True, check=True) # references 
sys.stderr.write("Running poppunk_references\n") -subprocess.run("python ../poppunk_references-runner.py --network example_db/example_db_graph.gt --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.gt --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db", shell=True, check=True) + +# web API +sys.stderr.write("Running API tests\n") +subprocess.run(python_cmd + " test_web.py", shell=True, check=True) sys.stderr.write("Tests completed\n") From 8b3995855f4fbfdbd89b1f8798e364dfe9ba3aa3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Feb 2021 09:29:36 +0000 Subject: [PATCH 041/327] Fix file paths in tests --- PopPUNK/visualise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 31be5033..27e5e1b5 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -320,7 +320,7 @@ def generate_visualisations(query_db, if model.indiv_fitted: sys.stderr.write("Note: Individual (core/accessory) fits found, but " "visualisation only supports combined boundary fit\n") - prev_clustering = os.path.dirname(model_file) + '/' + os.path.basename(model_file) + suffix + prev_clustering = os.path.basename(model_file) + '/' + os.path.basename(model_file) + suffix isolateClustering = readIsolateTypeFromCsv(prev_clustering, mode = mode, return_dict = True) @@ -339,7 +339,7 @@ def generate_visualisations(query_db, if previous_query_clustering is not None: prev_query_clustering = previous_query_clustering else: - prev_query_clustering = os.path.dirname(query_db) + '/' + os.path.basename(query_db) + suffix + prev_query_clustering = os.path.basename(query_db) + '/' + os.path.basename(query_db) + suffix queryIsolateClustering = readIsolateTypeFromCsv( prev_query_clustering, From 64d0bbe24a6297591f25c2b48cfcfd62b74f7d12 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Feb 2021 12:15:18 +0000 Subject: [PATCH 042/327] Add error message when distance file is missing --- PopPUNK/sparse_mst.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index c5d15757..34b47763 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -80,6 +80,9 @@ def main(): if not self: sys.stderr.write("This script must be run on a full all-v-all model\n") sys.exit(1) + else: + sys.stderr.write("Cannot find file " + args.distance_pkl + "\n") + sys.exit(1) # Check output path ok if not os.path.isdir(args.output): From f94648d6e53f194ad881b2d622f9d5c92082b8c8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Feb 2021 13:03:33 +0000 Subject: [PATCH 043/327] Prune MST when created --- PopPUNK/network.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d059d57d..3a385829 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -369,12 +369,16 @@ def constructNetwork(rlist, qlist, assignments, within_label, if edge_list: if weights is not None: for weight, (ref, query) in zip(weights, assignments): - connections.append((ref, query, weight)) + # sparse matrix is symmetrical, avoid redundant loops + if ref < query: + connections.append((ref, query, weight)) else: connections = assignments elif sparse_input is not None: for ref, query, weight in zip(sparse_input.row, sparse_input.col, 
sparse_input.data): - connections.append((ref, query, weight)) + # sparse matrix is symmetrical, avoid redundant loops + if ref < query: + connections.append((ref, query, weight)) else: for row_idx, (assignment, (ref, query)) in enumerate(zip(assignments, listDistInts(rlist, qlist, @@ -399,14 +403,16 @@ def constructNetwork(rlist, qlist, assignments, within_label, weights = True) for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights): edge_tuple = (ref, query, weight) - connections.append(edge_tuple) + if ref < query: + connections.append(edge_tuple) else: extra_sources, extra_targets = load_previous_network(prev_G,rlist, weights = False) for (ref, query) in zip(extra_sources, extra_targets): edge_tuple = (ref, query) - connections.append(edge_tuple) - + if ref < query: + connections.append(edge_tuple) + # build the graph G = gt.Graph(directed = False) G.add_vertex(len(vertex_labels)) @@ -834,6 +840,7 @@ def generate_minimum_spanning_tree(G, from_cugraph = False): if "weight" in G.edge_properties: mst_edge_prop_map = gt.min_spanning_tree(G, weights = G.ep["weight"]) mst_network = gt.GraphView(G, efilt = mst_edge_prop_map) + mst_network = gt.Graph(mst_network, prune = True) else: sys.stderr.write("generate_minimum_spanning_tree requires a weighted graph\n") raise RuntimeError("MST passed unweighted graph") From 88eb94b8fdd4328754de36b5dc5a80be9ec57cb5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Feb 2021 13:15:10 +0000 Subject: [PATCH 044/327] Restore correct test for visualisation --- test/run_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index 9b0c2b75..2337a282 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -68,7 +68,7 @@ subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --query-db example_query --output example_viz_query --microreact", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages/example_lineages_lineages.csv --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --distances example_query/example_query.dists --ref-db example_db --model-dir example_lineages --query-db example_lineage_query --output example_viz_query_lineages --microreact", shell=True, check=True) # MST From 25d57f4a9ffe467beeabff6283e30778aac06cbf Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Feb 2021 12:38:51 +0000 Subject: [PATCH 045/327] Add ability to extract distances from a sparse matrix --- scripts/poppunk_extract_distances.py | 46 ++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/scripts/poppunk_extract_distances.py b/scripts/poppunk_extract_distances.py index 6552fd03..3bbe2448 100755 --- a/scripts/poppunk_extract_distances.py +++ b/scripts/poppunk_extract_distances.py @@ -7,6 +7,7 @@ import numpy as 
np import argparse import dendropy +from scipy import sparse # command line parsing def get_options(): @@ -14,9 +15,17 @@ def get_options(): parser = argparse.ArgumentParser(description='Extract tab-separated file of distances from pkl and npy files', prog='extract_distances') # input options - parser.add_argument('--distances', required=True, help='Prefix of input pickle and numpy file of pre-calculated distances (required)') - parser.add_argument('--tree', required=False, help='Newick file containing phylogeny of isolates', default = None) - parser.add_argument('--output', required=True, help='Name of output file') + parser.add_argument('--distances', help='Prefix of input pickle (and optionally,' + ' numpy file) of pre-calculated distances (required)', + required=True) + parser.add_argument('--sparse', help='Sparse distance matrix file name', + default = None, + required = False) + parser.add_argument('--tree', help='Newick file containing phylogeny of isolates', + required = False, + default = None) + parser.add_argument('--output', help='Name of output file', + required = True) return parser.parse_args() @@ -71,7 +80,6 @@ def isolateNameToLabel(names): # open stored distances with open(args.distances + ".pkl", 'rb') as pickle_file: rlist, qlist, self = pickle.load(pickle_file) - X = np.load(args.distances + ".npy") # get names order r_names = isolateNameToLabel(rlist) @@ -91,14 +99,32 @@ def isolateNameToLabel(names): taxon_name = t.label.replace(' ','_') tip_index[r_names.index(taxon_name)] = t + # Load sparse matrix + if args.sparse is not None: + sparse_mat = sparse.load_npz(args.sparse) + else: + X = np.load(args.distances + ".npy") + # open output file with open(args.output, 'w') as oFile: - oFile.write("\t".join(['Query', 'Reference', 'Core', 'Accessory'])) + # Write header of output file + if args.sparse is not None: + oFile.write("\t".join(['Query', 'Reference', 'Core'])) + else: + oFile.write("\t".join(['Query', 'Reference', 'Core', 'Accessory'])) if args.tree is not None: oFile.write("\t" + 'Patristic') oFile.write("\n") - for i, (r_index, q_index) in enumerate(iterDistRows(r_names, q_names, r_names == q_names)): - oFile.write("\t".join([q_names[q_index], r_names[r_index], str(X[i,0]), str(X[i,1])])) - if args.tree is not None: - oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index]))) - oFile.write("\n") + # Write distances + if args.sparse is not None: + for (r_index, q_index, dist) in zip(sparse_mat.col, sparse_mat.row, sparse_mat.data): + oFile.write("\t".join([q_names[q_index], r_names[r_index], str(dist)])) + if args.tree is not None: + oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index]))) + oFile.write("\n") + else: + for i, (r_name, q_name) in enumerate(iterDistRows(r_names, q_names, r_names == q_names)): + oFile.write("\t".join([q_name, r_name, str(X[i,0]), str(X[i,1])])) + if args.tree is not None: + oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index]))) + oFile.write("\n") From 2b17f81bfc8c6063eda09d9d0bf94b15edf114d6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Feb 2021 12:41:46 +0000 Subject: [PATCH 046/327] Fix naming of temporary directories --- scripts/poppunk_batch_mst.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 959e3c0a..4b37af81 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -105,7 +105,9 @@ def get_options(): def writeBatch(rlines, batches, batch_selected, 
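# [Illustrative sketch, not part of the patch series] The fix just below
# guards the named temporary directory against leftovers from a previous run;
# the anonymous branch needs no guard because mkdtemp always picks a fresh
# name. The helper name and batch_id argument are stand-ins.
import os
import shutil
import tempfile

def make_batch_dir(batch_id, use_names=False):
    if use_names:
        tmpdir = "./pp_mst_" + str(batch_id)
        if os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)   # clear any stale directory first
        os.mkdir(tmpdir)
    else:
        # mkdtemp chooses a unique name, so no collision handling is needed
        tmpdir = tempfile.mkdtemp(prefix="pp_mst", dir="./")
    return tmpdir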
use_names = False): tmpdir = "" if use_names: - tmpdir = "./pp_mst_" + batch_selected + tmpdir = "./pp_mst_" + str(batch_selected) + if os.path.exists(tmpdir): + shutil.rmtree(tmpdir) os.mkdir(tmpdir) else: tmpdir = tempfile.mkdtemp(prefix="pp_mst", dir="./") @@ -399,6 +401,9 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, os.path.join(args.output,os.path.basename(args.output) + ".dists.pkl")) os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv"), os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")) + for rank in ranks: + os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_rank" + str(rank) + "_fit.npz"), + os.path.join(args.output,os.path.basename(args.output) + "_rank" + str(rank) + "_fit.npz")) # Merge with epidemiological data if requested if args.info_csv is not None: @@ -414,10 +419,12 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, except: if args.keep_intermediates == False: for tmpdir in tmp_dirs: - shutil.rmtree(tmpdir) + try: + shutil.rmtree(tmpdir) + except: + sys.stderr.write("Unable to remove " + tmpdir + "\n") print("Unexpected error:", sys.exc_info()[0]) raise if args.keep_intermediates == False: - shutil.rmtree(wd) shutil.rmtree(output_dir) From 90758221b922c58e2861591671f5e546eeaf9aa6 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Feb 2021 15:02:28 +0000 Subject: [PATCH 047/327] Remove unneeded return from sketchlib query Label order remains unchanged (unlike with mash) --- PopPUNK/__main__.py | 20 +++++++------- PopPUNK/assign.py | 62 ++++++++++++++++++++++---------------------- PopPUNK/network.py | 32 +++++++++++------------ PopPUNK/sketchlib.py | 44 +------------------------------ 4 files changed, 58 insertions(+), 100 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 63924906..cb5f981a 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -292,19 +292,19 @@ def main(): rNames = seq_names qNames = seq_names - refList, queryList, distMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = args.output, - queryPrefix = args.output, - klist = kmers, - self = True, - number_plot_fits = args.plot_fit, - threads = args.threads) - qcDistMat(distMat, refList, queryList, args.max_a_dist) + distMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = args.output, + queryPrefix = args.output, + klist = kmers, + self = True, + number_plot_fits = args.plot_fit, + threads = args.threads) + qcDistMat(distMat, rNames, qNames, args.max_a_dist) # Save results dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" - storePickle(refList, queryList, True, distMat, dists_out) + storePickle(rNames, qNames, True, distMat, dists_out) # Plot results plot_scatter(distMat, diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 4ce970a3..30b2d558 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -131,22 +131,22 @@ def assign_query(dbFuncs, codon_phased = codon_phased, calc_random = False) # run query - refList, queryList, qrDistMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = ref_db, - queryPrefix = output, - klist = kmers, - self = False, - number_plot_fits = plot_fit, - threads = threads) + qrDistMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = ref_db, + queryPrefix = output, + klist = kmers, + self = False, + number_plot_fits = plot_fit, + threads = threads) # QC distance matrix - qcPass = qcDistMat(qrDistMat, refList, queryList, max_a_dist) + qcPass = 
qcDistMat(qrDistMat, rNames, qNames, max_a_dist) # Load the network based on supplied options genomeNetwork, old_cluster_file = \ fetchNetwork(prev_clustering, model, - refList, + rNames, ref_graph = use_ref_graph, core_only = core_only, accessory_only = accessory_only) @@ -154,14 +154,14 @@ def assign_query(dbFuncs, if model.type == 'lineage': # Assign lineages by calculating query-query information addRandom(output, qNames, kmers, strand_preserved, overwrite, threads) - qlist1, qlist2, qqDistMat = queryDatabase(rNames = qNames, - qNames = qNames, - dbPrefix = output, - queryPrefix = output, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads) + qqDistMat = queryDatabase(rNames = qNames, + qNames = qNames, + dbPrefix = output, + queryPrefix = output, + klist = kmers, + self = True, + number_plot_fits = 0, + threads = threads) model.extend(qqDistMat, qrDistMat) genomeNetwork = {} @@ -182,18 +182,18 @@ def assign_query(dbFuncs, isolateClustering[rank] = \ printClusters(genomeNetwork[rank], - refList + queryList, + rNames + qNames, printCSV = False) overall_lineage = createOverallLineage(model.ranks, isolateClustering) writeClusterCsv( output + "/" + os.path.basename(output) + '_lineages.csv', - refList + queryList, - refList + queryList, + rNames + qNames, + rNames + qNames, overall_lineage, output_format = 'phandango', epiCsv = None, - queryNames = queryList, + queryNames = qNames, suffix = '_Lineage') else: @@ -206,14 +206,14 @@ def assign_query(dbFuncs, else: weights = None qqDistMat = \ - addQueryToNetwork(dbFuncs, refList, queryList, + addQueryToNetwork(dbFuncs, rNames, qNames, genomeNetwork, kmers, queryAssignments, model, output, update_db, strand_preserved, weights = weights, threads = threads) isolateClustering = \ - {'combined': printClusters(genomeNetwork, refList + queryList, + {'combined': printClusters(genomeNetwork, rNames + qNames, output + "/" + os.path.basename(output), old_cluster_file, external_clustering, @@ -248,9 +248,9 @@ def assign_query(dbFuncs, combined_seq, core_distMat, acc_distMat = \ update_distance_matrices(refList, rrDistMat, - queryList, qrDistMat, - qqDistMat, threads = threads) - assert combined_seq == refList + queryList + qNames, qrDistMat, + qqDistMat, threads = threads) + assert combined_seq == refList + qNames # Get full distance matrix and save complete_distMat = \ @@ -260,12 +260,12 @@ def assign_query(dbFuncs, # Clique pruning if model.type != 'lineage': - dbOrder = refList + queryList + dbOrder = refList + qNames newRepresentativesIndices, newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ extractReferences(genomeNetwork, dbOrder, output, refList, threads = threads) # intersection that maintains order - newQueries = [x for x in queryList if x in frozenset(newRepresentativesNames)] + newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] # could also have newRepresentativesNames in this diff (should be the same) - but want # to ensure consistency with the network in case of bad input/bugs @@ -280,12 +280,12 @@ def assign_query(dbFuncs, genomeNetwork.save(output + "/" + os.path.basename(output) + '.refs_graph.gt', fmt = 'gt') removeFromDB(output, output, names_to_remove) os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", - output + "/" + os.path.basename(output) + ".refs.h5") + output + "/" + os.path.basename(output) + ".refs.h5") # ensure sketch and distMat order match assert postpruning_combined_seq == refList + newQueries else: - storePickle(refList, queryList, False, 
qrDistMat, dists_out) + storePickle(refList, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: if model.type == 'lineage': genomeNetwork[min(model.ranks)].save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b5c9cadd..d55a2521 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -489,14 +489,14 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, if queryQuery: sys.stderr.write("Calculating all query-query distances\n") addRandom(queryDB, qList, kmers, strand_preserved, threads = threads) - qlist1, qlist2, qqDistMat = queryDatabase(rNames = qList, - qNames = qList, - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads) + qqDistMat = queryDatabase(rNames = qList, + qNames = qList, + dbPrefix = queryDB, + queryPrefix = queryDB, + klist = kmers, + self = True, + number_plot_fits = 0, + threads = threads) queryAssignation = model.assign(qqDistMat) for row_idx, (assignment, (ref, query)) in enumerate(zip(queryAssignation, listDistInts(qList, qList, self = True))): @@ -519,14 +519,14 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, # use database construction methods to find links between unassigned queries addRandom(queryDB, qList, kmers, strand_preserved, threads = threads) - qlist1, qlist2, qqDistMat = queryDatabase(rNames = list(unassigned), - qNames = list(unassigned), - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads) + qqDistMat = queryDatabase(rNames = list(unassigned), + qNames = list(unassigned), + dbPrefix = queryDB, + queryPrefix = queryDB, + klist = kmers, + self = True, + number_plot_fits = 0, + threads = threads) queryAssignation = model.assign(qqDistMat) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 37ead86d..96509c3d 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -517,10 +517,6 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num (default = 0) Returns: - refList (list) - Names of reference sequences - queryList (list) - Names of query sequences distMat (numpy.array) Core distances (column 0) and accessory distances (column 1) between refList and queryList @@ -568,46 +564,8 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num distMat = pp_sketchlib.queryDatabase(ref_db, query_db, rNames, qNames, klist, True, False, threads, use_gpu, deviceid) - return(rNames, qNames, distMat) + return distMat -def calculateQueryQueryDistances(dbFuncs, qlist, kmers, - queryDB, threads = 1): - """Calculates distances between queries. 
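# [Illustrative sketch, not part of the patch series] With queryDatabase now
# returning only the distance matrix, callers recover which row is which from
# the input name order alone. A minimal pairing for a self-comparison,
# assuming distMat is the long-form numpy array and the (j, i) upper-triangle
# row order used elsewhere in this series; the function name is a stand-in.
def label_self_dists(names, distMat):
    labelled = []
    row = 0
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            # column 0 holds the core distance, column 1 the accessory distance
            labelled.append((names[j], names[i], distMat[row, 0], distMat[row, 1]))
            row += 1
    return labelled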
- - Args: - dbFuncs (list) - List of backend functions from :func:`~PopPUNK.utils.setupDBFuncs` - rlist (list) - List of reference names - qlist (list) - List of query names - kmers (list) - List of k-mer sizes - queryDB (str) - Query database location - threads (int) - Number of threads to use if new db created - (default = 1) - - Returns: - qlist1 (list) - Ordered list of queries - distMat (numpy.array) - Query-query distances - """ - - queryDatabase = dbFuncs['queryDatabase'] - - qlist1, qlist2, distMat = queryDatabase(rNames = qlist, - qNames = qlist, - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads) - - return qlist1, distMat def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): """Calculates random match probability based on means of genomes From 18dd105406f7c35483acd99d58a1d7b6db60bc98 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Feb 2021 15:07:48 +0000 Subject: [PATCH 048/327] Fix assign import --- PopPUNK/assign.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 30b2d558..5e5808df 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -60,7 +60,6 @@ def assign_query(dbFuncs, from .prune_db import prune_distance_matrix - from .sketchlib import calculateQueryQueryDistances from .sketchlib import addRandom from .utils import storePickle @@ -285,7 +284,7 @@ def assign_query(dbFuncs, # ensure sketch and distMat order match assert postpruning_combined_seq == refList + newQueries else: - storePickle(refList, qNames, False, qrDistMat, dists_out) + storePickle(rNames, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: if model.type == 'lineage': genomeNetwork[min(model.ranks)].save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') From 92e5fde579d0891879f2f902d9da9521cb9e73d8 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Feb 2021 15:16:26 +0000 Subject: [PATCH 049/327] Check for slash in sample names --- PopPUNK/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index bd3d2995..4bf042ce 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -389,6 +389,9 @@ def readRfile(rFile, oneSeq=False): "Must contain sample name and file, tab separated\n") sys.exit(1) + if "/" in rFields[0]: + sys.stderr.write("Sample names may not contain slashes\n") + sys.exit(1) names.append(rFields[0]) sample_files = [] for sequence in rFields[1:]: From 6934b416608d9f3112e07821bd199ddf0d225556 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Feb 2021 16:28:41 +0000 Subject: [PATCH 050/327] Update distance extract script --- scripts/poppunk_extract_distances.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/scripts/poppunk_extract_distances.py b/scripts/poppunk_extract_distances.py index 6552fd03..914a8ade 100755 --- a/scripts/poppunk_extract_distances.py +++ b/scripts/poppunk_extract_distances.py @@ -20,9 +20,11 @@ def get_options(): return parser.parse_args() -def iterDistRows(refSeqs, querySeqs, self=True): +def listDistInts(refSeqs, querySeqs, self=True): """Gets the ref and query ID for each row of the distance matrix + Returns an iterable with ref and query ID pairs by row. + Args: refSeqs (list) List of reference sequence names. @@ -36,15 +38,21 @@ def iterDistRows(refSeqs, querySeqs, self=True): ref, query (str, str) Iterable of tuples with ref and query names for each distMat row. 
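# [Illustrative sketch, not part of the patch series] Why the trailing
# "return comparisons" added here is dropped again in a later patch of this
# series: any function containing "yield" is a generator, so a plain for-loop
# drives the yields and discards whatever the final return statement carries.
def pairs(n):
    for i in range(n):
        for j in range(i + 1, n):
            yield (j, i)
    return []  # executes, but ordinary iteration never sees this value

for j, i in pairs(3):
    print(j, i)  # prints: 1 0, 2 0, 2 1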
""" + num_ref = len(refSeqs) + num_query = len(querySeqs) if self: - assert refSeqs == querySeqs - for i, ref in enumerate(refSeqs): - for j in range(i + 1, len(refSeqs)): - yield(refSeqs[j], ref) + if refSeqs != querySeqs: + raise RuntimeError('refSeqs must equal querySeqs for db building (self = true)') + for i in range(num_ref): + for j in range(i + 1, num_ref): + yield(j, i) else: - for query in querySeqs: - for ref in refSeqs: - yield(ref, query) + comparisons = [(0,0)] * (len(refSeqs) * len(querySeqs)) + for i in range(num_query): + for j in range(num_ref): + yield(j, i) + + return comparisons def isolateNameToLabel(names): """Function to process isolate names to labels @@ -97,7 +105,7 @@ def isolateNameToLabel(names): if args.tree is not None: oFile.write("\t" + 'Patristic') oFile.write("\n") - for i, (r_index, q_index) in enumerate(iterDistRows(r_names, q_names, r_names == q_names)): + for i, (r_index, q_index) in enumerate(listDistInts(r_names, q_names, r_names == q_names)): oFile.write("\t".join([q_names[q_index], r_names[r_index], str(X[i,0]), str(X[i,1])])) if args.tree is not None: oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index]))) From 25f74fb45b2a6dd839fb120cdbb43945faedcda2 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Feb 2021 16:29:15 +0000 Subject: [PATCH 051/327] Save/copy model with assign + update --- PopPUNK/assign.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 5e5808df..7e9c6803 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -257,6 +257,11 @@ def assign_query(dbFuncs, pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) storePickle(combined_seq, combined_seq, True, complete_distMat, dists_out) + # Copy model if needed + if output != model.outPrefix: + model.outPrefix = output + model.save() + # Clique pruning if model.type != 'lineage': dbOrder = refList + qNames From c1f4c5a23fecc5d099529c5948642dfa8569ed4c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Feb 2021 09:28:57 +0000 Subject: [PATCH 052/327] Fix distance file names --- PopPUNK/__main__.py | 2 +- PopPUNK/assign.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index cb5f981a..11ab2ee3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -330,7 +330,7 @@ def main(): sys.stderr.write("Need to provide --ref-db where .h5 and .dists from " "--create-db mode were output") if args.distances is None: - distances = os.path.basename(args.ref_db) + "/" + args.ref_db + ".dists" + distances = args.ref_db + "/" + os.path.basename(args.ref_db) + ".dists" else: distances = args.distances if args.output is None: diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index ad13cd5c..13e77a81 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -456,7 +456,7 @@ def main(): setGtThreads(args.threads) if args.distances is None: - distances = os.path.basename(args.db) + "/" + args.db + ".dists" + distances = args.db + "/" + os.path.basename(args.db) + ".dists" else: distances = args.distances From d2c5d139846a0be0542b84095c5e4f8ff4406303 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Feb 2021 09:29:55 +0000 Subject: [PATCH 053/327] Fix name ordering function --- scripts/poppunk_extract_distances.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/poppunk_extract_distances.py b/scripts/poppunk_extract_distances.py index 80ff64b9..eb4805f1 100755 --- a/scripts/poppunk_extract_distances.py +++ 
b/scripts/poppunk_extract_distances.py @@ -61,8 +61,6 @@ def listDistInts(refSeqs, querySeqs, self=True): for j in range(num_ref): yield(j, i) - return comparisons - def isolateNameToLabel(names): """Function to process isolate names to labels appropriate for visualisation. From c91eb03f75502b6eb7af9d3ce95e22d7cf393b6f Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 18 Feb 2021 11:29:51 +0000 Subject: [PATCH 054/327] Fix ref name order in assign + update --- PopPUNK/__main__.py | 31 +++++++++++++++---------------- PopPUNK/assign.py | 28 +++++++++++++++++++++------- PopPUNK/sketchlib.py | 20 +++++++++++++++----- 3 files changed, 51 insertions(+), 28 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 11ab2ee3..359ac27b 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -280,31 +280,30 @@ def main(): # generate sketches and QC sequences createDatabaseDir(args.output, kmers) - seq_names = constructDatabase( - args.r_files, - kmers, - sketch_sizes, - args.output, - args.threads, - args.overwrite, - codon_phased = args.codon_phased, - calc_random = True) - - rNames = seq_names - qNames = seq_names - distMat = queryDatabase(rNames = rNames, - qNames = qNames, + seq_names_passing = \ + constructDatabase( + args.r_files, + kmers, + sketch_sizes, + args.output, + args.threads, + args.overwrite, + codon_phased = args.codon_phased, + calc_random = True) + + distMat = queryDatabase(rNames = seq_names_passing, + qNames = seq_names_passing, dbPrefix = args.output, queryPrefix = args.output, klist = kmers, self = True, number_plot_fits = args.plot_fit, threads = args.threads) - qcDistMat(distMat, rNames, qNames, args.max_a_dist) + qcDistMat(distMat, seq_names_passing, seq_names_passing, args.max_a_dist) # Save results dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" - storePickle(rNames, qNames, True, distMat, dists_out) + storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) # Plot results plot_scatter(distMat, diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 13e77a81..b4ab7c00 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -248,14 +248,28 @@ def assign_query(dbFuncs, else: distanceFiles = distances - refList, refList_copy, self, rrDistMat = readPickle(distanceFiles, - enforce_self = True) + # Load the previous distances + refList_loaded, refList_copy, self, rrDistMat = \ + readPickle(distanceFiles, + enforce_self = True) + # qrDistMat: order of ref labels is the same as in the database (usually + # ordered). Order in original rrDistMat is arbitrary, leading to an + # awkwardness here. 
We prefer to reorder the qrDistMat to match, as it is + # usually smaller and has a simpler layout in long form + # At the end, rNames is updated to match what has been loaded + if refList_loaded != rNames: + match_order = [rNames.index(i) for i in refList_loaded] * len(qNames) + for q_offset in range(len(qNames)): + for r_offset in range(len(rNames)): + match_order[q_offset * len(rNames) + r_offset] += q_offset * len(rNames) + qrDistMat = qrDistMat[match_order, :] + rNames = refList_loaded combined_seq, core_distMat, acc_distMat = \ - update_distance_matrices(refList, rrDistMat, + update_distance_matrices(rNames, rrDistMat, qNames, qrDistMat, qqDistMat, threads = threads) - assert combined_seq == refList + qNames + assert combined_seq == rNames + qNames # Get full distance matrix and save complete_distMat = \ @@ -270,10 +284,10 @@ def assign_query(dbFuncs, # Clique pruning if model.type != 'lineage': - dbOrder = refList + qNames + dbOrder = rNames + qNames newRepresentativesIndices, newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ - extractReferences(genomeNetwork, dbOrder, output, refList, threads = threads) + extractReferences(genomeNetwork, dbOrder, output, rNames, threads = threads) # intersection that maintains order newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] @@ -293,7 +307,7 @@ def assign_query(dbFuncs, output + "/" + os.path.basename(output) + ".refs.h5") # ensure sketch and distMat order match - assert postpruning_combined_seq == refList + newQueries + assert postpruning_combined_seq == rNames + newQueries else: storePickle(rNames, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 96509c3d..60f42a54 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -388,6 +388,10 @@ def constructDatabase(assemblyList, klist, sketch_size, oPrefix, deviceid (int) GPU device id (default = 0) + Returns: + names (list) + List of names included in the database (some may be pruned due + to QC) """ # read file names names, sequences = readRfile(assemblyList) @@ -417,6 +421,7 @@ def constructDatabase(assemblyList, klist, sketch_size, oPrefix, # QC sequences if qc_dict['run_qc']: filtered_names = sketchlibAssemblyQC(oPrefix, + names, klist, qc_dict, strand_preserved, @@ -567,13 +572,15 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num return distMat -def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): +def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads): """Calculates random match probability based on means of genomes in assemblyList, and looks for length outliers. 
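# [Illustrative sketch, not part of the patch series] A toy walk-through of
# the row permutation introduced above for assign.py: qrDistMat is laid out
# query-major, one block of len(rNames) rows per query, so the same
# within-block permutation is repeated with a per-block offset. The sample
# names and values here are made up.
rNames = ["s1", "s2", "s3"]          # order from the new query run
refList_loaded = ["s2", "s1", "s3"]  # order stored with the old distances
qNames = ["q1", "q2"]

match_order = [rNames.index(i) for i in refList_loaded] * len(qNames)
for q_offset in range(len(qNames)):
    for r_offset in range(len(rNames)):
        match_order[q_offset * len(rNames) + r_offset] += q_offset * len(rNames)

print(match_order)  # [1, 0, 2, 4, 3, 5] -- block [1, 0, 2], then shifted by 3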
Args: prefix (str) Prefix of output files + names (list) + Names of samples to QC klist (list) List of k-mer sizes to sketch qc_dict (dict) @@ -605,10 +612,11 @@ def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): # iterate through sketches for dataset in read_grp: - # test thresholds - remove = False - seq_length[dataset] = hdf_in['sketches'][dataset].attrs['length'] - seq_ambiguous[dataset] = hdf_in['sketches'][dataset].attrs['missing_bases'] + if dataset in names: + # test thresholds + remove = False + seq_length[dataset] = hdf_in['sketches'][dataset].attrs['length'] + seq_ambiguous[dataset] = hdf_in['sketches'][dataset].attrs['missing_bases'] # calculate thresholds # get mean length @@ -692,6 +700,8 @@ def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): del hdf_in['random'] hdf_in.close() + # This gives back retained in the same order as names + retained = [x for x in names if x in frozenset(retained)] return retained def fitKmerCurve(pairwise, klist, jacobian): From 353dd1df6f905cb08d04fa1f6b2fdde7b78b494c Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 18 Feb 2021 11:34:32 +0000 Subject: [PATCH 055/327] Add sort on readRfile --- PopPUNK/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 4bf042ce..f15fe72c 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -413,6 +413,14 @@ def readRfile(rFile, oneSeq=False): sys.stderr.write("Non-unique names are " + ",".join(dupes) + "\n") sys.exit(1) + # Names are sorted on return + # We have had issues (though they should be fixed) with unordered input + # not matching the database. This should help simplify things + list_iterable = zip(names, sequences) + sorted_names = sorted(list_iterable) + tuples = zip(*sorted_names) + names, sequences = [list(tuple) for tuple in tuples] + return (names, sequences) def isolateNameToLabel(names): From db54c7a8830e3f29ad7020cbce9acf5d13529c70 Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 18 Feb 2021 11:48:42 +0000 Subject: [PATCH 056/327] Add test of distance order --- test/clean_test.py | 5 ++++- test/rfile1.txt | 3 +++ test/rfile12.txt | 6 +++++ test/rfile2.txt | 3 +++ test/run_test.py | 22 ++++++++++-------- test/test-update.py | 37 +++++++++++++++++++++++++++++++ test/{test_web.py => test-web.py} | 0 7 files changed, 66 insertions(+), 10 deletions(-) create mode 100644 test/rfile1.txt create mode 100644 test/rfile12.txt create mode 100644 test/rfile2.txt create mode 100755 test/test-update.py rename test/{test_web.py => test-web.py} (100%) diff --git a/test/clean_test.py b/test/clean_test.py index 3ecc96a1..29852e14 100755 --- a/test/clean_test.py +++ b/test/clean_test.py @@ -40,7 +40,10 @@ def deleteDir(dirname): "example_tsne", "example_prune", "example_refs", - "example_api" + "example_api", + "batch1", + "batch2", + "batch12" ] for outDir in outputDirs: deleteDir(outDir) diff --git a/test/rfile1.txt b/test/rfile1.txt new file mode 100644 index 00000000..4f388da2 --- /dev/null +++ b/test/rfile1.txt @@ -0,0 +1,3 @@ +7 12673_8#24.contigs_velvet.fa +1 12673_8#34.contigs_velvet.fa +2 12673_8#43.contigs_velvet.fa diff --git a/test/rfile12.txt b/test/rfile12.txt new file mode 100644 index 00000000..e4f63584 --- /dev/null +++ b/test/rfile12.txt @@ -0,0 +1,6 @@ +7 12673_8#24.contigs_velvet.fa +1 12673_8#34.contigs_velvet.fa +2 12673_8#43.contigs_velvet.fa +6 12754_4#79.contigs_velvet.fa +4 12754_4#85.contigs_velvet.fa +5 12754_4#89.contigs_velvet.fa diff --git a/test/rfile2.txt 
b/test/rfile2.txt new file mode 100644 index 00000000..5f6e9a24 --- /dev/null +++ b/test/rfile2.txt @@ -0,0 +1,3 @@ +6 12754_4#79.contigs_velvet.fa +4 12754_4#85.contigs_velvet.fa +5 12754_4#89.contigs_velvet.fa diff --git a/test/run_test.py b/test/run_test.py index 2337a282..c6ed75e3 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -25,6 +25,10 @@ sys.stderr.write("Running database QC test (--create-db)\n") subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_qc --qc-filter continue --length-range 2000000 3000000 --overwrite", shell=True, check=True) +# test updating order is correct +sys.stderr.write("Running distance matrix order check (--update-db)\n") +subprocess.run(python_cmd + " test-update.py", shell=True, check=True) + #fit GMM sys.stderr.write("Running GMM model fit (--fit-model gmm)\n") subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model bgmm --ref-db example_db --K 4 --overwrite", shell=True, check=True) @@ -35,11 +39,11 @@ #refine model with GMM sys.stderr.write("Running model refinement (--fit-model refine)\n") -subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) -subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both", shell=True, check=True) -subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) -subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2", shell=True, check=True) -subprocess.run("python ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold", shell=True, check=True) +subprocess.run(python_cmd + "../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both", shell=True, check=True) +subprocess.run(python_cmd + "../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold", shell=True, check=True) # lineage clustering sys.stderr.write("Running lineage clustering test (--fit-model lineage)\n") @@ -51,7 +55,7 @@ # tests of other command line programs sys.stderr.write("Testing C++ extension\n") -subprocess.run("python test-refine.py", shell=True, check=True) +subprocess.run(python_cmd + " test-refine.py", shell=True, check=True) #assign query sys.stderr.write("Running query assignment\n") @@ -73,8 +77,8 @@ # MST sys.stderr.write("Running MST\n") -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree mst", shell=True, check=True) -subprocess.run("python ../poppunk_mst-runner.py --distance-pkl 
example_db/example_db.dists.pkl --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree mst", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_mst-runner.py --distance-pkl example_db/example_db.dists.pkl --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot", shell=True, check=True) # t-sne sys.stderr.write("Running tsne viz\n") @@ -90,7 +94,7 @@ # web API sys.stderr.write("Running API tests\n") -subprocess.run(python_cmd + " test_web.py", shell=True, check=True) +subprocess.run(python_cmd + " test-web.py", shell=True, check=True) sys.stderr.write("Tests completed\n") diff --git a/test/test-update.py b/test/test-update.py new file mode 100755 index 00000000..38c950c2 --- /dev/null +++ b/test/test-update.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# Copyright 2018-2020 John Lees and Nick Croucher + +"""Tests for PopPUNK""" + +import subprocess +import os +import sys +import shutil + +import numpy as np +from scipy import stats + +if os.environ.get("POPPUNK_PYTHON"): + python_cmd = os.environ.get("POPPUNK_PYTHON") +else: + python_cmd = "python" + +def run_regression(x, y, threshold = 0.99): + res = stats.linregress(x, y) + if res.rvalue**2 < threshold: + sys.stderr.write("Dist order failed: R^2 = " + str(res.rvalue**2) + "\n") + sys.exit(1) + +# Check that order is the same after doing 1 + 2 with --update-db, as doing all of 1 + 2 together +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile12.txt --output batch12 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile1.txt --output batch1 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch1 --query rfile2.txt --output batch2 --update-db --overwrite", shell=True, check=True) + +X1 = np.load("batch12/batch12.dists.npy") +X2 = np.load("batch2/batch2.dists.npy") + +run_regression(X1[:, 0], X2[:, 0]) +run_regression(X1[:, 1], X2[:, 1]) + + diff --git a/test/test_web.py b/test/test-web.py similarity index 100% rename from test/test_web.py rename to test/test-web.py From 2ca5dec552bccfa26169fb8b15f117aa5bac00ad Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 18 Feb 2021 12:47:56 +0000 Subject: [PATCH 057/327] Requery in update test --- test/test-update.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/test/test-update.py b/test/test-update.py index 38c950c2..9ffc9192 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -1,15 +1,20 @@ #!/usr/bin/env python -# Copyright 2018-2020 John Lees and Nick Croucher +# Copyright 2018-2021 John Lees and Nick Croucher -"""Tests for PopPUNK""" +"""Tests for PopPUNK --update-db order""" import subprocess -import os +import os, sys import sys import shutil +import pickle import numpy as np from scipy import stats +import h5py + +sys.path.insert(0, '/Users/jlees/Documents/Imperial/pp-sketchlib/build/lib.macosx-10.9-x86_64-3.8') +import pp_sketchlib if os.environ.get("POPPUNK_PYTHON"): python_cmd = 
os.environ.get("POPPUNK_PYTHON") @@ -18,8 +23,9 @@ def run_regression(x, y, threshold = 0.99): res = stats.linregress(x, y) + print("R^2: " + str(res.rvalue**2)) if res.rvalue**2 < threshold: - sys.stderr.write("Dist order failed: R^2 = " + str(res.rvalue**2) + "\n") + sys.stderr.write("Distance matrix order failed!\n") sys.exit(1) # Check that order is the same after doing 1 + 2 with --update-db, as doing all of 1 + 2 together @@ -28,9 +34,20 @@ def run_regression(x, y, threshold = 0.99): subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch1 --query rfile2.txt --output batch2 --update-db --overwrite", shell=True, check=True) -X1 = np.load("batch12/batch12.dists.npy") +# Load updated distances X2 = np.load("batch2/batch2.dists.npy") - +with open("batch2/batch2.dists.pkl", 'rb') as pickle_file: + rlist2, qlist, self = pickle.load(pickle_file) + +# Get same distances from the full database +ref_db = "batch12/batch12" +ref_h5 = h5py.File(ref_db + ".h5", 'r') +db_kmers = sorted(ref_h5['sketches/' + rlist2[0]].attrs['kmers']) +ref_h5.close() +X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist2, rlist2, db_kmers, + True, False, 1, False, 0) + +# Check distances match run_regression(X1[:, 0], X2[:, 0]) run_regression(X1[:, 1], X2[:, 1]) From 0e33312e940e7f5fcd2a5481a571d1c3de688796 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Feb 2021 13:22:08 +0000 Subject: [PATCH 058/327] Fix refine model test --- test/run_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index c6ed75e3..ef173f39 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -39,7 +39,7 @@ #refine model with GMM sys.stderr.write("Running model refinement (--fit-model refine)\n") -subprocess.run(python_cmd + "../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both", shell=True, check=True) subprocess.run(python_cmd + "../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2", shell=True, check=True) From dfc83fae0e56566fdbacd4de8b42b96f533ab206 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Feb 2021 13:32:35 +0000 Subject: [PATCH 059/327] Fix second lineage test --- test/run_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index ef173f39..a72b450d 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -41,7 +41,7 @@ sys.stderr.write("Running model refinement (--fit-model refine)\n") subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite 
--indiv-refine both", shell=True, check=True) -subprocess.run(python_cmd + "../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold", shell=True, check=True) From 31dcbfd2bb4f19ce05c79ba40a7dbcd1815e3a59 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Feb 2021 15:08:55 +0000 Subject: [PATCH 060/327] Add display cluster option for MST visualisation --- PopPUNK/visualise.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 9b135448..d977abc9 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -358,6 +358,10 @@ def generate_visualisations(query_db, if not overwrite: existing_tree = load_tree(output, "MST", distances=mst_distances) if existing_tree is None: + # Get a default clustering if none provided + if display_cluster is None: + display_cluster = list(isolateClustering.keys())[0] + # Get distance matrix complete_distMat = \ np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) @@ -371,7 +375,7 @@ def generate_visualisations(query_db, weights_type=mst_distances, summarise=False) mst_graph = generate_minimum_spanning_tree(G) - drawMST(mst_graph, output, isolateClustering, overwrite) + drawMST(mst_graph, output, isolateClustering, display_cluster, overwrite) mst_tree = mst_to_phylogeny(mst_graph, isolateNameToLabel(combined_seq)) else: mst_tree = existing_tree From d396870e9c98c9d8d914947309d09eff6fa710b9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Feb 2021 16:26:02 +0000 Subject: [PATCH 061/327] Add test for querying updated database --- test/rfile123.txt | 9 +++++++++ test/rfile3.txt | 3 +++ test/test-update.py | 20 ++++++++++++++++++++ 3 files changed, 32 insertions(+) create mode 100644 test/rfile123.txt create mode 100644 test/rfile3.txt diff --git a/test/rfile123.txt b/test/rfile123.txt new file mode 100644 index 00000000..af5a0ead --- /dev/null +++ b/test/rfile123.txt @@ -0,0 +1,9 @@ +7 12673_8#24.contigs_velvet.fa +1 12673_8#34.contigs_velvet.fa +2 12673_8#43.contigs_velvet.fa +6 12754_4#79.contigs_velvet.fa +4 12754_4#85.contigs_velvet.fa +5 12754_4#89.contigs_velvet.fa +8 12754_5#73.contigs_velvet.fa +3 12754_5#78.contigs_velvet.fa +9 12754_5#71.contigs_velvet.fa diff --git a/test/rfile3.txt b/test/rfile3.txt new file mode 100644 index 00000000..23104358 --- /dev/null +++ b/test/rfile3.txt @@ -0,0 +1,3 @@ +8 12754_5#73.contigs_velvet.fa +3 12754_5#78.contigs_velvet.fa +9 12754_5#71.contigs_velvet.fa diff --git a/test/test-update.py b/test/test-update.py index 9ffc9192..6ec36309 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -51,4 +51,24 @@ def run_regression(x, y, threshold = 0.99): run_regression(X1[:, 0], X2[:, 0]) run_regression(X1[:, 1], X2[:, 1]) +# Check that order is the same after doing 1 + 2 + 3 with --update-db, as doing all of 1 + 2 + 3 together +subprocess.run(python_cmd + " 
../poppunk-runner.py --create-db --r-files rfile123.txt --output batch123 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch123 --ranks 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch2 --query rfile3.txt --output batch3 --update-db --overwrite", shell=True, check=True) +# Load updated distances +X2 = np.load("batch3/batch3.dists.npy") +with open("batch3/batch3.dists.pkl", 'rb') as pickle_file: + rlist3, qlist, self = pickle.load(pickle_file) + +# Get same distances from the full database +ref_db = "batch123/batch123" +ref_h5 = h5py.File(ref_db + ".h5", 'r') +db_kmers = sorted(ref_h5['sketches/' + rlist3[0]].attrs['kmers']) +ref_h5.close() +X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist3, rlist3, db_kmers, + True, False, 1, False, 0) + +# Check distances match +run_regression(X1[:, 0], X2[:, 0]) +run_regression(X1[:, 1], X2[:, 1]) From 44ec04d8ff08867872625bb7e8ba2101ff0889b8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Feb 2021 19:48:32 +0000 Subject: [PATCH 062/327] Remove hard coded file path --- test/test-update.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test-update.py b/test/test-update.py index 6ec36309..28c986f8 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -13,7 +13,6 @@ from scipy import stats import h5py -sys.path.insert(0, '/Users/jlees/Documents/Imperial/pp-sketchlib/build/lib.macosx-10.9-x86_64-3.8') import pp_sketchlib if os.environ.get("POPPUNK_PYTHON"): From 5c6ebf5a04119bdd717c71460c1034c1b0ded855 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 19 Feb 2021 07:24:40 +0000 Subject: [PATCH 063/327] Add sparse distance matrix update testing --- test/test-update.py | 61 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/test/test-update.py b/test/test-update.py index 28c986f8..22aa630e 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -12,6 +12,7 @@ import numpy as np from scipy import stats import h5py +import scipy.sparse import pp_sketchlib @@ -27,10 +28,43 @@ def run_regression(x, y, threshold = 0.99): sys.stderr.write("Distance matrix order failed!\n") sys.exit(1) +def compare_sparse_matrices(d1,d2,r1,r2): + d1_pairs = get_seq_tuples(d1.row,d1.col,r1) + d2_pairs = get_seq_tuples(d2.row,d2.col,r2) + d1_dists = [] + d2_dists = [] + + for (pair1,dist1) in zip(d1_pairs,d1.data): + for (pair2,dist2) in zip(d2_pairs,d2.data): + if pair1 == pair2: + d1_dists.append(dist1) + d2_dists.append(dist2) + break + + run_regression(np.asarray(d1_dists),np.asarray(d2_dists)) + +def get_seq_tuples(rows,cols,names): + tuple_list = [] + for (i,j) in zip(rows,cols): + sorted_pair = tuple(sorted((names[i],names[j]))) + tuple_list.append(sorted_pair) + return tuple_list + +def old_get_seq_tuples(rows,cols): + max_seqs = np.maximum(rows,cols) + min_seqs = np.minimum(rows,cols) + concat_seqs = np.vstack((max_seqs,min_seqs)) + seq_pairs = concat_seqs.T + seq_tuples = [tuple(row) for row in seq_pairs] + return seq_tuples + +# Check distances after one query + # Check that order is the same after doing 1 + 2 with --update-db, as doing all of 1 + 2 together subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile12.txt --output batch12 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch12 --ranks 1,2", shell=True, check=True) subprocess.run(python_cmd + " 
../poppunk-runner.py --create-db --r-files rfile1.txt --output batch1 --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1,2", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch1 --query rfile2.txt --output batch2 --update-db --overwrite", shell=True, check=True) # Load updated distances @@ -50,24 +84,41 @@ def run_regression(x, y, threshold = 0.99): run_regression(X1[:, 0], X2[:, 0]) run_regression(X1[:, 1], X2[:, 1]) +# Check sparse distances after one query +with open("batch12/batch12.dists.pkl", 'rb') as pickle_file: + rlist1, qlist1, self = pickle.load(pickle_file) +S1 = scipy.sparse.load_npz("batch12/batch12_rank2_fit.npz") +S2 = scipy.sparse.load_npz("batch2/batch2_rank2_fit.npz") +compare_sparse_matrices(S1,S2,rlist1,rlist2) + +# Check distances after second query + # Check that order is the same after doing 1 + 2 + 3 with --update-db, as doing all of 1 + 2 + 3 together subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile123.txt --output batch123 --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch123 --ranks 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch123 --ranks 1,2", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch2 --query rfile3.txt --output batch3 --update-db --overwrite", shell=True, check=True) # Load updated distances X2 = np.load("batch3/batch3.dists.npy") with open("batch3/batch3.dists.pkl", 'rb') as pickle_file: - rlist3, qlist, self = pickle.load(pickle_file) + rlist4, qlist, self = pickle.load(pickle_file) # Get same distances from the full database ref_db = "batch123/batch123" ref_h5 = h5py.File(ref_db + ".h5", 'r') -db_kmers = sorted(ref_h5['sketches/' + rlist3[0]].attrs['kmers']) +db_kmers = sorted(ref_h5['sketches/' + rlist4[0]].attrs['kmers']) ref_h5.close() -X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist3, rlist3, db_kmers, +X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist4, rlist4, db_kmers, True, False, 1, False, 0) # Check distances match run_regression(X1[:, 0], X2[:, 0]) run_regression(X1[:, 1], X2[:, 1]) + +# Check sparse distances after second query +with open("batch123/batch123.dists.pkl", 'rb') as pickle_file: + rlist3, qlist, self = pickle.load(pickle_file) +S3 = scipy.sparse.load_npz("batch123/batch123_rank2_fit.npz") +S4 = scipy.sparse.load_npz("batch3/batch3_rank2_fit.npz") + +compare_sparse_matrices(S3,S4,rlist3,rlist4) From d4dd508e35c998bd051539c62a9a7f38147a7e6d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 19 Feb 2021 19:14:42 +0000 Subject: [PATCH 064/327] Add maximum core distance --- PopPUNK/__main__.py | 6 ++++-- PopPUNK/utils.py | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 359ac27b..3639cf84 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -94,6 +94,8 @@ def get_options(): 'separate database [default = False]', default=False, action='store_true') qcGroup.add_argument('--max-a-dist', help='Maximum accessory distance to permit [default = 0.5]', default = 0.5, type = float) + qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', + default = 
0.5, type = float) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = 5, type = int) qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded ' @@ -299,7 +301,7 @@ def main(): self = True, number_plot_fits = args.plot_fit, threads = args.threads) - qcDistMat(distMat, seq_names_passing, seq_names_passing, args.max_a_dist) + qcDistMat(distMat, seq_names_passing, seq_names_passing, args.max_pi_dist, args.max_a_dist) # Save results dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" @@ -353,7 +355,7 @@ def main(): # Load the distances refList, queryList, self, distMat = readPickle(distances, enforce_self=True) - if qcDistMat(distMat, refList, queryList, args.max_a_dist) == False \ + if qcDistMat(distMat, refList, queryList, args.max_pi_dist, args.max_a_dist) == False \ and args.qc_filter == "stop": sys.stderr.write("Distances failed quality control (change QC options to run anyway)\n") sys.exit(1) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index f15fe72c..dffc7b06 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -199,7 +199,7 @@ def listDistInts(refSeqs, querySeqs, self=True): return comparisons -def qcDistMat(distMat, refList, queryList, a_max): +def qcDistMat(distMat, refList, queryList, c_max, a_max): """Checks distance matrix for outliers. At the moment just a threshold for accessory distance @@ -210,6 +210,8 @@ def qcDistMat(distMat, refList, queryList, a_max): Reference labels queryList (list) Query labels (or refList if self) + c_max (float) + Maximum core distance to allow a_max (float) Maximum accessory distance to allow @@ -224,7 +226,7 @@ def qcDistMat(distMat, refList, queryList, a_max): passed = False names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): - if distMat[i,1] > a_max: + if distMat[i,0] > c_max or distMat[i,1] > a_max: sys.stderr.write("WARNING: Accessory outlier at a=" + str(distMat[i,1]) + " 1:" + ref + " 2:" + query + "\n") @@ -500,4 +502,4 @@ def decisionBoundary(intercept, gradient): """ x = intercept[0] + intercept[1] * gradient y = intercept[1] + intercept[0] / gradient - return(x, y) \ No newline at end of file + return(x, y) From f592701f27c179cbde79b7325af5ec5753955b4d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 19 Feb 2021 19:30:06 +0000 Subject: [PATCH 065/327] Fix search for outlier core distances --- PopPUNK/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index dffc7b06..ebc239b7 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -222,12 +222,12 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max): passed = True # First check with numpy, which is quicker than iterating over everything - if np.any(distMat[:,1] > a_max): + if np.any(distMat[:,1] > a_max) or np.any(distMat[:,0] > c_max): passed = False names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): if distMat[i,0] > c_max or distMat[i,1] > a_max: - sys.stderr.write("WARNING: Accessory outlier at a=" + str(distMat[i,1]) + + sys.stderr.write("WARNING: Accessory outlier at c = " + str(distMat[i,0]) + " a = " + str(distMat[i,1]) + " 1:" + ref + " 2:" + query + "\n") return passed From 7af793e88a199a153183a378a2571911a0cef79c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Feb 2021 20:32:19 +0000 Subject: 
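
Patches 064 and 065 hinge on the same two-stage test: a vectorised numpy screen over the whole distance matrix, followed by a per-pair loop that names the offenders. A self-contained sketch of the screen (function name and values are illustrative):

    import numpy as np

    def find_outliers(distMat, c_max=0.5, a_max=0.5):
        # Column 0 holds core distances, column 1 accessory distances;
        # flag any row exceeding either threshold
        bad = (distMat[:, 0] > c_max) | (distMat[:, 1] > a_max)
        return np.flatnonzero(bad)

    dists = np.array([[0.01, 0.10], [0.70, 0.20], [0.02, 0.90]])
    print(find_outliers(dists))  # [1 2]
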
[PATCH 066/327] Avoid tree redrawing --- PopPUNK/visualise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index d977abc9..4517b46f 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -95,7 +95,7 @@ def get_options(): faGroup.add_argument('--phandango', help='Generate phylogeny and TSV for Phandango visualisation', default=False, action='store_true') faGroup.add_argument('--grapetree', help='Generate phylogeny and CSV for grapetree visualisation', default=False, action='store_true') faGroup.add_argument('--tree', help='Type of tree to calculate [default = nj]', type=str, default='nj', - choices=['nj', 'mst', 'both']) + choices=['nj', 'mst', 'both', 'none']) faGroup.add_argument('--mst-distances', help='Distances used to calculate a minimum spanning tree [default = core]', type=str, default='core', choices=accepted_weights_types) faGroup.add_argument('--rapidnj', help='Path to rapidNJ binary to build NJ tree for Microreact', default='rapidnj') From 06d9f152561b1d2cda45627aaaa7efc82395f09d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Feb 2021 21:12:47 +0000 Subject: [PATCH 067/327] Allow for pruning based on distances to a reference --- PopPUNK/__main__.py | 36 +++++++++++++++++++++++++++++++----- PopPUNK/utils.py | 19 +++++++++++++++---- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 3639cf84..8c67e9f2 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -96,6 +96,8 @@ def get_options(): default = 0.5, type = float) qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', default = 0.5, type = float) + qcGroup.add_argument('--reference-isolate', help='Isolate from which distances can be calculated for pruning [default = None]', + default = None, type = str) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = 5, type = int) qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded ' @@ -301,11 +303,35 @@ def main(): self = True, number_plot_fits = args.plot_fit, threads = args.threads) - qcDistMat(distMat, seq_names_passing, seq_names_passing, args.max_pi_dist, args.max_a_dist) - - # Save results - dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" - storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) + names_to_remove = qcDistMat(distMat, + seq_names_passing, + seq_names_passing, + args.max_pi_dist, + args.max_a_dist, + args.reference_isolate) + + # prune based on distance from reference if provided + if args.reference_isolate is not None and args.qc_filter == "prune": + # Remove sketches + db_name = args.output + '/' + os.path.basename(args.output) + '.h5' + filtered_db_name = args.output + '/' + 'filtered.' 
+ os.path.basename(args.output) + '.h5' + removeFromDB(db_name, + filtered_db_name, + names_to_remove, + full_names = True) + os.rename(filtered_db_name, db_name) + # Remove from distance matrix + prune_distance_matrix(seq_names_passing, + names_to_remove, + distMat, + args.output + "/" + os.path.basename(args.output) + ".dists") + # Remove from reflist + seq_names_passing = [seq_names_passing.remove(x) for x in names_to_remove] + sys.stderr.write("Successfully removed from the database: " + str(names_to_remove)) + else: + # Save results + dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" + storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) # Plot results plot_scatter(distMat, diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index ebc239b7..777bd495 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -199,7 +199,7 @@ def listDistInts(refSeqs, querySeqs, self=True): return comparisons -def qcDistMat(distMat, refList, queryList, c_max, a_max): +def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate): """Checks distance matrix for outliers. At the moment just a threshold for accessory distance @@ -214,12 +214,15 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max): Maximum core distance to allow a_max (float) Maximum accessory distance to allow + ref_isolate (str) + Name of reference from which pruning can occur Returns: passed (bool) False if any samples failed """ passed = True + to_prune = [] # First check with numpy, which is quicker than iterating over everything if np.any(distMat[:,1] > a_max) or np.any(distMat[:,0] > c_max): @@ -227,10 +230,18 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max): names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): if distMat[i,0] > c_max or distMat[i,1] > a_max: - sys.stderr.write("WARNING: Accessory outlier at c = " + str(distMat[i,0]) + " a = " + str(distMat[i,1]) + + sys.stderr.write("WARNING: Outlier at c = " + str(distMat[i,0]) + " a = " + str(distMat[i,1]) + " 1:" + ref + " 2:" + query + "\n") - - return passed + if ref_isolate is not None: + if ref == ref_isolate: + to_prune.append(query) + elif query == ref_isolate: + to_prune.append(ref) + + if ref_isolate is None: + return passed + else: + return to_prune def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): From 3bcca2ef781ad66ff722acec27dd18038facbb0e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Feb 2021 21:50:09 +0000 Subject: [PATCH 068/327] Fix filtering condition --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 8c67e9f2..25539884 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -311,7 +311,7 @@ def main(): args.reference_isolate) # prune based on distance from reference if provided - if args.reference_isolate is not None and args.qc_filter == "prune": + if args.reference_isolate is not None and len(names_to_remove) > 0 and args.qc_filter == "prune": # Remove sketches db_name = args.output + '/' + os.path.basename(args.output) + '.h5' filtered_db_name = args.output + '/' + 'filtered.' 
+ os.path.basename(args.output) + '.h5' From fde3e7d3dedd2719ed2157940de58929d81f8187 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Feb 2021 21:52:45 +0000 Subject: [PATCH 069/327] Add default reference isolate --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 777bd495..14ec39ad 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -199,7 +199,7 @@ def listDistInts(refSeqs, querySeqs, self=True): return comparisons -def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate): +def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): """Checks distance matrix for outliers. At the moment just a threshold for accessory distance From bd896a367274f55133d7253996f0ea01ac4050e2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 1 Mar 2021 13:52:48 +0000 Subject: [PATCH 070/327] Add QC options to assign --- PopPUNK/assign.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index b4ab7c00..fcb2eee3 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -36,6 +36,8 @@ def assign_query(dbFuncs, plot_fit, graph_weights, max_a_dist, + max_pi_dist, + reference_isolate, model_dir, strand_preserved, previous_clustering, @@ -142,7 +144,7 @@ def assign_query(dbFuncs, number_plot_fits = plot_fit, threads = threads) # QC distance matrix - qcPass = qcDistMat(qrDistMat, rNames, qNames, max_a_dist) + qcPass = qcDistMat(qrDistMat, rNames, qNames, max_c_dist, max_a_dist, reference_isolate) # Load the network based on supplied options genomeNetwork, old_cluster_file = \ @@ -368,6 +370,10 @@ def get_options(): 'separate database [default = False]', default=False, action='store_true') qcGroup.add_argument('--max-a-dist', help='Maximum accessory distance to permit [default = 0.5]', default = 0.5, type = float) + qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', + default = 0.5, type = float) + qcGroup.add_argument('--reference-isolate', help='Isolate from which distances can be calculated for pruning [default = None]', + default = None, type = str) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = None, type = int) qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded ' @@ -492,6 +498,8 @@ def main(): args.plot_fit, args.graph_weights, args.max_a_dist, + args.max_pi_dist, + args.reference_isolate, args.model_dir, args.strand_preserved, args.previous_clustering, From dbc6098410f84bd8ec994f70fcbda2feba8a9394 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 1 Mar 2021 14:01:12 +0000 Subject: [PATCH 071/327] Make pruning variable names consistent --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index fcb2eee3..b1b11236 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -144,7 +144,7 @@ def assign_query(dbFuncs, number_plot_fits = plot_fit, threads = threads) # QC distance matrix - qcPass = qcDistMat(qrDistMat, rNames, qNames, max_c_dist, max_a_dist, reference_isolate) + qcPass = qcDistMat(qrDistMat, rNames, qNames, max_pi_dist, max_a_dist, reference_isolate) # Load the network based on supplied options genomeNetwork, old_cluster_file = \ From 354bf97d50a5a4918defd802d2da1f6e59123302 Mon Sep 17 00:00:00 2001 From: 
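
One wrinkle in the pruning hunk of patch 067: list.remove() returns None, so rebuilding the passing list with [seq_names_passing.remove(x) for x in names_to_remove] rebinds it to a list of None values. A filtering comprehension avoids that; the names below are illustrative:

    names_to_remove = {"outlier_1", "outlier_2"}
    seq_names_passing = ["ref", "outlier_1", "sample_a", "outlier_2"]

    # Keep the survivors rather than mutating the list while rebinding it
    seq_names_passing = [x for x in seq_names_passing
                         if x not in names_to_remove]
    print(seq_names_passing)  # ['ref', 'sample_a']
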
nickjcroucher Date: Mon, 8 Mar 2021 13:26:48 +0000 Subject: [PATCH 072/327] Make data type in isolate clustering consistent --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 14ec39ad..33588739 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -289,7 +289,7 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): cluster_name = clustersCsv.columns[cls_idx] cluster_name = cluster_name.replace('__autocolour','') if return_dict: - clusters[cluster_name][row.Index] = str(row[cls_idx + 1]) + clusters[cluster_name][str(row.Index)] = str(row[cls_idx + 1]) else: if cluster_name not in clusters.keys(): clusters[cluster_name] = defaultdict(set) From d57141469198f17a7f04219410c1abd47d2e0037 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 09:45:59 +0000 Subject: [PATCH 073/327] Remove assumption that sparse matrices are symmetrical --- PopPUNK/network.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f77271ba..2c82fd81 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -369,9 +369,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, if edge_list: if weights is not None: for weight, (ref, query) in zip(weights, assignments): - # sparse matrix is symmetrical, avoid redundant loops - if ref < query: - connections.append((ref, query, weight)) + connections.append((ref, query, weight)) else: connections = assignments elif sparse_input is not None: From a6463fbb86b94655247a16b4863bc3397d164d58 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 10:12:31 +0000 Subject: [PATCH 074/327] Again remove assumption that sparse matrices are symmetrical --- PopPUNK/network.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2c82fd81..15074422 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -374,9 +374,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, connections = assignments elif sparse_input is not None: for ref, query, weight in zip(sparse_input.row, sparse_input.col, sparse_input.data): - # sparse matrix is symmetrical, avoid redundant loops - if ref < query: - connections.append((ref, query, weight)) + connections.append((ref, query, weight)) else: for row_idx, (assignment, (ref, query)) in enumerate(zip(assignments, listDistInts(rlist, qlist, From 4a255168c063d321ac0acde8a78db30109332f01 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 22:00:07 +0000 Subject: [PATCH 075/327] Add cugraph support for lineage graphs --- PopPUNK/__main__.py | 11 ++-- PopPUNK/network.py | 119 ++++++++++++++++++++++++++++++-------------- 2 files changed, 88 insertions(+), 42 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 25539884..48527953 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -155,6 +155,7 @@ def get_options(): other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]') other.add_argument('--gpu-sketch', default=False, action='store_true', help='Use a GPU when calculating sketches (read data only) [default = False]') other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]') + other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]') 
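
An illustrative invocation of the new flag, following the convention of the batch test scripts earlier in the series (python_cmd, the runner path and the database name are assumed from that context):

    subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage"
                   " --ref-db batch1 --ranks 1,2 --gpu-graph",
                   shell=True, check=True)
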
other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') other.add_argument('--version', action='version', @@ -455,7 +456,7 @@ def main(): queryList, assignments, model.within_label, - weights=weights) + weights = weights) else: # Lineage fit requires some iteration indivNetworks = {} @@ -471,13 +472,15 @@ def main(): refList, assignments[rank], 0, - edge_list=True, - weights=weights + edge_list = True, + weights = weights, + use_gpu = args.gpu_graph ) lineage_clusters[rank] = \ printClusters(indivNetworks[rank], refList, - printCSV = False) + printCSV = False, + use_gpu = args.gpu_graph) # print output of each rank as CSV overall_lineage = createOverallLineage(rank_list, lineage_clusters) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 15074422..611578c9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -309,7 +309,7 @@ def load_previous_network(prev_G_fn, rlist, weights=False): def constructNetwork(rlist, qlist, assignments, within_label, summarise = True, edge_list = False, weights = None, weights_type = 'euclidean', sparse_input = None, - previous_network = None): + previous_network = None, use_gpu = False): """Construct an unweighted, undirected network without self-loops. Nodes are samples and edges where samples are within the same cluster @@ -341,6 +341,8 @@ def constructNetwork(rlist, qlist, assignments, within_label, previous_network (str) Name of file containing a previous network to be integrated into this new network + use_gpu (bool) + Whether to use GPUs for network construction Returns: G (graph) @@ -408,37 +410,66 @@ def constructNetwork(rlist, qlist, assignments, within_label, edge_tuple = (ref, query) if ref < query: connections.append(edge_tuple) - - # build the graph - G = gt.Graph(directed = False) - G.add_vertex(len(vertex_labels)) - if weights is not None or sparse_input is not None: - eweight = G.new_ep("float") - G.add_edge_list(connections, eprops = [eweight]) - G.edge_properties["weight"] = eweight + # load GPU libraries if necessary + if use_gpu: + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + + # create DataFrame using edge tuples + if weights is not None or sparse_input is not None: + connections_df = pd.DataFrame(connections, columns =['source', 'destination', 'weights']) + else: + connections_df = pd.DataFrame(connections, columns =['source', 'destination']) + G_df = cudf.DataFrame.from_pandas(connections_df) + + # construct graph + G_cu = cugraph.Graph() + if weights is not None or sparse_input is not None: + G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) + else: + G_cu.from_cudf_edgelist(G_df, renumber=False) + quit() + return G_cu + else: - G.add_edge_list(connections) - - # add isolate ID to network - vid = G.new_vertex_property('string', - vals = vertex_labels) - G.vp.id = vid - - # print some summaries - if summarise: - (metrics, scores) = networkSummary(G) - sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]), - "\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]), - "\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]), - "\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]), - "\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]), - "\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]), - "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), - "\tScore (w/ weighted-betweenness)\t\t" + 
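
The GPU branch above assembles the network from a columnar edge list rather than from tuples added one by one. A pandas stand-in for the cudf frame (cudf.DataFrame takes the same arguments on device; the edges are illustrative):

    import pandas as pd

    connections = [(0, 1, 0.01), (1, 2, 0.02), (3, 4, 0.05)]
    G_df = pd.DataFrame(connections,
                        columns=['source', 'destination', 'weights'])
    # On device this frame is a cudf.DataFrame and is loaded with
    # cugraph.Graph().from_cudf_edgelist(G_df, edge_attr='weights',
    #                                    renumber=False)
    print(G_df)
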
"{:.4f}".format(scores[2])]) - + "\n") - - return G + + # build the graph + G = gt.Graph(directed = False) + G.add_vertex(len(vertex_labels)) + + if weights is not None or sparse_input is not None: + eweight = G.new_ep("float") + G.add_edge_list(connections, eprops = [eweight]) + G.edge_properties["weight"] = eweight + else: + G.add_edge_list(connections) + + # add isolate ID to network + vid = G.new_vertex_property('string', + vals = vertex_labels) + G.vp.id = vid + + # print some summaries + if summarise: + (metrics, scores) = networkSummary(G) + sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]), + "\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]), + "\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]), + "\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]), + "\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]), + "\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]), + "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), + "\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])]) + + "\n") + quit() + return G def networkSummary(G, calc_betweenness=True): """Provides summary values about the network @@ -621,7 +652,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, externalClusterCSV = None, printRef = True, printCSV = True, - clustering_type = 'combined'): + clustering_type = 'combined', use_gpu = False): """Get cluster assignments Also writes assignments to a CSV file @@ -650,6 +681,8 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, clustering_type (str) Type of clustering network, used for comparison with old clusters Default = 'combined' + use_gpu (bool) + Whether to use cugraph for network analysis Returns: clustering (dict) @@ -660,13 +693,23 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, raise RuntimeError("Trying to print query clusters with no query sequences") # get a sorted list of component assignments - component_assignments, component_frequencies = gt.label_components(G) - component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int) - newClusters = [set() for rank in range(len(component_frequency_ranks))] - for isolate_index, isolate_name in enumerate(rlist): - component = component_assignments.a[isolate_index] - component_rank = component_frequency_ranks[component] - newClusters[component_rank].add(isolate_name) + if use_gpu: + component_assignments = cugraph.components.connectivity.connected_components(G, directed = False) + component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) + newClusters = [set() for rank in range(component_frequencies.size)] + for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment + component = component_assignments[isolate_index] + component_rank = component_frequencies.index[component] + newClusters[component_rank].add(isolate_name) + else: + component_assignments, component_frequencies = gt.label_components(G) + component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int) + # use components to determine new clusters + newClusters = [set() for rank in range(len(component_frequency_ranks))] + for isolate_index, isolate_name in enumerate(rlist): + component = component_assignments.a[isolate_index] + component_rank = 
component_frequency_ranks[component] + newClusters[component_rank].add(isolate_name) oldNames = set() From c94fe04d2c4354f4da68f635b62304ee24351ed8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 22:09:45 +0000 Subject: [PATCH 076/327] Remove exit messages --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 611578c9..8bcbb136 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -435,7 +435,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: G_cu.from_cudf_edgelist(G_df, renumber=False) - quit() + return G_cu else: @@ -468,7 +468,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), "\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])]) + "\n") - quit() + return G def networkSummary(G, calc_betweenness=True): From db949c873b91cd69d304a671b51633b146f8e15b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 22:12:04 +0000 Subject: [PATCH 077/327] Load cugraph in processClusters --- PopPUNK/network.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 8bcbb136..5340302f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -694,6 +694,15 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, # get a sorted list of component assignments if use_gpu: + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + component_assignments = cugraph.components.connectivity.connected_components(G, directed = False) component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] From b1d153893c6babac7c2c208883c327041b043519 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 22:13:42 +0000 Subject: [PATCH 078/327] Fix connected component command --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5340302f..f1c832f5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -703,7 +703,7 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, sys.stderr.write("cugraph and cudf unavailable\n") raise ImportError(e) - component_assignments = cugraph.components.connectivity.connected_components(G, directed = False) + component_assignments = cugraph.components.connectivity.connected_components(G) component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment From 31683e164bf05b3b6d1515fc3fb3d0301f6872d2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 06:07:37 +0000 Subject: [PATCH 079/327] Change cuDf index use --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f1c832f5..25998a01 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -707,7 +707,7 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, component_frequencies = 
component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment - component = component_assignments[isolate_index] + component = component_assignments['labels'][isolate_index] component_rank = component_frequencies.index[component] newClusters[component_rank].add(isolate_name) else: From 2ae71a84197c18ff84158feb70c286729a938ad5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 06:11:15 +0000 Subject: [PATCH 080/327] Integer value conversion update --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 25998a01..23901efc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -709,7 +709,7 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment component = component_assignments['labels'][isolate_index] component_rank = component_frequencies.index[component] - newClusters[component_rank].add(isolate_name) + newClusters[component_rank.astype(int)].add(isolate_name) else: component_assignments, component_frequencies = gt.label_components(G) component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int) From b703f7a1d5856da11fdfa110cf67894437d1c358 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 06:25:58 +0000 Subject: [PATCH 081/327] Integer value conversion --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 23901efc..9a9573c3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -709,7 +709,7 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment component = component_assignments['labels'][isolate_index] component_rank = component_frequencies.index[component] - newClusters[component_rank.astype(int)].add(isolate_name) + newClusters[component_rank.astype(int).item()].add(isolate_name) else: component_assignments, component_frequencies = gt.label_components(G) component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int) From 2e90d28cbfe2570a0f9513a4ea713ad839bfe9f2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 10:38:52 +0000 Subject: [PATCH 082/327] Process components correctly with cugraph --- PopPUNK/network.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9a9573c3..3e349f5c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -707,9 +707,10 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment - component = component_assignments['labels'][isolate_index] - component_rank = component_frequencies.index[component] - newClusters[component_rank.astype(int).item()].add(isolate_name) + component = component_assignments['labels'].iloc[isolate_index].item() + component_rank_bool = component_frequencies.index == component + 
component_rank = np.argmax(component_rank_bool.to_array()) + newClusters[component_rank].add(isolate_name) else: component_assignments, component_frequencies = gt.label_components(G) component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int) From 0906f0c68f25119aa44fba1a741c14c2c55419c3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 10:55:28 +0000 Subject: [PATCH 083/327] Allow for checking of cugraphs --- PopPUNK/__main__.py | 4 +++- PopPUNK/network.py | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 48527953..d380d101 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -195,6 +195,7 @@ def main(): from .network import constructNetwork from .network import extractReferences from .network import printClusters + from .network import get_vertex_list from .plot import writeClusterCsv from .plot import plot_scatter @@ -495,7 +496,8 @@ def main(): genomeNetwork = indivNetworks[min(rank_list)] # Ensure all in dists are in final network - networkMissing = set(map(str,set(range(len(refList))).difference(list(genomeNetwork.vertices())))) + vertex_list = get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph) + networkMissing = set(map(str,set(range(len(refList))).difference(vertex_list))) if len(networkMissing) > 0: missing_isolates = [refList[m] for m in networkMissing] sys.stderr.write("WARNING: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3e349f5c..0cf4713f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -939,3 +939,25 @@ def generate_minimum_spanning_tree(G, from_cugraph = False): sys.stderr.write("Completed calculation of minimum-spanning tree\n") return mst_network + +def get_vertex_list(G, use_gpu = False): + """Generate a list of node indices + + Args: + G (network) + Graph tool network + use_gpu (bool) + Whether graph is a cugraph or not + [default = False] + + Returns: + vlist (list) + List of integers corresponding to nodes + """ + + if use_gpu: + vlist = G.nodes().tolist() + else: + vlist = list(G.vertices()) + + return vlist From c64a2c56b938b5cb622832ee5f7c4bb712c2dd1d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 11:00:50 +0000 Subject: [PATCH 084/327] Change cudf list conversion --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0cf4713f..79ed614c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -956,7 +956,7 @@ def get_vertex_list(G, use_gpu = False): """ if use_gpu: - vlist = G.nodes().tolist() + vlist = G.nodes().to_arrow().to_pylist() else: vlist = list(G.vertices()) From b84a269e1817090d2555d483a87b851ab07a7cad Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 11:25:07 +0000 Subject: [PATCH 085/327] Update printClusters flags --- PopPUNK/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index d380d101..bae50c16 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -506,7 +506,8 @@ def main(): isolateClustering = {fit_type: printClusters(genomeNetwork, refList, output + "/" + os.path.basename(output), - externalClusterCSV = args.external_clustering)} + externalClusterCSV = args.external_clustering, + use_gpu = args.gpu_graph)} # Write core and accessory based clusters, if they 
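
The sequence of fixes above (patches 079 to 082) converges on ranking component labels by size, so that cluster numbering always starts from the largest component. A pandas sketch of that ranking (cudf exposes the same value_counts interface on GPU; the labels are illustrative):

    import pandas as pd

    labels = pd.Series([3, 3, 3, 7, 7, 1])  # component label per isolate
    freq = labels.value_counts(sort=True, ascending=False)
    rank_of = {label: rank for rank, label in enumerate(freq.index)}
    print(rank_of)  # {3: 0, 7: 1, 1: 2}
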
worked if model.indiv_fitted: From cf1d85b15e596f2f7a72eca6991b55d9cf1d5e6b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 11:57:52 +0000 Subject: [PATCH 086/327] Allow for saving of cugraph objects --- PopPUNK/__main__.py | 9 +++------ PopPUNK/network.py | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index bae50c16..a371da64 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -535,9 +535,7 @@ def main(): fit_type = 'accessory' genomeNetwork = indivNetworks['accessory'] - genomeNetwork.save(output + "/" + \ - os.path.basename(output) + '_graph.gt', - fmt = 'gt') + save_network(genomeNetwork, prefix = output, suffix = "_graph", use_gpu = args.gpu_graph) #******************************# #* *# @@ -557,9 +555,8 @@ def main(): prune_distance_matrix(refList, names_to_remove, distMat, output + "/" + os.path.basename(output) + ".refs.dists") # Save reference network - genomeNetwork.save(output + "/" + \ - os.path.basename(output) + '.refs_graph.gt', - fmt = 'gt') + save_network(genomeNetwork, prefix = output, suffix = ".refs_graph", + use_gpu = args.gpu_graph) removeFromDB(args.ref_db, output, names_to_remove) os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", output + "/" + os.path.basename(output) + ".refs.h5") diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 79ed614c..3de225fd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -961,3 +961,25 @@ def get_vertex_list(G, use_gpu = False): vlist = list(G.vertices()) return vlist + +def save_network(G, prefix = None, suffix = None, use_gpu = False): + """Save a network to disc + + Args: + G (network) + Graph tool network + prefix (str) + Prefix for output file + use_gpu (bool) + Whether graph is a cugraph or not + [default = False] + + """ + file_name = prefix + "/" + os.path.basename(prefix) + '_' + suffix + os.path.basename(prefix) + '_graph.csv.bz2' + if use_gpu: + G.to_pandas_edgelist().to_csv(file_name + '.csv.bz2', + compression='bz2') + else: + G.save(file_name + '.gt', + fmt = 'gt') From 81b70339100a440eaced4c5841b8373d93edcca0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 12:00:24 +0000 Subject: [PATCH 087/327] Fix missing function reference --- PopPUNK/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index a371da64..9451f2ea 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -196,6 +196,7 @@ def main(): from .network import extractReferences from .network import printClusters from .network import get_vertex_list + from .network import save_network from .plot import writeClusterCsv from .plot import plot_scatter From 1f708d62ebb854f2930446eaf5005cb46d54fec3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 21:49:26 +0000 Subject: [PATCH 088/327] Change vertex list to set for difference --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 9451f2ea..ca9cf606 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -497,7 +497,7 @@ def main(): genomeNetwork = indivNetworks[min(rank_list)] # Ensure all in dists are in final network - vertex_list = get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph) + vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph)) networkMissing = set(map(str,set(range(len(refList))).difference(vertex_list))) if len(networkMissing) > 0: missing_isolates = 
[refList[m] for m in networkMissing] From 79b754fbc457e89f5e908e5d4014c33559db4dd4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 21:54:04 +0000 Subject: [PATCH 089/327] GPU graphs for non-lineage mode --- PopPUNK/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index ca9cf606..165325ba 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -458,7 +458,8 @@ def main(): queryList, assignments, model.within_label, - weights = weights) + weights = weights, + use_gpu = args.gpu_graph) else: # Lineage fit requires some iteration indivNetworks = {} From 6566544fd7a9196d680b7fc136a96e84f109d6ca Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 22:31:59 +0000 Subject: [PATCH 090/327] Change node index extraction --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3de225fd..be557649 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -956,7 +956,7 @@ def get_vertex_list(G, use_gpu = False): """ if use_gpu: - vlist = G.nodes().to_arrow().to_pylist() + vlist = G.nodes().to_array().tolist() else: vlist = list(G.vertices()) From bec01e5f5100c2913f9b84f668b7ca696105a824 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 05:41:56 +0000 Subject: [PATCH 091/327] Restore missing nodes to GPU graph --- PopPUNK/__main__.py | 2 +- PopPUNK/network.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 165325ba..6731b79a 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -499,7 +499,7 @@ def main(): # Ensure all in dists are in final network vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph)) - networkMissing = set(map(str,set(range(len(refList))).difference(vertex_list))) + networkMissing = set(set(range(len(refList))).difference(vertex_list)) if len(networkMissing) > 0: missing_isolates = [refList[m] for m in networkMissing] sys.stderr.write("WARNING: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") diff --git a/PopPUNK/network.py b/PopPUNK/network.py index be557649..4cce839b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -431,6 +431,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, # construct graph G_cu = cugraph.Graph() + G_cu.add_nodes_from(len(vertex_labels)) if weights is not None or sparse_input is not None: G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: From d216576555bb78616a3698bbcaa328cdecf21dd4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 05:48:44 +0000 Subject: [PATCH 092/327] Add missing nodes --- PopPUNK/network.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4cce839b..83c67f22 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -431,12 +431,11 @@ def constructNetwork(rlist, qlist, assignments, within_label, # construct graph G_cu = cugraph.Graph() - G_cu.add_nodes_from(len(vertex_labels)) if weights is not None or sparse_input is not None: G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: G_cu.from_cudf_edgelist(G_df, renumber=False) - + G_cu.add_nodes_from(len(vertex_labels)) # add any missing unconnected nodes return G_cu else: From a369c9d7fcdedf14e1b84c2fe00c39b5fee9e40b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 05:51:57 +0000 Subject: 
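
The missing-vertex check above has to behave identically for both backends. A graph-tool sketch of the CPU side, sized so that one isolate is deliberately absent from the network (the counts are illustrative):

    import graph_tool.all as gt

    g = gt.Graph(directed=False)
    g.add_vertex(4)                                    # vertices 0..3
    vertex_list = set(int(v) for v in g.vertices())
    network_missing = set(range(5)).difference(vertex_list)
    print(network_missing)  # {4}: one isolate not in the network
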
[PATCH 093/327] Use range list in place of integer --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 83c67f22..b4d52fbd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -435,7 +435,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: G_cu.from_cudf_edgelist(G_df, renumber=False) - G_cu.add_nodes_from(len(vertex_labels)) # add any missing unconnected nodes + G_cu.add_nodes_from(range(vertex_labels)) # add any missing unconnected nodes return G_cu else: From d51d8fd07ab7be16e23dcf37b73411a77564db9f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 06:14:15 +0000 Subject: [PATCH 094/327] Fix range list --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b4d52fbd..d2c47854 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -435,7 +435,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: G_cu.from_cudf_edgelist(G_df, renumber=False) - G_cu.add_nodes_from(range(vertex_labels)) # add any missing unconnected nodes + G_cu.add_nodes_from(range(len(vertex_labels))) # add any missing unconnected nodes return G_cu else: From a4c3210e2971026bbb8c55afd1fa43011e703a70 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 07:32:22 +0000 Subject: [PATCH 095/327] Remove pandas intermediate for data frame --- PopPUNK/network.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d2c47854..130a56bc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -424,10 +424,9 @@ def constructNetwork(rlist, qlist, assignments, within_label, # create DataFrame using edge tuples if weights is not None or sparse_input is not None: - connections_df = pd.DataFrame(connections, columns =['source', 'destination', 'weights']) + connections_df = cudf.DataFrame(connections, columns =['source', 'destination', 'weights']) else: - connections_df = pd.DataFrame(connections, columns =['source', 'destination']) - G_df = cudf.DataFrame.from_pandas(connections_df) + connections_df = cudf.DataFrame(connections, columns =['source', 'destination']) # construct graph G_cu = cugraph.Graph() From ca28aa5f7bb7014067ab6bcec95ce84becf1bfe1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 07:39:02 +0000 Subject: [PATCH 096/327] Fix data frame name --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 130a56bc..143be773 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -424,9 +424,9 @@ def constructNetwork(rlist, qlist, assignments, within_label, # create DataFrame using edge tuples if weights is not None or sparse_input is not None: - connections_df = cudf.DataFrame(connections, columns =['source', 'destination', 'weights']) + G_df = cudf.DataFrame(connections, columns =['source', 'destination', 'weights']) else: - connections_df = cudf.DataFrame(connections, columns =['source', 'destination']) + G_df = cudf.DataFrame(connections, columns =['source', 'destination']) # construct graph G_cu = cugraph.Graph() From 702c6b97a753d844e03c35506c1ededd92ce5c1e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:20:30 +0000 Subject: [PATCH 097/327] Add in isolated 
vertices in GPU graph --- PopPUNK/network.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 143be773..76a5be47 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -428,13 +428,28 @@ def constructNetwork(rlist, qlist, assignments, within_label, else: G_df = cudf.DataFrame(connections, columns =['source', 'destination']) + # ensure the highest-integer node is included in the edge list + # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 + max_in_df = G_df.max() + max_in_vertex_labels = len(vertex_labels) + print("Max in DF is " + str(max_in_df)) + print("Max in labels is " + str(max_in_vertex_labels)) + if max_in_df.astype(int).item() != max_in_vertex_labels: + if weights is not None or sparse_input is not None: + self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0) + G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination', 'weights']) + else: + self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels) + G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination']) + G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) + # construct graph G_cu = cugraph.Graph() if weights is not None or sparse_input is not None: G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: G_cu.from_cudf_edgelist(G_df, renumber=False) - G_cu.add_nodes_from(range(len(vertex_labels))) # add any missing unconnected nodes + return G_cu else: From 8c14b5ee59a080e669656149799b780f23ed07a9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:27:48 +0000 Subject: [PATCH 098/327] Change max to int conversion --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 76a5be47..fb70f88c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -433,8 +433,9 @@ def constructNetwork(rlist, qlist, assignments, within_label, max_in_df = G_df.max() max_in_vertex_labels = len(vertex_labels) print("Max in DF is " + str(max_in_df)) + print("Max type is " + str(type(max_in_df))) print("Max in labels is " + str(max_in_vertex_labels)) - if max_in_df.astype(int).item() != max_in_vertex_labels: + if max_in_df.iloc[0].item() != max_in_vertex_labels: if weights is not None or sparse_input is not None: self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0) G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination', 'weights']) From 414efff569851a822b1a7d96d6e073aed3c0aae3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:33:07 +0000 Subject: [PATCH 099/327] Change max calculation --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index fb70f88c..4346a6a5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -430,8 +430,8 @@ def constructNetwork(rlist, qlist, assignments, within_label, # ensure the highest-integer node is included in the edge list # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 - max_in_df = G_df.max() - max_in_vertex_labels = len(vertex_labels) + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = len(vertex_labels)-1 print("Max in DF is " + str(max_in_df)) print("Max type is " + str(type(max_in_df))) print("Max in labels is " + 
str(max_in_vertex_labels)) From 6938b5319e3101d15e25154d56e3b99d360a4b3e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:37:07 +0000 Subject: [PATCH 100/327] Change max format --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4346a6a5..f2bed628 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -435,7 +435,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, print("Max in DF is " + str(max_in_df)) print("Max type is " + str(type(max_in_df))) print("Max in labels is " + str(max_in_vertex_labels)) - if max_in_df.iloc[0].item() != max_in_vertex_labels: + if max_in_df.item() != max_in_vertex_labels: if weights is not None or sparse_input is not None: self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0) G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination', 'weights']) From 90ebda606635f909efb33737046c0c521ae2ca34 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:43:21 +0000 Subject: [PATCH 101/327] Add message checking on maximum --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f2bed628..b8a234d2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -443,6 +443,8 @@ def constructNetwork(rlist, qlist, assignments, within_label, self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels) G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination']) G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) + new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + print("New max in DF is " + str(new_max_in_df)) # construct graph G_cu = cugraph.Graph() From 4915c678abf4714f138a306778b74e5d06ce813a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:48:06 +0000 Subject: [PATCH 102/327] Change int to float --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b8a234d2..ce610e3d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -437,7 +437,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, print("Max in labels is " + str(max_in_vertex_labels)) if max_in_df.item() != max_in_vertex_labels: if weights is not None or sparse_input is not None: - self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0) + self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0.0) G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination', 'weights']) else: self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels) From 7a0e404b866249ff26a683e6964cdfbdffa2a8ff Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:50:57 +0000 Subject: [PATCH 103/327] Add warning for missing nodes --- PopPUNK/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 6731b79a..4b66aba8 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -503,6 +503,7 @@ def main(): if len(networkMissing) > 0: missing_isolates = [refList[m] for m in networkMissing] sys.stderr.write("WARNING: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") + sys.stderr.write("These correspond to indices " + ", ".join(networkMissing) + "\n") fit_type = model.type isolateClustering = {fit_type: printClusters(genomeNetwork, From 
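
Patches 097 to 101 iterate on the same workaround for the cugraph issue linked in the hunk: if the highest vertex index never appears in the edge list, cugraph drops that vertex, so a zero-weight self-loop is appended to keep one node per isolate. A pandas stand-in for the cudf frames (values illustrative):

    import pandas as pd

    edges = pd.DataFrame({'source': [0, 1], 'destination': [1, 2],
                          'weights': [0.1, 0.2]})
    max_in_vertex_labels = 4                 # five isolates, ids 0..4
    max_in_df = max(edges['source'].max(), edges['destination'].max())
    if max_in_df != max_in_vertex_labels:
        loop = pd.DataFrame({'source': [max_in_vertex_labels],
                             'destination': [max_in_vertex_labels],
                             'weights': [0.0]})
        edges = pd.concat([edges, loop], ignore_index=True)
    print(edges)                             # self-loop row keeps vertex 4
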
19a4248ba3f7b380c0c5211b0ebb3793c396b99e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 09:01:53 +0000 Subject: [PATCH 104/327] Test DF structure --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ce610e3d..5c3e54f0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -451,6 +451,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, if weights is not None or sparse_input is not None: G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: + print("Data frame is " + str(G_df)) G_cu.from_cudf_edgelist(G_df, renumber=False) return G_cu From 6de75099c48a033b4bbbd0d706530341065694f8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 09:17:30 +0000 Subject: [PATCH 105/327] Change warning message print format --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 4b66aba8..860140a3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -503,7 +503,7 @@ def main(): if len(networkMissing) > 0: missing_isolates = [refList[m] for m in networkMissing] sys.stderr.write("WARNING: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") - sys.stderr.write("These correspond to indices " + ", ".join(networkMissing) + "\n") + sys.stderr.write("These correspond to indices " + ", ".join(map(str,networkMissing)) + "\n") fit_type = model.type isolateClustering = {fit_type: printClusters(genomeNetwork, From bf278b0ac6456b5939d4bc71e98744c91f5234ee Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 09:48:50 +0000 Subject: [PATCH 106/327] Change cudf definition --- PopPUNK/network.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5c3e54f0..0804590d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -436,12 +436,11 @@ def constructNetwork(rlist, qlist, assignments, within_label, print("Max type is " + str(type(max_in_df))) print("Max in labels is " + str(max_in_vertex_labels)) if max_in_df.item() != max_in_vertex_labels: + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = [max_in_vertex_labels] + G_self_loop['destination'] = [max_in_vertex_labels] if weights is not None or sparse_input is not None: - self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0.0) - G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination', 'weights']) - else: - self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels) - G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination']) + G_self_loop['weights'] = [0.0] G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) print("New max in DF is " + str(new_max_in_df)) From 2d08c725a538dd6b9301e00b0e46c5b5ae93fa0a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 11:29:56 +0000 Subject: [PATCH 107/327] Add reference extraction for GPU graphs --- PopPUNK/__main__.py | 6 +- PopPUNK/network.py | 169 +++++++++++++++++++++++++------------------- 2 files changed, 101 insertions(+), 74 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 860140a3..c9b2dfa0 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -549,7 +549,11 @@ def main(): # (this no longer loses information and should generally be kept on) if model.type != "lineage": 
newReferencesIndices, newReferencesNames, newReferencesFile, genomeNetwork = \ - extractReferences(genomeNetwork, refList, output, threads = args.threads) + extractReferences(genomeNetwork, + refList, + output, + threads = args.threads, + use_gpu = args.gpu_graph) nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices) names_to_remove = [refList[n] for n in nodes_to_remove] diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0804590d..087f6ac1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -138,7 +138,7 @@ def cliquePrune(component, graph, reference_indices, components_list): ref_list = getCliqueRefs(subgraph, refs) return(list(ref_list)) -def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1): +def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, use_gpu = False): """Extract references for each cluster based on cliques Writes chosen references to file by calling :func:`~writeReferences` @@ -152,6 +152,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1): Prefix for output file (.refs will be appended) existingRefs (list) References that should be used for each clique + use_gpu (bool) + Use cugraph for graph analysis (default = False) Returns: refFileName (str) @@ -167,83 +169,104 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1): index_lookup = {v:k for k,v in enumerate(dbOrder)} reference_indices = set([index_lookup[r] for r in references]) - # Each component is independent, so can be multithreaded - components = gt.label_components(G)[0].a - - # Turn gt threading off and on again either side of the parallel loop - if gt.openmp_enabled(): - gt.openmp_set_num_threads(1) + if use_gpu: - # Cliques are pruned, taking one reference from each, until none remain - with Pool(processes=threads) as pool: - ref_lists = pool.map(partial(cliquePrune, - graph=G, - reference_indices=reference_indices, - components_list=components), - set(components)) - # Returns nested lists, which need to be flattened - reference_indices = set([entry for sublist in ref_lists for entry in sublist]) + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + + # For large network, use more approximate method for extracting references + reference = {} + G_truss = cugraph.community.ktruss_subgraph.k_truss(G, 3) + component_assignments = cugraph.components.connectivity.connected_components(G_truss) + raw_reference_indices = component_assignments.groupby('').nth(0).iloc[:0] + print("Raw type: " + str(type(raw_reference_indices))) + print("Raw refs: " + str(raw_reference_indices)) + quit() + + else: - if gt.openmp_enabled(): - gt.openmp_set_num_threads(threads) - - # Use a vertex filter to extract the subgraph of refences - # as a graphview - reference_vertex = G.new_vertex_property('bool') - for n, vertex in enumerate(G.vertices()): - if n in reference_indices: - reference_vertex[vertex] = True - else: - reference_vertex[vertex] = False - G_ref = gt.GraphView(G, vfilt = reference_vertex) - G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object - - # Find any clusters which are represented by >1 references - # This creates a dictionary: cluster_id: set(ref_idx in cluster) - clusters_in_full_graph = printClusters(G, dbOrder, printCSV=False) - reference_clusters_in_full_graph = defaultdict(set) - for 
reference_index in reference_indices: - reference_clusters_in_full_graph[clusters_in_full_graph[dbOrder[reference_index]]].add(reference_index) - - # Calculate the component membership within the reference graph - ref_order = [name for idx, name in enumerate(dbOrder) if idx in frozenset(reference_indices)] - clusters_in_reference_graph = printClusters(G_ref, ref_order, printCSV=False) - # Record the components/clusters the references are in the reference graph - # dict: name: ref_cluster - reference_clusters_in_reference_graph = {} - for reference_name in ref_order: - reference_clusters_in_reference_graph[reference_name] = clusters_in_reference_graph[reference_name] - - # Check if multi-reference components have been split as a validation test - # First iterate through clusters - network_update_required = False - for cluster_id, ref_idxs in reference_clusters_in_full_graph.items(): - # Identify multi-reference clusters by this length - if len(ref_idxs) > 1: - check = list(ref_idxs) - # check if these are still in the same component in the reference graph - for i in range(len(check)): - component_i = reference_clusters_in_reference_graph[dbOrder[check[i]]] - for j in range(i + 1, len(check)): - # Add intermediate nodes - component_j = reference_clusters_in_reference_graph[dbOrder[check[j]]] - if component_i != component_j: - network_update_required = True - vertex_list, edge_list = gt.shortest_path(G, check[i], check[j]) - # update reference list - for vertex in vertex_list: - reference_vertex[vertex] = True - reference_indices.add(int(vertex)) - - # update reference graph if vertices have been added - if network_update_required: + # Each component is independent, so can be multithreaded + components = gt.label_components(G)[0].a + + # Turn gt threading off and on again either side of the parallel loop + if gt.openmp_enabled(): + gt.openmp_set_num_threads(1) + + # Cliques are pruned, taking one reference from each, until none remain + with Pool(processes=threads) as pool: + ref_lists = pool.map(partial(cliquePrune, + graph=G, + reference_indices=reference_indices, + components_list=components), + set(components)) + # Returns nested lists, which need to be flattened + reference_indices = set([entry for sublist in ref_lists for entry in sublist]) + + if gt.openmp_enabled(): + gt.openmp_set_num_threads(threads) + + # Use a vertex filter to extract the subgraph of refences + # as a graphview + reference_vertex = G.new_vertex_property('bool') + for n, vertex in enumerate(G.vertices()): + if n in reference_indices: + reference_vertex[vertex] = True + else: + reference_vertex[vertex] = False G_ref = gt.GraphView(G, vfilt = reference_vertex) G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object - # Order found references as in mash sketch files - reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] - refFileName = writeReferences(reference_names, outPrefix) - return reference_indices, reference_names, refFileName, G_ref + # Find any clusters which are represented by >1 references + # This creates a dictionary: cluster_id: set(ref_idx in cluster) + clusters_in_full_graph = printClusters(G, dbOrder, printCSV=False) + reference_clusters_in_full_graph = defaultdict(set) + for reference_index in reference_indices: + reference_clusters_in_full_graph[clusters_in_full_graph[dbOrder[reference_index]]].add(reference_index) + + # Calculate the component membership within the reference graph + ref_order = [name for idx, name in 
enumerate(dbOrder) if idx in frozenset(reference_indices)] + clusters_in_reference_graph = printClusters(G_ref, ref_order, printCSV=False) + # Record the components/clusters the references are in the reference graph + # dict: name: ref_cluster + reference_clusters_in_reference_graph = {} + for reference_name in ref_order: + reference_clusters_in_reference_graph[reference_name] = clusters_in_reference_graph[reference_name] + + # Check if multi-reference components have been split as a validation test + # First iterate through clusters + network_update_required = False + for cluster_id, ref_idxs in reference_clusters_in_full_graph.items(): + # Identify multi-reference clusters by this length + if len(ref_idxs) > 1: + check = list(ref_idxs) + # check if these are still in the same component in the reference graph + for i in range(len(check)): + component_i = reference_clusters_in_reference_graph[dbOrder[check[i]]] + for j in range(i + 1, len(check)): + # Add intermediate nodes + component_j = reference_clusters_in_reference_graph[dbOrder[check[j]]] + if component_i != component_j: + network_update_required = True + vertex_list, edge_list = gt.shortest_path(G, check[i], check[j]) + # update reference list + for vertex in vertex_list: + reference_vertex[vertex] = True + reference_indices.add(int(vertex)) + + # update reference graph if vertices have been added + if network_update_required: + G_ref = gt.GraphView(G, vfilt = reference_vertex) + G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object + + # Order found references as in mash sketch files + reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] + refFileName = writeReferences(reference_names, outPrefix) + return reference_indices, reference_names, refFileName, G_ref def writeReferences(refList, outPrefix): """Writes chosen references to file From 21dc84bddbf2f9a1a367347afeb71a6846c06f4e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 11:34:07 +0000 Subject: [PATCH 108/327] Change ktruss command --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 087f6ac1..653a3c3f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -181,7 +181,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} - G_truss = cugraph.community.ktruss_subgraph.k_truss(G, 3) + G_truss = cugraph.ktruss_subgraph(G, 3) component_assignments = cugraph.components.connectivity.connected_components(G_truss) raw_reference_indices = component_assignments.groupby('').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From f64c422228c849b25dd093ab59160c1302600021 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 11:50:51 +0000 Subject: [PATCH 109/327] Change ktruss processing --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 653a3c3f..e803338c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -183,7 +183,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u reference = {} G_truss = cugraph.ktruss_subgraph(G, 3) component_assignments = cugraph.components.connectivity.connected_components(G_truss) - raw_reference_indices = component_assignments.groupby('').nth(0).iloc[:0] + print("Assignments: " + 
str(component_assignments)) + raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) print("Raw refs: " + str(raw_reference_indices)) quit() From f88a0c2e6c5943e1ac53885abb2e66ec14264294 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 11:54:54 +0000 Subject: [PATCH 110/327] Change components processing --- PopPUNK/network.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e803338c..0fe1bb85 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -182,7 +182,9 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} G_truss = cugraph.ktruss_subgraph(G, 3) - component_assignments = cugraph.components.connectivity.connected_components(G_truss) + component_assignments = cugraph.components.connectivity.connected_components(G_truss, + directed = False, + return_labels = True) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From 77133cb36478e3d77b421cbbef9295887382d92f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 12:02:48 +0000 Subject: [PATCH 111/327] Change components options --- PopPUNK/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0fe1bb85..ad9b42a6 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -183,7 +183,6 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u reference = {} G_truss = cugraph.ktruss_subgraph(G, 3) component_assignments = cugraph.components.connectivity.connected_components(G_truss, - directed = False, return_labels = True) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] From 9dcaefdc4099ec2b2bf900b6b8e220fe1379cc6d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 12:07:18 +0000 Subject: [PATCH 112/327] Format Gtruss for graph input --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ad9b42a6..428d60e1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -182,8 +182,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} G_truss = cugraph.ktruss_subgraph(G, 3) - component_assignments = cugraph.components.connectivity.connected_components(G_truss, - return_labels = True) + print("Gtruss type: " + str(type(G_truss))) + component_assignments = cugraph.components.connectivity.connected_components(G_truss) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From 5eb7e692ac3ce9814b01f0b32a71038d37d463b7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 16:37:44 +0000 Subject: [PATCH 113/327] Try option 1 for ktruss --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 428d60e1..2131f80c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -181,7 +181,7 @@ def extractReferences(G, dbOrder, 
outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} - G_truss = cugraph.ktruss_subgraph(G, 3) + G_truss = cugraph.ktruss_subgraph.k_truss(G, 3) print("Gtruss type: " + str(type(G_truss))) component_assignments = cugraph.components.connectivity.connected_components(G_truss) print("Assignments: " + str(component_assignments)) From b2897b60e05aebe232afae97ac0e620d8f962b36 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 16:42:39 +0000 Subject: [PATCH 114/327] Raise ktruss k to 5 --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2131f80c..ed14d29f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -181,7 +181,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} - G_truss = cugraph.ktruss_subgraph.k_truss(G, 3) + G_truss = cugraph.ktruss_subgraph(G, 5) print("Gtruss type: " + str(type(G_truss))) component_assignments = cugraph.components.connectivity.connected_components(G_truss) print("Assignments: " + str(component_assignments)) From 9b5865537256d46c338329dd9c07e5c42404ac98 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 16:48:05 +0000 Subject: [PATCH 115/327] Change ktruss formats --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ed14d29f..04acb2bf 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -181,9 +181,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} + print("G type: " + str(type(G))) G_truss = cugraph.ktruss_subgraph(G, 5) print("Gtruss type: " + str(type(G_truss))) - component_assignments = cugraph.components.connectivity.connected_components(G_truss) + component_assignments = cugraph.components.connectivity.connected_components(G) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From 98dba614381ef2f46e52ab20117bcc364c7a4852 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 17:01:02 +0000 Subject: [PATCH 116/327] Print network summaries --- PopPUNK/network.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 04acb2bf..77d1a8d3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -182,8 +182,12 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} print("G type: " + str(type(G))) + print("G nodes: " + str(G.number_of_nodes()) + print("G edges: " + str(G.number_of_edges()) G_truss = cugraph.ktruss_subgraph(G, 5) print("Gtruss type: " + str(type(G_truss))) + print("Gtruss nodes: " + str(G_truss.number_of_nodes()) + print("Gtruss edges: " + str(G_truss.number_of_edges()) component_assignments = cugraph.components.connectivity.connected_components(G) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] From 24713ee6f7692cfbf205540ed3c525aba371f8b0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 17:02:35 +0000 
Subject: [PATCH 117/327] Fix grammar --- PopPUNK/network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 77d1a8d3..58276d8c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -182,12 +182,12 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} print("G type: " + str(type(G))) - print("G nodes: " + str(G.number_of_nodes()) - print("G edges: " + str(G.number_of_edges()) + print("G nodes: " + str(G.number_of_nodes())) + print("G edges: " + str(G.number_of_edges())) G_truss = cugraph.ktruss_subgraph(G, 5) print("Gtruss type: " + str(type(G_truss))) - print("Gtruss nodes: " + str(G_truss.number_of_nodes()) - print("Gtruss edges: " + str(G_truss.number_of_edges()) + print("Gtruss nodes: " + str(G_truss.number_of_nodes())) + print("Gtruss edges: " + str(G_truss.number_of_edges())) component_assignments = cugraph.components.connectivity.connected_components(G) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] From 65d8cb4c7f2d9351c5fd443295f44b22d51dddb5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 21:04:30 +0000 Subject: [PATCH 118/327] Test Louvain --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 58276d8c..1eaa225c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -184,7 +184,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G type: " + str(type(G))) print("G nodes: " + str(G.number_of_nodes())) print("G edges: " + str(G.number_of_edges())) - G_truss = cugraph.ktruss_subgraph(G, 5) + G_truss = cugraph.louvain(G) print("Gtruss type: " + str(type(G_truss))) print("Gtruss nodes: " + str(G_truss.number_of_nodes())) print("Gtruss edges: " + str(G_truss.number_of_edges())) From 2be951355aad3d2d38b10ff973085acf1b8d788d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 21:08:03 +0000 Subject: [PATCH 119/327] Test Leiden --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1eaa225c..55609173 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -184,7 +184,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G type: " + str(type(G))) print("G nodes: " + str(G.number_of_nodes())) print("G edges: " + str(G.number_of_edges())) - G_truss = cugraph.louvain(G) + G_truss = cugraph.leiden(G) print("Gtruss type: " + str(type(G_truss))) print("Gtruss nodes: " + str(G_truss.number_of_nodes())) print("Gtruss edges: " + str(G_truss.number_of_edges())) From cf012fbf2c0bb1edda8d76ad32b7eae09c75335d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 21:58:24 +0000 Subject: [PATCH 120/327] Process Leiden output --- PopPUNK/network.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 55609173..074f1b3c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -184,11 +184,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G type: " + str(type(G))) print("G nodes: " + str(G.number_of_nodes())) print("G edges: " + str(G.number_of_edges())) - G_truss = cugraph.leiden(G) - print("Gtruss type: " + str(type(G_truss))) 
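# Note on the switch to cugraph.leiden() here: unlike the connected-components
# and k-truss calls tried in the preceding patches, leiden() returns a pair of
# (assignments, modularity score), and the assignments are a cudf.DataFrame
# with 'vertex' and 'partition' columns. Taking the first vertex of each
# partition as that cluster's representative - where PATCHES 121-128 below
# eventually land - can be sketched as follows (toy edge list; assumes cudf
# and cugraph are installed):
import cudf
import cugraph

edges = cudf.DataFrame({'source':      [0, 1, 2, 3, 4, 5],
                        'destination': [1, 2, 0, 4, 5, 3]})  # two triangles
G_example = cugraph.Graph()
G_example.from_cudf_edgelist(edges, renumber=False)

parts, modularity = cugraph.leiden(G_example)
representatives = parts.groupby('partition').nth(0)
reference_indices = representatives['vertex'].to_arrow().to_pylist()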
- print("Gtruss nodes: " + str(G_truss.number_of_nodes())) - print("Gtruss edges: " + str(G_truss.number_of_edges())) - component_assignments = cugraph.components.connectivity.connected_components(G) + component_assignments = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From 74bb8ee1725fe7c8d53bbe0a1e9c03559e828f1b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 22:03:10 +0000 Subject: [PATCH 121/327] Process Leiden both outputs --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 074f1b3c..43e3a699 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -184,7 +184,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G type: " + str(type(G))) print("G nodes: " + str(G.number_of_nodes())) print("G edges: " + str(G.number_of_edges())) - component_assignments = cugraph.leiden(G) + component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From a6941765741a15a1cb047d6e244370c8c6cb7fd6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 22:08:16 +0000 Subject: [PATCH 122/327] Change grouping variable --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 43e3a699..8443a07f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -186,7 +186,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G edges: " + str(G.number_of_edges())) component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) - raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] + raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) print("Raw refs: " + str(raw_reference_indices)) quit() From 0c0789d18710fc3d41f3fa408d5ada534d8a42cd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 22:16:29 +0000 Subject: [PATCH 123/327] Test grouping code --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 8443a07f..1be700ac 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -186,7 +186,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G edges: " + str(G.number_of_edges())) component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) - raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:0] + raw_reference_indices = component_assignments.groupby('partition').nth(0)#.iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) print("Raw refs: " + str(raw_reference_indices)) quit() From e0d6f87d385fe2aec8d748d4fef607e52b8d22a9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 06:31:16 +0000 Subject: [PATCH 124/327] Fi grouping code --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1be700ac..ae5045fe 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ 
-186,7 +186,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G edges: " + str(G.number_of_edges())) component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) - raw_reference_indices = component_assignments.groupby('partition').nth(0)#.iloc[:0] + # group by partition, which becomes the first column, so retrieve second column + raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:1] print("Raw type: " + str(type(raw_reference_indices))) print("Raw refs: " + str(raw_reference_indices)) quit() From b9319dcb256855dae9114615ebfebcb408df4b59 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 07:31:10 +0000 Subject: [PATCH 125/327] Change iloc selection --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ae5045fe..b06caf41 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -187,7 +187,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) # group by partition, which becomes the first column, so retrieve second column - raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:1] + raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:,1] print("Raw type: " + str(type(raw_reference_indices))) print("Raw refs: " + str(raw_reference_indices)) quit() From 5b3d1832a5c94ef3b08e68b51f40f06037387b10 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 07:37:14 +0000 Subject: [PATCH 126/327] Change selection processing --- PopPUNK/network.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b06caf41..dfec5135 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -187,9 +187,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) # group by partition, which becomes the first column, so retrieve second column - raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:,1] - print("Raw type: " + str(type(raw_reference_indices))) - print("Raw refs: " + str(raw_reference_indices)) + reference_index_df = component_assignments.groupby('partition').nth(0).iloc[:,1] + print("Raw type: " + str(type(reference_index_df))) + reference_indices = reference_index_df['vertex'].tolist() + print("Raw refs: " + str(reference_indices)) quit() else: From b29d82c3656ca238a388e1795f8765e5b44c476d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 07:38:43 +0000 Subject: [PATCH 127/327] Remove column select --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index dfec5135..aa6c6182 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -187,7 +187,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) # group by partition, which becomes the first column, so retrieve second column - reference_index_df = component_assignments.groupby('partition').nth(0).iloc[:,1] + reference_index_df = component_assignments.groupby('partition').nth(0) print("Raw type: " + 
str(type(reference_index_df))) reference_indices = reference_index_df['vertex'].tolist() print("Raw refs: " + str(reference_indices)) From 1b5fd31e5a9e8c118afb60418aecae0152746845 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 07:43:30 +0000 Subject: [PATCH 128/327] Change list conversion --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index aa6c6182..42e253a0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -189,7 +189,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # group by partition, which becomes the first column, so retrieve second column reference_index_df = component_assignments.groupby('partition').nth(0) print("Raw type: " + str(type(reference_index_df))) - reference_indices = reference_index_df['vertex'].tolist() + reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() print("Raw refs: " + str(reference_indices)) quit() From 85ac6f5ff1d9be8bd104830367deba77a2b8754b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 09:33:09 +0000 Subject: [PATCH 129/327] Add reference graph construction --- PopPUNK/network.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 42e253a0..ed7a2507 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -191,7 +191,18 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Raw type: " + str(type(reference_index_df))) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() print("Raw refs: " + str(reference_indices)) - quit() + + # Order found references as in mash sketch files + reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] + print("Reference names: " + str(reference_names)) + refFileName = writeReferences(reference_names, outPrefix) + + # Construct reference graph + G_df = G.view_edge_list() + G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] + G_ref = cugraph.Graph() + G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False + return reference_indices, reference_names, refFileName, G_ref else: From 17a0997960b8d97c0b4eb19ebcacbe16228fb7ef Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 09:46:30 +0000 Subject: [PATCH 130/327] Add missing bracket --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ed7a2507..553fb536 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -201,7 +201,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df = G.view_edge_list() G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] G_ref = cugraph.Graph() - G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False + G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False) return reference_indices, reference_names, refFileName, G_ref else: From 2e1f3eefa524036c72587b8c68cf957835548706 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 09:47:58 +0000 Subject: [PATCH 131/327] Add edge list --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 553fb536..d5ccacd1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -199,6 +199,7 @@ def extractReferences(G, dbOrder, outPrefix, 
existingRefs = None, threads = 1, u # Construct reference graph G_df = G.view_edge_list() + print("Edge list: " + str(G_df)) G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False) From e5cb974305f3a12cfce5ed450d14853acb130641 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 09:48:58 +0000 Subject: [PATCH 132/327] Change column names --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d5ccacd1..69003b97 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -200,7 +200,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Construct reference graph G_df = G.view_edge_list() print("Edge list: " + str(G_df)) - G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] + G_ref_df = G_df[G_df['src'].isin(reference_names) & G_df['dst'].isin(reference_names)] G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False) return reference_indices, reference_names, refFileName, G_ref From eecf23930b46749832ba89fe4205e35fb8986f0a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 10:48:58 +0000 Subject: [PATCH 133/327] Remove weights from reference graph --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 69003b97..077dbad3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -202,7 +202,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Edge list: " + str(G_df)) G_ref_df = G_df[G_df['src'].isin(reference_names) & G_df['dst'].isin(reference_names)] G_ref = cugraph.Graph() - G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False) + G_ref.from_cudf_edgelist(G_ref_df, renumber=False) return reference_indices, reference_names, refFileName, G_ref else: From 7fe3277654716419e5f63201da15d0003f1900af Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:06:02 +0000 Subject: [PATCH 134/327] Add self loops for reference graph --- PopPUNK/network.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 077dbad3..545f49e4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -197,12 +197,22 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Reference names: " + str(reference_names)) refFileName = writeReferences(reference_names, outPrefix) - # Construct reference graph + # Extract reference edges G_df = G.view_edge_list() print("Edge list: " + str(G_df)) - G_ref_df = G_df[G_df['src'].isin(reference_names) & G_df['dst'].isin(reference_names)] + G_df.columns[0:1] = ['source','destination'] + G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] + print("Ref graph: " + str(G_ref_df)) + # Add self-loop if needing + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = len(reference_names)-1 + if max_in_df.item() != max_in_vertex_labels: + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = [max_in_vertex_labels] + G_self_loop['destination'] = [max_in_vertex_labels] + # Construct graph G_ref = cugraph.Graph() - G_ref.from_cudf_edgelist(G_ref_df, renumber=False) + 
G_ref.from_cudf_edgelist(G_ref_df) return reference_indices, reference_names, refFileName, G_ref else: From d6a344222bea77d0c31ccf66df95c54798dfb8fc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:07:32 +0000 Subject: [PATCH 135/327] Change column names --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 545f49e4..4223454a 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -200,7 +200,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Extract reference edges G_df = G.view_edge_list() print("Edge list: " + str(G_df)) - G_df.columns[0:1] = ['source','destination'] + G_df.columns = ['source','destination'] G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] print("Ref graph: " + str(G_ref_df)) # Add self-loop if needing From fc9d0a594e45cfb0cf90d38745466f38bfa9210f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:13:14 +0000 Subject: [PATCH 136/327] Change df concatenation --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4223454a..a6a13139 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -210,6 +210,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_self_loop = cudf.DataFrame() G_self_loop['source'] = [max_in_vertex_labels] G_self_loop['destination'] = [max_in_vertex_labels] + G_ref_df = cudf.concat([G_ref_df,G_self_loop], ignore_index = True) # Construct graph G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df) From 366630b0e088a611c4ed16bcb9baa33c5e339e8b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:17:53 +0000 Subject: [PATCH 137/327] Print ref graph --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index a6a13139..09afdf9f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -211,9 +211,11 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_self_loop['source'] = [max_in_vertex_labels] G_self_loop['destination'] = [max_in_vertex_labels] G_ref_df = cudf.concat([G_ref_df,G_self_loop], ignore_index = True) + print("Ref df: " + str(G_ref_df)) # Construct graph G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df) + print("Ref graph: " + str(G_ref)) return reference_indices, reference_names, refFileName, G_ref else: From 76671a3baa6eb3140404317e23957c719f2fcab0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:24:08 +0000 Subject: [PATCH 138/327] Add resolution parameter to Leiden method --- PopPUNK/network.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 09afdf9f..5f308cd1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -181,28 +181,20 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} - print("G type: " + str(type(G))) - print("G nodes: " + str(G.number_of_nodes())) - print("G edges: " + str(G.number_of_edges())) - component_assignments, score = cugraph.leiden(G) - print("Assignments: " + str(component_assignments)) + # Leiden method has resolution parameter - higher values give greater precision + component_assignments, score = cugraph.leiden(G, resolution = 1.0) # group by partition, 
which becomes the first column, so retrieve second column reference_index_df = component_assignments.groupby('partition').nth(0) - print("Raw type: " + str(type(reference_index_df))) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() - print("Raw refs: " + str(reference_indices)) # Order found references as in mash sketch files reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] - print("Reference names: " + str(reference_names)) refFileName = writeReferences(reference_names, outPrefix) # Extract reference edges G_df = G.view_edge_list() - print("Edge list: " + str(G_df)) G_df.columns = ['source','destination'] G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] - print("Ref graph: " + str(G_ref_df)) # Add self-loop if needing max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(reference_names)-1 @@ -211,11 +203,9 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_self_loop['source'] = [max_in_vertex_labels] G_self_loop['destination'] = [max_in_vertex_labels] G_ref_df = cudf.concat([G_ref_df,G_self_loop], ignore_index = True) - print("Ref df: " + str(G_ref_df)) # Construct graph G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df) - print("Ref graph: " + str(G_ref)) return reference_indices, reference_names, refFileName, G_ref else: From 1a646b62e80bfadc65bb1498bc9b061cf41431a2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:49:21 +0000 Subject: [PATCH 139/327] Add GPU graph loading --- PopPUNK/network.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5f308cd1..ca6c9c41 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -34,7 +34,7 @@ from .utils import isolateNameToLabel def fetchNetwork(network_dir, model, refList, ref_graph = False, - core_only = False, accessory_only = False): + core_only = False, accessory_only = False, use_gpu = False): """Load the network based on input options Returns the network as a graph-tool format graph, and sets @@ -52,12 +52,12 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, [default = False] core_only (bool) Return the network created using only core distances - [default = False] accessory_only (bool) Return the network created using only accessory distances - [default = False] + use_gpu (bool) + Use cugraph library to load graph Returns: genomeNetwork (graph) @@ -67,25 +67,36 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, """ # If a refined fit, may use just core or accessory distances dir_prefix = network_dir + "/" + os.path.basename(network_dir) + if use_gpu: + graph_suffix = '.csv.bz2' + else: + graph_suffix = '.gt' if core_only and model.type == 'refine': model.slope = 0 - network_file = dir_prefix + '_core_graph.gt' + network_file = dir_prefix + '_core_graph' + graph_suffix cluster_file = dir_prefix + '_core_clusters.csv' elif accessory_only and model.type == 'refine': model.slope = 1 - network_file = dir_prefix + '_accessory_graph.gt' + network_file = dir_prefix + '_accessory_graph' + graph_suffix cluster_file = dir_prefix + '_accessory_clusters.csv' else: - if ref_graph and os.path.isfile(dir_prefix + '.refs_graph.gt'): - network_file = dir_prefix + '.refs_graph.gt' + if ref_graph and os.path.isfile(dir_prefix + '.refs_graph' + graph_suffix): + network_file = dir_prefix + '.refs_graph' + graph_suffix else: 
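# Note on the loading scheme this patch (PATCH 139) introduces: on the GPU path
# the network is stored as a compressed CSV edge list rather than a graph-tool
# .gt file, so loading it means reading the CSV back into cuDF and rebuilding
# the cugraph.Graph. A minimal round trip in that style (illustrative file
# name; assumes G_example is an existing cugraph.Graph, and note the
# compression argument must agree with the suffix actually written):
import cudf
import cugraph

# save, mirroring save_network()
G_example.to_pandas_edgelist().to_csv('example_graph.csv.gz', index=False,
                                      compression='gzip')
# load and rebuild; the column name must be quoted when testing for weights
G_df = cudf.read_csv('example_graph.csv.gz', compression='gzip')
G_loaded = cugraph.Graph()
if 'weights' in G_df.columns:
    G_loaded.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False)
else:
    G_loaded.from_cudf_edgelist(G_df, renumber=False)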
- network_file = dir_prefix + '_graph.gt' + network_file = dir_prefix + '_graph' + graph_suffix cluster_file = dir_prefix + '_clusters.csv' if core_only or accessory_only: sys.stderr.write("Can only do --core-only or --accessory-only fits from " "a refined fit. Using the combined distances.\n") - genomeNetwork = gt.load_graph(network_file) + if use_gpu: + G_df = cudf.read_csv(network_file, compression = 'gzip') + if weights in G_df.columns: + genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) + else: + genomeNetwork.from_cudf_edgelist(G_df,renumber=False) + else: + genomeNetwork = gt.load_graph(network_file) sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # Ensure all in dists are in final network @@ -475,9 +486,6 @@ def constructNetwork(rlist, qlist, assignments, within_label, # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(vertex_labels)-1 - print("Max in DF is " + str(max_in_df)) - print("Max type is " + str(type(max_in_df))) - print("Max in labels is " + str(max_in_vertex_labels)) if max_in_df.item() != max_in_vertex_labels: G_self_loop = cudf.DataFrame() G_self_loop['source'] = [max_in_vertex_labels] @@ -1038,8 +1046,8 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): file_name = prefix + "/" + os.path.basename(prefix) + '_' + suffix os.path.basename(prefix) + '_graph.csv.bz2' if use_gpu: - G.to_pandas_edgelist().to_csv(file_name + '.csv.bz2', - compression='bz2') + G.to_csv(file_name + '.csv.gz', + compression='gzip') else: G.save(file_name + '.gt', fmt = 'gt') From f22cf552c561afe638abb298f08f2f7c6d4106fd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:53:10 +0000 Subject: [PATCH 140/327] Change GPU graph writing --- PopPUNK/network.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ca6c9c41..d8c29fe0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1046,8 +1046,7 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): file_name = prefix + "/" + os.path.basename(prefix) + '_' + suffix os.path.basename(prefix) + '_graph.csv.bz2' if use_gpu: - G.to_csv(file_name + '.csv.gz', - compression='gzip') + G.edges().to_csv(file_name + '.csv.gz', compression='gzip') else: G.save(file_name + '.gt', fmt = 'gt') From accafd3e18c7503a1d14319131d9cefeebdb80d7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:57:54 +0000 Subject: [PATCH 141/327] Change CSV compression --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d8c29fe0..10d4ed7d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1046,7 +1046,8 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): file_name = prefix + "/" + os.path.basename(prefix) + '_' + suffix os.path.basename(prefix) + '_graph.csv.bz2' if use_gpu: - G.edges().to_csv(file_name + '.csv.gz', compression='gzip') + G.to_pandas_edgelist().to_csv(file_name + '.csv.gz', + compression='gzip') else: G.save(file_name + '.gt', fmt = 'gt') From a2bb2845df6ff4e028ddb515f51efbfaf906ddeb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 12:10:39 +0000 Subject: [PATCH 142/327] Change output file name --- PopPUNK/network.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git 
a/PopPUNK/network.py b/PopPUNK/network.py index 10d4ed7d..a7556541 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -494,14 +494,12 @@ def constructNetwork(rlist, qlist, assignments, within_label, G_self_loop['weights'] = [0.0] G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - print("New max in DF is " + str(new_max_in_df)) # construct graph G_cu = cugraph.Graph() if weights is not None or sparse_input is not None: G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: - print("Data frame is " + str(G_df)) G_cu.from_cudf_edgelist(G_df, renumber=False) return G_cu @@ -1043,8 +1041,7 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): [default = False] """ - file_name = prefix + "/" + os.path.basename(prefix) + '_' + suffix - os.path.basename(prefix) + '_graph.csv.bz2' + file_name = prefix + "/" + os.path.basename(prefix) if use_gpu: G.to_pandas_edgelist().to_csv(file_name + '.csv.gz', compression='gzip') From 2729a11fccefd41496a196cc1eb1302c7fdc4a82 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 14:10:02 +0000 Subject: [PATCH 143/327] Add suffix to output file --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index a7556541..2e8a4dcc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1042,6 +1042,8 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): """ file_name = prefix + "/" + os.path.basename(prefix) + if suffix is not None: + file_name = file_name + '_' + suffix if use_gpu: G.to_pandas_edgelist().to_csv(file_name + '.csv.gz', compression='gzip') From 9466ae66f074c043259f3ecaffb3bf43d3225eb9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 14:11:50 +0000 Subject: [PATCH 144/327] Correct suffix to output file --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2e8a4dcc..530e8805 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1043,7 +1043,7 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): """ file_name = prefix + "/" + os.path.basename(prefix) if suffix is not None: - file_name = file_name + '_' + suffix + file_name = file_name + suffix if use_gpu: G.to_pandas_edgelist().to_csv(file_name + '.csv.gz', compression='gzip') From 80533f836069e17af968436f618c8ba2de27c3f3 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Mar 2021 15:18:36 +0000 Subject: [PATCH 145/327] Fix dist order with lineage mode --- PopPUNK/__main__.py | 2 +- PopPUNK/assign.py | 30 ++++++++++-------------------- PopPUNK/models.py | 3 +-- PopPUNK/utils.py | 13 ++++++++++--- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 860140a3..c19294d8 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -538,7 +538,7 @@ def main(): fit_type = 'accessory' genomeNetwork = indivNetworks['accessory'] - save_network(genomeNetwork, prefix = output, suffix = "_graph", use_gpu = args.gpu_graph) + save_network(genomeNetwork, prefix = output, suffix = "graph", use_gpu = args.gpu_graph) #******************************# #* *# diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index b1b11236..7b52d943 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -119,7 +119,12 @@ def assign_query(dbFuncs, for reference in refFile: rNames.append(reference.rstrip()) else: - rNames = 
getSeqsInDb(ref_db + "/" + os.path.basename(ref_db) + ".h5")
+        if os.path.isfile(distances + ".pkl"):
+            rNames = readPickle(distances, enforce_self = True, distances=False)[0]
+        elif update_db:
+            sys.stderr.write("Reference distances missing, cannot use --update-db\n")
+        else:
+            rNames = getSeqsInDb(ref_db + "/" + os.path.basename(ref_db) + ".h5")
     # construct database
     if (web and json_sketch):
         qNames = sketch_to_hdf5(json_sketch, output)
@@ -244,28 +249,13 @@ def assign_query(dbFuncs,
         else:
             genomeNetwork.save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt')

-        # Update distance matrices with all calculated distances
-        if distances == None:
-            distanceFiles = ref_db + "/" + os.path.basename(ref_db) + ".dists"
-        else:
-            distanceFiles = distances
-
         # Load the previous distances
         refList_loaded, refList_copy, self, rrDistMat = \
-            readPickle(distanceFiles,
+            readPickle(distances,
                        enforce_self = True)
-        # qrDistMat: order of ref labels is the same as in the database (usually
-        # ordered). Order in original rrDistMat is arbitrary, leading to an
-        # awkwardness here. We prefer to reorder the qrDistMat to match, as it is
-        # usually smaller and has a simpler layout in long form
-        # At the end, rNames is updated to match what has been loaded
-        if refList_loaded != rNames:
-            match_order = [rNames.index(i) for i in refList_loaded] * len(qNames)
-            for q_offset in range(len(qNames)):
-                for r_offset in range(len(rNames)):
-                    match_order[q_offset * len(rNames) + r_offset] += q_offset * len(rNames)
-            qrDistMat = qrDistMat[match_order, :]
-            rNames = refList_loaded
+        # This should now always be true, otherwise both qrDistMat and sparse matrix
+        # may need reordering
+        assert(refList_loaded == rNames)

         combined_seq, core_distMat, acc_distMat = \
             update_distance_matrices(rNames, rrDistMat,
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 902cf738..4f92c861 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -847,8 +847,7 @@ def fit(self, X, accessory, threads):
             pp_sketchlib.sparsifyDists(
                 pp_sketchlib.longToSquare(X[:, [self.dist_col]], threads),
                 0,
-                rank,
-                threads
+                rank
             )
         data = [epsilon if d < epsilon else d for d in data]
         self.nn_dists[rank] = coo_matrix((data, (row, col)),
diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py
index 33588739..cb801865 100644
--- a/PopPUNK/utils.py
+++ b/PopPUNK/utils.py
@@ -103,7 +103,7 @@ def storePickle(rlist, qlist, self, X, pklName):
     np.save(pklName + ".npy", X)

-def readPickle(pklName, enforce_self = False):
+def readPickle(pklName, enforce_self=False, distances=True):
     """Loads core and accessory distances saved by :func:`~storePickle`

     Called during ``--fit-model``
@@ -115,6 +115,10 @@ def readPickle(pklName, enforce_self = False):
             Error if self == False

             [default = True]
+        distances (bool)
+            Read the distance matrix
+
+            [default = Trie]

     Returns:
         rlist (list)
@@ -131,7 +135,10 @@ def readPickle(pklName, enforce_self = False):
     if enforce_self and not self:
         sys.stderr.write("Old distances " + pklName + ".npy not complete\n")
         sys.stderr.exit(1)
-    X = np.load(pklName + ".npy")
+    if distances:
+        X = np.load(pklName + ".npy")
+    else:
+        X = None
     return rlist, qlist, self, X
@@ -432,7 +439,7 @@ def readRfile(rFile, oneSeq=False):
     list_iterable = zip(names, sequences)
     sorted_names = sorted(list_iterable)
     tuples = zip(*sorted_names)
-    names, sequences = [list(tuple) for tuple in tuples]
+    names, sequences = [list(r_tuple) for r_tuple in tuples]

     return (names, sequences)
From ce8135e65e035ea3273d3e7902da491712a4e24e Mon Sep 17 00:00:00 2001
From: John
Lees Date: Fri, 12 Mar 2021 15:21:50 +0000 Subject: [PATCH 146/327] docstring typo --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index cb801865..8745f968 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -118,7 +118,7 @@ def readPickle(pklName, enforce_self=False, distances=True): distances (bool) Read the distance matrix - [default = Trie] + [default = True] Returns: rlist (list) From cca4a7c0959bef6a7f6dfd8018090323bef1034f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:07:37 +0000 Subject: [PATCH 147/327] Add GPU summaries --- PopPUNK/network.py | 85 +++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 31 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 530e8805..e265de8d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -496,13 +496,11 @@ def constructNetwork(rlist, qlist, assignments, within_label, new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) # construct graph - G_cu = cugraph.Graph() + G = cugraph.Graph() if weights is not None or sparse_input is not None: - G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) + G.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: - G_cu.from_cudf_edgelist(G_df, renumber=False) - - return G_cu + G.from_cudf_edgelist(G_df, renumber=False) else: @@ -522,22 +520,22 @@ def constructNetwork(rlist, qlist, assignments, within_label, vals = vertex_labels) G.vp.id = vid - # print some summaries - if summarise: - (metrics, scores) = networkSummary(G) - sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]), - "\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]), - "\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]), - "\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]), - "\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]), - "\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]), - "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), - "\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])]) - + "\n") - - return G - -def networkSummary(G, calc_betweenness=True): + # print some summaries + if summarise: + (metrics, scores) = networkSummary(G, use_gpu = use_gpu) + sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]), + "\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]), + "\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]), + "\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]), + "\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]), + "\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]), + "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), + "\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])]) + + "\n") + + return G + +def networkSummary(G, calc_betweenness=True, use_gpu = False): """Provides summary values about the network Args: @@ -545,6 +543,8 @@ def networkSummary(G, calc_betweenness=True): The network of strains from :func:`~constructNetwork` calc_betweenness (bool) Whether to calculate betweenness stats + use_gpu (bool) + Whether to use cugraph for graph analysis Returns: metrics (list) @@ -553,27 +553,50 @@ def networkSummary(G, calc_betweenness=True): scores (list) List of scores """ - component_assignments, component_frequencies = gt.label_components(G) - components = len(component_frequencies) - density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * 
(len(list(G.vertices())) - 1)) - transitivity = gt.global_clustering(G)[0] + if use_gpu: + component_assignments = cugraph.components.connectivity.connected_components(G) + components = component_assignments['labels'].unique() + density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1)) + triangle_count = cugraph.community.triangle_count.triangles(G) + degree = G.degree() + triad_count = sum([d * (d - 1) for d in degree) + transitivity = triangle_count/triad_count + else: + component_assignments, component_frequencies = gt.label_components(G) + components = len(component_frequencies) + density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) + transitivity = gt.global_clustering(G)[0] mean_bt = 0 weighted_mean_bt = 0 if calc_betweenness: betweenness = [] sizes = [] - for component, size in enumerate(component_frequencies): - if size > 3: - vfilt = component_assignments.a == component - subgraph = gt.GraphView(G, vfilt=vfilt) - betweenness.append(max(gt.betweenness(subgraph, norm = True)[0].a)) - sizes.append(size) + + if use_gpu: + component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) + for component in components: + size = component_frequencies[component_frequencies.index == component] + if size > 3: + print("Component count df: " + str(component_assignments)) + component_vertices = component_assignments['vertices'][component_assignments['labels']==component] + subgraph = cugraph.subgraph(G, component_vertices) + component_betweenness = cugraph.betweenness_centrality(G) + betweenness.append(np.amax(component_betweenness)) + sizes.append(size) + else: + for component, size in enumerate(component_frequencies): + if size > 3: + vfilt = component_assignments.a == component + subgraph = gt.GraphView(G, vfilt=vfilt) + betweenness.append(max(gt.betweenness(subgraph, norm = True)[0].a)) + sizes.append(size) if len(betweenness) > 1: mean_bt = np.mean(betweenness) weighted_mean_bt = np.average(betweenness, weights=sizes) + # Calculate scores metrics = [components, density, transitivity, mean_bt, weighted_mean_bt] base_score = transitivity * (1 - density) scores = [base_score, base_score * (1 - metrics[3]), base_score * (1 - metrics[4])] From 4a53fd274b34ed481395c7efd869b714817f7561 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:08:29 +0000 Subject: [PATCH 148/327] Remove surplus bracket --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e265de8d..fb76ad70 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -556,7 +556,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if use_gpu: component_assignments = cugraph.components.connectivity.connected_components(G) components = component_assignments['labels'].unique() - density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1)) + density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree = G.degree() triad_count = sum([d * (d - 1) for d in degree) From 3c7d1d25248ce3ee88008c4c90c2f22d6b927ccc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:09:35 +0000 Subject: [PATCH 149/327] Change sum of degree --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index fb76ad70..b78439ea 100644 
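# Note on the summary statistics being assembled in PATCHES 147-149:
# graph-tool's gt.global_clustering() computes the global transitivity
#     transitivity = 3 * n_triangles / n_connected_triples,
# where n_connected_triples = sum over vertices of d * (d - 1) / 2. The GPU
# branch rebuilds this ratio from cugraph's total triangle count and the
# degree table; depending on how a given cugraph version counts each triangle,
# a constant factor may be needed to match graph-tool exactly. A sketch
# (assumes G_example is an undirected cugraph.Graph):
import cugraph

triangle_count = cugraph.community.triangle_count.triangles(G_example)
degree_df = G_example.degree()  # cudf.DataFrame with 'vertex' and 'degree'
triples = sum(d * (d - 1) for d in degree_df['degree'].to_pandas()) / 2
transitivity = 3 * triangle_count / triples if triples > 0 else 0.0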
--- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -559,7 +559,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree = G.degree() - triad_count = sum([d * (d - 1) for d in degree) + triad_count = sum([d * (d - 1) for d in degree]) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) From b75dccedd2d58b06e52eec92c0c1b45e61fec50a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:11:01 +0000 Subject: [PATCH 150/327] Load cugraph libraries --- PopPUNK/network.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b78439ea..c81e9dae 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -554,6 +554,15 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): List of scores """ if use_gpu: + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + component_assignments = cugraph.components.connectivity.connected_components(G) components = component_assignments['labels'].unique() density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) From 8ccddf4c7c56d5c042d92210934a9f235fc49795 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:12:15 +0000 Subject: [PATCH 151/327] Print degree for debug --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c81e9dae..354ed7a2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -568,6 +568,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree = G.degree() + print("Degree is " + str(degree)) triad_count = sum([d * (d - 1) for d in degree]) transitivity = triangle_count/triad_count else: From 81bfb5ea659046d9d0f232fe478ee2e245e64e87 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:14:47 +0000 Subject: [PATCH 152/327] Change access to degree --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 354ed7a2..07c74412 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -567,9 +567,9 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): components = component_assignments['labels'].unique() density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) - degree = G.degree() + degree_df = G.degree() print("Degree is " + str(degree)) - triad_count = sum([d * (d - 1) for d in degree]) + triad_count = sum([d * (d - 1) for d in degree_df['degree']) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) From 3368c6b6f87d7916d6221c3fdd29795265383bc7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:15:41 +0000 Subject: [PATCH 153/327] Add missing bracket --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 07c74412..7d7d3aaf 100644 --- a/PopPUNK/network.py +++ 
b/PopPUNK/network.py @@ -569,7 +569,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() print("Degree is " + str(degree)) - triad_count = sum([d * (d - 1) for d in degree_df['degree']) + triad_count = sum([d * (d - 1) for d in degree_df['degree']]) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) From ff0720177fb777f5962364d5dfaed99797b5d514 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:16:50 +0000 Subject: [PATCH 154/327] Change degree print statement --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7d7d3aaf..3f27068a 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -568,7 +568,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() - print("Degree is " + str(degree)) + print("Degree is " + str(degree_df['degree'])) triad_count = sum([d * (d - 1) for d in degree_df['degree']]) transitivity = triangle_count/triad_count else: From 628ad32b5c53a96a244a9e02fe3fb1d5d7703e8e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:18:40 +0000 Subject: [PATCH 155/327] Convert to pandas --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3f27068a..0833120f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -569,7 +569,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() print("Degree is " + str(degree_df['degree'])) - triad_count = sum([d * (d - 1) for d in degree_df['degree']]) + triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) From 8e36694c70e7ed3a4bb079c36b34363b091e900c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:20:31 +0000 Subject: [PATCH 156/327] Change iteration over components --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0833120f..14b152f5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -568,8 +568,8 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() - print("Degree is " + str(degree_df['degree'])) triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) + print("triad_count is " + str(triad_count)) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) @@ -585,7 +585,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if use_gpu: component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) - for component in components: + for component in components.to_pandas(): size = component_frequencies[component_frequencies.index == component] if size > 3: print("Component count df: " + str(component_assignments)) From 
f19adfc63ea1f72768f0aaefad0b1ad3122da240 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:23:21 +0000 Subject: [PATCH 157/327] Print details of components --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 14b152f5..6ca6b789 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -587,6 +587,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) for component in components.to_pandas(): size = component_frequencies[component_frequencies.index == component] + print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: print("Component count df: " + str(component_assignments)) component_vertices = component_assignments['vertices'][component_assignments['labels']==component] From 14a14f93c27452ab0931ed214d4f65858fa67e67 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:29:38 +0000 Subject: [PATCH 158/327] Convert series value to int --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6ca6b789..6cbcdff3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -586,7 +586,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if use_gpu: component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) for component in components.to_pandas(): - size = component_frequencies[component_frequencies.index == component] + size = component_frequencies[component_frequencies.index == component].astype(int) print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: print("Component count df: " + str(component_assignments)) From 38298f25fd7bb4aa43ef53a307477bc8985f6799 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:33:29 +0000 Subject: [PATCH 159/327] Extract single value for size --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6cbcdff3..be01983d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -586,7 +586,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if use_gpu: component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) for component in components.to_pandas(): - size = component_frequencies[component_frequencies.index == component].astype(int) + size = component_frequencies[component_frequencies.index == component].iloc[0].astype(int) print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: print("Component count df: " + str(component_assignments)) From fb6ba093be72dba1525558cb96b2399c5a06bf08 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:35:39 +0000 Subject: [PATCH 160/327] Change column name --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index be01983d..dbe2ac4a 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -590,7 +590,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: print("Component count df: " + str(component_assignments)) - 
component_vertices = component_assignments['vertices'][component_assignments['labels']==component] + component_vertices = component_assignments['vertex'][component_assignments['labels']==component] subgraph = cugraph.subgraph(G, component_vertices) component_betweenness = cugraph.betweenness_centrality(G) betweenness.append(np.amax(component_betweenness)) sizes.append(size) From 2973f6e26f9c1621ec55e5534b7f3a2c33cfda01 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:37:13 +0000 Subject: [PATCH 161/327] Print component betweenness --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index dbe2ac4a..10cf8c82 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -589,10 +589,10 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): size = component_frequencies[component_frequencies.index == component].iloc[0].astype(int) print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: - print("Component count df: " + str(component_assignments)) component_vertices = component_assignments['vertex'][component_assignments['labels']==component] subgraph = cugraph.subgraph(G, component_vertices) component_betweenness = cugraph.betweenness_centrality(G) + print("Component betweenness: " + str(component_betweenness)) betweenness.append(np.amax(component_betweenness)) sizes.append(size) else: From 49e4fc96097703d163258a13867e8f6931099e5a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:40:37 +0000 Subject: [PATCH 162/327] Find maximum betweenness --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 10cf8c82..51b08d9b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -593,7 +593,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): subgraph = cugraph.subgraph(G, component_vertices) component_betweenness = cugraph.betweenness_centrality(G) print("Component betweenness: " + str(component_betweenness)) - betweenness.append(np.amax(component_betweenness)) + betweenness.append(component_betweenness['component_betweenness'].max()) sizes.append(size) else: From 2461855ccfe31634cae4109fff6a4aad474d2824 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:41:44 +0000 Subject: [PATCH 163/327] Change column name --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 51b08d9b..c625cf6b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -593,7 +593,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): subgraph = cugraph.subgraph(G, component_vertices) component_betweenness = cugraph.betweenness_centrality(G) print("Component betweenness: " + str(component_betweenness)) - betweenness.append(component_betweenness['component_betweenness'].max()) + betweenness.append(component_betweenness['betweenness_centrality'].max()) sizes.append(size) else: From 69edec78ddf6080287799e9460b33f6409d91a0e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:44:37 +0000 Subject: [PATCH 164/327] Betweenness access change --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c625cf6b..a7883444 100644 --- a/PopPUNK/network.py +++
b/PopPUNK/network.py @@ -564,7 +564,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): raise ImportError(e) component_assignments = cugraph.components.connectivity.connected_components(G) - components = component_assignments['labels'].unique() + components = component_assignments['labels'].unique().astype(int) density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() @@ -594,6 +594,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): component_betweenness = cugraph.betweenness_centrality(G) print("Component betweenness: " + str(component_betweenness)) betweenness.append(component_betweenness['betweenness_centrality'].max()) + print("Betweenness: " + str(betweenness)) sizes.append(size) else: for component, size in enumerate(component_frequencies): From 00d84d572ebdf43737f20180e85c4dae6681c3ed Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:48:14 +0000 Subject: [PATCH 165/327] Change summary stat recording --- PopPUNK/network.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index a7883444..b5f3ca94 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -564,7 +564,8 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): raise ImportError(e) component_assignments = cugraph.components.connectivity.connected_components(G) - components = component_assignments['labels'].unique().astype(int) + component_nums = component_assignments['labels'].unique().astype(int) + components = len(component_nums) density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() @@ -585,7 +586,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if use_gpu: component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) - for component in components.to_pandas(): + for component in component_nums.to_pandas(): size = component_frequencies[component_frequencies.index == component].iloc[0].astype(int) print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: @@ -607,6 +608,9 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if len(betweenness) > 1: mean_bt = np.mean(betweenness) weighted_mean_bt = np.average(betweenness, weights=sizes) + else: + mean_bt = betweenness[0] + weighted_mean_bt = betweenness[0] # Calculate scores metrics = [components, density, transitivity, mean_bt, weighted_mean_bt] From 438c269405dc3db43a72c97b6cfbe9ca1b438054 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:50:41 +0000 Subject: [PATCH 166/327] Tidy up debug messages --- PopPUNK/network.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b5f3ca94..f4182ffa 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -570,7 +570,6 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) - print("triad_count is " + str(triad_count)) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) @@ -588,14 +587,11 @@ def networkSummary(G, calc_betweenness=True, 
use_gpu = False): component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) for component in component_nums.to_pandas(): size = component_frequencies[component_frequencies.index == component].iloc[0].astype(int) - print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: component_vertices = component_assignments['vertex'][component_assignments['labels']==component] subgraph = cugraph.subgraph(G, component_vertices) component_betweenness = cugraph.betweenness_centrality(G) - print("Component betweenness: " + str(component_betweenness)) betweenness.append(component_betweenness['betweenness_centrality'].max()) - print("Betweenness: " + str(betweenness)) sizes.append(size) else: for component, size in enumerate(component_frequencies): From 03820241068220f2bc4c94f1924196e730e08d29 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 22:05:08 +0000 Subject: [PATCH 167/327] Transitivity calculation details --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f4182ffa..884c25fa 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -571,11 +571,13 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): degree_df = G.degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) transitivity = triangle_count/triad_count + print("Triangle count CPU: " + str(triangle_count) + " Triad count CPU: " + str(triad_count)) else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] + print("Triangle count CPU: " + str(gt.global_clustering(G)[1]) + " Triad count CPU: " + str(gt.global_clustering(G)[2])) mean_bt = 0 weighted_mean_bt = 0 From 583fa13bd4d0134802c603dd6fae4fcdf6ff1d74 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 22:08:56 +0000 Subject: [PATCH 168/327] Change printing of debug --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 884c25fa..af160bcf 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -571,13 +571,13 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): degree_df = G.degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) transitivity = triangle_count/triad_count - print("Triangle count CPU: " + str(triangle_count) + " Triad count CPU: " + str(triad_count)) + print("Triangle count GPU: " + str(triangle_count) + " Triad count GPU: " + str(triad_count)) else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] - print("Triangle count CPU: " + str(gt.global_clustering(G)[1]) + " Triad count CPU: " + str(gt.global_clustering(G)[2])) + print("Triangle count CPU: " + str(gt.global_clustering(G))) mean_bt = 0 weighted_mean_bt = 0 From e5bb57ac7c3c34e2bedd4000855d3d51f9b464b2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 22:10:40 +0000 Subject: [PATCH 169/327] Print counts --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 
af160bcf..1402b394 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -577,7 +577,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] - print("Triangle count CPU: " + str(gt.global_clustering(G))) + print("Triangle count CPU: " + str(gt.global_clustering(G, ret_counts = True))) mean_bt = 0 weighted_mean_bt = 0 From bdf8a83141e10552824cce34a1702d6d47ece760 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 06:51:24 +0000 Subject: [PATCH 170/327] Enable GPUs for refinement --- PopPUNK/network.py | 2 +- PopPUNK/refine.py | 58 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1402b394..d3d1ab26 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -577,7 +577,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] - print("Triangle count CPU: " + str(gt.global_clustering(G, ret_counts = True))) + print("Triangle/triad count CPU: " + str(gt.global_clustering(G, ret_counts = True))) mean_bt = 0 weighted_mean_bt = 0 diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 4dab0523..1baf2ff3 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -32,7 +32,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, max_move, min_move, slope = 2, score_idx = 0, - unconstrained = False, no_local = False, num_processes = 1): + unconstrained = False, no_local = False, num_processes = 1, use_gpu = use_gpu): """Try to refine a fit by maximising a network score based on transitivity and density. Iteratively move the decision boundary to do this, using starting point from existing model. @@ -65,8 +65,10 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, Quicker, but may be less well refined. num_processes (int) Number of threads to use in the global optimisation step. 
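A note on the refineFit signature introduced above: `use_gpu = use_gpu` cannot work as a default value, because Python evaluates defaults once, when the `def` statement executes; at import time the name is unbound and a NameError results. Patch 172 below replaces it with `use_gpu = False`. A short demonstration:

    # Defaults are evaluated at definition time, so a parameter cannot
    # default to itself (the bug patch 172 fixes):
    try:
        exec("def f(x, use_gpu = use_gpu): pass")
    except NameError as err:
        print("fails when 'def' executes:", err)

    def f(x, use_gpu = False):  # the conventional fix, as adopted in patch 172
        return use_gpu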
- (default = 1) + use_gpu (bool) + Whether to use cugraph for graph analyses + Returns: start_point (tuple) (x, y) co-ordinates of starting point @@ -117,7 +119,8 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, distMat = distances_shared, x_range = x_max, y_range = y_max, - score_idx = score_idx), + score_idx = score_idx, + use_gpu = use_gpu), range(global_grid_resolution)) if gt.openmp_enabled(): @@ -148,7 +151,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, poppunk_refine.thresholdIterate1D(distMat, s_range, slope, start_point[0], start_point[1], mean1[0], mean1[1], num_processes) - global_s = growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx) + global_s = growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, use_gpu = use_gpu) min_idx = np.argmin(np.array(global_s)) if min_idx > 0 and min_idx < len(s_range) - 1: bounds = [s_range[min_idx-1], s_range[min_idx+1]] @@ -162,7 +165,8 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, local_s = scipy.optimize.minimize_scalar(newNetwork, bounds=bounds, method='Bounded', options={'disp': True}, - args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx)) + args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx, use_gpu = use_gpu), + ) optimised_s = local_s.x # Convert to x_max, y_max if needed @@ -180,7 +184,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, return start_point, optimal_x, optimal_y, min_move, max_move -def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_idx = 0): +def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_idx = 0, use_gpu = False): """Construct a network, then add edges to it iteratively. Input is from ``pp_sketchlib.iterateBoundary1D`` or``pp_sketchlib.iterateBoundary2D`` @@ -201,6 +205,9 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ [default = 0] thread_idx (int) Optional thread idx (if multithreaded) to offset progress bar by + use_gpu (bool) + Whether to use cugraph for graph analyses + Returns: scores (list) -1 * network score for each of x_range. 
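The global optimisation step shown above fans the grid search out over a process pool, one y offset per task, with the fixed arguments bound via functools.partial. A stripped-down sketch of the pattern (toy scoring function; names are illustrative, not the PopPUNK API):

    from functools import partial
    from multiprocessing import Pool

    def score_row(y_idx, x_range, y_range):
        # stand-in for building and scoring a network at each grid point
        return [-(x * y_range[y_idx]) for x in x_range]

    if __name__ == '__main__':
        x_range = [0.1, 0.2, 0.3]
        y_range = [0.1, 0.2, 0.3]
        with Pool(processes=2) as pool:
            grid = pool.map(partial(score_row, x_range=x_range, y_range=y_range),
                            range(len(y_range)))
        print(grid)  # one row of scores per y offset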
@@ -219,12 +226,17 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ # At first offset, make a new network, otherwise just add the new edges if prev_idx == 0: G = constructNetwork(sample_names, sample_names, edge_list, -1, - summarise=False, edge_list=True) + summarise=False, edge_list=True, use_gpu = use_gpu) else: - G.add_edge_list(edge_list) + if use_gpu: + G = constructNetwork(sample_names, sample_names, edge_list, -1, + summarise=False, edge_list=True, use_gpu = use_gpu) + else: + # Not currently possible with GPU - https://github.com/rapidsai/cugraph/issues/805 + G.add_edge_list(edge_list) # Add score into vector for any offsets passed (should usually just be one) for s in range(prev_idx, idx): - scores.append(-networkSummary(G, score_idx > 0)[1][score_idx]) + scores.append(-networkSummary(G, score_idx > 0, use_gpu = use_gpu)[1][score_idx]) pbar.update(1) prev_idx = idx edge_list = [] @@ -233,18 +245,23 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ # Add score for final offset(s) at end of loop if prev_idx == 0: G = constructNetwork(sample_names, sample_names, edge_list, -1, - summarise=False, edge_list=True) + summarise=False, edge_list=True, use_gpu = use_gpu) else: - G.add_edge_list(edge_list) + if use_gpu: + G = constructNetwork(sample_names, sample_names, edge_list, -1, + summarise=False, edge_list=True, use_gpu = use_gpu) + else: + # Not currently possible with GPU - https://github.com/rapidsai/cugraph/issues/805 + G.add_edge_list(edge_list) for s in range(prev_idx, len(s_range)): - scores.append(-networkSummary(G, score_idx > 0)[1][score_idx]) + scores.append(-networkSummary(G, score_idx > 0, use_gpu = use_gpu)[1][score_idx]) pbar.update(1) return(scores) def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, - slope=2, score_idx=0, cpus=1): + slope=2, score_idx=0, cpus=1, use_gpu = False): """Wrapper function for :func:`~PopPUNK.network.constructNetwork` which is called by optimisation functions moving a triangular decision boundary. @@ -273,6 +290,9 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, [default = 0] cpus (int) Number of CPUs to use for calculating assignment + use_gpu (bool) + Whether to use cugraph for graph analysis + Returns: score (float) -1 * network score. Where network score is from :func:`~PopPUNK.network.networkSummary` @@ -294,13 +314,14 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, # Make network boundary_assignments = poppunk_refine.assignThreshold(distMat, slope, x_max, y_max, cpus) - G = constructNetwork(sample_names, sample_names, boundary_assignments, -1, summarise = False) + G = constructNetwork(sample_names, sample_names, boundary_assignments, -1, summarise = False, + use_gpu = use_gpu) # Return score - score = networkSummary(G, score_idx > 0)[1][score_idx] + score = networkSummary(G, score_idx > 0, use_gpu = use_gpu)[1][score_idx] return(-score) -def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0): +def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, use_gpu = use_gpu): """Wrapper function for thresholdIterate2D and :func:`growNetwork`.
For a given y_max, constructs networks across x_range and returns a list @@ -320,6 +341,9 @@ def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0): score_idx (int) Index of score from :func:`~PopPUNK.network.networkSummary` to use [default = 0] + use_gpu (bool) + Whether to use cugraph for graph analysis + Returns: scores (list) -1 * network score for each of x_range. @@ -334,7 +358,7 @@ def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0): y_max = y_range[y_idx] i_vec, j_vec, idx_vec = \ poppunk_refine.thresholdIterate2D(distMat, x_range, y_max) - scores = growNetwork(sample_names, i_vec, j_vec, idx_vec, x_range, score_idx, y_idx) + scores = growNetwork(sample_names, i_vec, j_vec, idx_vec, x_range, score_idx, y_idx, use_gpu = use_gpu) return(scores) def readManualStart(startFile): From 2e1c802791a26678f8cb2272439cb96cb3d16852 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 06:57:11 +0000 Subject: [PATCH 171/327] Change kwarg to arg in optimise --- PopPUNK/refine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 1baf2ff3..fe873317 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -165,7 +165,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, local_s = scipy.optimize.minimize_scalar(newNetwork, bounds=bounds, method='Bounded', options={'disp': True}, - args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx, use_gpu = use_gpu), + args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx, use_gpu), ) optimised_s = local_s.x From 4e68b7baf99f78748ee0a17202c6f56b2aba285e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 06:58:23 +0000 Subject: [PATCH 172/327] Change default arguments --- PopPUNK/refine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index fe873317..2af32f64 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -32,7 +32,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, max_move, min_move, slope = 2, score_idx = 0, - unconstrained = False, no_local = False, num_processes = 1, use_gpu = use_gpu): + unconstrained = False, no_local = False, num_processes = 1, use_gpu = False): """Try to refine a fit by maximising a network score based on transitivity and density. Iteratively move the decision boundary to do this, using starting point from existing model. @@ -321,7 +321,7 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, score = networkSummary(G, score_idx > 0, use_gpu = use_gpu)[1][score_idx] return(-score) -def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, use_gpu = use_gpu): +def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, use_gpu = False): """Wrapper function for thresholdIterate2D and :func:`growNetwork`. 
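On the patch 171 fix above: keyword syntax is not valid inside a tuple literal, and `scipy.optimize.minimize_scalar` forwards its `args` tuple to the objective positionally in any case, so the extra options have to be appended as plain values in the objective's parameter order. A self-contained toy, assuming scipy is installed:

    from scipy.optimize import minimize_scalar

    def objective(s, offset, use_gpu):
        # toy objective; the real one builds a network at s and scores it
        return (s - offset) ** 2

    res = minimize_scalar(objective, bounds=(0, 1), method='Bounded',
                          args=(0.25, False))  # extra values passed positionally
    print(round(res.x, 3))  # ~0.25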
For a given y_max, constructs networks across x_range and returns a list From a2ce78919a79d1ecbf793d1ce36ccd6dd948c307 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 07:14:50 +0000 Subject: [PATCH 173/327] Cascade use_gpu argument through functions --- PopPUNK/__main__.py | 3 ++- PopPUNK/refine.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index c9b2dfa0..41e2f198 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -415,7 +415,8 @@ def main(): args.unconstrained, args.score_idx, args.no_local, - args.threads) + args.threads, + use_gpu = args.gpu_graph) new_model.plot(distMat) model = new_model elif args.fit_model == "threshold": diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 2af32f64..8bff91dd 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -165,7 +165,8 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, local_s = scipy.optimize.minimize_scalar(newNetwork, bounds=bounds, method='Bounded', options={'disp': True}, - args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx, use_gpu), + args = (sample_names, distMat, start_point, mean1, gradient, + slope, score_idx, num_processes, use_gpu), ) optimised_s = local_s.x From 97487851cbd56293d8ecba7135dc7e7e9474a604 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 07:16:02 +0000 Subject: [PATCH 174/327] Change refine arguments --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 41e2f198..3a55283e 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -416,7 +416,7 @@ def main(): args.score_idx, args.no_local, args.threads, - use_gpu = args.gpu_graph) + args.gpu_graph) new_model.plot(distMat) model = new_model elif args.fit_model == "threshold": From e7ff375777a60bc3f14e27716d076bfef3a8508d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 07:21:14 +0000 Subject: [PATCH 175/327] Communicate GPU use --- PopPUNK/models.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 902cf738..5bac8220 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -530,7 +530,7 @@ def __init__(self, outPrefix): self.unconstrained = False def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False, - unconstrained = False, score_idx = 0, no_local = False, threads = 1): + unconstrained = False, score_idx = 0, no_local = False, threads = 1, use_gpu = False): '''Extends :func:`~ClusterFit.fit` Fits the distances by optimising network score, by calling @@ -553,11 +553,9 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi startFile (str) A file defining an initial fit, rather than one from ``--fit-model``. See documentation for format. - (default = None). indiv_refine (bool) Run refinement for core and accessory distances separately - (default = False). unconstrained (bool) If True, search in 2D and change the slope of the boundary @@ -569,8 +567,10 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi Quicker, but may be less well refined. num_processes (int) Number of threads to use in the global optimisation step. 
- (default = 1) + use_gpu (bool) + Whether to use cugraph for graph analyses + Returns: y (numpy.array) Cluster assignments of samples in X @@ -581,6 +581,14 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi self.min_move = min_move self.unconstrained = unconstrained + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + # Get starting point model.no_scale() if startFile: @@ -618,7 +626,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi refineFit(X/self.scale, sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move, slope = 2, score_idx = score_idx, unconstrained = unconstrained, - no_local = no_local, num_processes = threads) + no_local = no_local, num_processes = threads, use_gpu = use_gpu) self.fitted = True # Try and do a 1D refinement for both core and accessory @@ -631,12 +639,14 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi start_point, self.core_boundary, core_acc, self.min_move, self.max_move = \ refineFit(X/self.scale, sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move, - slope = 0, score_idx = score_idx, no_local = no_local,num_processes = threads) + slope = 0, score_idx = score_idx, no_local = no_local,num_processes = threads, + use_gpu = use_gpu) # optimise accessory distance boundary start_point, acc_core, self.accessory_boundary, self.min_move, self.max_move = \ refineFit(X/self.scale, sample_names, self.start_s,self.mean0, self.mean1, self.max_move, self.min_move, - slope = 1, score_idx = score_idx, no_local = no_local, num_processes = threads) + slope = 1, score_idx = score_idx, no_local = no_local, num_processes = threads, + use_gpu = use_gpu) self.indiv_fitted = True except RuntimeError as e: sys.stderr.write("Could not separately refine core and accessory boundaries. " From 9eba43877fa7f321f76e8137d5c31a7dc7dedcff Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:08:17 +0000 Subject: [PATCH 176/327] Improve graph reconstruction in refinement --- PopPUNK/refine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 8bff91dd..90840633 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -230,8 +230,10 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ summarise=False, edge_list=True, use_gpu = use_gpu) else: if use_gpu: - G = constructNetwork(sample_names, sample_names, edge_list, -1, - summarise=False, edge_list=True, use_gpu = use_gpu) + G_extra_df = cudf.DataFrame(edge_list, columns =['source', 'destination']) + G_df = cudf.concat([G.view_edge_list(),G_extra_df], ignore_index = True) + G = cugraph.Graph() + G.from_cudf_edgelist(G_df) else: # Not currently possible with GPU - https://github.com/rapidsai/cugraph/issues/805 G.add_edge_list(edge_list) From ca2992b62796885979509ddbb1f5faa2e81e306c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:09:50 +0000 Subject: [PATCH 177/327] Load CUDA libraries --- PopPUNK/refine.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 90840633..7453e80d 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -214,6 +214,15 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ -1 * network score for each of x_range. 
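Patch 176 above works around cugraph graphs being immutable once built (https://github.com/rapidsai/cugraph/issues/805): the existing edge list is pulled back into cudf, concatenated with the new edges, and a fresh graph is constructed. Isolated, the pattern looks like this (a sketch using the same cudf/cugraph calls as the diff, assuming the RAPIDS API of the era these patches target):

    import cudf
    import cugraph

    def extend_gpu_graph(G, new_edges):
        # rebuild from the concatenated edge list instead of adding in place
        current = G.view_edge_list()
        current.columns = ['source', 'destination']  # names must match for concat
        extra = cudf.DataFrame(new_edges, columns=['source', 'destination'])
        combined = cudf.concat([current, extra], ignore_index=True)
        G_new = cugraph.Graph()
        G_new.from_cudf_edgelist(combined)
        return G_new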
Where network score is from :func:`~PopPUNK.network.networkSummary` """ + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + scores = [] edge_list = [] prev_idx = 0 From 64e6284c1ccf1cf2f46e6d7263442e057fabbc30 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:11:44 +0000 Subject: [PATCH 178/327] Add debug message --- PopPUNK/refine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 7453e80d..f151e94a 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -241,6 +241,7 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ if use_gpu: G_extra_df = cudf.DataFrame(edge_list, columns =['source', 'destination']) G_df = cudf.concat([G.view_edge_list(),G_extra_df], ignore_index = True) + print("DF is " + str(G_df)) G = cugraph.Graph() G.from_cudf_edgelist(G_df) else: From 181a3282b45a320a40d03a0e546d88b31d495519 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:13:04 +0000 Subject: [PATCH 179/327] Fix column names --- PopPUNK/refine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index f151e94a..13055d7d 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -239,9 +239,8 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ summarise=False, edge_list=True, use_gpu = use_gpu) else: if use_gpu: - G_extra_df = cudf.DataFrame(edge_list, columns =['source', 'destination']) + G_extra_df = cudf.DataFrame(edge_list, columns =['src', 'dst']) G_df = cudf.concat([G.view_edge_list(),G_extra_df], ignore_index = True) - print("DF is " + str(G_df)) G = cugraph.Graph() G.from_cudf_edgelist(G_df) else: From 900ed2450b5d61cdf774ed47f0dd871c66df5ab6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:14:49 +0000 Subject: [PATCH 180/327] Make column names consistent --- PopPUNK/refine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 13055d7d..073910d1 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -239,8 +239,10 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ summarise=False, edge_list=True, use_gpu = use_gpu) else: if use_gpu: - G_extra_df = cudf.DataFrame(edge_list, columns =['src', 'dst']) - G_df = cudf.concat([G.view_edge_list(),G_extra_df], ignore_index = True) + G_current_df = G.view_edge_list() + G_current_df.columns = ['source','destination'] + G_extra_df = cudf.DataFrame(edge_list, columns =['source','destination']) + G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) G = cugraph.Graph() G.from_cudf_edgelist(G_df) else: From 5601a2b12973b1c3315c2d42fa3c277ac49e9a24 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:34:46 +0000 Subject: [PATCH 181/327] Updating networks with CUDA --- PopPUNK/network.py | 60 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d3d1ab26..f71c977e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -206,7 +206,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df = G.view_edge_list() G_df.columns = ['source','destination'] G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] - # Add self-loop 
if needing + # Add self-loop if needed max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(reference_names)-1 if max_in_df.item() != max_in_vertex_labels: @@ -651,6 +651,8 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, be annotated as an edge attribute threads (int) Number of threads to use if new db created + use_gpu (bool) + Whether to use cugraph for analysis (default = 1) Returns: @@ -738,18 +740,54 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, new_edges.append(edge_tuple) # finish by updating the network - G.add_vertex(len(qList)) - - if weights is not None: - eweight = G.new_ep("float") - G.add_edge_list(new_edges, eprops = [eweight]) - G.edge_properties["weight"] = eweight + if use_gpu: + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + + # construct updated graph + G_current_df = G.view_edge_list() + if weights is not None: + G_current_df.columns = ['source','destination','weights'] + G_extra_df = cudf.DataFrame(edge_list, columns =['source','destination','weights']) + G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) + else: + G_current_df.columns = ['source','destination'] + G_extra_df = cudf.DataFrame(edge_list, columns =['source','destination']) + G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) + G = cugraph.Graph() + G.from_cudf_edgelist(G_df) + + # use self-loop to ensure all nodes are present + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = ref_count + len(qList) - 1 + if max_in_df.item() != max_in_vertex_labels: + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = [max_in_vertex_labels] + G_self_loop['destination'] = [max_in_vertex_labels] + G = cudf.concat([G,G_self_loop], ignore_index = True) + # Construct graph + G = cugraph.Graph() + G.from_cudf_edgelist(G_df) + else: - G.add_edge_list(new_edges) + G.add_vertex(len(qList)) + + if weights is not None: + eweight = G.new_ep("float") + G.add_edge_list(new_edges, eprops = [eweight]) + G.edge_properties["weight"] = eweight + else: + G.add_edge_list(new_edges) - # including the vertex ID property map - for i, q in enumerate(qList): - G.vp.id[i + len(rList)] = q + # including the vertex ID property map + for i, q in enumerate(qList): + G.vp.id[i + len(rList)] = q return qqDistMat From 15c106af974236b3783fb4d023bb9889adfc876c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 21:21:28 +0000 Subject: [PATCH 182/327] Change betweenness processing --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f71c977e..aef891c1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -606,7 +606,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if len(betweenness) > 1: mean_bt = np.mean(betweenness) weighted_mean_bt = np.average(betweenness, weights=sizes) - else: + elif len(betweenness) == 1: mean_bt = betweenness[0] weighted_mean_bt = betweenness[0] From f950cd8bfbfafba45a130fbf94ff2579c52d4b9e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 22:13:11 +0000 Subject: [PATCH 183/327] Add GPU options to assign --- PopPUNK/assign.py | 44 ++++++++++++++++++++++++++++++-------------- PopPUNK/network.py | 3 ++- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index b1b11236..ae2b0bcf
100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -46,7 +46,11 @@ def assign_query(dbFuncs, accessory_only, web, json_sketch, - save_partial_query_graph): + save_partial_query_graph, + gpu_sketch, + gpu_dist, + gpu_graph, + deviceid): """Code for assign query mode. Written as a separate function so it can be called by web APIs""" @@ -60,6 +64,7 @@ def assign_query(dbFuncs, from .network import extractReferences from .network import addQueryToNetwork from .network import printClusters + from .network import save_network from .plot import writeClusterCsv @@ -133,7 +138,9 @@ def assign_query(dbFuncs, threads, overwrite, codon_phased = codon_phased, - calc_random = False) + calc_random = False, + use_gpu = gpu_sketch, + deviceid = deviceid) # run query qrDistMat = queryDatabase(rNames = rNames, qNames = qNames, @@ -142,7 +149,8 @@ def assign_query(dbFuncs, klist = kmers, self = False, number_plot_fits = plot_fit, - threads = threads) + threads = threads, + use_gpu = gpu_dist) # QC distance matrix qcPass = qcDistMat(qrDistMat, rNames, qNames, max_pi_dist, max_a_dist, reference_isolate) @@ -153,7 +161,8 @@ def assign_query(dbFuncs, rNames, ref_graph = use_ref_graph, core_only = core_only, - accessory_only = accessory_only) + accessory_only = accessory_only, + use_gpu = gpu_graph) if model.type == 'lineage': # Assign lineages by calculating query-query information @@ -165,7 +174,8 @@ def assign_query(dbFuncs, klist = kmers, self = True, number_plot_fits = 0, - threads = threads) + threads = threads, + use_gpu = gpu_dist) model.extend(qqDistMat, qrDistMat) genomeNetwork = {} @@ -182,7 +192,8 @@ def assign_query(dbFuncs, assignment, 0, edge_list = True, - weights=weights) + weights=weights, + use_gpu = gpu_graph) isolateClustering[rank] = \ printClusters(genomeNetwork[rank], @@ -214,7 +225,7 @@ def assign_query(dbFuncs, genomeNetwork, kmers, queryAssignments, model, output, update_db, strand_preserved, - weights = weights, threads = threads) + weights = weights, threads = threads, use_gpu = gpu_graph) isolateClustering = \ {'combined': printClusters(genomeNetwork, rNames + qNames, @@ -237,12 +248,12 @@ def assign_query(dbFuncs, joinDBs(ref_db, output, output, {"threads": threads, "strand_preserved": strand_preserved}) if model.type == 'lineage': - genomeNetwork[min(model.ranks)].save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') + save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) # Save sparse distance matrices and updated model model.outPrefix = os.path.basename(output) model.save() else: - genomeNetwork.save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') + save_network(genomeNetwork, prefix = output, suffix = '_graph', use_gpu = gpu_graph) # Update distance matrices with all calculated distances if distances == None: @@ -289,7 +300,7 @@ def assign_query(dbFuncs, dbOrder = rNames + qNames newRepresentativesIndices, newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ - extractReferences(genomeNetwork, dbOrder, output, rNames, threads = threads) + extractReferences(genomeNetwork, dbOrder, output, rNames, threads = threads, use_gpu = gpu_graph) # intersection that maintains order newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] @@ -303,7 +314,7 @@ def assign_query(dbFuncs, postpruning_combined_seq, newDistMat = \ prune_distance_matrix(combined_seq, names_to_remove, complete_distMat, output + "/" + os.path.basename(output) + ".refs.dists") - 
genomeNetwork.save(output + "/" + os.path.basename(output) + '.refs_graph.gt', fmt = 'gt') + save_network(genomeNetwork, prefix = output, suffix = 'refs_graph', use_gpu = gpu_graph) removeFromDB(output, output, names_to_remove) os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", output + "/" + os.path.basename(output) + ".refs.h5") @@ -314,9 +325,9 @@ def assign_query(dbFuncs, storePickle(rNames, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: if model.type == 'lineage': - genomeNetwork[min(model.ranks)].save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') + save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) else: - genomeNetwork.save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') + save_network(genomeNetwork, prefix = output, suffix = '_graph', use_gpu = gpu_graph) return(isolateClustering) @@ -404,6 +415,7 @@ def get_options(): other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]') other.add_argument('--gpu-sketch', default=False, action='store_true', help='Use a GPU when calculating sketches (read data only) [default = False]') other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]') + other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when constructing networks [default = False]') other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') other.add_argument('--version', action='version', version='%(prog)s '+__version__) @@ -508,7 +520,11 @@ def main(): args.accessory_only, web=False, json_sketch=None, - save_partial_query_graph=False) + save_partial_query_graph=False, + args.gpu_sketch, + args.gpu_dist, + arg.gpu_graph, + args.deviceid) sys.stderr.write("\nDone\n") diff --git a/PopPUNK/network.py b/PopPUNK/network.py index aef891c1..011d6a37 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -618,7 +618,8 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, assignments, model, queryDB, queryQuery = False, - strand_preserved = False, weights = None, threads = 1): + strand_preserved = False, weights = None, threads = 1, + use_gpu = False): """Finds edges between queries and items in the reference database, and modifies the network to include them. From a6a037eefcc2a31edaa9dfb4aa4fd412d528d65c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 22:33:38 +0000 Subject: [PATCH 184/327] Change function argument order --- PopPUNK/assign.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index ae2b0bcf..be44325f 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -44,13 +44,13 @@ def assign_query(dbFuncs, external_clustering, core_only, accessory_only, - web, - json_sketch, - save_partial_query_graph, gpu_sketch, gpu_dist, gpu_graph, - deviceid): + deviceid, + web, + json_sketch, + save_partial_query_graph): """Code for assign query mode. 
Written as a separate function so it can be called by web APIs""" @@ -518,13 +518,13 @@ def main(): args.external_clustering, args.core_only, args.accessory_only, - web=False, - json_sketch=None, - save_partial_query_graph=False, args.gpu_sketch, args.gpu_dist, arg.gpu_graph, - args.deviceid) + args.deviceid, + web=False, + json_sketch=None, + save_partial_query_graph=False) sys.stderr.write("\nDone\n") From 2b65e614c0556c63ee950f1b0a948bf7e00e18da Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 22:35:52 +0000 Subject: [PATCH 185/327] Change argument typo --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index be44325f..0eacef19 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -520,7 +520,7 @@ def main(): args.accessory_only, args.gpu_sketch, args.gpu_dist, - arg.gpu_graph, + args.gpu_graph, args.deviceid, web=False, json_sketch=None, From 9cc9ad6344567ee5b0169695268525e6c7e136bb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 20:54:07 +0000 Subject: [PATCH 186/327] Add CUDA load for querying --- PopPUNK/network.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 011d6a37..31eb67cb 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -68,6 +68,15 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, # If a refined fit, may use just core or accessory distances dir_prefix = network_dir + "/" + os.path.basename(network_dir) if use_gpu: + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + graph_suffix = '.csv.bz2' else: graph_suffix = '.gt' From 403506272929cd2bfc08a26dfb2a998388f27928 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 20:59:35 +0000 Subject: [PATCH 187/327] Fix graph suffix for GPU --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 31eb67cb..fb70b7e0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -77,7 +77,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, sys.stderr.write("cugraph and cudf unavailable\n") raise ImportError(e) - graph_suffix = '.csv.bz2' + graph_suffix = '.csv.gz' else: graph_suffix = '.gt' if core_only and model.type == 'refine': From 01dcf3758223d20fcd48bab68530f0548a4d2aff Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:03:17 +0000 Subject: [PATCH 188/327] Quote column name --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index fb70b7e0..03258a09 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -100,7 +100,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, if use_gpu: G_df = cudf.read_csv(network_file, compression = 'gzip') - if weights in G_df.columns: + if 'weights' in G_df.columns: genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: genomeNetwork.from_cudf_edgelist(G_df,renumber=False) From cbeebaf9d9dc494a870d2fabe46a806e1cbad266 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:05:59 +0000 Subject: [PATCH 189/327] Define graph name --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 03258a09..d288f61d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ 
-100,6 +100,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, if use_gpu: G_df = cudf.read_csv(network_file, compression = 'gzip') + genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: From aee03b142af3829c5af98b73e9ee95e2c00b31fe Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:11:04 +0000 Subject: [PATCH 190/327] Change cudf column names on loading --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d288f61d..92e40e2d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -100,6 +100,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, if use_gpu: G_df = cudf.read_csv(network_file, compression = 'gzip') + G_df.columns = ['source','destination'] genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) From a0fce563ef520e5e98de8e3355644ebfa8259dc0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:12:13 +0000 Subject: [PATCH 191/327] Add weights to column names --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 92e40e2d..b80ea4e2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -100,11 +100,12 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, if use_gpu: G_df = cudf.read_csv(network_file, compression = 'gzip') - G_df.columns = ['source','destination'] genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: + G_df.columns = ['source','destination','weights'] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: + G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) else: genomeNetwork = gt.load_graph(network_file) From 43d13a7a6fa7bfa67827aed0f23689582689b416 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:17:17 +0000 Subject: [PATCH 192/327] Print formatted DF --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b80ea4e2..eea657aa 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -105,6 +105,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, G_df.columns = ['source','destination','weights'] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: + print("DF is " + str(G_df)) G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) else: From 2ea95f4f60803c17164090ab2f0306aa266c7d26 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:20:53 +0000 Subject: [PATCH 193/327] Remove Pandas index from CSV --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index eea657aa..4f0b69b3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1135,7 +1135,7 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): file_name = file_name + suffix if use_gpu: G.to_pandas_edgelist().to_csv(file_name + '.csv.gz', - compression='gzip') + compression='gzip', index = False) else: G.save(file_name + '.gt', fmt = 'gt') From 41868b9a2dc991fdbdc315f00e6ddcedf5e86e6e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:26:05 +0000 Subject: [PATCH 194/327] Update graph loading message --- PopPUNK/network.py 
| 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4f0b69b3..83cb0590 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -105,15 +105,16 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, G_df.columns = ['source','destination','weights'] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: - print("DF is " + str(G_df)) G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) + sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.number_of_vertices()))) + " samples\n") else: genomeNetwork = gt.load_graph(network_file) - sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") + sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # Ensure all in dists are in final network - networkMissing = set(map(str,set(range(len(refList))).difference(list(genomeNetwork.vertices())))) + vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph)) + networkMissing = set(set(range(len(refList))).difference(vertex_list)) if len(networkMissing) > 0: sys.stderr.write("WARNING: Samples " + ",".join(networkMissing) + " are missing from the final network\n") From 39e80a76134a63d52cf109a0edd0777ad56278fe Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:26:55 +0000 Subject: [PATCH 195/327] Update graph loading message again --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 83cb0590..2c28f8e4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -107,7 +107,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, else: G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) - sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.number_of_vertices()))) + " samples\n") + sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") else: genomeNetwork = gt.load_graph(network_file) sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") From c12d08a17cdcd1d398d837248a126a8ccae28e14 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:27:48 +0000 Subject: [PATCH 196/327] Change gpu option --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2c28f8e4..827db192 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -113,7 +113,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # Ensure all in dists are in final network - vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph)) + vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = use_gpu)) networkMissing = set(set(range(len(refList))).difference(vertex_list)) if len(networkMissing) > 0: sys.stderr.write("WARNING: Samples " + ",".join(networkMissing) + " are missing from the final network\n") From 04ee820bb417c14511b10ac0c011bd8d909a4534 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:30:48 +0000 Subject: [PATCH 197/327] Change name of tuples --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 827db192..90d67bae 100644 
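Patches 188-194 above settle on persisting the GPU network as a gzipped CSV edge list. Pulled out of the diffs, the save/load round trip looks like this (the cudf/cugraph calls are the ones used in the patches; weights handling follows patch 191):

    import cudf
    import cugraph

    def save_gpu_network(G, file_name):
        G.to_pandas_edgelist().to_csv(file_name + '.csv.gz',
                                      compression='gzip', index=False)

    def load_gpu_network(file_name):
        G_df = cudf.read_csv(file_name + '.csv.gz', compression='gzip')
        G = cugraph.Graph()
        if 'weights' in G_df.columns:
            G_df.columns = ['source', 'destination', 'weights']
            G.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False)
        else:
            G_df.columns = ['source', 'destination']
            G.from_cudf_edgelist(G_df, renumber=False)
        return G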
--- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -769,11 +769,11 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, G_current_df = G.view_edge_list() if weights is not None: G_current_df.columns = ['source','destination','weights'] - G_extra_df = cudf.DataFrame(edge_list, columns =['source','destination','weights']) + G_extra_df = cudf.DataFrame(new_edges, columns =['source','destination','weights']) G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) else: G_current_df.columns = ['source','destination'] - G_extra_df = cudf.DataFrame(edge_list, columns =['source','destination']) + G_extra_df = cudf.DataFrame(new_edges, columns =['source','destination']) G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) G = cugraph.Graph() G.from_cudf_edgelist(G_df) From 655025b3fba743db7414dfa230f9e36d8b00dd27 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:34:41 +0000 Subject: [PATCH 198/327] Change printClusters to use GPU --- PopPUNK/assign.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 0eacef19..1bcfb678 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -198,7 +198,8 @@ def assign_query(dbFuncs, isolateClustering[rank] = \ printClusters(genomeNetwork[rank], rNames + qNames, - printCSV = False) + printCSV = False, + use_gpu = gpu_graph) overall_lineage = createOverallLineage(model.ranks, isolateClustering) writeClusterCsv( @@ -232,7 +233,8 @@ def assign_query(dbFuncs, output + "/" + os.path.basename(output), old_cluster_file, external_clustering, - write_references or update_db)} + write_references or update_db, + use_gpu = gpu_graph)} # Update DB as requested dists_out = output + "/" + os.path.basename(output) + ".dists" From 50cc3172196929b2bc5e6d20f938240afa7894d7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:38:10 +0000 Subject: [PATCH 199/327] Edit component assignments --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 90d67bae..8442a514 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -862,7 +862,9 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, component_assignments = cugraph.components.connectivity.connected_components(G) component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] + print("Assignments: " + str(component_assignments)) for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment + print("Index: " + str(isolate_index)) component = component_assignments['labels'].iloc[isolate_index].item() component_rank_bool = component_frequencies.index == component component_rank = np.argmax(component_rank_bool.to_array()) From dc9a702de26529789054c9835043cca3608027b0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:54:24 +0000 Subject: [PATCH 200/327] Print node count --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 8442a514..d9132e83 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -859,6 +859,7 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, sys.stderr.write("cugraph and cudf unavailable\n") raise ImportError(e) + print("num nodes is " + str(G.number_of_vertices())) component_assignments = cugraph.components.connectivity.connected_components(G) 
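# connected_components here gives back a cudf DataFrame with one row per
# vertex and the component id in its 'labels' column; the
# value_counts(sort = True, ascending = False) call below then orders those
# ids by component size, so the largest component receives the lowest
# cluster number (column names as used by the cugraph version this series
# targets)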
component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] From 067149ca40b7fbae9fff509420640beb50278231 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 22:08:41 +0000 Subject: [PATCH 201/327] Return updated graph from function --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d9132e83..15b947d0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -804,7 +804,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, for i, q in enumerate(qList): G.vp.id[i + len(rList)] = q - return qqDistMat + return G, qqDistMat def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, externalClusterCSV = None, printRef = True, printCSV = True, From bde7dd8042bd428361e4f054b0f151b3cfbd2d82 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 22:09:48 +0000 Subject: [PATCH 202/327] Update to be consistent with changes to network function --- PopPUNK/assign.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 1bcfb678..e5c92ec6 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -221,7 +221,8 @@ def assign_query(dbFuncs, weights = qrDistMat else: weights = None - qqDistMat = \ + + genomeNetwork, qqDistMat = \ addQueryToNetwork(dbFuncs, rNames, qNames, genomeNetwork, kmers, queryAssignments, model, output, update_db, From 0aa02c631f192dccf8f1470aa6dbe3ad41b02329 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 22:14:55 +0000 Subject: [PATCH 203/327] Remove debug messages --- PopPUNK/network.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 15b947d0..5e008262 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -859,13 +859,10 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, sys.stderr.write("cugraph and cudf unavailable\n") raise ImportError(e) - print("num nodes is " + str(G.number_of_vertices())) component_assignments = cugraph.components.connectivity.connected_components(G) component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] - print("Assignments: " + str(component_assignments)) for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment - print("Index: " + str(isolate_index)) component = component_assignments['labels'].iloc[isolate_index].item() component_rank_bool = component_frequencies.index == component component_rank = np.argmax(component_rank_bool.to_array()) From eca9b50f60d5298d2e698cd034fe00e1c3e0206e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 09:49:45 +0000 Subject: [PATCH 204/327] Ensure consistency across function arguments --- PopPUNK/__main__.py | 3 +-- PopPUNK/assign.py | 2 +- PopPUNK/models.py | 13 +++++++------ PopPUNK/network.py | 22 +++++++++++----------- PopPUNK/refine.py | 13 +++++++------ 5 files changed, 27 insertions(+), 26 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 6275af83..90f42aaf 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -419,7 +419,6 @@ def main(): args.unconstrained, args.score_idx, args.no_local, - args.threads, args.gpu_graph) new_model.plot(distMat) model = new_model @@ -545,7 +544,7 @@ def main(): fit_type = 'accessory' 
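# the suffix edit below restores the missing underscore, so the network is
# saved under a '<prefix>_graph' file name; this assumes the same naming
# convention as the '_core_graph' paths built in fetchNetwork earlier in
# this series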
genomeNetwork = indivNetworks['accessory'] - save_network(genomeNetwork, prefix = output, suffix = "graph", use_gpu = args.gpu_graph) + save_network(genomeNetwork, prefix = output, suffix = "_graph", use_gpu = args.gpu_graph) #******************************# #* *# diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 75994f9e..5a1dfe1c 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -125,7 +125,7 @@ def assign_query(dbFuncs, for reference in refFile: rNames.append(reference.rstrip()) else: - if os.path.isfile(distances + ",pkl"): + if os.path.isfile(distances + ".pkl"): rNames = readPickle(distances, enforce_self = True, distances=False)[0] elif update_db: sys.stderr.write("Reference distances missing, cannot use --update-db\n") diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 78f90df4..1dd8997a 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -726,12 +726,13 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi self.unconstrained = unconstrained # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if use_gpu: + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) # Get starting point model.no_scale() diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ff0847a1..1d1945d8 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -585,13 +585,13 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): degree_df = G.degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) transitivity = triangle_count/triad_count - print("Triangle count GPU: " + str(triangle_count) + " Triad count GPU: " + str(triad_count)) +# print("Triangle count GPU: " + str(triangle_count) + " Triad count GPU: " + str(triad_count)) else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] - print("Triangle/triad count CPU: " + str(gt.global_clustering(G, ret_counts = True))) +# print("Triangle/triad count CPU: " + str(gt.global_clustering(G, ret_counts = True))) mean_bt = 0 weighted_mean_bt = 0 @@ -704,14 +704,14 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, else: sys.stderr.write("Calculating all query-query distances\n") addRandom(queryDB, qList, kmers, strand_preserved, threads = threads) - qlist1, qlist2, qqDistMat = queryDatabase(rNames = qList, - qNames = qList, - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads) + qqDistMat = queryDatabase(rNames = qList, + qNames = qList, + dbPrefix = queryDB, + queryPrefix = queryDB, + klist = kmers, + self = True, + number_plot_fits = 0, + threads = threads) queryAssignation = model.assign(qqDistMat) for row_idx, (assignment, (ref, query)) in enumerate(zip(queryAssignation, listDistInts(qList, qList, self = True))): @@ -748,7 +748,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, # identify any links between queries and store in the same links dict # links dict now contains lists of links both to original database and new queries # have to use names and link to query list in order to match to node indices - for row_idx, (assignment, (query1, query2)) in enumerate(zip(queryAssignation, 
iterDistRows(qlist1, qlist2, self = True))): + for row_idx, (assignment, (query1, query2)) in enumerate(zip(queryAssignation, iterDistRows(qList, qListp, self = True))): if assignment == model.within_label: if weights is not None: dist = np.linalg.norm(qqDistMat[row_idx, :]) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 5462f7af..e1b0d505 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -216,12 +216,13 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ """ # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if use_gpu: + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) scores = [] edge_list = [] From d7fb6529ffa80a59244cf9ef5c881f76048a7f11 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 17:18:05 +0000 Subject: [PATCH 205/327] Add omitted sys exit when missing distance file Co-authored-by: John Lees --- PopPUNK/assign.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 5a1dfe1c..b1361aab 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -129,6 +129,7 @@ def assign_query(dbFuncs, rNames = readPickle(distances, enforce_self = True, distances=False)[0] elif update_db: sys.stderr.write("Reference distances missing, cannot use --update-db\n") + sys.exit(1) else: rNames = getSeqsInDb(ref_db + "/" + os.path.basename(ref_db) + ".h5") # construct database From 593b26f07e4d229620304b69199792bbf4eec0d9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 17:18:36 +0000 Subject: [PATCH 206/327] Update file name formatting Co-authored-by: John Lees --- scripts/poppunk_batch_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 4b37af81..c5d01e0c 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -402,7 +402,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv"), os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")) for rank in ranks: - os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_rank" + str(rank) + "_fit.npz"), + os.rename(os.path.join(output_dir, os.path.basename(output_dir) + "_rank" + str(rank) + "_fit.npz"), os.path.join(args.output,os.path.basename(args.output) + "_rank" + str(rank) + "_fit.npz")) # Merge with epidemiological data if requested From 69724c0324a566dd35e355582136c708e18efb2b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 17:23:23 +0000 Subject: [PATCH 207/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1d1945d8..5660f8fe 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -222,7 +222,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] # Add self-loop if needed max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - max_in_vertex_labels = len(reference_names)-1 + max_in_vertex_labels = len(reference_names) - 1 if max_in_df.item() != max_in_vertex_labels: G_self_loop = cudf.DataFrame() 
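# a cugraph Graph is reconstructed from its edge list alone, so a reference
# vertex with no retained edges would silently be dropped; the self-loop
# assembled here on the highest-numbered vertex keeps the vertex count equal
# to the number of reference samples (the same trick is later factored out
# into the add_self_loop helper in this series)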
G_self_loop['source'] = [max_in_vertex_labels] From 470fec0bc1c115da64439ff5e5a38dbbd9ef92f2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 17:23:56 +0000 Subject: [PATCH 208/327] Edit whitespace Co-authored-by: John Lees --- scripts/poppunk_batch_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index c5d01e0c..6a6f8eae 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -403,7 +403,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")) for rank in ranks: os.rename(os.path.join(output_dir, os.path.basename(output_dir) + "_rank" + str(rank) + "_fit.npz"), - os.path.join(args.output,os.path.basename(args.output) + "_rank" + str(rank) + "_fit.npz")) + os.path.join(args.output, os.path.basename(args.output) + "_rank" + str(rank) + "_fit.npz")) # Merge with epidemiological data if requested if args.info_csv is not None: From 9f600de9a05abc4b9dc28af0b2700266ae63042d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:00:42 +0000 Subject: [PATCH 209/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5660f8fe..43d1e479 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -221,7 +221,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df.columns = ['source','destination'] G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] # Add self-loop if needed - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_df = np.amax([G_df['source'].max(), G_df['destination'].max()]) max_in_vertex_labels = len(reference_names) - 1 if max_in_df.item() != max_in_vertex_labels: G_self_loop = cudf.DataFrame() From 95702445aa0c4a9d8df5ca10a2bd4bdb8751e61e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:02:12 +0000 Subject: [PATCH 210/327] Fix qList variable name Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 43d1e479..829e4e63 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -748,7 +748,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, # identify any links between queries and store in the same links dict # links dict now contains lists of links both to original database and new queries # have to use names and link to query list in order to match to node indices - for row_idx, (assignment, (query1, query2)) in enumerate(zip(queryAssignation, iterDistRows(qList, qListp, self = True))): + for row_idx, (assignment, (query1, query2)) in enumerate(zip(queryAssignation, iterDistRows(qList, qList, self = True))): if assignment == model.within_label: if weights is not None: dist = np.linalg.norm(qqDistMat[row_idx, :]) From 95fb23516b98f61c81d0875abb99b74e9b440412 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:03:01 +0000 Subject: [PATCH 211/327] Remove debug message Co-authored-by: John Lees --- PopPUNK/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 829e4e63..a830b47d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -591,7 +591,6 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): components 
= len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] -# print("Triangle/triad count CPU: " + str(gt.global_clustering(G, ret_counts = True))) mean_bt = 0 weighted_mean_bt = 0 From f323ccf391a7eb3ec997b044748ee8bdf1158de2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:09:41 +0000 Subject: [PATCH 212/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index a830b47d..0ea1eba6 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -227,7 +227,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_self_loop = cudf.DataFrame() G_self_loop['source'] = [max_in_vertex_labels] G_self_loop['destination'] = [max_in_vertex_labels] - G_ref_df = cudf.concat([G_ref_df,G_self_loop], ignore_index = True) + G_ref_df = cudf.concat([G_ref_df, G_self_loop], ignore_index = True) # Construct graph G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df) From 5333e881cae37865d09d91d9bd1b29f0a699a185 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:10:23 +0000 Subject: [PATCH 213/327] Remove debug message Co-authored-by: John Lees --- PopPUNK/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0ea1eba6..6ebaa035 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -585,7 +585,6 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): degree_df = G.degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) transitivity = triangle_count/triad_count -# print("Triangle count GPU: " + str(triangle_count) + " Triad count GPU: " + str(triad_count)) else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) From 78a295fb1bc1937a368bede2e3c142e8c736e67a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:12:39 +0000 Subject: [PATCH 214/327] Change comment wording Co-authored-by: John Lees --- PopPUNK/refine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index e1b0d505..6743d8a7 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -247,7 +247,8 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ G = cugraph.Graph() G.from_cudf_edgelist(G_df) else: - # Not currently possible with GPU - https://github.com/rapidsai/cugraph/issues/805 + # Adding edges to network not currently possible with GPU - https://github.com/rapidsai/cugraph/issues/805 + # We add to the cuDF, and then reconstruct the network instead G.add_edge_list(edge_list) # Add score into vector for any offsets passed (should usually just be one) for s in range(prev_idx, idx): @@ -453,4 +454,3 @@ def likelihoodBoundary(s, model, start, end, within, between): X = transformLine(s, start, end).reshape(1, -1) responsibilities = model.assign(X, progress = False, values = True) return(responsibilities[0, within] - responsibilities[0, between]) - From 6eacd4b2be7b6d5afd2a25a5b0d34ab7f773416e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:21:54 +0000 Subject: [PATCH 215/327] Reorder column indices Co-authored-by: John Lees --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 
f9427ff5..fff66adc 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -252,7 +252,7 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): to_prune = [] # First check with numpy, which is quicker than iterating over everything - if np.any(distMat[:,1] > a_max) or np.any(distMat[:,0] > c_max): + if np.any(distMat[:, 0] > c_max) or np.any(distMat[:, 1] > a_max): passed = False names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): From 083722bd9e62a58c35603db2b6bc43700d5e53af Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:25:23 +0000 Subject: [PATCH 216/327] Change column indices Co-authored-by: John Lees --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index fff66adc..e8021e00 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -256,7 +256,7 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): passed = False names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): - if distMat[i,0] > c_max or distMat[i,1] > a_max: + if distMat[i, 0] > c_max or distMat[i, 1] > a_max: sys.stderr.write("WARNING: Outlier at c = " + str(distMat[i,0]) + " a = " + str(distMat[i,1]) + " 1:" + ref + " 2:" + query + "\n") if ref_isolate is not None: From bbdf53becc916b66fc436f1292e373da92ee42d9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:27:45 +0000 Subject: [PATCH 217/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index e8021e00..2beba400 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -257,7 +257,7 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): if distMat[i, 0] > c_max or distMat[i, 1] > a_max: - sys.stderr.write("WARNING: Outlier at c = " + str(distMat[i,0]) + " a = " + str(distMat[i,1]) + + sys.stderr.write("WARNING: Outlier at c = " + str(distMat[i, 0]) + " a = " + str(distMat[i, 1]) + " 1:" + ref + " 2:" + query + "\n") if ref_isolate is not None: if ref == ref_isolate: From c7daea2c6267bb9dcb5dd0909775fbc86c1dd6ad Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:40:31 +0000 Subject: [PATCH 218/327] Update assign arguments --- PopPUNK/web.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/PopPUNK/web.py b/PopPUNK/web.py index c1f6060f..f7939d1c 100644 --- a/PopPUNK/web.py +++ b/PopPUNK/web.py @@ -74,12 +74,17 @@ def sketchAssign(): args.assign.plot_fit, args.assign.graph_weights, args.assign.max_a_dist, + args.assign.max_pi_dist, args.assign.model_dir, args.assign.strand_preserved, args.assign.previous_clustering, args.assign.external_clustering, args.assign.core_only, args.assign.accessory_only, + args.assign.gpu_sketch, + args.assign.gpu_dist, + args.assign.gpu_graph, + args.assign.deviceid, args.assign.web, sketch_dict["sketch"], args.assign.save_partial_query_graph) @@ -323,4 +328,4 @@ def main(): scheduler.init_app(app) scheduler.start() atexit.register(lambda: scheduler.shutdown()) - app.run(debug=False,use_reloader=False) \ No newline at end of file + app.run(debug=False,use_reloader=False) From e18ed35ad03f3c67fe362c07c26a115a22cb3910 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:56:06 +0000 
Subject: [PATCH 219/327] Make CUDA library imports global --- PopPUNK/models.py | 19 ++++++++++++------- PopPUNK/network.py | 23 +++++++++++++++-------- PopPUNK/refine.py | 19 ++++++++++++------- PopPUNK/sparse_mst.py | 20 +++++++++++++------- 4 files changed, 52 insertions(+), 29 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 1dd8997a..ac46e40e 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -33,6 +33,15 @@ sys.stderr.write("This version of PopPUNK requires python v3.8 or higher\n") sys.exit(0) +# GPU support +try: + import cugraph + import cudf + gpu_lib = True +except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + gpu_lib = False + import pp_sketchlib import poppunk_refine @@ -726,13 +735,9 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi self.unconstrained = unconstrained # load CUDA libraries - if use_gpu: - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if use_gpu and not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) # Get starting point model.no_scale() diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6ebaa035..9c864b69 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -22,6 +22,15 @@ import graph_tool.all as gt import dendropy +# GPU support +try: + import cugraph + import cudf + gpu_lib = True +except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + gpu_lib = False + from .__main__ import accepted_weights_types from .sketchlib import addRandom @@ -67,19 +76,17 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, """ # If a refined fit, may use just core or accessory distances dir_prefix = network_dir + "/" + os.path.basename(network_dir) - if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + # load CUDA libraries + if use_gpu and not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) + if use_gpu: graph_suffix = '.csv.gz' else: graph_suffix = '.gt' + if core_only and model.type == 'refine': model.slope = 0 network_file = dir_prefix + '_core_graph' + graph_suffix diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 6743d8a7..2095e82e 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -24,6 +24,15 @@ import poppunk_refine import graph_tool.all as gt +# GPU support +try: + import cugraph + import cudf + gpu_lib = True +except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + gpu_lib = False + from .network import constructNetwork from .network import networkSummary @@ -216,13 +225,9 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ """ # load CUDA libraries - if use_gpu: - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if use_gpu and not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) scores = [] edge_list = [] diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 34b47763..adfce4d2 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -12,6 +12,15 @@ import pandas as pd from scipy import sparse +# GPU support +try: + import cugraph + import cudf + gpu_lib = True +except ImportError as e: + sys.stderr.write("cugraph and cudf 
unavailable\n") + gpu_lib = False + # import poppunk package from .__init__ import __version__ @@ -61,13 +70,10 @@ def main(): args = get_options() import graph_tool.all as gt - try: - import cugraph - import cudf - except ImportError as e: - if args.gpu_graph: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + # load CUDA libraries + if use_gpu and not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) # Read in sample names if (args.distance_pkl is not None) ^ (args.previous_clustering is not None): From fcf285878d94b545a10453aded5f7e2719b6c119 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 22:02:35 +0000 Subject: [PATCH 220/327] Only import GPU libraries once --- PopPUNK/network.py | 50 ++++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9c864b69..cda818fb 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -203,13 +203,9 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) # For large network, use more approximate method for extracting references reference = {} @@ -489,13 +485,9 @@ def constructNetwork(rlist, qlist, assignments, within_label, # load GPU libraries if necessary if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) # create DataFrame using edge tuples if weights is not None or sparse_input is not None: @@ -576,13 +568,9 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): """ if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) component_assignments = cugraph.components.connectivity.connected_components(G) component_nums = component_assignments['labels'].unique().astype(int) @@ -765,13 +753,9 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, # finish by updating the network if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) # construct updated graph G_current_df = G.view_edge_list() @@ -859,13 +843,9 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, # get a sorted list of component assignments if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) component_assignments = cugraph.components.connectivity.connected_components(G) component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) From 090994f1969fe6edc08c43b7b028d19d129e7538 Mon Sep 
17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 22:12:55 +0000 Subject: [PATCH 221/327] Add reference isolate to assign command --- PopPUNK/web.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/web.py b/PopPUNK/web.py index f7939d1c..3a45967c 100644 --- a/PopPUNK/web.py +++ b/PopPUNK/web.py @@ -75,6 +75,7 @@ def sketchAssign(): args.assign.graph_weights, args.assign.max_a_dist, args.assign.max_pi_dist, + args.assign.reference_isolate, args.assign.model_dir, args.assign.strand_preserved, args.assign.previous_clustering, From 89f97ce6ab183ddc9f08dff120757787f73d49f9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 06:12:18 +0000 Subject: [PATCH 222/327] Changes to command line phrasing Co-authored-by: John Lees --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 90f42aaf..5d597923 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -96,7 +96,7 @@ def get_options(): default = 0.5, type = float) qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', default = 0.5, type = float) - qcGroup.add_argument('--reference-isolate', help='Isolate from which distances can be calculated for pruning [default = None]', + qcGroup.add_argument('--reference-isolate', help='Isolate from which distances will be calculated for pruning [default = None]', default = None, type = str) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = 5, type = int) From 059d804a1b02846eae87787ca24c5087503e0e40 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 06:16:14 +0000 Subject: [PATCH 223/327] Change GPU library loading --- PopPUNK/sparse_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index adfce4d2..5678e60d 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -71,7 +71,7 @@ def main(): import graph_tool.all as gt # load CUDA libraries - if use_gpu and not gpu_lib: + if args.gpu_graph and not gpu_lib: sys.stderr.write('Unable to load GPU libraries; exiting\n') sys.exit(1) From 7f110c8f6a1cb9a645929824c3ea4a96842dd655 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 06:31:28 +0000 Subject: [PATCH 224/327] Update web test --- test/test-web.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test-web.py b/test/test-web.py index dd0f7390..56f47bf5 100644 --- a/test/test-web.py +++ b/test/test-web.py @@ -38,12 +38,18 @@ args.assign.plot_fit, args.assign.graph_weights, args.assign.max_a_dist, + args.assign.max_pi_dist, + args.assign.reference_isolate, args.assign.model_dir, args.assign.strand_preserved, args.assign.previous_clustering, args.assign.external_clustering, args.assign.core_only, args.assign.accessory_only, + args.assign.gpu_sketch, + args.assign.gpu_dist, + args.assign.gpu_graph, + args.assign.deviceid, args.assign.web, sketch, args.assign.save_partial_query_graph) From a87c94db07b611be45b7b445d26899bdd282e0c0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 11:27:33 +0000 Subject: [PATCH 225/327] Change distance QC routine --- PopPUNK/__main__.py | 63 ++++++++++++++++-------------- PopPUNK/utils.py | 95 +++++++++++++++++++++++++++++++-------------- 2 files changed, 100 insertions(+), 58 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 5d597923..a350f5f5 
100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -230,7 +230,10 @@ def main(): 'length_sigma': args.length_sigma, 'length_range': args.length_range, 'prop_n': args.prop_n, - 'upper_n': args.upper_n + 'upper_n': args.upper_n, + 'max_pi_dist': args.max_pi_dist, + 'max_a_dist': args.max_a_dist, + 'reference_isolate': args.reference_isolate } # Dict of DB access functions @@ -285,7 +288,7 @@ def main(): sys.stderr.write("--create-db requires --r-files and --output") sys.exit(1) - # generate sketches and QC sequences + # generate sketches and QC sequences to identify sequences not matching specified criteria createDatabaseDir(args.output, kmers) seq_names_passing = \ constructDatabase( @@ -298,6 +301,7 @@ def main(): codon_phased = args.codon_phased, calc_random = True) + # calculate distances between sequences distMat = queryDatabase(rNames = seq_names_passing, qNames = seq_names_passing, dbPrefix = args.output, @@ -306,35 +310,36 @@ def main(): self = True, number_plot_fits = args.plot_fit, threads = args.threads) - names_to_remove = qcDistMat(distMat, + + # QC pairwise distances to identify long distances indicative of anomalous sequences in the collection + seq_names_passing, distMat = qcDistMat(distMat, seq_names_passing, seq_names_passing, - args.max_pi_dist, - args.max_a_dist, - args.reference_isolate) - - # prune based on distance from reference if provided - if args.reference_isolate is not None and len(names_to_remove) > 0 and args.qc_filter == "prune": - # Remove sketches - db_name = args.output + '/' + os.path.basename(args.output) + '.h5' - filtered_db_name = args.output + '/' + 'filtered.' + os.path.basename(args.output) + '.h5' - removeFromDB(db_name, - filtered_db_name, - names_to_remove, - full_names = True) - os.rename(filtered_db_name, db_name) - # Remove from distance matrix - prune_distance_matrix(seq_names_passing, - names_to_remove, - distMat, - args.output + "/" + os.path.basename(args.output) + ".dists") - # Remove from reflist - seq_names_passing = [seq_names_passing.remove(x) for x in names_to_remove] - sys.stderr.write("Successfully removed from the database: " + str(names_to_remove)) - else: - # Save results - dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" - storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) + args.output, + qc_dict) + +# # prune based on distance from reference if provided +# if args.reference_isolate is not None and len(names_to_remove) > 0 and args.qc_filter == "prune": +# # Remove sketches +# db_name = args.output + '/' + os.path.basename(args.output) + '.h5' +# filtered_db_name = args.output + '/' + 'filtered.' 
+ os.path.basename(args.output) + '.h5' +# removeFromDB(db_name, +# filtered_db_name, +# names_to_remove, +# full_names = True) +# os.rename(filtered_db_name, db_name) +# # Remove from distance matrix +# prune_distance_matrix(seq_names_passing, +# names_to_remove, +# distMat, +# args.output + "/" + os.path.basename(args.output) + ".dists") +# # Remove from reflist +# seq_names_passing = [seq_names_passing.remove(x) for x in names_to_remove] +# sys.stderr.write("Successfully removed from the database: " + str(names_to_remove)) +# else: +# # Save results +# dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" +# storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) # Plot results plot_scatter(distMat, diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 2beba400..5f82d4e4 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -226,9 +226,8 @@ def listDistInts(refSeqs, querySeqs, self=True): return comparisons -def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): - """Checks distance matrix for outliers. At the moment - just a threshold for accessory distance +def qcDistMat(distMat, refList, queryList, prefix, qc_dict): + """Checks distance matrix for outliers. Args: distMat (np.array) @@ -237,38 +236,76 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): Reference labels queryList (list) Query labels (or refList if self) - c_max (float) - Maximum core distance to allow - a_max (float) - Maximum accessory distance to allow - ref_isolate (str) - Name of reference from which pruning can occur - + prefix (list) + Prefix for output files + qc_dict (dict) + Dict of QC options + Returns: - passed (bool) - False if any samples failed + seq_names_passing (list) + List of isolates passing QC distance filters + distMat ([n,2] numpy ndarray) + Filtered long form distance matrix """ - passed = True + + # avoid circular import + from .prune_db import prune_distance_matrix + from .sketchlib import removeFromDB + to_prune = [] # First check with numpy, which is quicker than iterating over everything - if np.any(distMat[:, 0] > c_max) or np.any(distMat[:, 1] > a_max): - passed = False - names = iterDistRows(refList, queryList, refList == queryList) - for i, (ref, query) in enumerate(names): - if distMat[i, 0] > c_max or distMat[i, 1] > a_max: - sys.stderr.write("WARNING: Outlier at c = " + str(distMat[i, 0]) + " a = " + str(distMat[i, 1]) + - " 1:" + ref + " 2:" + query + "\n") - if ref_isolate is not None: - if ref == ref_isolate: - to_prune.append(query) - elif query == ref_isolate: - to_prune.append(ref) - - if ref_isolate is None: - return passed + long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist() + if len(long_distance_rows) > 0: + names = list(iterDistRows(refList, queryList, refList == queryList)) + # Prune sequences based on reference sequence + if qc_dict['reference_isolate'] is not None: + for i in long_distance_rows: + if names[i][0] == qc_dict['reference_isolate']: + to_prune.append(names[i][1]) + elif names[i][1] == qc_dict['reference_isolate']: + to_prune.append(names[i][0]) + else: + anomalous_isolates = set() + for i in long_distance_rows: + anomalous_isolates.add(names[i][0]) + anomalous_isolates.add(names[i][1]) + to_prune = list(anomalous_isolates) + + # Create overall list of sequences + if refList == refList: + seq_names_passing = refList else: - return to_prune + seq_names_passing = refList + queryList + + # 
prune based on distance from reference if provided + if qc_dict['qc_filter'] == 'stop': + if len(to_prune) > 0: + sys.stderr.write('Outlier distances exceed QC thresholds; prune sequences or raise thresholds\n') + sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') + sys.exit(1) + elif qc_dict['qc_filter'] == 'prune' and len(to_prune) > 0: + if qc_dict['reference_isolate'] is None: + sys.stderr.write('Distances exceeded QC thresholds but no reference isolate supplied\n') + sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') + else: + # Remove sketches + db_name = prefix + '/' + os.path.basename(prefix) + '.h5' + filtered_db_name = prefix + '/' + 'filtered.' + os.path.basename(prefix) + '.h5' + removeFromDB(db_name, + filtered_db_name, + to_prune, + full_names = True) + os.rename(filtered_db_name, db_name) + # Remove from distance matrix + seq_names_passing, distMat = prune_distance_matrix(seq_names_passing, + to_prune, + distMat, + prefix + "/" + os.path.basename(prefix) + ".dists") + # Remove from reflist + sys.stderr.write('Successfully pruned from the database: ' + ';'.join(to_prune)) + + return seq_names_passing, distMat def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): From 46a9a9abbb02b2cad27e4664500336f9c51d796c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 13:45:37 +0000 Subject: [PATCH 226/327] Update distance QC functions --- PopPUNK/__main__.py | 28 +++------------------------- PopPUNK/assign.py | 11 ++++++++--- PopPUNK/utils.py | 2 +- PopPUNK/web.py | 1 + 4 files changed, 13 insertions(+), 29 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index a350f5f5..36dfc85a 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -318,29 +318,6 @@ def main(): args.output, qc_dict) -# # prune based on distance from reference if provided -# if args.reference_isolate is not None and len(names_to_remove) > 0 and args.qc_filter == "prune": -# # Remove sketches -# db_name = args.output + '/' + os.path.basename(args.output) + '.h5' -# filtered_db_name = args.output + '/' + 'filtered.' 
+ os.path.basename(args.output) + '.h5' -# removeFromDB(db_name, -# filtered_db_name, -# names_to_remove, -# full_names = True) -# os.rename(filtered_db_name, db_name) -# # Remove from distance matrix -# prune_distance_matrix(seq_names_passing, -# names_to_remove, -# distMat, -# args.output + "/" + os.path.basename(args.output) + ".dists") -# # Remove from reflist -# seq_names_passing = [seq_names_passing.remove(x) for x in names_to_remove] -# sys.stderr.write("Successfully removed from the database: " + str(names_to_remove)) -# else: -# # Save results -# dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" -# storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) - # Plot results plot_scatter(distMat, args.output + "/" + os.path.basename(args.output) + "_distanceDistribution", @@ -390,8 +367,9 @@ def main(): # Load the distances refList, queryList, self, distMat = readPickle(distances, enforce_self=True) - if qcDistMat(distMat, refList, queryList, args.max_pi_dist, args.max_a_dist) == False \ - and args.qc_filter == "stop": + seq_names = set(set(refList) | set(queryList)) + seq_names_passing, distMat = qcDistMat(distMat, refList, queryList, args.output, qc_dict) + if length(set(seq_names_passing).difference(seq_names)) > 0 and args.qc_filter == "stop": sys.stderr.write("Distances failed quality control (change QC options to run anyway)\n") sys.exit(1) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index b1361aab..6ad90613 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -28,6 +28,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, + qc_dict, update_db, write_references, distances, @@ -159,7 +160,7 @@ def assign_query(dbFuncs, threads = threads, use_gpu = gpu_dist) # QC distance matrix - qcPass = qcDistMat(qrDistMat, rNames, qNames, max_pi_dist, max_a_dist, reference_isolate) + seq_names_passing, distMat = qcDistMat(qrDistMat, rNames, qNames, output, qc_dict) # Load the network based on supplied options genomeNetwork, old_cluster_file = \ @@ -443,7 +444,7 @@ def main(): # Dict of QC options for passing to database construction and querying functions if args.length_sigma is None and None in args.length_range and args.prop_n is None \ - and args.upper_n is None: + and args.upper_n is None and args.max_a_dist is None and args.max_pi_dist is None: qc_dict = {'run_qc': False } else: # define defaults if one QC parameter given @@ -468,7 +469,10 @@ def main(): 'length_sigma': length_sigma, 'length_range': args.length_range, 'prop_n': prop_n, - 'upper_n': args.upper_n + 'upper_n': args.upper_n, + 'max_pi_dist': args.max_pi_dist, + 'max_a_dist': args.max_a_dist, + 'reference_isolate': args.reference_isolate } # Dict of DB access functions for assign_query (which is out of scope) @@ -497,6 +501,7 @@ def main(): args.db, args.query, args.output, + qc_dict, args.update_db, args.write_references, distances, diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 5f82d4e4..079e5fa0 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -303,7 +303,7 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): distMat, prefix + "/" + os.path.basename(prefix) + ".dists") # Remove from reflist - sys.stderr.write('Successfully pruned from the database: ' + ';'.join(to_prune)) + sys.stderr.write('Pruned from the database after failing distance QC: ' + ';'.join(to_prune)) return seq_names_passing, distMat diff --git a/PopPUNK/web.py b/PopPUNK/web.py index 3a45967c..a8ed3a7e 100644 --- a/PopPUNK/web.py +++ b/PopPUNK/web.py @@ -66,6 +66,7 @@ def 
sketchAssign(): args.assign.ref_db, args.assign.q_files, outdir, + qc_dict, args.assign.update_db, args.assign.write_references, args.assign.distances, From 40a6e4f49b956c7c5b2ea4a944ded9d0a045be85 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 14:33:04 +0000 Subject: [PATCH 227/327] Select reference isolate where not supplied --- PopPUNK/sketchlib.py | 40 ++++++++++++++++++++++++++++++++++++++++ PopPUNK/utils.py | 36 ++++++++++++++++++------------------ 2 files changed, 58 insertions(+), 18 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 60f42a54..d00f75f4 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -572,6 +572,46 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num return distMat +def pickReferenceIsolate(prefix, names): + """Selects a reference isolate as that with a minimal proportion + of missing data. + + Args: + prefix (str) + Prefix of output files + names (list) + Names of samples to QC + + Returns: + reference_isolate (str) + Name of isolate selected as reference + """ + # open databases + db_name = prefix + '/' + os.path.basename(prefix) + '.h5' + hdf_in = h5py.File(db_name, 'r+') + + min_prop_n = 1.0 + reference_isolate = None + + try: + # process data structures + read_grp = hdf_in['sketches'] + # iterate through sketches + for dataset in read_grp: + if hdf_in['sketches'][dataset].attrs['missing_bases']/hdf_in['sketches'][dataset].attrs['length'] < min_prop_n: + min_prop_n = hdf_in['sketches'][dataset].attrs['missing_bases']/hdf_in['sketches'][dataset].attrs['length'] + reference_isolate = dataset + if min_prop_n == 0.0: + break + # if failure still close files to avoid corruption + except: + hdf_in.close() + sys.stderr.write('Problem processing h5 databases during QC - aborting\n') + print("Unexpected error:", sys.exc_info()[0], file = sys.stderr) + raise + + return reference_isolate + def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads): """Calculates random match probability based on means of genomes in assemblyList, and looks for length outliers. 
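The selection in pickReferenceIsolate above amounts to an argmin over the proportion of missing bases per sketch, with an early exit once a sketch with no missing bases is found. A minimal pure-Python sketch of the same logic, with toy values standing in for the 'missing_bases' and 'length' attributes read from the .h5 database:

sketch_stats = {'sample1': (120, 2000000),   # toy (missing_bases, length) pairs
                'sample2': (0, 2100000),
                'sample3': (60, 1900000)}
min_prop_n = 1.0
reference_isolate = None
for name, (missing_bases, length) in sketch_stats.items():
    prop_n = missing_bases / length
    if prop_n < min_prop_n:
        min_prop_n = prop_n
        reference_isolate = name
    if min_prop_n == 0.0:   # no missing bases, cannot do better
        break
# reference_isolate is 'sample2' for this toy input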
diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 079e5fa0..4f4c0dab 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -251,32 +251,32 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): # avoid circular import from .prune_db import prune_distance_matrix from .sketchlib import removeFromDB + from .sketchlib import pickReferenceIsolate + # Create overall list of sequences + if refList == refList: + seq_names_passing = refList + else: + seq_names_passing = refList + queryList + + # Sequences to remove to_prune = [] + # Pick reference isolate if not supplied + if qc_dict['reference_isolate'] is None: + qc_dict['reference_isolate'] = pickReferenceIsolate(prefix, seq_names_passing) + sys.stderr.write('Selected reference isolate is ' + qc_dict['reference_isolate'] + '\n') + # First check with numpy, which is quicker than iterating over everything long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist() if len(long_distance_rows) > 0: names = list(iterDistRows(refList, queryList, refList == queryList)) # Prune sequences based on reference sequence - if qc_dict['reference_isolate'] is not None: - for i in long_distance_rows: - if names[i][0] == qc_dict['reference_isolate']: - to_prune.append(names[i][1]) - elif names[i][1] == qc_dict['reference_isolate']: - to_prune.append(names[i][0]) - else: - anomalous_isolates = set() - for i in long_distance_rows: - anomalous_isolates.add(names[i][0]) - anomalous_isolates.add(names[i][1]) - to_prune = list(anomalous_isolates) - - # Create overall list of sequences - if refList == refList: - seq_names_passing = refList - else: - seq_names_passing = refList + queryList + for i in long_distance_rows: + if names[i][0] == qc_dict['reference_isolate']: + to_prune.append(names[i][1]) + elif names[i][1] == qc_dict['reference_isolate']: + to_prune.append(names[i][0]) # prune based on distance from reference if provided if qc_dict['qc_filter'] == 'stop': From 845acb083d408bce4da270c13d0a94cb3b05e3f2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 14:36:10 +0000 Subject: [PATCH 228/327] Change missing nodes to error --- PopPUNK/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 36dfc85a..9b892029 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -491,8 +491,9 @@ def main(): networkMissing = set(set(range(len(refList))).difference(vertex_list)) if len(networkMissing) > 0: missing_isolates = [refList[m] for m in networkMissing] - sys.stderr.write("WARNING: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") + sys.stderr.write("ERROR: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") sys.stderr.write("These correspond to indices " + ", ".join(map(str,networkMissing)) + "\n") + sys.exit(1) fit_type = model.type isolateClustering = {fit_type: printClusters(genomeNetwork, From bb2f9e11ded1faf0cf763ba37725195b7e9b75c1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 14:50:10 +0000 Subject: [PATCH 229/327] Use function for checking network vertex count --- PopPUNK/__main__.py | 9 ++------- PopPUNK/network.py | 23 +++++++++++++++++++---- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 9b892029..f2426db5 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -197,6 +197,7 @@ def main(): from .network import printClusters from 
.network import get_vertex_list from .network import save_network + from .network import checkNetworkVertexCount from .plot import writeClusterCsv from .plot import plot_scatter @@ -487,13 +488,7 @@ def main(): genomeNetwork = indivNetworks[min(rank_list)] # Ensure all in dists are in final network - vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph)) - networkMissing = set(set(range(len(refList))).difference(vertex_list)) - if len(networkMissing) > 0: - missing_isolates = [refList[m] for m in networkMissing] - sys.stderr.write("ERROR: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") - sys.stderr.write("These correspond to indices " + ", ".join(map(str,networkMissing)) + "\n") - sys.exit(1) + checkNetworkVertexCount(refList, genomeNetwork, use_gpu) fit_type = model.type isolateClustering = {fit_type: printClusters(genomeNetwork, diff --git a/PopPUNK/network.py b/PopPUNK/network.py index cda818fb..e733d64b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -120,12 +120,27 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # Ensure all in dists are in final network + checkNetworkVertexCount(refList, genomeNetwork, use_gpu) + + return genomeNetwork, cluster_file + +def checkNetworkVertexCount(seq_list, G, use_gpu): + """Checks the number of network vertices matches the number + of sequence names. + + Args: + seq_list (list) + The list of sequence names + G (graph) + The network of sequences + use_gpu (bool) + Whether to use cugraph for graph analyses + """ vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = use_gpu)) - networkMissing = set(set(range(len(refList))).difference(vertex_list)) + networkMissing = set(set(range(len(seq_list))).difference(vertex_list)) if len(networkMissing) > 0: - sys.stderr.write("WARNING: Samples " + ",".join(networkMissing) + " are missing from the final network\n") - - return (genomeNetwork, cluster_file) + sys.stderr.write("ERROR: Samples " + ",".join(networkMissing) + " are missing from the final network\n") + sys.exit(1) def getCliqueRefs(G, reference_indices = set()): """Recursively prune a network of its cliques. 
Returns one vertex from From dbcb4f62a956b3ce73b560964a8cef4cce90104e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 14:56:57 +0000 Subject: [PATCH 230/327] Tidy up obsolete text --- PopPUNK/network.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e733d64b..64f2a170 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -230,7 +230,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u reference_index_df = component_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() - # Order found references as in mash sketch files + # Order found references as in sketchlib database reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) @@ -249,7 +249,6 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Construct graph G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df) - return reference_indices, reference_names, refFileName, G_ref else: @@ -329,7 +328,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Order found references as in mash sketch files reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) - return reference_indices, reference_names, refFileName, G_ref + return reference_indices, reference_names, refFileName, G_ref def writeReferences(refList, outPrefix): """Writes chosen references to file From f61ccd01286cc0ab8cd509326ac0ab62f487a8bc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 19:14:47 +0000 Subject: [PATCH 231/327] Add self_loop function --- PopPUNK/network.py | 55 ++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 64f2a170..13d3d6b2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -136,7 +136,7 @@ def checkNetworkVertexCount(seq_list, G, use_gpu): use_gpu (bool) Whether to use cugraph for graph analyses """ - vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = use_gpu)) + vertex_list = set(get_vertex_list(G, use_gpu = use_gpu)) networkMissing = set(set(range(len(seq_list))).difference(vertex_list)) if len(networkMissing) > 0: sys.stderr.write("ERROR: Samples " + ",".join(networkMissing) + " are missing from the final network\n") @@ -239,16 +239,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df.columns = ['source','destination'] G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] # Add self-loop if needed - max_in_df = np.amax([G_df['source'].max(), G_df['destination'].max()]) max_in_vertex_labels = len(reference_names) - 1 - if max_in_df.item() != max_in_vertex_labels: - G_self_loop = cudf.DataFrame() - G_self_loop['source'] = [max_in_vertex_labels] - G_self_loop['destination'] = [max_in_vertex_labels] - G_ref_df = cudf.concat([G_ref_df, G_self_loop], ignore_index = True) - # Construct graph - G_ref = cugraph.Graph() - G_ref.from_cudf_edgelist(G_ref_df) + G_ref = add_self_loop(G_ref_df,max_in_vertex_labels) else: @@ -781,20 +773,13 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, G_current_df.columns = ['source','destination'] G_extra_df = cudf.DataFrame(new_edges, columns =['source','destination']) G_df = 
cudf.concat([G_current_df,G_extra_df], ignore_index = True) - G = cugraph.Graph() - G.from_cudf_edgelist(G_df) # use self-loop to ensure all nodes are present - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = ref_count + len(qList) - 1 - if max_in_df.item() != max_in_vertex_labels: - G_self_loop = cudf.DataFrame() - G_self_loop['source'] = [max_in_vertex_labels] - G_self_loop['destination'] = [max_in_vertex_labels] - G = cudf.concat([G,G_self_loop], ignore_index = True) - # Construct graph - G = cugraph.Graph() - G.from_cudf_edgelist(G_df) + include_weights = False + if weights is not None: + include_weights = True + G = add_self_loop(G_df, max_in_vertex_labels, weights = include_weights) else: G.add_vertex(len(qList)) @@ -812,6 +797,34 @@ return G, qqDistMat +def add_self_loop(G_df, seq_num, weights = False): + """Adds self-loop to cugraph graph to ensure all nodes are included in + the graph, even if singletons. + + Args: + G_df (cudf) + cudf data frame containing edge list + seq_num (int) + The maximum vertex label expected in the graph (the number of nodes - 1) + + Returns: + G_new (graph) + Graph with self-loop added to ensure all nodes are included + """ + # use self-loop to ensure all nodes are present + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + if max_in_df.item() != seq_num: + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = [seq_num] + G_self_loop['destination'] = [seq_num] + if weights: + G_self_loop['weight'] = 0.0 + G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) + # Construct graph + G_new = cugraph.Graph() + G_new.from_cudf_edgelist(G_df) + return G_new + def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, externalClusterCSV = None, printRef = True, printCSV = True, clustering_type = 'combined', use_gpu = False): From 55507b4b2b6af3f6bffe7bc7bec3262646918311 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 19:19:01 +0000 Subject: [PATCH 232/327] Remove condition on adding edges --- PopPUNK/network.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 13d3d6b2..4ca21ecf 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -478,15 +478,13 @@ def constructNetwork(rlist, qlist, assignments, within_label, weights = True) for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights): edge_tuple = (ref, query, weight) - if ref < query: - connections.append(edge_tuple) + connections.append(edge_tuple) else: extra_sources, extra_targets = load_previous_network(prev_G,rlist, weights = False) for (ref, query) in zip(extra_sources, extra_targets): edge_tuple = (ref, query) - if ref < query: - connections.append(edge_tuple) + connections.append(edge_tuple) # load GPU libraries if necessary if use_gpu: From acbeeea105fe4710bd849b6a8decf1075db66de0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 20:32:44 +0000 Subject: [PATCH 233/327] Add copy function for models --- PopPUNK/assign.py | 3 +-- PopPUNK/models.py | 6 ++++++ PopPUNK/utils.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 6ad90613..6542fc7d 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -288,8 +288,7 @@ def assign_query(dbFuncs, # Copy model if needed if output != model.outPrefix: - model.outPrefix = output - model.save() + model.copy(output) # Clique pruning if model.type != 
'lineage': diff --git a/PopPUNK/models.py b/PopPUNK/models.py index ac46e40e..502575ff 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -261,6 +261,12 @@ def no_scale(self): is done in the scaled space). ''' self.scale = np.array([1, 1], dtype = self.default_dtype) + + def copy(self, prefix): + """Copy the model to a new directory + """ + self.outPrefix = prefix + self.save() class BGMMFit(ClusterFit): diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 4f4c0dab..91a04015 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -303,7 +303,7 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): distMat, prefix + "/" + os.path.basename(prefix) + ".dists") # Remove from reflist - sys.stderr.write('Pruned from the database after failing distance QC: ' + ';'.join(to_prune)) + sys.stderr.write('Pruned from the database after failing distance QC: ' + ';'.join(to_prune) + '\n') return seq_names_passing, distMat From 90144c0103856e4a30d02b575280a204c355f655 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 21:03:44 +0000 Subject: [PATCH 234/327] Fix file and cluster name processing --- PopPUNK/visualise.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 0bc7d8f1..39c7bb35 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -359,9 +359,17 @@ def generate_visualisations(query_db, if not overwrite: existing_tree = load_tree(output, "MST", distances=mst_distances) if existing_tree is None: - # Get a default clustering if none provided - if display_cluster is None: - display_cluster = list(isolateClustering.keys())[0] + # Check selecting clustering type is in CSV + clustering_name = 'Cluster' + if display_cluster != None: + if display_cluster not in isolateClustering.keys(): + clustering_name = list(isolateClustering.keys())[0] + sys.stderr.write('Unable to find clustering column ' + display_cluster + ' in file ' + + prev_clustering + '; instead using ' + clustering_name + '\n') + else: + clustering_name = display_cluster + else: + clustering_name = list(isolateClustering.keys())[0] # Get distance matrix complete_distMat = \ np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), @@ -376,7 +384,7 @@ def generate_visualisations(query_db, weights_type=mst_distances, summarise=False) mst_graph = generate_minimum_spanning_tree(G) - drawMST(mst_graph, output, isolateClustering, display_cluster, overwrite) + drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite) mst_tree = mst_to_phylogeny(mst_graph, isolateNameToLabel(combined_seq)) else: mst_tree = existing_tree @@ -435,7 +443,7 @@ def generate_visualisations(query_db, if cytoscape: sys.stderr.write("Writing cytoscape output\n") - genomeNetwork, cluster_file = fetchNetwork(os.path.dirname(prev_clustering), + genomeNetwork, cluster_file = fetchNetwork(os.path.dirname(graph_dir), model, rlist, False,
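The display-cluster handling added in the patch above is, at heart, a guarded dictionary lookup with a fallback to the first available clustering scheme. A compact sketch of that decision with illustrative inputs (pick_clustering_column is a hypothetical helper, not part of the patch):

    def pick_clustering_column(isolate_clustering, display_cluster=None):
        # use the requested column if present, otherwise the first scheme
        if display_cluster is not None and display_cluster in isolate_clustering:
            return display_cluster
        return next(iter(isolate_clustering))

    clusterings = {"Cluster": {}, "Rank_50_Lineage": {}}
    pick_clustering_column(clusterings, "Rank_5_Lineage")   # -> "Cluster"
    pick_clustering_column(clusterings, "Rank_50_Lineage")  # -> "Rank_50_Lineage"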
From 1b404614a722385b41f14b7dc9e42f8e98a30814 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 21:45:07 +0000 Subject: [PATCH 235/327] Change network loading functions --- PopPUNK/network.py | 36 +++++++++++++++++++++++++++------- PopPUNK/visualise.py | 28 +++++++++------------------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4ca21ecf..c23352e6 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -104,9 +104,34 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, if core_only or accessory_only: sys.stderr.write("Can only do --core-only or --accessory-only fits from " "a refined fit. Using the combined distances.\n") + + # Load network file + genomeNetwork = load_network_file(network_file, use_gpu = use_gpu) + + # Ensure all in dists are in final network + checkNetworkVertexCount(refList, genomeNetwork, use_gpu) + return genomeNetwork, cluster_file + +def load_network_file(fn, use_gpu = False): + """Load the network based on input options + + Returns the network as a graph-tool graph, or as a cugraph + graph if use_gpu is set. + + Args: + fn (str) + Network file name + use_gpu (bool) + Use cugraph library to load graph + + Returns: + genomeNetwork (graph) + The loaded network + """ + # Load the network from the specified file if use_gpu: - G_df = cudf.read_csv(network_file, compression = 'gzip') + G_df = cudf.read_csv(fn, compression = 'gzip') genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: G_df.columns = ['source','destination','weights'] @@ -116,13 +141,10 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, genomeNetwork.from_cudf_edgelist(G_df,renumber=False) sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") else: - genomeNetwork = gt.load_graph(network_file) + genomeNetwork = gt.load_graph(fn) sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") - - # Ensure all in dists are in final network - checkNetworkVertexCount(refList, genomeNetwork, use_gpu) - - return genomeNetwork, cluster_file + + return genomeNetwork def checkNetworkVertexCount(seq_list, G, use_gpu): """Checks the number of network vertices matches the number diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 39c7bb35..3369177f 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -62,8 +62,8 @@ def get_options(): 'from poppunk_assign [default = use that in the directory ' 'of the query database]', type = str) - iGroup.add_argument('--use-network', - help='Specify a directory containing a .gt file to use for any graph visualisations', + iGroup.add_argument('--network-file', + help='Specify a file to use for any graph visualisations', type = str) iGroup.add_argument('--display-cluster', help='Column of clustering CSV to use for plotting', @@ -109,6 +109,7 @@ def get_options(): other = parser.add_argument_group('Other options') other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]') other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]') + other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating graphs [default = False]') other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') other.add_argument('--strand-preserved', default=False, action='store_true', help='If distances being calculated, treat strand as known when calculating random ' @@ -149,7 +150,8 @@ def generate_visualisations(query_db, model_dir, previous_clustering, previous_query_clustering, - use_network, + network_file, + gpu_graph, info_csv, rapidnj, tree, @@ -165,6 +167,7 @@ def generate_visualisations(query_db, from .network import constructNetwork from .network import fetchNetwork from .network import generate_minimum_spanning_tree + from .network import load_network_file from .plot import drawMST from .plot import outputsForMicroreact @@ -326,15 +329,6 @@ def 
generate_visualisations(query_db, mode = mode, return_dict = True) - # Set graph location - if use_network is not None: - graph_dir = use_network - if graph_dir != prev_clustering: - sys.stderr.write("WARNING: Loading graph from a different directory to clusters\n") - sys.stderr.write("WARNING: Ensure that they are consistent\n") - else: - graph_dir = prev_clustering - # Join clusters with query clusters if required if not self: if previous_query_clustering is not None: @@ -443,12 +437,7 @@ def generate_visualisations(query_db, if cytoscape: sys.stderr.write("Writing cytoscape output\n") - genomeNetwork, cluster_file = fetchNetwork(os.path.dirname(graph_dir), - model, - rlist, - False, - core_only, - accessory_only) + genomeNetwork = load_network_file(network_file, use_gpu = gpu_graph) outputsForCytoscape(genomeNetwork, mst_graph, isolateClustering, output, info_csv, viz_subset = viz_subset) if model.type == 'lineage': sys.stderr.write("Note: Only support for output of cytoscape graph at lowest rank\n") @@ -478,7 +467,8 @@ def main(): args.model_dir, args.previous_clustering, args.previous_query_clustering, - args.use_network, + args.network_file, + args.gpu_graph, args.info_csv, args.rapidnj, args.tree, From ae777f4ff1831417f563e7ca5dbc8cd4731a7193 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 06:43:54 +0000 Subject: [PATCH 236/327] Update import of old networks to use cugraph --- PopPUNK/network.py | 94 +++++++++++++++++++++++++++++++------------ PopPUNK/sparse_mst.py | 6 ++- 2 files changed, 73 insertions(+), 27 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c23352e6..19f195c4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -365,8 +365,9 @@ def writeReferences(refList, outPrefix): return refFileName -def load_previous_network(prev_G_fn, rlist, weights=False): - """Load previous network with graph-tool, extract the edges to match the +def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, + use_gpu = False): + """Load previous network, extract the edges to match the vertex order specified in rlist, and also return weights if specified. Args: @@ -374,9 +375,14 @@ def load_previous_network(prev_G_fn, rlist, weights=False): Path of file containing existing network. 
rlist (list) List of reference sequence labels in new network + previous_pkl (str) + Path of pkl file containing names of sequences in + previous network weights (bool) Whether to return edge weights (default = False) + use_gpu (bool) + Whether to use cugraph for graph analyses Returns: source_ids (list) @@ -387,20 +393,46 @@ def load_previous_network(prev_G_fn, rlist, weights=False): Weights for each new edge """ # get list for translating node IDs to rlist - prev_G = gt.load_graph(prev_G_fn) - old_ids = prev_G.vp["id"] + prev_G = load_network_file(prev_G_fn, use_gpu = use_gpu) + + # load list of names in previous network + if previous_pkl is not None: + with open(previous_pkl, 'rb') as pickle_file: + old_rlist, old_qlist, self = pickle.load(pickle_file) + if self: + old_ids = old_rlist + else: + old_ids = old_rlist + old_qlist + else: + sys.stderr.write('A pkl file containing the names of the sequences in the previous network is required\n') + sys.exit(1) + + # Get edges as lists of source,destination,weight using original IDs + if use_gpu: + G_df = prev_G.view_edge_list() + if weights: + G_df.columns = ['source','destination','weight'] + edge_weights = G_df['weight'].to_arrow().to_pylist() + else: + G_df.columns = ['source','destination'] + old_source_ids = G_df['source'].to_arrow().to_pylist() + old_target_ids = G_df['destination'].to_arrow().to_pylist() + else: + # get the source and target nodes + old_source_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source") + old_target_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target") + # get the weights + if weights: + edge_weights = list(prev_G.ep['weight']) + + # Update IDs to new versions old_id_indices = [rlist.index(x) for x in old_ids] - # get the source and target nods - source_old_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source") - target_old_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target") # translate to indices - source_ids = [old_id_indices[x] for x in source_old_ids] - target_ids = [old_id_indices[x] for x in target_old_ids] - # convert to ndarray - # get the weights + source_ids = [old_id_indices[x] for x in old_source_ids] + target_ids = [old_id_indices[x] for x in old_target_ids] + + # return values if weights: - edge_weights = list(prev_G.ep['weight']) - # return values return source_ids, target_ids, edge_weights else: return source_ids, target_ids
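The re-indexing step at the end of network_to_edges composes two lookups: an old vertex ID gives an old sequence name, and that name gives its position in the new rlist. A small worked example with made-up names (illustrative data only, not part of the patch):

    old_ids = ["B", "A"]        # names in old-network vertex order: vertex 0 is "B"
    rlist = ["A", "B", "C"]     # vertex order in the new network
    old_id_indices = [rlist.index(x) for x in old_ids]        # [1, 0]

    old_source_ids, old_target_ids = [0], [1]                 # old edge "B" -> "A"
    source_ids = [old_id_indices[x] for x in old_source_ids]  # [1], i.e. "B"
    target_ids = [old_id_indices[x] for x in old_target_ids]  # [0], i.e. "A"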
@@ -408,7 +440,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, def constructNetwork(rlist, qlist, assignments, within_label, summarise = True, edge_list = False, weights = None, weights_type = 'euclidean', sparse_input = None, - previous_network = None, use_gpu = False): + previous_network = None, previous_pkl = None, use_gpu = False): """Construct an unweighted, undirected network without self-loops. Nodes are samples and edges where samples are within the same cluster @@ -440,6 +472,8 @@ previous_network (str) Name of file containing a previous network to be integrated into this new network + previous_pkl (str) + Name of file containing the names of the sequences in the previous_network use_gpu (bool) Whether to use GPUs for network construction @@ -495,18 +529,28 @@ def constructNetwork(rlist, qlist, assignments, within_label, # read previous graph if previous_network is not None: - if weights is not None or sparse_input is not None: - extra_sources, extra_targets, extra_weights = load_previous_network(previous_network,rlist, - weights = True) - for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights): - edge_tuple = (ref, query, weight) - connections.append(edge_tuple) + if previous_pkl is not None: + if weights is not None or sparse_input is not None: + extra_sources, extra_targets, extra_weights = network_to_edges(previous_network, + rlist, + previous_pkl = previous_pkl, + weights = True, + use_gpu = use_gpu) + for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights): + edge_tuple = (ref, query, weight) + connections.append(edge_tuple) + else: + extra_sources, extra_targets = network_to_edges(previous_network, + rlist, + previous_pkl = previous_pkl, + weights = False, + use_gpu = use_gpu) + for (ref, query) in zip(extra_sources, extra_targets): + edge_tuple = (ref, query) + connections.append(edge_tuple) + else: + sys.stderr.write('A distance pkl corresponding to ' + previous_network + ' is required for loading\n') + sys.exit(1) # load GPU libraries if necessary if use_gpu: diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 5678e60d..24fc2bfa 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -106,9 +106,11 @@ def main(): # Load previous MST if specified if args.previous_mst is not None: print("Previous: " + str(args.previous_mst)) - extra_sources, extra_targets, extra_weights = load_previous_network(args.previous_mst, + extra_sources, extra_targets, extra_weights = network_to_edges(args.previous_mst, rlist, + previous_pkl = args.distance_pkl, weights = True, + use_gpu = use_gpu) sources = np.append(sparse_mat.row, np.asarray(extra_sources)) targets = np.append(sparse_mat.col, np.asarray(extra_targets)) weights = np.append(sparse_mat.data, np.asarray(extra_weights)) From 616c31e295b74edd159d42668ab142c66cb38234 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 08:40:14 +0000 Subject: [PATCH 237/327] Fix processing of distance matrix --- PopPUNK/__main__.py | 6 +++--- PopPUNK/sketchlib.py | 2 +- PopPUNK/utils.py | 14 ++++++++------ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index f2426db5..ce44f443 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -369,8 +369,8 @@ def main(): # Load the distances refList, queryList, self, distMat = readPickle(distances, enforce_self=True) seq_names = set(set(refList) | set(queryList)) - seq_names_passing, distMat = qcDistMat(distMat, refList, queryList, args.output, qc_dict) - if length(set(seq_names_passing).difference(seq_names)) > 0 and args.qc_filter == "stop": + seq_names_passing, distMat = qcDistMat(distMat, refList, 
queryList, output, qc_dict) + if len(set(seq_names_passing).difference(seq_names)) > 0 and args.qc_filter == "stop": sys.stderr.write("Distances failed quality control (change QC options to run anyway)\n") sys.exit(1) @@ -488,7 +488,7 @@ def main(): genomeNetwork = indivNetworks[min(rank_list)] # Ensure all in dists are in final network - checkNetworkVertexCount(refList, genomeNetwork, use_gpu) + checkNetworkVertexCount(refList, genomeNetwork, use_gpu = args.gpu_graph) fit_type = model.type isolateClustering = {fit_type: printClusters(genomeNetwork, diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index d00f75f4..528fc1d2 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -589,7 +589,7 @@ def pickReferenceIsolate(prefix, names): # open databases db_name = prefix + '/' + os.path.basename(prefix) + '.h5' hdf_in = h5py.File(db_name, 'r+') - + min_prop_n = 1.0 reference_isolate = None diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 91a04015..8739bdd2 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -236,7 +236,7 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): Reference labels queryList (list) Query labels (or refList if self) - prefix (list) + prefix (str) Prefix for output files qc_dict (dict) Dict of QC options @@ -279,15 +279,15 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): to_prune.append(names[i][0]) # prune based on distance from reference if provided - if qc_dict['qc_filter'] == 'stop': - if len(to_prune) > 0: - sys.stderr.write('Outlier distances exceed QC thresholds; prune sequences or raise thresholds\n') - sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') - sys.exit(1) + if qc_dict['qc_filter'] == 'stop' and len(to_prune) > 0: + sys.stderr.write('Outlier distances exceed QC thresholds; prune sequences or raise thresholds\n') + sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') + sys.exit(1) elif qc_dict['qc_filter'] == 'prune' and len(to_prune) > 0: if qc_dict['reference_isolate'] is None: sys.stderr.write('Distances exceeded QC thresholds but no reference isolate supplied\n') sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') + sys.exit(1) else: # Remove sketches db_name = prefix + '/' + os.path.basename(prefix) + '.h5' @@ -304,6 +304,8 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): prefix + "/" + os.path.basename(prefix) + ".dists") # Remove from reflist sys.stderr.write('Pruned from the database after failing distance QC: ' + ';'.join(to_prune) + '\n') + else: + storePickle(seq_names_passing, seq_names_passing, True, distMat, prefix + "/" + os.path.basename(prefix) + ".dists") return seq_names_passing, distMat From 03ab3f8b05847b0ea34ba9a78e069bba78f58f25 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 09:14:50 +0000 Subject: [PATCH 238/327] Avoid overwrite on qcDistMat --- PopPUNK/__main__.py | 11 ++++++----- PopPUNK/assign.py | 4 ++-- PopPUNK/utils.py | 10 ++++++---- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index ce44f443..fa6650f7 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -314,10 +314,11 @@ def main(): # QC pairwise distances to identify long distances indicative of anomalous sequences in the collection seq_names_passing, distMat = qcDistMat(distMat, - seq_names_passing, - seq_names_passing, - args.output, - qc_dict) + seq_names_passing, + seq_names_passing, + args.output, + 
args.output, + qc_dict) # Plot results plot_scatter(distMat, @@ -369,7 +370,7 @@ def main(): # Load the distances refList, queryList, self, distMat = readPickle(distances, enforce_self=True) seq_names = set(set(refList) | set(queryList)) - seq_names_passing, distMat = qcDistMat(distMat, refList, queryList, output, qc_dict) + seq_names_passing, distMat = qcDistMat(distMat, refList, queryList, args.ref_db, output, qc_dict) if len(set(seq_names_passing).difference(seq_names)) > 0 and args.qc_filter == "stop": sys.stderr.write("Distances failed quality control (change QC options to run anyway)\n") sys.exit(1) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 6542fc7d..3f9b05ee 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -160,7 +160,7 @@ def assign_query(dbFuncs, threads = threads, use_gpu = gpu_dist) # QC distance matrix - seq_names_passing, distMat = qcDistMat(qrDistMat, rNames, qNames, output, qc_dict) + seq_names_passing, distMat = qcDistMat(qrDistMat, rNames, qNames, ref_db, output, qc_dict) # Load the network based on supplied options genomeNetwork, old_cluster_file = \ @@ -249,7 +249,7 @@ def assign_query(dbFuncs, dists_out = output + "/" + os.path.basename(output) + ".dists" if update_db: # Check new sequences pass QC before adding them - if not qcPass: + if len(set(seq_names_passing).difference(rNames + qNames)) > 0: sys.stderr.write("Queries contained outlier distances, " "not updating database\n") else: diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 8739bdd2..c48ca280 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -226,7 +226,7 @@ def listDistInts(refSeqs, querySeqs, self=True): return comparisons -def qcDistMat(distMat, refList, queryList, prefix, qc_dict): +def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): """Checks distance matrix for outliers. Args: @@ -236,8 +236,10 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): Reference labels queryList (list) Query labels (or refList if self) + ref_db (str) + Prefix of reference database prefix (str) - Prefix for output files + Prefix of output files qc_dict (dict) Dict of QC options @@ -264,7 +266,7 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): # Pick reference isolate if not supplied if qc_dict['reference_isolate'] is None: - qc_dict['reference_isolate'] = pickReferenceIsolate(prefix, seq_names_passing) + qc_dict['reference_isolate'] = pickReferenceIsolate(ref_db, seq_names_passing) sys.stderr.write('Selected reference isolate is ' + qc_dict['reference_isolate'] + '\n') # First check with numpy, which is quicker than iterating over everything @@ -290,7 +292,7 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): sys.exit(1) else: # Remove sketches - db_name = prefix + '/' + os.path.basename(prefix) + '.h5' + db_name = ref_db + '/' + os.path.basename(ref_db) + '.h5' filtered_db_name = prefix + '/' + 'filtered.' 
+ os.path.basename(prefix) + '.h5' removeFromDB(db_name, filtered_db_name, From ef450c6a7b67832d8c45f159b13f169dc08ec4fd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:09:29 +0000 Subject: [PATCH 239/327] Start checking reference graph connectivity --- PopPUNK/network.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 19f195c4..1872d106 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -259,10 +259,14 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Extract reference edges G_df = G.view_edge_list() G_df.columns = ['source','destination'] - G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] + G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] # Add self-loop if needed max_in_vertex_labels = len(reference_names) - 1 - G_ref = add_self_loop(G_ref_df,max_in_vertex_labels) + G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) + + # Check on targets + reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + print("Reference component assignments: " + str(reference_component_assignments)) else: @@ -339,7 +343,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref = gt.GraphView(G, vfilt = reference_vertex) G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object - # Order found references as in mash sketch files + # Order found references as in sketch files reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) return reference_indices, reference_names, refFileName, G_ref @@ -424,7 +428,7 @@ def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, # get the weights if weights: edge_weights = list(prev_G.ep['weight']) - + # Update IDs to new versions old_id_indices = [rlist.index(x) for x in old_ids] # translate to indices @@ -861,7 +865,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, return G, qqDistMat -def add_self_loop(G_df, seq_num, weights = False): +def add_self_loop(G_df, seq_num, weights = False, renumber = True): """Adds self-loop to cugraph graph to ensure all nodes are included in the graph, even if singletons. 
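The self-loop device being generalised here works because the graph constructors only see vertices named in the edge list, so appending a single (v, v) edge at the highest expected label stops trailing singletons being dropped. A sketch of the same idea using pandas in place of cudf (an illustration of the logic, not the GPU code path; column names follow the patch):

    import pandas as pd

    def add_self_loop_df(G_df, max_label, weights=False):
        # append (max_label, max_label) if the edge list never reaches that label
        if max(G_df["source"].max(), G_df["destination"].max()) != max_label:
            loop = {"source": [max_label], "destination": [max_label]}
            if weights:
                loop["weight"] = [0.0]
            G_df = pd.concat([G_df, pd.DataFrame(loop)], ignore_index=True)
        return G_df

    edges = pd.DataFrame({"source": [0, 1], "destination": [1, 2]})
    edges = add_self_loop_df(edges, 4)  # vertex 4 would otherwise be missing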
@@ -870,6 +874,8 @@ def add_self_loop(G_df, seq_num, weights = False): cudf data frame containing edge list seq_num (int) The expected number of nodes in the graph + renumber (bool) + Whether to renumber the vertices when added to the graph Returns: G_new (graph) @@ -886,7 +892,7 @@ def add_self_loop(G_df, seq_num, weights = False): G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) # Construct graph G_new = cugraph.Graph() - G_new.from_cudf_edgelist(G_df) + G_new.from_cudf_edgelist(G_df, renumber = renumber) return G_new def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, From 08f58ff1088c813ce76407a00c04719b5f67b44e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:26:16 +0000 Subject: [PATCH 240/327] Enable qcDistMat to create output directory --- PopPUNK/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index c48ca280..41ef484a 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -264,6 +264,14 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): # Sequences to remove to_prune = [] + # Create output directory if it does not exist already + if not os.path.isdir(prefix): + try: + os.makedirs(prefix) + except OSError: + sys.stderr.write("Cannot create output directory " + prefix + "\n") + sys.exit(1) + # Pick reference isolate if not supplied if qc_dict['reference_isolate'] is None: qc_dict['reference_isolate'] = pickReferenceIsolate(ref_db, seq_names_passing) From 6094b2bcfbf4a6eba50601142df8bad9951cae77 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:33:14 +0000 Subject: [PATCH 241/327] Change vertex count error message --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1872d106..e01f8779 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -161,7 +161,7 @@ def checkNetworkVertexCount(seq_list, G, use_gpu): vertex_list = set(get_vertex_list(G, use_gpu = use_gpu)) networkMissing = set(set(range(len(seq_list))).difference(vertex_list)) if len(networkMissing) > 0: - sys.stderr.write("ERROR: Samples " + ",".join(networkMissing) + " are missing from the final network\n") + sys.stderr.write("ERROR: Samples " + ",".join(map(str,networkMissing)) + " are missing from the final network\n") sys.exit(1) def getCliqueRefs(G, reference_indices = set()): From 93c872d6acfc2134541e5c46c701570229310fe9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:41:40 +0000 Subject: [PATCH 242/327] Change column naming in cugraph --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e01f8779..52b6899b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -258,7 +258,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Extract reference edges G_df = G.view_edge_list() - G_df.columns = ['source','destination'] + if 'src' in G_df.columns: + G_df.rename(columns={'src': 'source','dst': 'destination'}) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] # Add self-loop if needed max_in_vertex_labels = len(reference_names) - 1 From 0f77ade99089feaa20449fada801943728224019 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:43:11 +0000 Subject: [PATCH 243/327] Get column names --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py 
b/PopPUNK/network.py index 52b6899b..7bcb662c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -258,6 +258,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Extract reference edges G_df = G.view_edge_list() + print('G_df original structure: ' + str(G_df)) if 'src' in G_df.columns: G_df.rename(columns={'src': 'source','dst': 'destination'}) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] From 90aee39123b485bf15df3d434079297e28c423cc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:44:37 +0000 Subject: [PATCH 244/327] Rename in place --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7bcb662c..13750745 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -260,7 +260,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df = G.view_edge_list() print('G_df original structure: ' + str(G_df)) if 'src' in G_df.columns: - G_df.rename(columns={'src': 'source','dst': 'destination'}) + G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] # Add self-loop if needed max_in_vertex_labels = len(reference_names) - 1 From b0375939c6e1c76046a6f11a6ca71aa193ece5ff Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:48:46 +0000 Subject: [PATCH 245/327] Compare cudfs --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 13750745..0d1ca25d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -258,7 +258,6 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Extract reference edges G_df = G.view_edge_list() - print('G_df original structure: ' + str(G_df)) if 'src' in G_df.columns: G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] @@ -269,6 +268,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Check on targets reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) print("Reference component assignments: " + str(reference_component_assignments)) + print("Component assignments: " + str(component_assignments)) else: From ce8fccbf3b4bf0e3dde7f92bdbcb346d0c69fc43 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:55:09 +0000 Subject: [PATCH 246/327] View reference cudf --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0d1ca25d..c427b4fe 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -267,6 +267,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Check on targets reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + print("Reference df: " + str(G_ref_df)) print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) From fe24179a5468215de518dffd03463a2637d54245 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:56:42 +0000 Subject: [PATCH 247/327] View overall cudf --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/PopPUNK/network.py b/PopPUNK/network.py index c427b4fe..d16e9d6e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -267,6 +267,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Check on targets reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + print("Reference indices: " + str(reference_indices)) + print("Overall cudf: " + str(G_df)) print("Reference df: " + str(G_ref_df)) print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) From b8d03b2aae9b8330bfb89da680d11c556d8deb25 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 11:47:28 +0000 Subject: [PATCH 248/327] Concat cudf --- PopPUNK/network.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d16e9d6e..44b1280e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -246,10 +246,12 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} + # Record the original components to which sequences belonged + component_assignments = cugraph.components.connectivity.connected_components(G) # Leiden method has resolution parameter - higher values give greater precision - component_assignments, score = cugraph.leiden(G, resolution = 1.0) + partition_assignments, score = cugraph.leiden(G, resolution = 1.0) # group by partition, which becomes the first column, so retrieve second column - reference_index_df = component_assignments.groupby('partition').nth(0) + reference_index_df = partition_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() # Order found references as in sketchlib database @@ -262,11 +264,14 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] # Add self-loop if needed - max_in_vertex_labels = len(reference_names) - 1 + max_in_vertex_labels = max(reference_indices) G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) # Check on targets reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + combined_vertex_assignments = cudf.concat([reference_component_assignments,component_assignments], + axis = 1, + join = 'inner') print("Reference indices: " + str(reference_indices)) print("Overall cudf: " + str(G_df)) print("Reference df: " + str(G_ref_df)) From a8e7e33673fdcb40bc0c692dc59a256c473b972b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 11:54:35 +0000 Subject: [PATCH 249/327] Merge cudf --- PopPUNK/network.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 44b1280e..afb539af 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -263,20 +263,22 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if 'src' in G_df.columns: G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] + G_ref_df.rename(columns={'labels': 'ref_labels'}) # Add self-loop if needed max_in_vertex_labels = 
max(reference_indices) G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) # Check on targets reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) - combined_vertex_assignments = cudf.concat([reference_component_assignments,component_assignments], - axis = 1, - join = 'inner') + combined_vertex_assignments = reference_component_assignments.merge(component_assignments, + on = 'vertex', + how = 'left') print("Reference indices: " + str(reference_indices)) print("Overall cudf: " + str(G_df)) print("Reference df: " + str(G_ref_df)) print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) + print("Combined assignments: " + str(combined_vertex_assignments)) else: From 86b8d85bbd0914c8eb3782296862d010eabdaa23 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 11:57:57 +0000 Subject: [PATCH 250/327] Filter merged cudf --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index afb539af..29cad4e5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -263,7 +263,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if 'src' in G_df.columns: G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - G_ref_df.rename(columns={'labels': 'ref_labels'}) + G_ref_df.rename(columns={'labels': 'ref_labels'}, inplace=True) # Add self-loop if needed max_in_vertex_labels = max(reference_indices) G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) @@ -273,6 +273,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u combined_vertex_assignments = reference_component_assignments.merge(component_assignments, on = 'vertex', how = 'left') + combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] print("Reference indices: " + str(reference_indices)) print("Overall cudf: " + str(G_df)) print("Reference df: " + str(G_ref_df)) From c34a51f187728294c309b1da090c09c521329f14 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 12:09:58 +0000 Subject: [PATCH 251/327] Summarise merged cudf --- PopPUNK/network.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 29cad4e5..8a78f82a 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -269,17 +269,23 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) # Check on targets - reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) - combined_vertex_assignments = reference_component_assignments.merge(component_assignments, - on = 'vertex', - how = 'left') - combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Reference indices: " + str(reference_indices)) - print("Overall cudf: " + str(G_df)) - print("Reference df: " + str(G_ref_df)) - print("Reference component assignments: " + str(reference_component_assignments)) - print("Component assignments: " + str(component_assignments)) - print("Combined assignments: " + str(combined_vertex_assignments)) + 
partition_match = False + while partition_match: + reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + combined_vertex_assignments = reference_component_assignments.merge(component_assignments, + on = 'vertex', + how = 'left') + combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] + combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels','ref_labels'])['ref_labels'].count() + max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() + print('max is ' + str(max_ref_comp_count)) + print("Reference indices: " + str(reference_indices)) + print("Overall cudf: " + str(G_df)) + print("Reference df: " + str(G_ref_df)) + print("Reference component assignments: " + str(reference_component_assignments)) + print("Component assignments: " + str(component_assignments)) + print("Combined assignments: " + str(combined_vertex_assignments)) + partition_match = True else: From b4d1fbd3e258dd682e76608a1ed0c0921a2b5b17 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 12:10:51 +0000 Subject: [PATCH 252/327] Change bool in while loop --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 8a78f82a..dcdc5095 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -269,7 +269,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) # Check on targets - partition_match = False + partition_mismatch = True while partition_match: reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) combined_vertex_assignments = reference_component_assignments.merge(component_assignments, @@ -285,7 +285,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) print("Combined assignments: " + str(combined_vertex_assignments)) - partition_match = True + partition_match = False else: From 8c93ac3f84156ed1af62dc68a8e4691351976c39 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 12:29:19 +0000 Subject: [PATCH 253/327] Rename bool variable --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index dcdc5095..9fd3c303 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -270,7 +270,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Check on targets partition_mismatch = True - while partition_match: + while partition_mismatch: reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) combined_vertex_assignments = reference_component_assignments.merge(component_assignments, on = 'vertex', From af08e6a27104f65a0c78cfcc1cfed0ab51d75b30 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:03:18 +0000 Subject: [PATCH 254/327] Change cudf tallying --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9fd3c303..98742672 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,7 +276,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') 
combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels','ref_labels'])['ref_labels'].count() + combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'])['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print('max is ' + str(max_ref_comp_count)) print("Reference indices: " + str(reference_indices)) From 3af73cabc6d519db7e6c7120625247c02cf873db Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:05:45 +0000 Subject: [PATCH 255/327] identify column names --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 98742672..7080dbaa 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,6 +276,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] + print("Combined assignments: " + str(combined_vertex_assignments)) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'])['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print('max is ' + str(max_ref_comp_count)) @@ -284,7 +285,6 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Reference df: " + str(G_ref_df)) print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) - print("Combined assignments: " + str(combined_vertex_assignments)) partition_match = False else: From 2f7d1f59fbe38823a9f944ec804eaa0c3add4bea Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:08:09 +0000 Subject: [PATCH 256/327] Rename columns --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7080dbaa..36c8938f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -263,7 +263,6 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if 'src' in G_df.columns: G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - G_ref_df.rename(columns={'labels': 'ref_labels'}, inplace=True) # Add self-loop if needed max_in_vertex_labels = max(reference_indices) G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) @@ -272,6 +271,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u partition_mismatch = True while partition_mismatch: reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + reference_component_assignments.rename(columns={'labels': 'ref_labels'}, inplace=True) combined_vertex_assignments = reference_component_assignments.merge(component_assignments, on = 'vertex', how = 'left') From 049aba55388c2d00281b9619c1f81195ef21b22d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:10:39 +0000 Subject: [PATCH 257/327] Fix loop control --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 36c8938f..c1db8519 100644 
--- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -285,7 +285,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Reference df: " + str(G_ref_df)) print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) - partition_match = False + partition_mismatch = False else: From ce670742bbd7c16c80b5980b0952d429851a0678 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:14:04 +0000 Subject: [PATCH 258/327] Remove some debug messages --- PopPUNK/network.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c1db8519..487847ce 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,15 +276,15 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Combined assignments: " + str(combined_vertex_assignments)) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'])['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() + print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) - print("Reference indices: " + str(reference_indices)) - print("Overall cudf: " + str(G_df)) - print("Reference df: " + str(G_ref_df)) - print("Reference component assignments: " + str(reference_component_assignments)) - print("Component assignments: " + str(component_assignments)) +# print("Reference indices: " + str(reference_indices)) +# print("Overall cudf: " + str(G_df)) +# print("Reference df: " + str(G_ref_df)) +# print("Reference component assignments: " + str(reference_component_assignments)) +# print("Component assignments: " + str(component_assignments)) partition_mismatch = False else: From a80f20686622742f0a96a4da9b8fc364ebcf6461 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:24:16 +0000 Subject: [PATCH 259/327] Print counting information --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 487847ce..0fd33afc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,7 +276,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'])['ref_labels'].nunique() + print("Counting: " + str(combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'])) + combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) From ab52160c654d0780d6857938da88d50153ceb5de Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:25:07 +0000 Subject: [PATCH 260/327] Print counting information --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/PopPUNK/network.py b/PopPUNK/network.py index 0fd33afc..bfed5cb3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,7 +276,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Counting: " + str(combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'])) + print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'])) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print("Combined assignments: " + str(combined_vertex_assignments)) From 490a227b0f267edf1d1666db6976d222294b1b89 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:26:13 +0000 Subject: [PATCH 261/327] Print as list --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bfed5cb3..6f20f5b4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,7 +276,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'])) + print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].to_arrow().to_pylist())) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print("Combined assignments: " + str(combined_vertex_assignments)) From 7683bf9e7a9b6cf2cd1f3f9b4d6dd7acb1d0e22c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:27:16 +0000 Subject: [PATCH 262/327] Print as unique list --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6f20f5b4..e1bb9fd8 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,7 +276,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].to_arrow().to_pylist())) + print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().to_arrow().to_pylist())) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print("Combined assignments: " + str(combined_vertex_assignments)) From 659ccff76da1a0fc5018c311568fb9796cdf6734 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:31:24 +0000 Subject: [PATCH 263/327] Find overall max --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e1bb9fd8..653d3944 100644 
--- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -278,7 +278,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().to_arrow().to_pylist())) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() - max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() + max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) # print("Reference indices: " + str(reference_indices)) From 80bbd67dbdbd47ef36f7b51e4197198f9c5ab8b4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:05:34 +0000 Subject: [PATCH 264/327] Test reference connectivity --- PopPUNK/network.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 653d3944..4284082d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -249,7 +249,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Record the original components to which sequences belonged component_assignments = cugraph.components.connectivity.connected_components(G) # Leiden method has resolution parameter - higher values give greater precision - partition_assignments, score = cugraph.leiden(G, resolution = 1.0) + partition_assignments, score = cugraph.leiden(G, resolution = 10.0) # group by partition, which becomes the first column, so retrieve second column reference_index_df = partition_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() @@ -277,8 +277,19 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().to_arrow().to_pylist())) - combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() + if max_ref_comp_count == 1: + partition_mismatch = False + else: + for component, component_df in combined_vertex_assignments.groupby(): + print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique())) + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique() > 1: + G_component_df = G_df[G_df['labels'] == component] + print("Component info: " + str(G_component_df)) + G_component = cugraph.Graph() + G_component.from_cudf_edgelist(G_component_df) + traversal = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) + print("Traversal: " + str(traversal)) print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) # print("Reference indices: " + str(reference_indices)) From 51e442e0034fc61b18162e4c356d6339769503f3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:06:46 +0000 Subject: [PATCH 
265/327] use debug mode --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4284082d..bd29242f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -278,7 +278,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().to_arrow().to_pylist())) max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() - if max_ref_comp_count == 1: + if max_ref_comp_count == 0: partition_mismatch = False else: for component, component_df in combined_vertex_assignments.groupby(): From 0e69d8c5ee360cb0e188e3c9739ceb9d23fa8a0b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:11:01 +0000 Subject: [PATCH 266/327] Change group by variable --- PopPUNK/network.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bd29242f..16898aef 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -281,15 +281,15 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if max_ref_comp_count == 0: partition_mismatch = False else: - for component, component_df in combined_vertex_assignments.groupby(): + for component, component_df in combined_vertex_assignments.groupby([labels], sort = False): print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique())) if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique() > 1: G_component_df = G_df[G_df['labels'] == component] print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) - traversal = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) - print("Traversal: " + str(traversal)) + distances, predecessors = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) + print("Traversal: " + str(predecessors)) print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) # print("Reference indices: " + str(reference_indices)) From c98179254cba1fd973af8caa87efbdff5b19b7b5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:38:13 +0000 Subject: [PATCH 267/327] Correct group variable selection --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 16898aef..5d8a7a83 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -281,7 +281,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if max_ref_comp_count == 0: partition_mismatch = False else: - for component, component_df in combined_vertex_assignments.groupby([labels], sort = False): + for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique())) if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique() > 1: G_component_df = G_df[G_df['labels'] == component] From 906b534fb9be2451d73f868a97760b24410588de Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:42:58 +0000 Subject: [PATCH 268/327] Add extra debug print statement --- 
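The groupby corrections above, and the .iloc[0] extraction added in the next hunk, deal with a pandas/cuDF pitfall: a grouped nunique() returns a Series even when only one group is present, and a Series has no unambiguous truth value. A small pandas illustration (cuDF raises the same way):

    import pandas as pd

    component_df = pd.DataFrame({'labels': [0, 0, 0], 'ref_labels': [0, 1, 1]})
    counts = component_df.groupby(['labels'], sort=False)['ref_labels'].nunique()
    # if counts > 1:  ->  ValueError: truth value of a Series is ambiguous
    if counts.iloc[0] > 1:  # extract the scalar before comparing
        print('component maps to multiple reference components')
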
PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5d8a7a83..d7ccbda9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -282,8 +282,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u partition_mismatch = False else: for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): - print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique())) - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique() > 1: + print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0])) + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: G_component_df = G_df[G_df['labels'] == component] print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() From ee510c434cb7922200ae9bdaeef980152817b01b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:45:38 +0000 Subject: [PATCH 269/327] Change ref selection --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d7ccbda9..85af62cd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -249,7 +249,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Record the original components to which sequences belonged component_assignments = cugraph.components.connectivity.connected_components(G) # Leiden method has resolution parameter - higher values give greater precision - partition_assignments, score = cugraph.leiden(G, resolution = 10.0) + partition_assignments, score = cugraph.leiden(G, resolution = 0.1) # group by partition, which becomes the first column, so retrieve second column reference_index_df = partition_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() From 2b981e2423b894c9d5727768237385cb5de1a943 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:47:08 +0000 Subject: [PATCH 270/327] Extend debug mode --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 85af62cd..93966156 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -283,7 +283,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u else: for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0])) - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 0: G_component_df = G_df[G_df['labels'] == component] print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() From 23e6b6023085d50675f1bba27091c71d14495241 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:50:54 +0000 Subject: [PATCH 271/327] Further debug --- PopPUNK/network.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 93966156..198706aa 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -284,7 +284,9 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u for component, component_df in 
combined_vertex_assignments.groupby(['labels'], sort = False): print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0])) if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 0: - G_component_df = G_df[G_df['labels'] == component] + vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] + print("Vertices in components: " + str(vertices_in_component)) + G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) From 2151ec8cd99df30a813c8c8c2bb51c9b46490494 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:52:23 +0000 Subject: [PATCH 272/327] Changes to debug message --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 198706aa..9e5644f5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -290,8 +290,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) - distances, predecessors = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) - print("Traversal: " + str(predecessors)) + traversal = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) + print("Traversal: " + str(traversal)) print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) # print("Reference indices: " + str(reference_indices)) From a57a1b8ad55d586df6e28eb84e192434c83e7025 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:29:21 +0000 Subject: [PATCH 273/327] Update reference indices --- PopPUNK/network.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9e5644f5..407c2a09 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -290,9 +290,17 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) - traversal = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) + traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) print("Traversal: " + str(traversal)) - print("Combined assignments: " + str(combined_vertex_assignments)) + reference_index_set = set(reference_indices) + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor']) + while len(predecessors) > 0 and len(reference_index_set.difference(predecessors)) > 0: + reference_index_set = reference_index_set.union(predecessors) + predecessors = set() + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor']) + print("Predecessors: " + str(predecessors)) + reference_indices = list(reference_index_set) + print("Final references: " + str(reference_indices)) print('max is ' + str(max_ref_comp_count)) # print("Reference indices: " + str(reference_indices)) # print("Overall cudf: " + str(G_df)) @@ -376,9 +384,9 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref 
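The change above reflects that cugraph.traversal.sssp returns a single DataFrame with 'vertex', 'distance' and 'predecessor' columns, not a (distances, predecessors) pair. A host-side stand-in with the same shape, useful for reasoning about the filtering steps that follow (values are illustrative):

    import pandas as pd

    traversal = pd.DataFrame({'vertex': [0, 1, 2, 3],
                              'distance': [0.0, 1.0, 1.0, 2.0],
                              'predecessor': [-1, 0, 0, 1]})
    # Predecessors of a chosen vertex set, dropping the source's -1 marker
    refs = [0, 3]
    preds = traversal[traversal['vertex'].isin(refs)]['predecessor']
    print(set(preds[preds >= 0]))  # {1}
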
= gt.GraphView(G, vfilt = reference_vertex) G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object - # Order found references as in sketch files - reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] - refFileName = writeReferences(reference_names, outPrefix) + # Order found references as in sketch files + reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] + refFileName = writeReferences(reference_names, outPrefix) return reference_indices, reference_names, refFileName, G_ref def writeReferences(refList, outPrefix): From 920a66be9c94492c693e98ebcc17701ecc29f487 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:35:57 +0000 Subject: [PATCH 274/327] Change series to set conversion --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 407c2a09..b48961a9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -293,11 +293,11 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor']) + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) while len(predecessors) > 0 and len(reference_index_set.difference(predecessors)) > 0: reference_index_set = reference_index_set.union(predecessors) predecessors = set() - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor']) + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) print("Predecessors: " + str(predecessors)) reference_indices = list(reference_index_set) print("Final references: " + str(reference_indices)) From 8c9cf4dc2d19f7375de6cc50588783afab59e704 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:40:51 +0000 Subject: [PATCH 275/327] Change set processing --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b48961a9..3626398c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -294,7 +294,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) - while len(predecessors) > 0 and len(reference_index_set.difference(predecessors)) > 0: + while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) predecessors = set() predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) From 4f646b3012830a2b4fefce4b5f1d1222dd5e6a5f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:51:51 +0000 Subject: [PATCH 276/327] Change filtering conditions --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py 
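The while loop refined across these patches expands the reference set along shortest-path-tree predecessors until every reference's path back to the traversal source runs entirely through references. The intended fixed point, sketched with plain Python dictionaries and sets:

    predecessor = {0: -1, 1: 0, 2: 1, 3: 1}  # vertex -> SSSP predecessor
    reference_index_set = {0, 3}

    predecessors = {predecessor[v] for v in reference_index_set if predecessor[v] >= 0}
    while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0:
        reference_index_set = reference_index_set.union(predecessors)
        predecessors = {predecessor[v] for v in reference_index_set if predecessor[v] >= 0}
    print(sorted(reference_index_set))  # [0, 1, 3]: vertex 1 now joins 3 to 0
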
b/PopPUNK/network.py index 3626398c..bbe959d5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -293,11 +293,11 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) predecessors = set() - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) print("Predecessors: " + str(predecessors)) reference_indices = list(reference_index_set) print("Final references: " + str(reference_indices)) From 4669728291d3632d9026355f9b0bd48c7b85a618 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:54:36 +0000 Subject: [PATCH 277/327] Change definition of reference set --- PopPUNK/network.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bbe959d5..ae89f62f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -293,11 +293,13 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) + #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].to_arrow().to_pylist() + predecessors = set(predecessor_list[predecessor_list >= 0]) while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) - predecessors = set() - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].to_arrow().to_pylist() + predecessors = set(predecessor_list[predecessor_list >= 0]) print("Predecessors: " + str(predecessors)) reference_indices = list(reference_index_set) print("Final references: " + str(reference_indices)) From 25b8cccf9f5a2d1e20902b2fbcee130ae8d023c9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:57:07 +0000 Subject: [PATCH 278/327] Change definition of reference set --- PopPUNK/network.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ae89f62f..948fe07e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -294,8 +294,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 
1, u print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) - predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].to_arrow().to_pylist() - predecessors = set(predecessor_list[predecessor_list >= 0]) + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'] + print("pred list: " + str(predecessor_list)) + predecessors = set(predecessor_list[predecessor_list >= 0].to_arrow().to_pylist()) +# predecessors = set(predecessor_list[predecessor_list >= 0]) while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].to_arrow().to_pylist() From 294f3fad8902c86d349ebf3baccca10bae3eed1e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:00:46 +0000 Subject: [PATCH 279/327] Change definition of reference set --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 948fe07e..28939b6b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -294,6 +294,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) + print("Ref indices: " + str(reference_indices)) predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'] print("pred list: " + str(predecessor_list)) predecessors = set(predecessor_list[predecessor_list >= 0].to_arrow().to_pylist()) From ba946b1a55b46acdf33d87ae83c721b62461b8fb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:07:45 +0000 Subject: [PATCH 280/327] Debug series filtering --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 28939b6b..9c632f4b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -296,6 +296,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) print("Ref indices: " + str(reference_indices)) predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'] + print("Raw traversal: " + str(traversal[traversal['vertex'].isin(reference_indices)])) print("pred list: " + str(predecessor_list)) predecessors = set(predecessor_list[predecessor_list >= 0].to_arrow().to_pylist()) # predecessors = set(predecessor_list[predecessor_list >= 0]) From cebdf3aad32f78b1786c6bd704dcc3b61f8e6745 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:09:50 +0000 Subject: [PATCH 281/327] Extract series values --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9c632f4b..3322fd19 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -295,7 +295,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u reference_index_set = set(reference_indices) #predecessors = 
set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) print("Ref indices: " + str(reference_indices)) - predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'] + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values() print("Raw traversal: " + str(traversal[traversal['vertex'].isin(reference_indices)])) print("pred list: " + str(predecessor_list)) predecessors = set(predecessor_list[predecessor_list >= 0].to_arrow().to_pylist()) From f86e814e257e6d5a69e3ad9db9397c22c823f899 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:14:14 +0000 Subject: [PATCH 282/327] Change extraction of values from series --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3322fd19..9512d515 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -295,10 +295,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u reference_index_set = set(reference_indices) #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) print("Ref indices: " + str(reference_indices)) - predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values() + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values print("Raw traversal: " + str(traversal[traversal['vertex'].isin(reference_indices)])) print("pred list: " + str(predecessor_list)) - predecessors = set(predecessor_list[predecessor_list >= 0].to_arrow().to_pylist()) + predecessors = set(predecessor_list[predecessor_list >= 0]) # predecessors = set(predecessor_list[predecessor_list >= 0]) while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) From 26bbc0a2bf7d0e505d0fae0a8339ab8125741ade Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:35:53 +0000 Subject: [PATCH 283/327] Comment code for impending review --- PopPUNK/network.py | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9512d515..46c80c3c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -267,52 +267,48 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u max_in_vertex_labels = max(reference_indices) G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) - # Check on targets + # Check references in same component in overall graph are connected in the reference graph partition_mismatch = True while partition_mismatch: + # Get components of original reference graph reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) reference_component_assignments.rename(columns={'labels': 'ref_labels'}, inplace=True) + # Merge with component assignments from overall graph combined_vertex_assignments = reference_component_assignments.merge(component_assignments, on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().to_arrow().to_pylist())) + # Find the number of components in the 
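These two patches settle on .values: on a cuDF Series it is an attribute (not a method) yielding a device array that accepts a boolean mask, whereas the earlier .to_arrow().to_pylist() produced a plain Python list, which rejects that mask. The same distinction shown on the host with NumPy:

    import numpy as np

    predecessor_list = np.array([-1, 0, 1, -1])
    predecessors = set(predecessor_list[predecessor_list >= 0])
    print(predecessors)  # {0, 1}

    # A plain list fails with the same indexing:
    # [-1, 0, 1, -1][predecessor_list >= 0]  ->  TypeError
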
reference graph associated with each component in the overall graph - + # should be one if there is a one-to-one mapping of components - else links need to be added max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() - if max_ref_comp_count == 0: + if max_ref_comp_count == 1: partition_mismatch = False else: + # Iterate through components for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): - print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0])) - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 0: + # Find components in the overall graph matching multiple components in the reference graph + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: + # Make a graph of the component from the overall graph vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] - print("Vertices in components: " + str(vertices_in_component)) G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] - print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) + # Find single shortest path from a reference + # Should check first will always be a reference traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) - print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) - #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) - print("Ref indices: " + str(reference_indices)) + # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - print("Raw traversal: " + str(traversal[traversal['vertex'].isin(reference_indices)])) - print("pred list: " + str(predecessor_list)) predecessors = set(predecessor_list[predecessor_list >= 0]) -# predecessors = set(predecessor_list[predecessor_list >= 0]) + # Add predecessors to reference set and check whether this results in complete paths + # where complete paths are indicated by references' predecessors being within the set of + # references while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) - predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].to_arrow().to_pylist() + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values predecessors = set(predecessor_list[predecessor_list >= 0]) - print("Predecessors: " + str(predecessors)) + # Add expanded reference set to the overall list reference_indices = list(reference_index_set) - print("Final references: " + str(reference_indices)) - print('max is ' + str(max_ref_comp_count)) -# print("Reference indices: " + str(reference_indices)) -# print("Overall cudf: " + str(G_df)) -# print("Reference df: " + str(G_ref_df)) -# print("Reference component assignments: " + str(reference_component_assignments)) -# print("Component assignments: " + str(component_assignments)) partition_mismatch = False else: From a9d8fb0f4c121a7091a7f2c5c441bc415fe35c5e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:37:08 +0000 Subject: [PATCH 284/327] Remove 
unnecessary loop --- PopPUNK/network.py | 77 ++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 46c80c3c..fa20c0dd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -268,48 +268,45 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) # Check references in same component in overall graph are connected in the reference graph - partition_mismatch = True - while partition_mismatch: - # Get components of original reference graph - reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) - reference_component_assignments.rename(columns={'labels': 'ref_labels'}, inplace=True) - # Merge with component assignments from overall graph - combined_vertex_assignments = reference_component_assignments.merge(component_assignments, - on = 'vertex', - how = 'left') - combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - # Find the number of components in the reference graph associated with each component in the overall graph - - # should be one if there is a one-to-one mapping of components - else links need to be added - max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() - if max_ref_comp_count == 1: - partition_mismatch = False - else: - # Iterate through components - for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): - # Find components in the overall graph matching multiple components in the reference graph - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: - # Make a graph of the component from the overall graph - vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] - G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] - G_component = cugraph.Graph() - G_component.from_cudf_edgelist(G_component_df) - # Find single shortest path from a reference - # Should check first will always be a reference - traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) - reference_index_set = set(reference_indices) - # Add predecessors to reference sequences on the SSSPs + # First get components of original reference graph + reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + reference_component_assignments.rename(columns={'labels': 'ref_labels'}, inplace=True) + # Merge with component assignments from overall graph + combined_vertex_assignments = reference_component_assignments.merge(component_assignments, + on = 'vertex', + how = 'left') + combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] + # Find the number of components in the reference graph associated with each component in the overall graph - + # should be one if there is a one-to-one mapping of components - else links need to be added + max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() + if max_ref_comp_count == 1: + partition_mismatch = False + else: + # Iterate through components + for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): + # Find 
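The block being reorganised here repeatedly builds a subgraph for one component by keeping only edges whose endpoints both lie in that component. A pandas sketch of the edge filter (cuDF's isin works identically); edges are illustrative:

    import pandas as pd

    G_df = pd.DataFrame({'source': [0, 1, 5, 6], 'destination': [1, 2, 6, 7]})
    vertices_in_component = pd.Series([0, 1, 2])
    G_component_df = G_df[G_df['source'].isin(vertices_in_component)
                          & G_df['destination'].isin(vertices_in_component)]
    print(G_component_df)  # keeps edges (0, 1) and (1, 2) only
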
components in the overall graph matching multiple components in the reference graph + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: + # Make a graph of the component from the overall graph + vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] + G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] + G_component = cugraph.Graph() + G_component.from_cudf_edgelist(G_component_df) + # Find single shortest path from a reference + # Should check first will always be a reference + traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) + reference_index_set = set(reference_indices) + # Add predecessors to reference sequences on the SSSPs + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values + predecessors = set(predecessor_list[predecessor_list >= 0]) + # Add predecessors to reference set and check whether this results in complete paths + # where complete paths are indicated by references' predecessors being within the set of + # references + while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: + reference_index_set = reference_index_set.union(predecessors) predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values predecessors = set(predecessor_list[predecessor_list >= 0]) - # Add predecessors to reference set and check whether this results in complete paths - # where complete paths are indicated by references' predecessors being within the set of - # references - while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: - reference_index_set = reference_index_set.union(predecessors) - predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - predecessors = set(predecessor_list[predecessor_list >= 0]) - # Add expanded reference set to the overall list - reference_indices = list(reference_index_set) - partition_mismatch = False + # Add expanded reference set to the overall list + reference_indices = list(reference_index_set) else: From 92d0f0c38f17a11722c022965cd23e9bfa64b27b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:43:45 +0000 Subject: [PATCH 285/327] Change vertex selection for SSSP --- PopPUNK/network.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index fa20c0dd..b4e569ea 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -280,20 +280,18 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # should be one if there is a one-to-one mapping of components - else links need to be added max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() if max_ref_comp_count == 1: - partition_mismatch = False - else: # Iterate through components for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): # Find components in the overall graph matching multiple components in the reference graph - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0]== 1: # Make a graph of the component from the overall graph vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] + references_in_component 
= vertices_in_component[vertices_in_component.isin(reference_indices)].values G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) # Find single shortest path from a reference - # Should check first will always be a reference - traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) + traversal = cugraph.traversal.sssp(G_component,source = references_in_component[0]) reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values From 01bbe775589d6f2e7d2a2298e24b44e656c97a03 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:46:16 +0000 Subject: [PATCH 286/327] Reconstruct reference graph where necessary --- PopPUNK/network.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b4e569ea..5af05e35 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -279,11 +279,11 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Find the number of components in the reference graph associated with each component in the overall graph - # should be one if there is a one-to-one mapping of components - else links need to be added max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() - if max_ref_comp_count == 1: + if max_ref_comp_count > 1: # Iterate through components for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): # Find components in the overall graph matching multiple components in the reference graph - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0]== 1: + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: # Make a graph of the component from the overall graph vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] references_in_component = vertices_in_component[vertices_in_component.isin(reference_indices)].values @@ -305,7 +305,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u predecessors = set(predecessor_list[predecessor_list >= 0]) # Add expanded reference set to the overall list reference_indices = list(reference_index_set) - + # Create new reference graph + G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] + G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) + else: # Each component is independent, so can be multithreaded From ad66e9cab61f4f970ea9cb5b0648a8b115721781 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:51:00 +0000 Subject: [PATCH 287/327] Debug for missing nodes --- PopPUNK/network.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5af05e35..5e2688f5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -290,7 +290,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) - # Find single shortest path from a reference + # 
Find single shortest path from a reference to all other nodes in the component traversal = cugraph.traversal.sssp(G_component,source = references_in_component[0]) reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs @@ -1239,6 +1239,9 @@ def get_vertex_list(G, use_gpu = False): if use_gpu: vlist = G.nodes().to_array().tolist() + print("Nodes: " + str(G.nodes())) + print("Array: " + str(G.nodes().to_array())) + print("List: " + str(G.nodes().to_array().tolist())) else: vlist = list(G.vertices()) From 7d49d8647a1653c7e5904c2979f8280e85f19d55 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:55:30 +0000 Subject: [PATCH 288/327] Remove debug message --- PopPUNK/network.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5e2688f5..835b67a4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1239,9 +1239,6 @@ def get_vertex_list(G, use_gpu = False): if use_gpu: vlist = G.nodes().to_array().tolist() - print("Nodes: " + str(G.nodes())) - print("Array: " + str(G.nodes().to_array())) - print("List: " + str(G.nodes().to_array().tolist())) else: vlist = list(G.vertices()) From 512c1272c1f19ea59bf59fd7b53549b55ca4acf9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 17:09:13 +0000 Subject: [PATCH 289/327] Add missing nodes with cugraph --- PopPUNK/network.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 835b67a4..8d2f7703 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -614,21 +614,23 @@ def constructNetwork(rlist, qlist, assignments, within_label, # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(vertex_labels)-1 - if max_in_df.item() != max_in_vertex_labels: - G_self_loop = cudf.DataFrame() - G_self_loop['source'] = [max_in_vertex_labels] - G_self_loop['destination'] = [max_in_vertex_labels] - if weights is not None or sparse_input is not None: - G_self_loop['weights'] = [0.0] - G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) - new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - - # construct graph - G = cugraph.Graph() - if weights is not None or sparse_input is not None: - G.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) - else: - G.from_cudf_edgelist(G_df, renumber=False) + G = add_self_loop(G_df, max_in_vertex_labels, weights = (if weights is not None), renumber = False) +# +# if max_in_df.item() != max_in_vertex_labels: +# G_self_loop = cudf.DataFrame() +# G_self_loop['source'] = [max_in_vertex_labels] +# G_self_loop['destination'] = [max_in_vertex_labels] +# if weights is not None or sparse_input is not None: +# G_self_loop['weights'] = [0.0] +# G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) +# new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) +# +# # construct graph +# G = cugraph.Graph() +# if weights is not None or sparse_input is not None: +# G.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) +# else: +# G.from_cudf_edgelist(G_df, renumber=False) else: From c504f964436248b9aa20aa6ecfed2f52e4522f2f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 17:10:47 +0000 Subject: [PATCH 290/327] Add missing nodes with cugraph --- PopPUNK/network.py | 9 ++++++--- 1 file changed, 6 
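The add_self_loop consolidation in these patches works around edge-list graphs only registering vertices that appear in at least one edge (see rapidsai/cugraph#1206): vertex 0 and the highest expected index are pinned with zero-weight self-loops so the vertex range is complete. A host-side sketch with pandas; pad_edge_list is a hypothetical stand-in for the helper:

    import pandas as pd

    def pad_edge_list(G_df, seq_num, weights=False):
        # Pin vertices 0 and seq_num so the graph spans indices 0..seq_num
        lo = min(G_df['source'].min(), G_df['destination'].min())
        hi = max(G_df['source'].max(), G_df['destination'].max())
        loops = ([0] if lo > 0 else []) + ([seq_num] if hi != seq_num else [])
        for v in loops:
            row = {'source': [v], 'destination': [v]}
            if weights:
                row['weights'] = [0.0]
            G_df = pd.concat([G_df, pd.DataFrame(row)], ignore_index=True)
        return G_df

    print(pad_edge_list(pd.DataFrame({'source': [1], 'destination': [2]}), seq_num=4))
    # adds rows (0, 0) and (4, 4) alongside the original edge
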
insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 8d2f7703..366f8f25 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -614,8 +614,11 @@ def constructNetwork(rlist, qlist, assignments, within_label, # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(vertex_labels)-1 - G = add_self_loop(G_df, max_in_vertex_labels, weights = (if weights is not None), renumber = False) -# + use_weights = False + if weights is not None: + use_weights = True + G = add_self_loop(G_df, max_in_vertex_labels, weights = use_weights, renumber = False) +# # if max_in_df.item() != max_in_vertex_labels: # G_self_loop = cudf.DataFrame() # G_self_loop['source'] = [max_in_vertex_labels] @@ -624,7 +627,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, # G_self_loop['weights'] = [0.0] # G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) # new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) -# +# # # construct graph # G = cugraph.Graph() # if weights is not None or sparse_input is not None: From 8b5ed909d888debeda0d882d5d7113d5ff1ca049 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 17:14:10 +0000 Subject: [PATCH 291/327] Add missing nodes with cugraph --- PopPUNK/network.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 366f8f25..512af9b1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -618,22 +618,6 @@ def constructNetwork(rlist, qlist, assignments, within_label, if weights is not None: use_weights = True G = add_self_loop(G_df, max_in_vertex_labels, weights = use_weights, renumber = False) -# -# if max_in_df.item() != max_in_vertex_labels: -# G_self_loop = cudf.DataFrame() -# G_self_loop['source'] = [max_in_vertex_labels] -# G_self_loop['destination'] = [max_in_vertex_labels] -# if weights is not None or sparse_input is not None: -# G_self_loop['weights'] = [0.0] -# G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) -# new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) -# -# # construct graph -# G = cugraph.Graph() -# if weights is not None or sparse_input is not None: -# G.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) -# else: -# G.from_cudf_edgelist(G_df, renumber=False) else: @@ -928,6 +912,14 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present + = np.amin([G_df['source'].max(),G_df['destination'].max()]) + if min_in_df.item() > 0: + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = [0] + G_self_loop['destination'] = [0] + if weights: + G_self_loop['weight'] = 0.0 + G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) if max_in_df.item() != seq_num: G_self_loop = cudf.DataFrame() From 624bce8376599b6f1c48f75626f6ab7779739ce6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 17:14:47 +0000 Subject: [PATCH 292/327] Add missing nodes with cugraph --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 512af9b1..b1187b51 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -912,7 +912,7 @@ def 
add_self_loop(G_df, seq_num, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present - = np.amin([G_df['source'].max(),G_df['destination'].max()]) + min_in_df = np.amin([G_df['source'].max(),G_df['destination'].max()]) if min_in_df.item() > 0: G_self_loop = cudf.DataFrame() G_self_loop['source'] = [0] From f032f26218686e75549f2f64b091263245d2cd8a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 21:10:32 +0000 Subject: [PATCH 293/327] Change cugraph node count retrieval --- PopPUNK/network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b1187b51..6f871845 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -912,13 +912,13 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present - min_in_df = np.amin([G_df['source'].max(),G_df['destination'].max()]) + min_in_df = np.amin([G_df['source'].min(),G_df['destination'].min()]) if min_in_df.item() > 0: G_self_loop = cudf.DataFrame() G_self_loop['source'] = [0] G_self_loop['destination'] = [0] if weights: - G_self_loop['weight'] = 0.0 + G_self_loop['weights'] = 0.0 G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) if max_in_df.item() != seq_num: @@ -926,7 +926,7 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True): G_self_loop['source'] = [seq_num] G_self_loop['destination'] = [seq_num] if weights: - G_self_loop['weight'] = 0.0 + G_self_loop['weights'] = 0.0 G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) # Construct graph G_new = cugraph.Graph() @@ -1235,7 +1235,7 @@ def get_vertex_list(G, use_gpu = False): """ if use_gpu: - vlist = G.nodes().to_array().tolist() + vlist = range(G.number_of_vertices().item()) else: vlist = list(G.vertices()) From 5dc2b0eca2a37e47c3a3f395e58cedb9460b6e64 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 21:11:45 +0000 Subject: [PATCH 294/327] Change int format --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6f871845..f888eafc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1235,7 +1235,7 @@ def get_vertex_list(G, use_gpu = False): """ if use_gpu: - vlist = range(G.number_of_vertices().item()) + vlist = range(G.number_of_vertices()) else: vlist = list(G.vertices()) From 7602dd31d7dbe11c4d04a4217a2271325fb6a505 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 21:40:34 +0000 Subject: [PATCH 295/327] Change save function definition --- PopPUNK/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 502575ff..51218083 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -39,7 +39,6 @@ import cudf gpu_lib = True except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False import pp_sketchlib @@ -266,7 +265,7 @@ def copy(self, prefix): """Copy the model to a new directory """ self.outPrefix = prefix - save() + self.save() class BGMMFit(ClusterFit): From bdb01410185a2f089abdf075b3c5b72d64dfa9fd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 21:42:36 +0000 Subject: [PATCH 296/327] Change GPU score calculation --- PopPUNK/network.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f888eafc..64d61c95 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -681,9 +681,9 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): components = len(component_nums) density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) - degree_df = G.degree() + degree_df = G.in_degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) - transitivity = triangle_count/triad_count + transitivity = 2*triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) From 4b78c1e19af42ad4565e051aaf9bcb45a30f90a8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 22:06:26 +0000 Subject: [PATCH 297/327] Update cytoscape viz test --- test/run_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index a72b450d..61cd3706 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -67,7 +67,7 @@ # viz sys.stderr.write("Running visualisations (poppunk_visualise)\n") subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --microreact", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape --network-file example_db/example_db_graph.gt", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --phandango", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True) From 35abac00853e50b00657ff50e78d3b2bd644a5fb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Mar 2021 11:32:51 +0000 Subject: [PATCH 298/327] Changes to messages and function arguments --- PopPUNK/models.py | 3 +-- PopPUNK/network.py | 1 - PopPUNK/refine.py | 1 - PopPUNK/sparse_mst.py | 3 +-- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 51218083..46be4219 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1009,8 +1009,7 @@ def fit(self, X, accessory): pp_sketchlib.sparsifyDists( pp_sketchlib.longToSquare(X[:, [self.dist_col]], self.threads), 0, - rank, - self.threads + rank ) data = [epsilon if d < epsilon else d for d in data] self.nn_dists[rank] = coo_matrix((data, (row, col)), diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 64d61c95..666bdd6f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -28,7 +28,6 @@ import cudf gpu_lib = True except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False from .__main__ import accepted_weights_types diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 2095e82e..f72c6900 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -30,7 +30,6 @@ import cudf gpu_lib = True except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False from .network import 
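The revised network score above computes transitivity as 2 * triangles / triads, where triads are counted as ordered two-paths, the sum of d * (d - 1) over vertex degrees. Assuming the triangle count sums per-vertex memberships (three per triangle, as networkx's triangles dictionary does), this agrees with the usual 3 * triangles / connected-triples definition; a worked check on a complete graph of three vertices:

    degrees = [2, 2, 2]              # K3: every vertex has degree 2
    triangle_count = 3               # one triangle, counted once per vertex
    triad_count = sum(d * (d - 1) for d in degrees)  # 6 ordered two-paths
    print(2 * triangle_count / triad_count)          # 1.0, as expected for K3
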
constructNetwork diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 24fc2bfa..68f8e321 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -18,13 +18,12 @@ import cudf gpu_lib = True except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False # import poppunk package from .__init__ import __version__ -from .network import constructNetwork, generate_minimum_spanning_tree, load_previous_network +from .network import constructNetwork, generate_minimum_spanning_tree, network_to_edges from .plot import drawMST from .trees import mst_to_phylogeny, write_tree from .utils import setGtThreads, readIsolateTypeFromCsv From 6a7e806e7f8d3dcb2d7c746cd32d383249373e2e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Mar 2021 11:48:47 +0000 Subject: [PATCH 299/327] Disambiguation of term 'reference' --- PopPUNK/__main__.py | 4 ++-- PopPUNK/assign.py | 8 ++++---- PopPUNK/sketchlib.py | 12 ++++++------ PopPUNK/utils.py | 16 ++++++++-------- PopPUNK/web.py | 2 +- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index fa6650f7..592102c1 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -96,7 +96,7 @@ def get_options(): default = 0.5, type = float) qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', default = 0.5, type = float) - qcGroup.add_argument('--reference-isolate', help='Isolate from which distances will be calculated for pruning [default = None]', + qcGroup.add_argument('--type-isolate', help='Isolate from which distances will be calculated for pruning [default = None]', default = None, type = str) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = 5, type = int) @@ -234,7 +234,7 @@ def main(): 'upper_n': args.upper_n, 'max_pi_dist': args.max_pi_dist, 'max_a_dist': args.max_a_dist, - 'reference_isolate': args.reference_isolate + 'type_isolate': args.type_isolate } # Dict of DB access functions diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 3f9b05ee..bc86d15e 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -38,7 +38,7 @@ def assign_query(dbFuncs, graph_weights, max_a_dist, max_pi_dist, - reference_isolate, + type_isolate, model_dir, strand_preserved, previous_clustering, @@ -378,7 +378,7 @@ def get_options(): default = 0.5, type = float) qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', default = 0.5, type = float) - qcGroup.add_argument('--reference-isolate', help='Isolate from which distances can be calculated for pruning [default = None]', + qcGroup.add_argument('--type-isolate', help='Isolate from which distances can be calculated for pruning [default = None]', default = None, type = str) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = None, type = int) @@ -471,7 +471,7 @@ def main(): 'upper_n': args.upper_n, 'max_pi_dist': args.max_pi_dist, 'max_a_dist': args.max_a_dist, - 'reference_isolate': args.reference_isolate + 'type_isolate': args.type_isolate } # Dict of DB access functions for assign_query (which is out of scope) @@ -510,7 +510,7 @@ def main(): args.graph_weights, args.max_a_dist, args.max_pi_dist, - args.reference_isolate, + args.type_isolate, args.model_dir, args.strand_preserved, args.previous_clustering, diff 
--git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 528fc1d2..cdfd3ef8 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -572,8 +572,8 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num return distMat -def pickReferenceIsolate(prefix, names): - """Selects a reference isolate as that with a minimal proportion +def pickTypeIsolate(prefix, names): + """Selects a type isolate as that with a minimal proportion of missing data. Args: @@ -583,7 +583,7 @@ def pickReferenceIsolate(prefix, names): Names of samples to QC Returns: - reference_isolate (str) + type_isolate (str) Name of isolate selected as reference """ # open databases @@ -591,7 +591,7 @@ def pickReferenceIsolate(prefix, names): hdf_in = h5py.File(db_name, 'r+') min_prop_n = 1.0 - reference_isolate = None + type_isolate = None try: # process data structures @@ -600,7 +600,7 @@ def pickReferenceIsolate(prefix, names): for dataset in read_grp: if hdf_in['sketches'][dataset].attrs['missing_bases']/hdf_in['sketches'][dataset].attrs['length'] < min_prop_n: min_prop_n = hdf_in['sketches'][dataset].attrs['missing_bases']/hdf_in['sketches'][dataset].attrs['length'] - reference_isolate = dataset + type_isolate = dataset if min_prop_n == 0.0: break # if failure still close files to avoid corruption @@ -610,7 +610,7 @@ def pickReferenceIsolate(prefix, names): print("Unexpected error:", sys.exc_info()[0], file = sys.stderr) raise - return reference_isolate + return type_isolate def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads): """Calculates random match probability based on means of genomes diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 41ef484a..fee09417 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -253,7 +253,7 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): # avoid circular import from .prune_db import prune_distance_matrix from .sketchlib import removeFromDB - from .sketchlib import pickReferenceIsolate + from .sketchlib import pickTypeIsolate # Create overall list of sequences if refList == refList: @@ -272,10 +272,10 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): sys.stderr.write("Cannot create output directory " + prefix + "\n") sys.exit(1) - # Pick reference isolate if not supplied - if qc_dict['reference_isolate'] is None: - qc_dict['reference_isolate'] = pickReferenceIsolate(ref_db, seq_names_passing) - sys.stderr.write('Selected reference isolate is ' + qc_dict['reference_isolate'] + '\n') + # Pick type isolate if not supplied + if qc_dict['type_isolate'] is None: + qc_dict['type_isolate'] = pickTypeIsolate(ref_db, seq_names_passing) + sys.stderr.write('Selected type isolate is ' + qc_dict['type_isolate'] + '\n') # First check with numpy, which is quicker than iterating over everything long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist() @@ -283,9 +283,9 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): names = list(iterDistRows(refList, queryList, refList == queryList)) # Prune sequences based on reference sequence for i in long_distance_rows: - if names[i][0] == qc_dict['reference_isolate']: + if names[i][0] == qc_dict['type_isolate']: to_prune.append(names[i][1]) - elif names[i][1] == qc_dict['reference_isolate']: + elif names[i][1] == qc_dict['type_isolate']: to_prune.append(names[i][0]) # prune based on distance from reference if provided @@ -294,7 +294,7 @@ def qcDistMat(distMat, 
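pickTypeIsolate, renamed above, chooses the sample whose sketch carries the smallest proportion of missing bases. Stripped of the HDF5 plumbing, the selection reduces to a minimum over ratios; sample names and attribute values here are illustrative:

    sketches = {'sample1': {'missing_bases': 50, 'length': 2000000},
                'sample2': {'missing_bases': 10, 'length': 2100000},
                'sample3': {'missing_bases': 400, 'length': 1900000}}
    type_isolate = min(sketches,
                       key=lambda s: sketches[s]['missing_bases'] / sketches[s]['length'])
    print(type_isolate)  # sample2
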
refList, queryList, ref_db, prefix, qc_dict): sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') sys.exit(1) elif qc_dict['qc_filter'] == 'prune' and len(to_prune) > 0: - if qc_dict['reference_isolate'] is None: + if qc_dict['type_isolate'] is None: sys.stderr.write('Distances exceeded QC thresholds but no reference isolate supplied\n') sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') sys.exit(1) diff --git a/PopPUNK/web.py b/PopPUNK/web.py index a8ed3a7e..5303a724 100644 --- a/PopPUNK/web.py +++ b/PopPUNK/web.py @@ -76,7 +76,7 @@ def sketchAssign(): args.assign.graph_weights, args.assign.max_a_dist, args.assign.max_pi_dist, - args.assign.reference_isolate, + args.assign.type_isolate, args.assign.model_dir, args.assign.strand_preserved, args.assign.previous_clustering, From a94fe2d686b3d9cebcf5d5c27323726204ac3fbc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 09:02:51 +0000 Subject: [PATCH 300/327] Check type isolate is in QC filtered set --- PopPUNK/sketchlib.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index cdfd3ef8..4aa4d43f 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -742,6 +742,13 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads # This gives back retained in the same order as names retained = [x for x in names if x in frozenset(retained)] + + # stop if type sequence does not pass QC or is absent + if qc_dict['type_isolate'] not in retained: + sys.stderr.write('Type isolate ' + qc_dict['type_isolate'] + ' not found in isolates after QC; check ' + 'name of type isolate and QC options\n') + sys.exit(1) + return retained def fitKmerCurve(pairwise, klist, jacobian): From 348e502795e4d98670a53ed0e1f9ee08a1b2db13 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 09:22:37 +0000 Subject: [PATCH 301/327] Add type isolate to reference set --- PopPUNK/__main__.py | 1 + PopPUNK/assign.py | 10 ++++++++-- PopPUNK/network.py | 20 ++++++++++++++++++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 592102c1..6cdbabaa 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -538,6 +538,7 @@ def main(): extractReferences(genomeNetwork, refList, output, + type_isolate = qc_dict['type_isolate'], threads = args.threads, use_gpu = args.gpu_graph) nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index bc86d15e..ce4e2d2c 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -295,7 +295,13 @@ def assign_query(dbFuncs, dbOrder = rNames + qNames newRepresentativesIndices, newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ - extractReferences(genomeNetwork, dbOrder, output, rNames, threads = threads, use_gpu = gpu_graph) + extractReferences(genomeNetwork, + dbOrder, + output, + rNames, + type_isolate = qc_dict['type_isolate'], + threads = threads, + use_gpu = gpu_graph) # intersection that maintains order newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] @@ -444,7 +450,7 @@ def main(): # Dict of QC options for passing to database construction and querying functions if args.length_sigma is None and None in args.length_range and args.prop_n is None \ and args.upper_n is None and args.max_a_dist is None and args.max_pi_dist is None: - qc_dict = {'run_qc': False } + qc_dict = {'run_qc': False, 
'type_isolate': None } else: # define defaults if one QC parameter given # length_sigma diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 666bdd6f..6cc611fd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -206,7 +206,8 @@ def cliquePrune(component, graph, reference_indices, components_list): ref_list = getCliqueRefs(subgraph, refs) return(list(ref_list)) -def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, use_gpu = False): +def extractReferences(G, dbOrder, outPrefix, type_isolate = None, + existingRefs = None, threads = 1, use_gpu = False): """Extract references for each cluster based on cliques Writes chosen references to file by calling :func:`~writeReferences` @@ -218,6 +219,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u The order of files in the sketches, so returned references are in the same order outPrefix (str) Prefix for output file (.refs will be appended) + type_isolate (str) + Isolate to be included in set of references existingRefs (list) References that should be used for each clique use_gpu (bool) @@ -237,6 +240,15 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u index_lookup = {v:k for k,v in enumerate(dbOrder)} reference_indices = set([index_lookup[r] for r in references]) + # Add type isolate, if necessary + type_isolate_index = None + if type_isolate is not None: + if type_isolate is in dbOrder: + type_isolate_index = dbOrder.index(type_isolate) + else: + sys.stderr.write('Type isolate ' + type_isolate + ' not found\n') + sys.exit(1) + if use_gpu: if not gpu_lib: @@ -252,7 +264,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # group by partition, which becomes the first column, so retrieve second column reference_index_df = partition_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() - + # Order found references as in sketchlib database reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) @@ -383,6 +395,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref = gt.GraphView(G, vfilt = reference_vertex) G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object + # Add type isolate if necessary + if type_isolate_index is not None and type_isolate_index not in reference_indices: + reference_indices.add(type_isolate_index) + # Order found references as in sketch files reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) From 5041a91cc838de18e20acdc1e25cd6564a69dd2e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 09:37:33 +0000 Subject: [PATCH 302/327] Fixes to conditional statements --- PopPUNK/network.py | 2 +- PopPUNK/sketchlib.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6cc611fd..1de0cfbc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -243,7 +243,7 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # Add type isolate, if necessary type_isolate_index = None if type_isolate is not None: - if type_isolate is in dbOrder: + if type_isolate in dbOrder: type_isolate_index = dbOrder.index(type_isolate) else: sys.stderr.write('Type isolate ' + type_isolate + ' not found\n') diff --git 
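
Patches 301-303 converge on one idea: after clique pruning has chosen a reference set, the type isolate must be forced back in so it can never be pruned away. A condensed sketch of that guarantee, with an illustrative helper name (the real logic lives inline in extractReferences()):

    def ensure_type_isolate(reference_indices, db_order, type_isolate):
        """Add the type isolate's vertex index to the reference set.

        reference_indices: set of vertex indices kept as references
        db_order: sample names in sketch database order
        type_isolate: sample that must remain a reference, or None
        """
        if type_isolate is not None:
            if type_isolate not in db_order:
                raise ValueError('Type isolate ' + type_isolate + ' not found')
            reference_indices.add(db_order.index(type_isolate))
        return reference_indices

Note that the index has to be added before the pruned reference graph is built from those indices, which is exactly the reordering patch 303 makes.
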
a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 4aa4d43f..7a8ef51d 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -744,7 +744,7 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads retained = [x for x in names if x in frozenset(retained)] # stop if type sequence does not pass QC or is absent - if qc_dict['type_isolate'] not in retained: + if qc_dict['type_isolate'] is not None and qc_dict['type_isolate'] not in retained: sys.stderr.write('Type isolate ' + qc_dict['type_isolate'] + ' not found in isolates after QC; check ' 'name of type isolate and QC options\n') sys.exit(1) From 7901e42ce7d3129cc4cb491d8077accb6d347bc0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 11:28:57 +0000 Subject: [PATCH 303/327] Fixes to function arguments --- PopPUNK/assign.py | 2 +- PopPUNK/network.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index ce4e2d2c..dc359f5e 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -298,7 +298,7 @@ def assign_query(dbFuncs, extractReferences(genomeNetwork, dbOrder, output, - rNames, + existingRefs = rNames, type_isolate = qc_dict['type_isolate'], threads = threads, use_gpu = gpu_graph) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1de0cfbc..30f5be08 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -160,7 +160,7 @@ def checkNetworkVertexCount(seq_list, G, use_gpu): vertex_list = set(get_vertex_list(G, use_gpu = use_gpu)) networkMissing = set(set(range(len(seq_list))).difference(vertex_list)) if len(networkMissing) > 0: - sys.stderr.write("ERROR: Samples " + ",".join(map(str,networkMissing)) + " are missing from the final network\n") + sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n") sys.exit(1) def getCliqueRefs(G, reference_indices = set()): @@ -264,7 +264,11 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # group by partition, which becomes the first column, so retrieve second column reference_index_df = partition_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() - + + # Add type isolate if necessary - before edges are added + if type_isolate_index is not None and type_isolate_index not in reference_indices: + reference_indices.add(type_isolate_index) + # Order found references as in sketchlib database reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) @@ -339,6 +343,10 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # Returns nested lists, which need to be flattened reference_indices = set([entry for sublist in ref_lists for entry in sublist]) + # Add type isolate if necessary - before edges are added + if type_isolate_index is not None and type_isolate_index not in reference_indices: + reference_indices.add(type_isolate_index) + if gt.openmp_enabled(): gt.openmp_set_num_threads(threads) @@ -395,10 +403,6 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, G_ref = gt.GraphView(G, vfilt = reference_vertex) G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object - # Add type isolate if necessary - if type_isolate_index is not None and type_isolate_index not in reference_indices: - reference_indices.add(type_isolate_index) - # Order found references as in sketch files reference_names = 
[dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) From 7742e8344993949f02e377abd79c1fc1e3e46f8d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 11:30:21 +0000 Subject: [PATCH 304/327] Change cudf memory management --- PopPUNK/network.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 30f5be08..b10512eb 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -622,7 +622,10 @@ def constructNetwork(rlist, qlist, assignments, within_label, if not gpu_lib: sys.stderr.write('Unable to load GPU libraries; exiting\n') sys.exit(1) - + + # Set memory management for large networks + cudf.set_allocator("managed") + # create DataFrame using edge tuples if weights is not None or sparse_input is not None: G_df = cudf.DataFrame(connections, columns =['source', 'destination', 'weights']) From 77e9d64e0a3ea1a6aefae236c67c825afe71eb32 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 12:40:07 +0000 Subject: [PATCH 305/327] Add no-plot mode for models --- PopPUNK/__main__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 6cdbabaa..075cf1b3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -116,6 +116,8 @@ def get_options(): modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster ' 'in DBSCAN fitting [default = 0.0001]', type=float, default=0.0001) modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float) + modelGroup.add_argument('--no-plot', help='Switch off plotting, which can be slow for large datasets', type=bool, + default=False, action='store_true') # model refinement refinementGroup = parser.add_argument_group('Refine model options') From 2c7d0141edf14042e00dc3faee013e7daa8f12cc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 12:42:19 +0000 Subject: [PATCH 306/327] Extend no-plot mode for models --- PopPUNK/__main__.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 075cf1b3..72f8fede 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -389,13 +389,15 @@ def main(): model = DBSCANFit(output) model.set_threads(args.threads) assignments = model.fit(distMat, args.D, args.min_cluster_prop) - model.plot() + if not args.no_plot: + model.plot() # Run Gaussian model elif args.fit_model == "bgmm": model = BGMMFit(output) model.set_threads(args.threads) assignments = model.fit(distMat, args.K) - model.plot(distMat, assignments) + if not args.no_plot: + model.plot(distMat, assignments) elif args.fit_model == "refine": new_model = RefineFit(output) model.set_threads(args.threads) @@ -407,14 +409,16 @@ def main(): args.score_idx, args.no_local, args.gpu_graph) - new_model.plot(distMat) + if not args.no_plot: + new_model.plot(distMat) model = new_model elif args.fit_model == "threshold": new_model = RefineFit(output) new_model.set_threads(args.threads) assignments = new_model.apply_threshold(distMat, args.threshold) - new_model.plot(distMat) + if not args.no_plot: + new_model.plot(distMat) model = new_model elif args.fit_model == "lineage": # run lineage clustering. 
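
Patch 304's one-line change switches cuDF onto CUDA managed (unified) memory, so device data structures larger than GPU RAM can spill to host memory instead of failing outright. A guarded sketch of the pattern, assuming a RAPIDS installation (newer RAPIDS releases expose the same switch as rmm.reinitialize(managed_memory=True)):

    try:
        import cudf
        gpu_lib = True
    except ImportError:
        gpu_lib = False

    def enable_managed_memory():
        """Allow cuDF allocations to oversubscribe device memory."""
        if not gpu_lib:
            raise RuntimeError('Unable to load GPU libraries')
        cudf.set_allocator('managed')
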
Sparsity & low rank should keep memory @@ -422,7 +426,8 @@ def main(): model = LineageFit(output, rank_list) model.set_threads(args.threads) model.fit(distMat, args.use_accessory) - model.plot(distMat) + if not args.no_plot: + model.plot(distMat) assignments = {} for rank in rank_list: From c450710f102053c3186ed86863795ff729e979eb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:12:36 +0000 Subject: [PATCH 307/327] Move model processing flags to optimisation arg group --- PopPUNK/__main__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 72f8fede..4d8a78cc 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -116,8 +116,6 @@ def get_options(): modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster ' 'in DBSCAN fitting [default = 0.0001]', type=float, default=0.0001) modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float) - modelGroup.add_argument('--no-plot', help='Switch off plotting, which can be slow for large datasets', type=bool, - default=False, action='store_true') # model refinement refinementGroup = parser.add_argument_group('Refine model options') @@ -127,8 +125,6 @@ def get_options(): type=float, default = None) refinementGroup.add_argument('--manual-start', help='A file containing information for a start point. ' 'See documentation for help.', default=None) - refinementGroup.add_argument('--no-local', help='Do not perform the local optimization step (speed up on very large datasets)', - default=False, action='store_true') refinementGroup.add_argument('--model-dir', help='Directory containing model to use for assigning queries ' 'to clusters [default = reference database directory]', type = str) refinementGroup.add_argument('--score-idx', @@ -159,6 +155,10 @@ def get_options(): other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]') other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]') other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') + other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets', type=bool, + default=False, action='store_true') + other.add_argument('--no-local', help='Do not perform the local optimization step in model refinement (speed up on very large datasets)', + default=False, action='store_true') other.add_argument('--version', action='version', version='%(prog)s '+__version__) From f823964f92b8e3ec4057491a7ff2c65ff10d6522 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:15:44 +0000 Subject: [PATCH 308/327] Update new lines --- scripts/poppunk_batch_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 6a6f8eae..50fa152e 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -247,7 +247,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, # Validate batches batch_names = sorted(set(batches)) if len(batch_names) < 2: - sys.stderr.write("You must supply multiple batches") + sys.stderr.write("You must supply multiple batches\n") sys.exit(1) first_batch = batch_names.pop(0) From a5c4f0282f98084706307cf3ef07b9066f7ae432 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 
13:17:35 +0000 Subject: [PATCH 309/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b10512eb..e58a442b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -322,7 +322,7 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, reference_indices = list(reference_index_set) # Create new reference graph G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) + G_ref = add_self_loop(G_ref_df, max_in_vertex_labels, renumber = False) else: From e6abfb489bc3c5f02c1bf8841a933c3a1119acbd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:17:58 +0000 Subject: [PATCH 310/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e58a442b..e455e076 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -705,7 +705,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.in_degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) - transitivity = 2*triangle_count/triad_count + transitivity = 2 * triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) From 78ee9c23dd61415abc9ee96b26af9e4ffae8b0e5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:18:25 +0000 Subject: [PATCH 311/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e455e076..7c2b6ac4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -934,7 +934,7 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present - min_in_df = np.amin([G_df['source'].min(),G_df['destination'].min()]) + min_in_df = np.amin([G_df['source'].min(), G_df['destination'].min()]) if min_in_df.item() > 0: G_self_loop = cudf.DataFrame() G_self_loop['source'] = [0] From 5b362fbb70dd3ad095b46e5f27eeadd5f916d656 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:19:39 +0000 Subject: [PATCH 312/327] Reinsert library loading warning --- PopPUNK/refine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index f72c6900..2095e82e 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -30,6 +30,7 @@ import cudf gpu_lib = True except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False from .network import constructNetwork From 03f11a86c907a527f5725cd95f87858e6fa9a68a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:34:51 +0000 Subject: [PATCH 313/327] Change model plotting behaviour --- PopPUNK/__main__.py | 23 +++++++++-------------- PopPUNK/models.py | 5 ++++- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 4d8a78cc..47e17ba3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -155,7 +155,7 @@ def get_options(): other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when 
calculating distances [default = False]') other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]') other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') - other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets', type=bool, + other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets', default=False, action='store_true') other.add_argument('--no-local', help='Do not perform the local optimization step in model refinement (speed up on very large datasets)', default=False, action='store_true') @@ -323,9 +323,10 @@ def main(): qc_dict) # Plot results - plot_scatter(distMat, - args.output + "/" + os.path.basename(args.output) + "_distanceDistribution", - args.output + " distances") + if not args.no_plot: + plot_scatter(distMat, + args.output + "/" + os.path.basename(args.output) + "_distanceDistribution", + args.output + " distances") #******************************# #* *# @@ -389,15 +390,11 @@ def main(): model = DBSCANFit(output) model.set_threads(args.threads) assignments = model.fit(distMat, args.D, args.min_cluster_prop) - if not args.no_plot: - model.plot() # Run Gaussian model elif args.fit_model == "bgmm": model = BGMMFit(output) model.set_threads(args.threads) assignments = model.fit(distMat, args.K) - if not args.no_plot: - model.plot(distMat, assignments) elif args.fit_model == "refine": new_model = RefineFit(output) model.set_threads(args.threads) @@ -409,16 +406,12 @@ def main(): args.score_idx, args.no_local, args.gpu_graph) - if not args.no_plot: - new_model.plot(distMat) model = new_model elif args.fit_model == "threshold": new_model = RefineFit(output) new_model.set_threads(args.threads) assignments = new_model.apply_threshold(distMat, args.threshold) - if not args.no_plot: - new_model.plot(distMat) model = new_model elif args.fit_model == "lineage": # run lineage clustering. 
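
The --no-plot change above is more than tidying: as first declared in patch 305, the flag combined type=bool with action='store_true', a combination argparse rejects with a TypeError the moment the argument is defined, because store-true actions accept no type keyword. The corrected shape, runnable as-is:

    import argparse

    parser = argparse.ArgumentParser()
    # action='store_true' already yields a bool; passing type=bool as well
    # raises TypeError when add_argument() is called
    parser.add_argument('--no-plot', default=False, action='store_true',
                        help='Switch off model plotting')

    args = parser.parse_args(['--no-plot'])
    assert args.no_plot is True
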
Sparsity & low rank should keep memory @@ -426,8 +419,6 @@ def main(): model = LineageFit(output, rank_list) model.set_threads(args.threads) model.fit(distMat, args.use_accessory) - if not args.no_plot: - model.plot(distMat) assignments = {} for rank in rank_list: @@ -436,6 +427,10 @@ def main(): # save model model.save() + + # plot model + if not args.no_plot: + model.plot(distMat, assignments) # use model else: diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 46be4219..48968247 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1048,7 +1048,7 @@ def load(self, fit_npz, fit_obj): self.nn_dists = fit_npz self.fitted = True - def plot(self, X): + def plot(self, X, y = None): '''Extends :func:`~ClusterFit.plot` Write a summary of the fit, and plot the results using @@ -1057,6 +1057,9 @@ def plot(self, X): Args: X (numpy.array) Core and accessory distances + y (any) + Unused variable for compatibility with other + plotting functions ''' ClusterFit.plot(self, X) for rank in self.ranks: From 3e5160e5a1b7520cf4d475e6a975d6ded57cb9fd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:59:58 +0000 Subject: [PATCH 314/327] Add cudf and cugraph --- environment.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/environment.yml b/environment.yml index 0c525fe0..d8bdce3e 100644 --- a/environment.yml +++ b/environment.yml @@ -4,6 +4,8 @@ channels: - bioconda - defaults - r + - nvidia + - rapidsai dependencies: - pip - numpy @@ -42,3 +44,5 @@ dependencies: - libgomp - tqdm - flask-apscheduler + - cudf + - cugraph From 13e565fec7099512f8fcc84a89ec0fa7c71e44c6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 17:24:19 +0000 Subject: [PATCH 315/327] Change list command to append --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7c2b6ac4..d684c35d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -267,7 +267,7 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # Add type isolate if necessary - before edges are added if type_isolate_index is not None and type_isolate_index not in reference_indices: - reference_indices.add(type_isolate_index) + reference_indices.append(type_isolate_index) # Order found references as in sketchlib database reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] From 771ae57ee25925e2a4b42b676ee80c5a3b15c920 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 19:29:19 +0000 Subject: [PATCH 316/327] Limit betweenness calculation with GPU --- PopPUNK/network.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d684c35d..4019b919 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -310,7 +310,8 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - predecessors = set(predecessor_list[predecessor_list >= 0]) + print("Predecessors: " + str(predecessor_list) + " type: " + str(type(predecessor_list))) + predecessors = set(predecessor_list[predecessor_list >= 0].values) # Add predecessors to reference set and check whether this results in complete paths # where complete paths are indicated by references' predecessors being within the set of # references @@ -725,7 +726,7 @@ def networkSummary(G, 
calc_betweenness=True, use_gpu = False): if size > 3: component_vertices = component_assignments['vertex'][component_assignments['labels']==component] subgraph = cugraph.subgraph(G, component_vertices) - component_betweenness = cugraph.betweenness_centrality(G) + component_betweenness = cugraph.betweenness_centrality(G, k = 250) betweenness.append(component_betweenness['betweenness_centrality'].max()) sizes.append(size) else: From ca8c4292d4c350b6364cee0844b499a8f61ecad6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 20:32:53 +0000 Subject: [PATCH 317/327] Set from numpy ndarray --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ac42166b..1badcce9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -312,8 +312,8 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - print("Predecessors: " + str(predecessor_list) + " type: " + str(type(predecessor_list))) - predecessors = set(predecessor_list[predecessor_list >= 0].values) + print("Predecessors: " + str(predecessor_list.flatten()) + " type: " + str(type(predecessor_list))) + predecessors = set(predecessor_list[predecessor_list >= 0].flatten()) # Add predecessors to reference set and check whether this results in complete paths # where complete paths are indicated by references' predecessors being within the set of # references From b526a2f06b594a93d7efb1db731d3f420bd67196 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 21:40:57 +0000 Subject: [PATCH 318/327] Convert ndarray to list --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1badcce9..98df6821 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -312,8 +312,8 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - print("Predecessors: " + str(predecessor_list.flatten()) + " type: " + str(type(predecessor_list))) - predecessors = set(predecessor_list[predecessor_list >= 0].flatten()) + print("Predecessors: " + str(predecessor_list.flatten().tolist()) + " type: " + str(type(predecessor_list))) + predecessors = set(predecessor_list[predecessor_list >= 0].flatten().tolist()) # Add predecessors to reference set and check whether this results in complete paths # where complete paths are indicated by references' predecessors being within the set of # references From 414c27bc838aee5ea559f8439210a3682f8ed77f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 22:06:50 +0000 Subject: [PATCH 319/327] Convert ndarray to list --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 98df6821..e6091054 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -320,7 +320,7 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) predecessor_list = 
traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - predecessors = set(predecessor_list[predecessor_list >= 0]) + predecessors = set(predecessor_list[predecessor_list >= 0].flatten().tolist()) # Add expanded reference set to the overall list reference_indices = list(reference_index_set) # Create new reference graph From 0cef59d21b518cac65014b2445e12227fa0cdcd6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 22:40:01 +0000 Subject: [PATCH 320/327] Remove debug message --- PopPUNK/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e6091054..bb6bf73e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -312,7 +312,6 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - print("Predecessors: " + str(predecessor_list.flatten().tolist()) + " type: " + str(type(predecessor_list))) predecessors = set(predecessor_list[predecessor_list >= 0].flatten().tolist()) # Add predecessors to reference set and check whether this results in complete paths # where complete paths are indicated by references' predecessors being within the set of From d6667a1e95004e3c1f207e512aebbba7ef97a9ba Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 12:23:55 +0000 Subject: [PATCH 321/327] Remove nvidia packages from CI --- environment.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/environment.yml b/environment.yml index d8bdce3e..0c525fe0 100644 --- a/environment.yml +++ b/environment.yml @@ -4,8 +4,6 @@ channels: - bioconda - defaults - r - - nvidia - - rapidsai dependencies: - pip - numpy @@ -44,5 +42,3 @@ dependencies: - libgomp - tqdm - flask-apscheduler - - cudf - - cugraph From b5e03eba24eb05b127dffa375e3b11da762e50d9 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 12:25:37 +0000 Subject: [PATCH 322/327] Remove whitespace --- PopPUNK/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index fee09417..fc4c4310 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -242,25 +242,25 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): Prefix of output files qc_dict (dict) Dict of QC options - + Returns: seq_names_passing (list) List of isolates passing QC distance filters distMat ([n,2] numpy ndarray) Filtered long form distance matrix """ - + # avoid circular import from .prune_db import prune_distance_matrix from .sketchlib import removeFromDB from .sketchlib import pickTypeIsolate - + # Create overall list of sequences if refList == refList: seq_names_passing = refList else: seq_names_passing = refList + queryList - + # Sequences to remove to_prune = [] @@ -287,7 +287,7 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): to_prune.append(names[i][1]) elif names[i][1] == qc_dict['type_isolate']: to_prune.append(names[i][0]) - + # prune based on distance from reference if provided if qc_dict['qc_filter'] == 'stop' and len(to_prune) > 0: sys.stderr.write('Outlier distances exceed QC thresholds; prune sequences or raise thresholds\n') From 62e69d25e2dfb03eb14f86c0e5b37cf49575897a Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 12:29:08 +0000 Subject: [PATCH 323/327] Remove cudf/cugraph err message --- PopPUNK/network.py | 6 +++--- 
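
Patches 317-319 above are all wrestling with the same boundary: cuDF's Series.values returns a device-side (CuPy) array, and its filtered elements cannot be fed straight into a Python set. The flatten().tolist() round trip moves the data to the host first. A sketch of the final form, shown here with NumPy, whose API CuPy mirrors:

    import numpy as np  # stand-in: cupy exposes the same calls on the GPU

    def predecessors_to_set(predecessor_list):
        """Drop the -1 'no predecessor' sentinels and return a host-side set."""
        kept = predecessor_list[predecessor_list >= 0]
        return set(kept.flatten().tolist())

    # predecessors_to_set(np.array([-1, 0, 3, 3])) returns {0, 3}
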
PopPUNK/refine.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bb6bf73e..6bd4bb90 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -19,6 +19,7 @@ from collections import defaultdict, Counter from functools import partial from multiprocessing import Pool +import pickle import graph_tool.all as gt import dendropy @@ -252,7 +253,6 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, sys.exit(1) if use_gpu: - if not gpu_lib: sys.stderr.write('Unable to load GPU libraries; exiting\n') sys.exit(1) @@ -327,7 +327,6 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, G_ref = add_self_loop(G_ref_df, max_in_vertex_labels, renumber = False) else: - # Each component is independent, so can be multithreaded components = gt.label_components(G)[0].a @@ -470,7 +469,8 @@ def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, else: old_ids = old_rlist + old_qlist else: - sys.stderr.write('Pkl file containing names of sequences in previous network\n') + sys.stderr.write('Missing .pkl file containing names of sequences in ' + 'previous network\n') sys.exit(1) # Get edges as lists of source,destination,weight using original IDs diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 55196767..139a50bc 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -30,7 +30,6 @@ import cudf gpu_lib = True except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False from .network import constructNetwork From a1c6692ea9b6ab3270713054485fde99129e64df Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 12:35:40 +0000 Subject: [PATCH 324/327] trailing whitespace --- PopPUNK/sketchlib.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 7a8ef51d..b5099e59 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -592,7 +592,7 @@ def pickTypeIsolate(prefix, names): min_prop_n = 1.0 type_isolate = None - + try: # process data structures read_grp = hdf_in['sketches'] @@ -742,13 +742,13 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads # This gives back retained in the same order as names retained = [x for x in names if x in frozenset(retained)] - + # stop if type sequence does not pass QC or is absent if qc_dict['type_isolate'] is not None and qc_dict['type_isolate'] not in retained: sys.stderr.write('Type isolate ' + qc_dict['type_isolate'] + ' not found in isolates after QC; check ' 'name of type isolate and QC options\n') sys.exit(1) - + return retained def fitKmerCurve(pairwise, klist, jacobian): From 1a11b21c926e3da08332094f49a1a17a2d876c94 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 23 Mar 2021 12:44:13 +0000 Subject: [PATCH 325/327] Change cugraph betweenness calculation --- PopPUNK/network.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6bd4bb90..34354c1f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -727,7 +727,11 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if size > 3: component_vertices = component_assignments['vertex'][component_assignments['labels']==component] subgraph = cugraph.subgraph(G, component_vertices) - component_betweenness = cugraph.betweenness_centrality(G, k = 250) + max_betweeness_k = 1000 + if len(component_vertices) >= max_betweeness_k: + component_betweenness = 
cugraph.betweenness_centrality(subgraph, k = max_betweeness_k) + else: + component_betweenness = cugraph.betweenness_centrality(subgraph) betweenness.append(component_betweenness['betweenness_centrality'].max()) sizes.append(size) else: From f98bd0fa2c7a029f8196a9056ada11d614c0dd24 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 16:32:00 +0000 Subject: [PATCH 326/327] Remove multiprocessing block from 2d network refine w/ GPU --- MANIFEST.in | 2 +- PopPUNK/refine.py | 68 +++++++++++++++++++++++++++-------------------- PopPUNK/utils.py | 2 +- environment.yml | 2 +- 4 files changed, 42 insertions(+), 32 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index b510d8bd..ad9e8edf 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ recursive-include scripts *.py -recursive-include PopPUNK/data *.json *.gz *.txt \ No newline at end of file +recursive-include PopPUNK/data *.gz \ No newline at end of file diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 139a50bc..fc3b752b 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -112,30 +112,41 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, x_max = np.linspace(x_max_start, x_max_end, global_grid_resolution, dtype=np.float32) y_max = np.linspace(y_max_start, y_max_end, global_grid_resolution, dtype=np.float32) - if gt.openmp_enabled(): - gt.openmp_set_num_threads(1) - - with SharedMemoryManager() as smm: - shm_distMat = smm.SharedMemory(size = distMat.nbytes) - distances_shared_array = np.ndarray(distMat.shape, dtype = distMat.dtype, buffer = shm_distMat.buf) - distances_shared_array[:] = distMat[:] - distances_shared = NumpyShared(name = shm_distMat.name, shape = distMat.shape, dtype = distMat.dtype) - - with Pool(processes = num_processes) as pool: - global_s = pool.map(partial(newNetwork2D, - sample_names = sample_names, - distMat = distances_shared, - x_range = x_max, - y_range = y_max, - score_idx = score_idx, - use_gpu = use_gpu), - range(global_grid_resolution)) - - if gt.openmp_enabled(): - gt.openmp_set_num_threads(num_processes) - - global_s = list(chain.from_iterable(global_s)) - min_idx = np.argmin(np.array(global_s)) + if use_gpu: + global_s = map(partial(newNetwork2D, + sample_names = sample_names, + distMat = distMat, + x_range = x_max, + y_range = y_max, + score_idx = score_idx, + use_gpu = True), + range(global_grid_resolution)) + else: + if gt.openmp_enabled(): + gt.openmp_set_num_threads(1) + + with SharedMemoryManager() as smm: + shm_distMat = smm.SharedMemory(size = distMat.nbytes) + distances_shared_array = np.ndarray(distMat.shape, dtype = distMat.dtype, buffer = shm_distMat.buf) + distances_shared_array[:] = distMat[:] + distances_shared = NumpyShared(name = shm_distMat.name, shape = distMat.shape, dtype = distMat.dtype) + + with Pool(processes = num_processes) as pool: + global_s = pool.map(partial(newNetwork2D, + sample_names = sample_names, + distMat = distances_shared, + x_range = x_max, + y_range = y_max, + score_idx = score_idx, + use_gpu = False), + range(global_grid_resolution)) + + if gt.openmp_enabled(): + gt.openmp_set_num_threads(num_processes) + + global_s = np.array(list(chain.from_iterable(global_s))) + global_s[np.isnan(global_s)] = 1 + min_idx = np.argmin(global_s) optimal_x = x_max[min_idx % global_grid_resolution] optimal_y = y_max[min_idx // global_grid_resolution] @@ -217,18 +228,17 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ Optional thread idx (if multithreaded) to offset progress bar by use_gpu (bool) Whether to 
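
Patch 325 replaces the earlier attempt of patch 316 (a fixed k = 250, which also mistakenly scored the whole graph G rather than the component's subgraph) with sampled betweenness on the subgraph. Exact betweenness is O(VE) with Brandes' algorithm, so for large components cuGraph's k parameter estimates it from a sample of source vertices. A sketch of the decision, assuming a RAPIDS installation:

    import cugraph

    def component_max_betweenness(subgraph, n_vertices, max_k=1000):
        """Maximum betweenness centrality within one network component."""
        if n_vertices >= max_k:
            # estimate from max_k sampled source vertices
            scores = cugraph.betweenness_centrality(subgraph, k=max_k)
        else:
            scores = cugraph.betweenness_centrality(subgraph)
        return scores['betweenness_centrality'].max()
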
use cugraph for graph analyses - + Returns: scores (list) -1 * network score for each of x_range. Where network score is from :func:`~PopPUNK.network.networkSummary` """ - # load CUDA libraries if use_gpu and not gpu_lib: sys.stderr.write('Unable to load GPU libraries; exiting\n') sys.exit(1) - + scores = [] edge_list = [] prev_idx = 0 @@ -313,7 +323,7 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, Number of CPUs to use for calculating assignment use_gpu (bool) Whether to use cugraph for graph analysis - + Returns: score (float) -1 * network score. Where network score is from :func:`~PopPUNK.network.networkSummary` @@ -364,7 +374,7 @@ def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, us [default = 0] use_gpu (bool) Whether to use cugraph for graph analysis - + Returns: scores (list) -1 * network score for each of x_range. diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index fc4c4310..eb880604 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -275,7 +275,7 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): # Pick type isolate if not supplied if qc_dict['type_isolate'] is None: qc_dict['type_isolate'] = pickTypeIsolate(ref_db, seq_names_passing) - sys.stderr.write('Selected type isolate is ' + qc_dict['type_isolate'] + '\n') + sys.stderr.write('Selected type isolate for distance QC is ' + qc_dict['type_isolate'] + '\n') # First check with numpy, which is quicker than iterating over everything long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist() diff --git a/environment.yml b/environment.yml index 0c525fe0..76b01588 100644 --- a/environment.yml +++ b/environment.yml @@ -17,7 +17,7 @@ dependencies: - hdbscan - rapidnj - h5py - - pp-sketchlib >=1.6.2 + - pp-sketchlib >=1.7.0 - graph-tool >=2.35 - requests - flask From e1879a82c3dcdcb387687d94e6f9ad76d4e95180 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 16:56:21 +0000 Subject: [PATCH 327/327] Fix web test --- PopPUNK/assign.py | 5 +- PopPUNK/web.py | 8 +- test/clean_test.py | 4 +- test/test-web.py | 220 +++++++++++++++++++++++---------------------- test/web_args.txt | 7 +- 5 files changed, 132 insertions(+), 112 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 626d2cb9..dcd1f276 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -160,7 +160,10 @@ def assign_query(dbFuncs, threads = threads, use_gpu = gpu_dist) # QC distance matrix - seq_names_passing, distMat = qcDistMat(qrDistMat, rNames, qNames, ref_db, output, qc_dict) + if qc_dict['run_qc']: + seq_names_passing = qcDistMat(qrDistMat, rNames, qNames, ref_db, output, qc_dict)[0] + else: + seq_names_passing = rNames + qNames # Load the network based on supplied options genomeNetwork, old_cluster_file = \ diff --git a/PopPUNK/web.py b/PopPUNK/web.py index 5303a724..6b013f61 100644 --- a/PopPUNK/web.py +++ b/PopPUNK/web.py @@ -114,16 +114,18 @@ def sketchAssign(): args.visualise.strand_preserved, outdir + "/include.txt", species_db, - species_db, + species_db + "/" + os.path.basename(species_db) + "_clusters.csv", args.visualise.previous_query_clustering, - outdir, + outdir + "/" + os.path.basename(outdir) + "_graph.gt", args.visualise.gpu_graph, args.visualise.info_csv, args.visualise.rapidnj, args.visualise.tree, args.visualise.mst_distances, args.visualise.overwrite, args.visualise.core_only, - args.visualise.accessory_only) + args.visualise.accessory_only, + args.visualise.display_cluster, 
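
Patch 326's refactor of refineFit() encodes a general rule for mixed CPU/GPU code: a CUDA context does not survive being forked into multiprocessing workers, so the GPU path must stay in a single process and only the CPU path fans out over a pool. The skeleton of that split, with score_one_column standing in for the real newNetwork2D():

    from functools import partial
    from itertools import chain
    from multiprocessing import Pool

    def grid_scores(score_one_column, grid_resolution, use_gpu, num_processes):
        """Score every column of the 2D boundary grid."""
        if use_gpu:
            # serial map: keep all cuGraph calls in this process
            scores = map(partial(score_one_column, use_gpu=True),
                         range(grid_resolution))
        else:
            with Pool(processes=num_processes) as pool:
                scores = pool.map(partial(score_one_column, use_gpu=False),
                                  range(grid_resolution))
        return list(chain.from_iterable(scores))

The same patch also makes the minimum search NaN-safe, setting any NaN scores to 1 before np.argmin is taken.
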
+ web=True) networkJson = graphml_to_json(outdir) if len(to_include) >= 3: with open(os.path.join(outdir, os.path.basename(outdir) + "_core_NJ.nwk"), "r") as p: diff --git a/test/clean_test.py b/test/clean_test.py index 29852e14..b1923144 100755 --- a/test/clean_test.py +++ b/test/clean_test.py @@ -43,7 +43,9 @@ def deleteDir(dirname): "example_api", "batch1", "batch2", - "batch12" + "batch3", + "batch12", + "batch123" ] for outDir in outputDirs: deleteDir(outDir) diff --git a/test/test-web.py b/test/test-web.py index 56f47bf5..a69505c1 100644 --- a/test/test-web.py +++ b/test/test-web.py @@ -10,114 +10,122 @@ from PopPUNK.utils import setupDBFuncs from PopPUNK.visualise import generate_visualisations -# Copy and move args and sketch files into example dirs -copyfile("web_args.txt", "example_db/args.txt") -copyfile("example_viz/example_viz_core_NJ.nwk", "example_viz/example_viz.nwk") +def main(): + # Copy and move args and sketch files into example dirs + copyfile("web_args.txt", "example_db/args.txt") + copyfile("example_viz/example_viz_core_NJ.nwk", "example_viz/example_viz.nwk") -# Test the output of the PopPUNk-web upload route for incorrect data types -sys.stderr.write('\nTesting assign for PopPUNK-web\n') -with open("json_sketch.txt", "r") as s: - sketch = s.read() -species = "Listeria monocytogenes" -species_db = "example_db" -outdir = "example_api" -if not os.path.exists(outdir): - os.mkdir(outdir) -args = default_options(species_db) -qc_dict = {'run_qc': False } -dbFuncs = setupDBFuncs(args.assign, args.assign.min_kmer_count, qc_dict) -ClusterResult = assign_query(dbFuncs, - args.assign.ref_db, - args.assign.q_files, + # Test the output of the PopPUNk-web upload route for incorrect data types + sys.stderr.write('\nTesting assign for PopPUNK-web\n') + with open("json_sketch.txt", "r") as s: + sketch = s.read() + species = "Listeria monocytogenes" + species_db = "example_db" + outdir = "example_api" + if not os.path.exists(outdir): + os.mkdir(outdir) + args = default_options(species_db) + qc_dict = {'run_qc': False } + dbFuncs = setupDBFuncs(args.assign, args.assign.min_kmer_count, qc_dict) + ClusterResult = assign_query(dbFuncs, + args.assign.ref_db, + args.assign.q_files, + outdir, + qc_dict, + args.assign.update_db, + args.assign.write_references, + args.assign.distances, + args.assign.threads, + args.assign.overwrite, + args.assign.plot_fit, + args.assign.graph_weights, + args.assign.max_a_dist, + args.assign.max_pi_dist, + args.assign.type_isolate, + args.assign.model_dir, + args.assign.strand_preserved, + args.assign.previous_clustering, + args.assign.external_clustering, + args.assign.core_only, + args.assign.accessory_only, + args.assign.gpu_sketch, + args.assign.gpu_dist, + args.assign.gpu_graph, + args.assign.deviceid, + args.assign.web, + sketch, + args.assign.save_partial_query_graph) + query, query_prevalence, clusters, prevalences, alias_dict, to_include = \ + summarise_clusters(outdir, species, species_db) + colours = get_colours(query, clusters) + url = api(query, "example_viz") + sys.stderr.write('PopPUNK-web assign test successful\n') + + # Test generate_visualisations() for PopPUNK-web + sys.stderr.write('\nTesting visualisations for PopPUNK-web\n') + if len(to_include) < 3: + args.visualise.microreact = False + generate_visualisations(outdir, + species_db, + None, + args.visualise.threads, outdir, - args.assign.update_db, - args.assign.write_references, - args.assign.distances, - args.assign.threads, - args.assign.overwrite, - args.assign.plot_fit, - 
args.assign.graph_weights, - args.assign.max_a_dist, - args.assign.max_pi_dist, - args.assign.reference_isolate, - args.assign.model_dir, - args.assign.strand_preserved, - args.assign.previous_clustering, - args.assign.external_clustering, - args.assign.core_only, - args.assign.accessory_only, - args.assign.gpu_sketch, - args.assign.gpu_dist, - args.assign.gpu_graph, - args.assign.deviceid, - args.assign.web, - sketch, - args.assign.save_partial_query_graph) -query, query_prevalence, clusters, prevalences, alias_dict, to_include = \ - summarise_clusters(outdir, species, species_db) -colours = get_colours(query, clusters) -url = api(query, "example_viz") -sys.stderr.write('PopPUNK-web assign test successful\n') + args.visualise.gpu_dist, + args.visualise.deviceid, + args.visualise.external_clustering, + args.visualise.microreact, + args.visualise.phandango, + args.visualise.grapetree, + args.visualise.cytoscape, + args.visualise.perplexity, + args.visualise.strand_preserved, + outdir + "/include.txt", + species_db, + species_db + "/" + os.path.basename(species_db) + "_clusters.csv", + args.visualise.previous_query_clustering, + outdir + "/" + os.path.basename(outdir) + "_graph.gt", + args.visualise.gpu_graph, + args.visualise.info_csv, + args.visualise.rapidnj, + args.visualise.tree, + args.visualise.mst_distances, + args.visualise.overwrite, + args.visualise.core_only, + args.visualise.accessory_only, + args.visualise.display_cluster, + web=True) + networkJson = graphml_to_json(outdir) + if len(to_include) >= 3: + with open(os.path.join(outdir, os.path.basename(outdir) + "_core_NJ.nwk"), "r") as p: + phylogeny = p.read() + else: + phylogeny = "A tree cannot be built with fewer than 3 samples." -# Test generate_visualisations() for PopPUNK-web -sys.stderr.write('\nTesting visualisations for PopPUNK-web\n') -if len(to_include) < 3: - args.visualise.microreact = False -generate_visualisations(outdir, - species_db, - None, - args.visualise.threads, - outdir, - args.visualise.gpu_dist, - args.visualise.deviceid, - args.visualise.external_clustering, - args.visualise.microreact, - args.visualise.phandango, - args.visualise.grapetree, - args.visualise.cytoscape, - args.visualise.perplexity, - args.visualise.strand_preserved, - outdir + "/include.txt", - species_db, - species_db, - args.visualise.previous_query_clustering, - outdir, - args.visualise.info_csv, - args.visualise.rapidnj, - args.visualise.tree, - args.visualise.mst_distances, - args.visualise.overwrite, - args.visualise.core_only, - args.visualise.accessory_only) -networkJson = graphml_to_json(outdir) -if len(to_include) >= 3: - with open(os.path.join(outdir, os.path.basename(outdir) + "_core_NJ.nwk"), "r") as p: - phylogeny = p.read() -else: - phylogeny = "A tree cannot be built with fewer than 3 samples." 
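
Restructuring test-web.py so that everything runs inside main() behind an import guard is not cosmetic: the assignment step can spawn multiprocessing workers, and under start methods that re-import the test module, module-level test code would run again in every worker. The minimal shape, with the real steps elided:

    def main():
        # assign, visualise and output type checks run here
        ...

    if __name__ == "__main__":
        main()
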
+ # ensure web api outputs are of the correct type + if not isinstance(species, str): + raise TypeError('"Species" datatype is incorrect, should be string.\n') + if not (isinstance(query_prevalence, float) or isinstance(query_prevalence, int)): + raise TypeError('"query_prevalence" datatype is incorrect, should be float/integer.\n') + if not isinstance(query, str): + raise TypeError('"query" datatype is incorrect, should be string.\n') + if not isinstance(clusters, list) and not isinstance(clusters[0], str): + raise TypeError('"clusters" datatype is incorrect, should be list of strings.\n') + if not isinstance(prevalences, list) and not (isinstance(prevalences[0], float) or isinstance(prevalences[0], int)): + raise TypeError('"prevalences" datatype is incorrect, should be list of floats/integers.\n') + if not isinstance(colours, list) and not isinstance(colours[0], str): + raise TypeError('"colours" datatype is incorrect, should be list of strings.\n') + if not isinstance(url, str): + raise TypeError('"url" datatype is incorrect, should be string.\n') + if not isinstance(alias_dict, dict): + raise TypeError('"alias_dict" datatype is incorrect, should be dictionary.\n') + if not isinstance(outdir, str): + raise TypeError('"outdir" datatype is incorrect, should be string.\n') + if not isinstance(networkJson, dict): + raise TypeError('"networkJson" datatype is incorrect, should be dict.\n') + if not isinstance(phylogeny, str): + raise TypeError('"phylogeny" datatype is incorrect, should be str.\n') -# ensure web api outputs are of the correct type -if not isinstance(species, str): - raise TypeError('"Species" datatype is incorrect, should be string.\n') -if not (isinstance(query_prevalence, float) or isinstance(query_prevalence, int)): - raise TypeError('"query_prevalence" datatype is incorrect, should be float/integer.\n') -if not isinstance(query, str): - raise TypeError('"query" datatype is incorrect, should be string.\n') -if not isinstance(clusters, list) and not isinstance(clusters[0], str): - raise TypeError('"clusters" datatype is incorrect, should be list of strings.\n') -if not isinstance(prevalences, list) and not (isinstance(prevalences[0], float) or isinstance(prevalences[0], int)): - raise TypeError('"prevalences" datatype is incorrect, should be list of floats/integers.\n') -if not isinstance(colours, list) and not isinstance(colours[0], str): - raise TypeError('"colours" datatype is incorrect, should be list of strings.\n') -if not isinstance(url, str): - raise TypeError('"url" datatype is incorrect, should be string.\n') -if not isinstance(alias_dict, dict): - raise TypeError('"alias_dict" datatype is incorrect, should be dictionary.\n') -if not isinstance(outdir, str): - raise TypeError('"outdir" datatype is incorrect, should be string.\n') -if not isinstance(networkJson, dict): - raise TypeError('"networkJson" datatype is incorrect, should be dict.\n') -if not isinstance(phylogeny, str): - raise TypeError('"phylogeny" datatype is incorrect, should be str.\n') + sys.stderr.write('\nAPI tests complete\n') -sys.stderr.write('\nAPI tests complete\n') +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test/web_args.txt b/test/web_args.txt index 2eb7f0fb..dc397c11 100644 --- a/test/web_args.txt +++ b/test/web_args.txt @@ -8,6 +8,8 @@ "plot_fit":0, "graph_weights":true, "max_a_dist":0.5, + "max_pi_dist":0.5, + "type_isolate":null, "strand_preserved":false, "external_clustering":null, "core_only":false, @@ -19,6 +21,7 @@ "gpu_sketch":false, "deviceid":0, 
"gpu_dist":false, + "gpu_graph":false, "min_kmer_count":0, "min_k":14, "max_k":29, @@ -36,6 +39,7 @@ "visualise":{ "threads":1, "gpu_dist":false, + "gpu_graph":false, "deviceid":0, "external_clustering":null, "microreact":true, @@ -51,6 +55,7 @@ "mst_distances":"core", "overwrite":true, "core_only":false, - "accessory_only":false + "accessory_only":false, + "display_cluster":null } }