From 04c8f0d13245527f0ba108d9ba3aeb8fb2cff1e1 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 1 Feb 2021 15:24:04 +0000
Subject: [PATCH 001/327] Change reference to clustering in MST plot

---
 PopPUNK/plot.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
index 521b41ae..92526178 100644
--- a/PopPUNK/plot.py
+++ b/PopPUNK/plot.py
@@ -446,12 +446,12 @@ def drawMST(mst, outPrefix, isolate_clustering, overwrite):
                   output=graph1_file_name, output_size=(3000, 3000))
     if overwrite or not os.path.isfile(graph2_file_name):
         cluster_fill = {}
-        for cluster in set(isolate_clustering['Cluster'].values()):
+        for cluster in set(isolate_clustering['Rank_50_Lineage'].values()):
             cluster_fill[cluster] = list(np.random.rand(3)) + [0.9]
         plot_color = mst.new_vertex_property('vector<double>')
         mst.vertex_properties['plot_color'] = plot_color
         for v in mst.vertices():
-            plot_color[v] = cluster_fill[isolate_clustering['Cluster'][mst.vp.id[v]]]
+            plot_color[v] = cluster_fill[isolate_clustering['Rank_50_Lineage'][mst.vp.id[v]]]
         gt.graph_draw(mst, pos=pos, vertex_fill_color=mst.vertex_properties['plot_color'],
                       output=graph2_file_name, output_size=(3000, 3000))

From 352a8aa558f317f8a259616e375ec41a850312fd Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 1 Feb 2021 21:39:50 +0000
Subject: [PATCH 002/327] Enable colouring by lineage with MSTs

---
 PopPUNK/plot.py       |  8 +++++---
 PopPUNK/sparse_mst.py | 18 ++++++++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
index 92526178..51e40e09 100644
--- a/PopPUNK/plot.py
+++ b/PopPUNK/plot.py
@@ -414,7 +414,7 @@ def distHistogram(dists, rank, outPrefix):
                 "_rank_" + str(rank) + "_histogram.png")
     plt.close()

-def drawMST(mst, outPrefix, isolate_clustering, overwrite):
+def drawMST(mst, outPrefix, isolate_clustering, clustering_name, overwrite):
     """Plot a layout of the minimum spanning tree

     Args:
@@ -424,6 +424,8 @@ def drawMST(mst, outPrefix, isolate_clustering, overwrite):
             Output prefix for save files
         isolate_clustering (dict)
             Dictionary of ID: cluster, used for colouring vertices
+        clustering_name (str)
+            Name of clustering scheme to be used for colouring
         overwrite (bool)
             Overwrite existing output files
     """
@@ -446,12 +448,12 @@ def drawMST(mst, outPrefix, isolate_clustering, overwrite):
                   output=graph1_file_name, output_size=(3000, 3000))
     if overwrite or not os.path.isfile(graph2_file_name):
         cluster_fill = {}
-        for cluster in set(isolate_clustering['Rank_50_Lineage'].values()):
+        for cluster in set(isolate_clustering[clustering_name].values()):
             cluster_fill[cluster] = list(np.random.rand(3)) + [0.9]
         plot_color = mst.new_vertex_property('vector<double>')
         mst.vertex_properties['plot_color'] = plot_color
         for v in mst.vertices():
-            plot_color[v] = cluster_fill[isolate_clustering['Rank_50_Lineage'][mst.vp.id[v]]]
+            plot_color[v] = cluster_fill[isolate_clustering[clustering_name][mst.vp.id[v]]]
         gt.graph_draw(mst, pos=pos, vertex_fill_color=mst.vertex_properties['plot_color'],
                       output=graph2_file_name, output_size=(3000, 3000))

diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index 50a9e4b1..a106e009 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -32,6 +32,7 @@ def get_options():
     iGroup.add_argument('--distances', required=True, help='Prefix of input pickle of pre-calculated distances (required)')
     iGroup.add_argument('--rank-fit', required=True, help='Location of rank fit, a sparse matrix (*_rank*_fit.npz)')
     iGroup.add_argument('--previous-clustering', help='CSV file with cluster definitions')
+    iGroup.add_argument('--display-cluster', default=None, help='Column of clustering CSV to use for plotting')

     # output options
     oGroup = parser.add_argument_group('Output options')
@@ -116,7 +117,7 @@ def main():
     if not args.no_plot:
         if args.previous_clustering != None:
             mode = "clusters"
-            if re.match(r"_lineages\.csv$", args.previous_clustering):
+            if args.previous_clustering.endswith('_lineages.csv'):
                 mode = "lineages"
             isolateClustering = readIsolateTypeFromCsv(args.previous_clustering,
                                                        mode = mode,
@@ -127,7 +128,20 @@
             for v in mst.vertices:
                 isolateClustering['Cluster'][mst.vp.id[v]] = '0'

-        drawMST(mst, args.output, isolateClustering, True)
+        # Check selecting clustering type is in CSV
+        clustering_name = 'Cluster'
+        if args.display_cluster != None and args.previous_clustering != None:
+            if args.display_cluster not in isolateClustering.keys():
+                sys.stderr.write('Unable to find clustering column ' + args.display_cluster + ' in file ' +
+                                 args.previous_clustering + '\n')
+                sys.exit()
+            else:
+                clustering_name = args.display_cluster
+        else:
+            args.display_cluster = list(isolateClustering.keys())[0]
+
+        # Draw MST
+        drawMST(mst, args.output, isolateClustering, args.display_cluster, True)

     sys.exit(0)

From 5bbc775f4ad7219a3b875db4b1a27b8944c547b9 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 1 Feb 2021 22:10:10 +0000
Subject: [PATCH 003/327] Fix name processing for display clustering

---
 PopPUNK/sparse_mst.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index a106e009..b53866fa 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -138,10 +138,10 @@ def main():
             else:
                 clustering_name = args.display_cluster
         else:
-            args.display_cluster = list(isolateClustering.keys())[0]
+            clustering_name = list(isolateClustering.keys())[0]

         # Draw MST
-        drawMST(mst, args.output, isolateClustering, args.display_cluster, True)
+        drawMST(mst, args.output, isolateClustering, clustering_name, True)

     sys.exit(0)
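Taken together, patches 002 and 003 make the MST colouring column selectable from the clustering CSV, falling back to the first column when --display-cluster is not given. A sketch of the resulting invocation, assuming an existing lineage fit; the paths and column name are illustrative, following the test data used elsewhere in this series:

    poppunk_mst --distances example_db/example_db.dists \
        --rank-fit example_lineages/example_lineages_rank5_fit.npz \
        --previous-clustering example_lineages/example_lineages_lineages.csv \
        --display-cluster Rank_5_Lineage \
        --output example_sparse_mst
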
From d70d65e2638a126d693daff8413287ed33142232 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 1 Feb 2021 22:25:03 +0000
Subject: [PATCH 004/327] Harmonise visualise.py with sparse_mst.py

---
 PopPUNK/visualise.py | 58 +++++++++++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 20 deletions(-)

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index d767a54d..ab9fdf73 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -53,15 +53,18 @@ def get_options():
                              'to clusters [default = reference database directory]',
                         type = str)
     iGroup.add_argument('--previous-clustering',
-                        help='Directory containing previous cluster definitions '
+                        help='File containing previous cluster definitions '
                              'and network [default = use that in the directory '
                              'containing the model]',
                         type = str)
     iGroup.add_argument('--previous-query-clustering',
-                        help='Directory containing previous cluster definitions '
+                        help='File containing previous cluster definitions '
                              'from poppunk_assign [default = use that in the directory '
-                             'containing the model]',
+                             'of the query database]',
                         type = str)
+    iGroup.add_argument('--display-cluster',
+                        help='Column of clustering CSV to use for plotting',
+                        default=None)

     # output options
     oGroup = parser.add_argument_group('Output options')
@@ -295,34 +298,37 @@ def generate_visualisations(query_db,
             sys.exit(1)

     # Load previous clusters
-    mode = "clusters"
-    suffix = "_clusters.csv"
-    if model.type == "lineage":
-        mode = "lineages"
-        suffix = "_lineages.csv"
-        if model.indiv_fitted:
-            sys.stderr.write("Note: Individual (core/accessory) fits found, but "
-                             "visualisation only supports combined boundary fit\n")
-
-    # Set directories of previous fit
     if previous_clustering is not None:
         prev_clustering = previous_clustering
+        mode = "clusters"
+        suffix = "_clusters.csv"
+        if prev_clustering.endswith('_lineages.csv'):
+            mode = "lineages"
+            suffix = "_lineages.csv"
     else:
-        prev_clustering = os.path.dirname(model_file)
-    cluster_file = prev_clustering + '/' + os.path.basename(prev_clustering) + suffix
-    isolateClustering = readIsolateTypeFromCsv(cluster_file,
+        # Identify type of clustering based on model
+        mode = "clusters"
+        suffix = "_clusters.csv"
+        if model.type == "lineage":
+            mode = "lineages"
+            suffix = "_lineages.csv"
+            if model.indiv_fitted:
+                sys.stderr.write("Note: Individual (core/accessory) fits found, but "
+                                 "visualisation only supports combined boundary fit\n")
+        prev_clustering = os.path.dirname(model_file) + '/' + os.path.basename(model_file) + suffix
+    isolateClustering = readIsolateTypeFromCsv(prev_clustering,
                                                mode = mode,
                                                return_dict = True)

     # Join clusters with query clusters if required
     if not self:
         if previous_query_clustering is not None:
-            prev_query_clustering = previous_query_clustering + '/' + os.path.basename(previous_query_clustering)
+            prev_query_clustering = previous_query_clustering
         else:
-            prev_query_clustering = query_db
+            prev_query_clustering = os.path.dirname(query_db) + '/' + os.path.basename(query_db) + suffix

         queryIsolateClustering = readIsolateTypeFromCsv(
-                prev_query_clustering + suffix,
+                prev_query_clustering,
                 mode = mode,
                 return_dict = True)
         isolateClustering = joinClusterDicts(isolateClustering, queryIsolateClustering)
@@ -348,7 +354,19 @@ def generate_visualisations(query_db,
                                  weights_type=mst_distances,
                                  summarise=False)
             mst_graph = generate_minimum_spanning_tree(G)
-            drawMST(mst_graph, output, isolateClustering, overwrite)
+            # Check selecting clustering type is in CSV
+            clustering_name = 'Cluster'
+            if args.display_cluster != None:
+                if args.display_cluster not in isolateClustering.keys():
+                    sys.stderr.write('Unable to find clustering column ' + args.display_cluster + ' in file ' +
+                                     prev_clustering + '\n')
+                    sys.exit()
+                else:
+                    clustering_name = args.display_cluster
+            else:
+                clustering_name = list(isolateClustering.keys())[0]
+            # Draw MST
+            drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite)
             mst_tree = mst_to_phylogeny(mst_graph, isolateNameToLabel(combined_seq))
         else:
             mst_tree = existing_tree

From 77bd637b31db75172da7f21e017830d44e369a9a Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 1 Feb 2021 22:27:42 +0000
Subject: [PATCH 005/327] Pass display cluster argument correctly

---
 PopPUNK/visualise.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index ab9fdf73..8bfeef25 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -153,6 +153,7 @@ def generate_visualisations(query_db,
                             overwrite,
                             core_only,
                             accessory_only,
+                            display_cluster,
                             web):

     from .models import loadClusterFit
@@ -356,13 +357,13 @@ def generate_visualisations(query_db,
             mst_graph = generate_minimum_spanning_tree(G)
             # Check selecting clustering type is in CSV
             clustering_name = 'Cluster'
-            if args.display_cluster != None:
-                if args.display_cluster not in isolateClustering.keys():
-                    sys.stderr.write('Unable to find clustering column ' + args.display_cluster + ' in file ' +
+            if display_cluster != None:
+                if display_cluster not in isolateClustering.keys():
+                    sys.stderr.write('Unable to find clustering column ' + display_cluster + ' in file ' +
                                      prev_clustering + '\n')
                     sys.exit()
                 else:
-                    clustering_name = args.display_cluster
+                    clustering_name = display_cluster
             else:
                 clustering_name = list(isolateClustering.keys())[0]
             # Draw MST
@@ -461,6 +462,7 @@ def main():
                             args.overwrite,
                             args.core_only,
                             args.accessory_only,
+                            args.display_cluster,
                             web = False)

 if __name__ == '__main__':

From 1aed21aabd0a20731bad031235a3690a1c9996ef Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 08:12:14 +0000
Subject: [PATCH 006/327] Update test for new argument parsing

---
 test/run_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/run_test.py b/test/run_test.py
index c3f11051..ad249492 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -62,7 +62,7 @@
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --query-db example_query --output example_viz_query --microreact", shell=True, check=True)
-subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True)
+subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages/example_dbscan_clusters.csv --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --distances example_query/example_query.dists --ref-db example_db --model-dir example_lineages --query-db example_lineage_query --output example_viz_query_lineages --microreact", shell=True, check=True)

 # MST
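With --display-cluster now passed through poppunk_visualise as well, the same column choice applies when an MST is drawn as part of a visualisation. An illustrative invocation (database names follow the test suite above; the column name and flag combination are assumptions at this point in the series):

    poppunk_visualise --ref-db example_db --output example_mst \
        --microreact --tree mst --display-cluster Rank_5_Lineage
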
From 282dc921a42cf7163f15528c56cae23e9561d79f Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 09:42:50 +0000
Subject: [PATCH 007/327] Fix reference to fetchNetwork

---
 PopPUNK/visualise.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index 8bfeef25..385294b3 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -425,7 +425,12 @@ def generate_visualisations(query_db,

     if cytoscape:
         sys.stderr.write("Writing cytoscape output\n")
-        genomeNetwork, cluster_file = fetchNetwork(prev_clustering, model, rlist, False, core_only, accessory_only)
+        genomeNetwork, cluster_file = fetchNetwork(os.path.dirname(prev_clustering),
+                                                   model,
+                                                   rlist,
+                                                   False,
+                                                   core_only,
+                                                   accessory_only)
         outputsForCytoscape(genomeNetwork, mst_graph, isolateClustering, output, info_csv, viz_subset = viz_subset)
         if model.type == 'lineage':
             sys.stderr.write("Note: Only support for output of cytoscape graph at lowest rank\n")

From 6aba143c0e15465499c24e824d842cc9c589b17f Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 10:32:39 +0000
Subject: [PATCH 008/327] Do not exit when display cluster is missing

---
 PopPUNK/visualise.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index 385294b3..75682248 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -359,9 +359,9 @@ def generate_visualisations(query_db,
             clustering_name = 'Cluster'
             if display_cluster != None:
                 if display_cluster not in isolateClustering.keys():
+                    clustering_name = list(isolateClustering.keys())[0]
                     sys.stderr.write('Unable to find clustering column ' + display_cluster + ' in file ' +
-                                     prev_clustering + '\n')
-                    sys.exit()
+                                     prev_clustering + '; instead using ' + clustering_name + '\n')
                 else:
                     clustering_name = display_cluster
             else:

From 51b939fdf1ddb1a8f259c1840a425a0efee57c9f Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 10:35:18 +0000
Subject: [PATCH 009/327] Update documents for --previous-clustering

---
 docs/visualisation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/visualisation.rst b/docs/visualisation.rst
index 666daa7a..43d655e8 100644
--- a/docs/visualisation.rst
+++ b/docs/visualisation.rst
@@ -44,7 +44,7 @@ Visualisation after query assignment::

 Visualisation when sketches and models are in different folders::

-    poppunk_visualise --ref-db example_db --previous-clustering example_lineages \
+    poppunk_visualise --ref-db example_db --previous-clustering example_lineages/example_lineages_lineages.csv \
        --model-dir example_lineages --output example_viz --microreact

 Visualisation with a lineage model, which has been queried (query-query distances must be provided)::

From 1f7d34fbd5a041413a520b68b7de718ee96068b0 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 10:38:10 +0000
Subject: [PATCH 010/327] Update tests for visualisation

---
 test/run_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/run_test.py b/test/run_test.py
index ad249492..0e028fd9 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -62,7 +62,7 @@
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --query-db example_query --output example_viz_query --microreact", shell=True, check=True)
-subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages/example_dbscan_clusters.csv --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True)
+subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages/example_lineages_lineages.csv --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True)
 subprocess.run("python ../poppunk_visualise-runner.py --distances example_query/example_query.dists --ref-db example_db --model-dir example_lineages --query-db example_lineage_query --output example_viz_query_lineages --microreact", shell=True, check=True)

 # MST

From 9df22acc518bada0769a59431824e41c7785da52 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 11:14:06 +0000
Subject: [PATCH 011/327] Indent change to MST calculation

---
 PopPUNK/sparse_mst.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index b53866fa..e0ad75f1 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -92,6 +92,7 @@ def main():
         G_cu = cugraph.Graph()
         G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False)

+        # Generate minimum spanning tree
         sys.stderr.write("Calculating MST (GPU part)\n")
         G_mst = cugraph.minimum_spanning_tree(G_cu, weight='weights')
         edge_df = G_mst.view_edge_list()
@@ -106,8 +107,10 @@ def main():
         G = constructNetwork(rlist, rlist, None, 0, sparse_input=sparse_mat, summarise=False)
         sys.stderr.write("Calculating MST (CPU)\n")
-
-    mst = generate_minimum_spanning_tree(G, args.gpu_graph)
+        # Generate minimum spanning tree
+        mst = generate_minimum_spanning_tree(G, args.gpu_graph)
+
+    # Save output
     sys.stderr.write("Generating output\n")
     mst.save(args.output + "/" + os.path.basename(args.output) + ".graphml", fmt="graphml")
     mst_as_tree = mst_to_phylogeny(mst, rlist)

From 89ccbb0abb0b49956f43f52023beab1a5fa777d6 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 11:34:49 +0000
Subject: [PATCH 012/327] Correct grammar in help message

---
 scripts/poppunk_easy_run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/poppunk_easy_run.py b/scripts/poppunk_easy_run.py
index ccefca10..28d53e87 100755
--- a/scripts/poppunk_easy_run.py
+++ b/scripts/poppunk_easy_run.py
@@ -13,7 +13,7 @@ def get_options():
                                      prog='easy_run')

     # input options
-    parser.add_argument('--r-files', help='List of sequence names and files (as for --r-files')
+    parser.add_argument('--r-files', help='List of sequence names and files (as for --r-files)')
     parser.add_argument('--output', help='Prefix for output files')
     parser.add_argument('--analysis-args', help="Other arguments to pass to poppunk. e.g. "
From e5f4a229ef39558d666f1eb89cbf2d5bb1f1d13d Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 2 Feb 2021 15:26:28 +0000
Subject: [PATCH 013/327] Add batch MST script

---
 scripts/poppunk_batch_mst.py | 158 +++++++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100755 scripts/poppunk_batch_mst.py

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
new file mode 100755
index 00000000..06c0e5ef
--- /dev/null
+++ b/scripts/poppunk_batch_mst.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+# vim: set fileencoding=<encoding name> :
+# Copyright 2018-2020 John Lees and Nick Croucher
+
+# universal
+import os
+import sys
+import argparse
+import subprocess
+import shutil
+import glob
+from collections import defaultdict
+
+def write_batch(batched_sequences, files, batch, output):
+    out_fn = output + '.' + batch + '.list'
+    with open(out_fn,'w') as out_file:
+        for seq in batched_sequences[batch]:
+            out_file.write(seq + "\t" + files[seq] + "\n")
+    return out_fn
+
+# command line parsing
+def get_options():
+
+    parser = argparse.ArgumentParser(description='Batch MST mode (create db + lineage model fit + assign)',
+                                     prog='batch_mst')
+
+    # input options
+    ioGroup = parser.add_argument_group('Input and output file options')
+    ioGroup.add_argument('--batch-file', help='Tab-separated list of sequence names, files '
+                                              'and batch assignments',
+                         required = True)
+    ioGroup.add_argument('--batch-order', help='File specifying order in which batches should '
+                                               'be processed')
+    ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch',
+                         default=False,
+                         action='store_true')
+    ioGroup.add_argument('--output', help='Prefix for output files', required=True)
+
+    # analysis options
+    aGroup = parser.add_argument_group('Analysis options')
+    aGroup.add_argument('--rank', help='Rank used to fit lineage model (int)',
+                        type = int,
+                        default = 1)
+    aGroup.add_argument('--threads', help='Number of threads for parallelisation (int)',
+                        type = int,
+                        default = 1)
+    aGroup.add_argument('--gpu', help='Use GPU for analysis',
+                        default=False,
+                        action='store_true')
+    aGroup.add_argument('--deviceid', help='GPU device ID (int)',
+                        type = int,
+                        default = 0)
+    aGroup.add_argument('--db-args', help="Other arguments to pass to poppunk. e.g. "
+                                          "'--min-k 13 --max-k 29'",
+                        default = "")
+    aGroup.add_argument('--model-args', help="Other arguments to pass to lineage model fit",
+                        default = "")
+    aGroup.add_argument('--assign-args', help="Other arguments to pass to poppunk_assign",
+                        default = "")
+
+    # Executable options
+    eGroup = parser.add_argument_group('Executable locations')
+    eGroup.add_argument('--poppunk-exe', help="Location of poppunk executable. Use "
+                                              "'python poppunk-runner.py' to run from source tree")
+    eGroup.add_argument('--assign-exe', help="Location of poppunk executable. Use "
+                                             "'python poppunk-runner.py' to run from source tree")
+    eGroup.add_argument('--mst-exe', help="Location of poppunk executable. Use "
+                                          "'python poppunk-runner.py' to run from source tree")
+
+    return parser.parse_args()
+
+# main code
+if __name__ == "__main__":
+
+    # Check input ok
+    args = get_options()
+
+    # Get poppunk executable
+    if args.poppunk_exe is None:
+        poppunk = "poppunk"
+    else:
+        poppunk = args.poppunk_exe
+    # Need to add poppunk_assign_exe
+
+    # Check input file and batching
+    batch_set = set()
+    files = {}
+    batched_sequences = defaultdict(list)
+    with open(args.batch_file,'r') as input_file:
+        for line in input_file.readlines():
+            info = line.rstrip().split()
+            files[info[0]] = info[1]
+            batch_set.add(info[2])
+            batched_sequences[info[2]].append(info[0])
+
+    # Check on batch order
+    batches = []
+    if args.batch_order is not None:
+        with open(args.batch_order,'r') as order_file:
+            batches = [line for line in input_file.readlines().rstrip()]
+        if set(batches) != batch_set:
+            batch_discrepancies = set(batches).difference(batch_set) + \
+                                  batch_set.difference(set(batches))
+            sys.stderr.write('Discrepancies between input file and batch '
+                             'ordering: ' + str(batch_discrepancies) + '\n')
+            sys.exit()
+    else:
+        batches = list(batch_set)
+
+    # Iterate through batches
+    first_batch = True
+    current_dir = args.output
+    for batch in batches:
+        # Write batch file
+        batch_fn = write_batch(batched_sequences, files, batch, args.output)
+        if first_batch:
+            # Initialise database
+            create_db_cmd = poppunk + " --create-db --r-files " + batch_fn + " --output " + args.output + " " + args.db_args + " --threads " + str(args.threads) + " " + args.db_args
+            if args.gpu:
+                create_db_cmd = create_db_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid)
+            sys.stderr.write(create_db_cmd + "\n")
+            subprocess.run(create_db_cmd, shell=True, check=True)
+            # Fit lineage model
+            fit_model_cmd = poppunk + " --fit-model lineage --ref-db " + args.output + " --rank " + str(args.rank) + " --threads " + str(args.threads) + " " + args.model_args
+            sys.stderr.write(fit_model_cmd + "\n")
+            subprocess.run(fit_model_cmd, shell=True, check=True)
+            # Completed first batch
+            first_batch = False
+        else:
+            # Define batch prefix
+            batch_prefix = args.output + "_" + batch
+            # Add to first batch through querying
+            assign_cmd = "poppunk_assign --db " + current_dir + " --query " + batch_fn + " --model-dir " + args.output + " --output " + batch_prefix + " --threads " + str(args.threads) + " --update-db " + args.assign_args
+            if args.gpu:
+                assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid)
+            sys.stderr.write(assign_cmd + "\n")
+            subprocess.run(assign_cmd, shell=True, check=True)
+            # Process output
+            if args.keep_intermediates:
+#                shutil.rmtree(batch_prefix)
+                current_dir = batch_prefix
+                print("Switch current dir to " + current_dir)
+            else:
+                for file in glob.glob(args.output + "_" + batch + "/*"):
+                    file_basename = os.path.basename(file)
+                    if file_basename.startswith(batch_prefix):
+                        print("Moving file " + args.output + "_" + batch + '/' + file_basename + " to " + current_dir + '/' + file_basename.replace(batch_prefix,args.output))
+                        os.rename(args.output + "_" + batch + '/' + file_basename,
+                                  current_dir + '/' + file_basename.replace(batch_prefix,args.output))
+                shutil.rmtree(args.output + "_" + batch)
+
+    # Remove npy dist file
+#    os.remove(args.output + "/" + args.output + ".dists.npy")
+
+    # Calculate MST
+    mst_command = "poppunk_mst --distances " + args.output + "/" + args.output + ".dists --rank-fit " + args.output + "/" + args.output + "_rank" + str(args.rank) + "_fit.npz --previous-clustering " + args.output + "/" + args.output + "_lineages.csv --output " + args.output + " --threads " + str(args.threads)
+    if args.gpu:
+        mst_command = mst_command + " --gpu-network"
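At this stage the new script drives the whole create-db / lineage-fit / repeated poppunk_assign --update-db cycle from a single three-column, tab-separated --batch-file. A sketch of that input and an invocation (sample names, file names and batch labels are invented for the example):

    sample1	assemblies/sample1.fasta	batch1
    sample2	assemblies/sample2.fasta	batch1
    sample3	assemblies/sample3.fasta	batch2

    python scripts/poppunk_batch_mst.py --batch-file batches.tsv \
        --rank 2 --threads 4 --output all_samples
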
+ "/" + args.output + "_lineages.csv --output " + args.output + " --threads " + str(args.threads) + if args.gpu: + mst_command = mst_command + " --gpu-network" From 4da168254545328e5d2fffdcca4629b32e2236ce Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 3 Feb 2021 18:15:18 +0000 Subject: [PATCH 014/327] Refactor batch script --- PopPUNK/sparse_mst.py | 27 ++-- scripts/poppunk_batch_mst.py | 233 +++++++++++++++++++---------------- setup.py | 1 + 3 files changed, 141 insertions(+), 120 deletions(-) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index e0ad75f1..71b78f10 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -29,9 +29,9 @@ def get_options(): # input options iGroup = parser.add_argument_group('Input files') - iGroup.add_argument('--distances', required=True, help='Prefix of input pickle of pre-calculated distances (required)') iGroup.add_argument('--rank-fit', required=True, help='Location of rank fit, a sparse matrix (*_rank*_fit.npz)') iGroup.add_argument('--previous-clustering', help='CSV file with cluster definitions') + iGroup.add_argument('--distance-pkl', help='Input pickle from distances, which contains sample names') iGroup.add_argument('--display-cluster', default=None, help='Column of clustering CSV to use for plotting') # output options @@ -66,6 +66,18 @@ def main(): sys.stderr.write("cugraph and cudf unavailable\n") raise ImportError(e) + # Read in sample names + if (args.distance_pkl is not None) ^ (args.previous_clustering is not None): + sys.stderr.write("To label strains, both --distance-pkl and --previous-clustering" + " must be provided\n") + sys.exit(1) + elif os.path.exists(args.distance_pkl): + with open(args.distances + ".pkl", 'rb') as pickle_file: + rlist, qlist, self = pickle.load(pickle_file) + if not self: + sys.stderr.write("This script must be run on a full all-v-all model\n") + sys.exit(1) + # Check output path ok if not os.path.isdir(args.output): try: @@ -75,13 +87,6 @@ def main(): sys.exit(1) setGtThreads(args.threads) - # Read in sample names - with open(args.distances + ".pkl", 'rb') as pickle_file: - rlist, qlist, self = pickle.load(pickle_file) - if not self: - sys.stderr.write("This script must be run on a full all-v-all model\n") - sys.exit(1) - # Create network with sparse dists sys.stderr.write("Loading distances into graph\n") sparse_mat = sparse.load_npz(args.rank_fit) @@ -107,9 +112,9 @@ def main(): G = constructNetwork(rlist, rlist, None, 0, sparse_input=sparse_mat, summarise=False) sys.stderr.write("Calculating MST (CPU)\n") - # Generate minimum spanning tree - mst = generate_minimum_spanning_tree(G, args.gpu_graph) - + + mst = generate_minimum_spanning_tree(G, args.gpu_graph) + # Save output sys.stderr.write("Generating output\n") mst.save(args.output + "/" + os.path.basename(args.output) + ".graphml", fmt="graphml") diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 06c0e5ef..77208d30 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # vim: set fileencoding= : -# Copyright 2018-2020 John Lees and Nick Croucher +# Copyright 2018-2021 John Lees and Nick Croucher # universal import os @@ -9,150 +9,165 @@ import subprocess import shutil import glob +import tempfile from collections import defaultdict -def write_batch(batched_sequences, files, batch, output): - out_fn = output + '.' 
+ batch + '.list' - with open(out_fn,'w') as out_file: - for seq in batched_sequences[batch]: - out_file.write(seq + "\t" + files[seq] + "\n") - return out_fn +rfile_names = "rlist.txt" # command line parsing def get_options(): - parser = argparse.ArgumentParser(description='Batch MST mode (create db + lineage model fit + assign)', - prog='batch_mst') + parser = argparse.ArgumentParser(description='Batch MST mode (create db + lineage model fit + assign + sparse_mst)', + prog='poppunk_batch_mst') # input options ioGroup = parser.add_argument_group('Input and output file options') - ioGroup.add_argument('--batch-file', help='Tab-separated list of sequence names, files ' - 'and batch assignments', - required = True) - ioGroup.add_argument('--batch-order', help='File specifying order in which batches should ' - 'be processed') + ioGroup.add_argument('--r-files', help="Sample names and locations (as for poppunk --r-files)", + required=True) + ioGroup.add_argument('--batch-file', help="Batches to process samples in --r-files in", + required = True) + ioGroup.add_argument('--output', help='Prefix for output files', required=True) + ioGroup.add_argument('--previous-clustering', help='CSV file with previous clusters in MST drawing', + default=None) ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch', default=False, action='store_true') - ioGroup.add_argument('--output', help='Prefix for output files', required=True) - + # analysis options aGroup = parser.add_argument_group('Analysis options') aGroup.add_argument('--rank', help='Rank used to fit lineage model (int)', - type = int, - default = 1) + type = int, + default = 10) aGroup.add_argument('--threads', help='Number of threads for parallelisation (int)', - type = int, - default = 1) - aGroup.add_argument('--gpu', help='Use GPU for analysis', - default=False, - action='store_true') + type = int, + default = 1) + aGroup.add_argument('--use-gpu', help='Use GPU for analysis', + default=False, + action='store_true') aGroup.add_argument('--deviceid', help='GPU device ID (int)', - type = int, - default = 0) + type = int, + default = 0) aGroup.add_argument('--db-args', help="Other arguments to pass to poppunk. e.g. " "'--min-k 13 --max-k 29'", - default = "") + default = "") aGroup.add_argument('--model-args', help="Other arguments to pass to lineage model fit", - default = "") + default = "") aGroup.add_argument('--assign-args', help="Other arguments to pass to poppunk_assign", - default = "") + default = "") # Executable options eGroup = parser.add_argument_group('Executable locations') eGroup.add_argument('--poppunk-exe', help="Location of poppunk executable. Use " - "'python poppunk-runner.py' to run from source tree") + "'python poppunk-runner.py' to run from source tree", + default="poppunk") eGroup.add_argument('--assign-exe', help="Location of poppunk executable. Use " - "'python poppunk-runner.py' to run from source tree") + "'python poppunk_assign-runner.py' to run from source tree", + default="poppunk_assign") eGroup.add_argument('--mst-exe', help="Location of poppunk executable. 
Use " - "'python poppunk-runner.py' to run from source tree") + "'python poppunk_mst-runner.py' to run from source tree", + default="poppunk_visulaise") return parser.parse_args() +def writeBatch(rlines, batches, batch_selected): + tmpdir = tempfile.mkdtemp(prefix="pp_mst", dir="./") + with open(tmpdir + "/" + rfile_names, 'w') as outfile: + for rline, batch in zip(rlines, batches): + if batch == batch_selected: + outfile.write(rline) + + return tmpdir + +def runCmd(cmd_string): + sys.stderr.write("Running command:\n") + sys.stderr.write(cmd_string) + subprocess.run(cmd_string, shell=True, check=True) + # main code if __name__ == "__main__": # Check input ok args = get_options() - - # Get poppunk executable - if args.poppunk_exe is None: - poppunk = "poppunk" - else: - poppunk = args.poppunk_exe - # Need to add poppunk_assign_exe + if args.previous_clustering is not None and \ + not os.path.isfile(args.previous_clustering): + sys.stderr.write("Provided --previous-clustering file cannot be found\n") + sys.exit(1) # Check input file and batching - batch_set = set() - files = {} - batched_sequences = defaultdict(list) - with open(args.batch_file,'r') as input_file: - for line in input_file.readlines(): - info = line.rstrip().split() - files[info[0]] = info[1] - batch_set.add(info[2]) - batched_sequences[info[2]].append(info[0]) - - # Check on batch order + rlines = [] batches = [] - if args.batch_order is not None: - with open(args.batch_order,'r') as order_file: - batches = [line for line in input_file.readlines().rstrip()] - if set(batches) != batch_set: - batch_discrepancies = set(batches).difference(batch_set) + \ - batch_set.difference(set(batches)) - sys.stderr.write('Discrepancies between input file and batch ' - 'ordering: ' + str(batch_discrepancies) + '\n') - sys.exit() - else: - batches = list(batch_set) - - # Iterate through batches - first_batch = True - current_dir = args.output - for batch in batches: - # Write batch file - batch_fn = write_batch(batched_sequences, files, batch, args.output) - if first_batch: - # Initialise database - create_db_cmd = poppunk + " --create-db --r-files " + batch_fn + " --output " + args.output + " " + args.db_args + " --threads " + str(args.threads) + " " + args.db_args - if args.gpu: - create_db_cmd = create_db_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) - sys.stderr.write(create_db_cmd + "\n") - subprocess.run(create_db_cmd, shell=True, check=True) - # Fit lineage model - fit_model_cmd = poppunk + " --fit-model lineage --ref-db " + args.output + " --rank " + str(args.rank) + " --threads " + str(args.threads) + " " + args.model_args - sys.stderr.write(fit_model_cmd + "\n") - subprocess.run(fit_model_cmd, shell=True, check=True) - # Completed first batch - first_batch = False - else: - # Define batch prefix - batch_prefix = args.output + "_" + batch - # Add to first batch through querying - assign_cmd = "poppunk_assign --db " + current_dir + " --query " + batch_fn + " --model-dir " + args.output + " --output " + batch_prefix + " --threads " + str(args.threads) + " --update-db " + args.assign_args + with open(args.r_file,'rU') as r_file, open(args.batch_file, 'rU') as batch_file: + for r_line, batch_line in zip(r_file, batch_file): + rlines.append(r_line) + batch_fields = batch_line.rstrip() + batches.append(batch_fields) + + batch_names = sorted(set(batches)) + if len(batch_names) < 2: + sys.stderr.write("You must supply multiple batches") + sys.exit(1) + first_batch = batch_names.pop(0) + + # try/except block to clean up 
tmp files + wd = writeBatch(rlines, batches, first_batch) + tmp_dirs = [] + try: + # First batch is create DB + lineage + create_db_cmd = args.poppunk_exe + " --create-db --r-files " + \ + wd + "/" + rfile_names + \ + " --output " + wd + " " + \ + args.db_args + " --threads " + \ + str(args.threads) + " " + \ + args.db_args + if args.gpu: + create_db_cmd += " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) + runCmd(create_db_cmd) + + # Fit lineage model + fit_model_cmd = args.poppunk_exe + " --fit-model lineage --ref-db " + \ + wd + " --rank " + \ + str(args.rank) + " --threads " + \ + str(args.threads) + " " + \ + args.model_args + runCmd(fit_model_cmd) + + for batch_idx, batch in enumerate(batch_names): + batch_wd = writeBatch(rlines, batches, batch) + tmp_dirs.append(batch_wd) + + assign_cmd = args.assign_exe + " --db " + wd + \ + " --query " + batch_wd + "/" + rfile_names + \ + " --model-dir " + wd + " --output " + batch_wd + \ + " --threads " + str(args.threads) + " --update-db " + \ + args.assign_args if args.gpu: - assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid) - sys.stderr.write(assign_cmd + "\n") - subprocess.run(assign_cmd, shell=True, check=True) - # Process output - if args.keep_intermediates: -# shutil.rmtree(batch_prefix) - current_dir = batch_prefix - print("Switch current dir to " + current_dir) - else: - for file in glob.glob(args.output + "_" + batch + "/*"): - file_basename = os.path.basename(file) - if file_basename.startswith(batch_prefix): - print("Moving file " + args.output + "_" + batch + '/' + file_basename + " to " + current_dir + '/' + file_basename.replace(batch_prefix,args.output)) - os.rename(args.output + "_" + batch + '/' + file_basename, - current_dir + '/' + file_basename.replace(batch_prefix,args.output)) - shutil.rmtree(args.output + "_" + batch) - - # Remove npy dist file -# os.remove(args.output + "/" + args.output + ".dists.npy") - - # Calculate MST - mst_command = "poppunk_mst --distances " + args.output + "/" + args.output + ".dists --rank-fit " + args.output + "/" + args.output + "_rank" + str(args.rank) + "_fit.npz --previous-clustering " + args.output + "/" + args.output + "_lineages.csv --output " + args.output + " --threads " + str(args.threads) - if args.gpu: - mst_command = mst_command + " --gpu-network" + assign_cmd = assign_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) + runCmd(assign_cmd) + + # Remove the previous batch + if batch_idx > 0 and args.keep_intermediates == False: + shutil.rmtree(tmp_dirs[batch_idx - 1]) + + # Calculate MST + output_dir = tmp_dirs[-1] + mst_command = args.mst_ext + " --distance-pkl " + output_dir + \ + "/" + output_dir + ".dists.pkl --rank-fit " + \ + output_dir + "/" + output_dir + "_rank" + \ + str(args.rank) + "_fit.npz " + \ + "--previous-clustering " + args.previous_clustering + \ + " --output " + args.output + \ + " --threads " + str(args.threads) + if args.gpu: + mst_command = mst_command + " --gpu-graph" + runCmd(mst_command) + except: + if args.keep_intermediates == False: + for tmpdir in tmp_dirs: + shutil.rmtree(wd) + shutil.rmtree(tmpdir) + print("Unexpected error:", sys.exc_info()[0]) + raise + + if args.keep_intermediates == False: + shutil.rmtree(wd) + shutil.rmtree(output_dir) \ No newline at end of file diff --git a/setup.py b/setup.py index a0bea06d..27166238 100644 --- a/setup.py +++ b/setup.py @@ -119,6 +119,7 @@ def build_extension(self, ext): scripts=['scripts/poppunk_calculate_rand_indices.py', 
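After this refactor the script takes two parallel inputs: a standard --r-files list and a --batch-file giving one batch label per corresponding line, with the first (sorted) batch used to build the database and the rest assigned to it. A sketch of the pair of files and an invocation (all names invented for illustration):

    # rfiles.txt
    sample1	assemblies/sample1.fasta
    sample2	assemblies/sample2.fasta
    sample3	assemblies/sample3.fasta

    # batches.txt
    batch1
    batch1
    batch2

    python scripts/poppunk_batch_mst.py --r-files rfiles.txt \
        --batch-file batches.txt --rank 5 --output all_samples
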
From fe4cb41e62fcfd5133e61a68b94d46a0cf38997c Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 05:00:16 +0000
Subject: [PATCH 015/327] Update sparse_mst distance file processing

---
 PopPUNK/sparse_mst.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index 71b78f10..6133881c 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -72,7 +72,7 @@ def main():
                          " must be provided\n")
         sys.exit(1)
     elif os.path.exists(args.distance_pkl):
-        with open(args.distances + ".pkl", 'rb') as pickle_file:
+        with open(args.distance_pkl, 'rb') as pickle_file:
             rlist, qlist, self = pickle.load(pickle_file)
         if not self:
             sys.stderr.write("This script must be run on a full all-v-all model\n")

From 3098aeaba985aeb7a8730810c2b7d3849fb59aa8 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 05:15:39 +0000
Subject: [PATCH 016/327] Consistent argument names

---
 scripts/poppunk_batch_mst.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 77208d30..939767f3 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -96,7 +96,7 @@ def runCmd(cmd_string):
     # Check input file and batching
     rlines = []
     batches = []
-    with open(args.r_file,'rU') as r_file, open(args.batch_file, 'rU') as batch_file:
+    with open(args.r_files,'r') as r_file, open(args.batch_file, 'r') as batch_file:
         for r_line, batch_line in zip(r_file, batch_file):
             rlines.append(r_line)
             batch_fields = batch_line.rstrip()
@@ -119,7 +119,7 @@
             args.db_args + " --threads " + \
             str(args.threads) + " " + \
             args.db_args
-        if args.gpu:
+        if args.use_gpu:
             create_db_cmd += " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid)
         runCmd(create_db_cmd)
@@ -140,7 +140,7 @@
                          " --model-dir " + wd + " --output " + batch_wd + \
                          " --threads " + str(args.threads) + " --update-db " + \
                          args.assign_args
-            if args.gpu:
+            if args.use_gpu:
                 assign_cmd = assign_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid)
             runCmd(assign_cmd)
@@ -157,7 +157,7 @@
                       "--previous-clustering " + args.previous_clustering + \
                       " --output " + args.output + \
                       " --threads " + str(args.threads)
-        if args.gpu:
+        if args.use_gpu:
             mst_command = mst_command + " --gpu-graph"
         runCmd(mst_command)
     except:

From 0eb0265ce01f2a6823e40cb228df69b5d1391c1e Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 05:45:52 +0000
Subject: [PATCH 017/327] Fix arguments to mst command

---
 scripts/poppunk_batch_mst.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 939767f3..9a8b77c7 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -60,12 +60,12 @@ def get_options():
     eGroup.add_argument('--poppunk-exe', help="Location of poppunk executable. Use "
                                               "'python poppunk-runner.py' to run from source tree",
                         default="poppunk")
-    eGroup.add_argument('--assign-exe', help="Location of poppunk executable. Use "
+    eGroup.add_argument('--assign-exe', help="Location of poppunk_assign executable. Use "
                                              "'python poppunk_assign-runner.py' to run from source tree",
                         default="poppunk_assign")
     eGroup.add_argument('--mst-exe', help="Location of poppunk executable. Use "
                                           "'python poppunk_mst-runner.py' to run from source tree",
-                        default="poppunk_visulaise")
+                        default="poppunk_mst")

     return parser.parse_args()

@@ -150,13 +150,16 @@

         # Calculate MST
         output_dir = tmp_dirs[-1]
-        mst_command = args.mst_ext + " --distance-pkl " + output_dir + \
-                      "/" + output_dir + ".dists.pkl --rank-fit " + \
-                      output_dir + "/" + output_dir + "_rank" + \
+        mst_command = args.mst_exe + " --distance-pkl " + output_dir + \
+                      "/" + os.path.basename(output_dir) + ".dists.pkl --rank-fit " + \
+                      output_dir + "/" + os.path.basename(output_dir) + "_rank" + \
                       str(args.rank) + "_fit.npz " + \
-                      "--previous-clustering " + args.previous_clustering + \
                       " --output " + args.output + \
                       " --threads " + str(args.threads)
+        if args.previous_clustering is not None:
+            mst_command = mst_command + " --previous-clustering " + args.previous_clustering
+        else:
+            mst_command = mst_command + " --previous-clustering " + os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")
         if args.use_gpu:
             mst_command = mst_command + " --gpu-graph"
         runCmd(mst_command)

From 6c723c662f7ef50693941f80a41f5d3093d8a78a Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 09:10:34 +0000
Subject: [PATCH 018/327] Update MST test

---
 test/run_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/run_test.py b/test/run_test.py
index 0e028fd9..aaf7eaa2 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -68,7 +68,7 @@
 # MST
 sys.stderr.write("Running MST\n")
 subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree mst", shell=True, check=True)
-subprocess.run("python ../poppunk_mst-runner.py --distances example_db/example_db.dists --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot", shell=True, check=True)
+subprocess.run("python ../poppunk_mst-runner.py --distance-pkl example_db/example_db.dists.pkl --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot", shell=True, check=True)

 # t-sne
 sys.stderr.write("Running tsne viz\n")
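These patches settle on pairing the sparse rank fit (*_rank*_fit.npz) with the .dists.pkl file that stores sample names. A minimal sketch of inspecting those two inputs directly, mirroring the loading code in sparse_mst.py (paths are illustrative, taken from the test suite):

    import pickle
    from scipy import sparse

    # Sparse distance matrix written by the lineage model fit
    sparse_mat = sparse.load_npz("example_lineages/example_lineages_rank5_fit.npz")

    # Sample names pickled alongside the dense distance matrix
    with open("example_db/example_db.dists.pkl", 'rb') as pickle_file:
        rlist, qlist, self_comparison = pickle.load(pickle_file)

    # For an all-v-all fit, rlist indexes both dimensions of the matrix
    print(sparse_mat.shape, len(rlist), self_comparison)
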
From a678080ab9813f885852cb18d1e33aa36118ebcb Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 09:47:15 +0000
Subject: [PATCH 019/327] Output updated sparse distance matrix with
 --update-db

---
 PopPUNK/assign.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
index c5a422bc..c7482f61 100644
--- a/PopPUNK/assign.py
+++ b/PopPUNK/assign.py
@@ -9,12 +9,15 @@
 import numpy as np
 import subprocess
 from collections import defaultdict
+import scipy.optimize
+from scipy.sparse import coo_matrix, bmat, find

 # required from v2.1.1 onwards (no mash support)
 import pp_sketchlib

 # import poppunk package
 from .__init__ import __version__
+from .models import rankFile

 #*******************************#
 #* *#
@@ -235,6 +238,14 @@ def assign_query(dbFuncs,
         else:
             genomeNetwork.save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt')

+        # Save sparse distance matrices
+        if model.type == 'lineage':
+            for rank in model.ranks:
+                scipy.sparse.save_npz(
+                    output + "/" + os.path.basename(output) + \
+                    rankFile(rank),
+                    self.nn_dists[rank])
+
         # Update distance matrices with all calculated distances
         if distances == None:
             distanceFiles = ref_db + "/" + os.path.basename(ref_db) + ".dists"

From ccae98e6ea2f1e00307281e072b6720fe47b35e7 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 10:18:49 +0000
Subject: [PATCH 020/327] Update lineage models for iterative MST generation

---
 PopPUNK/assign.py            | 11 +++--------
 scripts/poppunk_batch_mst.py |  3 ++-
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
index c7482f61..a2202108 100644
--- a/PopPUNK/assign.py
+++ b/PopPUNK/assign.py
@@ -235,17 +235,12 @@ def assign_query(dbFuncs,
             joinDBs(ref_db, output, output)
         if model.type == 'lineage':
             genomeNetwork[min(model.ranks)].save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt')
+            # Save sparse distance matrices and updated model
+            model.outPrefix = os.path.basename(output)
+            model.save()
         else:
             genomeNetwork.save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt')

-        # Save sparse distance matrices
-        if model.type == 'lineage':
-            for rank in model.ranks:
-                scipy.sparse.save_npz(
-                    output + "/" + os.path.basename(output) + \
-                    rankFile(rank),
-                    self.nn_dists[rank])
-
         # Update distance matrices with all calculated distances
         if distances == None:
             distanceFiles = ref_db + "/" + os.path.basename(ref_db) + ".dists"

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 9a8b77c7..9115945b 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -159,7 +159,8 @@
         if args.previous_clustering is not None:
             mst_command = mst_command + " --previous-clustering " + args.previous_clustering
         else:
-            mst_command = mst_command + " --previous-clustering " + os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")
+            mst_command = mst_command + " --previous-clustering " + \
+                          os.path.join(output_dir,output_dir + "_lineages.csv")
         if args.use_gpu:
             mst_command = mst_command + " --gpu-graph"
         runCmd(mst_command)

From b9fdf3a54f5f57ee298eb3e4f9efcfff6afab681 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 10:49:35 +0000
Subject: [PATCH 021/327] Save lineage definitions

---
 scripts/poppunk_batch_mst.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 9115945b..629b6e59 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -93,6 +93,9 @@ def runCmd(cmd_string):
         sys.stderr.write("Provided --previous-clustering file cannot be found\n")
         sys.exit(1)

+    # If no batch file is provided, generate one
+
+
     # Check input file and batching
     rlines = []
     batches = []
@@ -164,6 +167,11 @@ def runCmd(cmd_string):
         if args.use_gpu:
             mst_command = mst_command + " --gpu-graph"
         runCmd(mst_command)
+
+        # Retrieve lineages from previous round
+        os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv"),
+                  os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv"))
+
     except:

From 8192ba73eeb011d5ba27c9249efdbe603cd42b81 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 11:00:35 +0000
Subject: [PATCH 022/327] Retain named batches if requested

---
 scripts/poppunk_batch_mst.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 629b6e59..a822956a 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -32,7 +32,9 @@ def get_options():
     ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch',
                          default=False,
                          action='store_true')
-
+    ioGroup.add_argument('--use-batch-names', help='Name the stored outputs of each batch',
+                         default=False,
+                         action='store_true')
     # analysis options
     aGroup = parser.add_argument_group('Analysis options')
     aGroup.add_argument('--rank', help='Rank used to fit lineage model (int)',
@@ -69,8 +71,13 @@ def get_options():

     return parser.parse_args()

-def writeBatch(rlines, batches, batch_selected):
-    tmpdir = tempfile.mkdtemp(prefix="pp_mst", dir="./")
+def writeBatch(rlines, batches, batch_selected, use_names = False):
+    tmpdir = ""
+    if use_names:
+        tmpdir = "./pp_mst_" + batch_selected
+        os.mkdir(tmpdir)
+    else:
+        tmpdir = tempfile.mkdtemp(prefix="pp_mst", dir="./")
     with open(tmpdir + "/" + rfile_names, 'w') as outfile:
         for rline, batch in zip(rlines, batches):
             if batch == batch_selected:
@@ -116,7 +123,7 @@
     # try/except block to clean up tmp files
-    wd = writeBatch(rlines, batches, first_batch)
+    wd = writeBatch(rlines, batches, first_batch, args.use_batch_names)
     tmp_dirs = []
     try:
@@ -139,7 +146,7 @@
         for batch_idx, batch in enumerate(batch_names):
-            batch_wd = writeBatch(rlines, batches, batch)
+            batch_wd = writeBatch(rlines, batches, batch, args.use_batch_names)
             tmp_dirs.append(batch_wd)

From 1189b87ba42f499f5dc0390afd7984175de511bd Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 11:08:21 +0000
Subject: [PATCH 023/327] Allow for clustering with multiple lineage ranks

---
 scripts/poppunk_batch_mst.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index a822956a..d65f9e56 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -37,9 +37,9 @@ def get_options():
                          action='store_true')
     # analysis options
     aGroup = parser.add_argument_group('Analysis options')
-    aGroup.add_argument('--rank', help='Rank used to fit lineage model (int)',
-                        type = int,
-                        default = 10)
+    aGroup.add_argument('--rank', help='Comma separated ranks used to fit lineage model (list of ints)',
+                        type = str,
+                        default = "10")
     aGroup.add_argument('--threads', help='Number of threads for parallelisation (int)',
                         type = int,
                         default = 1)
@@ -100,6 +100,10 @@
         sys.stderr.write("Provided --previous-clustering file cannot be found\n")
         sys.exit(1)

+    # Check ranks
+    ranks = [int(rank) for rank in args.rank.split(',')]
+    max_rank = max(ranks)
+
     # Check input file and batching
     rlines = []
     batches = []
@@ -135,7 +139,7 @@
         # Fit lineage model
         fit_model_cmd = args.poppunk_exe + " --fit-model lineage --ref-db " + \
                         wd + " --rank " + \
-                        str(args.rank) + " --threads " + \
+                        args.rank + " --threads " + \
                         str(args.threads) + " " + \
                         args.model_args
         runCmd(fit_model_cmd)
@@ -162,7 +166,7 @@
         mst_command = args.mst_exe + " --distance-pkl " + output_dir + \
                       "/" + os.path.basename(output_dir) + ".dists.pkl --rank-fit " + \
                       output_dir + "/" + os.path.basename(output_dir) + "_rank" + \
-                      str(args.rank) + "_fit.npz " + \
+                      str(max_rank) + "_fit.npz " + \
                       " --output " + args.output + \
                       " --threads " + str(args.threads)
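With --rank now a comma-separated list, one database can carry several lineage rank fits, and the MST step uses the largest (max_rank). An illustrative invocation (file names invented, continuing the earlier example inputs):

    python scripts/poppunk_batch_mst.py --r-files rfiles.txt \
        --batch-file batches.txt --rank 1,5,10 --output all_samples

This would produce rank-specific fits named like <prefix>_rank1_fit.npz through <prefix>_rank10_fit.npz, with the rank 10 sparse matrix passed to poppunk_mst.
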
From 2d8813300cce59b06789f0088bc0a6b57d4b6771 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 11:21:48 +0000
Subject: [PATCH 024/327] Alter batch processing

---
 scripts/poppunk_batch_mst.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index d65f9e56..2cc78f39 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -100,25 +100,27 @@
         sys.stderr.write("Provided --previous-clustering file cannot be found\n")
         sys.exit(1)

-    # Check ranks
+    # Extract ranks
     ranks = [int(rank) for rank in args.rank.split(',')]
     max_rank = max(ranks)

-    # Check input file and batching
+    # Check batching
     rlines = []
     batches = []
-    with open(args.r_files,'r') as r_file, open(args.batch_file, 'r') as batch_file:
-        for r_line, batch_line in zip(r_file, batch_file):
-            rlines.append(r_line)
-            batch_fields = batch_line.rstrip()
-            batches.append(batch_fields)
-
+    with open(args.batch_file,'r') as batch_file:
+        batches = [batch_line.rstrip() for batch_line in batch_file.readlines()]
     batch_names = sorted(set(batches))
     if len(batch_names) < 2:
         sys.stderr.write("You must supply multiple batches")
         sys.exit(1)
     first_batch = batch_names.pop(0)

+    # Check input file
+    with open(args.r_files,'r') as r_file:
+        for r_line in r_file:
+            rlines.append(r_line)
+
+
     # try/except block to clean up tmp files
     wd = writeBatch(rlines, batches, first_batch, args.use_batch_names)
     tmp_dirs = []

From b9993ccda959bed26446347372fe332c581f771 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 11:39:08 +0000
Subject: [PATCH 025/327] Allow for automatic batching

---
 scripts/poppunk_batch_mst.py | 80 +++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 33 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 2cc78f39..8bb44acd 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -22,52 +22,55 @@ def get_options():

     # input options
     ioGroup = parser.add_argument_group('Input and output file options')
-    ioGroup.add_argument('--r-files', help="Sample names and locations (as for poppunk --r-files)",
-                         required=True)
-    ioGroup.add_argument('--batch-file', help="Batches to process samples in --r-files in",
-                         required = True)
-    ioGroup.add_argument('--output', help='Prefix for output files', required=True)
+    ioGroup.add_argument('--r-files', help='Sample names and locations (as for poppunk --r-files)',
+                         required=True)
+    ioGroup.add_argument('--batch-file', help='Single column list of batches to process samples in --r-files in')
+    ioGroup.add_argument('--n-batches', help='Number of batches for process if --batch-file is not specified',
+                         type=int,
+                         default=10)
+    ioGroup.add_argument('--output', help='Prefix for output files',
+                         required=True)
     ioGroup.add_argument('--previous-clustering', help='CSV file with previous clusters in MST drawing',
-                         default=None)
+                         default=None)
     ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch',
-                         default=False,
-                         action='store_true')
+                         default=False,
+                         action='store_true')
     ioGroup.add_argument('--use-batch-names', help='Name the stored outputs of each batch',
-                         default=False,
-                         action='store_true')
+                         default=False,
+                         action='store_true')
     # analysis options
     aGroup = parser.add_argument_group('Analysis options')
     aGroup.add_argument('--rank', help='Comma separated ranks used to fit lineage model (list of ints)',
-                        type = str,
-                        default = "10")
+                        type = str,
+                        default = "10")
     aGroup.add_argument('--threads', help='Number of threads for parallelisation (int)',
-                        type = int,
-                        default = 1)
+                        type = int,
+                        default = 1)
     aGroup.add_argument('--use-gpu', help='Use GPU for analysis',
-                        default=False,
-                        action='store_true')
+                        default=False,
+                        action='store_true')
     aGroup.add_argument('--deviceid', help='GPU device ID (int)',
-                        type = int,
-                        default = 0)
+                        type = int,
+                        default = 0)
     aGroup.add_argument('--db-args', help="Other arguments to pass to poppunk. e.g. "
                                           "'--min-k 13 --max-k 29'",
-                        default = "")
+                        default = "")
     aGroup.add_argument('--model-args', help="Other arguments to pass to lineage model fit",
-                        default = "")
+                        default = "")
     aGroup.add_argument('--assign-args', help="Other arguments to pass to poppunk_assign",
-                        default = "")
+                        default = "")

     # Executable options
     eGroup = parser.add_argument_group('Executable locations')
     eGroup.add_argument('--poppunk-exe', help="Location of poppunk executable. Use "
                                               "'python poppunk-runner.py' to run from source tree",
-                        default="poppunk")
+                        default="poppunk")
     eGroup.add_argument('--assign-exe', help="Location of poppunk_assign executable. Use "
                                              "'python poppunk_assign-runner.py' to run from source tree",
-                        default="poppunk_assign")
+                        default="poppunk_assign")
     eGroup.add_argument('--mst-exe', help="Location of poppunk executable. Use "
                                           "'python poppunk_mst-runner.py' to run from source tree",
-                        default="poppunk_mst")
+                        default="poppunk_mst")

     return parser.parse_args()
@@ -107,23 +110,34 @@
     # Check input file
     rlines = []
     with open(args.r_files,'r') as r_file:
         for r_line in r_file:
             rlines.append(r_line)

     # Check batching
     batches = []
-    with open(args.batch_file,'r') as batch_file:
-        batches = [batch_line.rstrip() for batch_line in batch_file.readlines()]
+    if args.batch_file:
+        # Read specified batches
+        with open(args.batch_file,'r') as batch_file:
+            batches = [batch_line.rstrip() for batch_line in batch_file.readlines()]
+    else:
+        # Generate arbitrary batches
+        n = 1
+        x = 0
+        while x < len(rlines):
+            if n > args.n_batches:
+                n = 1
+            batches.append(n)
+            n = n + 1
+            x = x + 1
+
+    # Validate batches
     batch_names = sorted(set(batches))
     if len(batch_names) < 2:
         sys.stderr.write("You must supply multiple batches")
         sys.exit(1)
     first_batch = batch_names.pop(0)

From 05c960940fe92eac565b7ef03b5f6dd806d36d53 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Thu, 4 Feb 2021 14:44:23 +0000
Subject: [PATCH 026/327] Add parsing of additional information

---
 scripts/poppunk_batch_mst.py | 112 ++++++++++++++++++++++++++++++++---
 1 file changed, 105 insertions(+), 7 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 8bb44acd..f2bdaab4 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -11,6 +11,7 @@
 import glob
 import tempfile
 from collections import defaultdict
+import pandas as pd

 rfile_names = "rlist.txt"
@@ -29,6 +30,7 @@
     ioGroup.add_argument('--n-batches', help='Number of batches for process if --batch-file is not specified',
                          type=int,
                          default=10)
+    ioGroup.add_argument('--info-csv', help='CSV containing information about sequences', default=None)
     ioGroup.add_argument('--output', help='Prefix for output files',
                          required=True)
@@ -95,6 +97,83 @@
     sys.stderr.write(cmd_string)
     subprocess.run(cmd_string, shell=True, check=True)

+def readLineages(clustCSV):
+    clusters = defaultdict(dict)
+    # read CSV
+    clustersCsv = pd.read_csv(clustCSV, index_col = 0, quotechar='"')
+    # select relevant columns
+    type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('Rank_' in col or 'overall' in col)]
+    # read file
+    for row in clustersCsv.itertuples():
+        for cls_idx in type_columns:
+            cluster_name = clustersCsv.columns[cls_idx]
+            cluster_name = cluster_name.replace('__autocolour','')
+            clusters[cluster_name][row.Index] = str(row[cls_idx + 1])
+    # return data structure
+    return clusters
+
+def isolateNameToLabel(names):
+    labels = [name.split('/')[-1].split('.')[0] for name in names]
+    return labels
+
+def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
+                    epiCsv = None, suffix = '_Lineage'):
+    # set order of column names
+    colnames = ['ID']
+    for cluster_type in clustering:
+        col_name = cluster_type + suffix
+        colnames.append(col_name)
+    # process epidemiological data
+    d = defaultdict(list)
+    # process epidemiological data without duplicating names
+    # used by 
PopPUNK
+    columns_to_be_omitted = ['id', 'Id', 'ID', 'combined_Cluster__autocolour',
+                             'core_Cluster__autocolour', 'accessory_Cluster__autocolour',
+                             'overall_Lineage']
+    if epiCsv is not None:
+        epiData = pd.read_csv(epiCsv, index_col = False, quotechar='"')
+        epiData.index = isolateNameToLabel(epiData.iloc[:,0])
+        for e in epiData.columns.values:
+            if e not in columns_to_be_omitted:
+                colnames.append(str(e))
+    # get example clustering name for validation
+    example_cluster_title = list(clustering.keys())[0]
+    for name, label in zip(nodeNames, isolateNameToLabel(nodeLabels)):
+        print('Example: ' + example_cluster_title + '\nClustering: ' + str(clustering[example_cluster_title]))
+        if name in clustering[example_cluster_title]:
+            d['ID'].append(label)
+            for cluster_type in clustering:
+                col_name = cluster_type + suffix
+                d[col_name].append(clustering[cluster_type][name])
+            if epiCsv is not None:
+                if label in epiData.index:
+                    for col, value in zip(epiData.columns.values, epiData.loc[label].values):
+                        if col not in columns_to_be_omitted:
+                            d[col].append(str(value))
+                else:
+                    for col in epiData.columns.values:
+                        if col not in columns_to_be_omitted:
+                            d[col].append('nan')
+        else:
+            sys.stderr.write("Cannot find " + name + " in clustering\n")
+            sys.exit(1)
+    # print CSV
+    sys.stderr.write("Parsed data, now writing to CSV\n")
+    try:
+        pd.DataFrame(data=d).to_csv(outfile, columns = colnames, index = False)
+    except Exception as e:
+        sys.stderr.write("Problem with epidemiological data CSV: " + str(e) + "\n")
+    # check CSV
+    prev_col_items = -1
+    prev_col_name = "unknown"
+    for col in d:
+        this_col_items = len(d[col])
+        if prev_col_items > -1 and prev_col_items != this_col_items:
+            sys.stderr.write("Discrepant length between " + prev_col_name + \
+                             " (length of " + str(prev_col_items) + ") and " + \
+                             col + " (length of " + str(this_col_items) + ")\n")
+            sys.exit(1)
+
 # main code
 if __name__ == "__main__":
@@ -109,9 +188,14 @@ def runCmd(cmd_string):
 
     # Check input file
     rlines = []
+    nodeNames = []
+    nodeLabels = []
     with open(args.r_files,'r') as r_file:
         for r_line in r_file:
             rlines.append(r_line)
+            node_info = r_line.rstrip().split()
+            nodeNames.append(node_info[0])
+            nodeLabels.append(node_info[1])
 
     # Check batching
     batches = []
@@ -122,6 +206,7 @@ def runCmd(cmd_string):
     else:
         # Generate arbitrary batches
         x = 0
+        n = 1
         while x < len(rlines):
             if n > args.n_batches:
                 n = 1
@@ -137,7 +222,7 @@ def runCmd(cmd_string):
 
     # try/except block to clean up tmp files
     wd = writeBatch(rlines, batches, first_batch, args.use_batch_names)
-    tmp_dirs = []
+    tmp_dirs = [wd]
    try:
        # First batch is create DB + lineage
        create_db_cmd = args.poppunk_exe + " --create-db --r-files " + \
@@ -159,12 +244,13 @@ def runCmd(cmd_string):
        runCmd(fit_model_cmd)

        for batch_idx, batch in enumerate(batch_names):
+            prev_wd = tmp_dirs[-1]
            batch_wd = writeBatch(rlines, batches, batch, args.use_batch_names)
            tmp_dirs.append(batch_wd)

-            assign_cmd = args.assign_exe + " --db " + wd + \
+            assign_cmd = args.assign_exe + " --db " + prev_wd + \
                         " --query " + batch_wd + "/" + rfile_names + \
-                         " --model-dir " + wd + " --output " + batch_wd + \
+                         " --model-dir " + prev_wd + " --output " + batch_wd + \
                         " --threads " + str(args.threads) + " --update-db " + \
                         args.assign_args
            if args.use_gpu:
@@ -187,19 +273,31 @@ def runCmd(cmd_string):
                mst_command = mst_command + " --previous-clustering " + args.previous_clustering
            else:
                mst_command = mst_command + " --previous-clustering " + \
-                              os.path.join(output_dir,output_dir + "_lineages.csv")
+                              os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv")
            if args.use_gpu:
                mst_command = mst_command + " --gpu-graph"
            runCmd(mst_command)

-        # Retrieve lineages from previous round
+        # Retrieve isolate names and lineages from previous round
+        os.rename(os.path.join(output_dir,os.path.basename(output_dir) + ".dists.pkl"),
+                  os.path.join(args.output,os.path.basename(args.output) + ".dists.pkl"))
        os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv"),
                  os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv"))
-
+
+        # Merge with epidemiological data if requested
+        if args.info_csv is not None:
+            lineage_clustering = readLineages(os.path.join(args.output,
+                                 os.path.basename(args.output) + "_lineages.csv"))
+            writeClusterCsv(os.path.join(args.output,
+                            os.path.basename(args.output) + "_info.csv"),
+                            nodeNames,
+                            nodeLabels,
+                            lineage_clustering,
+                            epiCsv = args.info_csv)
+
    except:
        if args.keep_intermediates == False:
            for tmpdir in tmp_dirs:
-                shutil.rmtree(wd)
                shutil.rmtree(tmpdir)
        print("Unexpected error:", sys.exc_info()[0])
        raise
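The readLineages() helper added in the patch above returns a nested dict keyed first by clustering column and then by isolate name. A minimal, self-contained sketch of the same column-selection logic, using a toy in-memory table in place of a real *_lineages.csv (the sample names and values here are invented for illustration):

    from collections import defaultdict
    import pandas as pd

    # toy stand-in for a PopPUNK *_lineages.csv
    toy = pd.DataFrame({'id': ['s1', 's2'],
                        'Rank_1_Lineage': [1, 2],
                        'Rank_2_Lineage': [1, 1],
                        'overall_Lineage': ['1-1', '2-1']}).set_index('id')
    clusters = defaultdict(dict)
    # same selection rule as readLineages(): keep Rank_* and overall columns
    type_columns = [n for n, col in enumerate(toy.columns)
                    if ('Rank_' in col or 'overall' in col)]
    for row in toy.itertuples():
        for cls_idx in type_columns:
            cluster_name = toy.columns[cls_idx].replace('__autocolour', '')
            clusters[cluster_name][row.Index] = str(row[cls_idx + 1])
    print(clusters['Rank_1_Lineage'])  # {'s1': '1', 's2': '2'}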
"_lineages.csv") + os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv") if args.use_gpu: mst_command = mst_command + " --gpu-graph" runCmd(mst_command) - # Retrieve lineages from previous round + # Retrieve isolate names and lineages from previous round + os.rename(os.path.join(output_dir,os.path.basename(output_dir) + ".dists.pkl"), + os.path.join(args.output,os.path.basename(args.output) + ".dists.pkl")) os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv"), os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")) - + + # Merge with epidemiological data if requested + if args.info_csv is not None: + lineage_clustering = readLineages(os.path.join(args.output, + os.path.basename(args.output) + "_lineages.csv")) + writeClusterCsv(os.path.join(args.output, + os.path.basename(args.output) + "_info.csv"), + nodeNames, + nodeLabels, + lineage_clustering, + epiCsv = args.info_csv) + except: if args.keep_intermediates == False: for tmpdir in tmp_dirs: - shutil.rmtree(wd) shutil.rmtree(tmpdir) print("Unexpected error:", sys.exc_info()[0]) raise From 40238dd7e67a676f8e0a1cfd32cbc2ba5d487d59 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 5 Feb 2021 07:12:28 +0000 Subject: [PATCH 027/327] Enable expansion of previous MST --- PopPUNK/network.py | 61 +++++++++++++++++++++++++++++++++++- PopPUNK/sparse_mst.py | 12 +++++-- scripts/poppunk_batch_mst.py | 5 ++- 3 files changed, 74 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b5c9cadd..209b7af2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -266,9 +266,49 @@ def writeReferences(refList, outPrefix): return refFileName +def load_previous_network(prev_G, rlist, weights=False): + """Load previous network with graph-tool, extract the edges to match the + vertex order specified in rlist, and also return weights if specified. + + Args: + prev_G (str) + Path of file containing existing network. + rlist (list) + List of reference sequence labels in new network + weights (bool) + Whether to return edge weights + (default = False) + + Returns: + source_ids (list) + Source nodes for each edge + target_ids (list) + Target nodes for each edge + edge_weights (list) + Weights for each new edge + """ + # get list for translating node IDs to rlist + old_ids = prev_G.vp["id"] + old_id_indices = [rlist.index(x) for x in old_ids] + # get the source and target nods + source_old_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source") + target_old_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target") + # translate to indices + source_ids = [old_id_indices[x] for x in source_old_ids] + target_ids = [old_id_indices[x] for x in target_old_ids] + # convert to ndarray + # get the weights + if weights: + edge_weights = prev_G.ep['weight'] + # return values + return source_ids, target_ids, edge_weights + else: + return source_ids, target_ids + def constructNetwork(rlist, qlist, assignments, within_label, summarise = True, edge_list = False, weights = None, - weights_type = 'euclidean', sparse_input = None): + weights_type = 'euclidean', sparse_input = None, + previous_network = None): """Construct an unweighted, undirected network without self-loops. 
Nodes are samples and edges where samples are within the same cluster @@ -297,6 +337,9 @@ def constructNetwork(rlist, qlist, assignments, within_label, accessory or euclidean distance sparse_input (numpy.array) Sparse distance matrix from lineage fit + previous_network (str) + Name of file containing a previous network to be integrated into this new + network Returns: G (graph) @@ -348,6 +391,22 @@ def constructNetwork(rlist, qlist, assignments, within_label, edge_tuple = (ref, query) connections.append(edge_tuple) + # read previous graph + if previous_network is not None: + prev_G = gt.load_graph(previous_network) + if weights is not None: + extra_sources, extra_targets, extra_weights = load_previous_network(prev_G,rlist, + weights = True) + for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights): + edge_tuple = (ref, query, dist) + connections.append(edge_tuple) + else: + extra_sources, extra_targets = load_previous_network(prev_G,rlist, + weights = False) + for (ref, query) in zip(extra_sources, extra_targets): + edge_tuple = (ref, query) + connections.append(edge_tuple) + # build the graph G = gt.Graph(directed = False) G.add_vertex(len(vertex_labels)) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 6133881c..8c120b00 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -8,6 +8,7 @@ import pickle import re +import numpy as np import pandas as pd from scipy import sparse @@ -31,6 +32,8 @@ def get_options(): iGroup = parser.add_argument_group('Input files') iGroup.add_argument('--rank-fit', required=True, help='Location of rank fit, a sparse matrix (*_rank*_fit.npz)') iGroup.add_argument('--previous-clustering', help='CSV file with cluster definitions') + iGroup.add_argument('--previous-mst', help='Graph tool file from which previous MST can be loaded', + default=None) iGroup.add_argument('--distance-pkl', help='Input pickle from distances, which contains sample names') iGroup.add_argument('--display-cluster', default=None, help='Column of clustering CSV to use for plotting') @@ -109,8 +112,13 @@ def main(): weights=edge_df['weights'].values_host, summarise=False) else: - G = constructNetwork(rlist, rlist, None, 0, - sparse_input=sparse_mat, summarise=False) + if args.previous_mst is not None: + G = constructNetwork(rlist, rlist, None, 0, + sparse_input=sparse_mat, summarise=False, + previous_network = args.previous_mst) + else: + G = constructNetwork(rlist, rlist, None, 0, + sparse_input=sparse_mat, summarise=False) sys.stderr.write("Calculating MST (CPU)\n") mst = generate_minimum_spanning_tree(G, args.gpu_graph) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index f2bdaab4..8d87c080 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -34,6 +34,8 @@ def get_options(): required=True) ioGroup.add_argument('--previous-clustering', help='CSV file with previous clusters in MST drawing', default=None) + ioGroup.add_argument('--previous-mst', help='MST calculated from a subset of the data in graph tool format', + default=None) ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch', default=False, action='store_true') @@ -137,7 +139,6 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, # get example clustering name for validation example_cluster_title = list(clustering.keys())[0] for name, label in zip(nodeNames, isolateNameToLabel(nodeLabels)): - print('Example: ' + example_cluster_title + '\nClustering: ' + 
str(clustering[example_cluster_title]))
         if name in clustering[example_cluster_title]:
@@ -269,6 +270,8 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
                       str(max_rank) + "_fit.npz " + \
                       " --output " + args.output + \
                       " --threads " + str(args.threads)
+        if args.previous_mst is not None:
+            mst_command = mst_command + " --previous-mst " + args.previous_mst
         if args.previous_clustering is not None:
             mst_command = mst_command + " --previous-clustering " + args.previous_clustering
         else:
             mst_command = mst_command + " --previous-clustering " + \
                           os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv")

From 68f4fba2a294db6167bfa397e02fa8d29f5b94bd Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 5 Feb 2021 07:26:35 +0000
Subject: [PATCH 028/327] Add graph extension for GPUs

---
 PopPUNK/sparse_mst.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index 8c120b00..80c794ed 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -15,7 +15,7 @@
 
 # import poppunk package
 from .__init__ import __version__
-from .network import constructNetwork, generate_minimum_spanning_tree
+from .network import constructNetwork, generate_minimum_spanning_tree, load_previous_network
 from .plot import drawMST
 from .trees import mst_to_phylogeny, write_tree
 from .utils import setGtThreads, readIsolateTypeFromCsv
@@ -94,9 +94,21 @@ def main():
     sys.stderr.write("Loading distances into graph\n")
     sparse_mat = sparse.load_npz(args.rank_fit)
     if args.gpu_graph:
-        G_df = cudf.DataFrame({'source': sparse_mat.row,
-                               'destination': sparse_mat.col,
-                               'weights': sparse_mat.data})
+        # Load previous MST if specified
+        if args.previous_mst is not None:
+            extra_sources, extra_targets, extra_weights = load_previous_network(args.previous_mst,
+                                                                                rlist,
+                                                                                weights = True)
+            sources = np.append(sparse_mat.row, np.asarray(extra_sources))
+            targets = np.append(sparse_mat.col, np.asarray(extra_targets))
+            weights = np.append(sparse_mat.data, np.asarray(extra_weights))
+        else:
+            sources = sparse_mat.row
+            targets = sparse_mat.col
+            weights = sparse_mat.data
+        G_df = cudf.DataFrame({'source': sources,
+                               'destination': targets,
+                               'weights': weights})
         G_cu = cugraph.Graph()
         G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False)
@@ -112,6 +124,7 @@ def main():
                                  weights=edge_df['weights'].values_host,
                                  summarise=False)
     else:
+        # Load previous MST if specified
         if args.previous_mst is not None:
             G = constructNetwork(rlist, rlist, None, 0,
                                  sparse_input=sparse_mat, summarise=False,
                                  previous_network = args.previous_mst)
         else:
             G = constructNetwork(rlist, rlist, None, 0,
                                  sparse_input=sparse_mat, summarise=False)
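In the GPU branch added above, edges retained from a previous MST are simply concatenated onto the edges of the new sparse rank fit, so that a single combined edge list is handed to cuGraph. A CPU-only sketch of that merging step, with invented edges standing in for a real *_rank*_fit.npz and a previously saved MST:

    import numpy as np
    from scipy import sparse

    # invented sparse rank fit over three isolates: two new edges
    sparse_mat = sparse.coo_matrix(([0.01, 0.02], ([0, 1], [1, 2])), shape=(3, 3))
    # invented edge retained from a previous MST
    extra_sources, extra_targets, extra_weights = [0], [2], [0.05]

    # same pattern as the GPU branch: append the old edges to the new ones
    sources = np.append(sparse_mat.row, np.asarray(extra_sources))
    targets = np.append(sparse_mat.col, np.asarray(extra_targets))
    weights = np.append(sparse_mat.data, np.asarray(extra_weights))
    for edge in zip(sources, targets, weights):
        print(edge)  # (0, 1, 0.01), (1, 2, 0.02), then (0, 2, 0.05)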
From 1b710722e2cb6956565cdae0b7af6b47f5bcedda Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 5 Feb 2021 10:51:55 +0000
Subject: [PATCH 029/327] Fix iterative MST mode

---
 PopPUNK/network.py           |   5 +-
 scripts/poppunk_batch_mst.py | 101 ++++++++++++++++++++++++++++-------
 2 files changed, 85 insertions(+), 21 deletions(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index 209b7af2..4390e901 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -301,6 +301,7 @@ def load_previous_network(prev_G, rlist, weights=False):
     if weights:
         edge_weights = prev_G.ep['weight']
         # return values
+        print("Old weights: " + str(edge_weights))
         return source_ids, target_ids, edge_weights
     else:
         return source_ids, target_ids
@@ -394,11 +395,11 @@ def constructNetwork(rlist, qlist, assignments, within_label,
     # read previous graph
     if previous_network is not None:
         prev_G = gt.load_graph(previous_network)
-        if weights is not None:
+        if weights is not None or sparse_input is not None:
            extra_sources, extra_targets, extra_weights = load_previous_network(prev_G,rlist,
                                                                                weights = True)
            for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights):
-               edge_tuple = (ref, query, dist)
+               edge_tuple = (ref, query, weight)
                connections.append(edge_tuple)
        else:
            extra_sources, extra_targets = load_previous_network(prev_G,rlist,
diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 8d87c080..d3166fec 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -34,8 +34,9 @@ def get_options():
                          required=True)
     ioGroup.add_argument('--previous-clustering', help='CSV file with previous clusters in MST drawing',
                          default=None)
-    ioGroup.add_argument('--previous-mst', help='MST calculated from a subset of the data in graph tool format',
-                         default=None)
+    ioGroup.add_argument('--iterative-mst', help='Re-calculate the MST for each batch',
+                         default=False,
+                         action='store_true')
     ioGroup.add_argument('--keep-intermediates', help='Retain the outputs of each batch',
                          default=False,
                          action='store_true')
@@ -94,7 +95,7 @@ def writeBatch(rlines, batches, batch_selected, use_names = False):
 
 def runCmd(cmd_string):
     sys.stderr.write("Running command:\n")
-    sys.stderr.write(cmd_string)
+    sys.stderr.write(cmd_string + '\n')
     subprocess.run(cmd_string, shell=True, check=True)
 
 def readLineages(clustCSV):
@@ -176,6 +177,10 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
 # main code
 if __name__ == "__main__":
 
+    ###########
+    # Prepare #
+    ###########
+
     # Check input ok
     args = get_options()
     if args.previous_clustering is not None and \
@@ -225,6 +230,11 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
     wd = writeBatch(rlines, batches, first_batch, args.use_batch_names)
     tmp_dirs = [wd]
    try:
+
+        ###############
+        # First batch #
+        ###############
+
        # First batch is create DB + lineage
        create_db_cmd = args.poppunk_exe + " --create-db --r-files " + \
                        wd + "/" + rfile_names + \
@@ -243,7 +253,26 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
                        str(args.threads) + " " + \
                        args.model_args
        runCmd(fit_model_cmd)
-
+
+        # Calculate MST if operating iteratively
+        if args.iterative_mst:
+
+            mst_command = args.mst_exe + " --distance-pkl " + wd + \
+                          "/" + os.path.basename(wd) + ".dists.pkl --rank-fit " + \
+                          wd + "/" + os.path.basename(wd) + "_rank" + \
+                          str(max_rank) + "_fit.npz " + \
+                          " --output " + wd + \
+                          " --threads " + str(args.threads) + \
+                          " --previous-clustering " + wd + \
+                          "/" + os.path.basename(wd) + "_lineages.csv"
+            if args.use_gpu:
+                mst_command = mst_command + " --gpu-graph"
+            runCmd(mst_command)
+
+        ###########
+        # Iterate #
+        ###########
+
        for batch_idx, batch in enumerate(batch_names):
            prev_wd = tmp_dirs[-1]
            batch_wd = writeBatch(rlines, batches, batch, args.use_batch_names)
            tmp_dirs.append(batch_wd)
@@ -257,29 +286,63 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
                         " --model-dir " + prev_wd + " --output " + batch_wd + \
                         " --threads " + str(args.threads) + " --update-db " + \
                         args.assign_args
            if args.use_gpu:
                assign_cmd = assign_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid)
            runCmd(assign_cmd)
+
+            # Calculate MST if operating iteratively
+            if args.iterative_mst:
+
+                mst_command = args.mst_exe + " --distance-pkl " + batch_wd + \
+                              "/" + os.path.basename(batch_wd) + ".dists.pkl --rank-fit " + \
+                              batch_wd + "/" + os.path.basename(batch_wd) + "_rank" + \
+                              str(max_rank) + "_fit.npz " + \
+                              " --output " + batch_wd + \
+                              " --threads " + str(args.threads) + \
+                              " --previous-mst " + \
+                              prev_wd + "/" + os.path.basename(prev_wd) + ".graphml" + \
+                              " --previous-clustering " + batch_wd + \
+                              "/" + os.path.basename(batch_wd) + "_lineages.csv"
+                if args.use_gpu:
+ mst_command = mst_command + " --gpu-graph" + runCmd(mst_command) # Remove the previous batch if batch_idx > 0 and args.keep_intermediates == False: shutil.rmtree(tmp_dirs[batch_idx - 1]) + ########## + # Finish # + ########## + # Calculate MST output_dir = tmp_dirs[-1] - mst_command = args.mst_exe + " --distance-pkl " + output_dir + \ - "/" + os.path.basename(output_dir) + ".dists.pkl --rank-fit " + \ - output_dir + "/" + os.path.basename(output_dir) + "_rank" + \ - str(max_rank) + "_fit.npz " + \ - " --output " + args.output + \ - " --threads " + str(args.threads) - if args.previous_mst is not None: - mst_command = mst_command + " --previous-mst " + args.previous_mst - if args.previous_clustering is not None: - mst_command = mst_command + " --previous-clustering " + args.previous_clustering + if args.iterative_mst: + # Create directory + if os.path.exists(args.output): + if os.path.isdir(args.output): + shutil.rmtree(args.output) + else: + os.remove(args.output) + os.mkdir(args.output) + # Copy over final MST + shutil.copy(os.path.join(output_dir,os.path.basename(output_dir) + ".graphml"), + os.path.join(args.output,os.path.basename(args.output) + ".graphml")) + shutil.copy(os.path.join(output_dir,os.path.basename(output_dir) + "_MST.nwk"), + os.path.join(args.output,os.path.basename(args.output) + "_MST.nwk")) else: - mst_command = mst_command + " --previous-clustering " + \ - os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv") - if args.use_gpu: - mst_command = mst_command + " --gpu-graph" - runCmd(mst_command) + # Calculate MST + mst_command = args.mst_exe + " --distance-pkl " + output_dir + \ + "/" + os.path.basename(output_dir) + ".dists.pkl --rank-fit " + \ + output_dir + "/" + os.path.basename(output_dir) + "_rank" + \ + str(max_rank) + "_fit.npz " + \ + " --output " + args.output + \ + " --threads " + str(args.threads) + if args.previous_clustering is not None: + mst_command = mst_command + " --previous-clustering " + args.previous_clustering + else: + mst_command = mst_command + " --previous-clustering " + \ + os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv") + if args.use_gpu: + mst_command = mst_command + " --gpu-graph" + runCmd(mst_command) # Retrieve isolate names and lineages from previous round os.rename(os.path.join(output_dir,os.path.basename(output_dir) + ".dists.pkl"), From 05ffc813f272f91ba10d417b6b36e56e1df6e2f4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 5 Feb 2021 11:10:11 +0000 Subject: [PATCH 030/327] Add output to lineage model fitting --- scripts/poppunk_batch_mst.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index d3166fec..42fe6fcd 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -251,7 +251,8 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, wd + " --rank " + \ args.rank + " --threads " + \ str(args.threads) + " " + \ - args.model_args + args.model_args + \ + " --output " + args.output runCmd(fit_model_cmd) # Calculate MST if operating iteratively From fef9e0127dd09016be766c3d20b2e1ca958b0694 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 5 Feb 2021 11:26:34 +0000 Subject: [PATCH 031/327] Remove unnecessary flags/messages --- PopPUNK/network.py | 1 - scripts/poppunk_batch_mst.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4390e901..ad8206d5 100644 --- a/PopPUNK/network.py +++ 
b/PopPUNK/network.py
@@ -301,7 +301,6 @@ def load_previous_network(prev_G, rlist, weights=False):
     if weights:
         edge_weights = prev_G.ep['weight']
         # return values
-        print("Old weights: " + str(edge_weights))
         return source_ids, target_ids, edge_weights
     else:
         return source_ids, target_ids
diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 42fe6fcd..d3166fec 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -251,8 +251,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
                        wd + " --rank " + \
                        args.rank + " --threads " + \
                        str(args.threads) + " " + \
-                        args.model_args + \
-                        " --output " + args.output
+                        args.model_args
        runCmd(fit_model_cmd)

From 46ef3a1965d6b8055b5700dd8e6359b421345e1e Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 5 Feb 2021 22:09:46 +0000
Subject: [PATCH 032/327] Consistent behaviour between GPU and non-GPU
 processes

---
 PopPUNK/network.py    | 10 +++++-----
 PopPUNK/sparse_mst.py |  1 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
index ad8206d5..d059d57d 100644
--- a/PopPUNK/network.py
+++ b/PopPUNK/network.py
@@ -266,12 +266,12 @@ def writeReferences(refList, outPrefix):
     return refFileName
 
-def load_previous_network(prev_G, rlist, weights=False):
+def load_previous_network(prev_G_fn, rlist, weights=False):
     """Load previous network with graph-tool, extract the edges to match the
     vertex order specified in rlist, and also return weights if specified.
 
     Args:
-        prev_G (str)
+        prev_G_fn (str)
             Path of file containing existing network.
         rlist (list)
             List of reference sequence labels in new network
         weights (bool)
             Whether to return edge weights
             (default = False)
 
     Returns:
         source_ids (list)
             Source nodes for each edge
         target_ids (list)
             Target nodes for each edge
         edge_weights (list)
             Weights for each new edge
     """
     # get list for translating node IDs to rlist
+    prev_G = gt.load_graph(prev_G_fn)
     old_ids = prev_G.vp["id"]
     old_id_indices = [rlist.index(x) for x in old_ids]
     # get the source and target nodes
@@ -299,7 +300,7 @@ def load_previous_network(prev_G_fn, rlist, weights=False):
     # convert to ndarray
     # get the weights
     if weights:
-        edge_weights = prev_G.ep['weight']
+        edge_weights = list(prev_G.ep['weight'])
         # return values
@@ -393,9 +394,8 @@ def constructNetwork(rlist, qlist, assignments, within_label,
     # read previous graph
     if previous_network is not None:
-        prev_G = gt.load_graph(previous_network)
         if weights is not None or sparse_input is not None:
-            extra_sources, extra_targets, extra_weights = load_previous_network(prev_G,rlist,
+            extra_sources, extra_targets, extra_weights = load_previous_network(previous_network,rlist,
                                                                                 weights = True)
diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py
index 80c794ed..c5d15757 100755
--- a/PopPUNK/sparse_mst.py
+++ b/PopPUNK/sparse_mst.py
@@ -96,6 +96,7 @@ def main():
     if args.gpu_graph:
         # Load previous MST if specified
         if args.previous_mst is not None:
+            sys.stderr.write("Loading previous MST from " + args.previous_mst + "\n")
             extra_sources, extra_targets, extra_weights = load_previous_network(args.previous_mst,
                                                                                 rlist,
                                                                                 weights = True)

From 2a04c21e27852820a42c41fea72a9442e41c2704 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 5 Feb 2021 22:14:40 +0000
Subject: [PATCH 033/327] Remove gpu-sketch flags

---
 scripts/poppunk_batch_mst.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py
b/scripts/poppunk_batch_mst.py index d3166fec..0d84c956 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -243,7 +243,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, str(args.threads) + " " + \ args.db_args if args.use_gpu: - create_db_cmd += " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) + create_db_cmd += " --gpu-dist --deviceid " + str(args.deviceid) runCmd(create_db_cmd) # Fit lineage model @@ -284,7 +284,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, " --threads " + str(args.threads) + " --update-db " + \ args.assign_args if args.use_gpu: - assign_cmd = assign_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) + assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid) runCmd(assign_cmd) # Calculate MST if operating iteratively From 7309dc24346a97f4329961dea642780427332b07 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 8 Feb 2021 10:22:50 +0000 Subject: [PATCH 034/327] Add QC options to assign --- PopPUNK/assign.py | 46 ++++++++++++++++++++++++++++++++++-- scripts/poppunk_batch_mst.py | 45 +++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index a2202108..2142731f 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -335,9 +335,24 @@ def get_options(): 'k-mers [default = use canonical k-mers]') # qc options - qcGroup = parser.add_argument_group('Quality control options') + qcGroup = parser.add_argument_group('Quality control options for distances') + qcGroup.add_argument('--qc-filter', help='Behaviour following sequence QC step: "stop" [default], "prune"' + ' (analyse data passing QC), or "continue" (analyse all data)', + default='stop', type = str, choices=['stop', 'prune', 'continue']) + qcGroup.add_argument('--retain-failures', help='Retain sketches of genomes that do not pass QC filters in ' + 'separate database [default = False]', default=False, action='store_true') qcGroup.add_argument('--max-a-dist', help='Maximum accessory distance to permit [default = 0.5]', default = 0.5, type = float) + qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' + 'which sequences will be excluded [default = 5]', default = None, type = int) + qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded ' + '[two values needed - lower and upper bounds]', default=[None,None], + type = int, nargs = 2) + qcGroup.add_argument('--prop-n', help='Threshold ambiguous base proportion above which sequences will be excluded' + ' [default = 0.1]', default = None, + type = float) + qcGroup.add_argument('--upper-n', help='Threshold ambiguous base count above which sequences will be excluded', + default=None, type = int) # sequence querying queryingGroup = parser.add_argument_group('Database querying options') @@ -389,7 +404,34 @@ def main(): from .utils import setupDBFuncs # Dict of QC options for passing to database construction and querying functions - qc_dict = {'run_qc': False } + if args.length_sigma is None and None in args.length_range and args.prop_n is None \ + and args.upper_n is None: + qc_dict = {'run_qc': False } + else: + # define defaults if one QC parameter given + # length_sigma + if args.length_sigma is not None: + length_sigma = args.length_sigma + elif None in args.length_range: + length_sigma = 5 # default used in __main__ + else: + length_sigma = None + # prop_n + if 
args.prop_n is not None: + prop_n = args.prop_n + elif args.upper_n is None: + prop_n = 0.1 # default used in __main__ + else: + prop_n = None + qc_dict = { + 'run_qc': True, + 'qc_filter': args.qc_filter, + 'retain_failures': args.retain_failures, + 'length_sigma': length_sigma, + 'length_range': args.length_range, + 'prop_n': prop_n, + 'upper_n': args.upper_n + } # Dict of DB access functions for assign_query (which is out of scope) dbFuncs = setupDBFuncs(args, args.min_kmer_count, qc_dict) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index d3166fec..e299c527 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -65,6 +65,26 @@ def get_options(): aGroup.add_argument('--assign-args', help="Other arguments to pass to poppunk_assign", default = "") + # QC options + qcGroup = parser.add_argument_group('Quality control options for distances') + qcGroup.add_argument('--qc-filter', help='Behaviour following sequence QC step: "stop" [default], "prune"' + ' (analyse data passing QC), or "continue" (analyse all data)', + default='stop', type = str, choices=['stop', 'prune', 'continue']) + qcGroup.add_argument('--retain-failures', help='Retain sketches of genomes that do not pass QC filters in ' + 'separate database [default = False]', default=False, action='store_true') + qcGroup.add_argument('--max-a-dist', help='Maximum accessory distance to permit [default = 0.5]', + default = 0.5, type = float) + qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' + 'which sequences will be excluded [default = 5]', default = None, type = int) + qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded ' + '[two values needed - lower and upper bounds]', default=[None,None], + type = int, nargs = 2) + qcGroup.add_argument('--prop-n', help='Threshold ambiguous base proportion above which sequences will be excluded' + ' [default = 0.1]', default = None, + type = float) + qcGroup.add_argument('--upper-n', help='Threshold ambiguous base count above which sequences will be excluded', + default=None, type = int) + # Executable options eGroup = parser.add_argument_group('Executable locations') eGroup.add_argument('--poppunk-exe', help="Location of poppunk executable. 
Use " @@ -242,6 +262,16 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, args.db_args + " --threads " + \ str(args.threads) + " " + \ args.db_args + # QC options + if None not in args.length_range: + create_db_cmd += " --length-range " + str(length_range[0]) + " " + str(length_range[1]) + elif args.length_sigma is not None: + create_db_cmd += " --length-sigma " + str(args.length_sigma) + if args.upper_n is not None: + create_db_cmd += " --upper-n " + str(args.upper_n) + elif args.prop_n is not None: + create_db_cmd += " --prop-n " + str(args.prop_n) + # GPU options if args.use_gpu: create_db_cmd += " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) runCmd(create_db_cmd) @@ -265,6 +295,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, " --threads " + str(args.threads) + \ " --previous-clustering " + wd + \ "/" + os.path.basename(wd) + "_lineages.csv" + # GPU options if args.use_gpu: mst_command = mst_command + " --gpu-graph" runCmd(mst_command) @@ -283,6 +314,20 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, " --model-dir " + prev_wd + " --output " + batch_wd + \ " --threads " + str(args.threads) + " --update-db " + \ args.assign_args + # QC options + if None not in args.length_range: + create_db_cmd += " --length-range " + str(length_range[0]) + " " + str(length_range[1]) + elif args.length_sigma is not None: + create_db_cmd += " --length-sigma " + str(args.length_sigma) + else: + create_db_cmd += " --length-sigma 5" # default from __main__ + if args.upper_n is not None: + create_db_cmd += " --upper-n " + str(args.upper_n) + elif args.prop_n is not None: + create_db_cmd += " --prop-n " + str(args.prop_n) + else: + create_db_cmd += " --prop-n 0.1" # default from __main__ + # GPU options if args.use_gpu: assign_cmd = assign_cmd + " --gpu-sketch --gpu-dist --deviceid " + str(args.deviceid) runCmd(assign_cmd) From 73515b365e592af4fdf0bb5460f47124fa3bec3b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 8 Feb 2021 10:36:55 +0000 Subject: [PATCH 035/327] Fix QC option parsing --- scripts/poppunk_batch_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index f8ccebf0..421dad8f 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -264,7 +264,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, args.db_args # QC options if None not in args.length_range: - create_db_cmd += " --length-range " + str(length_range[0]) + " " + str(length_range[1]) + create_db_cmd += " --length-range " + str(args.length_range[0]) + " " + str(args.length_range[1]) elif args.length_sigma is not None: create_db_cmd += " --length-sigma " + str(args.length_sigma) if args.upper_n is not None: From 8c86be812a64c8c023fb04d7935fc24636b485a0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 8 Feb 2021 12:24:36 +0000 Subject: [PATCH 036/327] Define qc filter behaviour --- scripts/poppunk_batch_mst.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 421dad8f..33979610 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -271,6 +271,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, create_db_cmd += " --upper-n " + str(args.upper_n) elif args.prop_n is not None: create_db_cmd += " --prop-n " + str(args.prop_n) + create_db_cmd += " --qc-filter " + args.qc_filter # GPU options if args.use_gpu: create_db_cmd += " --gpu-dist 
--deviceid " + str(args.deviceid) @@ -327,6 +328,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, create_db_cmd += " --prop-n " + str(args.prop_n) else: create_db_cmd += " --prop-n 0.1" # default from __main__ + create_db_cmd += " --qc-filter " + args.qc_filter # GPU options if args.use_gpu: assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid) From 10bd9fa7b6707b30ef29e77704630ca816c27f85 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 8 Feb 2021 13:10:54 +0000 Subject: [PATCH 037/327] Fix length range error --- scripts/poppunk_batch_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 33979610..6d831c43 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -317,7 +317,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, args.assign_args # QC options if None not in args.length_range: - create_db_cmd += " --length-range " + str(length_range[0]) + " " + str(length_range[1]) + create_db_cmd += " --length-range " + str(args.length_range[0]) + " " + str(args.length_range[1]) elif args.length_sigma is not None: create_db_cmd += " --length-sigma " + str(args.length_sigma) else: From 752772d6a92541c4917d83b93093cbb4573bf20f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 8 Feb 2021 13:54:51 +0000 Subject: [PATCH 038/327] Change GPU use options --- scripts/poppunk_batch_mst.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 6d831c43..87d9e112 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -51,7 +51,10 @@ def get_options(): aGroup.add_argument('--threads', help='Number of threads for parallelisation (int)', type = int, default = 1) - aGroup.add_argument('--use-gpu', help='Use GPU for analysis', + aGroup.add_argument('--gpu-dist', help='Use GPU for distance calculations', + default=False, + action='store_true') + aGroup.add_argument('--gpu-graph', help='Use GPU for network analysis', default=False, action='store_true') aGroup.add_argument('--deviceid', help='GPU device ID (int)', @@ -273,7 +276,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, create_db_cmd += " --prop-n " + str(args.prop_n) create_db_cmd += " --qc-filter " + args.qc_filter # GPU options - if args.use_gpu: + if args.gpu_dist: create_db_cmd += " --gpu-dist --deviceid " + str(args.deviceid) runCmd(create_db_cmd) @@ -297,7 +300,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, " --previous-clustering " + wd + \ "/" + os.path.basename(wd) + "_lineages.csv" # GPU options - if args.use_gpu: + if args.gpu_graph: mst_command = mst_command + " --gpu-graph" runCmd(mst_command) @@ -330,7 +333,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, create_db_cmd += " --prop-n 0.1" # default from __main__ create_db_cmd += " --qc-filter " + args.qc_filter # GPU options - if args.use_gpu: + if args.gpu_dist: assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid) runCmd(assign_cmd) @@ -347,7 +350,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, prev_wd + "/" + os.path.basename(prev_wd) + ".graphml" + \ " --previous-clustering " + batch_wd + \ "/" + os.path.basename(batch_wd) + "_lineages.csv" - if args.use_gpu: + if args.gpu_graph: mst_command = mst_command + " --gpu-graph" runCmd(mst_command) @@ -387,7 +390,7 @@ def writeClusterCsv(outfile, 
nodeNames, nodeLabels, clustering,
             else:
                 mst_command = mst_command + " --previous-clustering " + \
                               os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv")
-            if args.use_gpu:
+            if args.gpu_graph:
                 mst_command = mst_command + " --gpu-graph"
             runCmd(mst_command)

From fc549cc7418d7bc543fcf5a40195f79e2165136d Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Mon, 8 Feb 2021 16:02:56 +0000
Subject: [PATCH 039/327] Add QC to assign command

---
 scripts/poppunk_batch_mst.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py
index 87d9e112..959e3c0a 100755
--- a/scripts/poppunk_batch_mst.py
+++ b/scripts/poppunk_batch_mst.py
@@ -320,18 +320,18 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
                          args.assign_args
             # QC options
             if None not in args.length_range:
-                create_db_cmd += " --length-range " + str(args.length_range[0]) + " " + str(args.length_range[1])
+                assign_cmd += " --length-range " + str(args.length_range[0]) + " " + str(args.length_range[1])
             elif args.length_sigma is not None:
-                create_db_cmd += " --length-sigma " + str(args.length_sigma)
+                assign_cmd += " --length-sigma " + str(args.length_sigma)
             else:
-                create_db_cmd += " --length-sigma 5" # default from __main__
+                assign_cmd += " --length-sigma 5" # default from __main__
             if args.upper_n is not None:
-                create_db_cmd += " --upper-n " + str(args.upper_n)
+                assign_cmd += " --upper-n " + str(args.upper_n)
             elif args.prop_n is not None:
-                create_db_cmd += " --prop-n " + str(args.prop_n)
+                assign_cmd += " --prop-n " + str(args.prop_n)
             else:
-                create_db_cmd += " --prop-n 0.1" # default from __main__
-            create_db_cmd += " --qc-filter " + args.qc_filter
+                assign_cmd += " --prop-n 0.1" # default from __main__
+            assign_cmd += " --qc-filter " + args.qc_filter
             # GPU options
             if args.gpu_dist:
                 assign_cmd = assign_cmd + " --gpu-dist --deviceid " + str(args.deviceid)
             runCmd(assign_cmd)

From 6836d0f578d965411c3ee6a5b1903a2e29a5ec76 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Tue, 9 Feb 2021 08:18:26 +0000
Subject: [PATCH 040/327] Manual updates from master

---
 PopPUNK/visualise.py | 30 ++++++++++++++++-------
 test/run_test.py     | 52 ++++++++++++++++++++++++------------------
 2 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
index 75682248..31be5033 100644
--- a/PopPUNK/visualise.py
+++ b/PopPUNK/visualise.py
@@ -62,6 +62,9 @@ def get_options():
                          'from poppunk_assign [default = use that in the directory '
                          'of the query database]',
                          type = str)
+    iGroup.add_argument('--use-network',
+                        help='Specify a directory containing a .gt file to use for any graph visualisations',
+                        type = str)
     iGroup.add_argument('--display-cluster',
                          help='Column of clustering CSV to use for plotting',
                          default=None)
@@ -146,6 +149,7 @@ def generate_visualisations(query_db,
                             model_dir,
                             previous_clustering,
                             previous_query_clustering,
+                            use_network,
                             info_csv,
                             rapidnj,
                             tree,
@@ -203,9 +207,9 @@ def generate_visualisations(query_db,
     if distances is None:
         if query_db is None:
-            distances = os.path.basename(ref_db) + "/" + ref_db + ".dists"
+            distances = ref_db + "/" + os.path.basename(ref_db) + ".dists"
         else:
-            distances = os.path.basename(query_db) + "/" + query_db + ".dists"
+            distances = query_db + "/" + os.path.basename(query_db) + ".dists"
     else:
         distances = distances
@@ -220,16 +224,16 @@ def generate_visualisations(query_db,
         sys.stderr.write("Note: Distances in " + distances + " are from assign mode\n"
                          "Note: Distance will be extended to full all-vs-all distances\n"
                          "Note: 
Re-run poppunk_assign with --update-db to avoid this\n") - ref_db = os.path.basename(ref_db) + "/" + ref_db - rlist_original, qlist_original, self_ref, rr_distMat = readPickle(ref_db + ".dists") + ref_db_loc = ref_db + "/" + os.path.basename(ref_db) + rlist_original, qlist_original, self_ref, rr_distMat = readPickle(ref_db_loc + ".dists") if not self_ref: sys.stderr.write("Distances in " + ref_db + " not self all-vs-all either\n") sys.exit(1) kmers, sketch_sizes, codon_phased = readDBParams(query_db) addRandom(query_db, qlist, kmers, strand_preserved = strand_preserved, threads = threads) - query_db = os.path.basename(query_db) + "/" + query_db - qq_distMat = pp_sketchlib.queryDatabase(query_db, query_db, + query_db_loc = query_db + "/" + os.path.basename(query_db) + qq_distMat = pp_sketchlib.queryDatabase(query_db_loc, query_db_loc, qlist, qlist, kmers, True, False, threads, @@ -239,7 +243,7 @@ def generate_visualisations(query_db, # If the assignment was run with references, qrDistMat will be incomplete if rlist != rlist_original: rlist = rlist_original - qr_distMat = pp_sketchlib.queryDatabase(ref_db, query_db, + qr_distMat = pp_sketchlib.queryDatabase(ref_db_loc, query_db_loc, rlist, qlist, kmers, True, False, threads, @@ -291,7 +295,7 @@ def generate_visualisations(query_db, else: model_prefix = ref_db try: - model_file = os.path.basename(model_prefix) + "/" + os.path.basename(model_prefix) + model_file = model_prefix + "/" + os.path.basename(model_prefix) model = loadClusterFit(model_file + '_fit.pkl', model_file + '_fit.npz') except FileNotFoundError: @@ -321,6 +325,15 @@ def generate_visualisations(query_db, mode = mode, return_dict = True) + # Set graph location + if use_network is not None: + graph_dir = use_network + if graph_dir != prev_clustering: + sys.stderr.write("WARNING: Loading graph from a different directory to clusters\n") + sys.stderr.write("WARNING: Ensure that they are consistent\n") + else: + graph_dir = prev_clustering + # Join clusters with query clusters if required if not self: if previous_query_clustering is not None: @@ -460,6 +473,7 @@ def main(): args.model_dir, args.previous_clustering, args.previous_query_clustering, + args.use_network, args.info_csv, args.rapidnj, args.tree, diff --git a/test/run_test.py b/test/run_test.py index aaf7eaa2..9b0c2b75 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -12,20 +12,26 @@ sys.stderr.write("Extracting example dataset\n") subprocess.run("tar xf example_set.tar.bz2", shell=True, check=True) +if os.environ.get("POPPUNK_PYTHON"): + python_cmd = os.environ.get("POPPUNK_PYTHON") +else: + python_cmd = "python" + +#easy run sys.stderr.write("Running database creation (--create-db)\n") -subprocess.run("python ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_db --qc-filter prune --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_db --qc-filter prune --overwrite", shell=True, check=True) # create database with different QC options sys.stderr.write("Running database QC test (--create-db)\n") -subprocess.run("python ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_qc --qc-filter continue --length-range 2000000 3000000 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_qc --qc-filter continue 
--length-range 2000000 3000000 --overwrite", shell=True, check=True) #fit GMM sys.stderr.write("Running GMM model fit (--fit-model gmm)\n") -subprocess.run("python ../poppunk-runner.py --fit-model bgmm --ref-db example_db --K 4 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model bgmm --ref-db example_db --K 4 --overwrite", shell=True, check=True) #fit dbscan sys.stderr.write("Running DBSCAN model fit (--fit-model dbscan)\n") -subprocess.run("python ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --graph-weights", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model dbscan --ref-db example_db --output example_dbscan --overwrite --graph-weights", shell=True, check=True) #refine model with GMM sys.stderr.write("Running model refinement (--fit-model refine)\n") @@ -37,11 +43,11 @@ # lineage clustering sys.stderr.write("Running lineage clustering test (--fit-model lineage)\n") -subprocess.run("python ../poppunk-runner.py --fit-model lineage --output example_lineages --ranks 1,2,3,5 --ref-db example_db --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --output example_lineages --ranks 1,2,3,5 --ref-db example_db --overwrite", shell=True, check=True) #use model sys.stderr.write("Running with an existing model (--use-model)\n") -subprocess.run("python ../poppunk-runner.py --use-model --ref-db example_db --model-dir example_db --output example_use --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --use-model --ref-db example_db --model-dir example_db --output example_use --overwrite", shell=True, check=True) # tests of other command line programs sys.stderr.write("Testing C++ extension\n") @@ -49,21 +55,21 @@ #assign query sys.stderr.write("Running query assignment\n") -subprocess.run("python ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query --overwrite", shell=True, check=True) -subprocess.run("python ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) -subprocess.run("python ../poppunk_assign-runner.py --query single_query.txt --db example_db --output example_single_query --update-db --overwrite", shell=True, check=True) -subprocess.run("python ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --output example_single_query --update-db --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True) # viz sys.stderr.write("Running visualisations (poppunk_visualise)\n") -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --microreact", 
shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --phandango", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --query-db example_query --output example_viz_query --microreact", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages/example_lineages_lineages.csv --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True) -subprocess.run("python ../poppunk_visualise-runner.py --distances example_query/example_query.dists --ref-db example_db --model-dir example_lineages --query-db example_lineage_query --output example_viz_query_lineages --microreact", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --microreact", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --phandango", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --query-db example_query --output example_viz_query --microreact", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --distances example_query/example_query.dists --ref-db example_db --model-dir example_lineages --query-db example_lineage_query --output example_viz_query_lineages --microreact", shell=True, check=True) # MST sys.stderr.write("Running MST\n") @@ -72,15 +78,19 @@ # t-sne sys.stderr.write("Running tsne viz\n") -subprocess.run("python ../poppunk_tsne-runner.py --distances example_db/example_db.dists --output example_tsne --perplexity 5 --verbosity 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_tsne-runner.py --distances example_db/example_db.dists --output example_tsne --perplexity 5 --verbosity 1", shell=True, check=True) # prune sys.stderr.write("Running poppunk_prune\n") -subprocess.run("python ../poppunk_prune-runner.py --distances example_db/example_db.dists --ref-db example_db --remove subset.txt --output example_prune", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_prune-runner.py --distances example_db/example_db.dists --ref-db example_db --remove subset.txt --output example_prune", shell=True, check=True) # references 
sys.stderr.write("Running poppunk_references\n") -subprocess.run("python ../poppunk_references-runner.py --network example_db/example_db_graph.gt --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.gt --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db", shell=True, check=True) + +# web API +sys.stderr.write("Running API tests\n") +subprocess.run(python_cmd + " test_web.py", shell=True, check=True) sys.stderr.write("Tests completed\n") From 8b3995855f4fbfdbd89b1f8798e364dfe9ba3aa3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Feb 2021 09:29:36 +0000 Subject: [PATCH 041/327] Fix file paths in tests --- PopPUNK/visualise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 31be5033..27e5e1b5 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -320,7 +320,7 @@ def generate_visualisations(query_db, if model.indiv_fitted: sys.stderr.write("Note: Individual (core/accessory) fits found, but " "visualisation only supports combined boundary fit\n") - prev_clustering = os.path.dirname(model_file) + '/' + os.path.basename(model_file) + suffix + prev_clustering = os.path.basename(model_file) + '/' + os.path.basename(model_file) + suffix isolateClustering = readIsolateTypeFromCsv(prev_clustering, mode = mode, return_dict = True) @@ -339,7 +339,7 @@ def generate_visualisations(query_db, if previous_query_clustering is not None: prev_query_clustering = previous_query_clustering else: - prev_query_clustering = os.path.dirname(query_db) + '/' + os.path.basename(query_db) + suffix + prev_query_clustering = os.path.basename(query_db) + '/' + os.path.basename(query_db) + suffix queryIsolateClustering = readIsolateTypeFromCsv( prev_query_clustering, From 64d0bbe24a6297591f25c2b48cfcfd62b74f7d12 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Feb 2021 12:15:18 +0000 Subject: [PATCH 042/327] Add error message when distance file is missing --- PopPUNK/sparse_mst.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index c5d15757..34b47763 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -80,6 +80,9 @@ def main(): if not self: sys.stderr.write("This script must be run on a full all-v-all model\n") sys.exit(1) + else: + sys.stderr.write("Cannot find file " + args.distance_pkl + "\n") + sys.exit(1) # Check output path ok if not os.path.isdir(args.output): From f94648d6e53f194ad881b2d622f9d5c92082b8c8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Feb 2021 13:03:33 +0000 Subject: [PATCH 043/327] Prune MST when created --- PopPUNK/network.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d059d57d..3a385829 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -369,12 +369,16 @@ def constructNetwork(rlist, qlist, assignments, within_label, if edge_list: if weights is not None: for weight, (ref, query) in zip(weights, assignments): - connections.append((ref, query, weight)) + # sparse matrix is symmetrical, avoid redundant loops + if ref < query: + connections.append((ref, query, weight)) else: connections = assignments elif sparse_input is not None: for ref, query, weight in zip(sparse_input.row, sparse_input.col, 
sparse_input.data): - connections.append((ref, query, weight)) + # sparse matrix is symmetrical, avoid redundant loops + if ref < query: + connections.append((ref, query, weight)) else: for row_idx, (assignment, (ref, query)) in enumerate(zip(assignments, listDistInts(rlist, qlist, @@ -399,14 +403,16 @@ def constructNetwork(rlist, qlist, assignments, within_label, weights = True) for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights): edge_tuple = (ref, query, weight) - connections.append(edge_tuple) + if ref < query: + connections.append(edge_tuple) else: extra_sources, extra_targets = load_previous_network(prev_G,rlist, weights = False) for (ref, query) in zip(extra_sources, extra_targets): edge_tuple = (ref, query) - connections.append(edge_tuple) - + if ref < query: + connections.append(edge_tuple) + # build the graph G = gt.Graph(directed = False) G.add_vertex(len(vertex_labels)) @@ -834,6 +840,7 @@ def generate_minimum_spanning_tree(G, from_cugraph = False): if "weight" in G.edge_properties: mst_edge_prop_map = gt.min_spanning_tree(G, weights = G.ep["weight"]) mst_network = gt.GraphView(G, efilt = mst_edge_prop_map) + mst_network = gt.Graph(mst_network, prune = True) else: sys.stderr.write("generate_minimum_spanning_tree requires a weighted graph\n") raise RuntimeError("MST passed unweighted graph") From 88eb94b8fdd4328754de36b5dc5a80be9ec57cb5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Feb 2021 13:15:10 +0000 Subject: [PATCH 044/327] Restore correct test for visualisation --- test/run_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index 9b0c2b75..2337a282 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -68,7 +68,7 @@ subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --query-db example_query --output example_viz_query --microreact", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --previous-clustering example_lineages/example_lineages_lineages.csv --model-dir example_lineages --output example_lineage_viz --microreact", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --distances example_query/example_query.dists --ref-db example_db --model-dir example_lineages --query-db example_lineage_query --output example_viz_query_lineages --microreact", shell=True, check=True) # MST From 25d57f4a9ffe467beeabff6283e30778aac06cbf Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Feb 2021 12:38:51 +0000 Subject: [PATCH 045/327] Add ability to extract distances from a sparse matrix --- scripts/poppunk_extract_distances.py | 46 ++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/scripts/poppunk_extract_distances.py b/scripts/poppunk_extract_distances.py index 6552fd03..3bbe2448 100755 --- a/scripts/poppunk_extract_distances.py +++ b/scripts/poppunk_extract_distances.py @@ -7,6 +7,7 @@ import numpy as 
np import argparse import dendropy +from scipy import sparse # command line parsing def get_options(): @@ -14,9 +15,17 @@ def get_options(): parser = argparse.ArgumentParser(description='Extract tab-separated file of distances from pkl and npy files', prog='extract_distances') # input options - parser.add_argument('--distances', required=True, help='Prefix of input pickle and numpy file of pre-calculated distances (required)') - parser.add_argument('--tree', required=False, help='Newick file containing phylogeny of isolates', default = None) - parser.add_argument('--output', required=True, help='Name of output file') + parser.add_argument('--distances', help='Prefix of input pickle (and optionally,' + ' numpy file) of pre-calculated distances (required)', + required=True) + parser.add_argument('--sparse', help='Sparse distance matrix file name', + default = None, + required = False) + parser.add_argument('--tree', help='Newick file containing phylogeny of isolates', + required = False, + default = None) + parser.add_argument('--output', help='Name of output file', + required = True) return parser.parse_args() @@ -71,7 +80,6 @@ def isolateNameToLabel(names): # open stored distances with open(args.distances + ".pkl", 'rb') as pickle_file: rlist, qlist, self = pickle.load(pickle_file) - X = np.load(args.distances + ".npy") # get names order r_names = isolateNameToLabel(rlist) @@ -91,14 +99,32 @@ def isolateNameToLabel(names): taxon_name = t.label.replace(' ','_') tip_index[r_names.index(taxon_name)] = t + # Load sparse matrix + if args.sparse is not None: + sparse_mat = sparse.load_npz(args.sparse) + else: + X = np.load(args.distances + ".npy") + # open output file with open(args.output, 'w') as oFile: - oFile.write("\t".join(['Query', 'Reference', 'Core', 'Accessory'])) + # Write header of output file + if args.sparse is not None: + oFile.write("\t".join(['Query', 'Reference', 'Core'])) + else: + oFile.write("\t".join(['Query', 'Reference', 'Core', 'Accessory'])) if args.tree is not None: oFile.write("\t" + 'Patristic') oFile.write("\n") - for i, (r_index, q_index) in enumerate(iterDistRows(r_names, q_names, r_names == q_names)): - oFile.write("\t".join([q_names[q_index], r_names[r_index], str(X[i,0]), str(X[i,1])])) - if args.tree is not None: - oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index]))) - oFile.write("\n") + # Write distances + if args.sparse is not None: + for (r_index, q_index, dist) in zip(sparse_mat.col, sparse_mat.row, sparse_mat.data): + oFile.write("\t".join([q_names[q_index], r_names[r_index], str(dist)])) + if args.tree is not None: + oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index]))) + oFile.write("\n") + else: + for i, (r_name, q_name) in enumerate(iterDistRows(r_names, q_names, r_names == q_names)): + oFile.write("\t".join([q_name, r_name, str(X[i,0]), str(X[i,1])])) + if args.tree is not None: + oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index]))) + oFile.write("\n") From 2b17f81bfc8c6063eda09d9d0bf94b15edf114d6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Feb 2021 12:41:46 +0000 Subject: [PATCH 046/327] Fix naming of temporary directories --- scripts/poppunk_batch_mst.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 959e3c0a..4b37af81 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -105,7 +105,9 @@ def get_options(): def writeBatch(rlines, batches, batch_selected, 
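# [Illustrative sketch, not part of the patch series] The fix just below
# guards the named temporary directory against leftovers from a previous run;
# the anonymous branch needs no guard because mkdtemp always picks a fresh
# name. The helper name and batch_id argument are stand-ins.
import os
import shutil
import tempfile

def make_batch_dir(batch_id, use_names=False):
    if use_names:
        tmpdir = "./pp_mst_" + str(batch_id)
        if os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)   # clear any stale directory first
        os.mkdir(tmpdir)
    else:
        # mkdtemp chooses a unique name, so no collision handling is needed
        tmpdir = tempfile.mkdtemp(prefix="pp_mst", dir="./")
    return tmpdir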
use_names = False): tmpdir = "" if use_names: - tmpdir = "./pp_mst_" + batch_selected + tmpdir = "./pp_mst_" + str(batch_selected) + if os.path.exists(tmpdir): + shutil.rmtree(tmpdir) os.mkdir(tmpdir) else: tmpdir = tempfile.mkdtemp(prefix="pp_mst", dir="./") @@ -399,6 +401,9 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, os.path.join(args.output,os.path.basename(args.output) + ".dists.pkl")) os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv"), os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")) + for rank in ranks: + os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_rank" + str(rank) + "_fit.npz"), + os.path.join(args.output,os.path.basename(args.output) + "_rank" + str(rank) + "_fit.npz")) # Merge with epidemiological data if requested if args.info_csv is not None: @@ -414,10 +419,12 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, except: if args.keep_intermediates == False: for tmpdir in tmp_dirs: - shutil.rmtree(tmpdir) + try: + shutil.rmtree(tmpdir) + except: + sys.stderr.write("Unable to remove " + tmpdir + "\n") print("Unexpected error:", sys.exc_info()[0]) raise if args.keep_intermediates == False: - shutil.rmtree(wd) shutil.rmtree(output_dir) From 90758221b922c58e2861591671f5e546eeaf9aa6 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Feb 2021 15:02:28 +0000 Subject: [PATCH 047/327] Remove unneeded return from sketchlib query Label order remains unchanged (unlike with mash) --- PopPUNK/__main__.py | 20 +++++++------- PopPUNK/assign.py | 62 ++++++++++++++++++++++---------------------- PopPUNK/network.py | 32 +++++++++++------------ PopPUNK/sketchlib.py | 44 +------------------------------ 4 files changed, 58 insertions(+), 100 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 63924906..cb5f981a 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -292,19 +292,19 @@ def main(): rNames = seq_names qNames = seq_names - refList, queryList, distMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = args.output, - queryPrefix = args.output, - klist = kmers, - self = True, - number_plot_fits = args.plot_fit, - threads = args.threads) - qcDistMat(distMat, refList, queryList, args.max_a_dist) + distMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = args.output, + queryPrefix = args.output, + klist = kmers, + self = True, + number_plot_fits = args.plot_fit, + threads = args.threads) + qcDistMat(distMat, rNames, qNames, args.max_a_dist) # Save results dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" - storePickle(refList, queryList, True, distMat, dists_out) + storePickle(rNames, qNames, True, distMat, dists_out) # Plot results plot_scatter(distMat, diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 4ce970a3..30b2d558 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -131,22 +131,22 @@ def assign_query(dbFuncs, codon_phased = codon_phased, calc_random = False) # run query - refList, queryList, qrDistMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = ref_db, - queryPrefix = output, - klist = kmers, - self = False, - number_plot_fits = plot_fit, - threads = threads) + qrDistMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = ref_db, + queryPrefix = output, + klist = kmers, + self = False, + number_plot_fits = plot_fit, + threads = threads) # QC distance matrix - qcPass = qcDistMat(qrDistMat, refList, queryList, max_a_dist) + qcPass = 
qcDistMat(qrDistMat, rNames, qNames, max_a_dist) # Load the network based on supplied options genomeNetwork, old_cluster_file = \ fetchNetwork(prev_clustering, model, - refList, + rNames, ref_graph = use_ref_graph, core_only = core_only, accessory_only = accessory_only) @@ -154,14 +154,14 @@ def assign_query(dbFuncs, if model.type == 'lineage': # Assign lineages by calculating query-query information addRandom(output, qNames, kmers, strand_preserved, overwrite, threads) - qlist1, qlist2, qqDistMat = queryDatabase(rNames = qNames, - qNames = qNames, - dbPrefix = output, - queryPrefix = output, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads) + qqDistMat = queryDatabase(rNames = qNames, + qNames = qNames, + dbPrefix = output, + queryPrefix = output, + klist = kmers, + self = True, + number_plot_fits = 0, + threads = threads) model.extend(qqDistMat, qrDistMat) genomeNetwork = {} @@ -182,18 +182,18 @@ def assign_query(dbFuncs, isolateClustering[rank] = \ printClusters(genomeNetwork[rank], - refList + queryList, + rNames + qNames, printCSV = False) overall_lineage = createOverallLineage(model.ranks, isolateClustering) writeClusterCsv( output + "/" + os.path.basename(output) + '_lineages.csv', - refList + queryList, - refList + queryList, + rNames + qNames, + rNames + qNames, overall_lineage, output_format = 'phandango', epiCsv = None, - queryNames = queryList, + queryNames = qNames, suffix = '_Lineage') else: @@ -206,14 +206,14 @@ def assign_query(dbFuncs, else: weights = None qqDistMat = \ - addQueryToNetwork(dbFuncs, refList, queryList, + addQueryToNetwork(dbFuncs, rNames, qNames, genomeNetwork, kmers, queryAssignments, model, output, update_db, strand_preserved, weights = weights, threads = threads) isolateClustering = \ - {'combined': printClusters(genomeNetwork, refList + queryList, + {'combined': printClusters(genomeNetwork, rNames + qNames, output + "/" + os.path.basename(output), old_cluster_file, external_clustering, @@ -248,9 +248,9 @@ def assign_query(dbFuncs, combined_seq, core_distMat, acc_distMat = \ update_distance_matrices(refList, rrDistMat, - queryList, qrDistMat, - qqDistMat, threads = threads) - assert combined_seq == refList + queryList + qNames, qrDistMat, + qqDistMat, threads = threads) + assert combined_seq == refList + qNames # Get full distance matrix and save complete_distMat = \ @@ -260,12 +260,12 @@ def assign_query(dbFuncs, # Clique pruning if model.type != 'lineage': - dbOrder = refList + queryList + dbOrder = refList + qNames newRepresentativesIndices, newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ extractReferences(genomeNetwork, dbOrder, output, refList, threads = threads) # intersection that maintains order - newQueries = [x for x in queryList if x in frozenset(newRepresentativesNames)] + newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] # could also have newRepresentativesNames in this diff (should be the same) - but want # to ensure consistency with the network in case of bad input/bugs @@ -280,12 +280,12 @@ def assign_query(dbFuncs, genomeNetwork.save(output + "/" + os.path.basename(output) + '.refs_graph.gt', fmt = 'gt') removeFromDB(output, output, names_to_remove) os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", - output + "/" + os.path.basename(output) + ".refs.h5") + output + "/" + os.path.basename(output) + ".refs.h5") # ensure sketch and distMat order match assert postpruning_combined_seq == refList + newQueries else: - storePickle(refList, queryList, False, 
qrDistMat, dists_out) + storePickle(refList, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: if model.type == 'lineage': genomeNetwork[min(model.ranks)].save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b5c9cadd..d55a2521 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -489,14 +489,14 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, if queryQuery: sys.stderr.write("Calculating all query-query distances\n") addRandom(queryDB, qList, kmers, strand_preserved, threads = threads) - qlist1, qlist2, qqDistMat = queryDatabase(rNames = qList, - qNames = qList, - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads) + qqDistMat = queryDatabase(rNames = qList, + qNames = qList, + dbPrefix = queryDB, + queryPrefix = queryDB, + klist = kmers, + self = True, + number_plot_fits = 0, + threads = threads) queryAssignation = model.assign(qqDistMat) for row_idx, (assignment, (ref, query)) in enumerate(zip(queryAssignation, listDistInts(qList, qList, self = True))): @@ -519,14 +519,14 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, # use database construction methods to find links between unassigned queries addRandom(queryDB, qList, kmers, strand_preserved, threads = threads) - qlist1, qlist2, qqDistMat = queryDatabase(rNames = list(unassigned), - qNames = list(unassigned), - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads) + qqDistMat = queryDatabase(rNames = list(unassigned), + qNames = list(unassigned), + dbPrefix = queryDB, + queryPrefix = queryDB, + klist = kmers, + self = True, + number_plot_fits = 0, + threads = threads) queryAssignation = model.assign(qqDistMat) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 37ead86d..96509c3d 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -517,10 +517,6 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num (default = 0) Returns: - refList (list) - Names of reference sequences - queryList (list) - Names of query sequences distMat (numpy.array) Core distances (column 0) and accessory distances (column 1) between refList and queryList @@ -568,46 +564,8 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num distMat = pp_sketchlib.queryDatabase(ref_db, query_db, rNames, qNames, klist, True, False, threads, use_gpu, deviceid) - return(rNames, qNames, distMat) + return distMat -def calculateQueryQueryDistances(dbFuncs, qlist, kmers, - queryDB, threads = 1): - """Calculates distances between queries. 
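# [Illustrative sketch, not part of the patch series] With queryDatabase now
# returning only the distance matrix, callers recover which row is which from
# the input name order alone. A minimal pairing for a self-comparison,
# assuming distMat is the long-form numpy array and the (j, i) upper-triangle
# row order used elsewhere in this series; the function name is a stand-in.
def label_self_dists(names, distMat):
    labelled = []
    row = 0
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            # column 0 holds the core distance, column 1 the accessory distance
            labelled.append((names[j], names[i], distMat[row, 0], distMat[row, 1]))
            row += 1
    return labelled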
- - Args: - dbFuncs (list) - List of backend functions from :func:`~PopPUNK.utils.setupDBFuncs` - rlist (list) - List of reference names - qlist (list) - List of query names - kmers (list) - List of k-mer sizes - queryDB (str) - Query database location - threads (int) - Number of threads to use if new db created - (default = 1) - - Returns: - qlist1 (list) - Ordered list of queries - distMat (numpy.array) - Query-query distances - """ - - queryDatabase = dbFuncs['queryDatabase'] - - qlist1, qlist2, distMat = queryDatabase(rNames = qlist, - qNames = qlist, - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads) - - return qlist1, distMat def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): """Calculates random match probability based on means of genomes From 18dd105406f7c35483acd99d58a1d7b6db60bc98 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Feb 2021 15:07:48 +0000 Subject: [PATCH 048/327] Fix assign import --- PopPUNK/assign.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 30b2d558..5e5808df 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -60,7 +60,6 @@ def assign_query(dbFuncs, from .prune_db import prune_distance_matrix - from .sketchlib import calculateQueryQueryDistances from .sketchlib import addRandom from .utils import storePickle @@ -285,7 +284,7 @@ def assign_query(dbFuncs, # ensure sketch and distMat order match assert postpruning_combined_seq == refList + newQueries else: - storePickle(refList, qNames, False, qrDistMat, dists_out) + storePickle(rNames, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: if model.type == 'lineage': genomeNetwork[min(model.ranks)].save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') From 92e5fde579d0891879f2f902d9da9521cb9e73d8 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Feb 2021 15:16:26 +0000 Subject: [PATCH 049/327] Check for slash in sample names --- PopPUNK/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index bd3d2995..4bf042ce 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -389,6 +389,9 @@ def readRfile(rFile, oneSeq=False): "Must contain sample name and file, tab separated\n") sys.exit(1) + if "/" in rFields[0]: + sys.stderr.write("Sample names may not contain slashes\n") + sys.exit(1) names.append(rFields[0]) sample_files = [] for sequence in rFields[1:]: From 6934b416608d9f3112e07821bd199ddf0d225556 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Feb 2021 16:28:41 +0000 Subject: [PATCH 050/327] Update distance extract script --- scripts/poppunk_extract_distances.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/scripts/poppunk_extract_distances.py b/scripts/poppunk_extract_distances.py index 6552fd03..914a8ade 100755 --- a/scripts/poppunk_extract_distances.py +++ b/scripts/poppunk_extract_distances.py @@ -20,9 +20,11 @@ def get_options(): return parser.parse_args() -def iterDistRows(refSeqs, querySeqs, self=True): +def listDistInts(refSeqs, querySeqs, self=True): """Gets the ref and query ID for each row of the distance matrix + Returns an iterable with ref and query ID pairs by row. + Args: refSeqs (list) List of reference sequence names. @@ -36,15 +38,21 @@ def iterDistRows(refSeqs, querySeqs, self=True): ref, query (str, str) Iterable of tuples with ref and query names for each distMat row. 
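# [Illustrative sketch, not part of the patch series] Why the trailing
# "return comparisons" added here is dropped again in a later patch of this
# series: any function containing "yield" is a generator, so a plain for-loop
# drives the yields and discards whatever the final return statement carries.
def pairs(n):
    for i in range(n):
        for j in range(i + 1, n):
            yield (j, i)
    return []  # executes, but ordinary iteration never sees this value

for j, i in pairs(3):
    print(j, i)  # prints: 1 0, 2 0, 2 1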
""" + num_ref = len(refSeqs) + num_query = len(querySeqs) if self: - assert refSeqs == querySeqs - for i, ref in enumerate(refSeqs): - for j in range(i + 1, len(refSeqs)): - yield(refSeqs[j], ref) + if refSeqs != querySeqs: + raise RuntimeError('refSeqs must equal querySeqs for db building (self = true)') + for i in range(num_ref): + for j in range(i + 1, num_ref): + yield(j, i) else: - for query in querySeqs: - for ref in refSeqs: - yield(ref, query) + comparisons = [(0,0)] * (len(refSeqs) * len(querySeqs)) + for i in range(num_query): + for j in range(num_ref): + yield(j, i) + + return comparisons def isolateNameToLabel(names): """Function to process isolate names to labels @@ -97,7 +105,7 @@ def isolateNameToLabel(names): if args.tree is not None: oFile.write("\t" + 'Patristic') oFile.write("\n") - for i, (r_index, q_index) in enumerate(iterDistRows(r_names, q_names, r_names == q_names)): + for i, (r_index, q_index) in enumerate(listDistInts(r_names, q_names, r_names == q_names)): oFile.write("\t".join([q_names[q_index], r_names[r_index], str(X[i,0]), str(X[i,1])])) if args.tree is not None: oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index]))) From 25f74fb45b2a6dd839fb120cdbb43945faedcda2 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Feb 2021 16:29:15 +0000 Subject: [PATCH 051/327] Save/copy model with assign + update --- PopPUNK/assign.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 5e5808df..7e9c6803 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -257,6 +257,11 @@ def assign_query(dbFuncs, pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) storePickle(combined_seq, combined_seq, True, complete_distMat, dists_out) + # Copy model if needed + if output != model.outPrefix: + model.outPrefix = output + model.save() + # Clique pruning if model.type != 'lineage': dbOrder = refList + qNames From c1f4c5a23fecc5d099529c5948642dfa8569ed4c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Feb 2021 09:28:57 +0000 Subject: [PATCH 052/327] Fix distance file names --- PopPUNK/__main__.py | 2 +- PopPUNK/assign.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index cb5f981a..11ab2ee3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -330,7 +330,7 @@ def main(): sys.stderr.write("Need to provide --ref-db where .h5 and .dists from " "--create-db mode were output") if args.distances is None: - distances = os.path.basename(args.ref_db) + "/" + args.ref_db + ".dists" + distances = args.ref_db + "/" + os.path.basename(args.ref_db) + ".dists" else: distances = args.distances if args.output is None: diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index ad13cd5c..13e77a81 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -456,7 +456,7 @@ def main(): setGtThreads(args.threads) if args.distances is None: - distances = os.path.basename(args.db) + "/" + args.db + ".dists" + distances = args.db + "/" + os.path.basename(args.db) + ".dists" else: distances = args.distances From d2c5d139846a0be0542b84095c5e4f8ff4406303 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Feb 2021 09:29:55 +0000 Subject: [PATCH 053/327] Fix name ordering function --- scripts/poppunk_extract_distances.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/poppunk_extract_distances.py b/scripts/poppunk_extract_distances.py index 80ff64b9..eb4805f1 100755 --- a/scripts/poppunk_extract_distances.py +++ 
b/scripts/poppunk_extract_distances.py @@ -61,8 +61,6 @@ def listDistInts(refSeqs, querySeqs, self=True): for j in range(num_ref): yield(j, i) - return comparisons - def isolateNameToLabel(names): """Function to process isolate names to labels appropriate for visualisation. From c91eb03f75502b6eb7af9d3ce95e22d7cf393b6f Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 18 Feb 2021 11:29:51 +0000 Subject: [PATCH 054/327] Fix ref name order in assign + update --- PopPUNK/__main__.py | 31 +++++++++++++++---------------- PopPUNK/assign.py | 28 +++++++++++++++++++++------- PopPUNK/sketchlib.py | 20 +++++++++++++++----- 3 files changed, 51 insertions(+), 28 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 11ab2ee3..359ac27b 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -280,31 +280,30 @@ def main(): # generate sketches and QC sequences createDatabaseDir(args.output, kmers) - seq_names = constructDatabase( - args.r_files, - kmers, - sketch_sizes, - args.output, - args.threads, - args.overwrite, - codon_phased = args.codon_phased, - calc_random = True) - - rNames = seq_names - qNames = seq_names - distMat = queryDatabase(rNames = rNames, - qNames = qNames, + seq_names_passing = \ + constructDatabase( + args.r_files, + kmers, + sketch_sizes, + args.output, + args.threads, + args.overwrite, + codon_phased = args.codon_phased, + calc_random = True) + + distMat = queryDatabase(rNames = seq_names_passing, + qNames = seq_names_passing, dbPrefix = args.output, queryPrefix = args.output, klist = kmers, self = True, number_plot_fits = args.plot_fit, threads = args.threads) - qcDistMat(distMat, rNames, qNames, args.max_a_dist) + qcDistMat(distMat, seq_names_passing, seq_names_passing, args.max_a_dist) # Save results dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" - storePickle(rNames, qNames, True, distMat, dists_out) + storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) # Plot results plot_scatter(distMat, diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 13e77a81..b4ab7c00 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -248,14 +248,28 @@ def assign_query(dbFuncs, else: distanceFiles = distances - refList, refList_copy, self, rrDistMat = readPickle(distanceFiles, - enforce_self = True) + # Load the previous distances + refList_loaded, refList_copy, self, rrDistMat = \ + readPickle(distanceFiles, + enforce_self = True) + # qrDistMat: order of ref labels is the same as in the database (usually + # ordered). Order in original rrDistMat is arbitrary, leading to an + # awkwardness here. 
We prefer to reorder the qrDistMat to match, as it is + # usually smaller and has a simpler layout in long form + # At the end, rNames is updated to match what has been loaded + if refList_loaded != rNames: + match_order = [rNames.index(i) for i in refList_loaded] * len(qNames) + for q_offset in range(len(qNames)): + for r_offset in range(len(rNames)): + match_order[q_offset * len(rNames) + r_offset] += q_offset * len(rNames) + qrDistMat = qrDistMat[match_order, :] + rNames = refList_loaded combined_seq, core_distMat, acc_distMat = \ - update_distance_matrices(refList, rrDistMat, + update_distance_matrices(rNames, rrDistMat, qNames, qrDistMat, qqDistMat, threads = threads) - assert combined_seq == refList + qNames + assert combined_seq == rNames + qNames # Get full distance matrix and save complete_distMat = \ @@ -270,10 +284,10 @@ def assign_query(dbFuncs, # Clique pruning if model.type != 'lineage': - dbOrder = refList + qNames + dbOrder = rNames + qNames newRepresentativesIndices, newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ - extractReferences(genomeNetwork, dbOrder, output, refList, threads = threads) + extractReferences(genomeNetwork, dbOrder, output, rNames, threads = threads) # intersection that maintains order newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] @@ -293,7 +307,7 @@ def assign_query(dbFuncs, output + "/" + os.path.basename(output) + ".refs.h5") # ensure sketch and distMat order match - assert postpruning_combined_seq == refList + newQueries + assert postpruning_combined_seq == rNames + newQueries else: storePickle(rNames, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 96509c3d..60f42a54 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -388,6 +388,10 @@ def constructDatabase(assemblyList, klist, sketch_size, oPrefix, deviceid (int) GPU device id (default = 0) + Returns: + names (list) + List of names included in the database (some may be pruned due + to QC) """ # read file names names, sequences = readRfile(assemblyList) @@ -417,6 +421,7 @@ def constructDatabase(assemblyList, klist, sketch_size, oPrefix, # QC sequences if qc_dict['run_qc']: filtered_names = sketchlibAssemblyQC(oPrefix, + names, klist, qc_dict, strand_preserved, @@ -567,13 +572,15 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num return distMat -def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): +def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads): """Calculates random match probability based on means of genomes in assemblyList, and looks for length outliers. 
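# [Illustrative sketch, not part of the patch series] A toy walk-through of
# the row permutation introduced above for assign.py: qrDistMat is laid out
# query-major, one block of len(rNames) rows per query, so the same
# within-block permutation is repeated with a per-block offset. The sample
# names and values here are made up.
rNames = ["s1", "s2", "s3"]          # order from the new query run
refList_loaded = ["s2", "s1", "s3"]  # order stored with the old distances
qNames = ["q1", "q2"]

match_order = [rNames.index(i) for i in refList_loaded] * len(qNames)
for q_offset in range(len(qNames)):
    for r_offset in range(len(rNames)):
        match_order[q_offset * len(rNames) + r_offset] += q_offset * len(rNames)

print(match_order)  # [1, 0, 2, 4, 3, 5] -- block [1, 0, 2], then shifted by 3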
Args: prefix (str) Prefix of output files + names (list) + Names of samples to QC klist (list) List of k-mer sizes to sketch qc_dict (dict) @@ -605,10 +612,11 @@ def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): # iterate through sketches for dataset in read_grp: - # test thresholds - remove = False - seq_length[dataset] = hdf_in['sketches'][dataset].attrs['length'] - seq_ambiguous[dataset] = hdf_in['sketches'][dataset].attrs['missing_bases'] + if dataset in names: + # test thresholds + remove = False + seq_length[dataset] = hdf_in['sketches'][dataset].attrs['length'] + seq_ambiguous[dataset] = hdf_in['sketches'][dataset].attrs['missing_bases'] # calculate thresholds # get mean length @@ -692,6 +700,8 @@ def sketchlibAssemblyQC(prefix, klist, qc_dict, strand_preserved, threads): del hdf_in['random'] hdf_in.close() + # This gives back retained in the same order as names + retained = [x for x in names if x in frozenset(retained)] return retained def fitKmerCurve(pairwise, klist, jacobian): From 353dd1df6f905cb08d04fa1f6b2fdde7b78b494c Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 18 Feb 2021 11:34:32 +0000 Subject: [PATCH 055/327] Add sort on readRfile --- PopPUNK/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 4bf042ce..f15fe72c 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -413,6 +413,14 @@ def readRfile(rFile, oneSeq=False): sys.stderr.write("Non-unique names are " + ",".join(dupes) + "\n") sys.exit(1) + # Names are sorted on return + # We have had issues (though they should be fixed) with unordered input + # not matching the database. This should help simplify things + list_iterable = zip(names, sequences) + sorted_names = sorted(list_iterable) + tuples = zip(*sorted_names) + names, sequences = [list(tuple) for tuple in tuples] + return (names, sequences) def isolateNameToLabel(names): From db54c7a8830e3f29ad7020cbce9acf5d13529c70 Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 18 Feb 2021 11:48:42 +0000 Subject: [PATCH 056/327] Add test of distance order --- test/clean_test.py | 5 ++++- test/rfile1.txt | 3 +++ test/rfile12.txt | 6 +++++ test/rfile2.txt | 3 +++ test/run_test.py | 22 ++++++++++-------- test/test-update.py | 37 +++++++++++++++++++++++++++++++ test/{test_web.py => test-web.py} | 0 7 files changed, 66 insertions(+), 10 deletions(-) create mode 100644 test/rfile1.txt create mode 100644 test/rfile12.txt create mode 100644 test/rfile2.txt create mode 100755 test/test-update.py rename test/{test_web.py => test-web.py} (100%) diff --git a/test/clean_test.py b/test/clean_test.py index 3ecc96a1..29852e14 100755 --- a/test/clean_test.py +++ b/test/clean_test.py @@ -40,7 +40,10 @@ def deleteDir(dirname): "example_tsne", "example_prune", "example_refs", - "example_api" + "example_api", + "batch1", + "batch2", + "batch12" ] for outDir in outputDirs: deleteDir(outDir) diff --git a/test/rfile1.txt b/test/rfile1.txt new file mode 100644 index 00000000..4f388da2 --- /dev/null +++ b/test/rfile1.txt @@ -0,0 +1,3 @@ +7 12673_8#24.contigs_velvet.fa +1 12673_8#34.contigs_velvet.fa +2 12673_8#43.contigs_velvet.fa diff --git a/test/rfile12.txt b/test/rfile12.txt new file mode 100644 index 00000000..e4f63584 --- /dev/null +++ b/test/rfile12.txt @@ -0,0 +1,6 @@ +7 12673_8#24.contigs_velvet.fa +1 12673_8#34.contigs_velvet.fa +2 12673_8#43.contigs_velvet.fa +6 12754_4#79.contigs_velvet.fa +4 12754_4#85.contigs_velvet.fa +5 12754_4#89.contigs_velvet.fa diff --git a/test/rfile2.txt 
b/test/rfile2.txt new file mode 100644 index 00000000..5f6e9a24 --- /dev/null +++ b/test/rfile2.txt @@ -0,0 +1,3 @@ +6 12754_4#79.contigs_velvet.fa +4 12754_4#85.contigs_velvet.fa +5 12754_4#89.contigs_velvet.fa diff --git a/test/run_test.py b/test/run_test.py index 2337a282..c6ed75e3 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -25,6 +25,10 @@ sys.stderr.write("Running database QC test (--create-db)\n") subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files references.txt --min-k 13 --k-step 3 --output example_qc --qc-filter continue --length-range 2000000 3000000 --overwrite", shell=True, check=True) +# test updating order is correct +sys.stderr.write("Running distance matrix order check (--update-db)\n") +subprocess.run(python_cmd + " test-update.py", shell=True, check=True) + #fit GMM sys.stderr.write("Running GMM model fit (--fit-model gmm)\n") subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model bgmm --ref-db example_db --K 4 --overwrite", shell=True, check=True) @@ -35,11 +39,11 @@ #refine model with GMM sys.stderr.write("Running model refinement (--fit-model refine)\n") -subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) -subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both", shell=True, check=True) -subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) -subprocess.run("python ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2", shell=True, check=True) -subprocess.run("python ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold", shell=True, check=True) +subprocess.run(python_cmd + "../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both", shell=True, check=True) +subprocess.run(python_cmd + "../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold", shell=True, check=True) # lineage clustering sys.stderr.write("Running lineage clustering test (--fit-model lineage)\n") @@ -51,7 +55,7 @@ # tests of other command line programs sys.stderr.write("Testing C++ extension\n") -subprocess.run("python test-refine.py", shell=True, check=True) +subprocess.run(python_cmd + " test-refine.py", shell=True, check=True) #assign query sys.stderr.write("Running query assignment\n") @@ -73,8 +77,8 @@ # MST sys.stderr.write("Running MST\n") -subprocess.run("python ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree mst", shell=True, check=True) -subprocess.run("python ../poppunk_mst-runner.py --distance-pkl 
example_db/example_db.dists.pkl --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree mst", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_mst-runner.py --distance-pkl example_db/example_db.dists.pkl --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot", shell=True, check=True) # t-sne sys.stderr.write("Running tsne viz\n") @@ -90,7 +94,7 @@ # web API sys.stderr.write("Running API tests\n") -subprocess.run(python_cmd + " test_web.py", shell=True, check=True) +subprocess.run(python_cmd + " test-web.py", shell=True, check=True) sys.stderr.write("Tests completed\n") diff --git a/test/test-update.py b/test/test-update.py new file mode 100755 index 00000000..38c950c2 --- /dev/null +++ b/test/test-update.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# Copyright 2018-2020 John Lees and Nick Croucher + +"""Tests for PopPUNK""" + +import subprocess +import os +import sys +import shutil + +import numpy as np +from scipy import stats + +if os.environ.get("POPPUNK_PYTHON"): + python_cmd = os.environ.get("POPPUNK_PYTHON") +else: + python_cmd = "python" + +def run_regression(x, y, threshold = 0.99): + res = stats.linregress(x, y) + if res.rvalue**2 < threshold: + sys.stderr.write("Dist order failed: R^2 = " + str(res.rvalue**2) + "\n") + sys.exit(1) + +# Check that order is the same after doing 1 + 2 with --update-db, as doing all of 1 + 2 together +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile12.txt --output batch12 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile1.txt --output batch1 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch1 --query rfile2.txt --output batch2 --update-db --overwrite", shell=True, check=True) + +X1 = np.load("batch12/batch12.dists.npy") +X2 = np.load("batch2/batch2.dists.npy") + +run_regression(X1[:, 0], X2[:, 0]) +run_regression(X1[:, 1], X2[:, 1]) + + diff --git a/test/test_web.py b/test/test-web.py similarity index 100% rename from test/test_web.py rename to test/test-web.py From 2ca5dec552bccfa26169fb8b15f117aa5bac00ad Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 18 Feb 2021 12:47:56 +0000 Subject: [PATCH 057/327] Requery in update test --- test/test-update.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/test/test-update.py b/test/test-update.py index 38c950c2..9ffc9192 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -1,15 +1,20 @@ #!/usr/bin/env python -# Copyright 2018-2020 John Lees and Nick Croucher +# Copyright 2018-2021 John Lees and Nick Croucher -"""Tests for PopPUNK""" +"""Tests for PopPUNK --update-db order""" import subprocess -import os +import os, sys import sys import shutil +import pickle import numpy as np from scipy import stats +import h5py + +sys.path.insert(0, '/Users/jlees/Documents/Imperial/pp-sketchlib/build/lib.macosx-10.9-x86_64-3.8') +import pp_sketchlib if os.environ.get("POPPUNK_PYTHON"): python_cmd = 
os.environ.get("POPPUNK_PYTHON") @@ -18,8 +23,9 @@ def run_regression(x, y, threshold = 0.99): res = stats.linregress(x, y) + print("R^2: " + str(res.rvalue**2)) if res.rvalue**2 < threshold: - sys.stderr.write("Dist order failed: R^2 = " + str(res.rvalue**2) + "\n") + sys.stderr.write("Distance matrix order failed!\n") sys.exit(1) # Check that order is the same after doing 1 + 2 with --update-db, as doing all of 1 + 2 together @@ -28,9 +34,20 @@ def run_regression(x, y, threshold = 0.99): subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch1 --query rfile2.txt --output batch2 --update-db --overwrite", shell=True, check=True) -X1 = np.load("batch12/batch12.dists.npy") +# Load updated distances X2 = np.load("batch2/batch2.dists.npy") - +with open("batch2/batch2.dists.pkl", 'rb') as pickle_file: + rlist2, qlist, self = pickle.load(pickle_file) + +# Get same distances from the full database +ref_db = "batch12/batch12" +ref_h5 = h5py.File(ref_db + ".h5", 'r') +db_kmers = sorted(ref_h5['sketches/' + rlist2[0]].attrs['kmers']) +ref_h5.close() +X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist2, rlist2, db_kmers, + True, False, 1, False, 0) + +# Check distances match run_regression(X1[:, 0], X2[:, 0]) run_regression(X1[:, 1], X2[:, 1]) From 0e33312e940e7f5fcd2a5481a571d1c3de688796 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Feb 2021 13:22:08 +0000 Subject: [PATCH 058/327] Fix refine model test --- test/run_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index c6ed75e3..ef173f39 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -39,7 +39,7 @@ #refine model with GMM sys.stderr.write("Running model refinement (--fit-model refine)\n") -subprocess.run(python_cmd + "../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both", shell=True, check=True) subprocess.run(python_cmd + "../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2", shell=True, check=True) From dfc83fae0e56566fdbacd4de8b42b96f533ab206 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Feb 2021 13:32:35 +0000 Subject: [PATCH 059/327] Fix second lineage test --- test/run_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index ef173f39..a72b450d 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -41,7 +41,7 @@ sys.stderr.write("Running model refinement (--fit-model refine)\n") subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite 
--indiv-refine both", shell=True, check=True) -subprocess.run(python_cmd + "../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold", shell=True, check=True) From 31dcbfd2bb4f19ce05c79ba40a7dbcd1815e3a59 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Feb 2021 15:08:55 +0000 Subject: [PATCH 060/327] Add display cluster option for MST visualisation --- PopPUNK/visualise.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 9b135448..d977abc9 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -358,6 +358,10 @@ def generate_visualisations(query_db, if not overwrite: existing_tree = load_tree(output, "MST", distances=mst_distances) if existing_tree is None: + # Get a default clustering if none provided + if display_cluster is None: + display_cluster = list(isolateClustering.keys())[0] + # Get distance matrix complete_distMat = \ np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) @@ -371,7 +375,7 @@ def generate_visualisations(query_db, weights_type=mst_distances, summarise=False) mst_graph = generate_minimum_spanning_tree(G) - drawMST(mst_graph, output, isolateClustering, overwrite) + drawMST(mst_graph, output, isolateClustering, display_cluster, overwrite) mst_tree = mst_to_phylogeny(mst_graph, isolateNameToLabel(combined_seq)) else: mst_tree = existing_tree From d396870e9c98c9d8d914947309d09eff6fa710b9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Feb 2021 16:26:02 +0000 Subject: [PATCH 061/327] Add test for querying updated database --- test/rfile123.txt | 9 +++++++++ test/rfile3.txt | 3 +++ test/test-update.py | 20 ++++++++++++++++++++ 3 files changed, 32 insertions(+) create mode 100644 test/rfile123.txt create mode 100644 test/rfile3.txt diff --git a/test/rfile123.txt b/test/rfile123.txt new file mode 100644 index 00000000..af5a0ead --- /dev/null +++ b/test/rfile123.txt @@ -0,0 +1,9 @@ +7 12673_8#24.contigs_velvet.fa +1 12673_8#34.contigs_velvet.fa +2 12673_8#43.contigs_velvet.fa +6 12754_4#79.contigs_velvet.fa +4 12754_4#85.contigs_velvet.fa +5 12754_4#89.contigs_velvet.fa +8 12754_5#73.contigs_velvet.fa +3 12754_5#78.contigs_velvet.fa +9 12754_5#71.contigs_velvet.fa diff --git a/test/rfile3.txt b/test/rfile3.txt new file mode 100644 index 00000000..23104358 --- /dev/null +++ b/test/rfile3.txt @@ -0,0 +1,3 @@ +8 12754_5#73.contigs_velvet.fa +3 12754_5#78.contigs_velvet.fa +9 12754_5#71.contigs_velvet.fa diff --git a/test/test-update.py b/test/test-update.py index 9ffc9192..6ec36309 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -51,4 +51,24 @@ def run_regression(x, y, threshold = 0.99): run_regression(X1[:, 0], X2[:, 0]) run_regression(X1[:, 1], X2[:, 1]) +# Check that order is the same after doing 1 + 2 + 3 with --update-db, as doing all of 1 + 2 + 3 together +subprocess.run(python_cmd + " 
../poppunk-runner.py --create-db --r-files rfile123.txt --output batch123 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch123 --ranks 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch2 --query rfile3.txt --output batch3 --update-db --overwrite", shell=True, check=True) +# Load updated distances +X2 = np.load("batch3/batch3.dists.npy") +with open("batch3/batch3.dists.pkl", 'rb') as pickle_file: + rlist3, qlist, self = pickle.load(pickle_file) + +# Get same distances from the full database +ref_db = "batch123/batch123" +ref_h5 = h5py.File(ref_db + ".h5", 'r') +db_kmers = sorted(ref_h5['sketches/' + rlist3[0]].attrs['kmers']) +ref_h5.close() +X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist3, rlist3, db_kmers, + True, False, 1, False, 0) + +# Check distances match +run_regression(X1[:, 0], X2[:, 0]) +run_regression(X1[:, 1], X2[:, 1]) From 44ec04d8ff08867872625bb7e8ba2101ff0889b8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Feb 2021 19:48:32 +0000 Subject: [PATCH 062/327] Remove hard coded file path --- test/test-update.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test-update.py b/test/test-update.py index 6ec36309..28c986f8 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -13,7 +13,6 @@ from scipy import stats import h5py -sys.path.insert(0, '/Users/jlees/Documents/Imperial/pp-sketchlib/build/lib.macosx-10.9-x86_64-3.8') import pp_sketchlib if os.environ.get("POPPUNK_PYTHON"): From 5c6ebf5a04119bdd717c71460c1034c1b0ded855 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 19 Feb 2021 07:24:40 +0000 Subject: [PATCH 063/327] Add sparse distance matrix update testing --- test/test-update.py | 61 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/test/test-update.py b/test/test-update.py index 28c986f8..22aa630e 100755 --- a/test/test-update.py +++ b/test/test-update.py @@ -12,6 +12,7 @@ import numpy as np from scipy import stats import h5py +import scipy.sparse import pp_sketchlib @@ -27,10 +28,43 @@ def run_regression(x, y, threshold = 0.99): sys.stderr.write("Distance matrix order failed!\n") sys.exit(1) +def compare_sparse_matrices(d1,d2,r1,r2): + d1_pairs = get_seq_tuples(d1.row,d1.col,r1) + d2_pairs = get_seq_tuples(d2.row,d2.col,r2) + d1_dists = [] + d2_dists = [] + + for (pair1,dist1) in zip(d1_pairs,d1.data): + for (pair2,dist2) in zip(d2_pairs,d2.data): + if pair1 == pair2: + d1_dists.append(dist1) + d2_dists.append(dist2) + break + + run_regression(np.asarray(d1_dists),np.asarray(d2_dists)) + +def get_seq_tuples(rows,cols,names): + tuple_list = [] + for (i,j) in zip(rows,cols): + sorted_pair = tuple(sorted((names[i],names[j]))) + tuple_list.append(sorted_pair) + return tuple_list + +def old_get_seq_tuples(rows,cols): + max_seqs = np.maximum(rows,cols) + min_seqs = np.minimum(rows,cols) + concat_seqs = np.vstack((max_seqs,min_seqs)) + seq_pairs = concat_seqs.T + seq_tuples = [tuple(row) for row in seq_pairs] + return seq_tuples + +# Check distances after one query + # Check that order is the same after doing 1 + 2 with --update-db, as doing all of 1 + 2 together subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile12.txt --output batch12 --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch12 --ranks 1,2", shell=True, check=True) subprocess.run(python_cmd + " 
../poppunk-runner.py --create-db --r-files rfile1.txt --output batch1 --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1,2", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch1 --query rfile2.txt --output batch2 --update-db --overwrite", shell=True, check=True) # Load updated distances @@ -50,24 +84,41 @@ def run_regression(x, y, threshold = 0.99): run_regression(X1[:, 0], X2[:, 0]) run_regression(X1[:, 1], X2[:, 1]) +# Check sparse distances after one query +with open("batch12/batch12.dists.pkl", 'rb') as pickle_file: + rlist1, qlist1, self = pickle.load(pickle_file) +S1 = scipy.sparse.load_npz("batch12/batch12_rank2_fit.npz") +S2 = scipy.sparse.load_npz("batch2/batch2_rank2_fit.npz") +compare_sparse_matrices(S1,S2,rlist1,rlist2) + +# Check distances after second query + # Check that order is the same after doing 1 + 2 + 3 with --update-db, as doing all of 1 + 2 + 3 together subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile123.txt --output batch123 --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch123 --ranks 1", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch123 --ranks 1,2", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch2 --query rfile3.txt --output batch3 --update-db --overwrite", shell=True, check=True) # Load updated distances X2 = np.load("batch3/batch3.dists.npy") with open("batch3/batch3.dists.pkl", 'rb') as pickle_file: - rlist3, qlist, self = pickle.load(pickle_file) + rlist4, qlist, self = pickle.load(pickle_file) # Get same distances from the full database ref_db = "batch123/batch123" ref_h5 = h5py.File(ref_db + ".h5", 'r') -db_kmers = sorted(ref_h5['sketches/' + rlist3[0]].attrs['kmers']) +db_kmers = sorted(ref_h5['sketches/' + rlist4[0]].attrs['kmers']) ref_h5.close() -X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist3, rlist3, db_kmers, +X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist4, rlist4, db_kmers, True, False, 1, False, 0) # Check distances match run_regression(X1[:, 0], X2[:, 0]) run_regression(X1[:, 1], X2[:, 1]) + +# Check sparse distances after second query +with open("batch123/batch123.dists.pkl", 'rb') as pickle_file: + rlist3, qlist, self = pickle.load(pickle_file) +S3 = scipy.sparse.load_npz("batch123/batch123_rank2_fit.npz") +S4 = scipy.sparse.load_npz("batch3/batch3_rank2_fit.npz") + +compare_sparse_matrices(S3,S4,rlist3,rlist4) From d4dd508e35c998bd051539c62a9a7f38147a7e6d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 19 Feb 2021 19:14:42 +0000 Subject: [PATCH 064/327] Add maximum core distance --- PopPUNK/__main__.py | 6 ++++-- PopPUNK/utils.py | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 359ac27b..3639cf84 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -94,6 +94,8 @@ def get_options(): 'separate database [default = False]', default=False, action='store_true') qcGroup.add_argument('--max-a-dist', help='Maximum accessory distance to permit [default = 0.5]', default = 0.5, type = float) + qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', + default = 
0.5, type = float) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = 5, type = int) qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded ' @@ -299,7 +301,7 @@ def main(): self = True, number_plot_fits = args.plot_fit, threads = args.threads) - qcDistMat(distMat, seq_names_passing, seq_names_passing, args.max_a_dist) + qcDistMat(distMat, seq_names_passing, seq_names_passing, args.max_pi_dist, args.max_a_dist) # Save results dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" @@ -353,7 +355,7 @@ def main(): # Load the distances refList, queryList, self, distMat = readPickle(distances, enforce_self=True) - if qcDistMat(distMat, refList, queryList, args.max_a_dist) == False \ + if qcDistMat(distMat, refList, queryList, args.max_pi_dist, args.max_a_dist) == False \ and args.qc_filter == "stop": sys.stderr.write("Distances failed quality control (change QC options to run anyway)\n") sys.exit(1) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index f15fe72c..dffc7b06 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -199,7 +199,7 @@ def listDistInts(refSeqs, querySeqs, self=True): return comparisons -def qcDistMat(distMat, refList, queryList, a_max): +def qcDistMat(distMat, refList, queryList, c_max, a_max): """Checks distance matrix for outliers. At the moment just a threshold for accessory distance @@ -210,6 +210,8 @@ def qcDistMat(distMat, refList, queryList, a_max): Reference labels queryList (list) Query labels (or refList if self) + c_max (float) + Maximum core distance to allow a_max (float) Maximum accessory distance to allow @@ -224,7 +226,7 @@ def qcDistMat(distMat, refList, queryList, a_max): passed = False names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): - if distMat[i,1] > a_max: + if distMat[i,0] > c_max or distMat[i,1] > a_max: sys.stderr.write("WARNING: Accessory outlier at a=" + str(distMat[i,1]) + " 1:" + ref + " 2:" + query + "\n") @@ -500,4 +502,4 @@ def decisionBoundary(intercept, gradient): """ x = intercept[0] + intercept[1] * gradient y = intercept[1] + intercept[0] / gradient - return(x, y) \ No newline at end of file + return(x, y) From f592701f27c179cbde79b7325af5ec5753955b4d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 19 Feb 2021 19:30:06 +0000 Subject: [PATCH 065/327] Fix search for outlier core distances --- PopPUNK/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index dffc7b06..ebc239b7 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -222,12 +222,12 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max): passed = True # First check with numpy, which is quicker than iterating over everything - if np.any(distMat[:,1] > a_max): + if np.any(distMat[:,1] > a_max) or np.any(distMat[:,0] > c_max): passed = False names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): if distMat[i,0] > c_max or distMat[i,1] > a_max: - sys.stderr.write("WARNING: Accessory outlier at a=" + str(distMat[i,1]) + + sys.stderr.write("WARNING: Accessory outlier at c = " + str(distMat[i,0]) + " a = " + str(distMat[i,1]) + " 1:" + ref + " 2:" + query + "\n") return passed From 7af793e88a199a153183a378a2571911a0cef79c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Feb 2021 20:32:19 +0000 Subject: 
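
Patches 064 and 065 hinge on the same two-stage test: a vectorised numpy screen over the whole distance matrix, followed by a per-pair loop that names the offenders. A self-contained sketch of the screen (function name and values are illustrative):

    import numpy as np

    def find_outliers(distMat, c_max=0.5, a_max=0.5):
        # Column 0 holds core distances, column 1 accessory distances;
        # flag any row exceeding either threshold
        bad = (distMat[:, 0] > c_max) | (distMat[:, 1] > a_max)
        return np.flatnonzero(bad)

    dists = np.array([[0.01, 0.10], [0.70, 0.20], [0.02, 0.90]])
    print(find_outliers(dists))  # [1 2]
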
[PATCH 066/327] Avoid tree redrawing --- PopPUNK/visualise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index d977abc9..4517b46f 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -95,7 +95,7 @@ def get_options(): faGroup.add_argument('--phandango', help='Generate phylogeny and TSV for Phandango visualisation', default=False, action='store_true') faGroup.add_argument('--grapetree', help='Generate phylogeny and CSV for grapetree visualisation', default=False, action='store_true') faGroup.add_argument('--tree', help='Type of tree to calculate [default = nj]', type=str, default='nj', - choices=['nj', 'mst', 'both']) + choices=['nj', 'mst', 'both', 'none']) faGroup.add_argument('--mst-distances', help='Distances used to calculate a minimum spanning tree [default = core]', type=str, default='core', choices=accepted_weights_types) faGroup.add_argument('--rapidnj', help='Path to rapidNJ binary to build NJ tree for Microreact', default='rapidnj') From 06d9f152561b1d2cda45627aaaa7efc82395f09d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Feb 2021 21:12:47 +0000 Subject: [PATCH 067/327] Allow for pruning based on distances to a reference --- PopPUNK/__main__.py | 36 +++++++++++++++++++++++++++++++----- PopPUNK/utils.py | 19 +++++++++++++++---- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 3639cf84..8c67e9f2 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -96,6 +96,8 @@ def get_options(): default = 0.5, type = float) qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', default = 0.5, type = float) + qcGroup.add_argument('--reference-isolate', help='Isolate from which distances can be calculated for pruning [default = None]', + default = None, type = str) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = 5, type = int) qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded ' @@ -301,11 +303,35 @@ def main(): self = True, number_plot_fits = args.plot_fit, threads = args.threads) - qcDistMat(distMat, seq_names_passing, seq_names_passing, args.max_pi_dist, args.max_a_dist) - - # Save results - dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" - storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) + names_to_remove = qcDistMat(distMat, + seq_names_passing, + seq_names_passing, + args.max_pi_dist, + args.max_a_dist, + args.reference_isolate) + + # prune based on distance from reference if provided + if args.reference_isolate is not None and args.qc_filter == "prune": + # Remove sketches + db_name = args.output + '/' + os.path.basename(args.output) + '.h5' + filtered_db_name = args.output + '/' + 'filtered.' 
+ os.path.basename(args.output) + '.h5' + removeFromDB(db_name, + filtered_db_name, + names_to_remove, + full_names = True) + os.rename(filtered_db_name, db_name) + # Remove from distance matrix + prune_distance_matrix(seq_names_passing, + names_to_remove, + distMat, + args.output + "/" + os.path.basename(args.output) + ".dists") + # Remove from reflist + seq_names_passing = [seq_names_passing.remove(x) for x in names_to_remove] + sys.stderr.write("Successfully removed from the database: " + str(names_to_remove)) + else: + # Save results + dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" + storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) # Plot results plot_scatter(distMat, diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index ebc239b7..777bd495 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -199,7 +199,7 @@ def listDistInts(refSeqs, querySeqs, self=True): return comparisons -def qcDistMat(distMat, refList, queryList, c_max, a_max): +def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate): """Checks distance matrix for outliers. At the moment just a threshold for accessory distance @@ -214,12 +214,15 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max): Maximum core distance to allow a_max (float) Maximum accessory distance to allow + ref_isolate (str) + Name of reference from which pruning can occur Returns: passed (bool) False if any samples failed """ passed = True + to_prune = [] # First check with numpy, which is quicker than iterating over everything if np.any(distMat[:,1] > a_max) or np.any(distMat[:,0] > c_max): @@ -227,10 +230,18 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max): names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): if distMat[i,0] > c_max or distMat[i,1] > a_max: - sys.stderr.write("WARNING: Accessory outlier at c = " + str(distMat[i,0]) + " a = " + str(distMat[i,1]) + + sys.stderr.write("WARNING: Outlier at c = " + str(distMat[i,0]) + " a = " + str(distMat[i,1]) + " 1:" + ref + " 2:" + query + "\n") - - return passed + if ref_isolate is not None: + if ref == ref_isolate: + to_prune.append(query) + elif query == ref_isolate: + to_prune.append(ref) + + if ref_isolate is None: + return passed + else: + return to_prune def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): From 3bcca2ef781ad66ff722acec27dd18038facbb0e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Feb 2021 21:50:09 +0000 Subject: [PATCH 068/327] Fix filtering condition --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 8c67e9f2..25539884 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -311,7 +311,7 @@ def main(): args.reference_isolate) # prune based on distance from reference if provided - if args.reference_isolate is not None and args.qc_filter == "prune": + if args.reference_isolate is not None and len(names_to_remove) > 0 and args.qc_filter == "prune": # Remove sketches db_name = args.output + '/' + os.path.basename(args.output) + '.h5' filtered_db_name = args.output + '/' + 'filtered.' 
+ os.path.basename(args.output) + '.h5' From fde3e7d3dedd2719ed2157940de58929d81f8187 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Feb 2021 21:52:45 +0000 Subject: [PATCH 069/327] Add default reference isolate --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 777bd495..14ec39ad 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -199,7 +199,7 @@ def listDistInts(refSeqs, querySeqs, self=True): return comparisons -def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate): +def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): """Checks distance matrix for outliers. At the moment just a threshold for accessory distance From bd896a367274f55133d7253996f0ea01ac4050e2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 1 Mar 2021 13:52:48 +0000 Subject: [PATCH 070/327] Add QC options to assign --- PopPUNK/assign.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index b4ab7c00..fcb2eee3 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -36,6 +36,8 @@ def assign_query(dbFuncs, plot_fit, graph_weights, max_a_dist, + max_pi_dist, + reference_isolate, model_dir, strand_preserved, previous_clustering, @@ -142,7 +144,7 @@ def assign_query(dbFuncs, number_plot_fits = plot_fit, threads = threads) # QC distance matrix - qcPass = qcDistMat(qrDistMat, rNames, qNames, max_a_dist) + qcPass = qcDistMat(qrDistMat, rNames, qNames, max_c_dist, max_a_dist, reference_isolate) # Load the network based on supplied options genomeNetwork, old_cluster_file = \ @@ -368,6 +370,10 @@ def get_options(): 'separate database [default = False]', default=False, action='store_true') qcGroup.add_argument('--max-a-dist', help='Maximum accessory distance to permit [default = 0.5]', default = 0.5, type = float) + qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', + default = 0.5, type = float) + qcGroup.add_argument('--reference-isolate', help='Isolate from which distances can be calculated for pruning [default = None]', + default = None, type = str) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = None, type = int) qcGroup.add_argument('--length-range', help='Allowed length range, outside of which sequences will be excluded ' @@ -492,6 +498,8 @@ def main(): args.plot_fit, args.graph_weights, args.max_a_dist, + args.max_pi_dist, + args.reference_isolate, args.model_dir, args.strand_preserved, args.previous_clustering, From dbc6098410f84bd8ec994f70fcbda2feba8a9394 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 1 Mar 2021 14:01:12 +0000 Subject: [PATCH 071/327] Make pruning variable names consistent --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index fcb2eee3..b1b11236 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -144,7 +144,7 @@ def assign_query(dbFuncs, number_plot_fits = plot_fit, threads = threads) # QC distance matrix - qcPass = qcDistMat(qrDistMat, rNames, qNames, max_c_dist, max_a_dist, reference_isolate) + qcPass = qcDistMat(qrDistMat, rNames, qNames, max_pi_dist, max_a_dist, reference_isolate) # Load the network based on supplied options genomeNetwork, old_cluster_file = \ From 354bf97d50a5a4918defd802d2da1f6e59123302 Mon Sep 17 00:00:00 2001 From: 
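
One wrinkle in the pruning hunk of patch 067: list.remove() returns None, so rebuilding the passing list with [seq_names_passing.remove(x) for x in names_to_remove] rebinds it to a list of None values. A filtering comprehension avoids that; the names below are illustrative:

    names_to_remove = {"outlier_1", "outlier_2"}
    seq_names_passing = ["ref", "outlier_1", "sample_a", "outlier_2"]

    # Keep the survivors rather than mutating the list while rebinding it
    seq_names_passing = [x for x in seq_names_passing
                         if x not in names_to_remove]
    print(seq_names_passing)  # ['ref', 'sample_a']
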
nickjcroucher Date: Mon, 8 Mar 2021 13:26:48 +0000 Subject: [PATCH 072/327] Make data type in isolate clustering consistent --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 14ec39ad..33588739 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -289,7 +289,7 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): cluster_name = clustersCsv.columns[cls_idx] cluster_name = cluster_name.replace('__autocolour','') if return_dict: - clusters[cluster_name][row.Index] = str(row[cls_idx + 1]) + clusters[cluster_name][str(row.Index)] = str(row[cls_idx + 1]) else: if cluster_name not in clusters.keys(): clusters[cluster_name] = defaultdict(set) From d57141469198f17a7f04219410c1abd47d2e0037 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 09:45:59 +0000 Subject: [PATCH 073/327] Remove assumption that sparse matrices are symmetrical --- PopPUNK/network.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f77271ba..2c82fd81 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -369,9 +369,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, if edge_list: if weights is not None: for weight, (ref, query) in zip(weights, assignments): - # sparse matrix is symmetrical, avoid redundant loops - if ref < query: - connections.append((ref, query, weight)) + connections.append((ref, query, weight)) else: connections = assignments elif sparse_input is not None: From a6463fbb86b94655247a16b4863bc3397d164d58 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 10:12:31 +0000 Subject: [PATCH 074/327] Again remove assumption that sparse matrices are symmetrical --- PopPUNK/network.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2c82fd81..15074422 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -374,9 +374,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, connections = assignments elif sparse_input is not None: for ref, query, weight in zip(sparse_input.row, sparse_input.col, sparse_input.data): - # sparse matrix is symmetrical, avoid redundant loops - if ref < query: - connections.append((ref, query, weight)) + connections.append((ref, query, weight)) else: for row_idx, (assignment, (ref, query)) in enumerate(zip(assignments, listDistInts(rlist, qlist, From 4a255168c063d321ac0acde8a78db30109332f01 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 22:00:07 +0000 Subject: [PATCH 075/327] Add cugraph support for lineage graphs --- PopPUNK/__main__.py | 11 ++-- PopPUNK/network.py | 119 ++++++++++++++++++++++++++++++-------------- 2 files changed, 88 insertions(+), 42 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 25539884..48527953 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -155,6 +155,7 @@ def get_options(): other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]') other.add_argument('--gpu-sketch', default=False, action='store_true', help='Use a GPU when calculating sketches (read data only) [default = False]') other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]') + other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]') 
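
An illustrative invocation of the new flag, following the convention of the batch test scripts earlier in the series (python_cmd, the runner path and the database name are assumed from that context):

    subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage"
                   " --ref-db batch1 --ranks 1,2 --gpu-graph",
                   shell=True, check=True)
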
other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') other.add_argument('--version', action='version', @@ -455,7 +456,7 @@ def main(): queryList, assignments, model.within_label, - weights=weights) + weights = weights) else: # Lineage fit requires some iteration indivNetworks = {} @@ -471,13 +472,15 @@ def main(): refList, assignments[rank], 0, - edge_list=True, - weights=weights + edge_list = True, + weights = weights, + use_gpu = args.gpu_graph ) lineage_clusters[rank] = \ printClusters(indivNetworks[rank], refList, - printCSV = False) + printCSV = False, + use_gpu = args.gpu_graph) # print output of each rank as CSV overall_lineage = createOverallLineage(rank_list, lineage_clusters) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 15074422..611578c9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -309,7 +309,7 @@ def load_previous_network(prev_G_fn, rlist, weights=False): def constructNetwork(rlist, qlist, assignments, within_label, summarise = True, edge_list = False, weights = None, weights_type = 'euclidean', sparse_input = None, - previous_network = None): + previous_network = None, use_gpu = False): """Construct an unweighted, undirected network without self-loops. Nodes are samples and edges where samples are within the same cluster @@ -341,6 +341,8 @@ def constructNetwork(rlist, qlist, assignments, within_label, previous_network (str) Name of file containing a previous network to be integrated into this new network + use_gpu (bool) + Whether to use GPUs for network construction Returns: G (graph) @@ -408,37 +410,66 @@ def constructNetwork(rlist, qlist, assignments, within_label, edge_tuple = (ref, query) if ref < query: connections.append(edge_tuple) - - # build the graph - G = gt.Graph(directed = False) - G.add_vertex(len(vertex_labels)) - if weights is not None or sparse_input is not None: - eweight = G.new_ep("float") - G.add_edge_list(connections, eprops = [eweight]) - G.edge_properties["weight"] = eweight + # load GPU libraries if necessary + if use_gpu: + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + + # create DataFrame using edge tuples + if weights is not None or sparse_input is not None: + connections_df = pd.DataFrame(connections, columns =['source', 'destination', 'weights']) + else: + connections_df = pd.DataFrame(connections, columns =['source', 'destination']) + G_df = cudf.DataFrame.from_pandas(connections_df) + + # construct graph + G_cu = cugraph.Graph() + if weights is not None or sparse_input is not None: + G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) + else: + G_cu.from_cudf_edgelist(G_df, renumber=False) + quit() + return G_cu + else: - G.add_edge_list(connections) - - # add isolate ID to network - vid = G.new_vertex_property('string', - vals = vertex_labels) - G.vp.id = vid - - # print some summaries - if summarise: - (metrics, scores) = networkSummary(G) - sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]), - "\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]), - "\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]), - "\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]), - "\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]), - "\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]), - "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), - "\tScore (w/ weighted-betweenness)\t\t" + 
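
The GPU branch above assembles the network from a columnar edge list rather than from tuples added one by one. A pandas stand-in for the cudf frame (cudf.DataFrame takes the same arguments on device; the edges are illustrative):

    import pandas as pd

    connections = [(0, 1, 0.01), (1, 2, 0.02), (3, 4, 0.05)]
    G_df = pd.DataFrame(connections,
                        columns=['source', 'destination', 'weights'])
    # On device this frame is a cudf.DataFrame and is loaded with
    # cugraph.Graph().from_cudf_edgelist(G_df, edge_attr='weights',
    #                                    renumber=False)
    print(G_df)
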
"{:.4f}".format(scores[2])]) - + "\n") - - return G + + # build the graph + G = gt.Graph(directed = False) + G.add_vertex(len(vertex_labels)) + + if weights is not None or sparse_input is not None: + eweight = G.new_ep("float") + G.add_edge_list(connections, eprops = [eweight]) + G.edge_properties["weight"] = eweight + else: + G.add_edge_list(connections) + + # add isolate ID to network + vid = G.new_vertex_property('string', + vals = vertex_labels) + G.vp.id = vid + + # print some summaries + if summarise: + (metrics, scores) = networkSummary(G) + sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]), + "\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]), + "\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]), + "\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]), + "\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]), + "\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]), + "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), + "\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])]) + + "\n") + quit() + return G def networkSummary(G, calc_betweenness=True): """Provides summary values about the network @@ -621,7 +652,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, externalClusterCSV = None, printRef = True, printCSV = True, - clustering_type = 'combined'): + clustering_type = 'combined', use_gpu = False): """Get cluster assignments Also writes assignments to a CSV file @@ -650,6 +681,8 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, clustering_type (str) Type of clustering network, used for comparison with old clusters Default = 'combined' + use_gpu (bool) + Whether to use cugraph for network analysis Returns: clustering (dict) @@ -660,13 +693,23 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, raise RuntimeError("Trying to print query clusters with no query sequences") # get a sorted list of component assignments - component_assignments, component_frequencies = gt.label_components(G) - component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int) - newClusters = [set() for rank in range(len(component_frequency_ranks))] - for isolate_index, isolate_name in enumerate(rlist): - component = component_assignments.a[isolate_index] - component_rank = component_frequency_ranks[component] - newClusters[component_rank].add(isolate_name) + if use_gpu: + component_assignments = cugraph.components.connectivity.connected_components(G, directed = False) + component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) + newClusters = [set() for rank in range(component_frequencies.size)] + for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment + component = component_assignments[isolate_index] + component_rank = component_frequencies.index[component] + newClusters[component_rank].add(isolate_name) + else: + component_assignments, component_frequencies = gt.label_components(G) + component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int) + # use components to determine new clusters + newClusters = [set() for rank in range(len(component_frequency_ranks))] + for isolate_index, isolate_name in enumerate(rlist): + component = component_assignments.a[isolate_index] + component_rank = 
component_frequency_ranks[component] + newClusters[component_rank].add(isolate_name) oldNames = set() From c94fe04d2c4354f4da68f635b62304ee24351ed8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 22:09:45 +0000 Subject: [PATCH 076/327] Remove exit messages --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 611578c9..8bcbb136 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -435,7 +435,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: G_cu.from_cudf_edgelist(G_df, renumber=False) - quit() + return G_cu else: @@ -468,7 +468,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), "\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])]) + "\n") - quit() + return G def networkSummary(G, calc_betweenness=True): From db949c873b91cd69d304a671b51633b146f8e15b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 22:12:04 +0000 Subject: [PATCH 077/327] Load cugraph in processClusters --- PopPUNK/network.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 8bcbb136..5340302f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -694,6 +694,15 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, # get a sorted list of component assignments if use_gpu: + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + component_assignments = cugraph.components.connectivity.connected_components(G, directed = False) component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] From b1d153893c6babac7c2c208883c327041b043519 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 9 Mar 2021 22:13:42 +0000 Subject: [PATCH 078/327] Fix connected component command --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5340302f..f1c832f5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -703,7 +703,7 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, sys.stderr.write("cugraph and cudf unavailable\n") raise ImportError(e) - component_assignments = cugraph.components.connectivity.connected_components(G, directed = False) + component_assignments = cugraph.components.connectivity.connected_components(G) component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment From 31683e164bf05b3b6d1515fc3fb3d0301f6872d2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 06:07:37 +0000 Subject: [PATCH 079/327] Change cuDf index use --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f1c832f5..25998a01 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -707,7 +707,7 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, component_frequencies = 
component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment - component = component_assignments[isolate_index] + component = component_assignments['labels'][isolate_index] component_rank = component_frequencies.index[component] newClusters[component_rank].add(isolate_name) else: From 2ae71a84197c18ff84158feb70c286729a938ad5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 06:11:15 +0000 Subject: [PATCH 080/327] Integer value conversion update --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 25998a01..23901efc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -709,7 +709,7 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment component = component_assignments['labels'][isolate_index] component_rank = component_frequencies.index[component] - newClusters[component_rank].add(isolate_name) + newClusters[component_rank.astype(int)].add(isolate_name) else: component_assignments, component_frequencies = gt.label_components(G) component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int) From b703f7a1d5856da11fdfa110cf67894437d1c358 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 06:25:58 +0000 Subject: [PATCH 081/327] Integer value conversion --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 23901efc..9a9573c3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -709,7 +709,7 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment component = component_assignments['labels'][isolate_index] component_rank = component_frequencies.index[component] - newClusters[component_rank.astype(int)].add(isolate_name) + newClusters[component_rank.astype(int).item()].add(isolate_name) else: component_assignments, component_frequencies = gt.label_components(G) component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int) From 2e90d28cbfe2570a0f9513a4ea713ad839bfe9f2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 10:38:52 +0000 Subject: [PATCH 082/327] Process components correctly with cugraph --- PopPUNK/network.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9a9573c3..3e349f5c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -707,9 +707,10 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment - component = component_assignments['labels'][isolate_index] - component_rank = component_frequencies.index[component] - newClusters[component_rank.astype(int).item()].add(isolate_name) + component = component_assignments['labels'].iloc[isolate_index].item() + component_rank_bool = component_frequencies.index == component + 
component_rank = np.argmax(component_rank_bool.to_array()) + newClusters[component_rank].add(isolate_name) else: component_assignments, component_frequencies = gt.label_components(G) component_frequency_ranks = len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int) From 0906f0c68f25119aa44fba1a741c14c2c55419c3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 10:55:28 +0000 Subject: [PATCH 083/327] Allow for checking of cugraphs --- PopPUNK/__main__.py | 4 +++- PopPUNK/network.py | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 48527953..d380d101 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -195,6 +195,7 @@ def main(): from .network import constructNetwork from .network import extractReferences from .network import printClusters + from .network import get_vertex_list from .plot import writeClusterCsv from .plot import plot_scatter @@ -495,7 +496,8 @@ def main(): genomeNetwork = indivNetworks[min(rank_list)] # Ensure all in dists are in final network - networkMissing = set(map(str,set(range(len(refList))).difference(list(genomeNetwork.vertices())))) + vertex_list = get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph) + networkMissing = set(map(str,set(range(len(refList))).difference(vertex_list))) if len(networkMissing) > 0: missing_isolates = [refList[m] for m in networkMissing] sys.stderr.write("WARNING: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3e349f5c..0cf4713f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -939,3 +939,25 @@ def generate_minimum_spanning_tree(G, from_cugraph = False): sys.stderr.write("Completed calculation of minimum-spanning tree\n") return mst_network + +def get_vertex_list(G, use_gpu = False): + """Generate a list of node indices + + Args: + G (network) + Graph tool network + use_gpu (bool) + Whether graph is a cugraph or not + [default = False] + + Returns: + vlist (list) + List of integers corresponding to nodes + """ + + if use_gpu: + vlist = G.nodes().tolist() + else: + vlist = list(G.vertices()) + + return vlist From c64a2c56b938b5cb622832ee5f7c4bb712c2dd1d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 11:00:50 +0000 Subject: [PATCH 084/327] Change cudf list conversion --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0cf4713f..79ed614c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -956,7 +956,7 @@ def get_vertex_list(G, use_gpu = False): """ if use_gpu: - vlist = G.nodes().tolist() + vlist = G.nodes().to_arrow().to_pylist() else: vlist = list(G.vertices()) From b84a269e1817090d2555d483a87b851ab07a7cad Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 11:25:07 +0000 Subject: [PATCH 085/327] Update printClusters flags --- PopPUNK/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index d380d101..bae50c16 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -506,7 +506,8 @@ def main(): isolateClustering = {fit_type: printClusters(genomeNetwork, refList, output + "/" + os.path.basename(output), - externalClusterCSV = args.external_clustering)} + externalClusterCSV = args.external_clustering, + use_gpu = args.gpu_graph)} # Write core and accessory based clusters, if they 
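
The sequence of fixes above (patches 079 to 082) converges on ranking component labels by size, so that cluster numbering always starts from the largest component. A pandas sketch of that ranking (cudf exposes the same value_counts interface on GPU; the labels are illustrative):

    import pandas as pd

    labels = pd.Series([3, 3, 3, 7, 7, 1])  # component label per isolate
    freq = labels.value_counts(sort=True, ascending=False)
    rank_of = {label: rank for rank, label in enumerate(freq.index)}
    print(rank_of)  # {3: 0, 7: 1, 1: 2}
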
worked if model.indiv_fitted: From cf1d85b15e596f2f7a72eca6991b55d9cf1d5e6b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 11:57:52 +0000 Subject: [PATCH 086/327] Allow for saving of cugraph objects --- PopPUNK/__main__.py | 9 +++------ PopPUNK/network.py | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index bae50c16..a371da64 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -535,9 +535,7 @@ def main(): fit_type = 'accessory' genomeNetwork = indivNetworks['accessory'] - genomeNetwork.save(output + "/" + \ - os.path.basename(output) + '_graph.gt', - fmt = 'gt') + save_network(genomeNetwork, prefix = output, suffix = "_graph", use_gpu = args.gpu_graph) #******************************# #* *# @@ -557,9 +555,8 @@ def main(): prune_distance_matrix(refList, names_to_remove, distMat, output + "/" + os.path.basename(output) + ".refs.dists") # Save reference network - genomeNetwork.save(output + "/" + \ - os.path.basename(output) + '.refs_graph.gt', - fmt = 'gt') + save_network(genomeNetwork, prefix = output, suffix = ".refs_graph", + use_gpu = args.gpu_graph) removeFromDB(args.ref_db, output, names_to_remove) os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", output + "/" + os.path.basename(output) + ".refs.h5") diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 79ed614c..3de225fd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -961,3 +961,25 @@ def get_vertex_list(G, use_gpu = False): vlist = list(G.vertices()) return vlist + +def save_network(G, prefix = None, suffix = None, use_gpu = False): + """Save a network to disc + + Args: + G (network) + Graph tool network + prefix (str) + Prefix for output file + use_gpu (bool) + Whether graph is a cugraph or not + [default = False] + + """ + file_name = prefix + "/" + os.path.basename(prefix) + '_' + suffix + os.path.basename(prefix) + '_graph.csv.bz2' + if use_gpu: + G.to_pandas_edgelist().to_csv(file_name + '.csv.bz2', + compression='bz2') + else: + G.save(file_name + '.gt', + fmt = 'gt') From 81b70339100a440eaced4c5841b8373d93edcca0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 12:00:24 +0000 Subject: [PATCH 087/327] Fix missing function reference --- PopPUNK/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index a371da64..9451f2ea 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -196,6 +196,7 @@ def main(): from .network import extractReferences from .network import printClusters from .network import get_vertex_list + from .network import save_network from .plot import writeClusterCsv from .plot import plot_scatter From 1f708d62ebb854f2930446eaf5005cb46d54fec3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 21:49:26 +0000 Subject: [PATCH 088/327] Change vertex list to set for difference --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 9451f2ea..ca9cf606 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -497,7 +497,7 @@ def main(): genomeNetwork = indivNetworks[min(rank_list)] # Ensure all in dists are in final network - vertex_list = get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph) + vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph)) networkMissing = set(map(str,set(range(len(refList))).difference(vertex_list))) if len(networkMissing) > 0: missing_isolates = 
[refList[m] for m in networkMissing] From 79b754fbc457e89f5e908e5d4014c33559db4dd4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 21:54:04 +0000 Subject: [PATCH 089/327] GPU graphs for non-lineage mode --- PopPUNK/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index ca9cf606..165325ba 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -458,7 +458,8 @@ def main(): queryList, assignments, model.within_label, - weights = weights) + weights = weights, + use_gpu = args.gpu_graph) else: # Lineage fit requires some iteration indivNetworks = {} From 6566544fd7a9196d680b7fc136a96e84f109d6ca Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 10 Mar 2021 22:31:59 +0000 Subject: [PATCH 090/327] Change node index extraction --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3de225fd..be557649 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -956,7 +956,7 @@ def get_vertex_list(G, use_gpu = False): """ if use_gpu: - vlist = G.nodes().to_arrow().to_pylist() + vlist = G.nodes().to_array().tolist() else: vlist = list(G.vertices()) From bec01e5f5100c2913f9b84f668b7ca696105a824 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 05:41:56 +0000 Subject: [PATCH 091/327] Restore missing nodes to GPU graph --- PopPUNK/__main__.py | 2 +- PopPUNK/network.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 165325ba..6731b79a 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -499,7 +499,7 @@ def main(): # Ensure all in dists are in final network vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph)) - networkMissing = set(map(str,set(range(len(refList))).difference(vertex_list))) + networkMissing = set(set(range(len(refList))).difference(vertex_list)) if len(networkMissing) > 0: missing_isolates = [refList[m] for m in networkMissing] sys.stderr.write("WARNING: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") diff --git a/PopPUNK/network.py b/PopPUNK/network.py index be557649..4cce839b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -431,6 +431,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, # construct graph G_cu = cugraph.Graph() + G_cu.add_nodes_from(len(vertex_labels)) if weights is not None or sparse_input is not None: G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: From d216576555bb78616a3698bbcaa328cdecf21dd4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 05:48:44 +0000 Subject: [PATCH 092/327] Add missing nodes --- PopPUNK/network.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4cce839b..83c67f22 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -431,12 +431,11 @@ def constructNetwork(rlist, qlist, assignments, within_label, # construct graph G_cu = cugraph.Graph() - G_cu.add_nodes_from(len(vertex_labels)) if weights is not None or sparse_input is not None: G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: G_cu.from_cudf_edgelist(G_df, renumber=False) - + G_cu.add_nodes_from(len(vertex_labels)) # add any missing unconnected nodes return G_cu else: From a369c9d7fcdedf14e1b84c2fe00c39b5fee9e40b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 05:51:57 +0000 Subject: 
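
The missing-vertex check above has to behave identically for both backends. A graph-tool sketch of the CPU side, sized so that one isolate is deliberately absent from the network (the counts are illustrative):

    import graph_tool.all as gt

    g = gt.Graph(directed=False)
    g.add_vertex(4)                                    # vertices 0..3
    vertex_list = set(int(v) for v in g.vertices())
    network_missing = set(range(5)).difference(vertex_list)
    print(network_missing)  # {4}: one isolate not in the network
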
[PATCH 093/327] Use range list in place of integer --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 83c67f22..b4d52fbd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -435,7 +435,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: G_cu.from_cudf_edgelist(G_df, renumber=False) - G_cu.add_nodes_from(len(vertex_labels)) # add any missing unconnected nodes + G_cu.add_nodes_from(range(vertex_labels)) # add any missing unconnected nodes return G_cu else: From d51d8fd07ab7be16e23dcf37b73411a77564db9f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 06:14:15 +0000 Subject: [PATCH 094/327] Fix range list --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b4d52fbd..d2c47854 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -435,7 +435,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: G_cu.from_cudf_edgelist(G_df, renumber=False) - G_cu.add_nodes_from(range(vertex_labels)) # add any missing unconnected nodes + G_cu.add_nodes_from(range(len(vertex_labels))) # add any missing unconnected nodes return G_cu else: From a4c3210e2971026bbb8c55afd1fa43011e703a70 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 07:32:22 +0000 Subject: [PATCH 095/327] Remove pandas intermediate for data frame --- PopPUNK/network.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d2c47854..130a56bc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -424,10 +424,9 @@ def constructNetwork(rlist, qlist, assignments, within_label, # create DataFrame using edge tuples if weights is not None or sparse_input is not None: - connections_df = pd.DataFrame(connections, columns =['source', 'destination', 'weights']) + connections_df = cudf.DataFrame(connections, columns =['source', 'destination', 'weights']) else: - connections_df = pd.DataFrame(connections, columns =['source', 'destination']) - G_df = cudf.DataFrame.from_pandas(connections_df) + connections_df = cudf.DataFrame(connections, columns =['source', 'destination']) # construct graph G_cu = cugraph.Graph() From ca28aa5f7bb7014067ab6bcec95ce84becf1bfe1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 07:39:02 +0000 Subject: [PATCH 096/327] Fix data frame name --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 130a56bc..143be773 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -424,9 +424,9 @@ def constructNetwork(rlist, qlist, assignments, within_label, # create DataFrame using edge tuples if weights is not None or sparse_input is not None: - connections_df = cudf.DataFrame(connections, columns =['source', 'destination', 'weights']) + G_df = cudf.DataFrame(connections, columns =['source', 'destination', 'weights']) else: - connections_df = cudf.DataFrame(connections, columns =['source', 'destination']) + G_df = cudf.DataFrame(connections, columns =['source', 'destination']) # construct graph G_cu = cugraph.Graph() From 702c6b97a753d844e03c35506c1ededd92ce5c1e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:20:30 +0000 Subject: [PATCH 097/327] Add in isolated 
vertices in GPU graph --- PopPUNK/network.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 143be773..76a5be47 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -428,13 +428,28 @@ def constructNetwork(rlist, qlist, assignments, within_label, else: G_df = cudf.DataFrame(connections, columns =['source', 'destination']) + # ensure the highest-integer node is included in the edge list + # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 + max_in_df = G_df.max() + max_in_vertex_labels = len(vertex_labels) + print("Max in DF is " + str(max_in_df)) + print("Max in labels is " + str(max_in_vertex_labels)) + if max_in_df.astype(int).item() != max_in_vertex_labels: + if weights is not None or sparse_input is not None: + self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0) + G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination', 'weights']) + else: + self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels) + G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination']) + G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) + # construct graph G_cu = cugraph.Graph() if weights is not None or sparse_input is not None: G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: G_cu.from_cudf_edgelist(G_df, renumber=False) - G_cu.add_nodes_from(range(len(vertex_labels))) # add any missing unconnected nodes + return G_cu else: From 8c14b5ee59a080e669656149799b780f23ed07a9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:27:48 +0000 Subject: [PATCH 098/327] Change max to int conversion --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 76a5be47..fb70f88c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -433,8 +433,9 @@ def constructNetwork(rlist, qlist, assignments, within_label, max_in_df = G_df.max() max_in_vertex_labels = len(vertex_labels) print("Max in DF is " + str(max_in_df)) + print("Max type is " + str(type(max_in_df))) print("Max in labels is " + str(max_in_vertex_labels)) - if max_in_df.astype(int).item() != max_in_vertex_labels: + if max_in_df.iloc[0].item() != max_in_vertex_labels: if weights is not None or sparse_input is not None: self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0) G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination', 'weights']) From 414efff569851a822b1a7d96d6e073aed3c0aae3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:33:07 +0000 Subject: [PATCH 099/327] Change max calculation --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index fb70f88c..4346a6a5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -430,8 +430,8 @@ def constructNetwork(rlist, qlist, assignments, within_label, # ensure the highest-integer node is included in the edge list # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 - max_in_df = G_df.max() - max_in_vertex_labels = len(vertex_labels) + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = len(vertex_labels)-1 print("Max in DF is " + str(max_in_df)) print("Max type is " + str(type(max_in_df))) print("Max in labels is " + 
str(max_in_vertex_labels)) From 6938b5319e3101d15e25154d56e3b99d360a4b3e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:37:07 +0000 Subject: [PATCH 100/327] Change max format --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4346a6a5..f2bed628 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -435,7 +435,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, print("Max in DF is " + str(max_in_df)) print("Max type is " + str(type(max_in_df))) print("Max in labels is " + str(max_in_vertex_labels)) - if max_in_df.iloc[0].item() != max_in_vertex_labels: + if max_in_df.item() != max_in_vertex_labels: if weights is not None or sparse_input is not None: self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0) G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination', 'weights']) From 90ebda606635f909efb33737046c0c521ae2ca34 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:43:21 +0000 Subject: [PATCH 101/327] Add message checking on maximum --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f2bed628..b8a234d2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -443,6 +443,8 @@ def constructNetwork(rlist, qlist, assignments, within_label, self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels) G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination']) G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) + new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + print("New max in DF is " + str(new_max_in_df)) # construct graph G_cu = cugraph.Graph() From 4915c678abf4714f138a306778b74e5d06ce813a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:48:06 +0000 Subject: [PATCH 102/327] Change int to float --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b8a234d2..ce610e3d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -437,7 +437,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, print("Max in labels is " + str(max_in_vertex_labels)) if max_in_df.item() != max_in_vertex_labels: if weights is not None or sparse_input is not None: - self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0) + self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0.0) G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination', 'weights']) else: self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels) From 7a0e404b866249ff26a683e6964cdfbdffa2a8ff Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 08:50:57 +0000 Subject: [PATCH 103/327] Add warning for missing nodes --- PopPUNK/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 6731b79a..4b66aba8 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -503,6 +503,7 @@ def main(): if len(networkMissing) > 0: missing_isolates = [refList[m] for m in networkMissing] sys.stderr.write("WARNING: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") + sys.stderr.write("These correspond to indices " + ", ".join(networkMissing) + "\n") fit_type = model.type isolateClustering = {fit_type: printClusters(genomeNetwork, From 
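
Patches 097 to 101 iterate on the same workaround for the cugraph issue linked in the hunk: if the highest vertex index never appears in the edge list, cugraph drops that vertex, so a zero-weight self-loop is appended to keep one node per isolate. A pandas stand-in for the cudf frames (values illustrative):

    import pandas as pd

    edges = pd.DataFrame({'source': [0, 1], 'destination': [1, 2],
                          'weights': [0.1, 0.2]})
    max_in_vertex_labels = 4                 # five isolates, ids 0..4
    max_in_df = max(edges['source'].max(), edges['destination'].max())
    if max_in_df != max_in_vertex_labels:
        loop = pd.DataFrame({'source': [max_in_vertex_labels],
                             'destination': [max_in_vertex_labels],
                             'weights': [0.0]})
        edges = pd.concat([edges, loop], ignore_index=True)
    print(edges)                             # self-loop row keeps vertex 4
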
19a4248ba3f7b380c0c5211b0ebb3793c396b99e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 09:01:53 +0000 Subject: [PATCH 104/327] Test DF structure --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ce610e3d..5c3e54f0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -451,6 +451,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, if weights is not None or sparse_input is not None: G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: + print("Data frame is " + str(G_df)) G_cu.from_cudf_edgelist(G_df, renumber=False) return G_cu From 6de75099c48a033b4bbbd0d706530341065694f8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 09:17:30 +0000 Subject: [PATCH 105/327] Change warning message print format --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 4b66aba8..860140a3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -503,7 +503,7 @@ def main(): if len(networkMissing) > 0: missing_isolates = [refList[m] for m in networkMissing] sys.stderr.write("WARNING: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") - sys.stderr.write("These correspond to indices " + ", ".join(networkMissing) + "\n") + sys.stderr.write("These correspond to indices " + ", ".join(map(str,networkMissing)) + "\n") fit_type = model.type isolateClustering = {fit_type: printClusters(genomeNetwork, From bf278b0ac6456b5939d4bc71e98744c91f5234ee Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 09:48:50 +0000 Subject: [PATCH 106/327] Change cudf definition --- PopPUNK/network.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5c3e54f0..0804590d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -436,12 +436,11 @@ def constructNetwork(rlist, qlist, assignments, within_label, print("Max type is " + str(type(max_in_df))) print("Max in labels is " + str(max_in_vertex_labels)) if max_in_df.item() != max_in_vertex_labels: + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = [max_in_vertex_labels] + G_self_loop['destination'] = [max_in_vertex_labels] if weights is not None or sparse_input is not None: - self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels, 0.0) - G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination', 'weights']) - else: - self_loop_connection = (max_in_vertex_labels, max_in_vertex_labels) - G_self_loop = cudf.DataFrame(self_loop_connection, columns =['source', 'destination']) + G_self_loop['weights'] = [0.0] G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) print("New max in DF is " + str(new_max_in_df)) From 2d08c725a538dd6b9301e00b0e46c5b5ae93fa0a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 11:29:56 +0000 Subject: [PATCH 107/327] Add reference extraction for GPU graphs --- PopPUNK/__main__.py | 6 +- PopPUNK/network.py | 169 +++++++++++++++++++++++++------------------- 2 files changed, 101 insertions(+), 74 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 860140a3..c9b2dfa0 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -549,7 +549,11 @@ def main(): # (this no longer loses information and should generally be kept on) if model.type != "lineage": 
newReferencesIndices, newReferencesNames, newReferencesFile, genomeNetwork = \ - extractReferences(genomeNetwork, refList, output, threads = args.threads) + extractReferences(genomeNetwork, + refList, + output, + threads = args.threads, + use_gpu = args.gpu_graph) nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices) names_to_remove = [refList[n] for n in nodes_to_remove] diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0804590d..087f6ac1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -138,7 +138,7 @@ def cliquePrune(component, graph, reference_indices, components_list): ref_list = getCliqueRefs(subgraph, refs) return(list(ref_list)) -def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1): +def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, use_gpu = False): """Extract references for each cluster based on cliques Writes chosen references to file by calling :func:`~writeReferences` @@ -152,6 +152,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1): Prefix for output file (.refs will be appended) existingRefs (list) References that should be used for each clique + use_gpu (bool) + Use cugraph for graph analysis (default = False) Returns: refFileName (str) @@ -167,83 +169,104 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1): index_lookup = {v:k for k,v in enumerate(dbOrder)} reference_indices = set([index_lookup[r] for r in references]) - # Each component is independent, so can be multithreaded - components = gt.label_components(G)[0].a - - # Turn gt threading off and on again either side of the parallel loop - if gt.openmp_enabled(): - gt.openmp_set_num_threads(1) + if use_gpu: - # Cliques are pruned, taking one reference from each, until none remain - with Pool(processes=threads) as pool: - ref_lists = pool.map(partial(cliquePrune, - graph=G, - reference_indices=reference_indices, - components_list=components), - set(components)) - # Returns nested lists, which need to be flattened - reference_indices = set([entry for sublist in ref_lists for entry in sublist]) + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + + # For large network, use more approximate method for extracting references + reference = {} + G_truss = cugraph.community.ktruss_subgraph.k_truss(G, 3) + component_assignments = cugraph.components.connectivity.connected_components(G_truss) + raw_reference_indices = component_assignments.groupby('').nth(0).iloc[:0] + print("Raw type: " + str(type(raw_reference_indices))) + print("Raw refs: " + str(raw_reference_indices)) + quit() + + else: - if gt.openmp_enabled(): - gt.openmp_set_num_threads(threads) - - # Use a vertex filter to extract the subgraph of refences - # as a graphview - reference_vertex = G.new_vertex_property('bool') - for n, vertex in enumerate(G.vertices()): - if n in reference_indices: - reference_vertex[vertex] = True - else: - reference_vertex[vertex] = False - G_ref = gt.GraphView(G, vfilt = reference_vertex) - G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object - - # Find any clusters which are represented by >1 references - # This creates a dictionary: cluster_id: set(ref_idx in cluster) - clusters_in_full_graph = printClusters(G, dbOrder, printCSV=False) - reference_clusters_in_full_graph = defaultdict(set) - for 
reference_index in reference_indices: - reference_clusters_in_full_graph[clusters_in_full_graph[dbOrder[reference_index]]].add(reference_index) - - # Calculate the component membership within the reference graph - ref_order = [name for idx, name in enumerate(dbOrder) if idx in frozenset(reference_indices)] - clusters_in_reference_graph = printClusters(G_ref, ref_order, printCSV=False) - # Record the components/clusters the references are in the reference graph - # dict: name: ref_cluster - reference_clusters_in_reference_graph = {} - for reference_name in ref_order: - reference_clusters_in_reference_graph[reference_name] = clusters_in_reference_graph[reference_name] - - # Check if multi-reference components have been split as a validation test - # First iterate through clusters - network_update_required = False - for cluster_id, ref_idxs in reference_clusters_in_full_graph.items(): - # Identify multi-reference clusters by this length - if len(ref_idxs) > 1: - check = list(ref_idxs) - # check if these are still in the same component in the reference graph - for i in range(len(check)): - component_i = reference_clusters_in_reference_graph[dbOrder[check[i]]] - for j in range(i + 1, len(check)): - # Add intermediate nodes - component_j = reference_clusters_in_reference_graph[dbOrder[check[j]]] - if component_i != component_j: - network_update_required = True - vertex_list, edge_list = gt.shortest_path(G, check[i], check[j]) - # update reference list - for vertex in vertex_list: - reference_vertex[vertex] = True - reference_indices.add(int(vertex)) - - # update reference graph if vertices have been added - if network_update_required: + # Each component is independent, so can be multithreaded + components = gt.label_components(G)[0].a + + # Turn gt threading off and on again either side of the parallel loop + if gt.openmp_enabled(): + gt.openmp_set_num_threads(1) + + # Cliques are pruned, taking one reference from each, until none remain + with Pool(processes=threads) as pool: + ref_lists = pool.map(partial(cliquePrune, + graph=G, + reference_indices=reference_indices, + components_list=components), + set(components)) + # Returns nested lists, which need to be flattened + reference_indices = set([entry for sublist in ref_lists for entry in sublist]) + + if gt.openmp_enabled(): + gt.openmp_set_num_threads(threads) + + # Use a vertex filter to extract the subgraph of refences + # as a graphview + reference_vertex = G.new_vertex_property('bool') + for n, vertex in enumerate(G.vertices()): + if n in reference_indices: + reference_vertex[vertex] = True + else: + reference_vertex[vertex] = False G_ref = gt.GraphView(G, vfilt = reference_vertex) G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object - # Order found references as in mash sketch files - reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] - refFileName = writeReferences(reference_names, outPrefix) - return reference_indices, reference_names, refFileName, G_ref + # Find any clusters which are represented by >1 references + # This creates a dictionary: cluster_id: set(ref_idx in cluster) + clusters_in_full_graph = printClusters(G, dbOrder, printCSV=False) + reference_clusters_in_full_graph = defaultdict(set) + for reference_index in reference_indices: + reference_clusters_in_full_graph[clusters_in_full_graph[dbOrder[reference_index]]].add(reference_index) + + # Calculate the component membership within the reference graph + ref_order = [name for idx, name in 
enumerate(dbOrder) if idx in frozenset(reference_indices)] + clusters_in_reference_graph = printClusters(G_ref, ref_order, printCSV=False) + # Record the components/clusters the references are in the reference graph + # dict: name: ref_cluster + reference_clusters_in_reference_graph = {} + for reference_name in ref_order: + reference_clusters_in_reference_graph[reference_name] = clusters_in_reference_graph[reference_name] + + # Check if multi-reference components have been split as a validation test + # First iterate through clusters + network_update_required = False + for cluster_id, ref_idxs in reference_clusters_in_full_graph.items(): + # Identify multi-reference clusters by this length + if len(ref_idxs) > 1: + check = list(ref_idxs) + # check if these are still in the same component in the reference graph + for i in range(len(check)): + component_i = reference_clusters_in_reference_graph[dbOrder[check[i]]] + for j in range(i + 1, len(check)): + # Add intermediate nodes + component_j = reference_clusters_in_reference_graph[dbOrder[check[j]]] + if component_i != component_j: + network_update_required = True + vertex_list, edge_list = gt.shortest_path(G, check[i], check[j]) + # update reference list + for vertex in vertex_list: + reference_vertex[vertex] = True + reference_indices.add(int(vertex)) + + # update reference graph if vertices have been added + if network_update_required: + G_ref = gt.GraphView(G, vfilt = reference_vertex) + G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object + + # Order found references as in mash sketch files + reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] + refFileName = writeReferences(reference_names, outPrefix) + return reference_indices, reference_names, refFileName, G_ref def writeReferences(refList, outPrefix): """Writes chosen references to file From 21dc84bddbf2f9a1a367347afeb71a6846c06f4e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 11:34:07 +0000 Subject: [PATCH 108/327] Change ktruss command --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 087f6ac1..653a3c3f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -181,7 +181,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} - G_truss = cugraph.community.ktruss_subgraph.k_truss(G, 3) + G_truss = cugraph.ktruss_subgraph(G, 3) component_assignments = cugraph.components.connectivity.connected_components(G_truss) raw_reference_indices = component_assignments.groupby('').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From f64c422228c849b25dd093ab59160c1302600021 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 11:50:51 +0000 Subject: [PATCH 109/327] Change ktruss processing --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 653a3c3f..e803338c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -183,7 +183,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u reference = {} G_truss = cugraph.ktruss_subgraph(G, 3) component_assignments = cugraph.components.connectivity.connected_components(G_truss) - raw_reference_indices = component_assignments.groupby('').nth(0).iloc[:0] + print("Assignments: " + 
str(component_assignments)) + raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) print("Raw refs: " + str(raw_reference_indices)) quit() From f88a0c2e6c5943e1ac53885abb2e66ec14264294 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 11:54:54 +0000 Subject: [PATCH 110/327] Change components processing --- PopPUNK/network.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e803338c..0fe1bb85 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -182,7 +182,9 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} G_truss = cugraph.ktruss_subgraph(G, 3) - component_assignments = cugraph.components.connectivity.connected_components(G_truss) + component_assignments = cugraph.components.connectivity.connected_components(G_truss, + directed = False, + return_labels = True) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From 77133cb36478e3d77b421cbbef9295887382d92f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 12:02:48 +0000 Subject: [PATCH 111/327] Change components options --- PopPUNK/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0fe1bb85..ad9b42a6 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -183,7 +183,6 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u reference = {} G_truss = cugraph.ktruss_subgraph(G, 3) component_assignments = cugraph.components.connectivity.connected_components(G_truss, - directed = False, return_labels = True) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] From 9dcaefdc4099ec2b2bf900b6b8e220fe1379cc6d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 12:07:18 +0000 Subject: [PATCH 112/327] Format Gtruss for graph input --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ad9b42a6..428d60e1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -182,8 +182,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} G_truss = cugraph.ktruss_subgraph(G, 3) - component_assignments = cugraph.components.connectivity.connected_components(G_truss, - return_labels = True) + print("Gtruss type: " + str(type(G_truss))) + component_assignments = cugraph.components.connectivity.connected_components(G_truss) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From 5eb7e692ac3ce9814b01f0b32a71038d37d463b7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 16:37:44 +0000 Subject: [PATCH 113/327] Try option 1 for ktruss --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 428d60e1..2131f80c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -181,7 +181,7 @@ def extractReferences(G, dbOrder, 
outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} - G_truss = cugraph.ktruss_subgraph(G, 3) + G_truss = cugraph.ktruss_subgraph.k_truss(G, 3) print("Gtruss type: " + str(type(G_truss))) component_assignments = cugraph.components.connectivity.connected_components(G_truss) print("Assignments: " + str(component_assignments)) From b2897b60e05aebe232afae97ac0e620d8f962b36 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 16:42:39 +0000 Subject: [PATCH 114/327] Raise ktruss k to 5 --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2131f80c..ed14d29f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -181,7 +181,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} - G_truss = cugraph.ktruss_subgraph.k_truss(G, 3) + G_truss = cugraph.ktruss_subgraph(G, 5) print("Gtruss type: " + str(type(G_truss))) component_assignments = cugraph.components.connectivity.connected_components(G_truss) print("Assignments: " + str(component_assignments)) From 9b5865537256d46c338329dd9c07e5c42404ac98 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 16:48:05 +0000 Subject: [PATCH 115/327] Change ktruss formats --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ed14d29f..04acb2bf 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -181,9 +181,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} + print("G type: " + str(type(G))) G_truss = cugraph.ktruss_subgraph(G, 5) print("Gtruss type: " + str(type(G_truss))) - component_assignments = cugraph.components.connectivity.connected_components(G_truss) + component_assignments = cugraph.components.connectivity.connected_components(G) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From 98dba614381ef2f46e52ab20117bcc364c7a4852 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 17:01:02 +0000 Subject: [PATCH 116/327] Print network summaries --- PopPUNK/network.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 04acb2bf..77d1a8d3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -182,8 +182,12 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} print("G type: " + str(type(G))) + print("G nodes: " + str(G.number_of_nodes()) + print("G edges: " + str(G.number_of_edges()) G_truss = cugraph.ktruss_subgraph(G, 5) print("Gtruss type: " + str(type(G_truss))) + print("Gtruss nodes: " + str(G_truss.number_of_nodes()) + print("Gtruss edges: " + str(G_truss.number_of_edges()) component_assignments = cugraph.components.connectivity.connected_components(G) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] From 24713ee6f7692cfbf205540ed3c525aba371f8b0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 17:02:35 +0000 
Subject: [PATCH 117/327] Fix grammar --- PopPUNK/network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 77d1a8d3..58276d8c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -182,12 +182,12 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} print("G type: " + str(type(G))) - print("G nodes: " + str(G.number_of_nodes()) - print("G edges: " + str(G.number_of_edges()) + print("G nodes: " + str(G.number_of_nodes())) + print("G edges: " + str(G.number_of_edges())) G_truss = cugraph.ktruss_subgraph(G, 5) print("Gtruss type: " + str(type(G_truss))) - print("Gtruss nodes: " + str(G_truss.number_of_nodes()) - print("Gtruss edges: " + str(G_truss.number_of_edges()) + print("Gtruss nodes: " + str(G_truss.number_of_nodes())) + print("Gtruss edges: " + str(G_truss.number_of_edges())) component_assignments = cugraph.components.connectivity.connected_components(G) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] From 65d8cb4c7f2d9351c5fd443295f44b22d51dddb5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 21:04:30 +0000 Subject: [PATCH 118/327] Test Louvain --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 58276d8c..1eaa225c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -184,7 +184,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G type: " + str(type(G))) print("G nodes: " + str(G.number_of_nodes())) print("G edges: " + str(G.number_of_edges())) - G_truss = cugraph.ktruss_subgraph(G, 5) + G_truss = cugraph.louvain(G) print("Gtruss type: " + str(type(G_truss))) print("Gtruss nodes: " + str(G_truss.number_of_nodes())) print("Gtruss edges: " + str(G_truss.number_of_edges())) From 2be951355aad3d2d38b10ff973085acf1b8d788d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 21:08:03 +0000 Subject: [PATCH 119/327] Test Leiden --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1eaa225c..55609173 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -184,7 +184,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G type: " + str(type(G))) print("G nodes: " + str(G.number_of_nodes())) print("G edges: " + str(G.number_of_edges())) - G_truss = cugraph.louvain(G) + G_truss = cugraph.leiden(G) print("Gtruss type: " + str(type(G_truss))) print("Gtruss nodes: " + str(G_truss.number_of_nodes())) print("Gtruss edges: " + str(G_truss.number_of_edges())) From cf012fbf2c0bb1edda8d76ad32b7eae09c75335d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 21:58:24 +0000 Subject: [PATCH 120/327] Process Leiden output --- PopPUNK/network.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 55609173..074f1b3c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -184,11 +184,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G type: " + str(type(G))) print("G nodes: " + str(G.number_of_nodes())) print("G edges: " + str(G.number_of_edges())) - G_truss = cugraph.leiden(G) - print("Gtruss type: " + str(type(G_truss))) 
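# Note on the switch to cugraph.leiden() here: unlike the connected-components
# and k-truss calls tried in the preceding patches, leiden() returns a pair of
# (assignments, modularity score), and the assignments are a cudf.DataFrame
# with 'vertex' and 'partition' columns. Taking the first vertex of each
# partition as that cluster's representative - where PATCHES 121-128 below
# eventually land - can be sketched as follows (toy edge list; assumes cudf
# and cugraph are installed):
import cudf
import cugraph

edges = cudf.DataFrame({'source':      [0, 1, 2, 3, 4, 5],
                        'destination': [1, 2, 0, 4, 5, 3]})  # two triangles
G_example = cugraph.Graph()
G_example.from_cudf_edgelist(edges, renumber=False)

parts, modularity = cugraph.leiden(G_example)
representatives = parts.groupby('partition').nth(0)
reference_indices = representatives['vertex'].to_arrow().to_pylist()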
- print("Gtruss nodes: " + str(G_truss.number_of_nodes())) - print("Gtruss edges: " + str(G_truss.number_of_edges())) - component_assignments = cugraph.components.connectivity.connected_components(G) + component_assignments = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From 74bb8ee1725fe7c8d53bbe0a1e9c03559e828f1b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 22:03:10 +0000 Subject: [PATCH 121/327] Process Leiden both outputs --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 074f1b3c..43e3a699 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -184,7 +184,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G type: " + str(type(G))) print("G nodes: " + str(G.number_of_nodes())) print("G edges: " + str(G.number_of_edges())) - component_assignments = cugraph.leiden(G) + component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) From a6941765741a15a1cb047d6e244370c8c6cb7fd6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 22:08:16 +0000 Subject: [PATCH 122/327] Change grouping variable --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 43e3a699..8443a07f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -186,7 +186,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G edges: " + str(G.number_of_edges())) component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) - raw_reference_indices = component_assignments.groupby('labels').nth(0).iloc[:0] + raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) print("Raw refs: " + str(raw_reference_indices)) quit() From 0c0789d18710fc3d41f3fa408d5ada534d8a42cd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 11 Mar 2021 22:16:29 +0000 Subject: [PATCH 123/327] Test grouping code --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 8443a07f..1be700ac 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -186,7 +186,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G edges: " + str(G.number_of_edges())) component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) - raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:0] + raw_reference_indices = component_assignments.groupby('partition').nth(0)#.iloc[:0] print("Raw type: " + str(type(raw_reference_indices))) print("Raw refs: " + str(raw_reference_indices)) quit() From e0d6f87d385fe2aec8d748d4fef607e52b8d22a9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 06:31:16 +0000 Subject: [PATCH 124/327] Fi grouping code --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1be700ac..ae5045fe 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ 
-186,7 +186,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("G edges: " + str(G.number_of_edges())) component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) - raw_reference_indices = component_assignments.groupby('partition').nth(0)#.iloc[:0] + # group by partition, which becomes the first column, so retrieve second column + raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:1] print("Raw type: " + str(type(raw_reference_indices))) print("Raw refs: " + str(raw_reference_indices)) quit() From b9319dcb256855dae9114615ebfebcb408df4b59 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 07:31:10 +0000 Subject: [PATCH 125/327] Change iloc selection --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ae5045fe..b06caf41 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -187,7 +187,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) # group by partition, which becomes the first column, so retrieve second column - raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:1] + raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:,1] print("Raw type: " + str(type(raw_reference_indices))) print("Raw refs: " + str(raw_reference_indices)) quit() From 5b3d1832a5c94ef3b08e68b51f40f06037387b10 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 07:37:14 +0000 Subject: [PATCH 126/327] Change selection processing --- PopPUNK/network.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b06caf41..dfec5135 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -187,9 +187,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) # group by partition, which becomes the first column, so retrieve second column - raw_reference_indices = component_assignments.groupby('partition').nth(0).iloc[:,1] - print("Raw type: " + str(type(raw_reference_indices))) - print("Raw refs: " + str(raw_reference_indices)) + reference_index_df = component_assignments.groupby('partition').nth(0).iloc[:,1] + print("Raw type: " + str(type(reference_index_df))) + reference_indices = reference_index_df['vertex'].tolist() + print("Raw refs: " + str(reference_indices)) quit() else: From b29d82c3656ca238a388e1795f8765e5b44c476d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 07:38:43 +0000 Subject: [PATCH 127/327] Remove column select --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index dfec5135..aa6c6182 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -187,7 +187,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u component_assignments, score = cugraph.leiden(G) print("Assignments: " + str(component_assignments)) # group by partition, which becomes the first column, so retrieve second column - reference_index_df = component_assignments.groupby('partition').nth(0).iloc[:,1] + reference_index_df = component_assignments.groupby('partition').nth(0) print("Raw type: " + 
str(type(reference_index_df))) reference_indices = reference_index_df['vertex'].tolist() print("Raw refs: " + str(reference_indices)) From 1b5fd31e5a9e8c118afb60418aecae0152746845 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 07:43:30 +0000 Subject: [PATCH 128/327] Change list conversion --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index aa6c6182..42e253a0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -189,7 +189,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # group by partition, which becomes the first column, so retrieve second column reference_index_df = component_assignments.groupby('partition').nth(0) print("Raw type: " + str(type(reference_index_df))) - reference_indices = reference_index_df['vertex'].tolist() + reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() print("Raw refs: " + str(reference_indices)) quit() From 85ac6f5ff1d9be8bd104830367deba77a2b8754b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 09:33:09 +0000 Subject: [PATCH 129/327] Add reference graph construction --- PopPUNK/network.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 42e253a0..ed7a2507 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -191,7 +191,18 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Raw type: " + str(type(reference_index_df))) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() print("Raw refs: " + str(reference_indices)) - quit() + + # Order found references as in mash sketch files + reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] + print("Reference names: " + str(reference_names)) + refFileName = writeReferences(reference_names, outPrefix) + + # Construct reference graph + G_df = G.view_edge_list() + G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] + G_ref = cugraph.Graph() + G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False + return reference_indices, reference_names, refFileName, G_ref else: From 17a0997960b8d97c0b4eb19ebcacbe16228fb7ef Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 09:46:30 +0000 Subject: [PATCH 130/327] Add missing bracket --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ed7a2507..553fb536 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -201,7 +201,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df = G.view_edge_list() G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] G_ref = cugraph.Graph() - G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False + G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False) return reference_indices, reference_names, refFileName, G_ref else: From 2e1f3eefa524036c72587b8c68cf957835548706 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 09:47:58 +0000 Subject: [PATCH 131/327] Add edge list --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 553fb536..d5ccacd1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -199,6 +199,7 @@ def extractReferences(G, dbOrder, outPrefix, 
existingRefs = None, threads = 1, u # Construct reference graph G_df = G.view_edge_list() + print("Edge list: " + str(G_df)) G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False) From e5cb974305f3a12cfce5ed450d14853acb130641 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 09:48:58 +0000 Subject: [PATCH 132/327] Change column names --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d5ccacd1..69003b97 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -200,7 +200,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Construct reference graph G_df = G.view_edge_list() print("Edge list: " + str(G_df)) - G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] + G_ref_df = G_df[G_df['src'].isin(reference_names) & G_df['dst'].isin(reference_names)] G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False) return reference_indices, reference_names, refFileName, G_ref From eecf23930b46749832ba89fe4205e35fb8986f0a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 10:48:58 +0000 Subject: [PATCH 133/327] Remove weights from reference graph --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 69003b97..077dbad3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -202,7 +202,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Edge list: " + str(G_df)) G_ref_df = G_df[G_df['src'].isin(reference_names) & G_df['dst'].isin(reference_names)] G_ref = cugraph.Graph() - G_ref.from_cudf_edgelist(G_ref_df, edge_attr='weights', renumber=False) + G_ref.from_cudf_edgelist(G_ref_df, renumber=False) return reference_indices, reference_names, refFileName, G_ref else: From 7fe3277654716419e5f63201da15d0003f1900af Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:06:02 +0000 Subject: [PATCH 134/327] Add self loops for reference graph --- PopPUNK/network.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 077dbad3..545f49e4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -197,12 +197,22 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Reference names: " + str(reference_names)) refFileName = writeReferences(reference_names, outPrefix) - # Construct reference graph + # Extract reference edges G_df = G.view_edge_list() print("Edge list: " + str(G_df)) - G_ref_df = G_df[G_df['src'].isin(reference_names) & G_df['dst'].isin(reference_names)] + G_df.columns[0:1] = ['source','destination'] + G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] + print("Ref graph: " + str(G_ref_df)) + # Add self-loop if needing + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = len(reference_names)-1 + if max_in_df.item() != max_in_vertex_labels: + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = [max_in_vertex_labels] + G_self_loop['destination'] = [max_in_vertex_labels] + # Construct graph G_ref = cugraph.Graph() - G_ref.from_cudf_edgelist(G_ref_df, renumber=False) + 
G_ref.from_cudf_edgelist(G_ref_df) return reference_indices, reference_names, refFileName, G_ref else: From d6a344222bea77d0c31ccf66df95c54798dfb8fc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:07:32 +0000 Subject: [PATCH 135/327] Change column names --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 545f49e4..4223454a 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -200,7 +200,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Extract reference edges G_df = G.view_edge_list() print("Edge list: " + str(G_df)) - G_df.columns[0:1] = ['source','destination'] + G_df.columns = ['source','destination'] G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] print("Ref graph: " + str(G_ref_df)) # Add self-loop if needing From fc9d0a594e45cfb0cf90d38745466f38bfa9210f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:13:14 +0000 Subject: [PATCH 136/327] Change df concatenation --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4223454a..a6a13139 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -210,6 +210,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_self_loop = cudf.DataFrame() G_self_loop['source'] = [max_in_vertex_labels] G_self_loop['destination'] = [max_in_vertex_labels] + G_ref_df = cudf.concat([G_ref_df,G_self_loop], ignore_index = True) # Construct graph G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df) From 366630b0e088a611c4ed16bcb9baa33c5e339e8b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:17:53 +0000 Subject: [PATCH 137/327] Print ref graph --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index a6a13139..09afdf9f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -211,9 +211,11 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_self_loop['source'] = [max_in_vertex_labels] G_self_loop['destination'] = [max_in_vertex_labels] G_ref_df = cudf.concat([G_ref_df,G_self_loop], ignore_index = True) + print("Ref df: " + str(G_ref_df)) # Construct graph G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df) + print("Ref graph: " + str(G_ref)) return reference_indices, reference_names, refFileName, G_ref else: From 76671a3baa6eb3140404317e23957c719f2fcab0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:24:08 +0000 Subject: [PATCH 138/327] Add resolution parameter to Leiden method --- PopPUNK/network.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 09afdf9f..5f308cd1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -181,28 +181,20 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} - print("G type: " + str(type(G))) - print("G nodes: " + str(G.number_of_nodes())) - print("G edges: " + str(G.number_of_edges())) - component_assignments, score = cugraph.leiden(G) - print("Assignments: " + str(component_assignments)) + # Leiden method has resolution parameter - higher values give greater precision + component_assignments, score = cugraph.leiden(G, resolution = 1.0) # group by partition, 
which becomes the first column, so retrieve second column reference_index_df = component_assignments.groupby('partition').nth(0) - print("Raw type: " + str(type(reference_index_df))) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() - print("Raw refs: " + str(reference_indices)) # Order found references as in mash sketch files reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] - print("Reference names: " + str(reference_names)) refFileName = writeReferences(reference_names, outPrefix) # Extract reference edges G_df = G.view_edge_list() - print("Edge list: " + str(G_df)) G_df.columns = ['source','destination'] G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] - print("Ref graph: " + str(G_ref_df)) # Add self-loop if needing max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(reference_names)-1 @@ -211,11 +203,9 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_self_loop['source'] = [max_in_vertex_labels] G_self_loop['destination'] = [max_in_vertex_labels] G_ref_df = cudf.concat([G_ref_df,G_self_loop], ignore_index = True) - print("Ref df: " + str(G_ref_df)) # Construct graph G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df) - print("Ref graph: " + str(G_ref)) return reference_indices, reference_names, refFileName, G_ref else: From 1a646b62e80bfadc65bb1498bc9b061cf41431a2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:49:21 +0000 Subject: [PATCH 139/327] Add GPU graph loading --- PopPUNK/network.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5f308cd1..ca6c9c41 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -34,7 +34,7 @@ from .utils import isolateNameToLabel def fetchNetwork(network_dir, model, refList, ref_graph = False, - core_only = False, accessory_only = False): + core_only = False, accessory_only = False, use_gpu = False): """Load the network based on input options Returns the network as a graph-tool format graph, and sets @@ -52,12 +52,12 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, [default = False] core_only (bool) Return the network created using only core distances - [default = False] accessory_only (bool) Return the network created using only accessory distances - [default = False] + use_gpu (bool) + Use cugraph library to load graph Returns: genomeNetwork (graph) @@ -67,25 +67,36 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, """ # If a refined fit, may use just core or accessory distances dir_prefix = network_dir + "/" + os.path.basename(network_dir) + if use_gpu: + graph_suffix = '.csv.bz2' + else: + graph_suffix = '.gt' if core_only and model.type == 'refine': model.slope = 0 - network_file = dir_prefix + '_core_graph.gt' + network_file = dir_prefix + '_core_graph' + graph_suffix cluster_file = dir_prefix + '_core_clusters.csv' elif accessory_only and model.type == 'refine': model.slope = 1 - network_file = dir_prefix + '_accessory_graph.gt' + network_file = dir_prefix + '_accessory_graph' + graph_suffix cluster_file = dir_prefix + '_accessory_clusters.csv' else: - if ref_graph and os.path.isfile(dir_prefix + '.refs_graph.gt'): - network_file = dir_prefix + '.refs_graph.gt' + if ref_graph and os.path.isfile(dir_prefix + '.refs_graph' + graph_suffix): + network_file = dir_prefix + '.refs_graph' + graph_suffix else: 
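# Note on the loading scheme this patch (PATCH 139) introduces: on the GPU path
# the network is stored as a compressed CSV edge list rather than a graph-tool
# .gt file, so loading it means reading the CSV back into cuDF and rebuilding
# the cugraph.Graph. A minimal round trip in that style (illustrative file
# name; assumes G_example is an existing cugraph.Graph, and note the
# compression argument must agree with the suffix actually written):
import cudf
import cugraph

# save, mirroring save_network()
G_example.to_pandas_edgelist().to_csv('example_graph.csv.gz', index=False,
                                      compression='gzip')
# load and rebuild; the column name must be quoted when testing for weights
G_df = cudf.read_csv('example_graph.csv.gz', compression='gzip')
G_loaded = cugraph.Graph()
if 'weights' in G_df.columns:
    G_loaded.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False)
else:
    G_loaded.from_cudf_edgelist(G_df, renumber=False)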
- network_file = dir_prefix + '_graph.gt' + network_file = dir_prefix + '_graph' + graph_suffix cluster_file = dir_prefix + '_clusters.csv' if core_only or accessory_only: sys.stderr.write("Can only do --core-only or --accessory-only fits from " "a refined fit. Using the combined distances.\n") - genomeNetwork = gt.load_graph(network_file) + if use_gpu: + G_df = cudf.read_csv(network_file, compression = 'gzip') + if weights in G_df.columns: + genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) + else: + genomeNetwork.from_cudf_edgelist(G_df,renumber=False) + else: + genomeNetwork = gt.load_graph(network_file) sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # Ensure all in dists are in final network @@ -475,9 +486,6 @@ def constructNetwork(rlist, qlist, assignments, within_label, # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(vertex_labels)-1 - print("Max in DF is " + str(max_in_df)) - print("Max type is " + str(type(max_in_df))) - print("Max in labels is " + str(max_in_vertex_labels)) if max_in_df.item() != max_in_vertex_labels: G_self_loop = cudf.DataFrame() G_self_loop['source'] = [max_in_vertex_labels] @@ -1038,8 +1046,8 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): file_name = prefix + "/" + os.path.basename(prefix) + '_' + suffix os.path.basename(prefix) + '_graph.csv.bz2' if use_gpu: - G.to_pandas_edgelist().to_csv(file_name + '.csv.bz2', - compression='bz2') + G.to_csv(file_name + '.csv.gz', + compression='gzip') else: G.save(file_name + '.gt', fmt = 'gt') From f22cf552c561afe638abb298f08f2f7c6d4106fd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:53:10 +0000 Subject: [PATCH 140/327] Change GPU graph writing --- PopPUNK/network.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ca6c9c41..d8c29fe0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1046,8 +1046,7 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): file_name = prefix + "/" + os.path.basename(prefix) + '_' + suffix os.path.basename(prefix) + '_graph.csv.bz2' if use_gpu: - G.to_csv(file_name + '.csv.gz', - compression='gzip') + G.edges().to_csv(file_name + '.csv.gz', compression='gzip') else: G.save(file_name + '.gt', fmt = 'gt') From accafd3e18c7503a1d14319131d9cefeebdb80d7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 11:57:54 +0000 Subject: [PATCH 141/327] Change CSV compression --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d8c29fe0..10d4ed7d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1046,7 +1046,8 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): file_name = prefix + "/" + os.path.basename(prefix) + '_' + suffix os.path.basename(prefix) + '_graph.csv.bz2' if use_gpu: - G.edges().to_csv(file_name + '.csv.gz', compression='gzip') + G.to_pandas_edgelist().to_csv(file_name + '.csv.gz', + compression='gzip') else: G.save(file_name + '.gt', fmt = 'gt') From a2bb2845df6ff4e028ddb515f51efbfaf906ddeb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 12:10:39 +0000 Subject: [PATCH 142/327] Change output file name --- PopPUNK/network.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git 
a/PopPUNK/network.py b/PopPUNK/network.py index 10d4ed7d..a7556541 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -494,14 +494,12 @@ def constructNetwork(rlist, qlist, assignments, within_label, G_self_loop['weights'] = [0.0] G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - print("New max in DF is " + str(new_max_in_df)) # construct graph G_cu = cugraph.Graph() if weights is not None or sparse_input is not None: G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: - print("Data frame is " + str(G_df)) G_cu.from_cudf_edgelist(G_df, renumber=False) return G_cu @@ -1043,8 +1041,7 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): [default = False] """ - file_name = prefix + "/" + os.path.basename(prefix) + '_' + suffix - os.path.basename(prefix) + '_graph.csv.bz2' + file_name = prefix + "/" + os.path.basename(prefix) if use_gpu: G.to_pandas_edgelist().to_csv(file_name + '.csv.gz', compression='gzip') From 2729a11fccefd41496a196cc1eb1302c7fdc4a82 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 14:10:02 +0000 Subject: [PATCH 143/327] Add suffix to output file --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index a7556541..2e8a4dcc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1042,6 +1042,8 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): """ file_name = prefix + "/" + os.path.basename(prefix) + if suffix is not None: + file_name = file_name + '_' + suffix if use_gpu: G.to_pandas_edgelist().to_csv(file_name + '.csv.gz', compression='gzip') From 9466ae66f074c043259f3ecaffb3bf43d3225eb9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 14:11:50 +0000 Subject: [PATCH 144/327] Correct suffix to output file --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2e8a4dcc..530e8805 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1043,7 +1043,7 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): """ file_name = prefix + "/" + os.path.basename(prefix) if suffix is not None: - file_name = file_name + '_' + suffix + file_name = file_name + suffix if use_gpu: G.to_pandas_edgelist().to_csv(file_name + '.csv.gz', compression='gzip') From 80533f836069e17af968436f618c8ba2de27c3f3 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 12 Mar 2021 15:18:36 +0000 Subject: [PATCH 145/327] Fix dist order with lineage mode --- PopPUNK/__main__.py | 2 +- PopPUNK/assign.py | 30 ++++++++++-------------------- PopPUNK/models.py | 3 +-- PopPUNK/utils.py | 13 ++++++++++--- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 860140a3..c19294d8 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -538,7 +538,7 @@ def main(): fit_type = 'accessory' genomeNetwork = indivNetworks['accessory'] - save_network(genomeNetwork, prefix = output, suffix = "_graph", use_gpu = args.gpu_graph) + save_network(genomeNetwork, prefix = output, suffix = "graph", use_gpu = args.gpu_graph) #******************************# #* *# diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index b1b11236..7b52d943 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -119,7 +119,12 @@ def assign_query(dbFuncs, for reference in refFile: rNames.append(reference.rstrip()) else: - rNames = 
getSeqsInDb(ref_db + "/" + os.path.basename(ref_db) + ".h5")
+        if os.path.isfile(distances + ".pkl"):
+            rNames = readPickle(distances, enforce_self = True, distances=False)[0]
+        elif update_db:
+            sys.stderr.write("Reference distances missing, cannot use --update-db\n")
+        else:
+            rNames = getSeqsInDb(ref_db + "/" + os.path.basename(ref_db) + ".h5")
     # construct database
     if (web and json_sketch):
         qNames = sketch_to_hdf5(json_sketch, output)
@@ -244,28 +249,13 @@ def assign_query(dbFuncs,
         else:
             genomeNetwork.save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt')

-        # Update distance matrices with all calculated distances
-        if distances == None:
-            distanceFiles = ref_db + "/" + os.path.basename(ref_db) + ".dists"
-        else:
-            distanceFiles = distances
-
         # Load the previous distances
         refList_loaded, refList_copy, self, rrDistMat = \
-            readPickle(distanceFiles,
+            readPickle(distances,
                        enforce_self = True)
-        # qrDistMat: order of ref labels is the same as in the database (usually
-        # ordered). Order in original rrDistMat is arbitrary, leading to an
-        # awkwardness here. We prefer to reorder the qrDistMat to match, as it is
-        # usually smaller and has a simpler layout in long form
-        # At the end, rNames is updated to match what has been loaded
-        if refList_loaded != rNames:
-            match_order = [rNames.index(i) for i in refList_loaded] * len(qNames)
-            for q_offset in range(len(qNames)):
-                for r_offset in range(len(rNames)):
-                    match_order[q_offset * len(rNames) + r_offset] += q_offset * len(rNames)
-            qrDistMat = qrDistMat[match_order, :]
-            rNames = refList_loaded
+        # This should now always be true, otherwise both qrDistMat and sparse matrix
+        # may need reordering
+        assert(refList_loaded == rNames)

         combined_seq, core_distMat, acc_distMat = \
             update_distance_matrices(rNames, rrDistMat,
diff --git a/PopPUNK/models.py b/PopPUNK/models.py
index 902cf738..4f92c861 100644
--- a/PopPUNK/models.py
+++ b/PopPUNK/models.py
@@ -847,8 +847,7 @@ def fit(self, X, accessory, threads):
             pp_sketchlib.sparsifyDists(
                 pp_sketchlib.longToSquare(X[:, [self.dist_col]], threads),
                 0,
-                rank,
-                threads
+                rank
             )
         data = [epsilon if d < epsilon else d for d in data]
         self.nn_dists[rank] = coo_matrix((data, (row, col)),
diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py
index 33588739..cb801865 100644
--- a/PopPUNK/utils.py
+++ b/PopPUNK/utils.py
@@ -103,7 +103,7 @@ def storePickle(rlist, qlist, self, X, pklName):
     np.save(pklName + ".npy", X)

-def readPickle(pklName, enforce_self = False):
+def readPickle(pklName, enforce_self=False, distances=True):
     """Loads core and accessory distances saved by :func:`~storePickle`

     Called during ``--fit-model``
@@ -115,6 +115,10 @@ def readPickle(pklName, enforce_self = False):
             Error if self == False

             [default = True]
+        distances (bool)
+            Read the distance matrix
+
+            [default = Trie]

     Returns:
         rlist (list)
@@ -131,7 +135,10 @@ def readPickle(pklName, enforce_self = False):
     if enforce_self and not self:
         sys.stderr.write("Old distances " + pklName + ".npy not complete\n")
         sys.stderr.exit(1)
-    X = np.load(pklName + ".npy")
+    if distances:
+        X = np.load(pklName + ".npy")
+    else:
+        X = None
     return rlist, qlist, self, X
@@ -432,7 +439,7 @@ def readRfile(rFile, oneSeq=False):
     list_iterable = zip(names, sequences)
     sorted_names = sorted(list_iterable)
     tuples = zip(*sorted_names)
-    names, sequences = [list(tuple) for tuple in tuples]
+    names, sequences = [list(r_tuple) for r_tuple in tuples]

     return (names, sequences)
From ce8135e65e035ea3273d3e7902da491712a4e24e Mon Sep 17 00:00:00 2001
From: John
Lees Date: Fri, 12 Mar 2021 15:21:50 +0000 Subject: [PATCH 146/327] docstring typo --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index cb801865..8745f968 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -118,7 +118,7 @@ def readPickle(pklName, enforce_self=False, distances=True): distances (bool) Read the distance matrix - [default = Trie] + [default = True] Returns: rlist (list) From cca4a7c0959bef6a7f6dfd8018090323bef1034f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:07:37 +0000 Subject: [PATCH 147/327] Add GPU summaries --- PopPUNK/network.py | 85 +++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 31 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 530e8805..e265de8d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -496,13 +496,11 @@ def constructNetwork(rlist, qlist, assignments, within_label, new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) # construct graph - G_cu = cugraph.Graph() + G = cugraph.Graph() if weights is not None or sparse_input is not None: - G_cu.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) + G.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: - G_cu.from_cudf_edgelist(G_df, renumber=False) - - return G_cu + G.from_cudf_edgelist(G_df, renumber=False) else: @@ -522,22 +520,22 @@ def constructNetwork(rlist, qlist, assignments, within_label, vals = vertex_labels) G.vp.id = vid - # print some summaries - if summarise: - (metrics, scores) = networkSummary(G) - sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]), - "\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]), - "\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]), - "\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]), - "\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]), - "\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]), - "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), - "\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])]) - + "\n") - - return G - -def networkSummary(G, calc_betweenness=True): + # print some summaries + if summarise: + (metrics, scores) = networkSummary(G, use_gpu = use_gpu) + sys.stderr.write("Network summary:\n" + "\n".join(["\tComponents\t\t\t\t" + str(metrics[0]), + "\tDensity\t\t\t\t\t" + "{:.4f}".format(metrics[1]), + "\tTransitivity\t\t\t\t" + "{:.4f}".format(metrics[2]), + "\tMean betweenness\t\t\t" + "{:.4f}".format(metrics[3]), + "\tWeighted-mean betweenness\t\t" + "{:.4f}".format(metrics[4]), + "\tScore\t\t\t\t\t" + "{:.4f}".format(scores[0]), + "\tScore (w/ betweenness)\t\t\t" + "{:.4f}".format(scores[1]), + "\tScore (w/ weighted-betweenness)\t\t" + "{:.4f}".format(scores[2])]) + + "\n") + + return G + +def networkSummary(G, calc_betweenness=True, use_gpu = False): """Provides summary values about the network Args: @@ -545,6 +543,8 @@ def networkSummary(G, calc_betweenness=True): The network of strains from :func:`~constructNetwork` calc_betweenness (bool) Whether to calculate betweenness stats + use_gpu (bool) + Whether to use cugraph for graph analysis Returns: metrics (list) @@ -553,27 +553,50 @@ def networkSummary(G, calc_betweenness=True): scores (list) List of scores """ - component_assignments, component_frequencies = gt.label_components(G) - components = len(component_frequencies) - density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * 
(len(list(G.vertices())) - 1)) - transitivity = gt.global_clustering(G)[0] + if use_gpu: + component_assignments = cugraph.components.connectivity.connected_components(G) + components = component_assignments['labels'].unique() + density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1)) + triangle_count = cugraph.community.triangle_count.triangles(G) + degree = G.degree() + triad_count = sum([d * (d - 1) for d in degree) + transitivity = triangle_count/triad_count + else: + component_assignments, component_frequencies = gt.label_components(G) + components = len(component_frequencies) + density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) + transitivity = gt.global_clustering(G)[0] mean_bt = 0 weighted_mean_bt = 0 if calc_betweenness: betweenness = [] sizes = [] - for component, size in enumerate(component_frequencies): - if size > 3: - vfilt = component_assignments.a == component - subgraph = gt.GraphView(G, vfilt=vfilt) - betweenness.append(max(gt.betweenness(subgraph, norm = True)[0].a)) - sizes.append(size) + + if use_gpu: + component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) + for component in components: + size = component_frequencies[component_frequencies.index == component] + if size > 3: + print("Component count df: " + str(component_assignments)) + component_vertices = component_assignments['vertices'][component_assignments['labels']==component] + subgraph = cugraph.subgraph(G, component_vertices) + component_betweenness = cugraph.betweenness_centrality(G) + betweenness.append(np.amax(component_betweenness)) + sizes.append(size) + else: + for component, size in enumerate(component_frequencies): + if size > 3: + vfilt = component_assignments.a == component + subgraph = gt.GraphView(G, vfilt=vfilt) + betweenness.append(max(gt.betweenness(subgraph, norm = True)[0].a)) + sizes.append(size) if len(betweenness) > 1: mean_bt = np.mean(betweenness) weighted_mean_bt = np.average(betweenness, weights=sizes) + # Calculate scores metrics = [components, density, transitivity, mean_bt, weighted_mean_bt] base_score = transitivity * (1 - density) scores = [base_score, base_score * (1 - metrics[3]), base_score * (1 - metrics[4])] From 4a53fd274b34ed481395c7efd869b714817f7561 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:08:29 +0000 Subject: [PATCH 148/327] Remove surplus bracket --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e265de8d..fb76ad70 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -556,7 +556,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if use_gpu: component_assignments = cugraph.components.connectivity.connected_components(G) components = component_assignments['labels'].unique() - density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1)) + density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree = G.degree() triad_count = sum([d * (d - 1) for d in degree) From 3c7d1d25248ce3ee88008c4c90c2f22d6b927ccc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:09:35 +0000 Subject: [PATCH 149/327] Change sum of degree --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index fb76ad70..b78439ea 100644 
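# Note on the summary statistics being assembled in PATCHES 147-149:
# graph-tool's gt.global_clustering() computes the global transitivity
#     transitivity = 3 * n_triangles / n_connected_triples,
# where n_connected_triples = sum over vertices of d * (d - 1) / 2. The GPU
# branch rebuilds this ratio from cugraph's total triangle count and the
# degree table; depending on how a given cugraph version counts each triangle,
# a constant factor may be needed to match graph-tool exactly. A sketch
# (assumes G_example is an undirected cugraph.Graph):
import cugraph

triangle_count = cugraph.community.triangle_count.triangles(G_example)
degree_df = G_example.degree()  # cudf.DataFrame with 'vertex' and 'degree'
triples = sum(d * (d - 1) for d in degree_df['degree'].to_pandas()) / 2
transitivity = 3 * triangle_count / triples if triples > 0 else 0.0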
--- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -559,7 +559,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree = G.degree() - triad_count = sum([d * (d - 1) for d in degree) + triad_count = sum([d * (d - 1) for d in degree]) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) From b75dccedd2d58b06e52eec92c0c1b45e61fec50a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:11:01 +0000 Subject: [PATCH 150/327] Load cugraph libraries --- PopPUNK/network.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b78439ea..c81e9dae 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -554,6 +554,15 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): List of scores """ if use_gpu: + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + component_assignments = cugraph.components.connectivity.connected_components(G) components = component_assignments['labels'].unique() density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) From 8ccddf4c7c56d5c042d92210934a9f235fc49795 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:12:15 +0000 Subject: [PATCH 151/327] Print degree for debug --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c81e9dae..354ed7a2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -568,6 +568,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree = G.degree() + print("Degree is " + str(degree)) triad_count = sum([d * (d - 1) for d in degree]) transitivity = triangle_count/triad_count else: From 81bfb5ea659046d9d0f232fe478ee2e245e64e87 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:14:47 +0000 Subject: [PATCH 152/327] Change access to degree --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 354ed7a2..07c74412 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -567,9 +567,9 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): components = component_assignments['labels'].unique() density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) - degree = G.degree() + degree_df = G.degree() print("Degree is " + str(degree)) - triad_count = sum([d * (d - 1) for d in degree]) + triad_count = sum([d * (d - 1) for d in degree_df['degree']) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) From 3368c6b6f87d7916d6221c3fdd29795265383bc7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:15:41 +0000 Subject: [PATCH 153/327] Add missing bracket --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 07c74412..7d7d3aaf 100644 --- a/PopPUNK/network.py +++ 
b/PopPUNK/network.py @@ -569,7 +569,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() print("Degree is " + str(degree)) - triad_count = sum([d * (d - 1) for d in degree_df['degree']) + triad_count = sum([d * (d - 1) for d in degree_df['degree']]) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) From ff0720177fb777f5962364d5dfaed99797b5d514 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:16:50 +0000 Subject: [PATCH 154/327] Change degree print statement --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7d7d3aaf..3f27068a 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -568,7 +568,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() - print("Degree is " + str(degree)) + print("Degree is " + str(degree_df['degree'])) triad_count = sum([d * (d - 1) for d in degree_df['degree']]) transitivity = triangle_count/triad_count else: From 628ad32b5c53a96a244a9e02fe3fb1d5d7703e8e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:18:40 +0000 Subject: [PATCH 155/327] Convert to pandas --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3f27068a..0833120f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -569,7 +569,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() print("Degree is " + str(degree_df['degree'])) - triad_count = sum([d * (d - 1) for d in degree_df['degree']]) + triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) From 8e36694c70e7ed3a4bb079c36b34363b091e900c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:20:31 +0000 Subject: [PATCH 156/327] Change iteration over components --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0833120f..14b152f5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -568,8 +568,8 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() - print("Degree is " + str(degree_df['degree'])) triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) + print("triad_count is " + str(triad_count)) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) @@ -585,7 +585,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if use_gpu: component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) - for component in components: + for component in components.to_pandas(): size = component_frequencies[component_frequencies.index == component] if size > 3: print("Component count df: " + str(component_assignments)) From 
f19adfc63ea1f72768f0aaefad0b1ad3122da240 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:23:21 +0000 Subject: [PATCH 157/327] Print details of components --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 14b152f5..6ca6b789 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -587,6 +587,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) for component in components.to_pandas(): size = component_frequencies[component_frequencies.index == component] + print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: print("Component count df: " + str(component_assignments)) component_vertices = component_assignments['vertices'][component_assignments['labels']==component] From 14a14f93c27452ab0931ed214d4f65858fa67e67 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:29:38 +0000 Subject: [PATCH 158/327] Convert series value to int --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6ca6b789..6cbcdff3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -586,7 +586,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if use_gpu: component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) for component in components.to_pandas(): - size = component_frequencies[component_frequencies.index == component] + size = component_frequencies[component_frequencies.index == component].astype(int) print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: print("Component count df: " + str(component_assignments)) From 38298f25fd7bb4aa43ef53a307477bc8985f6799 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:33:29 +0000 Subject: [PATCH 159/327] Extract single value for size --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6cbcdff3..be01983d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -586,7 +586,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if use_gpu: component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) for component in components.to_pandas(): - size = component_frequencies[component_frequencies.index == component].astype(int) + size = component_frequencies[component_frequencies.index == component].iloc[0].astype(int) print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: print("Component count df: " + str(component_assignments)) From fb6ba093be72dba1525558cb96b2399c5a06bf08 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:35:39 +0000 Subject: [PATCH 160/327] Change column name --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index be01983d..dbe2ac4a 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -590,7 +590,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: print("Component count df: " + str(component_assignments)) - 
component_vertices = component_assignments['vertices'][component_assignments['labels']==component] + component_vertices = component_assignments['vertex'][component_assignments['labels']==component] subgraph = cugraph.subgraph(G, component_vertices) component_betweenness = cugraph.betweenness_centrality(G) betweenness.append(np.amax(component_betweenness)) sizes.append(size) From 2973f6e26f9c1621ec55e5534b7f3a2c33cfda01 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:37:13 +0000 Subject: [PATCH 161/327] Print component betweenness --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index dbe2ac4a..10cf8c82 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -589,10 +589,10 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): size = component_frequencies[component_frequencies.index == component].iloc[0].astype(int) print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: - print("Component count df: " + str(component_assignments)) component_vertices = component_assignments['vertex'][component_assignments['labels']==component] subgraph = cugraph.subgraph(G, component_vertices) component_betweenness = cugraph.betweenness_centrality(G) + print("Component betweenness: " + str(component_betweenness)) betweenness.append(np.amax(component_betweenness)) sizes.append(size) else: From 49e4fc96097703d163258a13867e8f6931099e5a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:40:37 +0000 Subject: [PATCH 162/327] Find maximum betweenness --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 10cf8c82..51b08d9b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -593,7 +593,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): subgraph = cugraph.subgraph(G, component_vertices) component_betweenness = cugraph.betweenness_centrality(G) print("Component betweenness: " + str(component_betweenness)) - betweenness.append(np.amax(component_betweenness)) + betweenness.append(component_betweenness['component_betweenness'].max()) sizes.append(size) else: From 2461855ccfe31634cae4109fff6a4aad474d2824 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:41:44 +0000 Subject: [PATCH 163/327] Change column name --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 51b08d9b..c625cf6b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -593,7 +593,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): subgraph = cugraph.subgraph(G, component_vertices) component_betweenness = cugraph.betweenness_centrality(G) print("Component betweenness: " + str(component_betweenness)) - betweenness.append(component_betweenness['component_betweenness'].max()) + betweenness.append(component_betweenness['betweenness_centrality'].max()) sizes.append(size) else: From 69edec78ddf6080287799e9460b33f6409d91a0e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:44:37 +0000 Subject: [PATCH 164/327] Betweenness access change --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c625cf6b..a7883444 100644 --- a/PopPUNK/network.py +++
b/PopPUNK/network.py @@ -564,7 +564,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): raise ImportError(e) component_assignments = cugraph.components.connectivity.connected_components(G) - components = component_assignments['labels'].unique() + components = component_assignments['labels'].unique().astype(int) density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() @@ -594,6 +594,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): component_betweenness = cugraph.betweenness_centrality(G) print("Component betweenness: " + str(component_betweenness)) betweenness.append(component_betweenness['betweenness_centrality'].max()) + print("Betweenness: " + str(betweenness)) sizes.append(size) else: for component, size in enumerate(component_frequencies): From 00d84d572ebdf43737f20180e85c4dae6681c3ed Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:48:14 +0000 Subject: [PATCH 165/327] Change summary stat recording --- PopPUNK/network.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index a7883444..b5f3ca94 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -564,7 +564,8 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): raise ImportError(e) component_assignments = cugraph.components.connectivity.connected_components(G) - components = component_assignments['labels'].unique().astype(int) + component_nums = component_assignments['labels'].unique().astype(int) + components = len(component_nums) density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() @@ -585,7 +586,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if use_gpu: component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) - for component in components.to_pandas(): + for component in component_nums.to_pandas(): size = component_frequencies[component_frequencies.index == component].iloc[0].astype(int) print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: @@ -607,6 +608,9 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if len(betweenness) > 1: mean_bt = np.mean(betweenness) weighted_mean_bt = np.average(betweenness, weights=sizes) + else: + mean_bt = betweenness[0] + weighted_mean_bt = betweenness[0] # Calculate scores metrics = [components, density, transitivity, mean_bt, weighted_mean_bt] From 438c269405dc3db43a72c97b6cfbe9ca1b438054 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 16:50:41 +0000 Subject: [PATCH 166/327] Tidy up debug messages --- PopPUNK/network.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b5f3ca94..f4182ffa 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -570,7 +570,6 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) - print("triad_count is " + str(triad_count)) transitivity = triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) @@ -588,14 +587,11 @@ def networkSummary(G, calc_betweenness=True, 
use_gpu = False): component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) for component in component_nums.to_pandas(): size = component_frequencies[component_frequencies.index == component].iloc[0].astype(int) - print("Component: " + str(component) + " size: " + str(size) + " freqs: " + str(component_frequencies)) if size > 3: component_vertices = component_assignments['vertex'][component_assignments['labels']==component] subgraph = cugraph.subgraph(G, component_vertices) component_betweenness = cugraph.betweenness_centrality(G) - print("Component betweenness: " + str(component_betweenness)) betweenness.append(component_betweenness['betweenness_centrality'].max()) - print("Betweenness: " + str(betweenness)) sizes.append(size) else: for component, size in enumerate(component_frequencies): From 03820241068220f2bc4c94f1924196e730e08d29 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 22:05:08 +0000 Subject: [PATCH 167/327] Transitivity calculation details --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f4182ffa..884c25fa 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -571,11 +571,13 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): degree_df = G.degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) transitivity = triangle_count/triad_count + print("Triangle count CPU: " + str(triangle_count) + " Triad count CPU: " + str(triad_count)) else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] + print("Triangle count CPU: " + str(gt.global_clustering(G)[1]) + " Triad count CPU: " + str(gt.global_clustering(G)[2])) mean_bt = 0 weighted_mean_bt = 0 From 583fa13bd4d0134802c603dd6fae4fcdf6ff1d74 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 22:08:56 +0000 Subject: [PATCH 168/327] Change printing of debug --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 884c25fa..af160bcf 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -571,13 +571,13 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): degree_df = G.degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) transitivity = triangle_count/triad_count - print("Triangle count CPU: " + str(triangle_count) + " Triad count CPU: " + str(triad_count)) + print("Triangle count GPU: " + str(triangle_count) + " Triad count GPU: " + str(triad_count)) else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] - print("Triangle count CPU: " + str(gt.global_clustering(G)[1]) + " Triad count CPU: " + str(gt.global_clustering(G)[2])) + print("Triangle count CPU: " + str(gt.global_clustering(G))) mean_bt = 0 weighted_mean_bt = 0 From e5bb57ac7c3c34e2bedd4000855d3d51f9b464b2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 12 Mar 2021 22:10:40 +0000 Subject: [PATCH 169/327] Print counts --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 
af160bcf..1402b394 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -577,7 +577,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] - print("Triangle count CPU: " + str(gt.global_clustering(G))) + print("Triangle count CPU: " + str(gt.global_clustering(G, ret_counts = True))) mean_bt = 0 weighted_mean_bt = 0 From bdf8a83141e10552824cce34a1702d6d47ece760 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 06:51:24 +0000 Subject: [PATCH 170/327] Enable GPUs for refinement --- PopPUNK/network.py | 2 +- PopPUNK/refine.py | 58 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1402b394..d3d1ab26 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -577,7 +577,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] - print("Triangle count CPU: " + str(gt.global_clustering(G, ret_counts = True))) + print("Triangle/triad count CPU: " + str(gt.global_clustering(G, ret_counts = True))) mean_bt = 0 weighted_mean_bt = 0 diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 4dab0523..1baf2ff3 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -32,7 +32,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, max_move, min_move, slope = 2, score_idx = 0, - unconstrained = False, no_local = False, num_processes = 1): + unconstrained = False, no_local = False, num_processes = 1, use_gpu = use_gpu): """Try to refine a fit by maximising a network score based on transitivity and density. Iteratively move the decision boundary to do this, using starting point from existing model. @@ -65,8 +65,10 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, Quicker, but may be less well refined. num_processes (int) Number of threads to use in the global optimisation step. 
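A note on the refineFit signature introduced above: `use_gpu = use_gpu` cannot work as a default value, because Python evaluates defaults once, when the `def` statement executes; at import time the name is unbound and a NameError results. Patch 172 below replaces it with `use_gpu = False`. A short demonstration:

    # Defaults are evaluated at definition time, so a parameter cannot
    # default to itself (the bug patch 172 fixes):
    try:
        exec("def f(x, use_gpu = use_gpu): pass")
    except NameError as err:
        print("fails when 'def' executes:", err)

    def f(x, use_gpu = False):  # the conventional fix, as adopted in patch 172
        return use_gpu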
- (default = 1) + use_gpu (bool) + Whether to use cugraph for graph analyses + Returns: start_point (tuple) (x, y) co-ordinates of starting point @@ -117,7 +119,8 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, distMat = distances_shared, x_range = x_max, y_range = y_max, - score_idx = score_idx), + score_idx = score_idx, + use_gpu = use_gpu), range(global_grid_resolution)) if gt.openmp_enabled(): @@ -148,7 +151,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, poppunk_refine.thresholdIterate1D(distMat, s_range, slope, start_point[0], start_point[1], mean1[0], mean1[1], num_processes) - global_s = growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx) + global_s = growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, use_gpu = use_gpu) min_idx = np.argmin(np.array(global_s)) if min_idx > 0 and min_idx < len(s_range) - 1: bounds = [s_range[min_idx-1], s_range[min_idx+1]] @@ -162,7 +165,8 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, local_s = scipy.optimize.minimize_scalar(newNetwork, bounds=bounds, method='Bounded', options={'disp': True}, - args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx)) + args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx, use_gpu = use_gpu), + ) optimised_s = local_s.x # Convert to x_max, y_max if needed @@ -180,7 +184,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, return start_point, optimal_x, optimal_y, min_move, max_move -def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_idx = 0): +def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_idx = 0, use_gpu = False): """Construct a network, then add edges to it iteratively. Input is from ``pp_sketchlib.iterateBoundary1D`` or``pp_sketchlib.iterateBoundary2D`` @@ -201,6 +205,9 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ [default = 0] thread_idx (int) Optional thread idx (if multithreaded) to offset progress bar by + use_gpu (bool) + Whether to use cugraph for graph analyses + Returns: scores (list) -1 * network score for each of x_range. 
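The global optimisation step shown above fans the grid search out over a process pool, one y offset per task, with the fixed arguments bound via functools.partial. A stripped-down sketch of the pattern (toy scoring function; names are illustrative, not the PopPUNK API):

    from functools import partial
    from multiprocessing import Pool

    def score_row(y_idx, x_range, y_range):
        # stand-in for building and scoring a network at each grid point
        return [-(x * y_range[y_idx]) for x in x_range]

    if __name__ == '__main__':
        x_range = [0.1, 0.2, 0.3]
        y_range = [0.1, 0.2, 0.3]
        with Pool(processes=2) as pool:
            grid = pool.map(partial(score_row, x_range=x_range, y_range=y_range),
                            range(len(y_range)))
        print(grid)  # one row of scores per y offset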
@@ -219,12 +226,17 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ # At first offset, make a new network, otherwise just add the new edges if prev_idx == 0: G = constructNetwork(sample_names, sample_names, edge_list, -1, - summarise=False, edge_list=True) + summarise=False, edge_list=True, use_gpu = use_gpu) else: - G.add_edge_list(edge_list) + if use_gpu: + G = constructNetwork(sample_names, sample_names, edge_list, -1, + summarise=False, edge_list=True, use_gpu = use_gpu) + else: + # Not currently possible with GPU - https://github.com/rapidsai/cugraph/issues/805 + G.add_edge_list(edge_list) # Add score into vector for any offsets passed (should usually just be one) for s in range(prev_idx, idx): - scores.append(-networkSummary(G, score_idx > 0)[1][score_idx]) + scores.append(-networkSummary(G, score_idx > 0, use_gpu = use_gpu)[1][score_idx]) pbar.update(1) prev_idx = idx edge_list = [] @@ -233,18 +245,23 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ # Add score for final offset(s) at end of loop if prev_idx == 0: G = constructNetwork(sample_names, sample_names, edge_list, -1, - summarise=False, edge_list=True) + summarise=False, edge_list=True, use_gpu = use_gpu) else: - G.add_edge_list(edge_list) + if use_gpu: + G = constructNetwork(sample_names, sample_names, edge_list, -1, + summarise=False, edge_list=True, use_gpu = use_gpu) + else: + # Not currently possible with GPU - https://github.com/rapidsai/cugraph/issues/805 + G.add_edge_list(edge_list) for s in range(prev_idx, len(s_range)): - scores.append(-networkSummary(G, score_idx > 0)[1][score_idx]) + scores.append(-networkSummary(G, score_idx > 0, use_gpu = use_gpu)[1][score_idx]) pbar.update(1) return(scores) def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, - slope=2, score_idx=0, cpus=1): + slope=2, score_idx=0, cpus=1, use_gpu = False): """Wrapper function for :func:`~PopPUNK.network.constructNetwork` which is called by optimisation functions moving a triangular decision boundary. @@ -273,6 +290,9 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, [default = 0] cpus (int) Number of CPUs to use for calculating assignment + use_gpu (bool) + Whether to use cugraph for graph analysis + Returns: score (float) -1 * network score. Where network score is from :func:`~PopPUNK.network.networkSummary` @@ -294,13 +314,14 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, # Make network boundary_assignments = poppunk_refine.assignThreshold(distMat, slope, x_max, y_max, cpus) - G = constructNetwork(sample_names, sample_names, boundary_assignments, -1, summarise = False) + G = constructNetwork(sample_names, sample_names, boundary_assignments, -1, summarise = False, + use_gpu = use_gpu) # Return score - score = networkSummary(G, score_idx > 0)[1][score_idx] + score = networkSummary(G, score_idx > 0, use_gpu = use_gpu)[1][score_idx] return(-score) -def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0): +def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, use_gpu = use_gpu): """Wrapper function for thresholdIterate2D and :func:`growNetwork`.
For a given y_max, constructs networks across x_range and returns a list @@ -320,6 +341,9 @@ def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0): score_idx (int) Index of score from :func:`~PopPUNK.network.networkSummary` to use [default = 0] + use_gpu (bool) + Whether to use cugraph for graph analysis + Returns: scores (list) -1 * network score for each of x_range. @@ -334,7 +358,7 @@ def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0): y_max = y_range[y_idx] i_vec, j_vec, idx_vec = \ poppunk_refine.thresholdIterate2D(distMat, x_range, y_max) - scores = growNetwork(sample_names, i_vec, j_vec, idx_vec, x_range, score_idx, y_idx) + scores = growNetwork(sample_names, i_vec, j_vec, idx_vec, x_range, score_idx, y_idx, use_gpu = use_gpu) return(scores) def readManualStart(startFile): From 2e1c802791a26678f8cb2272439cb96cb3d16852 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 06:57:11 +0000 Subject: [PATCH 171/327] Change kwarg to arg in optimise --- PopPUNK/refine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 1baf2ff3..fe873317 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -165,7 +165,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, local_s = scipy.optimize.minimize_scalar(newNetwork, bounds=bounds, method='Bounded', options={'disp': True}, - args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx, use_gpu = use_gpu), + args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx, use_gpu), ) optimised_s = local_s.x From 4e68b7baf99f78748ee0a17202c6f56b2aba285e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 06:58:23 +0000 Subject: [PATCH 172/327] Change default arguments --- PopPUNK/refine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index fe873317..2af32f64 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -32,7 +32,7 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, max_move, min_move, slope = 2, score_idx = 0, - unconstrained = False, no_local = False, num_processes = 1, use_gpu = use_gpu): + unconstrained = False, no_local = False, num_processes = 1, use_gpu = False): """Try to refine a fit by maximising a network score based on transitivity and density. Iteratively move the decision boundary to do this, using starting point from existing model. @@ -321,7 +321,7 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, score = networkSummary(G, score_idx > 0, use_gpu = use_gpu)[1][score_idx] return(-score) -def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, use_gpu = use_gpu): +def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, use_gpu = False): """Wrapper function for thresholdIterate2D and :func:`growNetwork`. 
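On the patch 171 fix above: keyword syntax is not valid inside a tuple literal, and `scipy.optimize.minimize_scalar` forwards its `args` tuple to the objective positionally in any case, so the extra options have to be appended as plain values in the objective's parameter order. A self-contained toy, assuming scipy is installed:

    from scipy.optimize import minimize_scalar

    def objective(s, offset, use_gpu):
        # toy objective; the real one builds a network at s and scores it
        return (s - offset) ** 2

    res = minimize_scalar(objective, bounds=(0, 1), method='Bounded',
                          args=(0.25, False))  # extra values passed positionally
    print(round(res.x, 3))  # ~0.25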
For a given y_max, constructs networks across x_range and returns a list From a2ce78919a79d1ecbf793d1ce36ccd6dd948c307 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 07:14:50 +0000 Subject: [PATCH 173/327] Cascade use_gpu argument through functions --- PopPUNK/__main__.py | 3 ++- PopPUNK/refine.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index c9b2dfa0..41e2f198 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -415,7 +415,8 @@ def main(): args.unconstrained, args.score_idx, args.no_local, - args.threads) + args.threads, + use_gpu = args.gpu_graph) new_model.plot(distMat) model = new_model elif args.fit_model == "threshold": diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 2af32f64..8bff91dd 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -165,7 +165,8 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, local_s = scipy.optimize.minimize_scalar(newNetwork, bounds=bounds, method='Bounded', options={'disp': True}, - args = (sample_names, distMat, start_point, mean1, gradient, slope, score_idx, use_gpu), + args = (sample_names, distMat, start_point, mean1, gradient, + slope, score_idx, num_processes, use_gpu), ) optimised_s = local_s.x From 97487851cbd56293d8ecba7135dc7e7e9474a604 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 07:16:02 +0000 Subject: [PATCH 174/327] Change refine arguments --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 41e2f198..3a55283e 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -416,7 +416,7 @@ def main(): args.score_idx, args.no_local, args.threads, - use_gpu = args.gpu_graph) + args.gpu_graph) new_model.plot(distMat) model = new_model elif args.fit_model == "threshold": From e7ff375777a60bc3f14e27716d076bfef3a8508d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 07:21:14 +0000 Subject: [PATCH 175/327] Communicate GPU use --- PopPUNK/models.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 902cf738..5bac8220 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -530,7 +530,7 @@ def __init__(self, outPrefix): self.unconstrained = False def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indiv_refine = False, - unconstrained = False, score_idx = 0, no_local = False, threads = 1): + unconstrained = False, score_idx = 0, no_local = False, threads = 1, use_gpu = False): '''Extends :func:`~ClusterFit.fit` Fits the distances by optimising network score, by calling @@ -553,11 +553,9 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi startFile (str) A file defining an initial fit, rather than one from ``--fit-model``. See documentation for format. - (default = None). indiv_refine (bool) Run refinement for core and accessory distances separately - (default = False). unconstrained (bool) If True, search in 2D and change the slope of the boundary @@ -569,8 +567,10 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi Quicker, but may be less well refined. num_processes (int) Number of threads to use in the global optimisation step. 
- (default = 1) + use_gpu (bool) + Whether to use cugraph for graph analyses + Returns: y (numpy.array) Cluster assignments of samples in X @@ -581,6 +581,14 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi self.min_move = min_move self.unconstrained = unconstrained + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + # Get starting point model.no_scale() if startFile: @@ -618,7 +626,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi refineFit(X/self.scale, sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move, slope = 2, score_idx = score_idx, unconstrained = unconstrained, - no_local = no_local, num_processes = threads) + no_local = no_local, num_processes = threads, use_gpu = use_gpu) self.fitted = True # Try and do a 1D refinement for both core and accessory @@ -631,12 +639,14 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi start_point, self.core_boundary, core_acc, self.min_move, self.max_move = \ refineFit(X/self.scale, sample_names, self.start_s, self.mean0, self.mean1, self.max_move, self.min_move, - slope = 0, score_idx = score_idx, no_local = no_local,num_processes = threads) + slope = 0, score_idx = score_idx, no_local = no_local,num_processes = threads, + use_gpu = use_gpu) # optimise accessory distance boundary start_point, acc_core, self.accessory_boundary, self.min_move, self.max_move = \ refineFit(X/self.scale, sample_names, self.start_s,self.mean0, self.mean1, self.max_move, self.min_move, - slope = 1, score_idx = score_idx, no_local = no_local, num_processes = threads) + slope = 1, score_idx = score_idx, no_local = no_local, num_processes = threads, + use_gpu = use_gpu) self.indiv_fitted = True except RuntimeError as e: sys.stderr.write("Could not separately refine core and accessory boundaries. " From 9eba43877fa7f321f76e8137d5c31a7dc7dedcff Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:08:17 +0000 Subject: [PATCH 176/327] Improve graph reconstruction in refinement --- PopPUNK/refine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 8bff91dd..90840633 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -230,8 +230,10 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ summarise=False, edge_list=True, use_gpu = use_gpu) else: if use_gpu: - G = constructNetwork(sample_names, sample_names, edge_list, -1, - summarise=False, edge_list=True, use_gpu = use_gpu) + G_extra_df = cudf.DataFrame(edge_list, columns =['source', 'destination']) + G_df = cudf.concat([G.view_edge_list(),G_extra_df], ignore_index = True) + G = cugraph.Graph() + G.from_cudf_edgelist(G_df) else: # Not currently possible with GPU - https://github.com/rapidsai/cugraph/issues/805 G.add_edge_list(edge_list) From ca2992b62796885979509ddbb1f5faa2e81e306c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:09:50 +0000 Subject: [PATCH 177/327] Load CUDA libraries --- PopPUNK/refine.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 90840633..7453e80d 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -214,6 +214,15 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ -1 * network score for each of x_range. 
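Patch 176 above works around cugraph graphs being immutable once built (https://github.com/rapidsai/cugraph/issues/805): the existing edge list is pulled back into cudf, concatenated with the new edges, and a fresh graph is constructed. Isolated, the pattern looks like this (a sketch using the same cudf/cugraph calls as the diff, assuming the RAPIDS API of the era these patches target):

    import cudf
    import cugraph

    def extend_gpu_graph(G, new_edges):
        # rebuild from the concatenated edge list instead of adding in place
        current = G.view_edge_list()
        current.columns = ['source', 'destination']  # names must match for concat
        extra = cudf.DataFrame(new_edges, columns=['source', 'destination'])
        combined = cudf.concat([current, extra], ignore_index=True)
        G_new = cugraph.Graph()
        G_new.from_cudf_edgelist(combined)
        return G_new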
Where network score is from :func:`~PopPUNK.network.networkSummary` """ + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + scores = [] edge_list = [] prev_idx = 0 From 64e6284c1ccf1cf2f46e6d7263442e057fabbc30 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:11:44 +0000 Subject: [PATCH 178/327] Add debug message --- PopPUNK/refine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 7453e80d..f151e94a 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -241,6 +241,7 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ if use_gpu: G_extra_df = cudf.DataFrame(edge_list, columns =['source', 'destination']) G_df = cudf.concat([G.view_edge_list(),G_extra_df], ignore_index = True) + print("DF is " + str(G_df)) G = cugraph.Graph() G.from_cudf_edgelist(G_df) else: From 181a3282b45a320a40d03a0e546d88b31d495519 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:13:04 +0000 Subject: [PATCH 179/327] Fix column names --- PopPUNK/refine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index f151e94a..13055d7d 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -239,9 +239,8 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ summarise=False, edge_list=True, use_gpu = use_gpu) else: if use_gpu: - G_extra_df = cudf.DataFrame(edge_list, columns =['source', 'destination']) + G_extra_df = cudf.DataFrame(edge_list, columns =['src', 'dst']) G_df = cudf.concat([G.view_edge_list(),G_extra_df], ignore_index = True) - print("DF is " + str(G_df)) G = cugraph.Graph() G.from_cudf_edgelist(G_df) else: From 900ed2450b5d61cdf774ed47f0dd871c66df5ab6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:14:49 +0000 Subject: [PATCH 180/327] Make column names consistent --- PopPUNK/refine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 13055d7d..073910d1 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -239,8 +239,10 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ summarise=False, edge_list=True, use_gpu = use_gpu) else: if use_gpu: - G_extra_df = cudf.DataFrame(edge_list, columns =['src', 'dst']) - G_df = cudf.concat([G.view_edge_list(),G_extra_df], ignore_index = True) + G_current_df = G.view_edge_list() + G_current_df.columns = ['source','destination'] + G_extra_df = cudf.DataFrame(edge_list, columns =['source','destination']) + G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) G = cugraph.Graph() G.from_cudf_edgelist(G_df) else: From 5601a2b12973b1c3315c2d42fa3c277ac49e9a24 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 19:34:46 +0000 Subject: [PATCH 181/327] Updating networks with CUDA --- PopPUNK/network.py | 60 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d3d1ab26..f71c977e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -206,7 +206,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df = G.view_edge_list() G_df.columns = ['source','destination'] G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] - # Add self-loop 
if needing + # Add self-loop if needed max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(reference_names)-1 if max_in_df.item() != max_in_vertex_labels: @@ -651,6 +651,8 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, be annotated as an edge attribute threads (int) Number of threads to use if new db created + use_gpu (bool) + Whether to use cugraph for analysis (default = 1) Returns: @@ -738,18 +740,54 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, new_edges.append(edge_tuple) # finish by updating the network - G.add_vertex(len(qList)) - - if weights is not None: - eweight = G.new_ep("float") - G.add_edge_list(new_edges, eprops = [eweight]) - G.edge_properties["weight"] = eweight + if use_gpu: + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + + # construct updated graph + G_current_df = G.view_edge_list() + if weights is not None: + G_current_df.columns = ['source','destination','weights'] + G_extra_df = cudf.DataFrame(edge_list, columns =['source','destination','weights']) + G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) + else: + G_current_df.columns = ['source','destination'] + G_extra_df = cudf.DataFrame(edge_list, columns =['source','destination']) + G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) + G = cugraph.Graph() + G.from_cudf_edgelist(G_df) + + # use self-loop to ensure all nodes are present + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_vertex_labels = ref_count + len(qList) - 1 + if max_in_df.item() != max_in_vertex_labels: + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = [max_in_vertex_labels] + G_self_loop['destination'] = [max_in_vertex_labels] + G = cudf.concat([G,G_self_loop], ignore_index = True) + # Construct graph + G = cugraph.Graph() + G.from_cudf_edgelist(G_df) + else: - G.add_edge_list(new_edges) + G.add_vertex(len(qList)) + + if weights is not None: + eweight = G.new_ep("float") + G.add_edge_list(new_edges, eprops = [eweight]) + G.edge_properties["weight"] = eweight + else: + G.add_edge_list(new_edges) - # including the vertex ID property map - for i, q in enumerate(qList): - G.vp.id[i + len(rList)] = q + # including the vertex ID property map + for i, q in enumerate(qList): + G.vp.id[i + len(rList)] = q return qqDistMat From 15c106af974236b3783fb4d023bb9889adfc876c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 21:21:28 +0000 Subject: [PATCH 182/327] Change betweenness processing --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f71c977e..aef891c1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -606,7 +606,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if len(betweenness) > 1: mean_bt = np.mean(betweenness) weighted_mean_bt = np.average(betweenness, weights=sizes) - else: + elif len(betweenness) == 1: mean_bt = betweenness[0] weighted_mean_bt = betweenness[0] From f950cd8bfbfafba45a130fbf94ff2579c52d4b9e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 22:13:11 +0000 Subject: [PATCH 183/327] Add GPU options to assign --- PopPUNK/assign.py | 44 ++++++++++++++++++++++++++++++-------------- PopPUNK/network.py | 3 ++- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index b1b11236..ae2b0bcf
100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -46,7 +46,11 @@ def assign_query(dbFuncs, accessory_only, web, json_sketch, - save_partial_query_graph): + save_partial_query_graph, + gpu_sketch, + gpu_dist, + gpu_graph, + deviceid): """Code for assign query mode. Written as a separate function so it can be called by web APIs""" @@ -60,6 +64,7 @@ def assign_query(dbFuncs, from .network import extractReferences from .network import addQueryToNetwork from .network import printClusters + from .network import save_network from .plot import writeClusterCsv @@ -133,7 +138,9 @@ def assign_query(dbFuncs, threads, overwrite, codon_phased = codon_phased, - calc_random = False) + calc_random = False, + use_gpu = gpu_sketch, + deviceid = deviceid) # run query qrDistMat = queryDatabase(rNames = rNames, qNames = qNames, @@ -142,7 +149,8 @@ def assign_query(dbFuncs, klist = kmers, self = False, number_plot_fits = plot_fit, - threads = threads) + threads = threads, + use_gpu = gpu_dist) # QC distance matrix qcPass = qcDistMat(qrDistMat, rNames, qNames, max_pi_dist, max_a_dist, reference_isolate) @@ -153,7 +161,8 @@ def assign_query(dbFuncs, rNames, ref_graph = use_ref_graph, core_only = core_only, - accessory_only = accessory_only) + accessory_only = accessory_only, + use_gpu = gpu_graph) if model.type == 'lineage': # Assign lineages by calculating query-query information @@ -165,7 +174,8 @@ def assign_query(dbFuncs, klist = kmers, self = True, number_plot_fits = 0, - threads = threads) + threads = threads, + use_gpu = gpu_dist) model.extend(qqDistMat, qrDistMat) genomeNetwork = {} @@ -182,7 +192,8 @@ def assign_query(dbFuncs, assignment, 0, edge_list = True, - weights=weights) + weights=weights, + use_gpu = gpu_graph) isolateClustering[rank] = \ printClusters(genomeNetwork[rank], @@ -214,7 +225,7 @@ def assign_query(dbFuncs, genomeNetwork, kmers, queryAssignments, model, output, update_db, strand_preserved, - weights = weights, threads = threads) + weights = weights, threads = threads, use_gpu = gpu_graph) isolateClustering = \ {'combined': printClusters(genomeNetwork, rNames + qNames, @@ -237,12 +248,12 @@ def assign_query(dbFuncs, joinDBs(ref_db, output, output, {"threads": threads, "strand_preserved": strand_preserved}) if model.type == 'lineage': - genomeNetwork[min(model.ranks)].save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') + save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) # Save sparse distance matrices and updated model model.outPrefix = os.path.basename(output) model.save() else: - genomeNetwork.save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') + save_network(genomeNetwork, prefix = output, suffix = '_graph', use_gpu = gpu_graph) # Update distance matrices with all calculated distances if distances == None: @@ -289,7 +300,7 @@ def assign_query(dbFuncs, dbOrder = rNames + qNames newRepresentativesIndices, newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ - extractReferences(genomeNetwork, dbOrder, output, rNames, threads = threads) + extractReferences(genomeNetwork, dbOrder, output, rNames, threads = threads, use_gpu = gpu_graph) # intersection that maintains order newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] @@ -303,7 +314,7 @@ def assign_query(dbFuncs, postpruning_combined_seq, newDistMat = \ prune_distance_matrix(combined_seq, names_to_remove, complete_distMat, output + "/" + os.path.basename(output) + ".refs.dists") - 
genomeNetwork.save(output + "/" + os.path.basename(output) + '.refs_graph.gt', fmt = 'gt') + save_network(genomeNetwork, prefix = output, suffix = 'refs_graph', use_gpu = gpu_graph) removeFromDB(output, output, names_to_remove) os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", output + "/" + os.path.basename(output) + ".refs.h5") @@ -314,9 +325,9 @@ def assign_query(dbFuncs, storePickle(rNames, qNames, False, qrDistMat, dists_out) if save_partial_query_graph: if model.type == 'lineage': - genomeNetwork[min(model.ranks)].save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') + save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) else: - genomeNetwork.save(output + "/" + os.path.basename(output) + '_graph.gt', fmt = 'gt') + save_network(genomeNetwork, prefix = output, suffix = '_graph', use_gpu = gpu_graph) return(isolateClustering) @@ -404,6 +415,7 @@ def get_options(): other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]') other.add_argument('--gpu-sketch', default=False, action='store_true', help='Use a GPU when calculating sketches (read data only) [default = False]') other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]') + other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when constructing networks [default = False]') other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') other.add_argument('--version', action='version', version='%(prog)s '+__version__) @@ -508,7 +520,11 @@ def main(): args.accessory_only, web=False, json_sketch=None, - save_partial_query_graph=False) + save_partial_query_graph=False, + args.gpu_sketch, + args.gpu_dist, + arg.gpu_graph, + args.deviceid) sys.stderr.write("\nDone\n") diff --git a/PopPUNK/network.py b/PopPUNK/network.py index aef891c1..011d6a37 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -618,7 +618,8 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, assignments, model, queryDB, queryQuery = False, - strand_preserved = False, weights = None, threads = 1): + strand_preserved = False, weights = None, threads = 1, + use_gpu = False): """Finds edges between queries and items in the reference database, and modifies the network to include them. From a6a037eefcc2a31edaa9dfb4aa4fd412d528d65c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 22:33:38 +0000 Subject: [PATCH 184/327] Change function argument order --- PopPUNK/assign.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index ae2b0bcf..be44325f 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -44,13 +44,13 @@ def assign_query(dbFuncs, external_clustering, core_only, accessory_only, - web, - json_sketch, - save_partial_query_graph, gpu_sketch, gpu_dist, gpu_graph, - deviceid): + deviceid, + web, + json_sketch, + save_partial_query_graph): """Code for assign query mode. 
Written as a separate function so it can be called by web APIs""" @@ -518,13 +518,13 @@ def main(): args.external_clustering, args.core_only, args.accessory_only, - web=False, - json_sketch=None, - save_partial_query_graph=False, args.gpu_sketch, args.gpu_dist, arg.gpu_graph, - args.deviceid) + args.deviceid, + web=False, + json_sketch=None, + save_partial_query_graph=False) sys.stderr.write("\nDone\n") From 2b65e614c0556c63ee950f1b0a948bf7e00e18da Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 13 Mar 2021 22:35:52 +0000 Subject: [PATCH 185/327] Change argument typo --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index be44325f..0eacef19 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -520,7 +520,7 @@ def main(): args.accessory_only, args.gpu_sketch, args.gpu_dist, - arg.gpu_graph, + args.gpu_graph, args.deviceid, web=False, json_sketch=None, From 9cc9ad6344567ee5b0169695268525e6c7e136bb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 20:54:07 +0000 Subject: [PATCH 186/327] Add CUDA load for querying --- PopPUNK/network.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 011d6a37..31eb67cb 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -68,6 +68,15 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, # If a refined fit, may use just core or accessory distances dir_prefix = network_dir + "/" + os.path.basename(network_dir) if use_gpu: + + # load CUDA libraries + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) + graph_suffix = '.csv.bz2' else: graph_suffix = '.gt' From 403506272929cd2bfc08a26dfb2a998388f27928 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 20:59:35 +0000 Subject: [PATCH 187/327] Fix graph suffix for GPU --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 31eb67cb..fb70b7e0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -77,7 +77,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, sys.stderr.write("cugraph and cudf unavailable\n") raise ImportError(e) - graph_suffix = '.csv.bz2' + graph_suffix = '.csv.gz' else: graph_suffix = '.gt' if core_only and model.type == 'refine': From 01dcf3758223d20fcd48bab68530f0548a4d2aff Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:03:17 +0000 Subject: [PATCH 188/327] Quote column name --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index fb70b7e0..03258a09 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -100,7 +100,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, if use_gpu: G_df = cudf.read_csv(network_file, compression = 'gzip') - if weights in G_df.columns: + if 'weights' in G_df.columns: genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: genomeNetwork.from_cudf_edgelist(G_df,renumber=False) From cbeebaf9d9dc494a870d2fabe46a806e1cbad266 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:05:59 +0000 Subject: [PATCH 189/327] Define graph name --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 03258a09..d288f61d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ 
-100,6 +100,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, if use_gpu: G_df = cudf.read_csv(network_file, compression = 'gzip') + genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: From aee03b142af3829c5af98b73e9ee95e2c00b31fe Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:11:04 +0000 Subject: [PATCH 190/327] Change cudf column names on loading --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d288f61d..92e40e2d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -100,6 +100,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, if use_gpu: G_df = cudf.read_csv(network_file, compression = 'gzip') + G_df.columns = ['source','destination'] genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) From a0fce563ef520e5e98de8e3355644ebfa8259dc0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:12:13 +0000 Subject: [PATCH 191/327] Add weights to column names --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 92e40e2d..b80ea4e2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -100,11 +100,12 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, if use_gpu: G_df = cudf.read_csv(network_file, compression = 'gzip') - G_df.columns = ['source','destination'] genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: + G_df.columns = ['source','destination','weights'] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: + G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) else: genomeNetwork = gt.load_graph(network_file) From 43d13a7a6fa7bfa67827aed0f23689582689b416 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:17:17 +0000 Subject: [PATCH 192/327] Print formatted DF --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b80ea4e2..eea657aa 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -105,6 +105,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, G_df.columns = ['source','destination','weights'] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: + print("DF is " + str(G_df)) G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) else: From 2ea95f4f60803c17164090ab2f0306aa266c7d26 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:20:53 +0000 Subject: [PATCH 193/327] Remove Pandas index from CSV --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index eea657aa..4f0b69b3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1135,7 +1135,7 @@ def save_network(G, prefix = None, suffix = None, use_gpu = False): file_name = file_name + suffix if use_gpu: G.to_pandas_edgelist().to_csv(file_name + '.csv.gz', - compression='gzip') + compression='gzip', index = False) else: G.save(file_name + '.gt', fmt = 'gt') From 41868b9a2dc991fdbdc315f00e6ddcedf5e86e6e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:26:05 +0000 Subject: [PATCH 194/327] Update graph loading message --- PopPUNK/network.py 
| 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4f0b69b3..83cb0590 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -105,15 +105,16 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, G_df.columns = ['source','destination','weights'] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: - print("DF is " + str(G_df)) G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) + sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.number_of_vertices()))) + " samples\n") else: genomeNetwork = gt.load_graph(network_file) - sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") + sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # Ensure all in dists are in final network - networkMissing = set(map(str,set(range(len(refList))).difference(list(genomeNetwork.vertices())))) + vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph)) + networkMissing = set(set(range(len(refList))).difference(vertex_list)) if len(networkMissing) > 0: sys.stderr.write("WARNING: Samples " + ",".join(networkMissing) + " are missing from the final network\n") From 39e80a76134a63d52cf109a0edd0777ad56278fe Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:26:55 +0000 Subject: [PATCH 195/327] Update graph loading message again --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 83cb0590..2c28f8e4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -107,7 +107,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, else: G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) - sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.number_of_vertices()))) + " samples\n") + sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") else: genomeNetwork = gt.load_graph(network_file) sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") From c12d08a17cdcd1d398d837248a126a8ccae28e14 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:27:48 +0000 Subject: [PATCH 196/327] Change gpu option --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 2c28f8e4..827db192 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -113,7 +113,7 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # Ensure all in dists are in final network - vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph)) + vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = use_gpu)) networkMissing = set(set(range(len(refList))).difference(vertex_list)) if len(networkMissing) > 0: sys.stderr.write("WARNING: Samples " + ",".join(networkMissing) + " are missing from the final network\n") From 04ee820bb417c14511b10ac0c011bd8d909a4534 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:30:48 +0000 Subject: [PATCH 197/327] Change name of tuples --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 827db192..90d67bae 100644 
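Patches 188-194 above settle on persisting the GPU network as a gzipped CSV edge list. Pulled out of the diffs, the save/load round trip looks like this (the cudf/cugraph calls are the ones used in the patches; weights handling follows patch 191):

    import cudf
    import cugraph

    def save_gpu_network(G, file_name):
        G.to_pandas_edgelist().to_csv(file_name + '.csv.gz',
                                      compression='gzip', index=False)

    def load_gpu_network(file_name):
        G_df = cudf.read_csv(file_name + '.csv.gz', compression='gzip')
        G = cugraph.Graph()
        if 'weights' in G_df.columns:
            G_df.columns = ['source', 'destination', 'weights']
            G.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False)
        else:
            G_df.columns = ['source', 'destination']
            G.from_cudf_edgelist(G_df, renumber=False)
        return G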
--- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -769,11 +769,11 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, G_current_df = G.view_edge_list() if weights is not None: G_current_df.columns = ['source','destination','weights'] - G_extra_df = cudf.DataFrame(edge_list, columns =['source','destination','weights']) + G_extra_df = cudf.DataFrame(new_edges, columns =['source','destination','weights']) G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) else: G_current_df.columns = ['source','destination'] - G_extra_df = cudf.DataFrame(edge_list, columns =['source','destination']) + G_extra_df = cudf.DataFrame(new_edges, columns =['source','destination']) G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) G = cugraph.Graph() G.from_cudf_edgelist(G_df) From 655025b3fba743db7414dfa230f9e36d8b00dd27 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:34:41 +0000 Subject: [PATCH 198/327] Change printClusters to use GPU --- PopPUNK/assign.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 0eacef19..1bcfb678 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -198,7 +198,8 @@ def assign_query(dbFuncs, isolateClustering[rank] = \ printClusters(genomeNetwork[rank], rNames + qNames, - printCSV = False) + printCSV = False, + use_gpu = gpu_graph) overall_lineage = createOverallLineage(model.ranks, isolateClustering) writeClusterCsv( @@ -232,7 +233,8 @@ def assign_query(dbFuncs, output + "/" + os.path.basename(output), old_cluster_file, external_clustering, - write_references or update_db)} + write_references or update_db, + use_gpu = gpu_graph)} # Update DB as requested dists_out = output + "/" + os.path.basename(output) + ".dists" From 50cc3172196929b2bc5e6d20f938240afa7894d7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:38:10 +0000 Subject: [PATCH 199/327] Edit component assignments --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 90d67bae..8442a514 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -862,7 +862,9 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, component_assignments = cugraph.components.connectivity.connected_components(G) component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] + print("Assignments: " + str(component_assignments)) for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment + print("Index: " + str(isolate_index)) component = component_assignments['labels'].iloc[isolate_index].item() component_rank_bool = component_frequencies.index == component component_rank = np.argmax(component_rank_bool.to_array()) From dc9a702de26529789054c9835043cca3608027b0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 21:54:24 +0000 Subject: [PATCH 200/327] Print node count --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 8442a514..d9132e83 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -859,6 +859,7 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, sys.stderr.write("cugraph and cudf unavailable\n") raise ImportError(e) + print("num nodes is " + str(G.number_of_vertices())) component_assignments = cugraph.components.connectivity.connected_components(G) 
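# connected_components here gives back a cudf DataFrame with one row per
# vertex and the component id in its 'labels' column; the
# value_counts(sort = True, ascending = False) call below then orders those
# ids by component size, so the largest component receives the lowest
# cluster number (column names as used by the cugraph version this series
# targets)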
component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] From 067149ca40b7fbae9fff509420640beb50278231 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 22:08:41 +0000 Subject: [PATCH 201/327] Return updated graph from function --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d9132e83..15b947d0 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -804,7 +804,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, for i, q in enumerate(qList): G.vp.id[i + len(rList)] = q - return qqDistMat + return G, qqDistMat def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, externalClusterCSV = None, printRef = True, printCSV = True, From bde7dd8042bd428361e4f054b0f151b3cfbd2d82 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 22:09:48 +0000 Subject: [PATCH 202/327] Update to be consistent with changes to network function --- PopPUNK/assign.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 1bcfb678..e5c92ec6 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -221,7 +221,8 @@ def assign_query(dbFuncs, weights = qrDistMat else: weights = None - qqDistMat = \ + + genomeNetwork, qqDistMat = \ addQueryToNetwork(dbFuncs, rNames, qNames, genomeNetwork, kmers, queryAssignments, model, output, update_db, From 0aa02c631f192dccf8f1470aa6dbe3ad41b02329 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 14 Mar 2021 22:14:55 +0000 Subject: [PATCH 203/327] Remove debug messages --- PopPUNK/network.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 15b947d0..5e008262 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -859,13 +859,10 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, sys.stderr.write("cugraph and cudf unavailable\n") raise ImportError(e) - print("num nodes is " + str(G.number_of_vertices())) component_assignments = cugraph.components.connectivity.connected_components(G) component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) newClusters = [set() for rank in range(component_frequencies.size)] - print("Assignments: " + str(component_assignments)) for isolate_index, isolate_name in enumerate(rlist): # assume sorted at the moment - print("Index: " + str(isolate_index)) component = component_assignments['labels'].iloc[isolate_index].item() component_rank_bool = component_frequencies.index == component component_rank = np.argmax(component_rank_bool.to_array()) From eca9b50f60d5298d2e698cd034fe00e1c3e0206e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 09:49:45 +0000 Subject: [PATCH 204/327] Ensure consistency across function arguments --- PopPUNK/__main__.py | 3 +-- PopPUNK/assign.py | 2 +- PopPUNK/models.py | 13 +++++++------ PopPUNK/network.py | 22 +++++++++++----------- PopPUNK/refine.py | 13 +++++++------ 5 files changed, 27 insertions(+), 26 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 6275af83..90f42aaf 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -419,7 +419,6 @@ def main(): args.unconstrained, args.score_idx, args.no_local, - args.threads, args.gpu_graph) new_model.plot(distMat) model = new_model @@ -545,7 +544,7 @@ def main(): fit_type = 'accessory' 
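# the suffix edit below restores the missing underscore, so the network is
# saved under a '<prefix>_graph' file name; this assumes the same naming
# convention as the '_core_graph' paths built in fetchNetwork earlier in
# this series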
genomeNetwork = indivNetworks['accessory'] - save_network(genomeNetwork, prefix = output, suffix = "graph", use_gpu = args.gpu_graph) + save_network(genomeNetwork, prefix = output, suffix = "_graph", use_gpu = args.gpu_graph) #******************************# #* *# diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 75994f9e..5a1dfe1c 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -125,7 +125,7 @@ def assign_query(dbFuncs, for reference in refFile: rNames.append(reference.rstrip()) else: - if os.path.isfile(distances + ",pkl"): + if os.path.isfile(distances + ".pkl"): rNames = readPickle(distances, enforce_self = True, distances=False)[0] elif update_db: sys.stderr.write("Reference distances missing, cannot use --update-db\n") diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 78f90df4..1dd8997a 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -726,12 +726,13 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi self.unconstrained = unconstrained # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if use_gpu: + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) # Get starting point model.no_scale() diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ff0847a1..1d1945d8 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -585,13 +585,13 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): degree_df = G.degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) transitivity = triangle_count/triad_count - print("Triangle count GPU: " + str(triangle_count) + " Triad count GPU: " + str(triad_count)) +# print("Triangle count GPU: " + str(triangle_count) + " Triad count GPU: " + str(triad_count)) else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] - print("Triangle/triad count CPU: " + str(gt.global_clustering(G, ret_counts = True))) +# print("Triangle/triad count CPU: " + str(gt.global_clustering(G, ret_counts = True))) mean_bt = 0 weighted_mean_bt = 0 @@ -704,14 +704,14 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, else: sys.stderr.write("Calculating all query-query distances\n") addRandom(queryDB, qList, kmers, strand_preserved, threads = threads) - qlist1, qlist2, qqDistMat = queryDatabase(rNames = qList, - qNames = qList, - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads) + qqDistMat = queryDatabase(rNames = qList, + qNames = qList, + dbPrefix = queryDB, + queryPrefix = queryDB, + klist = kmers, + self = True, + number_plot_fits = 0, + threads = threads) queryAssignation = model.assign(qqDistMat) for row_idx, (assignment, (ref, query)) in enumerate(zip(queryAssignation, listDistInts(qList, qList, self = True))): @@ -748,7 +748,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, # identify any links between queries and store in the same links dict # links dict now contains lists of links both to original database and new queries # have to use names and link to query list in order to match to node indices - for row_idx, (assignment, (query1, query2)) in enumerate(zip(queryAssignation, 
iterDistRows(qlist1, qlist2, self = True))): + for row_idx, (assignment, (query1, query2)) in enumerate(zip(queryAssignation, iterDistRows(qList, qListp, self = True))): if assignment == model.within_label: if weights is not None: dist = np.linalg.norm(qqDistMat[row_idx, :]) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 5462f7af..e1b0d505 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -216,12 +216,13 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ """ # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if use_gpu: + try: + import cugraph + import cudf + except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + raise ImportError(e) scores = [] edge_list = [] From d7fb6529ffa80a59244cf9ef5c881f76048a7f11 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 17:18:05 +0000 Subject: [PATCH 205/327] Add omitted sys exit when missing distance file Co-authored-by: John Lees --- PopPUNK/assign.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 5a1dfe1c..b1361aab 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -129,6 +129,7 @@ def assign_query(dbFuncs, rNames = readPickle(distances, enforce_self = True, distances=False)[0] elif update_db: sys.stderr.write("Reference distances missing, cannot use --update-db\n") + sys.exit(1) else: rNames = getSeqsInDb(ref_db + "/" + os.path.basename(ref_db) + ".h5") # construct database From 593b26f07e4d229620304b69199792bbf4eec0d9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 17:18:36 +0000 Subject: [PATCH 206/327] Update file name formatting Co-authored-by: John Lees --- scripts/poppunk_batch_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 4b37af81..c5d01e0c 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -402,7 +402,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_lineages.csv"), os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")) for rank in ranks: - os.rename(os.path.join(output_dir,os.path.basename(output_dir) + "_rank" + str(rank) + "_fit.npz"), + os.rename(os.path.join(output_dir, os.path.basename(output_dir) + "_rank" + str(rank) + "_fit.npz"), os.path.join(args.output,os.path.basename(args.output) + "_rank" + str(rank) + "_fit.npz")) # Merge with epidemiological data if requested From 69724c0324a566dd35e355582136c708e18efb2b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 17:23:23 +0000 Subject: [PATCH 207/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1d1945d8..5660f8fe 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -222,7 +222,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] # Add self-loop if needed max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - max_in_vertex_labels = len(reference_names)-1 + max_in_vertex_labels = len(reference_names) - 1 if max_in_df.item() != max_in_vertex_labels: G_self_loop = cudf.DataFrame() 
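# a cugraph Graph is reconstructed from its edge list alone, so a reference
# vertex with no retained edges would silently be dropped; the self-loop
# assembled here on the highest-numbered vertex keeps the vertex count equal
# to the number of reference samples (the same trick is later factored out
# into the add_self_loop helper in this series)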
G_self_loop['source'] = [max_in_vertex_labels] From 470fec0bc1c115da64439ff5e5a38dbbd9ef92f2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 17:23:56 +0000 Subject: [PATCH 208/327] Edit whitespace Co-authored-by: John Lees --- scripts/poppunk_batch_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index c5d01e0c..6a6f8eae 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -403,7 +403,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, os.path.join(args.output,os.path.basename(args.output) + "_lineages.csv")) for rank in ranks: os.rename(os.path.join(output_dir, os.path.basename(output_dir) + "_rank" + str(rank) + "_fit.npz"), - os.path.join(args.output,os.path.basename(args.output) + "_rank" + str(rank) + "_fit.npz")) + os.path.join(args.output, os.path.basename(args.output) + "_rank" + str(rank) + "_fit.npz")) # Merge with epidemiological data if requested if args.info_csv is not None: From 9f600de9a05abc4b9dc28af0b2700266ae63042d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:00:42 +0000 Subject: [PATCH 209/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5660f8fe..43d1e479 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -221,7 +221,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df.columns = ['source','destination'] G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] # Add self-loop if needed - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + max_in_df = np.amax([G_df['source'].max(), G_df['destination'].max()]) max_in_vertex_labels = len(reference_names) - 1 if max_in_df.item() != max_in_vertex_labels: G_self_loop = cudf.DataFrame() From 95702445aa0c4a9d8df5ca10a2bd4bdb8751e61e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:02:12 +0000 Subject: [PATCH 210/327] Fix qList variable name Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 43d1e479..829e4e63 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -748,7 +748,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, # identify any links between queries and store in the same links dict # links dict now contains lists of links both to original database and new queries # have to use names and link to query list in order to match to node indices - for row_idx, (assignment, (query1, query2)) in enumerate(zip(queryAssignation, iterDistRows(qList, qListp, self = True))): + for row_idx, (assignment, (query1, query2)) in enumerate(zip(queryAssignation, iterDistRows(qList, qList, self = True))): if assignment == model.within_label: if weights is not None: dist = np.linalg.norm(qqDistMat[row_idx, :]) From 95fb23516b98f61c81d0875abb99b74e9b440412 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:03:01 +0000 Subject: [PATCH 211/327] Remove debug message Co-authored-by: John Lees --- PopPUNK/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 829e4e63..a830b47d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -591,7 +591,6 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): components 
= len(component_frequencies) density = len(list(G.edges()))/(0.5 * len(list(G.vertices())) * (len(list(G.vertices())) - 1)) transitivity = gt.global_clustering(G)[0] -# print("Triangle/triad count CPU: " + str(gt.global_clustering(G, ret_counts = True))) mean_bt = 0 weighted_mean_bt = 0 From f323ccf391a7eb3ec997b044748ee8bdf1158de2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:09:41 +0000 Subject: [PATCH 212/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index a830b47d..0ea1eba6 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -227,7 +227,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_self_loop = cudf.DataFrame() G_self_loop['source'] = [max_in_vertex_labels] G_self_loop['destination'] = [max_in_vertex_labels] - G_ref_df = cudf.concat([G_ref_df,G_self_loop], ignore_index = True) + G_ref_df = cudf.concat([G_ref_df, G_self_loop], ignore_index = True) # Construct graph G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df) From 5333e881cae37865d09d91d9bd1b29f0a699a185 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:10:23 +0000 Subject: [PATCH 213/327] Remove debug message Co-authored-by: John Lees --- PopPUNK/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0ea1eba6..6ebaa035 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -585,7 +585,6 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): degree_df = G.degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) transitivity = triangle_count/triad_count -# print("Triangle count GPU: " + str(triangle_count) + " Triad count GPU: " + str(triad_count)) else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) From 78a295fb1bc1937a368bede2e3c142e8c736e67a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:12:39 +0000 Subject: [PATCH 214/327] Change comment wording Co-authored-by: John Lees --- PopPUNK/refine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index e1b0d505..6743d8a7 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -247,7 +247,8 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ G = cugraph.Graph() G.from_cudf_edgelist(G_df) else: - # Not currently possible with GPU - https://github.com/rapidsai/cugraph/issues/805 + # Adding edges to network not currently possible with GPU - https://github.com/rapidsai/cugraph/issues/805 + # We add to the cuDF, and then reconstruct the network instead G.add_edge_list(edge_list) # Add score into vector for any offsets passed (should usually just be one) for s in range(prev_idx, idx): @@ -453,4 +454,3 @@ def likelihoodBoundary(s, model, start, end, within, between): X = transformLine(s, start, end).reshape(1, -1) responsibilities = model.assign(X, progress = False, values = True) return(responsibilities[0, within] - responsibilities[0, between]) - From 6eacd4b2be7b6d5afd2a25a5b0d34ab7f773416e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:21:54 +0000 Subject: [PATCH 215/327] Reorder column indices Co-authored-by: John Lees --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 
f9427ff5..fff66adc 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -252,7 +252,7 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): to_prune = [] # First check with numpy, which is quicker than iterating over everything - if np.any(distMat[:,1] > a_max) or np.any(distMat[:,0] > c_max): + if np.any(distMat[:, 0] > c_max) or np.any(distMat[:, 1] > a_max): passed = False names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): From 083722bd9e62a58c35603db2b6bc43700d5e53af Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:25:23 +0000 Subject: [PATCH 216/327] Change column indices Co-authored-by: John Lees --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index fff66adc..e8021e00 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -256,7 +256,7 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): passed = False names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): - if distMat[i,0] > c_max or distMat[i,1] > a_max: + if distMat[i, 0] > c_max or distMat[i, 1] > a_max: sys.stderr.write("WARNING: Outlier at c = " + str(distMat[i,0]) + " a = " + str(distMat[i,1]) + " 1:" + ref + " 2:" + query + "\n") if ref_isolate is not None: From bbdf53becc916b66fc436f1292e373da92ee42d9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:27:45 +0000 Subject: [PATCH 217/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index e8021e00..2beba400 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -257,7 +257,7 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): names = iterDistRows(refList, queryList, refList == queryList) for i, (ref, query) in enumerate(names): if distMat[i, 0] > c_max or distMat[i, 1] > a_max: - sys.stderr.write("WARNING: Outlier at c = " + str(distMat[i,0]) + " a = " + str(distMat[i,1]) + + sys.stderr.write("WARNING: Outlier at c = " + str(distMat[i, 0]) + " a = " + str(distMat[i, 1]) + " 1:" + ref + " 2:" + query + "\n") if ref_isolate is not None: if ref == ref_isolate: From c7daea2c6267bb9dcb5dd0909775fbc86c1dd6ad Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:40:31 +0000 Subject: [PATCH 218/327] Update assign arguments --- PopPUNK/web.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/PopPUNK/web.py b/PopPUNK/web.py index c1f6060f..f7939d1c 100644 --- a/PopPUNK/web.py +++ b/PopPUNK/web.py @@ -74,12 +74,17 @@ def sketchAssign(): args.assign.plot_fit, args.assign.graph_weights, args.assign.max_a_dist, + args.assign.max_pi_dist, args.assign.model_dir, args.assign.strand_preserved, args.assign.previous_clustering, args.assign.external_clustering, args.assign.core_only, args.assign.accessory_only, + args.assign.gpu_sketch, + args.assign.gpu_dist, + args.assign.gpu_graph, + args.assign.deviceid, args.assign.web, sketch_dict["sketch"], args.assign.save_partial_query_graph) @@ -323,4 +328,4 @@ def main(): scheduler.init_app(app) scheduler.start() atexit.register(lambda: scheduler.shutdown()) - app.run(debug=False,use_reloader=False) \ No newline at end of file + app.run(debug=False,use_reloader=False) From e18ed35ad03f3c67fe362c07c26a115a22cb3910 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 21:56:06 +0000 
Subject: [PATCH 219/327] Make CUDA library imports global --- PopPUNK/models.py | 19 ++++++++++++------- PopPUNK/network.py | 23 +++++++++++++++-------- PopPUNK/refine.py | 19 ++++++++++++------- PopPUNK/sparse_mst.py | 20 +++++++++++++------- 4 files changed, 52 insertions(+), 29 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 1dd8997a..ac46e40e 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -33,6 +33,15 @@ sys.stderr.write("This version of PopPUNK requires python v3.8 or higher\n") sys.exit(0) +# GPU support +try: + import cugraph + import cudf + gpu_lib = True +except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + gpu_lib = False + import pp_sketchlib import poppunk_refine @@ -726,13 +735,9 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi self.unconstrained = unconstrained # load CUDA libraries - if use_gpu: - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if use_gpu and not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) # Get starting point model.no_scale() diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6ebaa035..9c864b69 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -22,6 +22,15 @@ import graph_tool.all as gt import dendropy +# GPU support +try: + import cugraph + import cudf + gpu_lib = True +except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + gpu_lib = False + from .__main__ import accepted_weights_types from .sketchlib import addRandom @@ -67,19 +76,17 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, """ # If a refined fit, may use just core or accessory distances dir_prefix = network_dir + "/" + os.path.basename(network_dir) - if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + # load CUDA libraries + if use_gpu and not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) + if use_gpu: graph_suffix = '.csv.gz' else: graph_suffix = '.gt' + if core_only and model.type == 'refine': model.slope = 0 network_file = dir_prefix + '_core_graph' + graph_suffix diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 6743d8a7..2095e82e 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -24,6 +24,15 @@ import poppunk_refine import graph_tool.all as gt +# GPU support +try: + import cugraph + import cudf + gpu_lib = True +except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") + gpu_lib = False + from .network import constructNetwork from .network import networkSummary @@ -216,13 +225,9 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ """ # load CUDA libraries - if use_gpu: - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if use_gpu and not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) scores = [] edge_list = [] diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 34b47763..adfce4d2 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -12,6 +12,15 @@ import pandas as pd from scipy import sparse +# GPU support +try: + import cugraph + import cudf + gpu_lib = True +except ImportError as e: + sys.stderr.write("cugraph and cudf 
unavailable\n") + gpu_lib = False + # import poppunk package from .__init__ import __version__ @@ -61,13 +70,10 @@ def main(): args = get_options() import graph_tool.all as gt - try: - import cugraph - import cudf - except ImportError as e: - if args.gpu_graph: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + # load CUDA libraries + if use_gpu and not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) # Read in sample names if (args.distance_pkl is not None) ^ (args.previous_clustering is not None): From fcf285878d94b545a10453aded5f7e2719b6c119 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 22:02:35 +0000 Subject: [PATCH 220/327] Only import GPU libraries once --- PopPUNK/network.py | 50 ++++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9c864b69..cda818fb 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -203,13 +203,9 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) # For large network, use more approximate method for extracting references reference = {} @@ -489,13 +485,9 @@ def constructNetwork(rlist, qlist, assignments, within_label, # load GPU libraries if necessary if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) # create DataFrame using edge tuples if weights is not None or sparse_input is not None: @@ -576,13 +568,9 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): """ if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) component_assignments = cugraph.components.connectivity.connected_components(G) component_nums = component_assignments['labels'].unique().astype(int) @@ -765,13 +753,9 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, # finish by updating the network if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) # construct updated graph G_current_df = G.view_edge_list() @@ -859,13 +843,9 @@ def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, # get a sorted list of component assignments if use_gpu: - # load CUDA libraries - try: - import cugraph - import cudf - except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") - raise ImportError(e) + if not gpu_lib: + sys.stderr.write('Unable to load GPU libraries; exiting\n') + sys.exit(1) component_assignments = cugraph.components.connectivity.connected_components(G) component_frequencies = component_assignments['labels'].value_counts(sort = True, ascending = False) From 090994f1969fe6edc08c43b7b028d19d129e7538 Mon Sep 
17 00:00:00 2001 From: nickjcroucher Date: Mon, 15 Mar 2021 22:12:55 +0000 Subject: [PATCH 221/327] Add reference isolate to assign command --- PopPUNK/web.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/web.py b/PopPUNK/web.py index f7939d1c..3a45967c 100644 --- a/PopPUNK/web.py +++ b/PopPUNK/web.py @@ -75,6 +75,7 @@ def sketchAssign(): args.assign.graph_weights, args.assign.max_a_dist, args.assign.max_pi_dist, + args.assign.reference_isolate, args.assign.model_dir, args.assign.strand_preserved, args.assign.previous_clustering, From 89f97ce6ab183ddc9f08dff120757787f73d49f9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 06:12:18 +0000 Subject: [PATCH 222/327] Changes to command line phrasing Co-authored-by: John Lees --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 90f42aaf..5d597923 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -96,7 +96,7 @@ def get_options(): default = 0.5, type = float) qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', default = 0.5, type = float) - qcGroup.add_argument('--reference-isolate', help='Isolate from which distances can be calculated for pruning [default = None]', + qcGroup.add_argument('--reference-isolate', help='Isolate from which distances will be calculated for pruning [default = None]', default = None, type = str) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = 5, type = int) From 059d804a1b02846eae87787ca24c5087503e0e40 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 06:16:14 +0000 Subject: [PATCH 223/327] Change GPU library loading --- PopPUNK/sparse_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index adfce4d2..5678e60d 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -71,7 +71,7 @@ def main(): import graph_tool.all as gt # load CUDA libraries - if use_gpu and not gpu_lib: + if args.gpu_graph and not gpu_lib: sys.stderr.write('Unable to load GPU libraries; exiting\n') sys.exit(1) From 7f110c8f6a1cb9a645929824c3ea4a96842dd655 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 06:31:28 +0000 Subject: [PATCH 224/327] Update web test --- test/test-web.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test-web.py b/test/test-web.py index dd0f7390..56f47bf5 100644 --- a/test/test-web.py +++ b/test/test-web.py @@ -38,12 +38,18 @@ args.assign.plot_fit, args.assign.graph_weights, args.assign.max_a_dist, + args.assign.max_pi_dist, + args.assign.reference_isolate, args.assign.model_dir, args.assign.strand_preserved, args.assign.previous_clustering, args.assign.external_clustering, args.assign.core_only, args.assign.accessory_only, + args.assign.gpu_sketch, + args.assign.gpu_dist, + args.assign.gpu_graph, + args.assign.deviceid, args.assign.web, sketch, args.assign.save_partial_query_graph) From a87c94db07b611be45b7b445d26899bdd282e0c0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 11:27:33 +0000 Subject: [PATCH 225/327] Change distance QC routine --- PopPUNK/__main__.py | 63 ++++++++++++++++-------------- PopPUNK/utils.py | 95 +++++++++++++++++++++++++++++++-------------- 2 files changed, 100 insertions(+), 58 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 5d597923..a350f5f5 
100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -230,7 +230,10 @@ def main(): 'length_sigma': args.length_sigma, 'length_range': args.length_range, 'prop_n': args.prop_n, - 'upper_n': args.upper_n + 'upper_n': args.upper_n, + 'max_pi_dist': args.max_pi_dist, + 'max_a_dist': args.max_a_dist, + 'reference_isolate': args.reference_isolate } # Dict of DB access functions @@ -285,7 +288,7 @@ def main(): sys.stderr.write("--create-db requires --r-files and --output") sys.exit(1) - # generate sketches and QC sequences + # generate sketches and QC sequences to identify sequences not matching specified criteria createDatabaseDir(args.output, kmers) seq_names_passing = \ constructDatabase( @@ -298,6 +301,7 @@ def main(): codon_phased = args.codon_phased, calc_random = True) + # calculate distances between sequences distMat = queryDatabase(rNames = seq_names_passing, qNames = seq_names_passing, dbPrefix = args.output, @@ -306,35 +310,36 @@ def main(): self = True, number_plot_fits = args.plot_fit, threads = args.threads) - names_to_remove = qcDistMat(distMat, + + # QC pairwise distances to identify long distances indicative of anomalous sequences in the collection + seq_names_passing, distMat = qcDistMat(distMat, seq_names_passing, seq_names_passing, - args.max_pi_dist, - args.max_a_dist, - args.reference_isolate) - - # prune based on distance from reference if provided - if args.reference_isolate is not None and len(names_to_remove) > 0 and args.qc_filter == "prune": - # Remove sketches - db_name = args.output + '/' + os.path.basename(args.output) + '.h5' - filtered_db_name = args.output + '/' + 'filtered.' + os.path.basename(args.output) + '.h5' - removeFromDB(db_name, - filtered_db_name, - names_to_remove, - full_names = True) - os.rename(filtered_db_name, db_name) - # Remove from distance matrix - prune_distance_matrix(seq_names_passing, - names_to_remove, - distMat, - args.output + "/" + os.path.basename(args.output) + ".dists") - # Remove from reflist - seq_names_passing = [seq_names_passing.remove(x) for x in names_to_remove] - sys.stderr.write("Successfully removed from the database: " + str(names_to_remove)) - else: - # Save results - dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" - storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) + args.output, + qc_dict) + +# # prune based on distance from reference if provided +# if args.reference_isolate is not None and len(names_to_remove) > 0 and args.qc_filter == "prune": +# # Remove sketches +# db_name = args.output + '/' + os.path.basename(args.output) + '.h5' +# filtered_db_name = args.output + '/' + 'filtered.' 
+ os.path.basename(args.output) + '.h5' +# removeFromDB(db_name, +# filtered_db_name, +# names_to_remove, +# full_names = True) +# os.rename(filtered_db_name, db_name) +# # Remove from distance matrix +# prune_distance_matrix(seq_names_passing, +# names_to_remove, +# distMat, +# args.output + "/" + os.path.basename(args.output) + ".dists") +# # Remove from reflist +# seq_names_passing = [seq_names_passing.remove(x) for x in names_to_remove] +# sys.stderr.write("Successfully removed from the database: " + str(names_to_remove)) +# else: +# # Save results +# dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" +# storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) # Plot results plot_scatter(distMat, diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 2beba400..5f82d4e4 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -226,9 +226,8 @@ def listDistInts(refSeqs, querySeqs, self=True): return comparisons -def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): - """Checks distance matrix for outliers. At the moment - just a threshold for accessory distance +def qcDistMat(distMat, refList, queryList, prefix, qc_dict): + """Checks distance matrix for outliers. Args: distMat (np.array) @@ -237,38 +236,76 @@ def qcDistMat(distMat, refList, queryList, c_max, a_max, ref_isolate = None): Reference labels queryList (list) Query labels (or refList if self) - c_max (float) - Maximum core distance to allow - a_max (float) - Maximum accessory distance to allow - ref_isolate (str) - Name of reference from which pruning can occur - + prefix (list) + Prefix for output files + qc_dict (dict) + Dict of QC options + Returns: - passed (bool) - False if any samples failed + seq_names_passing (list) + List of isolates passing QC distance filters + distMat ([n,2] numpy ndarray) + Filtered long form distance matrix """ - passed = True + + # avoid circular import + from .prune_db import prune_distance_matrix + from .sketchlib import removeFromDB + to_prune = [] # First check with numpy, which is quicker than iterating over everything - if np.any(distMat[:, 0] > c_max) or np.any(distMat[:, 1] > a_max): - passed = False - names = iterDistRows(refList, queryList, refList == queryList) - for i, (ref, query) in enumerate(names): - if distMat[i, 0] > c_max or distMat[i, 1] > a_max: - sys.stderr.write("WARNING: Outlier at c = " + str(distMat[i, 0]) + " a = " + str(distMat[i, 1]) + - " 1:" + ref + " 2:" + query + "\n") - if ref_isolate is not None: - if ref == ref_isolate: - to_prune.append(query) - elif query == ref_isolate: - to_prune.append(ref) - - if ref_isolate is None: - return passed + long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist() + if len(long_distance_rows) > 0: + names = list(iterDistRows(refList, queryList, refList == queryList)) + # Prune sequences based on reference sequence + if qc_dict['reference_isolate'] is not None: + for i in long_distance_rows: + if names[i][0] == qc_dict['reference_isolate']: + to_prune.append(names[i][1]) + elif names[i][1] == qc_dict['reference_isolate']: + to_prune.append(names[i][0]) + else: + anomalous_isolates = set() + for i in long_distance_rows: + anomalous_isolates.add(names[i][0]) + anomalous_isolates.add(names[i][1]) + to_prune = list(anomalous_isolates) + + # Create overall list of sequences + if refList == refList: + seq_names_passing = refList else: - return to_prune + seq_names_passing = refList + queryList + + # 
prune based on distance from reference if provided + if qc_dict['qc_filter'] == 'stop': + if len(to_prune) > 0: + sys.stderr.write('Outlier distances exceed QC thresholds; prune sequences or raise thresholds\n') + sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') + sys.exit(1) + elif qc_dict['qc_filter'] == 'prune' and len(to_prune) > 0: + if qc_dict['reference_isolate'] is None: + sys.stderr.write('Distances exceeded QC thresholds but no reference isolate supplied\n') + sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') + else: + # Remove sketches + db_name = prefix + '/' + os.path.basename(prefix) + '.h5' + filtered_db_name = prefix + '/' + 'filtered.' + os.path.basename(prefix) + '.h5' + removeFromDB(db_name, + filtered_db_name, + to_prune, + full_names = True) + os.rename(filtered_db_name, db_name) + # Remove from distance matrix + seq_names_passing, distMat = prune_distance_matrix(seq_names_passing, + to_prune, + distMat, + prefix + "/" + os.path.basename(prefix) + ".dists") + # Remove from reflist + sys.stderr.write('Successfully pruned from the database: ' + ';'.join(to_prune)) + + return seq_names_passing, distMat def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): From 46a9a9abbb02b2cad27e4664500336f9c51d796c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 13:45:37 +0000 Subject: [PATCH 226/327] Update distance QC functions --- PopPUNK/__main__.py | 28 +++------------------------- PopPUNK/assign.py | 11 ++++++++--- PopPUNK/utils.py | 2 +- PopPUNK/web.py | 1 + 4 files changed, 13 insertions(+), 29 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index a350f5f5..36dfc85a 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -318,29 +318,6 @@ def main(): args.output, qc_dict) -# # prune based on distance from reference if provided -# if args.reference_isolate is not None and len(names_to_remove) > 0 and args.qc_filter == "prune": -# # Remove sketches -# db_name = args.output + '/' + os.path.basename(args.output) + '.h5' -# filtered_db_name = args.output + '/' + 'filtered.' 
+ os.path.basename(args.output) + '.h5' -# removeFromDB(db_name, -# filtered_db_name, -# names_to_remove, -# full_names = True) -# os.rename(filtered_db_name, db_name) -# # Remove from distance matrix -# prune_distance_matrix(seq_names_passing, -# names_to_remove, -# distMat, -# args.output + "/" + os.path.basename(args.output) + ".dists") -# # Remove from reflist -# seq_names_passing = [seq_names_passing.remove(x) for x in names_to_remove] -# sys.stderr.write("Successfully removed from the database: " + str(names_to_remove)) -# else: -# # Save results -# dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" -# storePickle(seq_names_passing, seq_names_passing, True, distMat, dists_out) - # Plot results plot_scatter(distMat, args.output + "/" + os.path.basename(args.output) + "_distanceDistribution", @@ -390,8 +367,9 @@ def main(): # Load the distances refList, queryList, self, distMat = readPickle(distances, enforce_self=True) - if qcDistMat(distMat, refList, queryList, args.max_pi_dist, args.max_a_dist) == False \ - and args.qc_filter == "stop": + seq_names = set(set(refList) | set(queryList)) + seq_names_passing, distMat = qcDistMat(distMat, refList, queryList, args.output, qc_dict) + if length(set(seq_names_passing).difference(seq_names)) > 0 and args.qc_filter == "stop": sys.stderr.write("Distances failed quality control (change QC options to run anyway)\n") sys.exit(1) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index b1361aab..6ad90613 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -28,6 +28,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, + qc_dict, update_db, write_references, distances, @@ -159,7 +160,7 @@ def assign_query(dbFuncs, threads = threads, use_gpu = gpu_dist) # QC distance matrix - qcPass = qcDistMat(qrDistMat, rNames, qNames, max_pi_dist, max_a_dist, reference_isolate) + seq_names_passing, distMat = qcDistMat(qrDistMat, rNames, qNames, output, qc_dict) # Load the network based on supplied options genomeNetwork, old_cluster_file = \ @@ -443,7 +444,7 @@ def main(): # Dict of QC options for passing to database construction and querying functions if args.length_sigma is None and None in args.length_range and args.prop_n is None \ - and args.upper_n is None: + and args.upper_n is None and args.max_a_dist is None and args.max_pi_dist is None: qc_dict = {'run_qc': False } else: # define defaults if one QC parameter given @@ -468,7 +469,10 @@ def main(): 'length_sigma': length_sigma, 'length_range': args.length_range, 'prop_n': prop_n, - 'upper_n': args.upper_n + 'upper_n': args.upper_n, + 'max_pi_dist': args.max_pi_dist, + 'max_a_dist': args.max_a_dist, + 'reference_isolate': args.reference_isolate } # Dict of DB access functions for assign_query (which is out of scope) @@ -497,6 +501,7 @@ def main(): args.db, args.query, args.output, + qc_dict, args.update_db, args.write_references, distances, diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 5f82d4e4..079e5fa0 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -303,7 +303,7 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): distMat, prefix + "/" + os.path.basename(prefix) + ".dists") # Remove from reflist - sys.stderr.write('Successfully pruned from the database: ' + ';'.join(to_prune)) + sys.stderr.write('Pruned from the database after failing distance QC: ' + ';'.join(to_prune)) return seq_names_passing, distMat diff --git a/PopPUNK/web.py b/PopPUNK/web.py index 3a45967c..a8ed3a7e 100644 --- a/PopPUNK/web.py +++ b/PopPUNK/web.py @@ -66,6 +66,7 @@ def 
sketchAssign(): args.assign.ref_db, args.assign.q_files, outdir, + qc_dict, args.assign.update_db, args.assign.write_references, args.assign.distances, From 40a6e4f49b956c7c5b2ea4a944ded9d0a045be85 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 14:33:04 +0000 Subject: [PATCH 227/327] Select reference isolate where not supplied --- PopPUNK/sketchlib.py | 40 ++++++++++++++++++++++++++++++++++++++++ PopPUNK/utils.py | 36 ++++++++++++++++++------------------ 2 files changed, 58 insertions(+), 18 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 60f42a54..d00f75f4 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -572,6 +572,46 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num return distMat +def pickReferenceIsolate(prefix, names): + """Selects a reference isolate as that with a minimal proportion + of missing data. + + Args: + prefix (str) + Prefix of output files + names (list) + Names of samples to QC + + Returns: + reference_isolate (str) + Name of isolate selected as reference + """ + # open databases + db_name = prefix + '/' + os.path.basename(prefix) + '.h5' + hdf_in = h5py.File(db_name, 'r+') + + min_prop_n = 1.0 + reference_isolate = None + + try: + # process data structures + read_grp = hdf_in['sketches'] + # iterate through sketches + for dataset in read_grp: + if hdf_in['sketches'][dataset].attrs['missing_bases']/hdf_in['sketches'][dataset].attrs['length'] < min_prop_n: + min_prop_n = hdf_in['sketches'][dataset].attrs['missing_bases']/hdf_in['sketches'][dataset].attrs['length'] + reference_isolate = dataset + if min_prop_n == 0.0: + break + # if failure still close files to avoid corruption + except: + hdf_in.close() + sys.stderr.write('Problem processing h5 databases during QC - aborting\n') + print("Unexpected error:", sys.exc_info()[0], file = sys.stderr) + raise + + return reference_isolate + def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads): """Calculates random match probability based on means of genomes in assemblyList, and looks for length outliers. 
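The selection in pickReferenceIsolate above amounts to an argmin over the proportion of missing bases per sketch, with an early exit once a sketch with no missing bases is found. A minimal pure-Python sketch of the same logic, with toy values standing in for the 'missing_bases' and 'length' attributes read from the .h5 database:

sketch_stats = {'sample1': (120, 2000000),   # toy (missing_bases, length) pairs
                'sample2': (0, 2100000),
                'sample3': (60, 1900000)}
min_prop_n = 1.0
reference_isolate = None
for name, (missing_bases, length) in sketch_stats.items():
    prop_n = missing_bases / length
    if prop_n < min_prop_n:
        min_prop_n = prop_n
        reference_isolate = name
    if min_prop_n == 0.0:   # no missing bases, cannot do better
        break
# reference_isolate is 'sample2' for this toy input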
diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 079e5fa0..4f4c0dab 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -251,32 +251,32 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): # avoid circular import from .prune_db import prune_distance_matrix from .sketchlib import removeFromDB + from .sketchlib import pickReferenceIsolate + # Create overall list of sequences + if refList == refList: + seq_names_passing = refList + else: + seq_names_passing = refList + queryList + + # Sequences to remove to_prune = [] + # Pick reference isolate if not supplied + if qc_dict['reference_isolate'] is None: + qc_dict['reference_isolate'] = pickReferenceIsolate(prefix, seq_names_passing) + sys.stderr.write('Selected reference isolate is ' + qc_dict['reference_isolate'] + '\n') + # First check with numpy, which is quicker than iterating over everything long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist() if len(long_distance_rows) > 0: names = list(iterDistRows(refList, queryList, refList == queryList)) # Prune sequences based on reference sequence - if qc_dict['reference_isolate'] is not None: - for i in long_distance_rows: - if names[i][0] == qc_dict['reference_isolate']: - to_prune.append(names[i][1]) - elif names[i][1] == qc_dict['reference_isolate']: - to_prune.append(names[i][0]) - else: - anomalous_isolates = set() - for i in long_distance_rows: - anomalous_isolates.add(names[i][0]) - anomalous_isolates.add(names[i][1]) - to_prune = list(anomalous_isolates) - - # Create overall list of sequences - if refList == refList: - seq_names_passing = refList - else: - seq_names_passing = refList + queryList + for i in long_distance_rows: + if names[i][0] == qc_dict['reference_isolate']: + to_prune.append(names[i][1]) + elif names[i][1] == qc_dict['reference_isolate']: + to_prune.append(names[i][0]) # prune based on distance from reference if provided if qc_dict['qc_filter'] == 'stop': From 845acb083d408bce4da270c13d0a94cb3b05e3f2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 14:36:10 +0000 Subject: [PATCH 228/327] Change missing nodes to error --- PopPUNK/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 36dfc85a..9b892029 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -491,8 +491,9 @@ def main(): networkMissing = set(set(range(len(refList))).difference(vertex_list)) if len(networkMissing) > 0: missing_isolates = [refList[m] for m in networkMissing] - sys.stderr.write("WARNING: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") + sys.stderr.write("ERROR: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") sys.stderr.write("These correspond to indices " + ", ".join(map(str,networkMissing)) + "\n") + sys.exit(1) fit_type = model.type isolateClustering = {fit_type: printClusters(genomeNetwork, From bb2f9e11ded1faf0cf763ba37725195b7e9b75c1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 14:50:10 +0000 Subject: [PATCH 229/327] Use function for checking network vertex count --- PopPUNK/__main__.py | 9 ++------- PopPUNK/network.py | 23 +++++++++++++++++++---- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 9b892029..f2426db5 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -197,6 +197,7 @@ def main(): from .network import printClusters from 
.network import get_vertex_list from .network import save_network + from .network import checkNetworkVertexCount from .plot import writeClusterCsv from .plot import plot_scatter @@ -487,13 +488,7 @@ def main(): genomeNetwork = indivNetworks[min(rank_list)] # Ensure all in dists are in final network - vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = args.gpu_graph)) - networkMissing = set(set(range(len(refList))).difference(vertex_list)) - if len(networkMissing) > 0: - missing_isolates = [refList[m] for m in networkMissing] - sys.stderr.write("ERROR: Samples " + ", ".join(missing_isolates) + " are missing from the final network\n") - sys.stderr.write("These correspond to indices " + ", ".join(map(str,networkMissing)) + "\n") - sys.exit(1) + checkNetworkVertexCount(refList, genomeNetwork, use_gpu) fit_type = model.type isolateClustering = {fit_type: printClusters(genomeNetwork, diff --git a/PopPUNK/network.py b/PopPUNK/network.py index cda818fb..e733d64b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -120,12 +120,27 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # Ensure all in dists are in final network + checkNetworkVertexCount(refList, genomeNetwork, use_gpu) + + return genomeNetwork, cluster_file + +def checkNetworkVertexCount(seq_list, G, use_gpu): + """Checks the number of network vertices matches the number + of sequence names. + + Args: + seq_list (list) + The list of sequence names + G (graph) + The network of sequences + use_gpu (bool) + Whether to use cugraph for graph analyses + """ vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = use_gpu)) - networkMissing = set(set(range(len(refList))).difference(vertex_list)) + networkMissing = set(set(range(len(seq_list))).difference(vertex_list)) if len(networkMissing) > 0: - sys.stderr.write("WARNING: Samples " + ",".join(networkMissing) + " are missing from the final network\n") - - return (genomeNetwork, cluster_file) + sys.stderr.write("ERROR: Samples " + ",".join(networkMissing) + " are missing from the final network\n") + sys.exit(1) def getCliqueRefs(G, reference_indices = set()): """Recursively prune a network of its cliques. 
Returns one vertex from From dbcb4f62a956b3ce73b560964a8cef4cce90104e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 14:56:57 +0000 Subject: [PATCH 230/327] Tidy up obsolete text --- PopPUNK/network.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e733d64b..64f2a170 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -230,7 +230,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u reference_index_df = component_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() - # Order found references as in mash sketch files + # Order found references as in sketchlib database reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) @@ -249,7 +249,6 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Construct graph G_ref = cugraph.Graph() G_ref.from_cudf_edgelist(G_ref_df) - return reference_indices, reference_names, refFileName, G_ref else: @@ -329,7 +328,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Order found references as in mash sketch files reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) - return reference_indices, reference_names, refFileName, G_ref + return reference_indices, reference_names, refFileName, G_ref def writeReferences(refList, outPrefix): """Writes chosen references to file From f61ccd01286cc0ab8cd509326ac0ab62f487a8bc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 19:14:47 +0000 Subject: [PATCH 231/327] Add self_loop function --- PopPUNK/network.py | 55 ++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 64f2a170..13d3d6b2 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -136,7 +136,7 @@ def checkNetworkVertexCount(seq_list, G, use_gpu): use_gpu (bool) Whether to use cugraph for graph analyses """ - vertex_list = set(get_vertex_list(genomeNetwork, use_gpu = use_gpu)) + vertex_list = set(get_vertex_list(G, use_gpu = use_gpu)) networkMissing = set(set(range(len(seq_list))).difference(vertex_list)) if len(networkMissing) > 0: sys.stderr.write("ERROR: Samples " + ",".join(networkMissing) + " are missing from the final network\n") @@ -239,16 +239,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df.columns = ['source','destination'] G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] # Add self-loop if needed - max_in_df = np.amax([G_df['source'].max(), G_df['destination'].max()]) max_in_vertex_labels = len(reference_names) - 1 - if max_in_df.item() != max_in_vertex_labels: - G_self_loop = cudf.DataFrame() - G_self_loop['source'] = [max_in_vertex_labels] - G_self_loop['destination'] = [max_in_vertex_labels] - G_ref_df = cudf.concat([G_ref_df, G_self_loop], ignore_index = True) - # Construct graph - G_ref = cugraph.Graph() - G_ref.from_cudf_edgelist(G_ref_df) + G_ref = add_self_loop(G_ref_df,max_in_vertex_labels) else: @@ -781,20 +773,13 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, G_current_df.columns = ['source','destination'] G_extra_df = cudf.DataFrame(new_edges, columns =['source','destination']) G_df = 
cudf.concat([G_current_df,G_extra_df], ignore_index = True) - G = cugraph.Graph() - G.from_cudf_edgelist(G_df) # use self-loop to ensure all nodes are present - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = ref_count + len(qList) - 1 - if max_in_df.item() != max_in_vertex_labels: - G_self_loop = cudf.DataFrame() - G_self_loop['source'] = [max_in_vertex_labels] - G_self_loop['destination'] = [max_in_vertex_labels] - G = cudf.concat([G,G_self_loop], ignore_index = True) - # Construct graph - G = cugraph.Graph() - G.from_cudf_edgelist(G_df) + include_weights = False + if weights is not None: + include_weights = True + G = add_self_loop(G_df, max_in_vertex_labels, weights = include_weights) else: G.add_vertex(len(qList)) @@ -812,6 +797,34 @@ return G, qqDistMat +def add_self_loop(G_df, seq_num, weights = False): + """Adds self-loop to cugraph graph to ensure all nodes are included in + the graph, even if singletons. + + Args: + G_df (cudf) + cudf data frame containing edge list + seq_num (int) + The maximum vertex label expected in the graph (the number of nodes - 1) + + Returns: + G_new (graph) + Graph with self-loop added to ensure all nodes are included + """ + # use self-loop to ensure all nodes are present + max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) + if max_in_df.item() != seq_num: + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = [seq_num] + G_self_loop['destination'] = [seq_num] + if weights: + G_self_loop['weight'] = 0.0 + G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) + # Construct graph + G_new = cugraph.Graph() + G_new.from_cudf_edgelist(G_df) + return G_new + def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, externalClusterCSV = None, printRef = True, printCSV = True, clustering_type = 'combined', use_gpu = False): From 55507b4b2b6af3f6bffe7bc7bec3262646918311 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 19:19:01 +0000 Subject: [PATCH 232/327] Remove condition on adding edges --- PopPUNK/network.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 13d3d6b2..4ca21ecf 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -478,15 +478,13 @@ def constructNetwork(rlist, qlist, assignments, within_label, weights = True) for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights): edge_tuple = (ref, query, weight) - if ref < query: - connections.append(edge_tuple) + connections.append(edge_tuple) else: extra_sources, extra_targets = load_previous_network(prev_G,rlist, weights = False) for (ref, query) in zip(extra_sources, extra_targets): edge_tuple = (ref, query) - if ref < query: - connections.append(edge_tuple) + connections.append(edge_tuple) # load GPU libraries if necessary if use_gpu: From acbeeea105fe4710bd849b6a8decf1075db66de0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 20:32:44 +0000 Subject: [PATCH 233/327] Add copy function for models --- PopPUNK/assign.py | 3 +-- PopPUNK/models.py | 6 ++++++ PopPUNK/utils.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 6ad90613..6542fc7d 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -288,8 +288,7 @@ def assign_query(dbFuncs, # Copy model if needed if output != model.outPrefix: - model.outPrefix = output - model.save() + model.copy(output) # Clique pruning if model.type != 
'lineage': diff --git a/PopPUNK/models.py b/PopPUNK/models.py index ac46e40e..502575ff 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -261,6 +261,12 @@ def no_scale(self): is done in the scaled space). ''' self.scale = np.array([1, 1], dtype = self.default_dtype) + + def copy(self, prefix): + """Copy the model to a new directory + """ + self.outPrefix = prefix + self.save() class BGMMFit(ClusterFit): diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 4f4c0dab..91a04015 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -303,7 +303,7 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): distMat, prefix + "/" + os.path.basename(prefix) + ".dists") # Remove from reflist - sys.stderr.write('Pruned from the database after failing distance QC: ' + ';'.join(to_prune)) + sys.stderr.write('Pruned from the database after failing distance QC: ' + ';'.join(to_prune) + '\n') return seq_names_passing, distMat From 90144c0103856e4a30d02b575280a204c355f655 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 21:03:44 +0000 Subject: [PATCH 234/327] Fix file and cluster name processing --- PopPUNK/visualise.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 0bc7d8f1..39c7bb35 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -359,9 +359,17 @@ def generate_visualisations(query_db, if not overwrite: existing_tree = load_tree(output, "MST", distances=mst_distances) if existing_tree is None: - # Get a default clustering if none provided - if display_cluster is None: - display_cluster = list(isolateClustering.keys())[0] + # Check selecting clustering type is in CSV + clustering_name = 'Cluster' + if display_cluster != None: + if display_cluster not in isolateClustering.keys(): + clustering_name = list(isolateClustering.keys())[0] + sys.stderr.write('Unable to find clustering column ' + display_cluster + ' in file ' + + prev_clustering + '; instead using ' + clustering_name + '\n') + else: + clustering_name = display_cluster + else: + clustering_name = list(isolateClustering.keys())[0] # Get distance matrix complete_distMat = \ np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), @@ -376,7 +384,7 @@ def generate_visualisations(query_db, weights_type=mst_distances, summarise=False) mst_graph = generate_minimum_spanning_tree(G) - drawMST(mst_graph, output, isolateClustering, display_cluster, overwrite) + drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite) mst_tree = mst_to_phylogeny(mst_graph, isolateNameToLabel(combined_seq)) else: mst_tree = existing_tree @@ -435,7 +443,7 @@ def generate_visualisations(query_db, if cytoscape: sys.stderr.write("Writing cytoscape output\n") - genomeNetwork, cluster_file = fetchNetwork(os.path.dirname(prev_clustering), + genomeNetwork, cluster_file = fetchNetwork(os.path.dirname(graph_dir), model, rlist, False,
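The display-cluster handling added in the patch above is, at heart, a guarded dictionary lookup with a fallback to the first available clustering scheme. A compact sketch of that decision with illustrative inputs (pick_clustering_column is a hypothetical helper, not part of the patch):

    def pick_clustering_column(isolate_clustering, display_cluster=None):
        # use the requested column if present, otherwise the first scheme
        if display_cluster is not None and display_cluster in isolate_clustering:
            return display_cluster
        return next(iter(isolate_clustering))

    clusterings = {"Cluster": {}, "Rank_50_Lineage": {}}
    pick_clustering_column(clusterings, "Rank_5_Lineage")   # -> "Cluster"
    pick_clustering_column(clusterings, "Rank_50_Lineage")  # -> "Rank_50_Lineage"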
From 1b404614a722385b41f14b7dc9e42f8e98a30814 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 16 Mar 2021 21:45:07 +0000 Subject: [PATCH 235/327] Change network loading functions --- PopPUNK/network.py | 36 +++++++++++++++++++++++++++------- PopPUNK/visualise.py | 28 +++++++++------------------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4ca21ecf..c23352e6 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -104,9 +104,34 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, if core_only or accessory_only: sys.stderr.write("Can only do --core-only or --accessory-only fits from " "a refined fit. Using the combined distances.\n") + + # Load network file + genomeNetwork = load_network_file(network_file, use_gpu = use_gpu) + + # Ensure all in dists are in final network + checkNetworkVertexCount(refList, genomeNetwork, use_gpu) + return genomeNetwork, cluster_file + +def load_network_file(fn, use_gpu = False): + """Load the network based on input options + + Returns the network as a graph-tool graph, or as a cugraph + graph if use_gpu is set. + + Args: + fn (str) + Network file name + use_gpu (bool) + Use cugraph library to load graph + + Returns: + genomeNetwork (graph) + The loaded network + """ + # Load the network from the specified file if use_gpu: - G_df = cudf.read_csv(network_file, compression = 'gzip') + G_df = cudf.read_csv(fn, compression = 'gzip') genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: G_df.columns = ['source','destination','weights'] @@ -116,13 +141,10 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, genomeNetwork.from_cudf_edgelist(G_df,renumber=False) sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") else: - genomeNetwork = gt.load_graph(network_file) + genomeNetwork = gt.load_graph(fn) sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") - - # Ensure all in dists are in final network - checkNetworkVertexCount(refList, genomeNetwork, use_gpu) - - return genomeNetwork, cluster_file + + return genomeNetwork def checkNetworkVertexCount(seq_list, G, use_gpu): """Checks the number of network vertices matches the number diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py index 39c7bb35..3369177f 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -62,8 +62,8 @@ def get_options(): 'from poppunk_assign [default = use that in the directory ' 'of the query database]', type = str) - iGroup.add_argument('--use-network', - help='Specify a directory containing a .gt file to use for any graph visualisations', + iGroup.add_argument('--network-file', + help='Specify a file to use for any graph visualisations', type = str) iGroup.add_argument('--display-cluster', help='Column of clustering CSV to use for plotting', @@ -109,6 +109,7 @@ def get_options(): other = parser.add_argument_group('Other options') other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]') other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]') + other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating graphs [default = False]') other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') other.add_argument('--strand-preserved', default=False, action='store_true', help='If distances being calculated, treat strand as known when calculating random ' @@ -149,7 +150,8 @@ def generate_visualisations(query_db, model_dir, previous_clustering, previous_query_clustering, - use_network, + network_file, + gpu_graph, info_csv, rapidnj, tree, @@ -165,6 +167,7 @@ def generate_visualisations(query_db, from .network import constructNetwork from .network import fetchNetwork from .network import generate_minimum_spanning_tree + from .network import load_network_file from .plot import drawMST from .plot import outputsForMicroreact @@ -326,15 +329,6 @@ def 
generate_visualisations(query_db, mode = mode, return_dict = True) - # Set graph location - if use_network is not None: - graph_dir = use_network - if graph_dir != prev_clustering: - sys.stderr.write("WARNING: Loading graph from a different directory to clusters\n") - sys.stderr.write("WARNING: Ensure that they are consistent\n") - else: - graph_dir = prev_clustering - # Join clusters with query clusters if required if not self: if previous_query_clustering is not None: @@ -443,12 +437,7 @@ def generate_visualisations(query_db, if cytoscape: sys.stderr.write("Writing cytoscape output\n") - genomeNetwork, cluster_file = fetchNetwork(os.path.dirname(graph_dir), - model, - rlist, - False, - core_only, - accessory_only) + genomeNetwork = load_network_file(network_file, use_gpu = gpu_graph) outputsForCytoscape(genomeNetwork, mst_graph, isolateClustering, output, info_csv, viz_subset = viz_subset) if model.type == 'lineage': sys.stderr.write("Note: Only support for output of cytoscape graph at lowest rank\n") @@ -478,7 +467,8 @@ def main(): args.model_dir, args.previous_clustering, args.previous_query_clustering, - args.use_network, + args.network_file, + args.gpu_graph, args.info_csv, args.rapidnj, args.tree, From ae777f4ff1831417f563e7ca5dbc8cd4731a7193 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 06:43:54 +0000 Subject: [PATCH 236/327] Update import of old networks to use cugraph --- PopPUNK/network.py | 94 +++++++++++++++++++++++++++++++------------ PopPUNK/sparse_mst.py | 6 ++- 2 files changed, 73 insertions(+), 27 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c23352e6..19f195c4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -365,8 +365,9 @@ def writeReferences(refList, outPrefix): return refFileName -def load_previous_network(prev_G_fn, rlist, weights=False): - """Load previous network with graph-tool, extract the edges to match the +def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, + use_gpu = False): + """Load previous network, extract the edges to match the vertex order specified in rlist, and also return weights if specified. Args: @@ -374,9 +375,14 @@ def load_previous_network(prev_G_fn, rlist, weights=False): Path of file containing existing network. 
rlist (list) List of reference sequence labels in new network + previous_pkl (str) + Path of pkl file containing names of sequences in + previous network weights (bool) Whether to return edge weights (default = False) + use_gpu (bool) + Whether to use cugraph for graph analyses Returns: source_ids (list) @@ -387,20 +393,46 @@ def load_previous_network(prev_G_fn, rlist, weights=False): Weights for each new edge """ # get list for translating node IDs to rlist - prev_G = gt.load_graph(prev_G_fn) - old_ids = prev_G.vp["id"] + prev_G = load_network_file(prev_G_fn, use_gpu = use_gpu) + + # load list of names in previous network + if previous_pkl is not None: + with open(previous_pkl, 'rb') as pickle_file: + old_rlist, old_qlist, self = pickle.load(pickle_file) + if self: + old_ids = old_rlist + else: + old_ids = old_rlist + old_qlist + else: + sys.stderr.write('A pkl file containing the names of the sequences in the previous network is required\n') + sys.exit(1) + + # Get edges as lists of source,destination,weight using original IDs + if use_gpu: + G_df = prev_G.view_edge_list() + if weights: + G_df.columns = ['source','destination','weight'] + edge_weights = G_df['weight'].to_arrow().to_pylist() + else: + G_df.columns = ['source','destination'] + old_source_ids = G_df['source'].to_arrow().to_pylist() + old_target_ids = G_df['destination'].to_arrow().to_pylist() + else: + # get the source and target nodes + old_source_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source") + old_target_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target") + # get the weights + if weights: + edge_weights = list(prev_G.ep['weight']) + + # Update IDs to new versions old_id_indices = [rlist.index(x) for x in old_ids] - # get the source and target nods - source_old_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source") - target_old_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target") # translate to indices - source_ids = [old_id_indices[x] for x in source_old_ids] - target_ids = [old_id_indices[x] for x in target_old_ids] - # convert to ndarray - # get the weights + source_ids = [old_id_indices[x] for x in old_source_ids] + target_ids = [old_id_indices[x] for x in old_target_ids] + + # return values if weights: - edge_weights = list(prev_G.ep['weight']) - # return values return source_ids, target_ids, edge_weights else: return source_ids, target_ids
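The re-indexing step at the end of network_to_edges composes two lookups: an old vertex ID gives an old sequence name, and that name gives its position in the new rlist. A small worked example with made-up names (illustrative data only, not part of the patch):

    old_ids = ["B", "A"]        # names in old-network vertex order: vertex 0 is "B"
    rlist = ["A", "B", "C"]     # vertex order in the new network
    old_id_indices = [rlist.index(x) for x in old_ids]        # [1, 0]

    old_source_ids, old_target_ids = [0], [1]                 # old edge "B" -> "A"
    source_ids = [old_id_indices[x] for x in old_source_ids]  # [1], i.e. "B"
    target_ids = [old_id_indices[x] for x in old_target_ids]  # [0], i.e. "A"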
@@ -408,7 +440,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, def constructNetwork(rlist, qlist, assignments, within_label, summarise = True, edge_list = False, weights = None, weights_type = 'euclidean', sparse_input = None, - previous_network = None, use_gpu = False): + previous_network = None, previous_pkl = None, use_gpu = False): """Construct an unweighted, undirected network without self-loops. Nodes are samples and edges where samples are within the same cluster @@ -440,6 +472,8 @@ previous_network (str) Name of file containing a previous network to be integrated into this new network + previous_pkl (str) + Name of file containing the names of the sequences in the previous_network use_gpu (bool) Whether to use GPUs for network construction @@ -495,18 +529,28 @@ def constructNetwork(rlist, qlist, assignments, within_label, # read previous graph if previous_network is not None: - if weights is not None or sparse_input is not None: - extra_sources, extra_targets, extra_weights = load_previous_network(previous_network,rlist, - weights = True) - for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights): - edge_tuple = (ref, query, weight) - connections.append(edge_tuple) + if previous_pkl is not None: + if weights is not None or sparse_input is not None: + extra_sources, extra_targets, extra_weights = network_to_edges(previous_network, + rlist, + previous_pkl = previous_pkl, + weights = True, + use_gpu = use_gpu) + for (ref, query, weight) in zip(extra_sources, extra_targets, extra_weights): + edge_tuple = (ref, query, weight) + connections.append(edge_tuple) + else: + extra_sources, extra_targets = network_to_edges(previous_network, + rlist, + previous_pkl = previous_pkl, + weights = False, + use_gpu = use_gpu) + for (ref, query) in zip(extra_sources, extra_targets): + edge_tuple = (ref, query) + connections.append(edge_tuple) + else: + sys.stderr.write('A distance pkl corresponding to ' + previous_network + ' is required for loading\n') + sys.exit(1) # load GPU libraries if necessary if use_gpu: diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 5678e60d..24fc2bfa 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -106,9 +106,11 @@ def main(): # Load previous MST if specified if args.previous_mst is not None: print("Previous: " + str(args.previous_mst)) - extra_sources, extra_targets, extra_weights = load_previous_network(args.previous_mst, + extra_sources, extra_targets, extra_weights = network_to_edges(args.previous_mst, rlist, + previous_pkl = args.distance_pkl, weights = True, + use_gpu = use_gpu) sources = np.append(sparse_mat.row, np.asarray(extra_sources)) targets = np.append(sparse_mat.col, np.asarray(extra_targets)) weights = np.append(sparse_mat.data, np.asarray(extra_weights)) From 616c31e295b74edd159d42668ab142c66cb38234 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 08:40:14 +0000 Subject: [PATCH 237/327] Fix processing of distance matrix --- PopPUNK/__main__.py | 6 +++--- PopPUNK/sketchlib.py | 2 +- PopPUNK/utils.py | 14 ++++++++------ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index f2426db5..ce44f443 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -369,8 +369,8 @@ def main(): # Load the distances refList, queryList, self, distMat = readPickle(distances, enforce_self=True) seq_names = set(set(refList) | set(queryList)) - seq_names_passing, distMat = qcDistMat(distMat, refList, queryList, args.output, qc_dict) - if length(set(seq_names_passing).difference(seq_names)) > 0 and args.qc_filter == "stop": + seq_names_passing, distMat = qcDistMat(distMat, refList, 
queryList, output, qc_dict) + if len(set(seq_names_passing).difference(seq_names)) > 0 and args.qc_filter == "stop": sys.stderr.write("Distances failed quality control (change QC options to run anyway)\n") sys.exit(1) @@ -488,7 +488,7 @@ def main(): genomeNetwork = indivNetworks[min(rank_list)] # Ensure all in dists are in final network - checkNetworkVertexCount(refList, genomeNetwork, use_gpu) + checkNetworkVertexCount(refList, genomeNetwork, use_gpu = args.gpu_graph) fit_type = model.type isolateClustering = {fit_type: printClusters(genomeNetwork, diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index d00f75f4..528fc1d2 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -589,7 +589,7 @@ def pickReferenceIsolate(prefix, names): # open databases db_name = prefix + '/' + os.path.basename(prefix) + '.h5' hdf_in = h5py.File(db_name, 'r+') - + min_prop_n = 1.0 reference_isolate = None diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 91a04015..8739bdd2 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -236,7 +236,7 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): Reference labels queryList (list) Query labels (or refList if self) - prefix (list) + prefix (str) Prefix for output files qc_dict (dict) Dict of QC options @@ -279,15 +279,15 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): to_prune.append(names[i][0]) # prune based on distance from reference if provided - if qc_dict['qc_filter'] == 'stop': - if len(to_prune) > 0: - sys.stderr.write('Outlier distances exceed QC thresholds; prune sequences or raise thresholds\n') - sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') - sys.exit(1) + if qc_dict['qc_filter'] == 'stop' and len(to_prune) > 0: + sys.stderr.write('Outlier distances exceed QC thresholds; prune sequences or raise thresholds\n') + sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') + sys.exit(1) elif qc_dict['qc_filter'] == 'prune' and len(to_prune) > 0: if qc_dict['reference_isolate'] is None: sys.stderr.write('Distances exceeded QC thresholds but no reference isolate supplied\n') sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') + sys.exit(1) else: # Remove sketches db_name = prefix + '/' + os.path.basename(prefix) + '.h5' @@ -304,6 +304,8 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): prefix + "/" + os.path.basename(prefix) + ".dists") # Remove from reflist sys.stderr.write('Pruned from the database after failing distance QC: ' + ';'.join(to_prune) + '\n') + else: + storePickle(seq_names_passing, seq_names_passing, True, distMat, prefix + "/" + os.path.basename(prefix) + ".dists") return seq_names_passing, distMat From 03ab3f8b05847b0ea34ba9a78e069bba78f58f25 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 09:14:50 +0000 Subject: [PATCH 238/327] Avoid overwrite on qcDistMat --- PopPUNK/__main__.py | 11 ++++++----- PopPUNK/assign.py | 4 ++-- PopPUNK/utils.py | 10 ++++++---- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index ce44f443..fa6650f7 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -314,10 +314,11 @@ def main(): # QC pairwise distances to identify long distances indicative of anomalous sequences in the collection seq_names_passing, distMat = qcDistMat(distMat, - seq_names_passing, - seq_names_passing, - args.output, - qc_dict) + seq_names_passing, + seq_names_passing, + args.output, + 
args.output, + qc_dict) # Plot results plot_scatter(distMat, @@ -369,7 +370,7 @@ def main(): # Load the distances refList, queryList, self, distMat = readPickle(distances, enforce_self=True) seq_names = set(set(refList) | set(queryList)) - seq_names_passing, distMat = qcDistMat(distMat, refList, queryList, output, qc_dict) + seq_names_passing, distMat = qcDistMat(distMat, refList, queryList, args.ref_db, output, qc_dict) if len(set(seq_names_passing).difference(seq_names)) > 0 and args.qc_filter == "stop": sys.stderr.write("Distances failed quality control (change QC options to run anyway)\n") sys.exit(1) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 6542fc7d..3f9b05ee 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -160,7 +160,7 @@ def assign_query(dbFuncs, threads = threads, use_gpu = gpu_dist) # QC distance matrix - seq_names_passing, distMat = qcDistMat(qrDistMat, rNames, qNames, output, qc_dict) + seq_names_passing, distMat = qcDistMat(qrDistMat, rNames, qNames, ref_db, output, qc_dict) # Load the network based on supplied options genomeNetwork, old_cluster_file = \ @@ -249,7 +249,7 @@ def assign_query(dbFuncs, dists_out = output + "/" + os.path.basename(output) + ".dists" if update_db: # Check new sequences pass QC before adding them - if not qcPass: + if len(set(seq_names_passing).difference(rNames + qNames)) > 0: sys.stderr.write("Queries contained outlier distances, " "not updating database\n") else: diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 8739bdd2..c48ca280 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -226,7 +226,7 @@ def listDistInts(refSeqs, querySeqs, self=True): return comparisons -def qcDistMat(distMat, refList, queryList, prefix, qc_dict): +def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): """Checks distance matrix for outliers. Args: @@ -236,8 +236,10 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): Reference labels queryList (list) Query labels (or refList if self) + ref_db (str) + Prefix of reference database prefix (str) - Prefix for output files + Prefix of output files qc_dict (dict) Dict of QC options @@ -264,7 +266,7 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): # Pick reference isolate if not supplied if qc_dict['reference_isolate'] is None: - qc_dict['reference_isolate'] = pickReferenceIsolate(prefix, seq_names_passing) + qc_dict['reference_isolate'] = pickReferenceIsolate(ref_db, seq_names_passing) sys.stderr.write('Selected reference isolate is ' + qc_dict['reference_isolate'] + '\n') # First check with numpy, which is quicker than iterating over everything @@ -290,7 +292,7 @@ def qcDistMat(distMat, refList, queryList, prefix, qc_dict): sys.exit(1) else: # Remove sketches - db_name = prefix + '/' + os.path.basename(prefix) + '.h5' + db_name = ref_db + '/' + os.path.basename(ref_db) + '.h5' filtered_db_name = prefix + '/' + 'filtered.' 
+ os.path.basename(prefix) + '.h5' removeFromDB(db_name, filtered_db_name, From ef450c6a7b67832d8c45f159b13f169dc08ec4fd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:09:29 +0000 Subject: [PATCH 239/327] Start checking reference graph connectivity --- PopPUNK/network.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 19f195c4..1872d106 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -259,10 +259,14 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Extract reference edges G_df = G.view_edge_list() G_df.columns = ['source','destination'] - G_ref_df = G_df[G_df['source'].isin(reference_names) & G_df['destination'].isin(reference_names)] + G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] # Add self-loop if needed max_in_vertex_labels = len(reference_names) - 1 - G_ref = add_self_loop(G_ref_df,max_in_vertex_labels) + G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) + + # Check on targets + reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + print("Reference component assignments: " + str(reference_component_assignments)) else: @@ -339,7 +343,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref = gt.GraphView(G, vfilt = reference_vertex) G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object - # Order found references as in mash sketch files + # Order found references as in sketch files reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) return reference_indices, reference_names, refFileName, G_ref @@ -424,7 +428,7 @@ def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, # get the weights if weights: edge_weights = list(prev_G.ep['weight']) - + # Update IDs to new versions old_id_indices = [rlist.index(x) for x in old_ids] # translate to indices @@ -861,7 +865,7 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, return G, qqDistMat -def add_self_loop(G_df, seq_num, weights = False): +def add_self_loop(G_df, seq_num, weights = False, renumber = True): """Adds self-loop to cugraph graph to ensure all nodes are included in the graph, even if singletons. 
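The self-loop device being generalised here works because the graph constructors only see vertices named in the edge list, so appending a single (v, v) edge at the highest expected label stops trailing singletons being dropped. A sketch of the same idea using pandas in place of cudf (an illustration of the logic, not the GPU code path; column names follow the patch):

    import pandas as pd

    def add_self_loop_df(G_df, max_label, weights=False):
        # append (max_label, max_label) if the edge list never reaches that label
        if max(G_df["source"].max(), G_df["destination"].max()) != max_label:
            loop = {"source": [max_label], "destination": [max_label]}
            if weights:
                loop["weight"] = [0.0]
            G_df = pd.concat([G_df, pd.DataFrame(loop)], ignore_index=True)
        return G_df

    edges = pd.DataFrame({"source": [0, 1], "destination": [1, 2]})
    edges = add_self_loop_df(edges, 4)  # vertex 4 would otherwise be missing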
@@ -870,6 +874,8 @@ def add_self_loop(G_df, seq_num, weights = False): cudf data frame containing edge list seq_num (int) The expected number of nodes in the graph + renumber (bool) + Whether to renumber the vertices when added to the graph Returns: G_new (graph) @@ -886,7 +892,7 @@ def add_self_loop(G_df, seq_num, weights = False): G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) # Construct graph G_new = cugraph.Graph() - G_new.from_cudf_edgelist(G_df) + G_new.from_cudf_edgelist(G_df, renumber = renumber) return G_new def printClusters(G, rlist, outPrefix = "_clusters.csv", oldClusterFile = None, From 08f58ff1088c813ce76407a00c04719b5f67b44e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:26:16 +0000 Subject: [PATCH 240/327] Enable qcDistMat to create output directory --- PopPUNK/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index c48ca280..41ef484a 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -264,6 +264,14 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): # Sequences to remove to_prune = [] + # Create output directory if it does not exist already + if not os.path.isdir(prefix): + try: + os.makedirs(prefix) + except OSError: + sys.stderr.write("Cannot create output directory " + prefix + "\n") + sys.exit(1) + # Pick reference isolate if not supplied if qc_dict['reference_isolate'] is None: qc_dict['reference_isolate'] = pickReferenceIsolate(ref_db, seq_names_passing) From 6094b2bcfbf4a6eba50601142df8bad9951cae77 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:33:14 +0000 Subject: [PATCH 241/327] Change vertex count error message --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1872d106..e01f8779 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -161,7 +161,7 @@ def checkNetworkVertexCount(seq_list, G, use_gpu): vertex_list = set(get_vertex_list(G, use_gpu = use_gpu)) networkMissing = set(set(range(len(seq_list))).difference(vertex_list)) if len(networkMissing) > 0: - sys.stderr.write("ERROR: Samples " + ",".join(networkMissing) + " are missing from the final network\n") + sys.stderr.write("ERROR: Samples " + ",".join(map(str,networkMissing)) + " are missing from the final network\n") sys.exit(1) def getCliqueRefs(G, reference_indices = set()): From 93c872d6acfc2134541e5c46c701570229310fe9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:41:40 +0000 Subject: [PATCH 242/327] Change column naming in cugraph --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e01f8779..52b6899b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -258,7 +258,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Extract reference edges G_df = G.view_edge_list() - G_df.columns = ['source','destination'] + if 'src' in G_df.columns: + G_df.rename(columns={'src': 'source','dst': 'destination'}) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] # Add self-loop if needed max_in_vertex_labels = len(reference_names) - 1 From 0f77ade99089feaa20449fada801943728224019 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:43:11 +0000 Subject: [PATCH 243/327] Get column names --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py 
b/PopPUNK/network.py index 52b6899b..7bcb662c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -258,6 +258,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Extract reference edges G_df = G.view_edge_list() + print('G_df original structure: ' + str(G_df)) if 'src' in G_df.columns: G_df.rename(columns={'src': 'source','dst': 'destination'}) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] From 90aee39123b485bf15df3d434079297e28c423cc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:44:37 +0000 Subject: [PATCH 244/327] Rename in place --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7bcb662c..13750745 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -260,7 +260,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df = G.view_edge_list() print('G_df original structure: ' + str(G_df)) if 'src' in G_df.columns: - G_df.rename(columns={'src': 'source','dst': 'destination'}) + G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] # Add self-loop if needed max_in_vertex_labels = len(reference_names) - 1 From b0375939c6e1c76046a6f11a6ca71aa193ece5ff Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:48:46 +0000 Subject: [PATCH 245/327] Compare cudfs --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 13750745..0d1ca25d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -258,7 +258,6 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Extract reference edges G_df = G.view_edge_list() - print('G_df original structure: ' + str(G_df)) if 'src' in G_df.columns: G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] @@ -269,6 +268,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Check on targets reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) print("Reference component assignments: " + str(reference_component_assignments)) + print("Component assignments: " + str(component_assignments)) else: From ce8fccbf3b4bf0e3dde7f92bdbcb346d0c69fc43 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:55:09 +0000 Subject: [PATCH 246/327] View reference cudf --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 0d1ca25d..c427b4fe 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -267,6 +267,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Check on targets reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + print("Reference df: " + str(G_ref_df)) print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) From fe24179a5468215de518dffd03463a2637d54245 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 10:56:42 +0000 Subject: [PATCH 247/327] View overall cudf --- PopPUNK/network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/PopPUNK/network.py b/PopPUNK/network.py index c427b4fe..d16e9d6e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -267,6 +267,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Check on targets reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + print("Reference indices: " + str(reference_indices)) + print("Overall cudf: " + str(G_df)) print("Reference df: " + str(G_ref_df)) print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) From b8d03b2aae9b8330bfb89da680d11c556d8deb25 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 11:47:28 +0000 Subject: [PATCH 248/327] Concat cudf --- PopPUNK/network.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d16e9d6e..44b1280e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -246,10 +246,12 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # For large network, use more approximate method for extracting references reference = {} + # Record the original components to which sequences belonged + component_assignments = cugraph.components.connectivity.connected_components(G) # Leiden method has resolution parameter - higher values give greater precision - component_assignments, score = cugraph.leiden(G, resolution = 1.0) + partition_assignments, score = cugraph.leiden(G, resolution = 1.0) # group by partition, which becomes the first column, so retrieve second column - reference_index_df = component_assignments.groupby('partition').nth(0) + reference_index_df = partition_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() # Order found references as in sketchlib database @@ -262,11 +264,14 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] # Add self-loop if needed - max_in_vertex_labels = len(reference_names) - 1 + max_in_vertex_labels = max(reference_indices) G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) # Check on targets reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + combined_vertex_assignments = cudf.concat([reference_component_assignments,component_assignments], + axis = 1, + join = 'inner') print("Reference indices: " + str(reference_indices)) print("Overall cudf: " + str(G_df)) print("Reference df: " + str(G_ref_df)) From a8e7e33673fdcb40bc0c692dc59a256c473b972b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 11:54:35 +0000 Subject: [PATCH 249/327] Merge cudf --- PopPUNK/network.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 44b1280e..afb539af 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -263,20 +263,22 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if 'src' in G_df.columns: G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] + G_ref_df.rename(columns={'labels': 'ref_labels'}) # Add self-loop if needed max_in_vertex_labels = 
max(reference_indices) G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) # Check on targets reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) - combined_vertex_assignments = cudf.concat([reference_component_assignments,component_assignments], - axis = 1, - join = 'inner') + combined_vertex_assignments = reference_component_assignments.merge(component_assignments, + on = 'vertex', + how = 'left') print("Reference indices: " + str(reference_indices)) print("Overall cudf: " + str(G_df)) print("Reference df: " + str(G_ref_df)) print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) + print("Combined assignments: " + str(combined_vertex_assignments)) else: From 86b8d85bbd0914c8eb3782296862d010eabdaa23 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 11:57:57 +0000 Subject: [PATCH 250/327] Filter merged cudf --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index afb539af..29cad4e5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -263,7 +263,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if 'src' in G_df.columns: G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - G_ref_df.rename(columns={'labels': 'ref_labels'}) + G_ref_df.rename(columns={'labels': 'ref_labels'}, inplace=True) # Add self-loop if needed max_in_vertex_labels = max(reference_indices) G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) @@ -273,6 +273,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u combined_vertex_assignments = reference_component_assignments.merge(component_assignments, on = 'vertex', how = 'left') + combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] print("Reference indices: " + str(reference_indices)) print("Overall cudf: " + str(G_df)) print("Reference df: " + str(G_ref_df)) From c34a51f187728294c309b1da090c09c521329f14 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 12:09:58 +0000 Subject: [PATCH 251/327] Summarise merged cudf --- PopPUNK/network.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 29cad4e5..8a78f82a 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -269,17 +269,23 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) # Check on targets - reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) - combined_vertex_assignments = reference_component_assignments.merge(component_assignments, - on = 'vertex', - how = 'left') - combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Reference indices: " + str(reference_indices)) - print("Overall cudf: " + str(G_df)) - print("Reference df: " + str(G_ref_df)) - print("Reference component assignments: " + str(reference_component_assignments)) - print("Component assignments: " + str(component_assignments)) - print("Combined assignments: " + str(combined_vertex_assignments)) + 
partition_match = False + while partition_match: + reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + combined_vertex_assignments = reference_component_assignments.merge(component_assignments, + on = 'vertex', + how = 'left') + combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] + combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels','ref_labels'])['ref_labels'].count() + max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() + print('max is ' + str(max_ref_comp_count)) + print("Reference indices: " + str(reference_indices)) + print("Overall cudf: " + str(G_df)) + print("Reference df: " + str(G_ref_df)) + print("Reference component assignments: " + str(reference_component_assignments)) + print("Component assignments: " + str(component_assignments)) + print("Combined assignments: " + str(combined_vertex_assignments)) + partition_match = True else: From b4d1fbd3e258dd682e76608a1ed0c0921a2b5b17 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 12:10:51 +0000 Subject: [PATCH 252/327] Change bool in while loop --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 8a78f82a..dcdc5095 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -269,7 +269,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) # Check on targets - partition_match = False + partition_mismatch = True while partition_match: reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) combined_vertex_assignments = reference_component_assignments.merge(component_assignments, @@ -285,7 +285,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) print("Combined assignments: " + str(combined_vertex_assignments)) - partition_match = True + partition_match = False else: From 8c93ac3f84156ed1af62dc68a8e4691351976c39 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 12:29:19 +0000 Subject: [PATCH 253/327] Rename bool variable --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index dcdc5095..9fd3c303 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -270,7 +270,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Check on targets partition_mismatch = True - while partition_match: + while partition_mismatch: reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) combined_vertex_assignments = reference_component_assignments.merge(component_assignments, on = 'vertex', From af08e6a27104f65a0c78cfcc1cfed0ab51d75b30 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:03:18 +0000 Subject: [PATCH 254/327] Change cudf tallying --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9fd3c303..98742672 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,7 +276,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') 
combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels','ref_labels'])['ref_labels'].count() + combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'])['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print('max is ' + str(max_ref_comp_count)) print("Reference indices: " + str(reference_indices)) From 3af73cabc6d519db7e6c7120625247c02cf873db Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:05:45 +0000 Subject: [PATCH 255/327] identify column names --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 98742672..7080dbaa 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,6 +276,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] + print("Combined assignments: " + str(combined_vertex_assignments)) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'])['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print('max is ' + str(max_ref_comp_count)) @@ -284,7 +285,6 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Reference df: " + str(G_ref_df)) print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) - print("Combined assignments: " + str(combined_vertex_assignments)) partition_match = False else: From 2f7d1f59fbe38823a9f944ec804eaa0c3add4bea Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:08:09 +0000 Subject: [PATCH 256/327] Rename columns --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7080dbaa..36c8938f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -263,7 +263,6 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if 'src' in G_df.columns: G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - G_ref_df.rename(columns={'labels': 'ref_labels'}, inplace=True) # Add self-loop if needed max_in_vertex_labels = max(reference_indices) G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) @@ -272,6 +271,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u partition_mismatch = True while partition_mismatch: reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + reference_component_assignments.rename(columns={'labels': 'ref_labels'}, inplace=True) combined_vertex_assignments = reference_component_assignments.merge(component_assignments, on = 'vertex', how = 'left') From 049aba55388c2d00281b9619c1f81195ef21b22d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:10:39 +0000 Subject: [PATCH 257/327] Fix loop control --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 36c8938f..c1db8519 100644 
--- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -285,7 +285,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Reference df: " + str(G_ref_df)) print("Reference component assignments: " + str(reference_component_assignments)) print("Component assignments: " + str(component_assignments)) - partition_match = False + partition_mismatch = False else: From ce670742bbd7c16c80b5980b0952d429851a0678 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:14:04 +0000 Subject: [PATCH 258/327] Remove some debug messages --- PopPUNK/network.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index c1db8519..487847ce 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,15 +276,15 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Combined assignments: " + str(combined_vertex_assignments)) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'])['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() + print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) - print("Reference indices: " + str(reference_indices)) - print("Overall cudf: " + str(G_df)) - print("Reference df: " + str(G_ref_df)) - print("Reference component assignments: " + str(reference_component_assignments)) - print("Component assignments: " + str(component_assignments)) +# print("Reference indices: " + str(reference_indices)) +# print("Overall cudf: " + str(G_df)) +# print("Reference df: " + str(G_ref_df)) +# print("Reference component assignments: " + str(reference_component_assignments)) +# print("Component assignments: " + str(component_assignments)) partition_mismatch = False else: From a80f20686622742f0a96a4da9b8fc364ebcf6461 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:24:16 +0000 Subject: [PATCH 259/327] Print counting information --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 487847ce..0fd33afc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,7 +276,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'])['ref_labels'].nunique() + print("Counting: " + str(combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'])) + combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) From ab52160c654d0780d6857938da88d50153ceb5de Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:25:07 +0000 Subject: [PATCH 260/327] Print counting information --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/PopPUNK/network.py b/PopPUNK/network.py index 0fd33afc..bfed5cb3 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,7 +276,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Counting: " + str(combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'])) + print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'])) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print("Combined assignments: " + str(combined_vertex_assignments)) From 490a227b0f267edf1d1666db6976d222294b1b89 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:26:13 +0000 Subject: [PATCH 261/327] Print as list --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bfed5cb3..6f20f5b4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,7 +276,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'])) + print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].to_arrow().to_pylist())) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print("Combined assignments: " + str(combined_vertex_assignments)) From 7683bf9e7a9b6cf2cd1f3f9b4d6dd7acb1d0e22c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:27:16 +0000 Subject: [PATCH 262/327] Print as unique list --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6f20f5b4..e1bb9fd8 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -276,7 +276,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].to_arrow().to_pylist())) + print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().to_arrow().to_pylist())) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() print("Combined assignments: " + str(combined_vertex_assignments)) From 659ccff76da1a0fc5018c311568fb9796cdf6734 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 13:31:24 +0000 Subject: [PATCH 263/327] Find overall max --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e1bb9fd8..653d3944 100644 
--- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -278,7 +278,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().to_arrow().to_pylist())) combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() - max_ref_comp_count = combined_vertex_assignments['ref_comp_count'].max() + max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) # print("Reference indices: " + str(reference_indices)) From 80bbd67dbdbd47ef36f7b51e4197198f9c5ab8b4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:05:34 +0000 Subject: [PATCH 264/327] Test reference connectivity --- PopPUNK/network.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 653d3944..4284082d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -249,7 +249,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Record the original components to which sequences belonged component_assignments = cugraph.components.connectivity.connected_components(G) # Leiden method has resolution parameter - higher values give greater precision - partition_assignments, score = cugraph.leiden(G, resolution = 1.0) + partition_assignments, score = cugraph.leiden(G, resolution = 10.0) # group by partition, which becomes the first column, so retrieve second column reference_index_df = partition_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() @@ -277,8 +277,19 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().to_arrow().to_pylist())) - combined_vertex_assignments['ref_comp_count'] = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique() max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() + if max_ref_comp_count == 1: + partition_mismatch = False + else: + for component, component_df in combined_vertex_assignments.groupby(): + print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique())) + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique() > 1: + G_component_df = G_df[G_df['labels'] == component] + print("Component info: " + str(G_component_df)) + G_component = cugraph.Graph() + G_component.from_cudf_edgelist(G_component_df) + traversal = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) + print("Traversal: " + str(traversal)) print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) # print("Reference indices: " + str(reference_indices)) From 51e442e0034fc61b18162e4c356d6339769503f3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:06:46 +0000 Subject: [PATCH 
265/327] use debug mode --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 4284082d..bd29242f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -278,7 +278,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().to_arrow().to_pylist())) max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() - if max_ref_comp_count == 1: + if max_ref_comp_count == 0: partition_mismatch = False else: for component, component_df in combined_vertex_assignments.groupby(): From 0e69d8c5ee360cb0e188e3c9739ceb9d23fa8a0b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:11:01 +0000 Subject: [PATCH 266/327] Change group by variable --- PopPUNK/network.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bd29242f..16898aef 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -281,15 +281,15 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if max_ref_comp_count == 0: partition_mismatch = False else: - for component, component_df in combined_vertex_assignments.groupby(): + for component, component_df in combined_vertex_assignments.groupby([labels], sort = False): print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique())) if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique() > 1: G_component_df = G_df[G_df['labels'] == component] print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) - traversal = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) - print("Traversal: " + str(traversal)) + distances, predecessors = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) + print("Traversal: " + str(predecessors)) print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) # print("Reference indices: " + str(reference_indices)) From c98179254cba1fd973af8caa87efbdff5b19b7b5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:38:13 +0000 Subject: [PATCH 267/327] Correct group variable selection --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 16898aef..5d8a7a83 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -281,7 +281,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u if max_ref_comp_count == 0: partition_mismatch = False else: - for component, component_df in combined_vertex_assignments.groupby([labels], sort = False): + for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique())) if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique() > 1: G_component_df = G_df[G_df['labels'] == component] From 906b534fb9be2451d73f868a97760b24410588de Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:42:58 +0000 Subject: [PATCH 268/327] Add extra debug print statement --- 
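The groupby corrections above, and the .iloc[0] extraction added in the next hunk, deal with a pandas/cuDF pitfall: a grouped nunique() returns a Series even when only one group is present, and a Series has no unambiguous truth value. A small pandas illustration (cuDF raises the same way):

    import pandas as pd

    component_df = pd.DataFrame({'labels': [0, 0, 0], 'ref_labels': [0, 1, 1]})
    counts = component_df.groupby(['labels'], sort=False)['ref_labels'].nunique()
    # if counts > 1:  ->  ValueError: truth value of a Series is ambiguous
    if counts.iloc[0] > 1:  # extract the scalar before comparing
        print('component maps to multiple reference components')
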
PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5d8a7a83..d7ccbda9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -282,8 +282,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u partition_mismatch = False else: for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): - print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique())) - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique() > 1: + print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0])) + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: G_component_df = G_df[G_df['labels'] == component] print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() From ee510c434cb7922200ae9bdaeef980152817b01b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:45:38 +0000 Subject: [PATCH 269/327] Change ref selection --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d7ccbda9..85af62cd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -249,7 +249,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Record the original components to which sequences belonged component_assignments = cugraph.components.connectivity.connected_components(G) # Leiden method has resolution parameter - higher values give greater precision - partition_assignments, score = cugraph.leiden(G, resolution = 10.0) + partition_assignments, score = cugraph.leiden(G, resolution = 0.1) # group by partition, which becomes the first column, so retrieve second column reference_index_df = partition_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() From 2b981e2423b894c9d5727768237385cb5de1a943 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:47:08 +0000 Subject: [PATCH 270/327] Extend debug mode --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 85af62cd..93966156 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -283,7 +283,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u else: for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0])) - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 0: G_component_df = G_df[G_df['labels'] == component] print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() From 23e6b6023085d50675f1bba27091c71d14495241 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:50:54 +0000 Subject: [PATCH 271/327] Further debug --- PopPUNK/network.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 93966156..198706aa 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -284,7 +284,9 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u for component, component_df in 
combined_vertex_assignments.groupby(['labels'], sort = False): print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0])) if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 0: - G_component_df = G_df[G_df['labels'] == component] + vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] + print("Vertices in components: " + str(vertices_in_component)) + G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) From 2151ec8cd99df30a813c8c8c2bb51c9b46490494 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 14:52:23 +0000 Subject: [PATCH 272/327] Changes to debug message --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 198706aa..9e5644f5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -290,8 +290,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) - distances, predecessors = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) - print("Traversal: " + str(predecessors)) + traversal = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) + print("Traversal: " + str(traversal)) print("Combined assignments: " + str(combined_vertex_assignments)) print('max is ' + str(max_ref_comp_count)) # print("Reference indices: " + str(reference_indices)) From a57a1b8ad55d586df6e28eb84e192434c83e7025 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:29:21 +0000 Subject: [PATCH 273/327] Update reference indices --- PopPUNK/network.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9e5644f5..407c2a09 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -290,9 +290,17 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) - traversal = cugraph.traversal.sssp(G_component,source = component_df['vertex'][0]) + traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) print("Traversal: " + str(traversal)) - print("Combined assignments: " + str(combined_vertex_assignments)) + reference_index_set = set(reference_indices) + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor']) + while len(predecessors) > 0 and len(reference_index_set.difference(predecessors)) > 0: + reference_index_set = reference_index_set.union(predecessors) + predecessors = set() + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor']) + print("Predecessors: " + str(predecessors)) + reference_indices = list(reference_index_set) + print("Final references: " + str(reference_indices)) print('max is ' + str(max_ref_comp_count)) # print("Reference indices: " + str(reference_indices)) # print("Overall cudf: " + str(G_df)) @@ -376,9 +384,9 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref 
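The change above reflects that cugraph.traversal.sssp returns a single DataFrame with 'vertex', 'distance' and 'predecessor' columns, not a (distances, predecessors) pair. A host-side stand-in with the same shape, useful for reasoning about the filtering steps that follow (values are illustrative):

    import pandas as pd

    traversal = pd.DataFrame({'vertex': [0, 1, 2, 3],
                              'distance': [0.0, 1.0, 1.0, 2.0],
                              'predecessor': [-1, 0, 0, 1]})
    # Predecessors of a chosen vertex set, dropping the source's -1 marker
    refs = [0, 3]
    preds = traversal[traversal['vertex'].isin(refs)]['predecessor']
    print(set(preds[preds >= 0]))  # {1}
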
= gt.GraphView(G, vfilt = reference_vertex) G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object - # Order found references as in sketch files - reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] - refFileName = writeReferences(reference_names, outPrefix) + # Order found references as in sketch files + reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] + refFileName = writeReferences(reference_names, outPrefix) return reference_indices, reference_names, refFileName, G_ref def writeReferences(refList, outPrefix): From 920a66be9c94492c693e98ebcc17701ecc29f487 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:35:57 +0000 Subject: [PATCH 274/327] Change series to set conversion --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 407c2a09..b48961a9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -293,11 +293,11 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor']) + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) while len(predecessors) > 0 and len(reference_index_set.difference(predecessors)) > 0: reference_index_set = reference_index_set.union(predecessors) predecessors = set() - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor']) + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) print("Predecessors: " + str(predecessors)) reference_indices = list(reference_index_set) print("Final references: " + str(reference_indices)) From 8c9cf4dc2d19f7375de6cc50588783afab59e704 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:40:51 +0000 Subject: [PATCH 275/327] Change set processing --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b48961a9..3626398c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -294,7 +294,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) - while len(predecessors) > 0 and len(reference_index_set.difference(predecessors)) > 0: + while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) predecessors = set() predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) From 4f646b3012830a2b4fefce4b5f1d1222dd5e6a5f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:51:51 +0000 Subject: [PATCH 276/327] Change filtering conditions --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py 
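The while loop refined across these patches expands the reference set along shortest-path-tree predecessors until every reference's path back to the traversal source runs entirely through references. The intended fixed point, sketched with plain Python dictionaries and sets:

    predecessor = {0: -1, 1: 0, 2: 1, 3: 1}  # vertex -> SSSP predecessor
    reference_index_set = {0, 3}

    predecessors = {predecessor[v] for v in reference_index_set if predecessor[v] >= 0}
    while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0:
        reference_index_set = reference_index_set.union(predecessors)
        predecessors = {predecessor[v] for v in reference_index_set if predecessor[v] >= 0}
    print(sorted(reference_index_set))  # [0, 1, 3]: vertex 1 now joins 3 to 0
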
b/PopPUNK/network.py index 3626398c..bbe959d5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -293,11 +293,11 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) predecessors = set() - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] >= 0]['predecessor'].to_arrow().to_pylist()) + predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) print("Predecessors: " + str(predecessors)) reference_indices = list(reference_index_set) print("Final references: " + str(reference_indices)) From 4669728291d3632d9026355f9b0bd48c7b85a618 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:54:36 +0000 Subject: [PATCH 277/327] Change definition of reference set --- PopPUNK/network.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bbe959d5..ae89f62f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -293,11 +293,13 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) + #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].to_arrow().to_pylist() + predecessors = set(predecessor_list[predecessor_list >= 0]) while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) - predecessors = set() - predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].to_arrow().to_pylist() + predecessors = set(predecessor_list[predecessor_list >= 0]) print("Predecessors: " + str(predecessors)) reference_indices = list(reference_index_set) print("Final references: " + str(reference_indices)) From 25b8cccf9f5a2d1e20902b2fbcee130ae8d023c9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 15:57:07 +0000 Subject: [PATCH 278/327] Change definition of reference set --- PopPUNK/network.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ae89f62f..948fe07e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -294,8 +294,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 
1, u print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) - predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].to_arrow().to_pylist() - predecessors = set(predecessor_list[predecessor_list >= 0]) + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'] + print("pred list: " + str(predecessor_list)) + predecessors = set(predecessor_list[predecessor_list >= 0].to_arrow().to_pylist()) +# predecessors = set(predecessor_list[predecessor_list >= 0]) while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].to_arrow().to_pylist() From 294f3fad8902c86d349ebf3baccca10bae3eed1e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:00:46 +0000 Subject: [PATCH 279/327] Change definition of reference set --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 948fe07e..28939b6b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -294,6 +294,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) + print("Ref indices: " + str(reference_indices)) predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'] print("pred list: " + str(predecessor_list)) predecessors = set(predecessor_list[predecessor_list >= 0].to_arrow().to_pylist()) From ba946b1a55b46acdf33d87ae83c721b62461b8fb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:07:45 +0000 Subject: [PATCH 280/327] Debug series filtering --- PopPUNK/network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 28939b6b..9c632f4b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -296,6 +296,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) print("Ref indices: " + str(reference_indices)) predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'] + print("Raw traversal: " + str(traversal[traversal['vertex'].isin(reference_indices)])) print("pred list: " + str(predecessor_list)) predecessors = set(predecessor_list[predecessor_list >= 0].to_arrow().to_pylist()) # predecessors = set(predecessor_list[predecessor_list >= 0]) From cebdf3aad32f78b1786c6bd704dcc3b61f8e6745 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:09:50 +0000 Subject: [PATCH 281/327] Extract series values --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9c632f4b..3322fd19 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -295,7 +295,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u reference_index_set = set(reference_indices) #predecessors = 
set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) print("Ref indices: " + str(reference_indices)) - predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'] + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values() print("Raw traversal: " + str(traversal[traversal['vertex'].isin(reference_indices)])) print("pred list: " + str(predecessor_list)) predecessors = set(predecessor_list[predecessor_list >= 0].to_arrow().to_pylist()) From f86e814e257e6d5a69e3ad9db9397c22c823f899 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:14:14 +0000 Subject: [PATCH 282/327] Change extraction of values from series --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3322fd19..9512d515 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -295,10 +295,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u reference_index_set = set(reference_indices) #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) print("Ref indices: " + str(reference_indices)) - predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values() + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values print("Raw traversal: " + str(traversal[traversal['vertex'].isin(reference_indices)])) print("pred list: " + str(predecessor_list)) - predecessors = set(predecessor_list[predecessor_list >= 0].to_arrow().to_pylist()) + predecessors = set(predecessor_list[predecessor_list >= 0]) # predecessors = set(predecessor_list[predecessor_list >= 0]) while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) From 26bbc0a2bf7d0e505d0fae0a8339ab8125741ade Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:35:53 +0000 Subject: [PATCH 283/327] Comment code for impending review --- PopPUNK/network.py | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9512d515..46c80c3c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -267,52 +267,48 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u max_in_vertex_labels = max(reference_indices) G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) - # Check on targets + # Check references in same component in overall graph are connected in the reference graph partition_mismatch = True while partition_mismatch: + # Get components of original reference graph reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) reference_component_assignments.rename(columns={'labels': 'ref_labels'}, inplace=True) + # Merge with component assignments from overall graph combined_vertex_assignments = reference_component_assignments.merge(component_assignments, on = 'vertex', how = 'left') combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - print("Counting: " + str(combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().to_arrow().to_pylist())) + # Find the number of components in the 
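These two patches settle on .values: on a cuDF Series it is an attribute (not a method) yielding a device array that accepts a boolean mask, whereas the earlier .to_arrow().to_pylist() produced a plain Python list, which rejects that mask. The same distinction shown on the host with NumPy:

    import numpy as np

    predecessor_list = np.array([-1, 0, 1, -1])
    predecessors = set(predecessor_list[predecessor_list >= 0])
    print(predecessors)  # {0, 1}

    # A plain list fails with the same indexing:
    # [-1, 0, 1, -1][predecessor_list >= 0]  ->  TypeError
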
reference graph associated with each component in the overall graph - + # should be one if there is a one-to-one mapping of components - else links need to be added max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() - if max_ref_comp_count == 0: + if max_ref_comp_count == 1: partition_mismatch = False else: + # Iterate through components for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): - print("Nunique!: " + str(component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0])) - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 0: + # Find components in the overall graph matching multiple components in the reference graph + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: + # Make a graph of the component from the overall graph vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] - print("Vertices in components: " + str(vertices_in_component)) G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] - print("Component info: " + str(G_component_df)) G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) + # Find single shortest path from a reference + # Should check first will always be a reference traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) - print("Traversal: " + str(traversal)) reference_index_set = set(reference_indices) - #predecessors = set(traversal[traversal['vertex'].isin(reference_indices) & traversal['predecessor'] != -1]['predecessor'].to_arrow().to_pylist()) - print("Ref indices: " + str(reference_indices)) + # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - print("Raw traversal: " + str(traversal[traversal['vertex'].isin(reference_indices)])) - print("pred list: " + str(predecessor_list)) predecessors = set(predecessor_list[predecessor_list >= 0]) -# predecessors = set(predecessor_list[predecessor_list >= 0]) + # Add predecessors to reference set and check whether this results in complete paths + # where complete paths are indicated by references' predecessors being within the set of + # references while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) - predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].to_arrow().to_pylist() + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values predecessors = set(predecessor_list[predecessor_list >= 0]) - print("Predecessors: " + str(predecessors)) + # Add expanded reference set to the overall list reference_indices = list(reference_index_set) - print("Final references: " + str(reference_indices)) - print('max is ' + str(max_ref_comp_count)) -# print("Reference indices: " + str(reference_indices)) -# print("Overall cudf: " + str(G_df)) -# print("Reference df: " + str(G_ref_df)) -# print("Reference component assignments: " + str(reference_component_assignments)) -# print("Component assignments: " + str(component_assignments)) partition_mismatch = False else: From a9d8fb0f4c121a7091a7f2c5c441bc415fe35c5e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:37:08 +0000 Subject: [PATCH 284/327] Remove 
unnecessary loop --- PopPUNK/network.py | 77 ++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 46c80c3c..fa20c0dd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -268,48 +268,45 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) # Check references in same component in overall graph are connected in the reference graph - partition_mismatch = True - while partition_mismatch: - # Get components of original reference graph - reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) - reference_component_assignments.rename(columns={'labels': 'ref_labels'}, inplace=True) - # Merge with component assignments from overall graph - combined_vertex_assignments = reference_component_assignments.merge(component_assignments, - on = 'vertex', - how = 'left') - combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] - # Find the number of components in the reference graph associated with each component in the overall graph - - # should be one if there is a one-to-one mapping of components - else links need to be added - max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() - if max_ref_comp_count == 1: - partition_mismatch = False - else: - # Iterate through components - for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): - # Find components in the overall graph matching multiple components in the reference graph - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: - # Make a graph of the component from the overall graph - vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] - G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] - G_component = cugraph.Graph() - G_component.from_cudf_edgelist(G_component_df) - # Find single shortest path from a reference - # Should check first will always be a reference - traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) - reference_index_set = set(reference_indices) - # Add predecessors to reference sequences on the SSSPs + # First get components of original reference graph + reference_component_assignments = cugraph.components.connectivity.connected_components(G_ref) + reference_component_assignments.rename(columns={'labels': 'ref_labels'}, inplace=True) + # Merge with component assignments from overall graph + combined_vertex_assignments = reference_component_assignments.merge(component_assignments, + on = 'vertex', + how = 'left') + combined_vertex_assignments = combined_vertex_assignments[combined_vertex_assignments['vertex'].isin(reference_indices)] + # Find the number of components in the reference graph associated with each component in the overall graph - + # should be one if there is a one-to-one mapping of components - else links need to be added + max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() + if max_ref_comp_count == 1: + partition_mismatch = False + else: + # Iterate through components + for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): + # Find 
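The block being reorganised here repeatedly builds a subgraph for one component by keeping only edges whose endpoints both lie in that component. A pandas sketch of the edge filter (cuDF's isin works identically); edges are illustrative:

    import pandas as pd

    G_df = pd.DataFrame({'source': [0, 1, 5, 6], 'destination': [1, 2, 6, 7]})
    vertices_in_component = pd.Series([0, 1, 2])
    G_component_df = G_df[G_df['source'].isin(vertices_in_component)
                          & G_df['destination'].isin(vertices_in_component)]
    print(G_component_df)  # keeps edges (0, 1) and (1, 2) only
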
components in the overall graph matching multiple components in the reference graph + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: + # Make a graph of the component from the overall graph + vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] + G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] + G_component = cugraph.Graph() + G_component.from_cudf_edgelist(G_component_df) + # Find single shortest path from a reference + # Should check first will always be a reference + traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) + reference_index_set = set(reference_indices) + # Add predecessors to reference sequences on the SSSPs + predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values + predecessors = set(predecessor_list[predecessor_list >= 0]) + # Add predecessors to reference set and check whether this results in complete paths + # where complete paths are indicated by references' predecessors being within the set of + # references + while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: + reference_index_set = reference_index_set.union(predecessors) predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values predecessors = set(predecessor_list[predecessor_list >= 0]) - # Add predecessors to reference set and check whether this results in complete paths - # where complete paths are indicated by references' predecessors being within the set of - # references - while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: - reference_index_set = reference_index_set.union(predecessors) - predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - predecessors = set(predecessor_list[predecessor_list >= 0]) - # Add expanded reference set to the overall list - reference_indices = list(reference_index_set) - partition_mismatch = False + # Add expanded reference set to the overall list + reference_indices = list(reference_index_set) else: From 92d0f0c38f17a11722c022965cd23e9bfa64b27b Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:43:45 +0000 Subject: [PATCH 285/327] Change vertex selection for SSSP --- PopPUNK/network.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index fa20c0dd..b4e569ea 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -280,20 +280,18 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # should be one if there is a one-to-one mapping of components - else links need to be added max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() if max_ref_comp_count == 1: - partition_mismatch = False - else: # Iterate through components for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): # Find components in the overall graph matching multiple components in the reference graph - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0]== 1: # Make a graph of the component from the overall graph vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] + references_in_component 
= vertices_in_component[vertices_in_component.isin(reference_indices)].values G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) # Find single shortest path from a reference - # Should check first will always be a reference - traversal = cugraph.traversal.sssp(G_component,source = vertices_in_component.iloc[0]) + traversal = cugraph.traversal.sssp(G_component,source = references_in_component[0]) reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values From 01bbe775589d6f2e7d2a2298e24b44e656c97a03 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:46:16 +0000 Subject: [PATCH 286/327] Reconstruct reference graph where necessary --- PopPUNK/network.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b4e569ea..5af05e35 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -279,11 +279,11 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # Find the number of components in the reference graph associated with each component in the overall graph - # should be one if there is a one-to-one mapping of components - else links need to be added max_ref_comp_count = combined_vertex_assignments.groupby(['labels'], sort = False)['ref_labels'].nunique().max() - if max_ref_comp_count == 1: + if max_ref_comp_count > 1: # Iterate through components for component, component_df in combined_vertex_assignments.groupby(['labels'], sort = False): # Find components in the overall graph matching multiple components in the reference graph - if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0]== 1: + if component_df.groupby(['labels'], sort = False)['ref_labels'].nunique().iloc[0] > 1: # Make a graph of the component from the overall graph vertices_in_component = component_assignments[component_assignments['labels']==component]['vertex'] references_in_component = vertices_in_component[vertices_in_component.isin(reference_indices)].values @@ -305,7 +305,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u predecessors = set(predecessor_list[predecessor_list >= 0]) # Add expanded reference set to the overall list reference_indices = list(reference_index_set) - + # Create new reference graph + G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] + G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) + else: # Each component is independent, so can be multithreaded From ad66e9cab61f4f970ea9cb5b0648a8b115721781 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:51:00 +0000 Subject: [PATCH 287/327] Debug for missing nodes --- PopPUNK/network.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5af05e35..5e2688f5 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -290,7 +290,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_component_df = G_df[G_df['source'].isin(vertices_in_component) & G_df['destination'].isin(vertices_in_component)] G_component = cugraph.Graph() G_component.from_cudf_edgelist(G_component_df) - # Find single shortest path from a reference + # 
Find single shortest path from a reference to all other nodes in the component traversal = cugraph.traversal.sssp(G_component,source = references_in_component[0]) reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs @@ -1239,6 +1239,9 @@ def get_vertex_list(G, use_gpu = False): if use_gpu: vlist = G.nodes().to_array().tolist() + print("Nodes: " + str(G.nodes())) + print("Array: " + str(G.nodes().to_array())) + print("List: " + str(G.nodes().to_array().tolist())) else: vlist = list(G.vertices()) From 7d49d8647a1653c7e5904c2979f8280e85f19d55 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 16:55:30 +0000 Subject: [PATCH 288/327] Remove debug message --- PopPUNK/network.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 5e2688f5..835b67a4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1239,9 +1239,6 @@ def get_vertex_list(G, use_gpu = False): if use_gpu: vlist = G.nodes().to_array().tolist() - print("Nodes: " + str(G.nodes())) - print("Array: " + str(G.nodes().to_array())) - print("List: " + str(G.nodes().to_array().tolist())) else: vlist = list(G.vertices()) From 512c1272c1f19ea59bf59fd7b53549b55ca4acf9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 17:09:13 +0000 Subject: [PATCH 289/327] Add missing nodes with cugraph --- PopPUNK/network.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 835b67a4..8d2f7703 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -614,21 +614,23 @@ def constructNetwork(rlist, qlist, assignments, within_label, # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(vertex_labels)-1 - if max_in_df.item() != max_in_vertex_labels: - G_self_loop = cudf.DataFrame() - G_self_loop['source'] = [max_in_vertex_labels] - G_self_loop['destination'] = [max_in_vertex_labels] - if weights is not None or sparse_input is not None: - G_self_loop['weights'] = [0.0] - G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) - new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) - - # construct graph - G = cugraph.Graph() - if weights is not None or sparse_input is not None: - G.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) - else: - G.from_cudf_edgelist(G_df, renumber=False) + G = add_self_loop(G_df, max_in_vertex_labels, weights = (if weights is not None), renumber = False) +# +# if max_in_df.item() != max_in_vertex_labels: +# G_self_loop = cudf.DataFrame() +# G_self_loop['source'] = [max_in_vertex_labels] +# G_self_loop['destination'] = [max_in_vertex_labels] +# if weights is not None or sparse_input is not None: +# G_self_loop['weights'] = [0.0] +# G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) +# new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) +# +# # construct graph +# G = cugraph.Graph() +# if weights is not None or sparse_input is not None: +# G.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) +# else: +# G.from_cudf_edgelist(G_df, renumber=False) else: From c504f964436248b9aa20aa6ecfed2f52e4522f2f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 17:10:47 +0000 Subject: [PATCH 290/327] Add missing nodes with cugraph --- PopPUNK/network.py | 9 ++++++--- 1 file changed, 6 
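The add_self_loop consolidation in these patches works around edge-list graphs only registering vertices that appear in at least one edge (see rapidsai/cugraph#1206): vertex 0 and the highest expected index are pinned with zero-weight self-loops so the vertex range is complete. A host-side sketch with pandas; pad_edge_list is a hypothetical stand-in for the helper:

    import pandas as pd

    def pad_edge_list(G_df, seq_num, weights=False):
        # Pin vertices 0 and seq_num so the graph spans indices 0..seq_num
        lo = min(G_df['source'].min(), G_df['destination'].min())
        hi = max(G_df['source'].max(), G_df['destination'].max())
        loops = ([0] if lo > 0 else []) + ([seq_num] if hi != seq_num else [])
        for v in loops:
            row = {'source': [v], 'destination': [v]}
            if weights:
                row['weights'] = [0.0]
            G_df = pd.concat([G_df, pd.DataFrame(row)], ignore_index=True)
        return G_df

    print(pad_edge_list(pd.DataFrame({'source': [1], 'destination': [2]}), seq_num=4))
    # adds rows (0, 0) and (4, 4) alongside the original edge
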
insertions(+), 3 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 8d2f7703..366f8f25 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -614,8 +614,11 @@ def constructNetwork(rlist, qlist, assignments, within_label, # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(vertex_labels)-1 - G = add_self_loop(G_df, max_in_vertex_labels, weights = (if weights is not None), renumber = False) -# + use_weights = False + if weights is not None: + use_weights = True + G = add_self_loop(G_df, max_in_vertex_labels, weights = use_weights, renumber = False) +# # if max_in_df.item() != max_in_vertex_labels: # G_self_loop = cudf.DataFrame() # G_self_loop['source'] = [max_in_vertex_labels] @@ -624,7 +627,7 @@ def constructNetwork(rlist, qlist, assignments, within_label, # G_self_loop['weights'] = [0.0] # G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) # new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) -# +# # # construct graph # G = cugraph.Graph() # if weights is not None or sparse_input is not None: From 8b5ed909d888debeda0d882d5d7113d5ff1ca049 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 17:14:10 +0000 Subject: [PATCH 291/327] Add missing nodes with cugraph --- PopPUNK/network.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 366f8f25..512af9b1 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -618,22 +618,6 @@ def constructNetwork(rlist, qlist, assignments, within_label, if weights is not None: use_weights = True G = add_self_loop(G_df, max_in_vertex_labels, weights = use_weights, renumber = False) -# -# if max_in_df.item() != max_in_vertex_labels: -# G_self_loop = cudf.DataFrame() -# G_self_loop['source'] = [max_in_vertex_labels] -# G_self_loop['destination'] = [max_in_vertex_labels] -# if weights is not None or sparse_input is not None: -# G_self_loop['weights'] = [0.0] -# G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) -# new_max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) -# -# # construct graph -# G = cugraph.Graph() -# if weights is not None or sparse_input is not None: -# G.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) -# else: -# G.from_cudf_edgelist(G_df, renumber=False) else: @@ -928,6 +912,14 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present + = np.amin([G_df['source'].max(),G_df['destination'].max()]) + if min_in_df.item() > 0: + G_self_loop = cudf.DataFrame() + G_self_loop['source'] = [0] + G_self_loop['destination'] = [0] + if weights: + G_self_loop['weight'] = 0.0 + G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) if max_in_df.item() != seq_num: G_self_loop = cudf.DataFrame() From 624bce8376599b6f1c48f75626f6ab7779739ce6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 17:14:47 +0000 Subject: [PATCH 292/327] Add missing nodes with cugraph --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 512af9b1..b1187b51 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -912,7 +912,7 @@ def 
add_self_loop(G_df, seq_num, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present - = np.amin([G_df['source'].max(),G_df['destination'].max()]) + min_in_df = np.amin([G_df['source'].max(),G_df['destination'].max()]) if min_in_df.item() > 0: G_self_loop = cudf.DataFrame() G_self_loop['source'] = [0] From f032f26218686e75549f2f64b091263245d2cd8a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 21:10:32 +0000 Subject: [PATCH 293/327] Change cugraph node count retrieval --- PopPUNK/network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b1187b51..6f871845 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -912,13 +912,13 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present - min_in_df = np.amin([G_df['source'].max(),G_df['destination'].max()]) + min_in_df = np.amin([G_df['source'].min(),G_df['destination'].min()]) if min_in_df.item() > 0: G_self_loop = cudf.DataFrame() G_self_loop['source'] = [0] G_self_loop['destination'] = [0] if weights: - G_self_loop['weight'] = 0.0 + G_self_loop['weights'] = 0.0 G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) if max_in_df.item() != seq_num: @@ -926,7 +926,7 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True): G_self_loop['source'] = [seq_num] G_self_loop['destination'] = [seq_num] if weights: - G_self_loop['weight'] = 0.0 + G_self_loop['weights'] = 0.0 G_df = cudf.concat([G_df,G_self_loop], ignore_index = True) # Construct graph G_new = cugraph.Graph() @@ -1235,7 +1235,7 @@ def get_vertex_list(G, use_gpu = False): """ if use_gpu: - vlist = G.nodes().to_array().tolist() + vlist = range(G.number_of_vertices().item()) else: vlist = list(G.vertices()) From 5dc2b0eca2a37e47c3a3f395e58cedb9460b6e64 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 21:11:45 +0000 Subject: [PATCH 294/327] Change int format --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6f871845..f888eafc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -1235,7 +1235,7 @@ def get_vertex_list(G, use_gpu = False): """ if use_gpu: - vlist = range(G.number_of_vertices().item()) + vlist = range(G.number_of_vertices()) else: vlist = list(G.vertices()) From 7602dd31d7dbe11c4d04a4217a2271325fb6a505 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 21:40:34 +0000 Subject: [PATCH 295/327] Change save function definition --- PopPUNK/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 502575ff..51218083 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -39,7 +39,6 @@ import cudf gpu_lib = True except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False import pp_sketchlib @@ -266,7 +265,7 @@ def copy(self, prefix): """Copy the model to a new directory """ self.outPrefix = prefix - save() + self.save() class BGMMFit(ClusterFit): From bdb01410185a2f089abdf075b3c5b72d64dfa9fd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 21:42:36 +0000 Subject: [PATCH 296/327] Change GPU score calculation --- PopPUNK/network.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index f888eafc..64d61c95 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -681,9 +681,9 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): components = len(component_nums) density = G.number_of_edges()/(0.5 * G.number_of_vertices() * G.number_of_vertices() - 1) triangle_count = cugraph.community.triangle_count.triangles(G) - degree_df = G.degree() + degree_df = G.in_degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) - transitivity = triangle_count/triad_count + transitivity = 2*triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) From 4b78c1e19af42ad4565e051aaf9bcb45a30f90a8 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 17 Mar 2021 22:06:26 +0000 Subject: [PATCH 297/327] Update cytoscape viz test --- test/run_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/run_test.py b/test/run_test.py index a72b450d..61cd3706 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -67,7 +67,7 @@ # viz sys.stderr.write("Running visualisations (poppunk_visualise)\n") subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --microreact", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape --network-file example_db/example_db_graph.gt", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --phandango", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree", shell=True, check=True) subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt", shell=True, check=True) From 35abac00853e50b00657ff50e78d3b2bd644a5fb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Mar 2021 11:32:51 +0000 Subject: [PATCH 298/327] Changes to messages and function arguments --- PopPUNK/models.py | 3 +-- PopPUNK/network.py | 1 - PopPUNK/refine.py | 1 - PopPUNK/sparse_mst.py | 3 +-- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 51218083..46be4219 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1009,8 +1009,7 @@ def fit(self, X, accessory): pp_sketchlib.sparsifyDists( pp_sketchlib.longToSquare(X[:, [self.dist_col]], self.threads), 0, - rank, - self.threads + rank ) data = [epsilon if d < epsilon else d for d in data] self.nn_dists[rank] = coo_matrix((data, (row, col)), diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 64d61c95..666bdd6f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -28,7 +28,6 @@ import cudf gpu_lib = True except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False from .__main__ import accepted_weights_types diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 2095e82e..f72c6900 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -30,7 +30,6 @@ import cudf gpu_lib = True except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False from .network import 
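The revised network score above computes transitivity as 2 * triangles / triads, where triads are counted as ordered two-paths, the sum of d * (d - 1) over vertex degrees. Assuming the triangle count sums per-vertex memberships (three per triangle, as networkx's triangles dictionary does), this agrees with the usual 3 * triangles / connected-triples definition; a worked check on a complete graph of three vertices:

    degrees = [2, 2, 2]              # K3: every vertex has degree 2
    triangle_count = 3               # one triangle, counted once per vertex
    triad_count = sum(d * (d - 1) for d in degrees)  # 6 ordered two-paths
    print(2 * triangle_count / triad_count)          # 1.0, as expected for K3
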
constructNetwork diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index 24fc2bfa..68f8e321 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -18,13 +18,12 @@ import cudf gpu_lib = True except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False # import poppunk package from .__init__ import __version__ -from .network import constructNetwork, generate_minimum_spanning_tree, load_previous_network +from .network import constructNetwork, generate_minimum_spanning_tree, network_to_edges from .plot import drawMST from .trees import mst_to_phylogeny, write_tree from .utils import setGtThreads, readIsolateTypeFromCsv From 6a7e806e7f8d3dcb2d7c746cd32d383249373e2e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 18 Mar 2021 11:48:47 +0000 Subject: [PATCH 299/327] Disambiguation of term 'reference' --- PopPUNK/__main__.py | 4 ++-- PopPUNK/assign.py | 8 ++++---- PopPUNK/sketchlib.py | 12 ++++++------ PopPUNK/utils.py | 16 ++++++++-------- PopPUNK/web.py | 2 +- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index fa6650f7..592102c1 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -96,7 +96,7 @@ def get_options(): default = 0.5, type = float) qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', default = 0.5, type = float) - qcGroup.add_argument('--reference-isolate', help='Isolate from which distances will be calculated for pruning [default = None]', + qcGroup.add_argument('--type-isolate', help='Isolate from which distances will be calculated for pruning [default = None]', default = None, type = str) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = 5, type = int) @@ -234,7 +234,7 @@ def main(): 'upper_n': args.upper_n, 'max_pi_dist': args.max_pi_dist, 'max_a_dist': args.max_a_dist, - 'reference_isolate': args.reference_isolate + 'type_isolate': args.type_isolate } # Dict of DB access functions diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 3f9b05ee..bc86d15e 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -38,7 +38,7 @@ def assign_query(dbFuncs, graph_weights, max_a_dist, max_pi_dist, - reference_isolate, + type_isolate, model_dir, strand_preserved, previous_clustering, @@ -378,7 +378,7 @@ def get_options(): default = 0.5, type = float) qcGroup.add_argument('--max-pi-dist', help='Maximum core distance to permit [default = 0.5]', default = 0.5, type = float) - qcGroup.add_argument('--reference-isolate', help='Isolate from which distances can be calculated for pruning [default = None]', + qcGroup.add_argument('--type-isolate', help='Isolate from which distances can be calculated for pruning [default = None]', default = None, type = str) qcGroup.add_argument('--length-sigma', help='Number of standard deviations of length distribution beyond ' 'which sequences will be excluded [default = 5]', default = None, type = int) @@ -471,7 +471,7 @@ def main(): 'upper_n': args.upper_n, 'max_pi_dist': args.max_pi_dist, 'max_a_dist': args.max_a_dist, - 'reference_isolate': args.reference_isolate + 'type_isolate': args.type_isolate } # Dict of DB access functions for assign_query (which is out of scope) @@ -510,7 +510,7 @@ def main(): args.graph_weights, args.max_a_dist, args.max_pi_dist, - args.reference_isolate, + args.type_isolate, args.model_dir, args.strand_preserved, args.previous_clustering, diff 
--git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 528fc1d2..cdfd3ef8 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -572,8 +572,8 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num return distMat -def pickReferenceIsolate(prefix, names): - """Selects a reference isolate as that with a minimal proportion +def pickTypeIsolate(prefix, names): + """Selects a type isolate as that with a minimal proportion of missing data. Args: @@ -583,7 +583,7 @@ def pickReferenceIsolate(prefix, names): Names of samples to QC Returns: - reference_isolate (str) + type_isolate (str) Name of isolate selected as reference """ # open databases @@ -591,7 +591,7 @@ def pickReferenceIsolate(prefix, names): hdf_in = h5py.File(db_name, 'r+') min_prop_n = 1.0 - reference_isolate = None + type_isolate = None try: # process data structures @@ -600,7 +600,7 @@ def pickReferenceIsolate(prefix, names): for dataset in read_grp: if hdf_in['sketches'][dataset].attrs['missing_bases']/hdf_in['sketches'][dataset].attrs['length'] < min_prop_n: min_prop_n = hdf_in['sketches'][dataset].attrs['missing_bases']/hdf_in['sketches'][dataset].attrs['length'] - reference_isolate = dataset + type_isolate = dataset if min_prop_n == 0.0: break # if failure still close files to avoid corruption @@ -610,7 +610,7 @@ def pickReferenceIsolate(prefix, names): print("Unexpected error:", sys.exc_info()[0], file = sys.stderr) raise - return reference_isolate + return type_isolate def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads): """Calculates random match probability based on means of genomes diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 41ef484a..fee09417 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -253,7 +253,7 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): # avoid circular import from .prune_db import prune_distance_matrix from .sketchlib import removeFromDB - from .sketchlib import pickReferenceIsolate + from .sketchlib import pickTypeIsolate # Create overall list of sequences if refList == refList: @@ -272,10 +272,10 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): sys.stderr.write("Cannot create output directory " + prefix + "\n") sys.exit(1) - # Pick reference isolate if not supplied - if qc_dict['reference_isolate'] is None: - qc_dict['reference_isolate'] = pickReferenceIsolate(ref_db, seq_names_passing) - sys.stderr.write('Selected reference isolate is ' + qc_dict['reference_isolate'] + '\n') + # Pick type isolate if not supplied + if qc_dict['type_isolate'] is None: + qc_dict['type_isolate'] = pickTypeIsolate(ref_db, seq_names_passing) + sys.stderr.write('Selected type isolate is ' + qc_dict['type_isolate'] + '\n') # First check with numpy, which is quicker than iterating over everything long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist() @@ -283,9 +283,9 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): names = list(iterDistRows(refList, queryList, refList == queryList)) # Prune sequences based on reference sequence for i in long_distance_rows: - if names[i][0] == qc_dict['reference_isolate']: + if names[i][0] == qc_dict['type_isolate']: to_prune.append(names[i][1]) - elif names[i][1] == qc_dict['reference_isolate']: + elif names[i][1] == qc_dict['type_isolate']: to_prune.append(names[i][0]) # prune based on distance from reference if provided @@ -294,7 +294,7 @@ def qcDistMat(distMat, 
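pickTypeIsolate, renamed above, chooses the sample whose sketch carries the smallest proportion of missing bases. Stripped of the HDF5 plumbing, the selection reduces to a minimum over ratios; sample names and attribute values here are illustrative:

    sketches = {'sample1': {'missing_bases': 50, 'length': 2000000},
                'sample2': {'missing_bases': 10, 'length': 2100000},
                'sample3': {'missing_bases': 400, 'length': 1900000}}
    type_isolate = min(sketches,
                       key=lambda s: sketches[s]['missing_bases'] / sketches[s]['length'])
    print(type_isolate)  # sample2
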
refList, queryList, ref_db, prefix, qc_dict): sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') sys.exit(1) elif qc_dict['qc_filter'] == 'prune' and len(to_prune) > 0: - if qc_dict['reference_isolate'] is None: + if qc_dict['type_isolate'] is None: sys.stderr.write('Distances exceeded QC thresholds but no reference isolate supplied\n') sys.stderr.write('Problem distances involved sequences ' + ';'.join(to_prune) + '\n') sys.exit(1) diff --git a/PopPUNK/web.py b/PopPUNK/web.py index a8ed3a7e..5303a724 100644 --- a/PopPUNK/web.py +++ b/PopPUNK/web.py @@ -76,7 +76,7 @@ def sketchAssign(): args.assign.graph_weights, args.assign.max_a_dist, args.assign.max_pi_dist, - args.assign.reference_isolate, + args.assign.type_isolate, args.assign.model_dir, args.assign.strand_preserved, args.assign.previous_clustering, From a94fe2d686b3d9cebcf5d5c27323726204ac3fbc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 09:02:51 +0000 Subject: [PATCH 300/327] Check type isolate is in QC filtered set --- PopPUNK/sketchlib.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index cdfd3ef8..4aa4d43f 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -742,6 +742,13 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads # This gives back retained in the same order as names retained = [x for x in names if x in frozenset(retained)] + + # stop if type sequence does not pass QC or is absent + if qc_dict['type_isolate'] not in retained: + sys.stderr.write('Type isolate ' + qc_dict['type_isolate'] + ' not found in isolates after QC; check ' + 'name of type isolate and QC options\n') + sys.exit(1) + return retained def fitKmerCurve(pairwise, klist, jacobian): From 348e502795e4d98670a53ed0e1f9ee08a1b2db13 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 09:22:37 +0000 Subject: [PATCH 301/327] Add type isolate to reference set --- PopPUNK/__main__.py | 1 + PopPUNK/assign.py | 10 ++++++++-- PopPUNK/network.py | 20 ++++++++++++++++++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 592102c1..6cdbabaa 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -538,6 +538,7 @@ def main(): extractReferences(genomeNetwork, refList, output, + type_isolate = qc_dict['type_isolate'], threads = args.threads, use_gpu = args.gpu_graph) nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index bc86d15e..ce4e2d2c 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -295,7 +295,13 @@ def assign_query(dbFuncs, dbOrder = rNames + qNames newRepresentativesIndices, newRepresentativesNames, \ newRepresentativesFile, genomeNetwork = \ - extractReferences(genomeNetwork, dbOrder, output, rNames, threads = threads, use_gpu = gpu_graph) + extractReferences(genomeNetwork, + dbOrder, + output, + rNames, + type_isolate = qc_dict['type_isolate'], + threads = threads, + use_gpu = gpu_graph) # intersection that maintains order newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] @@ -444,7 +450,7 @@ def main(): # Dict of QC options for passing to database construction and querying functions if args.length_sigma is None and None in args.length_range and args.prop_n is None \ and args.upper_n is None and args.max_a_dist is None and args.max_pi_dist is None: - qc_dict = {'run_qc': False } + qc_dict = {'run_qc': False, 
'type_isolate': None } else: # define defaults if one QC parameter given # length_sigma diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 666bdd6f..6cc611fd 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -206,7 +206,8 @@ def cliquePrune(component, graph, reference_indices, components_list): ref_list = getCliqueRefs(subgraph, refs) return(list(ref_list)) -def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, use_gpu = False): +def extractReferences(G, dbOrder, outPrefix, type_isolate = None, + existingRefs = None, threads = 1, use_gpu = False): """Extract references for each cluster based on cliques Writes chosen references to file by calling :func:`~writeReferences` @@ -218,6 +219,8 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u The order of files in the sketches, so returned references are in the same order outPrefix (str) Prefix for output file (.refs will be appended) + type_isolate (str) + Isolate to be included in set of references existingRefs (list) References that should be used for each clique use_gpu (bool) @@ -237,6 +240,15 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u index_lookup = {v:k for k,v in enumerate(dbOrder)} reference_indices = set([index_lookup[r] for r in references]) + # Add type isolate, if necessary + type_isolate_index = None + if type_isolate is not None: + if type_isolate is in dbOrder: + type_isolate_index = dbOrder.index(type_isolate) + else: + sys.stderr.write('Type isolate ' + type_isolate + ' not found\n') + sys.exit(1) + if use_gpu: if not gpu_lib: @@ -252,7 +264,7 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u # group by partition, which becomes the first column, so retrieve second column reference_index_df = partition_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() - + # Order found references as in sketchlib database reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) @@ -383,6 +395,10 @@ def extractReferences(G, dbOrder, outPrefix, existingRefs = None, threads = 1, u G_ref = gt.GraphView(G, vfilt = reference_vertex) G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object + # Add type isolate if necessary + if type_isolate_index is not None and type_isolate_index not in reference_indices: + reference_indices.add(type_isolate_index) + # Order found references as in sketch files reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) From 5041a91cc838de18e20acdc1e25cd6564a69dd2e Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 09:37:33 +0000 Subject: [PATCH 302/327] Fixes to conditional statements --- PopPUNK/network.py | 2 +- PopPUNK/sketchlib.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6cc611fd..1de0cfbc 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -243,7 +243,7 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # Add type isolate, if necessary type_isolate_index = None if type_isolate is not None: - if type_isolate is in dbOrder: + if type_isolate in dbOrder: type_isolate_index = dbOrder.index(type_isolate) else: sys.stderr.write('Type isolate ' + type_isolate + ' not found\n') diff --git 
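
Patches 301-303 converge on one idea: after clique pruning has chosen a reference set, the type isolate must be forced back in so it can never be pruned away. A condensed sketch of that guarantee, with an illustrative helper name (the real logic lives inline in extractReferences()):

    def ensure_type_isolate(reference_indices, db_order, type_isolate):
        """Add the type isolate's vertex index to the reference set.

        reference_indices: set of vertex indices kept as references
        db_order: sample names in sketch database order
        type_isolate: sample that must remain a reference, or None
        """
        if type_isolate is not None:
            if type_isolate not in db_order:
                raise ValueError('Type isolate ' + type_isolate + ' not found')
            reference_indices.add(db_order.index(type_isolate))
        return reference_indices

Note that the index has to be added before the pruned reference graph is built from those indices, which is exactly the reordering patch 303 makes.
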
a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 4aa4d43f..7a8ef51d 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -744,7 +744,7 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads retained = [x for x in names if x in frozenset(retained)] # stop if type sequence does not pass QC or is absent - if qc_dict['type_isolate'] not in retained: + if qc_dict['type_isolate'] is not None and qc_dict['type_isolate'] not in retained: sys.stderr.write('Type isolate ' + qc_dict['type_isolate'] + ' not found in isolates after QC; check ' 'name of type isolate and QC options\n') sys.exit(1) From 7901e42ce7d3129cc4cb491d8077accb6d347bc0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 11:28:57 +0000 Subject: [PATCH 303/327] Fixes to function arguments --- PopPUNK/assign.py | 2 +- PopPUNK/network.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index ce4e2d2c..dc359f5e 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -298,7 +298,7 @@ def assign_query(dbFuncs, extractReferences(genomeNetwork, dbOrder, output, - rNames, + existingRefs = rNames, type_isolate = qc_dict['type_isolate'], threads = threads, use_gpu = gpu_graph) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1de0cfbc..30f5be08 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -160,7 +160,7 @@ def checkNetworkVertexCount(seq_list, G, use_gpu): vertex_list = set(get_vertex_list(G, use_gpu = use_gpu)) networkMissing = set(set(range(len(seq_list))).difference(vertex_list)) if len(networkMissing) > 0: - sys.stderr.write("ERROR: Samples " + ",".join(map(str,networkMissing)) + " are missing from the final network\n") + sys.stderr.write("ERROR: " + str(len(networkMissing)) + " samples are missing from the final network\n") sys.exit(1) def getCliqueRefs(G, reference_indices = set()): @@ -264,7 +264,11 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # group by partition, which becomes the first column, so retrieve second column reference_index_df = partition_assignments.groupby('partition').nth(0) reference_indices = reference_index_df['vertex'].to_arrow().to_pylist() - + + # Add type isolate if necessary - before edges are added + if type_isolate_index is not None and type_isolate_index not in reference_indices: + reference_indices.add(type_isolate_index) + # Order found references as in sketchlib database reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) @@ -339,6 +343,10 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # Returns nested lists, which need to be flattened reference_indices = set([entry for sublist in ref_lists for entry in sublist]) + # Add type isolate if necessary - before edges are added + if type_isolate_index is not None and type_isolate_index not in reference_indices: + reference_indices.add(type_isolate_index) + if gt.openmp_enabled(): gt.openmp_set_num_threads(threads) @@ -395,10 +403,6 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, G_ref = gt.GraphView(G, vfilt = reference_vertex) G_ref = gt.Graph(G_ref, prune = True) # https://stackoverflow.com/questions/30839929/graph-tool-graphview-object - # Add type isolate if necessary - if type_isolate_index is not None and type_isolate_index not in reference_indices: - reference_indices.add(type_isolate_index) - # Order found references as in sketch files reference_names = 
[dbOrder[int(x)] for x in sorted(reference_indices)] refFileName = writeReferences(reference_names, outPrefix) From 7742e8344993949f02e377abd79c1fc1e3e46f8d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 11:30:21 +0000 Subject: [PATCH 304/327] Change cudf memory management --- PopPUNK/network.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 30f5be08..b10512eb 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -622,7 +622,10 @@ def constructNetwork(rlist, qlist, assignments, within_label, if not gpu_lib: sys.stderr.write('Unable to load GPU libraries; exiting\n') sys.exit(1) - + + # Set memory management for large networks + cudf.set_allocator("managed") + # create DataFrame using edge tuples if weights is not None or sparse_input is not None: G_df = cudf.DataFrame(connections, columns =['source', 'destination', 'weights']) From 77e9d64e0a3ea1a6aefae236c67c825afe71eb32 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 12:40:07 +0000 Subject: [PATCH 305/327] Add no-plot mode for models --- PopPUNK/__main__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 6cdbabaa..075cf1b3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -116,6 +116,8 @@ def get_options(): modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster ' 'in DBSCAN fitting [default = 0.0001]', type=float, default=0.0001) modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float) + modelGroup.add_argument('--no-plot', help='Switch off plotting, which can be slow for large datasets', type=bool, + default=False, action='store_true') # model refinement refinementGroup = parser.add_argument_group('Refine model options') From 2c7d0141edf14042e00dc3faee013e7daa8f12cc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 12:42:19 +0000 Subject: [PATCH 306/327] Extend no-plot mode for models --- PopPUNK/__main__.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 075cf1b3..72f8fede 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -389,13 +389,15 @@ def main(): model = DBSCANFit(output) model.set_threads(args.threads) assignments = model.fit(distMat, args.D, args.min_cluster_prop) - model.plot() + if not args.no_plot: + model.plot() # Run Gaussian model elif args.fit_model == "bgmm": model = BGMMFit(output) model.set_threads(args.threads) assignments = model.fit(distMat, args.K) - model.plot(distMat, assignments) + if not args.no_plot: + model.plot(distMat, assignments) elif args.fit_model == "refine": new_model = RefineFit(output) model.set_threads(args.threads) @@ -407,14 +409,16 @@ def main(): args.score_idx, args.no_local, args.gpu_graph) - new_model.plot(distMat) + if not args.no_plot: + new_model.plot(distMat) model = new_model elif args.fit_model == "threshold": new_model = RefineFit(output) new_model.set_threads(args.threads) assignments = new_model.apply_threshold(distMat, args.threshold) - new_model.plot(distMat) + if not args.no_plot: + new_model.plot(distMat) model = new_model elif args.fit_model == "lineage": # run lineage clustering. 
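
Patch 304's one-line change switches cuDF onto CUDA managed (unified) memory, so device data structures larger than GPU RAM can spill to host memory instead of failing outright. A guarded sketch of the pattern, assuming a RAPIDS installation (newer RAPIDS releases expose the same switch as rmm.reinitialize(managed_memory=True)):

    try:
        import cudf
        gpu_lib = True
    except ImportError:
        gpu_lib = False

    def enable_managed_memory():
        """Allow cuDF allocations to oversubscribe device memory."""
        if not gpu_lib:
            raise RuntimeError('Unable to load GPU libraries')
        cudf.set_allocator('managed')
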
Sparsity & low rank should keep memory @@ -422,7 +426,8 @@ def main(): model = LineageFit(output, rank_list) model.set_threads(args.threads) model.fit(distMat, args.use_accessory) - model.plot(distMat) + if not args.no_plot: + model.plot(distMat) assignments = {} for rank in rank_list: From c450710f102053c3186ed86863795ff729e979eb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:12:36 +0000 Subject: [PATCH 307/327] Move model processing flags to optimisation arg group --- PopPUNK/__main__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 72f8fede..4d8a78cc 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -116,8 +116,6 @@ def get_options(): modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster ' 'in DBSCAN fitting [default = 0.0001]', type=float, default=0.0001) modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float) - modelGroup.add_argument('--no-plot', help='Switch off plotting, which can be slow for large datasets', type=bool, - default=False, action='store_true') # model refinement refinementGroup = parser.add_argument_group('Refine model options') @@ -127,8 +125,6 @@ def get_options(): type=float, default = None) refinementGroup.add_argument('--manual-start', help='A file containing information for a start point. ' 'See documentation for help.', default=None) - refinementGroup.add_argument('--no-local', help='Do not perform the local optimization step (speed up on very large datasets)', - default=False, action='store_true') refinementGroup.add_argument('--model-dir', help='Directory containing model to use for assigning queries ' 'to clusters [default = reference database directory]', type = str) refinementGroup.add_argument('--score-idx', @@ -159,6 +155,10 @@ def get_options(): other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]') other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]') other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') + other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets', type=bool, + default=False, action='store_true') + other.add_argument('--no-local', help='Do not perform the local optimization step in model refinement (speed up on very large datasets)', + default=False, action='store_true') other.add_argument('--version', action='version', version='%(prog)s '+__version__) From f823964f92b8e3ec4057491a7ff2c65ff10d6522 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:15:44 +0000 Subject: [PATCH 308/327] Update new lines --- scripts/poppunk_batch_mst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/poppunk_batch_mst.py b/scripts/poppunk_batch_mst.py index 6a6f8eae..50fa152e 100755 --- a/scripts/poppunk_batch_mst.py +++ b/scripts/poppunk_batch_mst.py @@ -247,7 +247,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, # Validate batches batch_names = sorted(set(batches)) if len(batch_names) < 2: - sys.stderr.write("You must supply multiple batches") + sys.stderr.write("You must supply multiple batches\n") sys.exit(1) first_batch = batch_names.pop(0) From a5c4f0282f98084706307cf3ef07b9066f7ae432 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 
13:17:35 +0000 Subject: [PATCH 309/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index b10512eb..e58a442b 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -322,7 +322,7 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, reference_indices = list(reference_index_set) # Create new reference graph G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = False) + G_ref = add_self_loop(G_ref_df, max_in_vertex_labels, renumber = False) else: From e6abfb489bc3c5f02c1bf8841a933c3a1119acbd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:17:58 +0000 Subject: [PATCH 310/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e58a442b..e455e076 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -705,7 +705,7 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): triangle_count = cugraph.community.triangle_count.triangles(G) degree_df = G.in_degree() triad_count = sum([d * (d - 1) for d in degree_df['degree'].to_pandas()]) - transitivity = 2*triangle_count/triad_count + transitivity = 2 * triangle_count/triad_count else: component_assignments, component_frequencies = gt.label_components(G) components = len(component_frequencies) From 78ee9c23dd61415abc9ee96b26af9e4ffae8b0e5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:18:25 +0000 Subject: [PATCH 311/327] Edit whitespace Co-authored-by: John Lees --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e455e076..7c2b6ac4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -934,7 +934,7 @@ def add_self_loop(G_df, seq_num, weights = False, renumber = True): Dictionary of cluster assignments (keys are sequence names) """ # use self-loop to ensure all nodes are present - min_in_df = np.amin([G_df['source'].min(),G_df['destination'].min()]) + min_in_df = np.amin([G_df['source'].min(), G_df['destination'].min()]) if min_in_df.item() > 0: G_self_loop = cudf.DataFrame() G_self_loop['source'] = [0] From 5b362fbb70dd3ad095b46e5f27eeadd5f916d656 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:19:39 +0000 Subject: [PATCH 312/327] Reinsert library loading warning --- PopPUNK/refine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index f72c6900..2095e82e 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -30,6 +30,7 @@ import cudf gpu_lib = True except ImportError as e: + sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False from .network import constructNetwork From 03f11a86c907a527f5725cd95f87858e6fa9a68a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:34:51 +0000 Subject: [PATCH 313/327] Change model plotting behaviour --- PopPUNK/__main__.py | 23 +++++++++-------------- PopPUNK/models.py | 5 ++++- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 4d8a78cc..47e17ba3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -155,7 +155,7 @@ def get_options(): other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when 
calculating distances [default = False]') other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]') other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]') - other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets', type=bool, + other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets', default=False, action='store_true') other.add_argument('--no-local', help='Do not perform the local optimization step in model refinement (speed up on very large datasets)', default=False, action='store_true') @@ -323,9 +323,10 @@ def main(): qc_dict) # Plot results - plot_scatter(distMat, - args.output + "/" + os.path.basename(args.output) + "_distanceDistribution", - args.output + " distances") + if not args.no_plot: + plot_scatter(distMat, + args.output + "/" + os.path.basename(args.output) + "_distanceDistribution", + args.output + " distances") #******************************# #* *# @@ -389,15 +390,11 @@ def main(): model = DBSCANFit(output) model.set_threads(args.threads) assignments = model.fit(distMat, args.D, args.min_cluster_prop) - if not args.no_plot: - model.plot() # Run Gaussian model elif args.fit_model == "bgmm": model = BGMMFit(output) model.set_threads(args.threads) assignments = model.fit(distMat, args.K) - if not args.no_plot: - model.plot(distMat, assignments) elif args.fit_model == "refine": new_model = RefineFit(output) model.set_threads(args.threads) @@ -409,16 +406,12 @@ def main(): args.score_idx, args.no_local, args.gpu_graph) - if not args.no_plot: - new_model.plot(distMat) model = new_model elif args.fit_model == "threshold": new_model = RefineFit(output) new_model.set_threads(args.threads) assignments = new_model.apply_threshold(distMat, args.threshold) - if not args.no_plot: - new_model.plot(distMat) model = new_model elif args.fit_model == "lineage": # run lineage clustering. 
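
The --no-plot change above is more than tidying: as first declared in patch 305, the flag combined type=bool with action='store_true', a combination argparse rejects with a TypeError the moment the argument is defined, because store-true actions accept no type keyword. The corrected shape, runnable as-is:

    import argparse

    parser = argparse.ArgumentParser()
    # action='store_true' already yields a bool; passing type=bool as well
    # raises TypeError when add_argument() is called
    parser.add_argument('--no-plot', default=False, action='store_true',
                        help='Switch off model plotting')

    args = parser.parse_args(['--no-plot'])
    assert args.no_plot is True
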
Sparsity & low rank should keep memory @@ -426,8 +419,6 @@ def main(): model = LineageFit(output, rank_list) model.set_threads(args.threads) model.fit(distMat, args.use_accessory) - if not args.no_plot: - model.plot(distMat) assignments = {} for rank in rank_list: @@ -436,6 +427,10 @@ def main(): # save model model.save() + + # plot model + if not args.no_plot: + model.plot(distMat, assignments) # use model else: diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 46be4219..48968247 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -1048,7 +1048,7 @@ def load(self, fit_npz, fit_obj): self.nn_dists = fit_npz self.fitted = True - def plot(self, X): + def plot(self, X, y = None): '''Extends :func:`~ClusterFit.plot` Write a summary of the fit, and plot the results using @@ -1057,6 +1057,9 @@ def plot(self, X): Args: X (numpy.array) Core and accessory distances + y (any) + Unused variable for compatibility with other + plotting functions ''' ClusterFit.plot(self, X) for rank in self.ranks: From 3e5160e5a1b7520cf4d475e6a975d6ded57cb9fd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 13:59:58 +0000 Subject: [PATCH 314/327] Add cudf and cugraph --- environment.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/environment.yml b/environment.yml index 0c525fe0..d8bdce3e 100644 --- a/environment.yml +++ b/environment.yml @@ -4,6 +4,8 @@ channels: - bioconda - defaults - r + - nvidia + - rapidsai dependencies: - pip - numpy @@ -42,3 +44,5 @@ dependencies: - libgomp - tqdm - flask-apscheduler + - cudf + - cugraph From 13e565fec7099512f8fcc84a89ec0fa7c71e44c6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 17:24:19 +0000 Subject: [PATCH 315/327] Change list command to append --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 7c2b6ac4..d684c35d 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -267,7 +267,7 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # Add type isolate if necessary - before edges are added if type_isolate_index is not None and type_isolate_index not in reference_indices: - reference_indices.add(type_isolate_index) + reference_indices.append(type_isolate_index) # Order found references as in sketchlib database reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] From 771ae57ee25925e2a4b42b676ee80c5a3b15c920 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 19:29:19 +0000 Subject: [PATCH 316/327] Limit betweenness calculation with GPU --- PopPUNK/network.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index d684c35d..4019b919 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -310,7 +310,8 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - predecessors = set(predecessor_list[predecessor_list >= 0]) + print("Predecessors: " + str(predecessor_list) + " type: " + str(type(predecessor_list))) + predecessors = set(predecessor_list[predecessor_list >= 0].values) # Add predecessors to reference set and check whether this results in complete paths # where complete paths are indicated by references' predecessors being within the set of # references @@ -725,7 +726,7 @@ def networkSummary(G, 
calc_betweenness=True, use_gpu = False): if size > 3: component_vertices = component_assignments['vertex'][component_assignments['labels']==component] subgraph = cugraph.subgraph(G, component_vertices) - component_betweenness = cugraph.betweenness_centrality(G) + component_betweenness = cugraph.betweenness_centrality(G, k = 250) betweenness.append(component_betweenness['betweenness_centrality'].max()) sizes.append(size) else: From ca8c4292d4c350b6364cee0844b499a8f61ecad6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 20:32:53 +0000 Subject: [PATCH 317/327] Set from numpy ndarray --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ac42166b..1badcce9 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -312,8 +312,8 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - print("Predecessors: " + str(predecessor_list) + " type: " + str(type(predecessor_list))) - predecessors = set(predecessor_list[predecessor_list >= 0].values) + print("Predecessors: " + str(predecessor_list.flatten()) + " type: " + str(type(predecessor_list))) + predecessors = set(predecessor_list[predecessor_list >= 0].flatten()) # Add predecessors to reference set and check whether this results in complete paths # where complete paths are indicated by references' predecessors being within the set of # references From b526a2f06b594a93d7efb1db731d3f420bd67196 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 21:40:57 +0000 Subject: [PATCH 318/327] Convert ndarray to list --- PopPUNK/network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 1badcce9..98df6821 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -312,8 +312,8 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - print("Predecessors: " + str(predecessor_list.flatten()) + " type: " + str(type(predecessor_list))) - predecessors = set(predecessor_list[predecessor_list >= 0].flatten()) + print("Predecessors: " + str(predecessor_list.flatten().tolist()) + " type: " + str(type(predecessor_list))) + predecessors = set(predecessor_list[predecessor_list >= 0].flatten().tolist()) # Add predecessors to reference set and check whether this results in complete paths # where complete paths are indicated by references' predecessors being within the set of # references From 414c27bc838aee5ea559f8439210a3682f8ed77f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 22:06:50 +0000 Subject: [PATCH 319/327] Convert ndarray to list --- PopPUNK/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 98df6821..e6091054 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -320,7 +320,7 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, while len(predecessors) > 0 and len(predecessors - reference_index_set) > 0: reference_index_set = reference_index_set.union(predecessors) predecessor_list = 
traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - predecessors = set(predecessor_list[predecessor_list >= 0]) + predecessors = set(predecessor_list[predecessor_list >= 0].flatten().tolist()) # Add expanded reference set to the overall list reference_indices = list(reference_index_set) # Create new reference graph From 0cef59d21b518cac65014b2445e12227fa0cdcd6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 22 Mar 2021 22:40:01 +0000 Subject: [PATCH 320/327] Remove debug message --- PopPUNK/network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index e6091054..bb6bf73e 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -312,7 +312,6 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, reference_index_set = set(reference_indices) # Add predecessors to reference sequences on the SSSPs predecessor_list = traversal[traversal['vertex'].isin(reference_indices)]['predecessor'].values - print("Predecessors: " + str(predecessor_list.flatten().tolist()) + " type: " + str(type(predecessor_list))) predecessors = set(predecessor_list[predecessor_list >= 0].flatten().tolist()) # Add predecessors to reference set and check whether this results in complete paths # where complete paths are indicated by references' predecessors being within the set of From d6667a1e95004e3c1f207e512aebbba7ef97a9ba Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 12:23:55 +0000 Subject: [PATCH 321/327] Remove nvidia packages from CI --- environment.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/environment.yml b/environment.yml index d8bdce3e..0c525fe0 100644 --- a/environment.yml +++ b/environment.yml @@ -4,8 +4,6 @@ channels: - bioconda - defaults - r - - nvidia - - rapidsai dependencies: - pip - numpy @@ -44,5 +42,3 @@ dependencies: - libgomp - tqdm - flask-apscheduler - - cudf - - cugraph From b5e03eba24eb05b127dffa375e3b11da762e50d9 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 12:25:37 +0000 Subject: [PATCH 322/327] Remove whitespace --- PopPUNK/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index fee09417..fc4c4310 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -242,25 +242,25 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): Prefix of output files qc_dict (dict) Dict of QC options - + Returns: seq_names_passing (list) List of isolates passing QC distance filters distMat ([n,2] numpy ndarray) Filtered long form distance matrix """ - + # avoid circular import from .prune_db import prune_distance_matrix from .sketchlib import removeFromDB from .sketchlib import pickTypeIsolate - + # Create overall list of sequences if refList == refList: seq_names_passing = refList else: seq_names_passing = refList + queryList - + # Sequences to remove to_prune = [] @@ -287,7 +287,7 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): to_prune.append(names[i][1]) elif names[i][1] == qc_dict['type_isolate']: to_prune.append(names[i][0]) - + # prune based on distance from reference if provided if qc_dict['qc_filter'] == 'stop' and len(to_prune) > 0: sys.stderr.write('Outlier distances exceed QC thresholds; prune sequences or raise thresholds\n') From 62e69d25e2dfb03eb14f86c0e5b37cf49575897a Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 12:29:08 +0000 Subject: [PATCH 323/327] Remove cudf/cugraph err message --- PopPUNK/network.py | 6 +++--- 
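
Patches 317-319 above are all wrestling with the same boundary: cuDF's Series.values returns a device-side (CuPy) array, and its filtered elements cannot be fed straight into a Python set. The flatten().tolist() round trip moves the data to the host first. A sketch of the final form, shown here with NumPy, whose API CuPy mirrors:

    import numpy as np  # stand-in: cupy exposes the same calls on the GPU

    def predecessors_to_set(predecessor_list):
        """Drop the -1 'no predecessor' sentinels and return a host-side set."""
        kept = predecessor_list[predecessor_list >= 0]
        return set(kept.flatten().tolist())

    # predecessors_to_set(np.array([-1, 0, 3, 3])) returns {0, 3}
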
PopPUNK/refine.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index bb6bf73e..6bd4bb90 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -19,6 +19,7 @@ from collections import defaultdict, Counter from functools import partial from multiprocessing import Pool +import pickle import graph_tool.all as gt import dendropy @@ -252,7 +253,6 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, sys.exit(1) if use_gpu: - if not gpu_lib: sys.stderr.write('Unable to load GPU libraries; exiting\n') sys.exit(1) @@ -327,7 +327,6 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, G_ref = add_self_loop(G_ref_df, max_in_vertex_labels, renumber = False) else: - # Each component is independent, so can be multithreaded components = gt.label_components(G)[0].a @@ -470,7 +469,8 @@ def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, else: old_ids = old_rlist + old_qlist else: - sys.stderr.write('Pkl file containing names of sequences in previous network\n') + sys.stderr.write('Missing .pkl file containing names of sequences in ' + 'previous network\n') sys.exit(1) # Get edges as lists of source,destination,weight using original IDs diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 55196767..139a50bc 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -30,7 +30,6 @@ import cudf gpu_lib = True except ImportError as e: - sys.stderr.write("cugraph and cudf unavailable\n") gpu_lib = False from .network import constructNetwork From a1c6692ea9b6ab3270713054485fde99129e64df Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 12:35:40 +0000 Subject: [PATCH 324/327] trailing whitespace --- PopPUNK/sketchlib.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 7a8ef51d..b5099e59 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -592,7 +592,7 @@ def pickTypeIsolate(prefix, names): min_prop_n = 1.0 type_isolate = None - + try: # process data structures read_grp = hdf_in['sketches'] @@ -742,13 +742,13 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads # This gives back retained in the same order as names retained = [x for x in names if x in frozenset(retained)] - + # stop if type sequence does not pass QC or is absent if qc_dict['type_isolate'] is not None and qc_dict['type_isolate'] not in retained: sys.stderr.write('Type isolate ' + qc_dict['type_isolate'] + ' not found in isolates after QC; check ' 'name of type isolate and QC options\n') sys.exit(1) - + return retained def fitKmerCurve(pairwise, klist, jacobian): From 1a11b21c926e3da08332094f49a1a17a2d876c94 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 23 Mar 2021 12:44:13 +0000 Subject: [PATCH 325/327] Change cugraph betweenness calculation --- PopPUNK/network.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6bd4bb90..34354c1f 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -727,7 +727,11 @@ def networkSummary(G, calc_betweenness=True, use_gpu = False): if size > 3: component_vertices = component_assignments['vertex'][component_assignments['labels']==component] subgraph = cugraph.subgraph(G, component_vertices) - component_betweenness = cugraph.betweenness_centrality(G, k = 250) + max_betweeness_k = 1000 + if len(component_vertices) >= max_betweeness_k: + component_betweenness = 
cugraph.betweenness_centrality(subgraph, k = max_betweeness_k) + else: + component_betweenness = cugraph.betweenness_centrality(subgraph) betweenness.append(component_betweenness['betweenness_centrality'].max()) sizes.append(size) else: From f98bd0fa2c7a029f8196a9056ada11d614c0dd24 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 16:32:00 +0000 Subject: [PATCH 326/327] Remove multiprocessing block from 2d network refine w/ GPU --- MANIFEST.in | 2 +- PopPUNK/refine.py | 68 +++++++++++++++++++++++++++-------------------- PopPUNK/utils.py | 2 +- environment.yml | 2 +- 4 files changed, 42 insertions(+), 32 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index b510d8bd..ad9e8edf 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ recursive-include scripts *.py -recursive-include PopPUNK/data *.json *.gz *.txt \ No newline at end of file +recursive-include PopPUNK/data *.gz \ No newline at end of file diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index 139a50bc..fc3b752b 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -112,30 +112,41 @@ def refineFit(distMat, sample_names, start_s, mean0, mean1, x_max = np.linspace(x_max_start, x_max_end, global_grid_resolution, dtype=np.float32) y_max = np.linspace(y_max_start, y_max_end, global_grid_resolution, dtype=np.float32) - if gt.openmp_enabled(): - gt.openmp_set_num_threads(1) - - with SharedMemoryManager() as smm: - shm_distMat = smm.SharedMemory(size = distMat.nbytes) - distances_shared_array = np.ndarray(distMat.shape, dtype = distMat.dtype, buffer = shm_distMat.buf) - distances_shared_array[:] = distMat[:] - distances_shared = NumpyShared(name = shm_distMat.name, shape = distMat.shape, dtype = distMat.dtype) - - with Pool(processes = num_processes) as pool: - global_s = pool.map(partial(newNetwork2D, - sample_names = sample_names, - distMat = distances_shared, - x_range = x_max, - y_range = y_max, - score_idx = score_idx, - use_gpu = use_gpu), - range(global_grid_resolution)) - - if gt.openmp_enabled(): - gt.openmp_set_num_threads(num_processes) - - global_s = list(chain.from_iterable(global_s)) - min_idx = np.argmin(np.array(global_s)) + if use_gpu: + global_s = map(partial(newNetwork2D, + sample_names = sample_names, + distMat = distMat, + x_range = x_max, + y_range = y_max, + score_idx = score_idx, + use_gpu = True), + range(global_grid_resolution)) + else: + if gt.openmp_enabled(): + gt.openmp_set_num_threads(1) + + with SharedMemoryManager() as smm: + shm_distMat = smm.SharedMemory(size = distMat.nbytes) + distances_shared_array = np.ndarray(distMat.shape, dtype = distMat.dtype, buffer = shm_distMat.buf) + distances_shared_array[:] = distMat[:] + distances_shared = NumpyShared(name = shm_distMat.name, shape = distMat.shape, dtype = distMat.dtype) + + with Pool(processes = num_processes) as pool: + global_s = pool.map(partial(newNetwork2D, + sample_names = sample_names, + distMat = distances_shared, + x_range = x_max, + y_range = y_max, + score_idx = score_idx, + use_gpu = False), + range(global_grid_resolution)) + + if gt.openmp_enabled(): + gt.openmp_set_num_threads(num_processes) + + global_s = np.array(list(chain.from_iterable(global_s))) + global_s[np.isnan(global_s)] = 1 + min_idx = np.argmin(global_s) optimal_x = x_max[min_idx % global_grid_resolution] optimal_y = y_max[min_idx // global_grid_resolution] @@ -217,18 +228,17 @@ def growNetwork(sample_names, i_vec, j_vec, idx_vec, s_range, score_idx, thread_ Optional thread idx (if multithreaded) to offset progress bar by use_gpu (bool) Whether to 
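
Patch 325 replaces the earlier attempt of patch 316 (a fixed k = 250, which also mistakenly scored the whole graph G rather than the component's subgraph) with sampled betweenness on the subgraph. Exact betweenness is O(VE) with Brandes' algorithm, so for large components cuGraph's k parameter estimates it from a sample of source vertices. A sketch of the decision, assuming a RAPIDS installation:

    import cugraph

    def component_max_betweenness(subgraph, n_vertices, max_k=1000):
        """Maximum betweenness centrality within one network component."""
        if n_vertices >= max_k:
            # estimate from max_k sampled source vertices
            scores = cugraph.betweenness_centrality(subgraph, k=max_k)
        else:
            scores = cugraph.betweenness_centrality(subgraph)
        return scores['betweenness_centrality'].max()
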
use cugraph for graph analyses - + Returns: scores (list) -1 * network score for each of x_range. Where network score is from :func:`~PopPUNK.network.networkSummary` """ - # load CUDA libraries if use_gpu and not gpu_lib: sys.stderr.write('Unable to load GPU libraries; exiting\n') sys.exit(1) - + scores = [] edge_list = [] prev_idx = 0 @@ -313,7 +323,7 @@ def newNetwork(s, sample_names, distMat, start_point, mean1, gradient, Number of CPUs to use for calculating assignment use_gpu (bool) Whether to use cugraph for graph analysis - + Returns: score (float) -1 * network score. Where network score is from :func:`~PopPUNK.network.networkSummary` @@ -364,7 +374,7 @@ def newNetwork2D(y_idx, sample_names, distMat, x_range, y_range, score_idx=0, us [default = 0] use_gpu (bool) Whether to use cugraph for graph analysis - + Returns: scores (list) -1 * network score for each of x_range. diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index fc4c4310..eb880604 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -275,7 +275,7 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): # Pick type isolate if not supplied if qc_dict['type_isolate'] is None: qc_dict['type_isolate'] = pickTypeIsolate(ref_db, seq_names_passing) - sys.stderr.write('Selected type isolate is ' + qc_dict['type_isolate'] + '\n') + sys.stderr.write('Selected type isolate for distance QC is ' + qc_dict['type_isolate'] + '\n') # First check with numpy, which is quicker than iterating over everything long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist() diff --git a/environment.yml b/environment.yml index 0c525fe0..76b01588 100644 --- a/environment.yml +++ b/environment.yml @@ -17,7 +17,7 @@ dependencies: - hdbscan - rapidnj - h5py - - pp-sketchlib >=1.6.2 + - pp-sketchlib >=1.7.0 - graph-tool >=2.35 - requests - flask From e1879a82c3dcdcb387687d94e6f9ad76d4e95180 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 23 Mar 2021 16:56:21 +0000 Subject: [PATCH 327/327] Fix web test --- PopPUNK/assign.py | 5 +- PopPUNK/web.py | 8 +- test/clean_test.py | 4 +- test/test-web.py | 220 +++++++++++++++++++++++---------------------- test/web_args.txt | 7 +- 5 files changed, 132 insertions(+), 112 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 626d2cb9..dcd1f276 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -160,7 +160,10 @@ def assign_query(dbFuncs, threads = threads, use_gpu = gpu_dist) # QC distance matrix - seq_names_passing, distMat = qcDistMat(qrDistMat, rNames, qNames, ref_db, output, qc_dict) + if qc_dict['run_qc']: + seq_names_passing = qcDistMat(qrDistMat, rNames, qNames, ref_db, output, qc_dict)[0] + else: + seq_names_passing = rNames + qNames # Load the network based on supplied options genomeNetwork, old_cluster_file = \ diff --git a/PopPUNK/web.py b/PopPUNK/web.py index 5303a724..6b013f61 100644 --- a/PopPUNK/web.py +++ b/PopPUNK/web.py @@ -114,16 +114,18 @@ def sketchAssign(): args.visualise.strand_preserved, outdir + "/include.txt", species_db, - species_db, + species_db + "/" + os.path.basename(species_db) + "_clusters.csv", args.visualise.previous_query_clustering, - outdir, + outdir + "/" + os.path.basename(outdir) + "_graph.gt", args.visualise.gpu_graph, args.visualise.info_csv, args.visualise.rapidnj, args.visualise.tree, args.visualise.mst_distances, args.visualise.overwrite, args.visualise.core_only, - args.visualise.accessory_only) + args.visualise.accessory_only, + args.visualise.display_cluster, 
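
Patch 326's refactor of refineFit() encodes a general rule for mixed CPU/GPU code: a CUDA context does not survive being forked into multiprocessing workers, so the GPU path must stay in a single process and only the CPU path fans out over a pool. The skeleton of that split, with score_one_column standing in for the real newNetwork2D():

    from functools import partial
    from itertools import chain
    from multiprocessing import Pool

    def grid_scores(score_one_column, grid_resolution, use_gpu, num_processes):
        """Score every column of the 2D boundary grid."""
        if use_gpu:
            # serial map: keep all cuGraph calls in this process
            scores = map(partial(score_one_column, use_gpu=True),
                         range(grid_resolution))
        else:
            with Pool(processes=num_processes) as pool:
                scores = pool.map(partial(score_one_column, use_gpu=False),
                                  range(grid_resolution))
        return list(chain.from_iterable(scores))

The same patch also makes the minimum search NaN-safe, setting any NaN scores to 1 before np.argmin is taken.
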
+ web=True) networkJson = graphml_to_json(outdir) if len(to_include) >= 3: with open(os.path.join(outdir, os.path.basename(outdir) + "_core_NJ.nwk"), "r") as p: diff --git a/test/clean_test.py b/test/clean_test.py index 29852e14..b1923144 100755 --- a/test/clean_test.py +++ b/test/clean_test.py @@ -43,7 +43,9 @@ def deleteDir(dirname): "example_api", "batch1", "batch2", - "batch12" + "batch3", + "batch12", + "batch123" ] for outDir in outputDirs: deleteDir(outDir) diff --git a/test/test-web.py b/test/test-web.py index 56f47bf5..a69505c1 100644 --- a/test/test-web.py +++ b/test/test-web.py @@ -10,114 +10,122 @@ from PopPUNK.utils import setupDBFuncs from PopPUNK.visualise import generate_visualisations -# Copy and move args and sketch files into example dirs -copyfile("web_args.txt", "example_db/args.txt") -copyfile("example_viz/example_viz_core_NJ.nwk", "example_viz/example_viz.nwk") +def main(): + # Copy and move args and sketch files into example dirs + copyfile("web_args.txt", "example_db/args.txt") + copyfile("example_viz/example_viz_core_NJ.nwk", "example_viz/example_viz.nwk") -# Test the output of the PopPUNk-web upload route for incorrect data types -sys.stderr.write('\nTesting assign for PopPUNK-web\n') -with open("json_sketch.txt", "r") as s: - sketch = s.read() -species = "Listeria monocytogenes" -species_db = "example_db" -outdir = "example_api" -if not os.path.exists(outdir): - os.mkdir(outdir) -args = default_options(species_db) -qc_dict = {'run_qc': False } -dbFuncs = setupDBFuncs(args.assign, args.assign.min_kmer_count, qc_dict) -ClusterResult = assign_query(dbFuncs, - args.assign.ref_db, - args.assign.q_files, + # Test the output of the PopPUNk-web upload route for incorrect data types + sys.stderr.write('\nTesting assign for PopPUNK-web\n') + with open("json_sketch.txt", "r") as s: + sketch = s.read() + species = "Listeria monocytogenes" + species_db = "example_db" + outdir = "example_api" + if not os.path.exists(outdir): + os.mkdir(outdir) + args = default_options(species_db) + qc_dict = {'run_qc': False } + dbFuncs = setupDBFuncs(args.assign, args.assign.min_kmer_count, qc_dict) + ClusterResult = assign_query(dbFuncs, + args.assign.ref_db, + args.assign.q_files, + outdir, + qc_dict, + args.assign.update_db, + args.assign.write_references, + args.assign.distances, + args.assign.threads, + args.assign.overwrite, + args.assign.plot_fit, + args.assign.graph_weights, + args.assign.max_a_dist, + args.assign.max_pi_dist, + args.assign.type_isolate, + args.assign.model_dir, + args.assign.strand_preserved, + args.assign.previous_clustering, + args.assign.external_clustering, + args.assign.core_only, + args.assign.accessory_only, + args.assign.gpu_sketch, + args.assign.gpu_dist, + args.assign.gpu_graph, + args.assign.deviceid, + args.assign.web, + sketch, + args.assign.save_partial_query_graph) + query, query_prevalence, clusters, prevalences, alias_dict, to_include = \ + summarise_clusters(outdir, species, species_db) + colours = get_colours(query, clusters) + url = api(query, "example_viz") + sys.stderr.write('PopPUNK-web assign test successful\n') + + # Test generate_visualisations() for PopPUNK-web + sys.stderr.write('\nTesting visualisations for PopPUNK-web\n') + if len(to_include) < 3: + args.visualise.microreact = False + generate_visualisations(outdir, + species_db, + None, + args.visualise.threads, outdir, - args.assign.update_db, - args.assign.write_references, - args.assign.distances, - args.assign.threads, - args.assign.overwrite, - args.assign.plot_fit, - 
args.assign.graph_weights, - args.assign.max_a_dist, - args.assign.max_pi_dist, - args.assign.reference_isolate, - args.assign.model_dir, - args.assign.strand_preserved, - args.assign.previous_clustering, - args.assign.external_clustering, - args.assign.core_only, - args.assign.accessory_only, - args.assign.gpu_sketch, - args.assign.gpu_dist, - args.assign.gpu_graph, - args.assign.deviceid, - args.assign.web, - sketch, - args.assign.save_partial_query_graph) -query, query_prevalence, clusters, prevalences, alias_dict, to_include = \ - summarise_clusters(outdir, species, species_db) -colours = get_colours(query, clusters) -url = api(query, "example_viz") -sys.stderr.write('PopPUNK-web assign test successful\n') + args.visualise.gpu_dist, + args.visualise.deviceid, + args.visualise.external_clustering, + args.visualise.microreact, + args.visualise.phandango, + args.visualise.grapetree, + args.visualise.cytoscape, + args.visualise.perplexity, + args.visualise.strand_preserved, + outdir + "/include.txt", + species_db, + species_db + "/" + os.path.basename(species_db) + "_clusters.csv", + args.visualise.previous_query_clustering, + outdir + "/" + os.path.basename(outdir) + "_graph.gt", + args.visualise.gpu_graph, + args.visualise.info_csv, + args.visualise.rapidnj, + args.visualise.tree, + args.visualise.mst_distances, + args.visualise.overwrite, + args.visualise.core_only, + args.visualise.accessory_only, + args.visualise.display_cluster, + web=True) + networkJson = graphml_to_json(outdir) + if len(to_include) >= 3: + with open(os.path.join(outdir, os.path.basename(outdir) + "_core_NJ.nwk"), "r") as p: + phylogeny = p.read() + else: + phylogeny = "A tree cannot be built with fewer than 3 samples." -# Test generate_visualisations() for PopPUNK-web -sys.stderr.write('\nTesting visualisations for PopPUNK-web\n') -if len(to_include) < 3: - args.visualise.microreact = False -generate_visualisations(outdir, - species_db, - None, - args.visualise.threads, - outdir, - args.visualise.gpu_dist, - args.visualise.deviceid, - args.visualise.external_clustering, - args.visualise.microreact, - args.visualise.phandango, - args.visualise.grapetree, - args.visualise.cytoscape, - args.visualise.perplexity, - args.visualise.strand_preserved, - outdir + "/include.txt", - species_db, - species_db, - args.visualise.previous_query_clustering, - outdir, - args.visualise.info_csv, - args.visualise.rapidnj, - args.visualise.tree, - args.visualise.mst_distances, - args.visualise.overwrite, - args.visualise.core_only, - args.visualise.accessory_only) -networkJson = graphml_to_json(outdir) -if len(to_include) >= 3: - with open(os.path.join(outdir, os.path.basename(outdir) + "_core_NJ.nwk"), "r") as p: - phylogeny = p.read() -else: - phylogeny = "A tree cannot be built with fewer than 3 samples." 
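
Restructuring test-web.py so that everything runs inside main() behind an import guard is not cosmetic: the assignment step can spawn multiprocessing workers, and under start methods that re-import the test module, module-level test code would run again in every worker. The minimal shape, with the real steps elided:

    def main():
        # assign, visualise and output type checks run here
        ...

    if __name__ == "__main__":
        main()
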
+ # ensure web api outputs are of the correct type + if not isinstance(species, str): + raise TypeError('"Species" datatype is incorrect, should be string.\n') + if not (isinstance(query_prevalence, float) or isinstance(query_prevalence, int)): + raise TypeError('"query_prevalence" datatype is incorrect, should be float/integer.\n') + if not isinstance(query, str): + raise TypeError('"query" datatype is incorrect, should be string.\n') + if not isinstance(clusters, list) and not isinstance(clusters[0], str): + raise TypeError('"clusters" datatype is incorrect, should be list of strings.\n') + if not isinstance(prevalences, list) and not (isinstance(prevalences[0], float) or isinstance(prevalences[0], int)): + raise TypeError('"prevalences" datatype is incorrect, should be list of floats/integers.\n') + if not isinstance(colours, list) and not isinstance(colours[0], str): + raise TypeError('"colours" datatype is incorrect, should be list of strings.\n') + if not isinstance(url, str): + raise TypeError('"url" datatype is incorrect, should be string.\n') + if not isinstance(alias_dict, dict): + raise TypeError('"alias_dict" datatype is incorrect, should be dictionary.\n') + if not isinstance(outdir, str): + raise TypeError('"outdir" datatype is incorrect, should be string.\n') + if not isinstance(networkJson, dict): + raise TypeError('"networkJson" datatype is incorrect, should be dict.\n') + if not isinstance(phylogeny, str): + raise TypeError('"phylogeny" datatype is incorrect, should be str.\n') -# ensure web api outputs are of the correct type -if not isinstance(species, str): - raise TypeError('"Species" datatype is incorrect, should be string.\n') -if not (isinstance(query_prevalence, float) or isinstance(query_prevalence, int)): - raise TypeError('"query_prevalence" datatype is incorrect, should be float/integer.\n') -if not isinstance(query, str): - raise TypeError('"query" datatype is incorrect, should be string.\n') -if not isinstance(clusters, list) and not isinstance(clusters[0], str): - raise TypeError('"clusters" datatype is incorrect, should be list of strings.\n') -if not isinstance(prevalences, list) and not (isinstance(prevalences[0], float) or isinstance(prevalences[0], int)): - raise TypeError('"prevalences" datatype is incorrect, should be list of floats/integers.\n') -if not isinstance(colours, list) and not isinstance(colours[0], str): - raise TypeError('"colours" datatype is incorrect, should be list of strings.\n') -if not isinstance(url, str): - raise TypeError('"url" datatype is incorrect, should be string.\n') -if not isinstance(alias_dict, dict): - raise TypeError('"alias_dict" datatype is incorrect, should be dictionary.\n') -if not isinstance(outdir, str): - raise TypeError('"outdir" datatype is incorrect, should be string.\n') -if not isinstance(networkJson, dict): - raise TypeError('"networkJson" datatype is incorrect, should be dict.\n') -if not isinstance(phylogeny, str): - raise TypeError('"phylogeny" datatype is incorrect, should be str.\n') + sys.stderr.write('\nAPI tests complete\n') -sys.stderr.write('\nAPI tests complete\n') +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test/web_args.txt b/test/web_args.txt index 2eb7f0fb..dc397c11 100644 --- a/test/web_args.txt +++ b/test/web_args.txt @@ -8,6 +8,8 @@ "plot_fit":0, "graph_weights":true, "max_a_dist":0.5, + "max_pi_dist":0.5, + "type_isolate":null, "strand_preserved":false, "external_clustering":null, "core_only":false, @@ -19,6 +21,7 @@ "gpu_sketch":false, "deviceid":0, 
"gpu_dist":false, + "gpu_graph":false, "min_kmer_count":0, "min_k":14, "max_k":29, @@ -36,6 +39,7 @@ "visualise":{ "threads":1, "gpu_dist":false, + "gpu_graph":false, "deviceid":0, "external_clustering":null, "microreact":true, @@ -51,6 +55,7 @@ "mst_distances":"core", "overwrite":true, "core_only":false, - "accessory_only":false + "accessory_only":false, + "display_cluster":null } }