From 4b123bbbd1d9d168fe195b4bac4eacf392347fc0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 23 Apr 2020 15:39:26 +0100 Subject: [PATCH 01/46] Identify lineages from reference database --- PopPUNK/__main__.py | 37 +++++- PopPUNK/lineage_clustering.py | 240 ++++++++++++++++++++++++++++++++++ 2 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 PopPUNK/lineage_clustering.py diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index c9895f17..3bbcdfc9 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -17,6 +17,8 @@ from .sketchlib import no_sketchlib, checkSketchlibLibrary +from .lineage_clustering import cluster_into_lineages + from .network import fetchNetwork from .network import constructNetwork from .network import extractReferences @@ -87,6 +89,10 @@ def get_options(): help='Create model at this core distance threshold', default=None, type=float) + mode.add_argument('--lineage-clustering', + help='Identify lineages within a strain', + default=False, + action='store_true') mode.add_argument('--use-model', help='Apply a fitted model to a reference database to restore database files', default=False, @@ -167,6 +173,10 @@ def get_options(): queryingGroup.add_argument('--accessory-only', help='Use an accessory-distance only model for assigning queries ' '[default = False]', default=False, action='store_true') + # lineage clustering within strains + lineagesGroup = parser.add_argument_group('Lineage analysis options') + lineagesGroup.add_argument('--R',help='Maximum rank used in lineage clustering', type = int, default=1) + # plot output faGroup = parser.add_argument_group('Further analysis options') faGroup.add_argument('--subset', help='File with list of sequences to include in visualisation (with --generate-viz only)', default=None) @@ -276,7 +286,7 @@ def main(): sys.exit(1) else: sys.stderr.write('\t sketchlib: ' + checkSketchlibLibrary() + ')\n') - + #******************************# #* *# #* Create database *# @@ -501,6 +511,31 @@ def main(): nx.write_gpickle(genomeNetwork, args.output + "/" + os.path.basename(args.output) + '_graph.gpickle') + #******************************# + #* *# + #* within-strain analysis *# + #* *# + #******************************# + sys.stderr.write("Checking options here\n\n") + if args.lineage_clustering: + sys.stderr.write("Mode: Identifying lineages within a clade\n\n") + + distances = None + ref_db = None + if args.distances is not None and args.ref_db is not None: + distances = args.distances + ref_db = args.ref_db + else: + sys.stderr.write("Need to provide an input set of distances with --distances " + "and reference database directory with --ref-db\n\n") + sys.exit(1) + + # load distance matrix + refList, queryList, self, distMat = readPickle(distances) + + # run lineage clustering + cluster_into_lineages(distMat, args.R, args.output, rlist = refList) + #*******************************# #* *# #* query assignment (function *# diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py new file mode 100644 index 00000000..493e6fbe --- /dev/null +++ b/PopPUNK/lineage_clustering.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python +# vim: set fileencoding= : +# Copyright 2018-2020 John Lees and Nick Croucher + +# universal +import os +import sys +# additional +import numpy as np +from scipy.stats import rankdata +from collections import defaultdict + +# import poppunk package +from .utils import iterDistRows + +def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, 
lineage_index, seed_isolate):
    """ Identifies isolates corresponding to a particular
    lineage given a cluster seed.

    Args:
        ranked_information
        nearest_neighbours (dict)

        R (int)
            Maximum rank of neighbours used for clustering.

    Returns:
        ranked_information

    """
    # first make all R neighbours of the seed part of the lineage if unclustered
    for seed_neighbour in neighbours[seed_isolate]:
        if lineage_clustering[seed_neighbour] > lineage_index:
            lineage_clustering[seed_neighbour] = lineage_index
    # iterate through ranks; for each isolate, test if neighbour belongs to this cluster
    # overwrite higher cluster values - default value is higher than number of isolates
    # when querying, allows smaller clusters to be merged into more basal ones as connections
    # are made
    for rank in lineage_clustering_information.keys():
        # iterate through isolates of same rank
        for isolate in lineage_clustering_information[rank]:
            # test if clustered/belonging to a more peripheral cluster
#            print("Rank: " + str(rank) + " isolate: " + isolate)
            if lineage_clustering[isolate] > lineage_index:
                # get clusters of nearest neighbours
                isolate_neighbour_clusters = [lineage_clustering[isolate_neighbour] for isolate_neighbour in neighbours[isolate].keys()]
#                print('Neighbours: ' + str(neighbours[isolate].keys()) + '\nNeighbour clusters: ' + str(isolate_neighbour_clusters))
                # if a nearest neighbour is in this cluster
                if lineage_index in isolate_neighbour_clusters:
                    # add isolate to lineage
                    lineage_clustering[isolate] = lineage_index
                    # add neighbours of same or lower rank to lineage if unclustered
                    for isolate_neighbour in neighbours[isolate].keys():
                        if lineage_clustering[isolate_neighbour] > lineage_index:
                            for neighbour_rank in lineage_clustering_information.keys():
                                if neighbour_rank <= rank:
                                    if isolate_neighbour in lineage_clustering_information[neighbour_rank]:
                                        lineage_clustering[isolate_neighbour] = lineage_index
                                else:
                                    break
    return lineage_clustering

def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value):
    """ Identifies the isolate used to initiate a cluster
    """
    # extract all pairwise distances between isolates that are not yet clustered
    clustered_isolates = frozenset([isolate for isolate in lineage_clustering.keys() if lineage_clustering[isolate] != null_cluster_value])
    unclustered_pair_indices = [n for n,pair in enumerate(row_labels) if not set(pair).issubset(clustered_isolates)]
    unclustered_distances = distances[unclustered_pair_indices]
#    print("Unclustered distances are: " + str(unclustered_distances))
    # start at zero as the minimum if present in the unclustered set
    minimum_unclustered_distance = 0
    # otherwise go through the bother of picking a minimum
    if minimum_unclustered_distance not in unclustered_distances:
        if len(unclustered_distances) == 1:
            minimum_unclustered_distance = unclustered_distances[0]
        elif len(unclustered_distances) > 1:
            minimum_unclustered_distance = np.amin(unclustered_distances)
        else:
            sys.stderr.write('Cannot find any more isolates not assigned to lineages; unclustered pair indices are ' + str(unclustered_pair_indices))
            exit(1)
    # identify which entries in the full distance set have this value
    # unfortunately these distances are not necessarily neighbours
    distance_indices = np.where(distances == minimum_unclustered_distance)[0].tolist()
    # identify an unclustered isolate from the pair that have this value
    closest_unclustered_pair_index = int(set(unclustered_pair_indices).intersection(set(distance_indices)).pop())
    new_seed_isolate = row_labels[closest_unclustered_pair_index][0] if lineage_clustering[row_labels[closest_unclustered_pair_index][0]] == null_cluster_value else row_labels[closest_unclustered_pair_index][1]
    return new_seed_isolate


def get_lineage_clustering_information(seed_isolate, row_labels, distances):
    """ Generates the ranked distances needed for cluster
    definition.
    """
    # data structure
    lineage_info = defaultdict(list)
    # get subset of relevant distances
    comparisons_involving_seed = [n for n,(pair) in enumerate(row_labels) if seed_isolate in pair]
    distances_to_seed = distances[comparisons_involving_seed]
    # get ranks of data
    distance_ranks = rankdata(distances_to_seed)
    # get partners of seed isolate in each comparison
    pairs_involving_seed = [row_labels[x] for x in comparisons_involving_seed]
    seed_partners = [r if q == seed_isolate else q for (r,q) in pairs_involving_seed]
    # create a dict of lists of isolates for a given rank
    # enables later easy iterating through ranked distances
    for rank in np.unique(distance_ranks):
        lineage_info[rank] = [seed_partners[n] for n,r in enumerate(distance_ranks) if r == rank]
    return lineage_info

def generate_nearest_neighbours(X, row_labels, isolate_list, R):
    """ Identifies the nearest neighbours from the core
    genome distances.

    Args
        X (np.array)
            n x 2 array of core and accessory distances for n samples.
        row_labels (list of tuples)
            List of pairs of isolates
        isolate_list (list)
            List of isolates
        R (int)
            Maximum rank of neighbours used for clustering.

    Returns:
        nn
            Dict of dict of neighbours per isolate, and pairwise distances
    """
    # data structures
    nn = {}
    # iterate through isolates
    for isolate in isolate_list:
        # initiate dict
        nn[isolate] = {}
        # get indices of rows involving the considered isolate
        indices = [n for n, (r, q) in enumerate(row_labels) if r == isolate or q == isolate]
        # get the corresponding distances
        #ref_distances = X[np.array(indices):1]
        ref_distances = np.take(X, np.array(indices))
        # get unique distances and pick the R smallest values
        unique_isolate_distances = np.unique(ref_distances)
        R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)]
        # get threshold distance for neighbour definition - the largest of the R smallest unique values
        threshold_distance = R_lowest_distances[0]
        if R > 1:
            threshold_distance = np.max(R_lowest_distances[:R])
        # identify all distances below this threshold and store relative to index
        for i,d in enumerate(ref_distances):
            if d <= threshold_distance:
                # get index of row in full list
                index = indices[i]
                # get name of other isolate in pair
                name = row_labels[index][1] if row_labels[index][0] == isolate else row_labels[index][0]
                nn[isolate][name] = ref_distances[i]
    # return completed dictionary
    return nn


def update_nearest_neighbours():
    """ Updates the information on nearest neighbours, given
    a new set of ref-query and query-query distances.
    """
    return 0


def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_model = None, use_accessory = False):
    """ Clusters isolates into lineages based on their
    relative distances.

    Args
        X (np.array)
            n x 2 array of core and accessory distances for n samples.
            This should not be subsampled.
        R (int)
            Integer specifying the maximum rank of neighbour used
            for clustering.
+ rlist (list) + List of reference sequences. + qlist (list) + List of query sequences. + existing_model (?) + Existing lineage clustering model. + Returns: + ret_vec + An n-vector with the most likely cluster memberships + """ + # process data lineage identification + sorted_distances = np.empty(shape = X.shape[0]) + row_labels = [] + isolate_list = [] + if qlist is None: + isolate_list = rlist + distance_index = 1 if use_accessory else 0 + distances = X[:,distance_index] + row_labels = list(iter(iterDistRows(rlist, rlist, self=True))) + else: + sys.stderr.write("Adding to lineage not yet implemented\n") + # identify nearest neighbours + neighbours = {} + neighbours = generate_nearest_neighbours(distances, + row_labels, + rlist, + R) + # run clustering + null_cluster_value = len(isolate_list) + 1 + lineage_clustering = {i:null_cluster_value for i in isolate_list} + lineage_index = 1 + lineage_seed = {} + while null_cluster_value in lineage_clustering.values(): + # get seed isolate based on minimum pairwise distances + seed_isolate = get_seed_isolate(lineage_clustering, + row_labels, + distances, + null_cluster_value) + # record status of seed isolate + lineage_seed[lineage_index] = seed_isolate + lineage_clustering[seed_isolate] = lineage_index + # get information for lineage clustering + lineage_clustering_information = get_lineage_clustering_information(seed_isolate, + row_labels, + distances) + # cluster the lineages + lineage_clustering = run_lineage_clustering(lineage_clustering, + lineage_clustering_information, + neighbours, + R, + lineage_index, + seed_isolate) + # increment index for next lineage + lineage_index = lineage_index + 1 + + # print output + lineage_output_name = output + "/" + output + "_lineage_clusters.csv" + with open(lineage_output_name, 'w') as lFile: + print('Id,Lineage__autocolor', file = lFile) + for isolate in lineage_clustering.keys(): + print(isolate + ',' + str(lineage_clustering[isolate]), file = lFile) + + return 0 + From 82ed97393e7f8f064828f7929f2365f758804f2c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 09:28:43 +0100 Subject: [PATCH 02/46] Allow addition to lineage clustering --- PopPUNK/__main__.py | 17 ++- PopPUNK/lineage_clustering.py | 233 +++++++++++++++++++++++++--------- 2 files changed, 183 insertions(+), 67 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 3bbcdfc9..cf90b5f4 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -175,7 +175,9 @@ def get_options(): # lineage clustering within strains lineagesGroup = parser.add_argument_group('Lineage analysis options') - lineagesGroup.add_argument('--R',help='Maximum rank used in lineage clustering', type = int, default=1) + lineagesGroup.add_argument('--R',help='Maximum rank used in lineage clustering [default = 1]', type = int, default = 1) + lineagesGroup.add_argument('--use-accessory',help='Use accessory distances for lineage definitions [default = use core distances]', action = 'store_true', default = False) + lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions', default = None) # plot output faGroup = parser.add_argument_group('Further analysis options') @@ -521,20 +523,21 @@ def main(): sys.stderr.write("Mode: Identifying lineages within a clade\n\n") distances = None - ref_db = None - if args.distances is not None and args.ref_db is not None: + if args.distances is not None: distances = args.distances - ref_db = args.ref_db else: - sys.stderr.write("Need to provide an input 
set of distances with --distances " - "and reference database directory with --ref-db\n\n") + sys.stderr.write("Need to provide an input set of distances with --distances\n\n") sys.exit(1) # load distance matrix refList, queryList, self, distMat = readPickle(distances) + print("Queries are: " + str(queryList)) # run lineage clustering - cluster_into_lineages(distMat, args.R, args.output, rlist = refList) + if self: + cluster_into_lineages(distMat, args.R, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + else: + cluster_into_lineages(distMat, args.R, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) #*******************************# #* *# diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 493e6fbe..5be9bc14 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -9,11 +9,12 @@ import numpy as np from scipy.stats import rankdata from collections import defaultdict +import pickle # import poppunk package from .utils import iterDistRows -def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, lineage_index, seed_isolate): +def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): """ Identifies isolates corresponding to a particular lineage given a cluster seed. @@ -58,34 +59,56 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n lineage_clustering[isolate_neighbour] = lineage_index else: break + # if this represents a change from the previous iteration of lineage definitions + # then the change needs to propagate through higher-ranked members + if previous_lineage_clustering[isolate] == lineage_index and lineage_clustering[isolate] != lineage_index: + for higher_rank in lineage_clustering_information.keys(): + if higher_rank > rank: + for higher_ranked_isolate in lineage_clustering_information[rank]: + if lineage_clustering[isolate] == lineage_index: + lineage_clustering[isolate] = null_cluster_value + + return lineage_clustering -def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value): +def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value, lineage_index, lineage_seed): """ Identifies the isolate used to initiate a cluster """ - # extract all pairwise distances between isolates that are not yet clustered - clustered_isolates = frozenset([isolate for isolate in lineage_clustering.keys() if lineage_clustering[isolate] != null_cluster_value]) - unclustered_pair_indices = [n for n,pair in enumerate(row_labels) if not set(pair).issubset(clustered_isolates)] - unclustered_distances = distances[unclustered_pair_indices] -# print("Unclustered distances are: " + str(unclustered_distances)) - # start at zero as the minimum if present in the unclustered set - minimum_unclustered_distance = 0 - # otherwise go through the bother of picking a minimum - if minimum_unclustered_distance not in unclustered_distances: - if len(unclustered_distances) == 1: - minimum_unclustered_distance = unclustered_distances[0] - elif len(unclustered_distances) > 1: - minimum_unclustered_distance = np.amin(unclustered_distances) - else: - sys.stderr.write('Cannot find any more isolates not assigned to lineages; unclustered pair indices are ' + str(unclustered_pair_indices)) - exit(1) - # identify which 
entries in the full distance set have this value - # unfortunately these distances are not necessarily neighbours - distance_indices = np.where(distances == minimum_unclustered_distance)[0].tolist() - # identify an unclustered isolate from the pair that have this value - closest_unclustered_pair_index = int(set(unclustered_pair_indices).intersection(set(distance_indices)).pop()) - new_seed_isolate = row_labels[closest_unclustered_pair_index][0] if lineage_clustering[row_labels[closest_unclustered_pair_index][0]] == null_cluster_value else row_labels[closest_unclustered_pair_index][1] - return new_seed_isolate + # variable to return + seed_isolate = None + # first test if there is an existing seed + if lineage_index in lineage_seed.keys(): + original_seed_isolate = lineage_seed[lineage_index] + # if seed now belongs to a different lineage, then lineage has been overwritten + # seed may be unassigned if neighbours have changed - but this does not overwrite the + # lineage in the same way + if lineage_clustering[original_seed_isolate] == null_cluster_value or lineage_clustering[original_seed_isolate] == lineage_index: + seed_isolate = original_seed_isolate + # else identify a new seed from the closest pair + else: + # extract all pairwise distances between isolates that are not yet clustered + clustered_isolates = frozenset([isolate for isolate in lineage_clustering.keys() if lineage_clustering[isolate] != null_cluster_value]) + unclustered_pair_indices = [n for n,pair in enumerate(row_labels) if not set(pair).issubset(clustered_isolates)] + unclustered_distances = distances[unclustered_pair_indices] + # print("Unclustered distances are: " + str(unclustered_distances)) + # start at zero as the minimum if present in the unclustered set + minimum_unclustered_distance = 0 + # otherwise go through the bother of picking a minimum + if minimum_unclustered_distance not in unclustered_distances: + if len(unclustered_distances) == 1: + minimum_unclustered_distance = unclustered_distances[0] + elif len(unclustered_distances) > 1: + minimum_unclustered_distance = np.amin(unclustered_distances) + else: + sys.stderr.write('Cannot find any more isolates not assigned to lineages; unclustered pair indices are ' + str(unclustered_pair_indices)) + exit(1) + # identify which entries in the full distance set have this value + # unfortunately these distances are not necessarily neighbours + distance_indices = np.where(distances == minimum_unclustered_distance)[0].tolist() + # identify an unclustered isolate from the pair that have this value + closest_unclustered_pair_index = int(set(unclustered_pair_indices).intersection(set(distance_indices)).pop()) + seed_isolate = row_labels[closest_unclustered_pair_index][0] if lineage_clustering[row_labels[closest_unclustered_pair_index][0]] == null_cluster_value else row_labels[closest_unclustered_pair_index][1] + return seed_isolate def get_lineage_clustering_information(seed_isolate, row_labels, distances): @@ -135,8 +158,8 @@ def generate_nearest_neighbours(X, row_labels, isolate_list, R): # get indices of rows involving the considered isolate indices = [n for n, (r, q) in enumerate(row_labels) if r == isolate or q == isolate] # get the corresponding distances - #ref_distances = X[np.array(indices):1] - ref_distances = np.take(X, np.array(indices)) + #ref_distances = np.take(X, np.array(indices, dtype = 'int64')) + ref_distances = np.take(X, indices) # get unique distances and pick the R smallest values unique_isolate_distances = np.unique(ref_distances) 
R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)] @@ -156,18 +179,70 @@ def generate_nearest_neighbours(X, row_labels, isolate_list, R): return nn -def update_nearest_neighbours(): +def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clustering, null_cluster_value): """ Updates the information on nearest neighbours, given a new set of ref-query and query-query distances. + --- really need isolate list? """ - return 0 + # iterate through isolates and test whether any comparisons with + # newly-added queries replace or supplement existing neighbours + + # pre-process to extract ref-query distances first + query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] + query_row_names = [row_labels[i] for i in query_match_indices] + query_distances = np.take(distances, query_match_indices) + # iterate through references to update existing entries + for isolate in nn.keys(): + # get max distance to existing neighbours + max_distance_to_ref = max(nn[isolate].values()) + # get min distance to queries + ref_query_match_indices = [n for n, (r, q) in enumerate(query_row_names) if r == isolate or q == isolate] + ref_query_distances = np.take(query_distances, ref_query_match_indices) + min_distance_to_query = np.amin(ref_query_distances) + # if closer queries, find distances and replace + if min_distance_to_query < max_distance_to_ref: + # unset clustering of isolate and neighbours + lineage_clustering[isolate] = null_cluster_value + for neighbour in nn[isolate].keys(): + lineage_clustering[neighbour] = null_cluster_value + # update neighbour information + nn_old = nn.pop(isolate) + nn[isolate] = {} + # old neighbours from existing dict + old_distances = list(set(nn_old.values())) + # new neighbours from new distances in X + new_distances = np.unique(ref_query_distances) + # combine old and new distances + combined_ref_distances = (nn_old.values(),ref_query_distances.tolist()) + # take top ranked distances from both + ranked_unique_combined_ref_distances = sorted(set(combined_ref_distances))[:R] + for distance in ranked_unique_combined_ref_distances: + # add old neighbours back in + if distance in old_distances: + for neighbour,d in nn_old.items(): + if d == distance: + nn[isolate][neighbour] = distance + # add new neighbours + if distance in new_distances: + for n,d in enumerate(ref_query_distances): + if d == distance: + row_label_index = ref_query_match_indices[n] + neighbour = row_label_index[row_label_index][0] if row_label_index[row_label_index][1] == isolate else row_label_index[row_label_index][1] + nn[isolate][neighbour] = d +# print('Changed sequences for isolate ' + isolate + '\nOld distances: ' + str(old_nn) + '\nNew distances: ' + str(nn[isolate])) + # get nn for query sequences + query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) + # merge dicts + for query in query_nn.keys(): + nn[query] = query_nn[query] + return nn -def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_model = None, use_accessory = False): +def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_scheme = None, use_accessory = False): """ Clusters isolates into lineages based on their relative distances. - Args + Args: X (np.array) n x 2 array of core and accessory distances for n samples. This should not be subsampled. @@ -178,57 +253,95 @@ def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_mod List of reference sequences. 
qlist (list) List of query sequences. - existing_model (?) + existing_scheme (?) Existing lineage clustering model. + Returns: ret_vec An n-vector with the most likely cluster memberships """ - # process data lineage identification - sorted_distances = np.empty(shape = X.shape[0]) + # process distance matrix + distance_index = 1 if use_accessory else 0 + distances = X[:,distance_index] + + # determine whether ref-ref or ref-query analysis row_labels = [] isolate_list = [] if qlist is None: + sys.stderr.write("Identifying lineages using reference sequences\n") isolate_list = rlist - distance_index = 1 if use_accessory else 0 - distances = X[:,distance_index] - row_labels = list(iter(iterDistRows(rlist, rlist, self=True))) + row_labels = list(iter(iterDistRows(rlist, rlist, self = True))) else: - sys.stderr.write("Adding to lineage not yet implemented\n") - # identify nearest neighbours + sys.stderr.write("Identifying lineages from using reference and query sequences\n") + row_labels = list(iter(iterDistRows(rlist, qlist, self = False))) + isolate_list = rlist + qlist + + # determine whether novel analysis or modifying existing analysis neighbours = {} - neighbours = generate_nearest_neighbours(distances, - row_labels, - rlist, - R) - # run clustering + lineage_seed = {} null_cluster_value = len(isolate_list) + 1 lineage_clustering = {i:null_cluster_value for i in isolate_list} + previous_lineage_clustering = lineage_clustering + + if existing_scheme is None: + neighbours = generate_nearest_neighbours(distances, + row_labels, + isolate_list, + R) + else: + with open(existing_scheme, 'rb') as pickle_file: + lineage_clustering, lineage_seed, neighbours = pickle.load(pickle_file) + previous_lineage_clustering = lineage_clustering # use for detecting changes + neighbours = update_nearest_neighbours(distances, + row_labels, + R, + qlist, + neighbours, + lineage_clustering, + null_cluster_value) + + # run clustering lineage_index = 1 - lineage_seed = {} while null_cluster_value in lineage_clustering.values(): + # get seed isolate based on minimum pairwise distances - seed_isolate = get_seed_isolate(lineage_clustering, + seed_isolate = get_seed_isolate(lineage_clustering, # need to add in lineage seed hash row_labels, distances, - null_cluster_value) - # record status of seed isolate + null_cluster_value, + lineage_index, + lineage_seed) lineage_seed[lineage_index] = seed_isolate - lineage_clustering[seed_isolate] = lineage_index - # get information for lineage clustering - lineage_clustering_information = get_lineage_clustering_information(seed_isolate, - row_labels, - distances) - # cluster the lineages - lineage_clustering = run_lineage_clustering(lineage_clustering, - lineage_clustering_information, - neighbours, - R, - lineage_index, - seed_isolate) + + # seed isolate is None if a previously-existing cluster has been overwritten + # in which case pass over the lineage to keep nomenclature consistent + if seed_isolate is not None: + + # record status of seed isolate + lineage_clustering[seed_isolate] = lineage_index + + # get information for lineage clustering + lineage_clustering_information = get_lineage_clustering_information(seed_isolate, + row_labels, + distances) + + # cluster the lineages + lineage_clustering = run_lineage_clustering(lineage_clustering, + lineage_clustering_information, + neighbours, + R, + lineage_index, + seed_isolate, + previous_lineage_clustering, + null_cluster_value) + # increment index for next lineage lineage_index = lineage_index + 1 + # store output + with 
open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: + pickle.dump([lineage_clustering, lineage_seed, neighbours], pickle_file) + # print output lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: From bbf3538c169550047b6514f22f4aa3c4fe958cf5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 10:16:03 +0100 Subject: [PATCH 03/46] Fix integration of query isolates --- PopPUNK/__main__.py | 1 - PopPUNK/lineage_clustering.py | 18 +++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index cf90b5f4..058b3c9c 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -531,7 +531,6 @@ def main(): # load distance matrix refList, queryList, self, distMat = readPickle(distances) - print("Queries are: " + str(queryList)) # run lineage clustering if self: diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 5be9bc14..72554e52 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -61,12 +61,13 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n break # if this represents a change from the previous iteration of lineage definitions # then the change needs to propagate through higher-ranked members - if previous_lineage_clustering[isolate] == lineage_index and lineage_clustering[isolate] != lineage_index: - for higher_rank in lineage_clustering_information.keys(): - if higher_rank > rank: - for higher_ranked_isolate in lineage_clustering_information[rank]: - if lineage_clustering[isolate] == lineage_index: - lineage_clustering[isolate] = null_cluster_value + if isolate in previous_lineage_clustering: + if previous_lineage_clustering[isolate] == lineage_index and lineage_clustering[isolate] != lineage_index: + for higher_rank in lineage_clustering_information.keys(): + if higher_rank > rank: + for higher_ranked_isolate in lineage_clustering_information[rank]: + if lineage_clustering[isolate] == lineage_index: + lineage_clustering[isolate] = null_cluster_value return lineage_clustering @@ -110,7 +111,6 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val seed_isolate = row_labels[closest_unclustered_pair_index][0] if lineage_clustering[row_labels[closest_unclustered_pair_index][0]] == null_cluster_value else row_labels[closest_unclustered_pair_index][1] return seed_isolate - def get_lineage_clustering_information(seed_isolate, row_labels, distances): """ Generates the ranked distances needed for cluster definition. 
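As an aside on the ranking used throughout this series: get_lineage_clustering_information() buckets the seed's comparison partners by their scipy.stats.rankdata rank, and rankdata assigns tied distances a shared, averaged rank, so one rank key can hold several isolates and ranks need not be consecutive integers. A minimal standalone sketch of that behaviour (isolate names invented, not part of the patch):

    import numpy as np
    from scipy.stats import rankdata
    from collections import defaultdict

    # four invented partners of a seed isolate, with one tied pair of distances
    seed_partners = ['iso_A', 'iso_B', 'iso_C', 'iso_D']
    distances_to_seed = np.array([0.02, 0.01, 0.01, 0.05])

    distance_ranks = rankdata(distances_to_seed)   # array([3. , 1.5, 1.5, 4. ])
    lineage_info = defaultdict(list)
    for rank in np.unique(distance_ranks):
        lineage_info[rank] = [seed_partners[n] for n, r in enumerate(distance_ranks) if r == rank]
    # dict(lineage_info) == {1.5: ['iso_B', 'iso_C'], 3.0: ['iso_A'], 4.0: ['iso_D']}

This is why the keys of lineage_clustering_information are floats rather than sequential integers.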
@@ -235,6 +235,7 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust
     # merge dicts
     for query in query_nn.keys():
         nn[query] = query_nn[query]
+    # return updated dict
     return nn
@@ -292,6 +293,9 @@ def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_sch
         with open(existing_scheme, 'rb') as pickle_file:
             lineage_clustering, lineage_seed, neighbours = pickle.load(pickle_file)
         previous_lineage_clustering = lineage_clustering # use for detecting changes
+        # add new queries to lineage clustering
+        for q in qlist:
+            lineage_clustering[q] = null_cluster_value
         neighbours = update_nearest_neighbours(distances,
                                                row_labels,
                                                R,

From 95f9c84a2f61c605294195a63a7e3e544d69fe0b Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 24 Apr 2020 10:19:13 +0100
Subject: [PATCH 04/46] Add query status to output CSV

---
 PopPUNK/lineage_clustering.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py
index 72554e52..e195565f 100644
--- a/PopPUNK/lineage_clustering.py
+++ b/PopPUNK/lineage_clustering.py
@@ -349,9 +349,16 @@ def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_sch
     # print output
     lineage_output_name = output + "/" + output + "_lineage_clusters.csv"
     with open(lineage_output_name, 'w') as lFile:
-        print('Id,Lineage__autocolor', file = lFile)
-        for isolate in lineage_clustering.keys():
-            print(isolate + ',' + str(lineage_clustering[isolate]), file = lFile)
-
+        if qlist is None:
+            print('Id,Lineage__autocolor', file = lFile)
+            for isolate in lineage_clustering.keys():
+                print(isolate + ',' + str(lineage_clustering[isolate]), file = lFile)
+        else:
+            print('Id,Lineage__autocolor,QueryStatus', file = lFile)
+            for isolate in rlist:
+                print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + 'Reference', file = lFile)
+            for isolate in qlist:
+                print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + 'Query', file = lFile)
+
     return 0

From af909d2eb55c31a3f1d54ff2433b0e409c3d5ab8 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 24 Apr 2020 10:45:07 +0100
Subject: [PATCH 05/46] Documenting functions and updating lineage assignment

---
 PopPUNK/lineage_clustering.py | 133 ++++++++++++++++++++++--------
 1 file changed, 101 insertions(+), 32 deletions(-)

diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py
index e195565f..f4f7c805 100644
--- a/PopPUNK/lineage_clustering.py
+++ b/PopPUNK/lineage_clustering.py
@@ -19,14 +19,27 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n
         lineage given a cluster seed.

     Args:
-        ranked_information
-        nearest_neighbours (dict)
-
+        lineage_clustering (dict)
+            Clustering of existing dataset.
+        lineage_clustering_information (dict)
+            Dict listing isolates by ranked distance from seed.
+        neighbours (nested dict)
+            Pre-calculated neighbour relationships.
         R (int)
-            Maximum rank of neighbours used for clustering.
-
+            Maximum rank of neighbours used for clustering.
+        lineage_index (int)
+            Label of current lineage.
+        seed_isolate (str)
+            Isolate used to initiate next lineage.
+        previous_lineage_clustering (dict)
+            Clustering of existing dataset in previous iteration.
+        null_cluster_value (int)
+            Null cluster value used for unsetting lineage assignments
+            where this may change due to altered neighbour relationships.
+
     Returns:
-        ranked_information
+        lineage_clustering (dict)
+            Assignment of isolates to lineages.
""" # first make all R neighbours of the seed part of the lineage if unclustered @@ -41,11 +54,9 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n # iterate through isolates of same rank for isolate in lineage_clustering_information[rank]: # test if clustered/belonging to a more peripheral cluster -# print("Rank: " + str(rank) + " isolate: " + isolate) if lineage_clustering[isolate] > lineage_index: # get clusters of nearest neighbours isolate_neighbour_clusters = [lineage_clustering[isolate_neighbour] for isolate_neighbour in neighbours[isolate].keys()] -# print('Neighbours: ' + str(neighbours[isolate].keys()) + '\nNeighbour clusters: ' + str(isolate_neighbour_clusters)) # if a nearest neighbour is in this cluster if lineage_index in isolate_neighbour_clusters: # add isolate to lineage @@ -73,7 +84,27 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n return lineage_clustering def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value, lineage_index, lineage_seed): - """ Identifies the isolate used to initiate a cluster + """ Identifies the isolate used to initiate a cluster. + + Args: + lineage_clustering (dict) + Clustering of existing dataset. + row_labels (list of tuples) + Pairs of isolates labelling each distance. + distances (numpy array) + Pairwise distances used for defining relationships. + null_cluster_value (int) + Null cluster value used for unsetting lineage assignments + where this may change due to altered neighbour relationships. + lineage_index (int) + Label of current lineage. + lineage_seed (dict) + Dict of seeds used to initiate pre-existing lineage definitions. + + Returns: + seed_isolate (str) + Isolate to used to initiate next lineage. + """ # variable to return seed_isolate = None @@ -91,7 +122,6 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val clustered_isolates = frozenset([isolate for isolate in lineage_clustering.keys() if lineage_clustering[isolate] != null_cluster_value]) unclustered_pair_indices = [n for n,pair in enumerate(row_labels) if not set(pair).issubset(clustered_isolates)] unclustered_distances = distances[unclustered_pair_indices] - # print("Unclustered distances are: " + str(unclustered_distances)) # start at zero as the minimum if present in the unclustered set minimum_unclustered_distance = 0 # otherwise go through the bother of picking a minimum @@ -114,6 +144,19 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val def get_lineage_clustering_information(seed_isolate, row_labels, distances): """ Generates the ranked distances needed for cluster definition. + + Args: + seed_isolate (str) + Isolate used to initiate lineage. + row_labels (list of tuples) + Pairs of isolates labelling each distance. + distances (numpy array) + Pairwise distances used for defining relationships. + + Returns: + lineage_info (dict) + Dict listing isolates by ranked distance from seed. + """ # data structure lineage_info = defaultdict(list) @@ -131,17 +174,17 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): lineage_info[rank] = [seed_partners[n] for n,r in enumerate(distance_ranks) if r == rank] return lineage_info -def generate_nearest_neighbours(X, row_labels, isolate_list, R): +def generate_nearest_neighbours(distances, row_labels, isolate_list, R): """ Identifies the nearest neighbours from the core genome distances. 
- Args - X (np.array) - n x 2 array of core and accessory distances for n samples. + Args: + distances (numpy array) + Pairwise distances used for defining relationships. row_labels (list of tuples) - List of pairs of isolates + Pairs of isolates labelling each distance. isolate_list (list) - List of isolates + List of isolates for lineage assignment. R (int) Maximum rank of neighbours used for clustering. @@ -159,7 +202,7 @@ def generate_nearest_neighbours(X, row_labels, isolate_list, R): indices = [n for n, (r, q) in enumerate(row_labels) if r == isolate or q == isolate] # get the corresponding distances #ref_distances = np.take(X, np.array(indices, dtype = 'int64')) - ref_distances = np.take(X, indices) + ref_distances = np.take(distances, indices) # get unique distances and pick the R smallest values unique_isolate_distances = np.unique(ref_distances) R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)] @@ -182,7 +225,30 @@ def generate_nearest_neighbours(X, row_labels, isolate_list, R): def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clustering, null_cluster_value): """ Updates the information on nearest neighbours, given a new set of ref-query and query-query distances. - --- really need isolate list? + + Args: + distances (numpy array) + Distances to be used for defining lineages. + row_labels (list of tuples) + Pairs of isolates labelling each distance. + R (int) + Maximum rank of distance used to define nearest neighbours. + qlist (list) + List of queries to be added to existing dataset. + nn (nested dict) + Pre-calculated neighbour relationships. + lineage_clustering (dict) + Clustering of existing dataset. + null_cluster_value (int) + Null cluster value used for unsetting lineage assignments + where this may change due to altered neighbour relationships. + + Returns: + nn (nested dict) + Updated neighbour relationships. + lineage_clustering (dict) + Updated assignment of isolates to lineage. + """ # iterate through isolates and test whether any comparisons with # newly-added queries replace or supplement existing neighbours @@ -229,14 +295,13 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust row_label_index = ref_query_match_indices[n] neighbour = row_label_index[row_label_index][0] if row_label_index[row_label_index][1] == isolate else row_label_index[row_label_index][1] nn[isolate][neighbour] = d -# print('Changed sequences for isolate ' + isolate + '\nOld distances: ' + str(old_nn) + '\nNew distances: ' + str(nn[isolate])) # get nn for query sequences query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) # merge dicts for query in query_nn.keys(): nn[query] = query_nn[query] # return updated dict - return nn + return nn, lineage_clustering def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_scheme = None, use_accessory = False): @@ -249,17 +314,21 @@ def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_sch This should not be subsampled. R (int) Integer specifying the maximum rank of neighbour used - for clustering. + for clustering. Should be changed to int list for hierarchical + clustering. rlist (list) List of reference sequences. qlist (list) List of query sequences. - existing_scheme (?) - Existing lineage clustering model. + existing_scheme (str) + Path to pickle file containing lineage scheme to which isolates + should be added. 
+ use_accessory (bool) + Option to use accessory distances rather than core distances. Returns: - ret_vec - An n-vector with the most likely cluster memberships + completion_status (int) + 0 if successful """ # process distance matrix distance_index = 1 if use_accessory else 0 @@ -296,13 +365,13 @@ def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_sch # add new queries to lineage clustering for q in qlist: lineage_clustering[q] = null_cluster_value - neighbours = update_nearest_neighbours(distances, - row_labels, - R, - qlist, - neighbours, - lineage_clustering, - null_cluster_value) + neighbours, lineage_clustering = update_nearest_neighbours(distances, + row_labels, + R, + qlist, + neighbours, + lineage_clustering, + null_cluster_value) # run clustering lineage_index = 1 From 58eb9bbe063da6c1cc1ff0a25d14bbb08e9980da Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 11:25:25 +0100 Subject: [PATCH 06/46] Reconfigure visualisation section to allow for external/lineage visualisation --- PopPUNK/__main__.py | 78 +++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 058b3c9c..9724f58c 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -107,6 +107,8 @@ def get_options(): iGroup.add_argument('--distances', help='Prefix of input pickle of pre-calculated distances') iGroup.add_argument('--external-clustering', help='File with cluster definitions or other labels ' 'generated with any other method.', default=None) + iGroup.add_argument('--viz-lineages', help='CSV with lineage definitions to use for visualisation' + 'rather than strain definitions.', default=None) # output options oGroup = parser.add_argument_group('Output options') @@ -566,6 +568,8 @@ def main(): sys.exit(1) if args.distances is not None and args.ref_db is not None: + + # Initial processing # Load original distances with open(args.distances + ".pkl", 'rb') as pickle_file: rlist, qlist, self = pickle.load(pickle_file) @@ -579,28 +583,8 @@ def main(): except OSError: sys.stderr.write("Cannot create output directory\n") sys.exit(1) - - # identify existing analysis files - model_prefix = args.ref_db - if args.model_dir is not None: - model_prefix = args.model_dir - model = loadClusterFit(model_prefix + "/" + os.path.basename(model_prefix) + '_fit.pkl', - model_prefix + "/" + os.path.basename(model_prefix) + '_fit.npz') - - # Set directories of previous fit - if args.previous_clustering is not None: - prev_clustering = args.previous_clustering - else: - prev_clustering = os.path.dirname(args.distances + ".pkl") - - # Read in network and cluster assignment - genomeNetwork, cluster_file = fetchNetwork(prev_clustering, model, rlist, args.core_only, args.accessory_only) - - # Use external clustering if specified - if args.external_clustering: - cluster_file = args.external_clustering - isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} - + + # Define set/subset to be visualised # extract subset of distances if requested viz_subset = rlist if args.subset is not None: @@ -611,8 +595,9 @@ def main(): # Use the same code as no full_db in assign_query to take a subset dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" - nodes_to_remove = set(genomeNetwork.nodes).difference(viz_subset) - postpruning_combined_seq, newDistMat = prune_distance_matrix(rlist, nodes_to_remove, + #nodes_to_remove = set(genomeNetwork.nodes).difference(viz_subset) + 
isolates_to_remove = set(combined_seq).difference(viz_subset) + postpruning_combined_seq, newDistMat = prune_distance_matrix(rlist, isolates_to_remove, complete_distMat, dists_out) rlist = viz_subset @@ -624,12 +609,40 @@ def main(): except: sys.stderr.write("Isolates in subset not found in existing database\n") assert postpruning_combined_seq == viz_subset + + + + # Either use strain definitions, lineage assignments or external clustering + isolateClustering = {} + # Use external clustering if specified + if args.external_clustering: + cluster_file = args.external_clustering + isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} + elif args.viz_lineages: + cluster_file = args.viz_lineages + isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} + else: + # identify existing analysis files + model_prefix = args.ref_db + if args.model_dir is not None: + model_prefix = args.model_dir + model = loadClusterFit(model_prefix + "/" + os.path.basename(model_prefix) + '_fit.pkl', + model_prefix + "/" + os.path.basename(model_prefix) + '_fit.npz') + + # Set directories of previous fit + if args.previous_clustering is not None: + prev_clustering = args.previous_clustering + else: + prev_clustering = os.path.dirname(args.distances + ".pkl") - # prune the network and dictionary of assignments - genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(viz_subset)) - for clustering_type in isolateClustering: - isolateClustering[clustering_type] = {viz_key: isolateClustering[clustering_type][viz_key] - for viz_key in viz_subset} + # Read in network and cluster assignment + genomeNetwork, cluster_file = fetchNetwork(prev_clustering, model, rlist, args.core_only, args.accessory_only) + + # prune the network and dictionary of assignments + genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(viz_subset)) + for clustering_type in isolateClustering: + isolateClustering[clustering_type] = {viz_key: isolateClustering[clustering_type][viz_key] + for viz_key in viz_subset} # generate selected visualisations if args.microreact: @@ -645,8 +658,11 @@ def main(): outputsForGrapetree(viz_subset, core_distMat, isolateClustering, args.output, args.info_csv, args.rapidnj, overwrite = args.overwrite, microreact = args.microreact) if args.cytoscape: - sys.stderr.write("Writing cytoscape output\n") - outputsForCytoscape(genomeNetwork, isolateClustering, args.output, args.info_csv) + if args.viz_lineages or args.external_clustering: + sys.stderr.write("Can only generate a network output for fitted models\n") + else: + sys.stderr.write("Writing cytoscape output\n") + outputsForCytoscape(genomeNetwork, isolateClustering, args.output, args.info_csv) else: # Cannot read input files From fd490bb87c4f95eb766da1dc81d5ee552f0ca7a6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 20:37:18 +0100 Subject: [PATCH 07/46] Fix parsing errors with queries --- PopPUNK/__main__.py | 140 +++++++++++++++++++++------------- PopPUNK/lineage_clustering.py | 115 ++++++++++++++++++++++------ 2 files changed, 180 insertions(+), 75 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 9724f58c..266ba26e 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -18,6 +18,7 @@ from .sketchlib import no_sketchlib, checkSketchlibLibrary from .lineage_clustering import cluster_into_lineages +from .lineage_clustering import calculateQueryDistances from .network import fetchNetwork from .network import constructNetwork @@ -174,12 +175,15 @@ 
def get_options(): '[default = False]', default=False, action='store_true') queryingGroup.add_argument('--accessory-only', help='Use an accessory-distance only model for assigning queries ' '[default = False]', default=False, action='store_true') + queryingGroup.add_argument('--assign-lineage', help='Assign to lineages rather than strains - can specify ' + 'lineages other than those in the reference database using "--existing-scheme"' + '[default = False]', default=False, action='store_true') # lineage clustering within strains lineagesGroup = parser.add_argument_group('Lineage analysis options') lineagesGroup.add_argument('--R',help='Maximum rank used in lineage clustering [default = 1]', type = int, default = 1) lineagesGroup.add_argument('--use-accessory',help='Use accessory distances for lineage definitions [default = use core distances]', action = 'store_true', default = False) - lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions', default = None) + lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions', type = str, default = None) # plot output faGroup = parser.add_argument_group('Further analysis options') @@ -536,9 +540,9 @@ def main(): # run lineage clustering if self: - cluster_into_lineages(distMat, args.R, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, args.R, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) else: - cluster_into_lineages(distMat, args.R, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, args.R, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) #*******************************# #* *# @@ -552,7 +556,7 @@ def main(): args.threads, args.use_mash, args.mash, args.overwrite, args.plot_fit, args.no_stream, args.max_a_dist, args.model_dir, args.previous_clustering, args.external_clustering, args.core_only, args.accessory_only, args.phandango, args.grapetree, args.info_csv, - args.rapidnj, args.perplexity) + args.rapidnj, args.perplexity, args.assign_lineage, args.existing_scheme, args.R, args.use_accessory) #******************************# #* *# @@ -597,7 +601,10 @@ def main(): dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" #nodes_to_remove = set(genomeNetwork.nodes).difference(viz_subset) isolates_to_remove = set(combined_seq).difference(viz_subset) - postpruning_combined_seq, newDistMat = prune_distance_matrix(rlist, isolates_to_remove, + postpruning_combined_seq = sorted(viz_subset) + newDistMat = complete_distMat + if len(isolates_to_remove) > 0: + postpruning_combined_seq, newDistMat = prune_distance_matrix(rlist, isolates_to_remove, complete_distMat, dists_out) rlist = viz_subset @@ -610,8 +617,6 @@ def main(): sys.stderr.write("Isolates in subset not found in existing database\n") assert postpruning_combined_seq == viz_subset - - # Either use strain definitions, lineage assignments or external clustering isolateClustering = {} # Use external clustering if specified @@ -681,7 +686,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances kmers, sketch_sizes, ignore_length, estimated_length, threads, use_mash, mash, overwrite, 
plot_fit, no_stream, max_a_dist, model_dir, previous_clustering, external_clustering, core_only, accessory_only, phandango, grapetree, - info_csv, rapidnj, perplexity): + info_csv, rapidnj, perplexity, assign_lineage, existing_scheme, R, use_accessory): """Code for assign query mode. Written as a separate function so it can be called by pathogen.watch API """ @@ -740,58 +745,80 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances threads = threads) qcPass = qcDistMat(distMat, refList, queryList, max_a_dist) - # Assign these distances as within or between - model_prefix = ref_db - if model_dir is not None: - model_prefix = model_dir - model = loadClusterFit(model_prefix + "/" + os.path.basename(model_prefix) + '_fit.pkl', - model_prefix + "/" + os.path.basename(model_prefix) + '_fit.npz') - queryAssignments = model.assign(distMat) - - # set model prefix - model_prefix = ref_db - if model_dir is not None: - model_prefix = model_dir - - # Set directories of previous fit - if previous_clustering is not None: - prev_clustering = previous_clustering - else: - prev_clustering = model_prefix - - # Load the network based on supplied options - genomeNetwork, old_cluster_file = fetchNetwork(prev_clustering, model, refList, - core_only, accessory_only) - - # Assign clustering by adding to network - ordered_queryList, query_distMat = addQueryToNetwork(dbFuncs, refList, q_files, - genomeNetwork, kmers, estimated_length, queryAssignments, model, output, update_db, - use_mash, threads) - - # if running simple query - print_full_clustering = False - if update_db: - print_full_clustering = True - isolateClustering = {'combined': printClusters(genomeNetwork, output + "/" + os.path.basename(output), - old_cluster_file, external_clustering, print_full_clustering)} - - # update_db like no full_db - if update_db: + # Calculate query-query distances +################################################################################################################### + ordered_queryList = [] + + # Assign to strains or lineages, as requested + if assign_lineage: + + # Assign lineages by calculating query-query information + ordered_queryList, query_distMat = calculateQueryDistances(dbFuncs, refList, q_files, + kmers, estimated_length, output, use_mash, threads) +# +# # ASSIGN LINEAGE +# expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' +# if existing_scheme: +# expected_lineage_name = existing_scheme +# combined_distances = np.concatenate((distMat,query_distMat),axis = 0) +# isolateClustering = cluster_into_lineages(combined_distances, R, output, rlist = refList, qlist = ordered_queryList, use_accessory = use_accessory, existing_scheme = expected_lineage_name) +################################################################################################################### + if not assign_lineage: + # Assign these distances as within or between strain + model_prefix = ref_db + if model_dir is not None: + model_prefix = model_dir + model = loadClusterFit(model_prefix + "/" + os.path.basename(model_prefix) + '_fit.pkl', + model_prefix + "/" + os.path.basename(model_prefix) + '_fit.npz') + queryAssignments = model.assign(distMat) + + # set model prefix + model_prefix = ref_db + if model_dir is not None: + model_prefix = model_dir + + # Set directories of previous fit + if previous_clustering is not None: + prev_clustering = previous_clustering + else: + prev_clustering = model_prefix + + # Load the network based on supplied options + genomeNetwork, old_cluster_file = 
fetchNetwork(prev_clustering, model, refList, + core_only, accessory_only) + + # Assign clustering by adding to network + ordered_queryList, query_distMat = addQueryToNetwork(dbFuncs, refList, q_files, + genomeNetwork, kmers, estimated_length, queryAssignments, model, output, update_db, + use_mash, threads) + + # if running simple query + print_full_clustering = False + if update_db: + print_full_clustering = True + isolateClustering = {'combined': printClusters(genomeNetwork, output + "/" + os.path.basename(output), + old_cluster_file, external_clustering, print_full_clustering)} + + # Update DB as requested + if update_db or assign_lineage: + + # Check new sequences pass QC before adding them if not qcPass: sys.stderr.write("Queries contained outlier distances, not updating database\n") else: sys.stderr.write("Updating reference database to " + output + "\n") # Update the network + ref list - if full_db is False: + # only update network if assigning to strains + if full_db is False or assign_lineage is False: mashOrder = refList + ordered_queryList newRepresentativesNames, newRepresentativesFile = extractReferences(genomeNetwork, mashOrder, output, refList) genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(newRepresentativesNames)) newQueries = [x for x in ordered_queryList if x in frozenset(newRepresentativesNames)] # intersection that maintains order + nx.write_gpickle(genomeNetwork, output + "/" + os.path.basename(output) + '_graph.gpickle') else: newQueries = ordered_queryList - nx.write_gpickle(genomeNetwork, output + "/" + os.path.basename(output) + '_graph.gpickle') - + # Update the sketch database if newQueries != queryList and use_mash: tmpRefFile = writeTmpFile(newQueries) @@ -800,7 +827,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances os.remove(tmpRefFile) # With mash, this is the reduced DB constructed, # with sketchlib, all sketches - joinDBs(ref_db, output, output) + joinDBs(ref_db, output, output) # Update distance matrices with all calculated distances if distances == None: @@ -812,9 +839,15 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances ordered_queryList, distMat, query_distMat) complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) + if assign_lineage: + expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' + if existing_scheme is not None: + expected_lineage_name = existing_scheme + isolateClustering = cluster_into_lineages(complete_distMat, R, output, refList, ordered_queryList, expected_lineage_name, use_accessory) + # Prune distances to references only, if not full db dists_out = output + "/" + os.path.basename(output) + ".dists" - if full_db is False: + if full_db is False and assign_lineage is False: # could also have newRepresentativesNames in this diff (should be the same) - but want # to ensure consistency with the network in case of bad input/bugs nodes_to_remove = set(combined_seq).difference(genomeNetwork.nodes) @@ -845,8 +878,11 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances outputsForGrapetree(combined_seq, core_distMat, isolateClustering, output, info_csv, rapidnj, queryList = ordered_queryList, overwrite = overwrite, microreact = microreact) if cytoscape: - sys.stderr.write("Writing cytoscape output\n") - outputsForCytoscape(genomeNetwork, isolateClustering, output, info_csv, ordered_queryList) + if assign_lineage: + sys.stderr.write("Cannot generate a cytoscape network from a lineage 
assignment") + else: + sys.stderr.write("Writing cytoscape output\n") + outputsForCytoscape(genomeNetwork, isolateClustering, output, info_csv, ordered_queryList) else: sys.stderr.write("Need to provide both a reference database with --ref-db and " diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index f4f7c805..face1946 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -13,6 +13,7 @@ # import poppunk package from .utils import iterDistRows +from .utils import readRfile def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): """ Identifies isolates corresponding to a particular @@ -253,12 +254,16 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust # iterate through isolates and test whether any comparisons with # newly-added queries replace or supplement existing neighbours + # data structures for altered entries + nn_new = {} # pre-process to extract ref-query distances first query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] query_row_names = [row_labels[i] for i in query_match_indices] query_distances = np.take(distances, query_match_indices) # iterate through references to update existing entries - for isolate in nn.keys(): + existing_isolates = nn.keys() + + for isolate in existing_isolates: # get max distance to existing neighbours max_distance_to_ref = max(nn[isolate].values()) # get min distance to queries @@ -272,39 +277,41 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust for neighbour in nn[isolate].keys(): lineage_clustering[neighbour] = null_cluster_value # update neighbour information - nn_old = nn.pop(isolate) - nn[isolate] = {} + nn_old = nn[isolate] + nn_new[isolate] = {} # old neighbours from existing dict - old_distances = list(set(nn_old.values())) + old_distances = list(set(nn[isolate].values())) # new neighbours from new distances in X new_distances = np.unique(ref_query_distances) # combine old and new distances - combined_ref_distances = (nn_old.values(),ref_query_distances.tolist()) + combined_ref_distances = list(nn[isolate].values()) + ref_query_distances.tolist() # take top ranked distances from both ranked_unique_combined_ref_distances = sorted(set(combined_ref_distances))[:R] for distance in ranked_unique_combined_ref_distances: # add old neighbours back in if distance in old_distances: - for neighbour,d in nn_old.items(): + for neighbour,d in nn[isolate].items(): if d == distance: - nn[isolate][neighbour] = distance + nn_new[isolate][neighbour] = distance # add new neighbours if distance in new_distances: for n,d in enumerate(ref_query_distances): if d == distance: row_label_index = ref_query_match_indices[n] - neighbour = row_label_index[row_label_index][0] if row_label_index[row_label_index][1] == isolate else row_label_index[row_label_index][1] - nn[isolate][neighbour] = d + neighbour = row_labels[row_label_index][0] if row_labels[row_label_index][1] == isolate else row_labels[row_label_index][1] + nn_new[isolate][neighbour] = d # get nn for query sequences query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) # merge dicts for query in query_nn.keys(): nn[query] = query_nn[query] + for updated in nn_new.keys(): + nn[updated] = nn_new[updated] # return updated dict return nn, lineage_clustering -def cluster_into_lineages(X, R, output, rlist = None, qlist = 
None, existing_scheme = None, use_accessory = False):
+def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False):
     """ Clusters isolates into lineages based on their
     relative distances.
 
@@ -335,16 +342,16 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e
     distances = X[:,distance_index]
 
     # determine whether ref-ref or ref-query analysis
-    row_labels = []
-    isolate_list = []
-    if qlist is None:
-        sys.stderr.write("Identifying lineages using reference sequences\n")
-        isolate_list = rlist
-        row_labels = list(iter(iterDistRows(rlist, rlist, self = True)))
-    else:
-        sys.stderr.write("Identifying lineages from using reference and query sequences\n")
-        row_labels = list(iter(iterDistRows(rlist, qlist, self = False)))
-        isolate_list = rlist + qlist
+    isolate_list = rlist + qlist
+    row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True)))
+#    if qlist is None:
+#        sys.stderr.write("Identifying lineages using reference sequences\n")
+#        isolate_list = rlist
+#        row_labels = list(iter(iterDistRows(rlist, rlist, self = True)))
+#    else:
+#        sys.stderr.write("Identifying lineages from using reference and query sequences\n")
+#        row_labels = list(iter(iterDistRows(rlist, qlist, self = False)))
+#        isolate_list = rlist + qlist
 
     # determine whether novel analysis or modifying existing analysis
     neighbours = {}
@@ -360,7 +367,8 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e
                                                  R)
     else:
         with open(existing_scheme, 'rb') as pickle_file:
-            lineage_clustering, lineage_seed, neighbours = pickle.load(pickle_file)
+            sys.stderr.write('Loading from stored file ' + str(existing_scheme) + '\n')
+            lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file)
         previous_lineage_clustering = lineage_clustering # use for detecting changes
         # add new queries to lineage clustering
         for q in qlist:
@@ -413,7 +421,7 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e
 
     # store output
     with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file:
-        pickle.dump([lineage_clustering, lineage_seed, neighbours], pickle_file)
+        pickle.dump([lineage_clustering, lineage_seed, neighbours, R], pickle_file)
 
     # print output
     lineage_output_name = output + "/" + output + "_lineage_clusters.csv"
@@ -429,5 +437,66 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e
         for isolate in qlist:
             print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + 'Query', file = lFile)
 
-    return 0
+    return lineage_clustering
+
+def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length,
+                            queryDB, use_mash = False, threads = 1):
+    """Calculates distances between query sequences, so that they can be
+    assigned to lineages alongside existing database sequences.
+ + Args: + dbFuncs (list) + List of backend functions from :func:`~PopPUNK.utils.setupDBFuncs` + rlist (list) + List of reference names + qfile (str) + File containing queries + kmers (list) + List of k-mer sizes + estimated_length (int) + Estimated length of genome, if not calculated from data + queryDB (str) + Query database location + use_mash (bool) + Use the mash backend + threads (int) + Number of threads to use if new db created + (default = 1) + Returns: + qlist1 (list) + Ordered list of queries + distMat (numpy.array) + Query-query distances + """ + + constructDatabase = dbFuncs['constructDatabase'] + queryDatabase = dbFuncs['queryDatabase'] + readDBParams = dbFuncs['readDBParams'] + + # These are returned + qlist1 = None + distMat = None + + # Set up query names + qList, qSeqs = readRfile(qfile, oneSeq = use_mash) + queryFiles = dict(zip(qList, qSeqs)) + if use_mash == True: + rNames = None + qNames = qSeqs + else: + rNames = qList + qNames = rNames + + # Calculate all query-query distances too, if updating database + qlist1, qlist2, distMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = queryDB, + queryPrefix = queryDB, + klist = kmers, + self = True, + number_plot_fits = 0, + threads=threads) + + return qlist1, distMat + From 95ec785104385258eea1055073e5fb7a2d1ad977 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 21:15:18 +0100 Subject: [PATCH 08/46] Fix visualisation from query output --- PopPUNK/__main__.py | 2 +- PopPUNK/lineage_clustering.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 266ba26e..5956fdd3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -843,7 +843,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme - isolateClustering = cluster_into_lineages(complete_distMat, R, output, refList, ordered_queryList, expected_lineage_name, use_accessory) + isolateClustering = {'combined': cluster_into_lineages(complete_distMat, R, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory)} # Prune distances to references only, if not full db dists_out = output + "/" + os.path.basename(output) + ".dists" diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index face1946..87e8f954 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -326,7 +326,8 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e rlist (list) List of reference sequences. qlist (list) - List of query sequences. + List of query sequences being added to an existing clustering. + Should be included within rlist. existing_scheme (str) Path to pickle file containing lineage scheme to which isolates should be added. 
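[Illustrative aside, not part of the patch] The existing_scheme argument above names a pickle whose layout, as written by cluster_into_lineages() in this series, is the list [lineage_clustering, lineage_seed, neighbours, R], saved to <output>/<output>_lineageClusters.pkl. A minimal sketch of reading such a scheme back, assuming only that layout; the path in the usage comment is hypothetical:

    import pickle

    def load_lineage_scheme(pickle_path):
        """Return the stored clustering, seed isolates, neighbours and rank R."""
        with open(pickle_path, 'rb') as pickle_file:
            lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file)
        return lineage_clustering, lineage_seed, neighbours, R

    # e.g. scheme = load_lineage_scheme('lineages/lineages_lineageClusters.pkl')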
@@ -342,7 +343,7 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e distances = X[:,distance_index] # determine whether ref-ref or ref-query analysis - isolate_list = rlist + qlist + isolate_list = rlist row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) # if qlist is None: # sys.stderr.write("Identifying lineages using reference sequences\n") @@ -367,7 +368,6 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e R) else: with open(existing_scheme, 'rb') as pickle_file: - sys.stderr.write('Loading from stored file ' + str(existing_scheme) + '\n') lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file) previous_lineage_clustering = lineage_clustering # use for detecting changes # add new queries to lineage clustering @@ -433,9 +433,10 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e else: print('Id,Lineage__autocolor,QueryStatus', file = lFile) for isolate in rlist: - print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + 'Reference', file = lFile) - for isolate in qlist: - print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + 'Query', file = lFile) + status = 'Reference' + if isolate in qlist: + status = 'Query' + print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + status, file = lFile) return lineage_clustering From 2ea6ccd07ea92c829442d2d7e322120544925456 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 21:57:43 +0100 Subject: [PATCH 09/46] Fix generate-viz mode --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 5956fdd3..ca46d34b 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -601,7 +601,7 @@ def main(): dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" #nodes_to_remove = set(genomeNetwork.nodes).difference(viz_subset) isolates_to_remove = set(combined_seq).difference(viz_subset) - postpruning_combined_seq = sorted(viz_subset) + postpruning_combined_seq = viz_subset newDistMat = complete_distMat if len(isolates_to_remove) > 0: postpruning_combined_seq, newDistMat = prune_distance_matrix(rlist, isolates_to_remove, From 2d179f4969ae00784c85af366cf0ba2b752c7fd0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 26 Apr 2020 19:25:02 +0100 Subject: [PATCH 10/46] Reconfigure to allow multiprocessing --- PopPUNK/lineage_clustering.py | 141 +++++++++++++++++++++++----------- 1 file changed, 98 insertions(+), 43 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 87e8f954..7ae337c4 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -335,9 +335,16 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e Option to use accessory distances rather than core distances. Returns: - completion_status (int) - 0 if successful + lineage_clustering (dict) + Assignment of each isolate to a cluster. + lineage_seed (dict) + Seed isolate used to initiate each cluster. + neighbours (nested dict) + Neighbour relationships between isolates for R. + R (int) + R used to generate neigbours and clustering. 
""" + # process distance matrix distance_index = 1 if use_accessory else 0 distances = X[:,distance_index] @@ -345,46 +352,112 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e # determine whether ref-ref or ref-query analysis isolate_list = rlist row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) -# if qlist is None: -# sys.stderr.write("Identifying lineages using reference sequences\n") -# isolate_list = rlist -# row_labels = list(iter(iterDistRows(rlist, rlist, self = True))) -# else: -# sys.stderr.write("Identifying lineages from using reference and query sequences\n") -# row_labels = list(iter(iterDistRows(rlist, qlist, self = False))) -# isolate_list = rlist + qlist # determine whether novel analysis or modifying existing analysis + use_existing = False neighbours = {} lineage_seed = {} null_cluster_value = len(isolate_list) + 1 lineage_clustering = {i:null_cluster_value for i in isolate_list} previous_lineage_clustering = lineage_clustering - if existing_scheme is None: - neighbours = generate_nearest_neighbours(distances, - row_labels, - isolate_list, - R) - else: + if existing_scheme is not None: with open(existing_scheme, 'rb') as pickle_file: lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file) previous_lineage_clustering = lineage_clustering # use for detecting changes + use_existing = True # add new queries to lineage clustering for q in qlist: lineage_clustering[q] = null_cluster_value - neighbours, lineage_clustering = update_nearest_neighbours(distances, - row_labels, - R, - qlist, - neighbours, - lineage_clustering, - null_cluster_value) + + # run clustering for an individual R + lineage_clustering, lineage_seed, neighbours, R = run_clustering_for_R(R, + distances = distances, + neighbours = neighbours, + lineage_clustering = lineage_clustering, + lineage_seed = lineage_seed, + null_cluster_value = null_cluster_value, + qlist = qlist, + use_existing = use_existing) + + # store output + with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: + pickle.dump([lineage_clustering, lineage_seed, neighbours, R], pickle_file) + + # print output + lineage_output_name = output + "/" + output + "_lineage_clusters.csv" + with open(lineage_output_name, 'w') as lFile: + if qlist is None: + print('Id,Lineage__autocolor', file = lFile) + for isolate in lineage_clustering.keys(): + print(isolate + ',' + str(lineage_clustering[isolate]), file = lFile) + else: + print('Id,Lineage__autocolor,QueryStatus', file = lFile) + for isolate in rlist: + status = 'Reference' + if isolate in qlist: + status = 'Query' + print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + status, file = lFile) + + return lineage_clustering, lineage_seed, neighbours, R +def run_clustering_for_R(R, distances = None, neighbours = None, lineage_clustering = None, + lineage_seed = None, null_cluster_value = None, qlist = None, use_existing = False): + """ Clusters isolates into lineages based on their + relative distances using a single R to enable + parallelisation. + + Args: + R (int) + Integer specifying the maximum rank of neighbour used + for clustering. Should be changed to int list for hierarchical + clustering. + distances (numpy array) + Sorted distances used for analysis. + neighbours (nested dict) + Neighbour relationships between isolates for R. + lineage_clustering (dict) + Assignment of each isolate to a cluster. + lineage_seed (dict) + Seed isolate used to initiate each cluster. 
+ null_cluster_value (int) + Used to denote as-yet-unclustered isolates. + qlist (list) + List of query sequences being added to an existing clustering. + Should be included within rlist. + use_existing (bool) + Whether to extend a previously generated analysis or not. + + Returns: + lineage_clustering (dict) + Assignment of each isolate to a cluster. + lineage_seed (dict) + Seed isolate used to initiate each cluster. + neighbours (nested dict) + Neighbour relationships between isolates for R. + R (int) + R used to generate neigbours and clustering. + """ + + # identify neighbours + if existing_scheme: + neighbours, lineage_clustering = update_nearest_neighbours(distances, + row_labels, + R, + qlist, + neighbours, + lineage_clustering, + null_cluster_value) + else: + neighbours = generate_nearest_neighbours(distances, + row_labels, + isolate_list, + R) + # run clustering lineage_index = 1 while null_cluster_value in lineage_clustering.values(): - + # get seed isolate based on minimum pairwise distances seed_isolate = get_seed_isolate(lineage_clustering, # need to add in lineage seed hash row_labels, @@ -419,25 +492,7 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e # increment index for next lineage lineage_index = lineage_index + 1 - # store output - with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: - pickle.dump([lineage_clustering, lineage_seed, neighbours, R], pickle_file) - - # print output - lineage_output_name = output + "/" + output + "_lineage_clusters.csv" - with open(lineage_output_name, 'w') as lFile: - if qlist is None: - print('Id,Lineage__autocolor', file = lFile) - for isolate in lineage_clustering.keys(): - print(isolate + ',' + str(lineage_clustering[isolate]), file = lFile) - else: - print('Id,Lineage__autocolor,QueryStatus', file = lFile) - for isolate in rlist: - status = 'Reference' - if isolate in qlist: - status = 'Query' - print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + status, file = lFile) - + # return clustering return lineage_clustering def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, From 13006ec8ab0be68d0ca117d4361dedc3701633ce Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 26 Apr 2020 19:57:30 +0100 Subject: [PATCH 11/46] Enable multiple ranks to be used --- PopPUNK/__main__.py | 24 ++++------ PopPUNK/lineage_clustering.py | 87 +++++++++++++++++++---------------- 2 files changed, 58 insertions(+), 53 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index ca46d34b..1200ef10 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -527,7 +527,11 @@ def main(): sys.stderr.write("Checking options here\n\n") if args.lineage_clustering: sys.stderr.write("Mode: Identifying lineages within a clade\n\n") - + + # process rankings to use + rank_list = sorted(args.R.split(',')) + + # load distances distances = None if args.distances is not None: distances = args.distances @@ -535,14 +539,13 @@ def main(): sys.stderr.write("Need to provide an input set of distances with --distances\n\n") sys.exit(1) - # load distance matrix refList, queryList, self, distMat = readPickle(distances) # run lineage clustering if self: - isolateClustering = cluster_into_lineages(distMat, args.R, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = 
args.existing_scheme) else: - isolateClustering = cluster_into_lineages(distMat, args.R, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) #*******************************# #* *# @@ -746,7 +749,6 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances qcPass = qcDistMat(distMat, refList, queryList, max_a_dist) # Calculate query-query distances -################################################################################################################### ordered_queryList = [] # Assign to strains or lineages, as requested @@ -755,14 +757,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances # Assign lineages by calculating query-query information ordered_queryList, query_distMat = calculateQueryDistances(dbFuncs, refList, q_files, kmers, estimated_length, output, use_mash, threads) -# -# # ASSIGN LINEAGE -# expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' -# if existing_scheme: -# expected_lineage_name = existing_scheme -# combined_distances = np.concatenate((distMat,query_distMat),axis = 0) -# isolateClustering = cluster_into_lineages(combined_distances, R, output, rlist = refList, qlist = ordered_queryList, use_accessory = use_accessory, existing_scheme = expected_lineage_name) -################################################################################################################### + if not assign_lineage: # Assign these distances as within or between strain model_prefix = ref_db @@ -840,10 +835,11 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) if assign_lineage: + rank_list = sorted(R.split(',')) expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme - isolateClustering = {'combined': cluster_into_lineages(complete_distMat, R, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory)} + isolateClustering = {'combined': cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory)} # Prune distances to references only, if not full db dists_out = output + "/" + os.path.basename(output) + ".dists" diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 7ae337c4..f41dcf42 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -10,6 +10,7 @@ from scipy.stats import rankdata from collections import defaultdict import pickle +from multiprocessing import Pool, Lock # import poppunk package from .utils import iterDistRows @@ -311,7 +312,7 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust return nn, lineage_clustering -def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False): +def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False): """ Clusters isolates into lineages based on their relative distances. 
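[Illustrative aside, not part of the patch] In this patch the ranks are parsed with rank_list = sorted(args.R.split(',')), which orders the comma-separated values as strings; a later patch in the series converts them to int before sorting. A minimal sketch of why that matters, using a hypothetical --R value:

    args_R = "2,10,1"                                        # hypothetical input
    assert sorted(args_R.split(',')) == ['1', '10', '2']     # lexicographic order
    rank_list = sorted(int(x) for x in args_R.split(','))    # numeric order
    assert rank_list == [1, 2, 10]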
@@ -319,10 +320,9 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e X (np.array) n x 2 array of core and accessory distances for n samples. This should not be subsampled. - R (int) - Integer specifying the maximum rank of neighbour used - for clustering. Should be changed to int list for hierarchical - clustering. + rank_list (list of int) + Integers specifying the maximum rank of neighbours used + for clustering. rlist (list) List of reference sequences. qlist (list) @@ -335,17 +335,14 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e Option to use accessory distances rather than core distances. Returns: - lineage_clustering (dict) - Assignment of each isolate to a cluster. - lineage_seed (dict) - Seed isolate used to initiate each cluster. - neighbours (nested dict) - Neighbour relationships between isolates for R. - R (int) - R used to generate neigbours and clustering. + combined (dict) + Assignment of each isolate to clusters by all ranks used. """ # process distance matrix + # - this should be sorted (PyTorch allows this to be done on GPUs) + # - then the functions can be reimplemented to run faster on a + # sorted list distance_index = 1 if use_accessory else 0 distances = X[:,distance_index] @@ -357,49 +354,63 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e use_existing = False neighbours = {} lineage_seed = {} + null_cluster_value = len(isolate_list) + 1 - lineage_clustering = {i:null_cluster_value for i in isolate_list} - previous_lineage_clustering = lineage_clustering + for R in rank_list: + lineage_clustering[R] = {i:null_cluster_value for i in isolate_list} + previous_lineage_clustering[R] = lineage_clustering[R] + lineage_seed[R] = {} if existing_scheme is not None: with open(existing_scheme, 'rb') as pickle_file: lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file) - previous_lineage_clustering = lineage_clustering # use for detecting changes + previous_lineage_clustering[R] = lineage_clustering[R] # use for detecting changes use_existing = True # add new queries to lineage clustering for q in qlist: lineage_clustering[q] = null_cluster_value # run clustering for an individual R - lineage_clustering, lineage_seed, neighbours, R = run_clustering_for_R(R, - distances = distances, - neighbours = neighbours, - lineage_clustering = lineage_clustering, - lineage_seed = lineage_seed, - null_cluster_value = null_cluster_value, - qlist = qlist, - use_existing = use_existing) + # - this can be parallelised across R using multiprocessing.Pool + for R in rank_list: + lineage_clustering[R], lineage_seed[R], neighbours[R] = run_clustering_for_R(R, + distances = distances, + neighbours = neighbours[R], + lineage_clustering = lineage_clustering[R], + lineage_seed = lineage_seed[R], + null_cluster_value = null_cluster_value, + qlist = qlist, + use_existing = use_existing) # store output + # - this needs to be per R with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: - pickle.dump([lineage_clustering, lineage_seed, neighbours, R], pickle_file) + pickle.dump([lineage_clustering, lineage_seed, neighbours, rank_list], pickle_file) # print output + combined = {} lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: - if qlist is None: - print('Id,Lineage__autocolor', file = lFile) - for isolate in lineage_clustering.keys(): - print(isolate + ',' + str(lineage_clustering[isolate]), 
file = lFile) - else: - print('Id,Lineage__autocolor,QueryStatus', file = lFile) - for isolate in rlist: - status = 'Reference' + print('Id', file = lFile) + for R in rank_list: + print(',Lineage_R' + str(R) + '__autocolor', file = lFile) + if qlist is not None: + print(',Status', file = lFile) + for isolate in lineage_clustering[R].keys(): + print(isolate, file = lFile) + for R in rank_list: + print(',' + str(lineage_clustering[R][isolate]), file = lFile) + if len(combined[isolate]) == 0: + combined[isolate] = lineage_clustering[R][isolate] + else: + combined[isolate] = combined[isolate] + '-' + lineage_clustering[R][isolate] + if qlist is not None: if isolate in qlist: - status = 'Query' - print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + status, file = lFile) + print(',Added',file = lFile) + else: + print(',Existing',file = lFile) - return lineage_clustering, lineage_seed, neighbours, R + return combined[R] def run_clustering_for_R(R, distances = None, neighbours = None, lineage_clustering = None, lineage_seed = None, null_cluster_value = None, qlist = None, use_existing = False): @@ -435,8 +446,6 @@ def run_clustering_for_R(R, distances = None, neighbours = None, lineage_cluster Seed isolate used to initiate each cluster. neighbours (nested dict) Neighbour relationships between isolates for R. - R (int) - R used to generate neigbours and clustering. """ # identify neighbours @@ -493,7 +502,7 @@ def run_clustering_for_R(R, distances = None, neighbours = None, lineage_cluster lineage_index = lineage_index + 1 # return clustering - return lineage_clustering + return lineage_clustering, lineage_seed, neighbours def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, queryDB, use_mash = False, threads = 1): From dce1ab5737392f089744d9c1f9254f2f0093a62a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 27 Apr 2020 08:19:19 +0100 Subject: [PATCH 12/46] Improve printing from lineage querying --- PopPUNK/__main__.py | 6 ++-- PopPUNK/lineage_clustering.py | 55 +++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 1200ef10..520ddec6 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -181,7 +181,7 @@ def get_options(): # lineage clustering within strains lineagesGroup = parser.add_argument_group('Lineage analysis options') - lineagesGroup.add_argument('--R',help='Maximum rank used in lineage clustering [default = 1]', type = int, default = 1) + lineagesGroup.add_argument('--R',help='Comma separated list of ranks used in lineage clustering [default = 1]', type = str, default = "1") lineagesGroup.add_argument('--use-accessory',help='Use accessory distances for lineage definitions [default = use core distances]', action = 'store_true', default = False) lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions', type = str, default = None) @@ -529,7 +529,7 @@ def main(): sys.stderr.write("Mode: Identifying lineages within a clade\n\n") # process rankings to use - rank_list = sorted(args.R.split(',')) + rank_list = sorted([int(x) for x in args.R.split(',')]) # load distances distances = None @@ -835,7 +835,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) if assign_lineage: - rank_list = sorted(R.split(',')) + rank_list = sorted([int(x) for x in R.split(',')]) expected_lineage_name = ref_db + '/' + 
ref_db + '_lineageClusters.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index f41dcf42..051573e2 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -207,11 +207,14 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): ref_distances = np.take(distances, indices) # get unique distances and pick the R smallest values unique_isolate_distances = np.unique(ref_distances) - R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)] + R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)][:R] # get threshold distance for neighbour definition - np.maximum fails for R=1 threshold_distance = R_lowest_distances[0] if R > 1: - threshold_distance = np.maximum(R_lowest_distances) + try: + threshold_distance = np.amax(R_lowest_distances) + except: + sys.stderr.write('Problem with: ' + str(R_lowest_distances) + '\n') # identify all distances below this threshold and store relative to index for i,d in enumerate(ref_distances): if d <= threshold_distance: @@ -354,31 +357,38 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis use_existing = False neighbours = {} lineage_seed = {} + lineage_clustering = {} + previous_lineage_clustering = {} null_cluster_value = len(isolate_list) + 1 for R in rank_list: lineage_clustering[R] = {i:null_cluster_value for i in isolate_list} previous_lineage_clustering[R] = lineage_clustering[R] lineage_seed[R] = {} + neighbours[R] = {} if existing_scheme is not None: with open(existing_scheme, 'rb') as pickle_file: - lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file) + lineage_clustering, lineage_seed, neighbours, rank_list = pickle.load(pickle_file) previous_lineage_clustering[R] = lineage_clustering[R] # use for detecting changes use_existing = True # add new queries to lineage clustering for q in qlist: - lineage_clustering[q] = null_cluster_value + for R in rank_list: + lineage_clustering[R][q] = null_cluster_value # run clustering for an individual R # - this can be parallelised across R using multiprocessing.Pool for R in rank_list: lineage_clustering[R], lineage_seed[R], neighbours[R] = run_clustering_for_R(R, distances = distances, + row_labels = row_labels, neighbours = neighbours[R], lineage_clustering = lineage_clustering[R], + previous_lineage_clustering = previous_lineage_clustering[R], lineage_seed = lineage_seed[R], null_cluster_value = null_cluster_value, + isolate_list = isolate_list, qlist = qlist, use_existing = use_existing) @@ -391,29 +401,30 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis combined = {} lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: - print('Id', file = lFile) + lFile.write('Id') for R in rank_list: - print(',Lineage_R' + str(R) + '__autocolor', file = lFile) + lFile.write(',Lineage_R' + str(R) + '__autocolor') if qlist is not None: - print(',Status', file = lFile) + lFile.write(',Status\n') for isolate in lineage_clustering[R].keys(): - print(isolate, file = lFile) + lFile.write(isolate) for R in rank_list: - print(',' + str(lineage_clustering[R][isolate]), file = lFile) - if len(combined[isolate]) == 0: - combined[isolate] = lineage_clustering[R][isolate] - else: - combined[isolate] = combined[isolate] + '-' + lineage_clustering[R][isolate] + lFile.write(',' 
+ str(lineage_clustering[R][isolate])) + if isolate in combined.keys(): + combined[isolate] = combined[isolate] + '-' + str(lineage_clustering[R][isolate]) + else: + combined[isolate] = str(lineage_clustering[R][isolate]) if qlist is not None: if isolate in qlist: - print(',Added',file = lFile) + lFile.write(',Added\n') else: - print(',Existing',file = lFile) + lFile.write(',Existing\n') - return combined[R] + return combined -def run_clustering_for_R(R, distances = None, neighbours = None, lineage_clustering = None, - lineage_seed = None, null_cluster_value = None, qlist = None, use_existing = False): +def run_clustering_for_R(R, distances = None, row_labels = None, neighbours = None, lineage_clustering = None, + previous_lineage_clustering = None, lineage_seed = None, null_cluster_value = None, + isolate_list = None, qlist = None, use_existing = False): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. @@ -425,14 +436,20 @@ def run_clustering_for_R(R, distances = None, neighbours = None, lineage_cluster clustering. distances (numpy array) Sorted distances used for analysis. + row_labels (list of tuples) + Pairs of isolates labelling each distance. neighbours (nested dict) Neighbour relationships between isolates for R. lineage_clustering (dict) Assignment of each isolate to a cluster. + previous_lineage_clustering (dict) + Assignment of each isolate to a cluster before updating. lineage_seed (dict) Seed isolate used to initiate each cluster. null_cluster_value (int) Used to denote as-yet-unclustered isolates. + isolate_list (list) + List of isolates for lineage assignment. qlist (list) List of query sequences being added to an existing clustering. Should be included within rlist. @@ -449,7 +466,7 @@ def run_clustering_for_R(R, distances = None, neighbours = None, lineage_cluster """ # identify neighbours - if existing_scheme: + if use_existing: neighbours, lineage_clustering = update_nearest_neighbours(distances, row_labels, R, From 035177284dec046f57c898ae808c1344d1cb8428 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 27 Apr 2020 08:53:27 +0100 Subject: [PATCH 13/46] More detailed printing of clustering information --- PopPUNK/lineage_clustering.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 051573e2..54630829 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -363,20 +363,22 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis null_cluster_value = len(isolate_list) + 1 for R in rank_list: lineage_clustering[R] = {i:null_cluster_value for i in isolate_list} - previous_lineage_clustering[R] = lineage_clustering[R] lineage_seed[R] = {} neighbours[R] = {} if existing_scheme is not None: with open(existing_scheme, 'rb') as pickle_file: lineage_clustering, lineage_seed, neighbours, rank_list = pickle.load(pickle_file) - previous_lineage_clustering[R] = lineage_clustering[R] # use for detecting changes use_existing = True # add new queries to lineage clustering for q in qlist: for R in rank_list: lineage_clustering[R][q] = null_cluster_value + # use for detecting changes in lineage assignation + for R in rank_list: + previous_lineage_clustering[R] = lineage_clustering[R] + # run clustering for an individual R # - this can be parallelised across R using multiprocessing.Pool for R in rank_list: @@ -405,15 +407,20 @@ def cluster_into_lineages(X, rank_list 
= None, output = None, rlist = None, qlis for R in rank_list: lFile.write(',Lineage_R' + str(R) + '__autocolor') if qlist is not None: - lFile.write(',Status\n') + lFile.write(',Overall_lineage,Status\n') for isolate in lineage_clustering[R].keys(): lFile.write(isolate) for R in rank_list: lFile.write(',' + str(lineage_clustering[R][isolate])) + lineage_string = str(lineage_clustering[R][isolate]) + # include information on lineage clustering + if lineage_clustering[R][isolate] != previous_lineage_clustering[R][isolate]: + lineage_string = str(previous_lineage_clustering[R][isolate]) + ':' + lineage_string if isolate in combined.keys(): - combined[isolate] = combined[isolate] + '-' + str(lineage_clustering[R][isolate]) + combined[isolate] = combined[isolate] + '-' + lineage_string else: - combined[isolate] = str(lineage_clustering[R][isolate]) + combined[isolate] = lineage_string + lFile.write(',' + combined[isolate]) if qlist is not None: if isolate in qlist: lFile.write(',Added\n') From 8cdad429e240c0a28013d2f886ba49190f7cd1dc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 27 Apr 2020 12:47:16 +0100 Subject: [PATCH 14/46] Improve visualisation of clustering --- PopPUNK/__main__.py | 12 +++++--- PopPUNK/lineage_clustering.py | 54 ++++++++++++++++++++++++++++++----- PopPUNK/plot.py | 5 +++- 3 files changed, 59 insertions(+), 12 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 520ddec6..cee0216a 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -19,6 +19,7 @@ from .lineage_clustering import cluster_into_lineages from .lineage_clustering import calculateQueryDistances +from .lineage_clustering import readLineages from .network import fetchNetwork from .network import constructNetwork @@ -628,7 +629,8 @@ def main(): isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} elif args.viz_lineages: cluster_file = args.viz_lineages - isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} + #isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} + isolateClustering = readLineages(cluster_file) else: # identify existing analysis files model_prefix = args.ref_db @@ -648,9 +650,11 @@ def main(): # prune the network and dictionary of assignments genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(viz_subset)) - for clustering_type in isolateClustering: - isolateClustering[clustering_type] = {viz_key: isolateClustering[clustering_type][viz_key] - for viz_key in viz_subset} + + # process the returned clustering +# for clustering_type in isolateClustering: +# isolateClustering[clustering_type] = {viz_key: isolateClustering[clustering_type][viz_key] +# for viz_key in viz_subset} # generate selected visualisations if args.microreact: diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 54630829..c493aea1 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -5,6 +5,7 @@ # universal import os import sys +import re # additional import numpy as np from scipy.stats import rankdata @@ -401,26 +402,34 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # print output combined = {} + titles_list = ['Lineage_R' + str(R) for R in rank_list] lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: + # print header lFile.write('Id') - for R in rank_list: - lFile.write(',Lineage_R' + str(R) + '__autocolor') + for t in titles_list: + 
lFile.write(',' + t + '__autocolor') + combined[t] = {} + lFile.write(',Overall_lineage') + combined['Overall_lineage'] = {} if qlist is not None: - lFile.write(',Overall_lineage,Status\n') + lFile.write(',Status') + lFile.write('\n') + # print lines for each isolate for isolate in lineage_clustering[R].keys(): lFile.write(isolate) - for R in rank_list: + for n,R in enumerate(rank_list): lFile.write(',' + str(lineage_clustering[R][isolate])) lineage_string = str(lineage_clustering[R][isolate]) # include information on lineage clustering + combined[titles_list[n]][isolate] = lineage_string if lineage_clustering[R][isolate] != previous_lineage_clustering[R][isolate]: lineage_string = str(previous_lineage_clustering[R][isolate]) + ':' + lineage_string if isolate in combined.keys(): - combined[isolate] = combined[isolate] + '-' + lineage_string + combined['Overall_lineage'][isolate] = combined[isolate] + '-' + lineage_string else: - combined[isolate] = lineage_string - lFile.write(',' + combined[isolate]) + combined['Overall_lineage'][isolate] = lineage_string + lFile.write(',' + combined['Overall_lineage'][isolate]) if qlist is not None: if isolate in qlist: lFile.write(',Added\n') @@ -589,3 +598,34 @@ def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, return qlist1, distMat +def readLineages(clustCSV): + """Read a previous reference clustering from CSV + + Args: + clustCSV (str) + File name of CSV with previous cluster assignments + + Returns: + clusters (dict) + Or if return_dict is set keys are sample names, + values are cluster assignments. + """ + clusters = {} + relevant_headers = [] + header_elements = [] + + with open(clustCSV, 'r') as csv_file: + header = csv_file.readline() + # identify columns to include + header_elements = header.rstrip().split(",") + relevant_headers.append(header_elements.index('Overall_lineage')) + relevant_headers.extend([n for n,i in enumerate(header_elements) if re.search('Lineage_R',i)]) + for h in relevant_headers: + clusters[header_elements[h]] = {} + for line in csv_file: + elements = line.rstrip().split(",") + if elements[0] != header_elements[0]: + for h in relevant_headers: + clusters[header_elements[h]][elements[0]] = elements[h] + + return clusters diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index cf2df0e2..3b30eae6 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -477,8 +477,11 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = # process clustering data nodeLabels = [r.split('/')[-1].split('.')[0] for r in nodeNames] + # get example clustering name for validation + example_cluster_title = list(clustering.keys())[0] + for name, label in zip(nodeNames, nodeLabels): - if name in clustering['combined']: + if name in clustering[example_cluster_title]: if output_format == 'microreact': d['id'].append(label) for cluster_type in clustering: From 055d7c825daa35359ea88cafb44056f88651488f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 28 Apr 2020 10:03:49 +0100 Subject: [PATCH 15/46] Parallelise lineage clustering --- PopPUNK/__main__.py | 11 +-- PopPUNK/lineage_clustering.py | 144 ++++++++++++++++++++++------------ 2 files changed, 102 insertions(+), 53 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index cee0216a..709415d9 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -544,9 +544,9 @@ def main(): # run lineage clustering if self: - isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, use_accessory = 
args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) else: - isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) #*******************************# #* *# @@ -817,7 +817,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances nx.write_gpickle(genomeNetwork, output + "/" + os.path.basename(output) + '_graph.gpickle') else: newQueries = ordered_queryList - + # Update the sketch database if newQueries != queryList and use_mash: tmpRefFile = writeTmpFile(newQueries) @@ -837,13 +837,14 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances combined_seq, core_distMat, acc_distMat = update_distance_matrices(refList, ref_distMat, ordered_queryList, distMat, query_distMat) complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) - + if assign_lineage: rank_list = sorted([int(x) for x in R.split(',')]) expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme - isolateClustering = {'combined': cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory)} + #isolateClustering = {'combined': cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads)} + isolateClustering = cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads) # Prune distances to references only, if not full db dists_out = output + "/" + os.path.basename(output) + ".dists" diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index c493aea1..8a93f264 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -11,7 +11,8 @@ from scipy.stats import rankdata from collections import defaultdict import pickle -from multiprocessing import Pool, Lock +from multiprocessing import Pool, Lock, Manager, RawArray, shared_memory, managers +from functools import partial # import poppunk package from .utils import iterDistRows @@ -208,7 +209,13 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): ref_distances = np.take(distances, indices) # get unique distances and pick the R smallest values unique_isolate_distances = np.unique(ref_distances) - R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)][:R] +# R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)[:R]] + lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)] + try: + R_lowest_distances = lowest_distances[:R] + except: + print('Cannot extract lowest from ' + str(lowest_distances)) + exit(1) # get threshold distance for neighbour definition - np.maximum fails for R=1 threshold_distance = R_lowest_distances[0] if R > 1: @@ -316,7 +323,7 @@ def update_nearest_neighbours(distances, 
row_labels, R, qlist, nn, lineage_clust return nn, lineage_clustering -def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False): +def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False, num_processes = 1): """ Clusters isolates into lineages based on their relative distances. @@ -337,6 +344,8 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis should be added. use_accessory (bool) Option to use accessory distances rather than core distances. + num_processes (int) + Number of CPUs to use for calculations. Returns: combined (dict) @@ -352,7 +361,6 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # determine whether ref-ref or ref-query analysis isolate_list = rlist - row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) # determine whether novel analysis or modifying existing analysis use_existing = False @@ -367,36 +375,44 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis lineage_seed[R] = {} neighbours[R] = {} - if existing_scheme is not None: - with open(existing_scheme, 'rb') as pickle_file: - lineage_clustering, lineage_seed, neighbours, rank_list = pickle.load(pickle_file) - use_existing = True - # add new queries to lineage clustering - for q in qlist: - for R in rank_list: - lineage_clustering[R][q] = null_cluster_value - - # use for detecting changes in lineage assignation - for R in rank_list: - previous_lineage_clustering[R] = lineage_clustering[R] - # run clustering for an individual R - # - this can be parallelised across R using multiprocessing.Pool - for R in rank_list: - lineage_clustering[R], lineage_seed[R], neighbours[R] = run_clustering_for_R(R, - distances = distances, - row_labels = row_labels, - neighbours = neighbours[R], - lineage_clustering = lineage_clustering[R], - previous_lineage_clustering = previous_lineage_clustering[R], - lineage_seed = lineage_seed[R], - null_cluster_value = null_cluster_value, - isolate_list = isolate_list, - qlist = qlist, - use_existing = use_existing) - + if num_processes == 1: + for R in rank_list: + lineage_clustering[R], lineage_seed[R], neighbours[R] = run_clustering_for_R(R, + null_cluster_value = null_cluster_value, + qlist = qlist, + existing_scheme = existing_scheme, + distances_length = distances.size) + + + else: + # use multiprocessing + shm_distances = shared_memory.SharedMemory(create = True, size = distances.nbytes, name = 'shm_distances') + distances_shared = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm_distances.buf) + distances_shared[:] = distances[:] + isolate_list_shared = shared_memory.ShareableList(isolate_list, name = 'shm_isolate_list') + + with Pool(processes = num_processes) as pool: + results = pool.map(partial(run_clustering_for_R, + null_cluster_value = null_cluster_value, + qlist = qlist, + existing_scheme = existing_scheme, + distances_length = distances.size), + rank_list) + + for n,result in enumerate(results): + R = rank_list[n] + lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = result + + # manage memory + shm_distances.close() + shm_distances.unlink() + del shm_distances + isolate_list_shared.shm.close() + isolate_list_shared.shm.unlink() + del isolate_list_shared + # store output - # - this needs to be per R with open(output + "/" + output + 
'_lineageClusters.pkl', 'wb') as pickle_file: pickle.dump([lineage_clustering, lineage_seed, neighbours, rank_list], pickle_file) @@ -423,28 +439,35 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis lineage_string = str(lineage_clustering[R][isolate]) # include information on lineage clustering combined[titles_list[n]][isolate] = lineage_string - if lineage_clustering[R][isolate] != previous_lineage_clustering[R][isolate]: + if lineage_clustering[R][isolate] != previous_lineage_clustering[R][isolate] and previous_lineage_clustering[R][isolate] != null_cluster_value: lineage_string = str(previous_lineage_clustering[R][isolate]) + ':' + lineage_string - if isolate in combined.keys(): - combined['Overall_lineage'][isolate] = combined[isolate] + '-' + lineage_string + if isolate in combined['Overall_lineage'].keys(): + combined['Overall_lineage'][isolate] = combined['Overall_lineage'][isolate] + '-' + lineage_string else: combined['Overall_lineage'][isolate] = lineage_string lFile.write(',' + combined['Overall_lineage'][isolate]) if qlist is not None: if isolate in qlist: - lFile.write(',Added\n') + lFile.write(',Added') else: - lFile.write(',Existing\n') + lFile.write(',Existing') + lFile.write('\n') return combined -def run_clustering_for_R(R, distances = None, row_labels = None, neighbours = None, lineage_clustering = None, - previous_lineage_clustering = None, lineage_seed = None, null_cluster_value = None, - isolate_list = None, qlist = None, use_existing = False): +def make_vars_global(distances = None, row_labels = None, isolate_list = None): + global global_distances + global_distances = distances + global global_row_labels + global_row_labels = row_labels + global global_isolate_list + global_isolate_list = isolate_list + +def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances_length = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. - + Args: R (int) Integer specifying the maximum rank of neighbour used @@ -481,8 +504,33 @@ def run_clustering_for_R(R, distances = None, row_labels = None, neighbours = No Neighbour relationships between isolates for R. 
""" - # identify neighbours - if use_existing: + # load shared memory objects + existing_shm = shared_memory.SharedMemory(name = 'shm_distances') + distances = np.ndarray(distances_length, dtype = np.float32, buffer = existing_shm.buf) + isolate_list = shared_memory.ShareableList(name = 'shm_isolate_list') + + # calculate row labels + # this is inefficient but there appears to be no way of sharing + # strings between processes efficiently + row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) + + neighbours = {} + lineage_seed = {} + lineage_clustering = {} + previous_lineage_clustering = {} + + if existing_scheme is not None: + with open(existing_scheme, 'rb') as pickle_file: + lineage_clustering_overall, lineage_seed_overall, neighbours_overall, rank_list = pickle.load(pickle_file) + # focus on relevant data + lineage_clustering = lineage_clustering_overall[R] + lineage_seed = lineage_seed_overall[R] + neighbours = neighbours_overall[R] + # add new queries to lineage clustering + for q in qlist: + lineage_clustering[q] = null_cluster_value + previous_lineage_clustering = lineage_clustering + neighbours, lineage_clustering = update_nearest_neighbours(distances, row_labels, R, @@ -495,13 +543,13 @@ def run_clustering_for_R(R, distances = None, row_labels = None, neighbours = No row_labels, isolate_list, R) - + # run clustering lineage_index = 1 while null_cluster_value in lineage_clustering.values(): - + # get seed isolate based on minimum pairwise distances - seed_isolate = get_seed_isolate(lineage_clustering, # need to add in lineage seed hash + seed_isolate = get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value, @@ -533,9 +581,9 @@ def run_clustering_for_R(R, distances = None, row_labels = None, neighbours = No # increment index for next lineage lineage_index = lineage_index + 1 - + # return clustering - return lineage_clustering, lineage_seed, neighbours + return lineage_clustering, lineage_seed, neighbours, previous_lineage_clustering def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, queryDB, use_mash = False, threads = 1): From af2089ee06ac7b6592910ddfaa5414426ae20be1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 28 Apr 2020 12:56:28 +0100 Subject: [PATCH 16/46] Specify float or double in shared memory --- PopPUNK/__main__.py | 5 --- PopPUNK/lineage_clustering.py | 70 ++++++++++++++--------------------- 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 709415d9..0befe65e 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -651,11 +651,6 @@ def main(): # prune the network and dictionary of assignments genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(viz_subset)) - # process the returned clustering -# for clustering_type in isolateClustering: -# isolateClustering[clustering_type] = {viz_key: isolateClustering[clustering_type][viz_key] -# for viz_key in viz_subset} - # generate selected visualisations if args.microreact: sys.stderr.write("Writing microreact output\n") diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 8a93f264..71fa3851 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -272,6 +272,7 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] query_row_names = [row_labels[i] for i in query_match_indices] query_distances 
= np.take(distances, query_match_indices) + print('Query distances: ' + str(query_distances)) # iterate through references to update existing entries existing_isolates = nn.keys() @@ -284,6 +285,8 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust min_distance_to_query = np.amin(ref_query_distances) # if closer queries, find distances and replace if min_distance_to_query < max_distance_to_ref: + if R == 1: + print('Found new partner for ' + isolate + ' as min query is ' + str(min_distance_to_query) + ' and max ref is ' + str(max_distance_to_ref)) # unset clustering of isolate and neighbours lineage_clustering[isolate] = null_cluster_value for neighbour in nn[isolate].keys(): @@ -374,43 +377,47 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis lineage_clustering[R] = {i:null_cluster_value for i in isolate_list} lineage_seed[R] = {} neighbours[R] = {} + previous_lineage_clustering[R] = {} + + # shared memory data structures + shm_distances = shared_memory.SharedMemory(create = True, size = distances.nbytes, name = 'shm_distances') + distances_shared = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm_distances.buf) + distances_shared[:] = distances[:] + isolate_list_shared = shared_memory.ShareableList(isolate_list, name = 'shm_isolate_list') # run clustering for an individual R if num_processes == 1: for R in rank_list: - lineage_clustering[R], lineage_seed[R], neighbours[R] = run_clustering_for_R(R, + lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = run_clustering_for_R(R, null_cluster_value = null_cluster_value, qlist = qlist, existing_scheme = existing_scheme, - distances_length = distances.size) + distances_length = distances.shape, + distances_type = distances.dtype) else: - # use multiprocessing - shm_distances = shared_memory.SharedMemory(create = True, size = distances.nbytes, name = 'shm_distances') - distances_shared = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm_distances.buf) - distances_shared[:] = distances[:] - isolate_list_shared = shared_memory.ShareableList(isolate_list, name = 'shm_isolate_list') with Pool(processes = num_processes) as pool: results = pool.map(partial(run_clustering_for_R, null_cluster_value = null_cluster_value, qlist = qlist, existing_scheme = existing_scheme, - distances_length = distances.size), + distances_length = distances.shape, + distances_type = distances.dtype), rank_list) for n,result in enumerate(results): R = rank_list[n] lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = result - # manage memory - shm_distances.close() - shm_distances.unlink() - del shm_distances - isolate_list_shared.shm.close() - isolate_list_shared.shm.unlink() - del isolate_list_shared + # manage memory + shm_distances.close() + shm_distances.unlink() + del shm_distances + isolate_list_shared.shm.close() + isolate_list_shared.shm.unlink() + del isolate_list_shared # store output with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: @@ -431,6 +438,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis if qlist is not None: lFile.write(',Status') lFile.write('\n') + # print lines for each isolate for isolate in lineage_clustering[R].keys(): lFile.write(isolate) @@ -455,15 +463,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis return combined -def make_vars_global(distances = None, row_labels = None, 
isolate_list = None): - global global_distances - global_distances = distances - global global_row_labels - global_row_labels = row_labels - global global_isolate_list - global_isolate_list = isolate_list - -def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances_length = None): +def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances_length = None, distances_type = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. @@ -473,22 +473,8 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc Integer specifying the maximum rank of neighbour used for clustering. Should be changed to int list for hierarchical clustering. - distances (numpy array) - Sorted distances used for analysis. - row_labels (list of tuples) - Pairs of isolates labelling each distance. - neighbours (nested dict) - Neighbour relationships between isolates for R. - lineage_clustering (dict) - Assignment of each isolate to a cluster. - previous_lineage_clustering (dict) - Assignment of each isolate to a cluster before updating. - lineage_seed (dict) - Seed isolate used to initiate each cluster. null_cluster_value (int) Used to denote as-yet-unclustered isolates. - isolate_list (list) - List of isolates for lineage assignment. qlist (list) List of query sequences being added to an existing clustering. Should be included within rlist. @@ -506,7 +492,7 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc # load shared memory objects existing_shm = shared_memory.SharedMemory(name = 'shm_distances') - distances = np.ndarray(distances_length, dtype = np.float32, buffer = existing_shm.buf) + distances = np.ndarray(distances_length, dtype = distances_type, buffer = existing_shm.buf) isolate_list = shared_memory.ShareableList(name = 'shm_isolate_list') # calculate row labels @@ -514,10 +500,10 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc # strings between processes efficiently row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) - neighbours = {} + lineage_clustering = {i:null_cluster_value for i in isolate_list} + previous_lineage_clustering = lineage_clustering lineage_seed = {} - lineage_clustering = {} - previous_lineage_clustering = {} + neighbours = {} if existing_scheme is not None: with open(existing_scheme, 'rb') as pickle_file: From ab2ee00e9a18d379f5ccc8289adf6b7caf392b9a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 28 Apr 2020 13:47:45 +0100 Subject: [PATCH 17/46] Sort distances and names in lineage clustering --- PopPUNK/lineage_clustering.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 71fa3851..ca13b3fd 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -272,7 +272,7 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] query_row_names = [row_labels[i] for i in query_match_indices] query_distances = np.take(distances, query_match_indices) - print('Query distances: ' + str(query_distances)) + # iterate through references to update existing entries existing_isolates = nn.keys() @@ -285,8 +285,6 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, 
lineage_clust min_distance_to_query = np.amin(ref_query_distances) # if closer queries, find distances and replace if min_distance_to_query < max_distance_to_ref: - if R == 1: - print('Found new partner for ' + isolate + ' as min query is ' + str(min_distance_to_query) + ' and max ref is ' + str(max_distance_to_ref)) # unset clustering of isolate and neighbours lineage_clustering[isolate] = null_cluster_value for neighbour in nn[isolate].keys(): @@ -362,6 +360,10 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis distance_index = 1 if use_accessory else 0 distances = X[:,distance_index] + # sort distances + distance_ranks = rankdata(distances, method = 'ordinal') + distances = distances[distance_ranks-1] + # determine whether ref-ref or ref-query analysis isolate_list = rlist @@ -383,6 +385,11 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis shm_distances = shared_memory.SharedMemory(create = True, size = distances.nbytes, name = 'shm_distances') distances_shared = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm_distances.buf) distances_shared[:] = distances[:] + + shm_distance_ranks = shared_memory.SharedMemory(create = True, size = distance_ranks.nbytes, name = 'shm_distance_ranks') + distance_ranks_shared = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = shm_distance_ranks.buf) + distance_ranks_shared[:] = distance_ranks[:] + isolate_list_shared = shared_memory.ShareableList(isolate_list, name = 'shm_isolate_list') # run clustering for an individual R @@ -415,6 +422,9 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis shm_distances.close() shm_distances.unlink() del shm_distances + shm_distance_ranks.close() + shm_distance_ranks.unlink() + del shm_distance_ranks isolate_list_shared.shm.close() isolate_list_shared.shm.unlink() del isolate_list_shared @@ -491,14 +501,18 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc """ # load shared memory objects - existing_shm = shared_memory.SharedMemory(name = 'shm_distances') - distances = np.ndarray(distances_length, dtype = distances_type, buffer = existing_shm.buf) + existing_distance_shm = shared_memory.SharedMemory(name = 'shm_distances') + distances = np.ndarray(distances_length, dtype = distances_type, buffer = existing_distance_shm.buf) + existing_rank_shm = shared_memory.SharedMemory(name = 'shm_distance_ranks') + distance_ranks = np.ndarray(distances_length, dtype = 'int64', buffer = existing_rank_shm.buf) isolate_list = shared_memory.ShareableList(name = 'shm_isolate_list') # calculate row labels # this is inefficient but there appears to be no way of sharing # strings between processes efficiently row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) + # reorder by sorted distances + row_labels = [row_labels[i-1] for i in distance_ranks] lineage_clustering = {i:null_cluster_value for i in isolate_list} previous_lineage_clustering = lineage_clustering From 3e59ecb0ff18732390db0c45e586b1d6d3d074cc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 28 Apr 2020 21:43:59 +0100 Subject: [PATCH 18/46] Improve neighbour assignment --- PopPUNK/lineage_clustering.py | 39 ++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index ca13b3fd..636ea28c 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -179,6 
+179,39 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): return lineage_info def generate_nearest_neighbours(distances, row_labels, isolate_list, R): + # data structures + nn = {} + last_dist = {} + num_ranks = {} + for i in isolate_list: + nn[i] = {} + num_ranks[i] = 0 + total_isolates = len(isolate_list) + completed_isolates = 0 + index = 0 + # iterate through distances until all nearest neighbours identified + while completed_isolates < total_isolates: + try: + distance = distances[index] + except: + sys.stderr.write('Not enough distances! Try reducing R\n') + exit(1) + for isolate in row_labels[index]: + if isolate in num_ranks.keys() and num_ranks[isolate] < R: + if isolate in last_dist.keys() and last_dist[isolate] != distance: + num_ranks[isolate] = num_ranks[isolate] + 1 + if num_ranks[isolate] >= R: + completed_isolates = completed_isolates + 1 + else: + pair = row_labels[index][0] if row_labels[index][1] == isolate else row_labels[index][1] + nn[isolate][pair] = distance + last_dist[isolate] = distance + index = index + 1 + # return completed dict + return nn + + +def old_generate_nearest_neighbours(distances, row_labels, isolate_list, R): """ Identifies the nearest neighbours from the core genome distances. @@ -361,8 +394,8 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis distances = X[:,distance_index] # sort distances - distance_ranks = rankdata(distances, method = 'ordinal') - distances = distances[distance_ranks-1] + distance_ranks = np.argsort(distances) + distances = distances[distance_ranks] # determine whether ref-ref or ref-query analysis isolate_list = rlist @@ -512,7 +545,7 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc # strings between processes efficiently row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) # reorder by sorted distances - row_labels = [row_labels[i-1] for i in distance_ranks] + row_labels = [row_labels[i] for i in distance_ranks] lineage_clustering = {i:null_cluster_value for i in isolate_list} previous_lineage_clustering = lineage_clustering From 486544ba5e21b93a3d41f9d1e4ab3a2503072ad4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 29 Apr 2020 09:54:08 +0100 Subject: [PATCH 19/46] Further optimisation - includes some debugging --- PopPUNK/lineage_clustering.py | 242 ++++++++++++++-------------------- 1 file changed, 100 insertions(+), 142 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 636ea28c..def7befa 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -89,7 +89,7 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value, lineage_index, lineage_seed): """ Identifies the isolate used to initiate a cluster. - + Args: lineage_clustering (dict) Clustering of existing dataset. @@ -104,11 +104,11 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val Label of current lineage. lineage_seed (dict) Dict of seeds used to initiate pre-existing lineage definitions. - + Returns: seed_isolate (str) Isolate used to initiate the next lineage.
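[Editor's note: the generate_nearest_neighbours loop added in the hunk above scans distances in ascending order and, per isolate, counts distinct distance values as ranks until each isolate has R of them; ties at the final rank are kept, so an isolate can hold more than R neighbours. A minimal standalone sketch of this rank-based selection (isolate names and distances are invented toy data, not PopPUNK inputs):

from collections import defaultdict

distances = [0.0, 0.1, 0.1, 0.2, 0.3, 0.4]
row_labels = [('A', 'B'), ('A', 'C'), ('B', 'C'), ('A', 'D'), ('B', 'D'), ('C', 'D')]
R = 2  # keep neighbours at the R smallest *unique* distances

nn = defaultdict(dict)
ranks_seen = defaultdict(set)  # unique distance values seen per isolate
for distance, pair in zip(distances, row_labels):
    for isolate in pair:
        # a distance only opens a new rank if it differs from those
        # already recorded for this isolate
        if len(ranks_seen[isolate] | {distance}) <= R:
            ranks_seen[isolate].add(distance)
            other = pair[0] if pair[1] == isolate else pair[1]
            nn[isolate][other] = distance

print(dict(nn))  # e.g. 'A' keeps B (0.0, rank 1) and C (0.1, rank 2)]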
- + """ # variable to return seed_isolate = None @@ -122,29 +122,14 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val seed_isolate = original_seed_isolate # else identify a new seed from the closest pair else: - # extract all pairwise distances between isolates that are not yet clustered - clustered_isolates = frozenset([isolate for isolate in lineage_clustering.keys() if lineage_clustering[isolate] != null_cluster_value]) - unclustered_pair_indices = [n for n,pair in enumerate(row_labels) if not set(pair).issubset(clustered_isolates)] - unclustered_distances = distances[unclustered_pair_indices] - # start at zero as the minimum if present in the unclustered set - minimum_unclustered_distance = 0 - # otherwise go through the bother of picking a minimum - if minimum_unclustered_distance not in unclustered_distances: - if len(unclustered_distances) == 1: - minimum_unclustered_distance = unclustered_distances[0] - elif len(unclustered_distances) > 1: - minimum_unclustered_distance = np.amin(unclustered_distances) - else: - sys.stderr.write('Cannot find any more isolates not assigned to lineages; unclustered pair indices are ' + str(unclustered_pair_indices)) - exit(1) - # identify which entries in the full distance set have this value - # unfortunately these distances are not necessarily neighbours - distance_indices = np.where(distances == minimum_unclustered_distance)[0].tolist() - # identify an unclustered isolate from the pair that have this value - closest_unclustered_pair_index = int(set(unclustered_pair_indices).intersection(set(distance_indices)).pop()) - seed_isolate = row_labels[closest_unclustered_pair_index][0] if lineage_clustering[row_labels[closest_unclustered_pair_index][0]] == null_cluster_value else row_labels[closest_unclustered_pair_index][1] + for index,distance in enumerate(distances): + for isolate in row_labels[index]: + if lineage_clustering[isolate] == null_cluster_value: + seed_isolate = isolate + break + # return identified seed isolate return seed_isolate - + def get_lineage_clustering_information(seed_isolate, row_labels, distances): """ Generates the ranked distances needed for cluster definition. 
@@ -164,21 +149,58 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): """ # data structure lineage_info = defaultdict(list) - # get subset of relevant distances - comparisons_involving_seed = [n for n,(pair) in enumerate(row_labels) if seed_isolate in pair] - distances_to_seed = distances[comparisons_involving_seed] - # get ranks of data - distance_ranks = rankdata(distances_to_seed) - # get partners of seed isolate in each comparison - pairs_involving_seed = [row_labels[x] for x in comparisons_involving_seed] - seed_partners = [r if q == seed_isolate else q for (r,q) in pairs_involving_seed] - # create a dict of lists of isolates for a given rank - # enables later easy iterating through ranked distances - for rank in np.unique(distance_ranks): - lineage_info[rank] = [seed_partners[n] for n,r in enumerate(distance_ranks) if r == rank] + rank = 0 + last_dist = -1 + # iterate through distances recording rank + for index,distance in enumerate(distances): + if seed_isolate in row_labels[index]: + if distance > last_dist: + rank = rank + 1 + last_dist = distance + pair = row_labels[index][0] if row_labels[index][1] == seed_isolate else row_labels[index][1] + lineage_info[rank].append(pair) + # return information return lineage_info +#def old_get_lineage_clustering_information(seed_isolate, row_labels, distances): +# """ Generates the ranked distances needed for cluster +# definition. +# +# Args: +# seed_isolate (str) +# Isolate used to initiate lineage. +# row_labels (list of tuples) +# Pairs of isolates labelling each distance. +# distances (numpy array) +# Pairwise distances used for defining relationships. +# +# Returns: +# lineage_info (dict) +# Dict listing isolates by ranked distance from seed. +# +# """ +# # data structure +# lineage_info = defaultdict(list) +# # get subset of relevant distances +# comparisons_involving_seed = [n for n,(pair) in enumerate(row_labels) if seed_isolate in pair] +# distances_to_seed = distances[comparisons_involving_seed] +# # get ranks of data +# distance_ranks = rankdata(distances_to_seed) +# # get partners of seed isolate in each comparison +# pairs_involving_seed = [row_labels[x] for x in comparisons_involving_seed] +# seed_partners = [r if q == seed_isolate else q for (r,q) in pairs_involving_seed] +# # create a dict of lists of isolates for a given rank +# # enables later easy iterating through ranked distances +# for rank in np.unique(distance_ranks): +# lineage_info[rank] = [seed_partners[n] for n,r in enumerate(distance_ranks) if r == rank] +# # debug +# if seed_isolate in {'EPI_ISL_416602','EPI_ISL_416650','EPI_ISL_417950','EPI_ISL_417931','EPI_ISL_416649','EPI_ISL_417975','EPI_ISL_413593','EPI_ISL_416633','EPI_ISL_416643','EPI_ISL_413592','EPI_ISL_416653'} and rank < 10: +# print('Rank: ' + str(rank) + '\tpair: ' + str(lineage_info[rank]) + '\tseed: ' + seed_isolate) +# return lineage_info + def generate_nearest_neighbours(distances, row_labels, isolate_list, R): + # clade + in_lineage = 
{'EPI_ISL_419768','EPI_ISL_420716','EPI_ISL_417108','EPI_ISL_418430','EPI_ISL_417105','EPI_ISL_417102','EPI_ISL_419747','EPI_ISL_419749','EPI_ISL_419762','EPI_ISL_418406','EPI_ISL_421207','EPI_ISL_419733','EPI_ISL_417100','EPI_ISL_419744','EPI_ISL_418408','EPI_ISL_418416','EPI_ISL_417104','EPI_ISL_420721','EPI_ISL_418418','EPI_ISL_420717','EPI_ISL_419739','EPI_ISL_414045','EPI_ISL_419731','EPI_ISL_419729','EPI_ISL_419741','EPI_ISL_417107','EPI_ISL_419748','EPI_ISL_420727','EPI_ISL_420750','EPI_ISL_418405','EPI_ISL_418412','EPI_ISL_419765','EPI_ISL_418413','EPI_ISL_420723','EPI_ISL_420742','EPI_ISL_419725','EPI_ISL_419763','EPI_ISL_418432','EPI_ISL_418431','EPI_ISL_420745','EPI_ISL_418403','EPI_ISL_420751','EPI_ISL_419760','EPI_ISL_418409','EPI_ISL_418419','EPI_ISL_418414','EPI_ISL_419743','EPI_ISL_418400','EPI_ISL_419745','EPI_ISL_420722','EPI_ISL_418402','EPI_ISL_418417','EPI_ISL_419746','EPI_ISL_420724','EPI_ISL_420746','EPI_ISL_419761','EPI_ISL_419764','EPI_ISL_417101','EPI_ISL_420714','EPI_ISL_420719','EPI_ISL_419740','EPI_ISL_417148','EPI_ISL_420748','EPI_ISL_421212','EPI_ISL_419767','EPI_ISL_420747','EPI_ISL_419783','EPI_ISL_419726','EPI_ISL_419720','EPI_ISL_416691','EPI_ISL_419713','EPI_ISL_416693','EPI_ISL_416698','EPI_ISL_417197','EPI_ISL_421213','EPI_ISL_418404','EPI_ISL_419730','EPI_ISL_417122','EPI_ISL_420720','EPI_ISL_420785','EPI_ISL_421194','EPI_ISL_418426','EPI_ISL_417988','EPI_ISL_416656','EPI_ISL_416636','EPI_ISL_413574','EPI_ISL_421191','EPI_ISL_416631','EPI_ISL_416603','EPI_ISL_419766','EPI_ISL_420729','EPI_ISL_417103','EPI_ISL_420718','EPI_ISL_419728','EPI_ISL_417192','EPI_ISL_416685','EPI_ISL_419719','EPI_ISL_416658','EPI_ISL_420749','EPI_ISL_421210','EPI_ISL_419717','EPI_ISL_416686','EPI_ISL_419723','EPI_ISL_413572','EPI_ISL_414027','EPI_ISL_414023','EPI_ISL_419727','EPI_ISL_414020','EPI_ISL_416680','EPI_ISL_417196','EPI_ISL_421203','EPI_ISL_419708','EPI_ISL_419712','EPI_ISL_419710','EPI_ISL_419716','EPI_ISL_416626','EPI_ISL_416625','EPI_ISL_417121','EPI_ISL_416688','EPI_ISL_420715','EPI_ISL_419736','EPI_ISL_418401','EPI_ISL_417987','EPI_ISL_419737','EPI_ISL_420786','EPI_ISL_417109','EPI_ISL_419718','EPI_ISL_416682','EPI_ISL_417194','EPI_ISL_419769','EPI_ISL_417106','EPI_ISL_416690','EPI_ISL_421204','EPI_ISL_419711','EPI_ISL_419714','EPI_ISL_420728'} # data structures nn = {} last_dist = {} @@ -205,73 +227,20 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): else: pair = row_labels[index][0] if row_labels[index][1] == isolate else row_labels[index][1] nn[isolate][pair] = distance + # debug + if R == 5 and (isolate in in_lineage and pair not in in_lineage): + print('In: ' + isolate + '\tOut: ' + pair) + elif R == 5 and (pair in in_lineage and isolate not in in_lineage): + print('In: ' + pair + '\tOut: ' + isolate) last_dist[isolate] = distance index = index + 1 # return completed dict return nn - - -def old_generate_nearest_neighbours(distances, row_labels, isolate_list, R): - """ Identifies the nearest neighbours from the core - genome distances. - - Args: - distances (numpy array) - Pairwise distances used for defining relationships. - row_labels (list of tuples) - Pairs of isolates labelling each distance. - isolate_list (list) - List of isolates for lineage assignment. - R (int) - Maximum rank of neighbours used for clustering. 
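[Editor's note: the rank assignment introduced above increments the rank only when the distance from the seed increases, so tied distances share a rank. A toy sketch of that behaviour (seed, names and distances invented):

from collections import defaultdict

seed_isolate = 'S'
distances = [0.0, 0.1, 0.1, 0.2, 0.2]  # sorted ascending
row_labels = [('S', 'A'), ('S', 'B'), ('S', 'C'), ('A', 'B'), ('S', 'D')]

lineage_info = defaultdict(list)
rank = 0
last_dist = -1
for distance, pair in zip(distances, row_labels):
    if seed_isolate in pair:
        # only a strictly larger distance opens a new rank
        if distance > last_dist:
            rank += 1
            last_dist = distance
        partner = pair[0] if pair[1] == seed_isolate else pair[1]
        lineage_info[rank].append(partner)

print(dict(lineage_info))  # {1: ['A'], 2: ['B', 'C'], 3: ['D']}]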
- - Returns: - nn - Dict of dict of neighbours per isolate, and pairwise distances - """ - # data structures - nn = {} - # iterate through isolates - for isolate in isolate_list: - # initiate dict - nn[isolate] = {} - # get indices of rows involving the considered isolate - indices = [n for n, (r, q) in enumerate(row_labels) if r == isolate or q == isolate] - # get the corresponding distances - #ref_distances = np.take(X, np.array(indices, dtype = 'int64')) - ref_distances = np.take(distances, indices) - # get unique distances and pick the R smallest values - unique_isolate_distances = np.unique(ref_distances) -# R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)[:R]] - lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)] - try: - R_lowest_distances = lowest_distances[:R] - except: - print('Cannot extract lowest from ' + str(lowest_distances)) - exit(1) - # get threshold distance for neighbour definition - np.maximum fails for R=1 - threshold_distance = R_lowest_distances[0] - if R > 1: - try: - threshold_distance = np.amax(R_lowest_distances) - except: - sys.stderr.write('Problem with: ' + str(R_lowest_distances) + '\n') - # identify all distances below this threshold and store relative to index - for i,d in enumerate(ref_distances): - if d <= threshold_distance: - # get index of row in full list - index = indices[i] - # get name of other isolate in pair - name = row_labels[index][1] if row_labels[index][0] == isolate else row_labels[index][0] - nn[isolate][name] = ref_distances[i] - # return completed dictionary - return nn - def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clustering, null_cluster_value): """ Updates the information on nearest neighbours, given a new set of ref-query and query-query distances. - + Args: distances (numpy array) Distances to be used for defining lineages. @@ -288,17 +257,17 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust null_cluster_value (int) Null cluster value used for unsetting lineage assignments where this may change due to altered neighbour relationships. - + Returns: nn (nested dict) Updated neighbour relationships. lineage_clustering (dict) Updated assignment of isolates to lineage. 
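[Editor's note: the hunk that follows replaces the per-reference bookkeeping when queries are added. A sketch of the core update rule for a single reference, simplified to one incoming query and ignoring the later num_distances refinement (toy values, not the committed implementation):

null_cluster_value = 99
lineage_clustering = {'ref': 3, 'old_nb': 3, 'q1': null_cluster_value}
nn = {'ref': {'old_nb': 0.4}}

query, distance = 'q1', 0.2          # new ref-query distance
max_distance = max(nn['ref'].values())

if distance <= max_distance:
    # a closer query invalidates the cached clustering of the
    # reference and its neighbours, so they are reset for re-clustering
    lineage_clustering['ref'] = null_cluster_value
    for neighbour in nn['ref']:
        lineage_clustering[neighbour] = null_cluster_value
    nn['ref'][query] = distance
    if distance < max_distance:
        # drop the displaced, highest-ranked neighbours
        displaced = [o for o, d in nn['ref'].items() if d == max_distance]
        for other in displaced:
            del nn['ref'][other]

print(nn['ref'], lineage_clustering)  # {'q1': 0.2}, all three unset]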
- + """ # iterate through isolates and test whether any comparisons with # newly-added queries replace or supplement existing neighbours - + # data structures for altered entries nn_new = {} # pre-process to extract ref-query distances first @@ -306,57 +275,46 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust query_row_names = [row_labels[i] for i in query_match_indices] query_distances = np.take(distances, query_match_indices) - # iterate through references to update existing entries - existing_isolates = nn.keys() - - for isolate in existing_isolates: - # get max distance to existing neighbours - max_distance_to_ref = max(nn[isolate].values()) - # get min distance to queries - ref_query_match_indices = [n for n, (r, q) in enumerate(query_row_names) if r == isolate or q == isolate] - ref_query_distances = np.take(query_distances, ref_query_match_indices) - min_distance_to_query = np.amin(ref_query_distances) - # if closer queries, find distances and replace - if min_distance_to_query < max_distance_to_ref: - # unset clustering of isolate and neighbours - lineage_clustering[isolate] = null_cluster_value - for neighbour in nn[isolate].keys(): - lineage_clustering[neighbour] = null_cluster_value - # update neighbour information - nn_old = nn[isolate] - nn_new[isolate] = {} - # old neighbours from existing dict - old_distances = list(set(nn[isolate].values())) - # new neighbours from new distances in X - new_distances = np.unique(ref_query_distances) - # combine old and new distances - combined_ref_distances = list(nn[isolate].values()) + ref_query_distances.tolist() - # take top ranked distances from both - ranked_unique_combined_ref_distances = sorted(set(combined_ref_distances))[:R] - for distance in ranked_unique_combined_ref_distances: - # add old neighbours back in - if distance in old_distances: - for neighbour,d in nn[isolate].items(): - if d == distance: - nn_new[isolate][neighbour] = distance - # add new neighbours - if distance in new_distances: - for n,d in enumerate(ref_query_distances): - if d == distance: - row_label_index = ref_query_match_indices[n] - neighbour = row_labels[row_label_index][0] if row_labels[row_label_index][1] == isolate else row_labels[row_label_index][1] - nn_new[isolate][neighbour] = d + # calculate max distances for each isolate + max_distance = {} + for existing_isolate in nn.keys(): + max_distance[existing_isolate] = max(nn[existing_isolate].values()) + # iterate through the ref-query distances + for index,(distance,(isolate1,isolate2)) in enumerate(zip(query_distances,query_row_names)): + # identify ref-query matches + ref = None + query = None + if isolate1 in max_distance.keys() and isolate2 not in max_distance.keys(): + ref = isolate1 + query = isolate2 + elif isolate2 in max_distance.keys() and isolate1 not in max_distance.keys(): + ref = isolate2 + query = isolate1 + if ref is not None: + if distance <= max_distance[ref]: + # unset isolate and references + lineage_clustering[ref] = null_cluster_value + for neighbour in nn[ref]: + lineage_clustering[neighbour] = null_cluster_value + # add neighbours + nn[ref][query] = distance + # delete links if no longer high ranked match + if distance < max_distance[ref]: + to_delete = [] + for other in nn[ref].keys(): + if nn[ref][other] == max_distance[ref]: + to_delete.append(other) + for other in to_delete: + del nn[ref][other] + max_distance[ref] = max(nn[ref].values()) # get nn for query sequences query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) # 
merge dicts for query in query_nn.keys(): nn[query] = query_nn[query] - for updated in nn_new.keys(): - nn[updated] = nn_new[updated] # return updated dict return nn, lineage_clustering - def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False, num_processes = 1): """ Clusters isolates into lineages based on their relative distances. From 6a195cf8c88e3c85148ea08a20b7ff910080b411 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 29 Apr 2020 13:39:00 +0100 Subject: [PATCH 20/46] Substantial performance improvement, some error fixes --- PopPUNK/lineage_clustering.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index def7befa..af7fdfa1 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -67,13 +67,12 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n lineage_clustering[isolate] = lineage_index # add neighbours of same or lower rank to lineage if unclustered for isolate_neighbour in neighbours[isolate].keys(): + # if lineage of neighbour is higher, or it is null value (highest) if lineage_clustering[isolate_neighbour] > lineage_index: - for neighbour_rank in lineage_clustering_information.keys(): - if neighbour_rank <= rank: - if isolate_neighbour in lineage_clustering_information[neighbour_rank]: - lineage_clustering[isolate_neighbour] = lineage_index - else: - break + # check if rank of neighbour is lower than that of the current subject isolate + for neighbour_rank in range(1,rank): + if isolate_neighbour in lineage_clustering_information[neighbour_rank]: + lineage_clustering[isolate_neighbour] = lineage_index # if this represents a change from the previous iteration of lineage definitions # then the change needs to propagate through higher-ranked members if isolate in previous_lineage_clustering: @@ -122,12 +121,17 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val seed_isolate = original_seed_isolate # else identify a new seed from the closest pair else: - for index,distance in enumerate(distances): - for isolate in row_labels[index]: - if lineage_clustering[isolate] == null_cluster_value: - seed_isolate = isolate + for index,(distance,(isolate1,isolate2)) in enumerate(zip(distances,row_labels)): + if lineage_clustering[isolate1] == null_cluster_value: +# print('Found one!: ' + isolate1 + ' at distance ' + str(distance)) + seed_isolate = isolate1 + break + elif lineage_clustering[isolate2] == null_cluster_value: + seed_isolate = isolate2 +# print('Found one!: ' + isolate2 + ' at distance ' + str(distance)) break # return identified seed isolate +# print('Seed: ' + seed_isolate) return seed_isolate def get_lineage_clustering_information(seed_isolate, row_labels, distances): @@ -199,8 +203,6 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): # return lineage_info def generate_nearest_neighbours(distances, row_labels, isolate_list, R): - # clade - in_lineage = 
{'EPI_ISL_419768','EPI_ISL_420716','EPI_ISL_417108','EPI_ISL_418430','EPI_ISL_417105','EPI_ISL_417102','EPI_ISL_419747','EPI_ISL_419749','EPI_ISL_419762','EPI_ISL_418406','EPI_ISL_421207','EPI_ISL_419733','EPI_ISL_417100','EPI_ISL_419744','EPI_ISL_418408','EPI_ISL_418416','EPI_ISL_417104','EPI_ISL_420721','EPI_ISL_418418','EPI_ISL_420717','EPI_ISL_419739','EPI_ISL_414045','EPI_ISL_419731','EPI_ISL_419729','EPI_ISL_419741','EPI_ISL_417107','EPI_ISL_419748','EPI_ISL_420727','EPI_ISL_420750','EPI_ISL_418405','EPI_ISL_418412','EPI_ISL_419765','EPI_ISL_418413','EPI_ISL_420723','EPI_ISL_420742','EPI_ISL_419725','EPI_ISL_419763','EPI_ISL_418432','EPI_ISL_418431','EPI_ISL_420745','EPI_ISL_418403','EPI_ISL_420751','EPI_ISL_419760','EPI_ISL_418409','EPI_ISL_418419','EPI_ISL_418414','EPI_ISL_419743','EPI_ISL_418400','EPI_ISL_419745','EPI_ISL_420722','EPI_ISL_418402','EPI_ISL_418417','EPI_ISL_419746','EPI_ISL_420724','EPI_ISL_420746','EPI_ISL_419761','EPI_ISL_419764','EPI_ISL_417101','EPI_ISL_420714','EPI_ISL_420719','EPI_ISL_419740','EPI_ISL_417148','EPI_ISL_420748','EPI_ISL_421212','EPI_ISL_419767','EPI_ISL_420747','EPI_ISL_419783','EPI_ISL_419726','EPI_ISL_419720','EPI_ISL_416691','EPI_ISL_419713','EPI_ISL_416693','EPI_ISL_416698','EPI_ISL_417197','EPI_ISL_421213','EPI_ISL_418404','EPI_ISL_419730','EPI_ISL_417122','EPI_ISL_420720','EPI_ISL_420785','EPI_ISL_421194','EPI_ISL_418426','EPI_ISL_417988','EPI_ISL_416656','EPI_ISL_416636','EPI_ISL_413574','EPI_ISL_421191','EPI_ISL_416631','EPI_ISL_416603','EPI_ISL_419766','EPI_ISL_420729','EPI_ISL_417103','EPI_ISL_420718','EPI_ISL_419728','EPI_ISL_417192','EPI_ISL_416685','EPI_ISL_419719','EPI_ISL_416658','EPI_ISL_420749','EPI_ISL_421210','EPI_ISL_419717','EPI_ISL_416686','EPI_ISL_419723','EPI_ISL_413572','EPI_ISL_414027','EPI_ISL_414023','EPI_ISL_419727','EPI_ISL_414020','EPI_ISL_416680','EPI_ISL_417196','EPI_ISL_421203','EPI_ISL_419708','EPI_ISL_419712','EPI_ISL_419710','EPI_ISL_419716','EPI_ISL_416626','EPI_ISL_416625','EPI_ISL_417121','EPI_ISL_416688','EPI_ISL_420715','EPI_ISL_419736','EPI_ISL_418401','EPI_ISL_417987','EPI_ISL_419737','EPI_ISL_420786','EPI_ISL_417109','EPI_ISL_419718','EPI_ISL_416682','EPI_ISL_417194','EPI_ISL_419769','EPI_ISL_417106','EPI_ISL_416690','EPI_ISL_421204','EPI_ISL_419711','EPI_ISL_419714','EPI_ISL_420728'} # data structures nn = {} last_dist = {} @@ -228,10 +230,10 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): pair = row_labels[index][0] if row_labels[index][1] == isolate else row_labels[index][1] nn[isolate][pair] = distance # debug - if R == 5 and (isolate in in_lineage and pair not in in_lineage): - print('In: ' + isolate + '\tOut: ' + pair) - elif R == 5 and (pair in in_lineage and isolate not in in_lineage): - print('In: ' + pair + '\tOut: ' + isolate) +# if R == 5 and isolate == 'EPI_ISL_419767': +# print('In: ' + isolate + '\tOut: ' + pair + '\tDistance: ' + str(distance)) +# elif R == 5 and (pair in in_lineage and isolate not in in_lineage): +# print('In: ' + pair + '\tOut: ' + isolate) last_dist[isolate] = distance index = index + 1 # return completed dict From f9645039f303100a3987fa845ec47f2e605914fb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 29 Apr 2020 15:14:07 +0100 Subject: [PATCH 21/46] Fixed recursive addition of isolates --- PopPUNK/lineage_clustering.py | 152 ++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 72 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index af7fdfa1..ac6bbf95 100644 --- 
a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -18,6 +18,48 @@ from .utils import iterDistRows from .utils import readRfile +def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, lineage_clustering, lineage_clustering_information, nn): + """ Iteratively adds neighbours of isolates of lower or equal rank + to a lineage if an isolate of a higher rank is added. + + Args: + isolate (string) + Isolate of higher rank added to lineage. + rank (int) + Rank of isolate added to lineage. + lineage_index (int) + Label of current lineage. + lineage_clustering (dict) + Clustering of existing dataset. + lineage_clustering_information (dict) + Dict listing isolates by ranked distance from seed. + nn (nested dict) + Pre-calculated neighbour relationships. + + Returns: + lineage_clustering (dict) + Assignment of isolates to lineages. + """ + isolates_to_check = [isolate] + isolate_ranks = {} + isolate_ranks[isolate] = rank + + while len(isolates_to_check) > 0: + for isolate in isolates_to_check: + rank = isolate_ranks[isolate] + for isolate_neighbour in nn[isolate].keys(): + # if lineage of neighbour is higher, or it is null value (highest) + if lineage_clustering[isolate_neighbour] > lineage_index: + # check if rank of neighbour is lower than that of the current subject isolate + for neighbour_rank in range(1,rank): + if isolate_neighbour in lineage_clustering_information[neighbour_rank]: + lineage_clustering[isolate_neighbour] = lineage_index + isolates_to_check.append(isolate_neighbour) + isolate_ranks[isolate_neighbour] = neighbour_rank + isolates_to_check.remove(isolate) + + return lineage_clustering + def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): """ Identifies isolates corresponding to a particular lineage given a cluster seed. 
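[Editor's note: the helper above removes entries from isolates_to_check while iterating over the same list, which Python lists tolerate poorly. A minimal alternative sketch of the same breadth-first propagation using collections.deque as the worklist (same data structures assumed; illustrative, not the committed code):

from collections import deque

def propagate_lineage(isolate, rank, lineage_index, lineage_clustering,
                      lineage_clustering_information, nn):
    to_check = deque([(isolate, rank)])
    while to_check:
        current, current_rank = to_check.popleft()
        for neighbour in nn[current]:
            if lineage_clustering[neighbour] > lineage_index:
                # only pull in neighbours whose rank from the seed is
                # lower than the isolate that reached them
                for neighbour_rank in range(1, current_rank):
                    if neighbour in lineage_clustering_information[neighbour_rank]:
                        lineage_clustering[neighbour] = lineage_index
                        to_check.append((neighbour, neighbour_rank))
    return lineage_clustering

nn = {'X': {'Y': 0.1}, 'Y': {'X': 0.1}}
info = {1: ['Y'], 2: ['X']}
print(propagate_lineage('X', 2, 1, {'X': 1, 'Y': 9}, info, nn))  # Y joins lineage 1]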
@@ -66,13 +108,12 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n # add isolate to lineage lineage_clustering[isolate] = lineage_index # add neighbours of same or lower rank to lineage if unclustered - for isolate_neighbour in neighbours[isolate].keys(): - # if lineage of neighbour is higher, or it is null value (highest) - if lineage_clustering[isolate_neighbour] > lineage_index: - # check if rank of neighbour is lower than that of the current subject isolate - for neighbour_rank in range(1,rank): - if isolate_neighbour in lineage_clustering_information[neighbour_rank]: - lineage_clustering[isolate_neighbour] = lineage_index + lineage_clustering = cluster_neighbours_of_equal_or_lower_rank(isolate, + rank, + lineage_index, + lineage_clustering, + lineage_clustering_information, + neighbours) # if this represents a change from the previous iteration of lineage definitions # then the change needs to propagate through higher-ranked members if isolate in previous_lineage_clustering: @@ -123,15 +164,12 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val else: for index,(distance,(isolate1,isolate2)) in enumerate(zip(distances,row_labels)): if lineage_clustering[isolate1] == null_cluster_value: -# print('Found one!: ' + isolate1 + ' at distance ' + str(distance)) seed_isolate = isolate1 break elif lineage_clustering[isolate2] == null_cluster_value: seed_isolate = isolate2 -# print('Found one!: ' + isolate2 + ' at distance ' + str(distance)) break # return identified seed isolate -# print('Seed: ' + seed_isolate) return seed_isolate def get_lineage_clustering_information(seed_isolate, row_labels, distances): @@ -166,42 +204,6 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): # return information return lineage_info -#def old_get_lineage_clustering_information(seed_isolate, row_labels, distances): -# """ Generates the ranked distances needed for cluster -# definition. -# -# Args: -# seed_isolate (str) -# Isolate used to initiate lineage. -# row_labels (list of tuples) -# Pairs of isolates labelling each distance. -# distances (numpy array) -# Pairwise distances used for defining relationships. -# -# Returns: -# lineage_info (dict) -# Dict listing isolates by ranked distance from seed. 
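[Editor's note: the hunk below also bounds the neighbour scan explicitly, terminating when either all isolates have R ranks or the distance list is exhausted, rather than catching an exception. A toy illustration of that loop condition:

distances = [0.0, 0.1]
total_isolates, completed_isolates = 3, 0
index, num_distances = 0, len(distances)
while completed_isolates < total_isolates and index < num_distances:
    index += 1  # neighbour bookkeeping elided from this sketch
print(index)  # 2: the scan stops safely at the end of the distance list]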
-# -# """ -# # data structure -# lineage_info = defaultdict(list) -# # get subset of relevant distances -# comparisons_involving_seed = [n for n,(pair) in enumerate(row_labels) if seed_isolate in pair] -# distances_to_seed = distances[comparisons_involving_seed] -# # get ranks of data -# distance_ranks = rankdata(distances_to_seed) -# # get partners of seed isolate in each comparison -# pairs_involving_seed = [row_labels[x] for x in comparisons_involving_seed] -# seed_partners = [r if q == seed_isolate else q for (r,q) in pairs_involving_seed] -# # create a dict of lists of isolates for a given rank -# # enables later easy iterating through ranked distances -# for rank in np.unique(distance_ranks): -# lineage_info[rank] = [seed_partners[n] for n,r in enumerate(distance_ranks) if r == rank] -# # debug -# if seed_isolate in {'EPI_ISL_416602','EPI_ISL_416650','EPI_ISL_417950','EPI_ISL_417931','EPI_ISL_416649','EPI_ISL_417975','EPI_ISL_413593','EPI_ISL_416633','EPI_ISL_416643','EPI_ISL_413592','EPI_ISL_416653'} and rank < 10: -# print('Rank: ' + str(rank) + '\tpair: ' + str(lineage_info[rank]) + '\tseed: ' + seed_isolate) -# return lineage_info - def generate_nearest_neighbours(distances, row_labels, isolate_list, R): # data structures nn = {} @@ -211,29 +213,26 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): nn[i] = {} num_ranks[i] = 0 total_isolates = len(isolate_list) + num_distances = len(distances) completed_isolates = 0 index = 0 # iterate through distances until all nearest neighbours identified - while completed_isolates < total_isolates: - try: - distance = distances[index] - except: - sys.stderr.write('Not enough distances! Try reducing R\n') - exit(1) + while completed_isolates < total_isolates and index < num_distances: + distance = distances[index] + # iterate through all listed isolates for isolate in row_labels[index]: if isolate in num_ranks.keys() and num_ranks[isolate] < R: + # R is the number of unique distances so only increment if + # different from the last distance if isolate in last_dist.keys() and last_dist[isolate] != distance: num_ranks[isolate] = num_ranks[isolate] + 1 - if num_ranks[isolate] >= R: + # if maximum number of ranks reached, mark as complete + if num_ranks[isolate] == R: # stop at R as counting from 0 completed_isolates = completed_isolates + 1 + # if not, add as a nearest neighbour else: pair = row_labels[index][0] if row_labels[index][1] == isolate else row_labels[index][1] nn[isolate][pair] = distance - # debug -# if R == 5 and isolate == 'EPI_ISL_419767': -# print('In: ' + isolate + '\tOut: ' + pair + '\tDistance: ' + str(distance)) -# elif R == 5 and (pair in in_lineage and isolate not in in_lineage): -# print('In: ' + pair + '\tOut: ' + isolate) last_dist[isolate] = distance index = index + 1 # return completed dict @@ -276,11 +275,21 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] query_row_names = [row_labels[i] for i in query_match_indices] query_distances = np.take(distances, query_match_indices) - + + # get nn for query sequences + query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) + # add query-query comparisons + for query in query_nn.keys(): + nn[query] = query_nn[query] + # calculate max distances for each isolate max_distance = {} - for existing_isolate in nn.keys(): - max_distance[existing_isolate] = max(nn[existing_isolate].values()) + num_distances = {} + for 
isolate in nn.keys(): + neighbour_distances = set(nn[isolate].values()) + max_distance[isolate] = max(neighbour_distances) + num_distances[isolate] = len(neighbour_distances) # query-query comparisons may be < R + # iterate through the ref-query distances for index,(distance,(isolate1,isolate2)) in enumerate(zip(query_distances,query_row_names)): # identify ref-query matches @@ -302,18 +311,17 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust nn[ref][query] = distance # delete links if no longer high ranked match if distance < max_distance[ref]: - to_delete = [] - for other in nn[ref].keys(): - if nn[ref][other] == max_distance[ref]: - to_delete.append(other) - for other in to_delete: - del nn[ref][other] + if num_distances[ref] == R: + to_delete = [] + for other in nn[ref].keys(): + if nn[ref][other] == max_distance[ref]: + to_delete.append(other) + for other in to_delete: + del nn[ref][other] + else: + # if set from query-query distances < R + num_distances[ref] = num_distances[ref] + 1 max_distance[ref] = max(nn[ref].values()) - # get nn for query sequences - query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) - # merge dicts - for query in query_nn.keys(): - nn[query] = query_nn[query] # return updated dict return nn, lineage_clustering From da5b6df826fe1cafe622094f82cb8d3677354661 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 11:21:43 +0100 Subject: [PATCH 22/46] Update sharing of data structures between parallelised processes --- PopPUNK/lineage_clustering.py | 75 ++++++++++++++++------------------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index ac6bbf95..76ecd165 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -11,7 +11,15 @@ from scipy.stats import rankdata from collections import defaultdict import pickle +import collections from multiprocessing import Pool, Lock, Manager, RawArray, shared_memory, managers +try: + from multiprocessing import Pool, shared_memory + from multiprocessing.managers import SharedMemoryManager + NumpyShared = collections.namedtuple('NumpyShared', ('name', 'shape', 'dtype')) +except ImportError as e: + sys.stderr.write("This version of PopPUNK requires python v3.8 or higher\n") + sys.exit(0) from functools import partial # import poppunk package @@ -383,52 +391,37 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis previous_lineage_clustering[R] = {} # shared memory data structures - shm_distances = shared_memory.SharedMemory(create = True, size = distances.nbytes, name = 'shm_distances') - distances_shared = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm_distances.buf) - distances_shared[:] = distances[:] - - shm_distance_ranks = shared_memory.SharedMemory(create = True, size = distance_ranks.nbytes, name = 'shm_distance_ranks') - distance_ranks_shared = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = shm_distance_ranks.buf) - distance_ranks_shared[:] = distance_ranks[:] - - isolate_list_shared = shared_memory.ShareableList(isolate_list, name = 'shm_isolate_list') + with SharedMemoryManager() as smm: + # share sorted distances + distances_raw = smm.SharedMemory(size = distances.nbytes) + distances_shared_array = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_raw.buf) + distances_shared_array[:] = distances[:] + distances_shared_array = NumpyShared(name = 
distances_raw.name, shape = distances.shape, dtype = distances.dtype) + + # share distance ranks + distance_ranks_raw = smm.SharedMemory(size = distance_ranks.nbytes) + distance_ranks_shared_array = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_raw.buf) + distance_ranks_shared_array[:] = distance_ranks[:] + distance_ranks_shared_array = NumpyShared(name = distance_ranks_raw.name, shape = distance_ranks.shape, dtype = distance_ranks.dtype) + + # share isolate list + isolate_list_shared = smm.ShareableList(isolate_list) - # run clustering for an individual R - if num_processes == 1: - for R in rank_list: - lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = run_clustering_for_R(R, - null_cluster_value = null_cluster_value, - qlist = qlist, - existing_scheme = existing_scheme, - distances_length = distances.shape, - distances_type = distances.dtype) - - - else: - + # run clustering for an individual R with Pool(processes = num_processes) as pool: results = pool.map(partial(run_clustering_for_R, null_cluster_value = null_cluster_value, qlist = qlist, existing_scheme = existing_scheme, - distances_length = distances.shape, - distances_type = distances.dtype), + distances = distances_shared_array, + distance_ranks = distance_ranks_shared_array, + isolates = isolate_list_shared), rank_list) + # extract results from multiprocessing pool for n,result in enumerate(results): R = rank_list[n] lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = result - - # manage memory - shm_distances.close() - shm_distances.unlink() - del shm_distances - shm_distance_ranks.close() - shm_distance_ranks.unlink() - del shm_distance_ranks - isolate_list_shared.shm.close() - isolate_list_shared.shm.unlink() - del isolate_list_shared # store output with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: @@ -474,7 +467,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis return combined -def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances_length = None, distances_type = None): +def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. 
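[Editor's note: the sharing pattern above copies each array into a SharedMemoryManager segment, passes a lightweight NumpyShared handle (name, shape, dtype) through the Pool, and lets each worker reattach by name. A minimal self-contained sketch of the same hand-off (toy data; assumes Python 3.8+; worker and variable names are invented):

import collections
import numpy as np
from functools import partial
from multiprocessing import Pool, shared_memory
from multiprocessing.managers import SharedMemoryManager

NumpyShared = collections.namedtuple('NumpyShared', ('name', 'shape', 'dtype'))

def worker(rank, distances = None):
    # reattach to the named segment instead of unpickling the data
    if isinstance(distances, NumpyShared):
        shm = shared_memory.SharedMemory(name = distances.name)
        arr = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm.buf)
    return rank, float(arr.sum())

if __name__ == '__main__':
    data = np.arange(6, dtype = np.float32)
    with SharedMemoryManager() as smm:
        raw = smm.SharedMemory(size = data.nbytes)
        shared = np.ndarray(data.shape, dtype = data.dtype, buffer = raw.buf)
        shared[:] = data[:]
        handle = NumpyShared(name = raw.name, shape = data.shape, dtype = data.dtype)
        with Pool(processes = 2) as pool:
            print(pool.map(partial(worker, distances = handle), [1, 2]))]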
@@ -502,11 +495,11 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc """ # load shared memory objects - existing_distance_shm = shared_memory.SharedMemory(name = 'shm_distances') - distances = np.ndarray(distances_length, dtype = distances_type, buffer = existing_distance_shm.buf) - existing_rank_shm = shared_memory.SharedMemory(name = 'shm_distance_ranks') - distance_ranks = np.ndarray(distances_length, dtype = 'int64', buffer = existing_rank_shm.buf) - isolate_list = shared_memory.ShareableList(name = 'shm_isolate_list') + distances_shm = shared_memory.SharedMemory(name = distances.name) + distances = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_shm.buf) + distance_ranks_shm = shared_memory.SharedMemory(name = distance_ranks.name) + distance_ranks = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_shm.buf) + isolate_list = isolates # calculate row labels # this is inefficient but there appears to be no way of sharing From effb8909e329c8ac57e1bc9aea09be1088f7021d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 13:25:21 +0100 Subject: [PATCH 23/46] Alterations to command line options --- PopPUNK/__main__.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 0befe65e..66e4f0f2 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -95,6 +95,10 @@ def get_options(): help='Identify lineages within a strain', default=False, action='store_true') + mode.add_argument('--assign-lineages', + help='Assign isolates to an existing lineages scheme', + default=False, + action='store_true') mode.add_argument('--use-model', help='Apply a fitted model to a reference database to restore database files', default=False, @@ -176,15 +180,13 @@ def get_options(): '[default = False]', default=False, action='store_true') queryingGroup.add_argument('--accessory-only', help='Use an accessory-distance only model for assigning queries ' '[default = False]', default=False, action='store_true') - queryingGroup.add_argument('--assign-lineage', help='Assign to lineages rather than strains - can specify ' - 'lineages other than those in the reference database using "--existing-scheme"' - '[default = False]', default=False, action='store_true') # lineage clustering within strains lineagesGroup = parser.add_argument_group('Lineage analysis options') - lineagesGroup.add_argument('--R',help='Comma separated list of ranks used in lineage clustering [default = 1]', type = str, default = "1") + lineagesGroup.add_argument('--ranks',help='Comma separated list of ranks used in lineage clustering [default = 1,2,3]', type = str, default = "1,2,3") lineagesGroup.add_argument('--use-accessory',help='Use accessory distances for lineage definitions [default = use core distances]', action = 'store_true', default = False) - lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions', type = str, default = None) + lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions ' + ', required with "--assign-lineages"', type = str, default = None) # plot output faGroup = parser.add_argument_group('Further analysis options') @@ -276,6 +278,10 @@ def main(): if not args.use_mash: sketch_sizes = int(round(max(sketch_sizes.values())/64)) + # check if working with lineages + if args.lineage_clustering or args.assign_lineages: + 
rank_list = sorted([int(x) for x in args.ranks.split(',')]) + # check on file paths and whether files will be overwritten # confusing to overwrite command line parameter #if not args.full_db and not (args.create_db or args.easy_run or args.assign_query): @@ -525,15 +531,10 @@ def main(): #* within-strain analysis *# #* *# #******************************# - sys.stderr.write("Checking options here\n\n") if args.lineage_clustering: sys.stderr.write("Mode: Identifying lineages within a clade\n\n") - # process rankings to use - rank_list = sorted([int(x) for x in args.R.split(',')]) - # load distances - distances = None if args.distances is not None: distances = args.distances else: @@ -554,13 +555,13 @@ def main(): #* below) *# #* *# #*******************************# - elif args.assign_query: + elif args.assign_query or args.assign_lineages: assign_query(dbFuncs, args.ref_db, args.q_files, args.output, args.update_db, args.full_db, args.distances, args.microreact, args.cytoscape, kmers, sketch_sizes, args.ignore_length, args.estimated_length, args.threads, args.use_mash, args.mash, args.overwrite, args.plot_fit, args.no_stream, args.max_a_dist, args.model_dir, args.previous_clustering, args.external_clustering, args.core_only, args.accessory_only, args.phandango, args.grapetree, args.info_csv, - args.rapidnj, args.perplexity, args.assign_lineage, args.existing_scheme, args.R, args.use_accessory) + args.rapidnj, args.perplexity, args.assign_lineages, args.existing_scheme, rank_list, args.use_accessory) #******************************# #* *# @@ -688,7 +689,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances kmers, sketch_sizes, ignore_length, estimated_length, threads, use_mash, mash, overwrite, plot_fit, no_stream, max_a_dist, model_dir, previous_clustering, external_clustering, core_only, accessory_only, phandango, grapetree, - info_csv, rapidnj, perplexity, assign_lineage, existing_scheme, R, use_accessory): + info_csv, rapidnj, perplexity, assign_lineage, existing_scheme, rank_list, use_accessory): """Code for assign query mode. 
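[Editor's note: a compact sketch of how the reworked options drive rank parsing (argument names mirror the patch; the rest is illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--lineage-clustering', action = 'store_true',
                    help = 'Identify lineages within a strain')
parser.add_argument('--assign-lineages', action = 'store_true',
                    help = 'Assign isolates to an existing lineages scheme')
parser.add_argument('--ranks', type = str, default = '1,2,3',
                    help = 'Comma separated list of ranks used in lineage clustering')
args = parser.parse_args(['--lineage-clustering', '--ranks', '3,1'])
if args.lineage_clustering or args.assign_lineages:
    rank_list = sorted(int(x) for x in args.ranks.split(','))
    print(rank_list)  # [1, 3]]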
Written as a separate function so it can be called by pathogen.watch API """ @@ -834,12 +835,10 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) if assign_lineage: - rank_list = sorted([int(x) for x in R.split(',')]) expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme - #isolateClustering = {'combined': cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads)} - isolateClustering = cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads) + isolateClustering = cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads) # Prune distances to references only, if not full db dists_out = output + "/" + os.path.basename(output) + ".dists" From 01cd435a731ae8cbc268fe7eacc4ea0dc7f42eb9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 13:47:36 +0100 Subject: [PATCH 24/46] Renaming rank variables --- PopPUNK/lineage_clustering.py | 70 +++++++++++++++++------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 76ecd165..718021d3 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -68,7 +68,7 @@ def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, line return lineage_clustering -def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): +def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): """ Identifies isolates corresponding to a particular lineage given a cluster seed. @@ -79,7 +79,7 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n Dict listing isolates by ranked distance from seed. neighbours (nested dict) Pre-calculated neighbour relationships. - R (int) + rank (int) Maximum rank of neighbours used for clustering. lineage_index (int) Label of current lineage. 
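[Editor's note: ahead of the cluster_into_lineages renaming in the next hunks, picking the core or accessory column from the n x 2 distance matrix and sorting it once, keeping the permutation for relabelling pairs, can be sketched as (toy matrix):

import numpy as np

distMat = np.array([[0.01, 0.10],
                    [0.02, 0.20],
                    [0.00, 0.05]])
use_accessory = False
distance_index = 1 if use_accessory else 0   # column 0 core, column 1 accessory
distances = distMat[:, distance_index]

# sort once so downstream scans see ascending distances, and keep the
# permutation so the pair labels can be reordered the same way
distance_ranks = np.argsort(distances)
distances = distances[distance_ranks]
print(distances)  # [0.   0.01 0.02]]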
@@ -212,7 +212,7 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): # return information return lineage_info -def generate_nearest_neighbours(distances, row_labels, isolate_list, R): +def generate_nearest_neighbours(distances, row_labels, isolate_list, rank): # data structures nn = {} last_dist = {} num_ranks = {} @@ -229,13 +229,13 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): distance = distances[index] # iterate through all listed isolates for isolate in row_labels[index]: - if isolate in num_ranks.keys() and num_ranks[isolate] < R: + if isolate in num_ranks.keys() and num_ranks[isolate] < rank: # rank is the number of unique distances so only increment if # different from the last distance if isolate in last_dist.keys() and last_dist[isolate] != distance: num_ranks[isolate] = num_ranks[isolate] + 1 # if maximum number of ranks reached, mark as complete - if num_ranks[isolate] == R: # stop at R as counting from 0 + if num_ranks[isolate] == rank: # stop at rank as counting from 0 completed_isolates = completed_isolates + 1 # if not, add as a nearest neighbour else: @@ -246,7 +246,7 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): # return completed dict return nn -def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clustering, null_cluster_value): +def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_clustering, null_cluster_value): """ Updates the information on nearest neighbours, given a new set of ref-query and query-query distances. @@ -255,7 +255,7 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust Distances to be used for defining lineages. row_labels (list of tuples) Pairs of isolates labelling each distance. - R (int) + rank (int) Maximum rank of distance used to define nearest neighbours. qlist (list) List of queries to be added to existing dataset.
rank_list (list of int) @@ -367,7 +367,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # - then the functions can be reimplemented to run faster on a # sorted list distance_index = 1 if use_accessory else 0 - distances = X[:,distance_index] + distances = distMat[:,distance_index] # sort distances distance_ranks = np.argsort(distances) @@ -384,11 +384,11 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis previous_lineage_clustering = {} null_cluster_value = len(isolate_list) + 1 - for R in rank_list: - lineage_clustering[R] = {i:null_cluster_value for i in isolate_list} - lineage_seed[R] = {} - neighbours[R] = {} - previous_lineage_clustering[R] = {} + for rank in rank_list: + lineage_clustering[rank] = {i:null_cluster_value for i in isolate_list} + lineage_seed[rank] = {} + neighbours[rank] = {} + previous_lineage_clustering[rank] = {} # shared memory data structures with SharedMemoryManager() as smm: @@ -409,7 +409,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # run clustering for an individual R with Pool(processes = num_processes) as pool: - results = pool.map(partial(run_clustering_for_R, + results = pool.map(partial(run_clustering_for_rank, null_cluster_value = null_cluster_value, qlist = qlist, existing_scheme = existing_scheme, @@ -420,8 +420,8 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # extract results from multiprocessing pool for n,result in enumerate(results): - R = rank_list[n] - lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = result + rank = rank_list[n] + lineage_clustering[rank], lineage_seed[rank], neighbours[rank], previous_lineage_clustering[rank] = result # store output with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: @@ -429,7 +429,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # print output combined = {} - titles_list = ['Lineage_R' + str(R) for R in rank_list] + titles_list = ['Lineage_R' + str(rank) for rank in rank_list] lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: # print header @@ -444,15 +444,15 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis lFile.write('\n') # print lines for each isolate - for isolate in lineage_clustering[R].keys(): + for isolate in lineage_clustering[rank].keys(): lFile.write(isolate) - for n,R in enumerate(rank_list): - lFile.write(',' + str(lineage_clustering[R][isolate])) - lineage_string = str(lineage_clustering[R][isolate]) + for n,rank in enumerate(rank_list): + lFile.write(',' + str(lineage_clustering[rank][isolate])) + lineage_string = str(lineage_clustering[rank][isolate]) # include information on lineage clustering combined[titles_list[n]][isolate] = lineage_string - if lineage_clustering[R][isolate] != previous_lineage_clustering[R][isolate] and previous_lineage_clustering[R][isolate] != null_cluster_value: - lineage_string = str(previous_lineage_clustering[R][isolate]) + ':' + lineage_string + if lineage_clustering[rank][isolate] != previous_lineage_clustering[rank][isolate] and previous_lineage_clustering[rank][isolate] != null_cluster_value: + lineage_string = str(previous_lineage_clustering[rank][isolate]) + ':' + lineage_string if isolate in combined['Overall_lineage'].keys(): combined['Overall_lineage'][isolate] = combined['Overall_lineage'][isolate] + '-' + 
lineage_string else: @@ -467,13 +467,13 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis return combined -def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): +def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. Args: - R (int) + rank (int) Integer specifying the maximum rank of neighbour used for clustering. Should be changed to int list for hierarchical clustering. @@ -517,9 +517,9 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc with open(existing_scheme, 'rb') as pickle_file: lineage_clustering_overall, lineage_seed_overall, neighbours_overall, rank_list = pickle.load(pickle_file) # focus on relevant data - lineage_clustering = lineage_clustering_overall[R] - lineage_seed = lineage_seed_overall[R] - neighbours = neighbours_overall[R] + lineage_clustering = lineage_clustering_overall[rank] + lineage_seed = lineage_seed_overall[rank] + neighbours = neighbours_overall[rank] # add new queries to lineage clustering for q in qlist: lineage_clustering[q] = null_cluster_value @@ -527,7 +527,7 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc neighbours, lineage_clustering = update_nearest_neighbours(distances, row_labels, - R, + rank, qlist, neighbours, lineage_clustering, @@ -536,7 +536,7 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc neighbours = generate_nearest_neighbours(distances, row_labels, isolate_list, - R) + rank) # run clustering lineage_index = 1 @@ -567,7 +567,7 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc lineage_clustering = run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, - R, + rank, lineage_index, seed_isolate, previous_lineage_clustering, From 8e0875ff87660230a3b41e195d93e3c2f49464a9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 14:08:07 +0100 Subject: [PATCH 25/46] Input argument validation --- PopPUNK/__main__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 66e4f0f2..986c888c 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -281,6 +281,12 @@ def main(): # check if working with lineages if args.lineage_clustering or args.assign_lineages: rank_list = sorted([int(x) for x in args.ranks.split(',')]) + if min(rank_list) == 0 or max(rank_list) > 100: + sys.stderr.write('Ranks should be small non-zero integers for sensible results\n') + exit(1) + if args.assign_lineages and args.existing_scheme is None: + sys.stderr.write('Must provide an existing scheme (--existing-scheme) if assigning to lineages\n') + exit(1) # check on file paths and whether files will be overwritten # confusing to overwrite command line parameter @@ -628,7 +634,7 @@ def main(): if args.external_clustering: cluster_file = args.external_clustering isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} - elif args.viz_lineages: + if args.viz_lineages: cluster_file = args.viz_lineages #isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} isolateClustering = readLineages(cluster_file) @@ -758,7 +764,7 @@ def 
assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances ordered_queryList, query_distMat = calculateQueryDistances(dbFuncs, refList, q_files, kmers, estimated_length, output, use_mash, threads) - if not assign_lineage: + else: # Assign these distances as within or between strain model_prefix = ref_db if model_dir is not None: From 8cf51de4adb69e408835829f0652788e1ea208cd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 14:35:44 +0100 Subject: [PATCH 26/46] Move Q-Q distance calculation into a reused function --- PopPUNK/__main__.py | 5 +-- PopPUNK/lineage_clustering.py | 61 ----------------------------------- PopPUNK/network.py | 36 +++++++-------------- PopPUNK/sketchlib.py | 56 ++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 88 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 986c888c..caabfa46 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -18,7 +18,6 @@ from .sketchlib import no_sketchlib, checkSketchlibLibrary from .lineage_clustering import cluster_into_lineages -from .lineage_clustering import calculateQueryDistances from .lineage_clustering import readLineages from .network import fetchNetwork @@ -35,6 +34,8 @@ from .prune_db import prune_distance_matrix +from .sketchlib import calculateQueryQueryDistances + from .utils import setupDBFuncs from .utils import storePickle from .utils import readPickle @@ -761,7 +762,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances if assign_lineage: # Assign lineages by calculating query-query information - ordered_queryList, query_distMat = calculateQueryDistances(dbFuncs, refList, q_files, + ordered_queryList, query_distMat = calculateQueryQueryDistances(dbFuncs, refList, q_files, kmers, estimated_length, output, use_mash, threads) else: diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 718021d3..0b8672f3 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -579,67 +579,6 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist # return clustering return lineage_clustering, lineage_seed, neighbours, previous_lineage_clustering -def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, - queryDB, use_mash = False, threads = 1): - """Finds edges between queries and items in the reference database, - and modifies the network to include them. 
- - Args: - dbFuncs (list) - List of backend functions from :func:`~PopPUNK.utils.setupDBFuncs` - rlist (list) - List of reference names - qfile (str) - File containing queries - kmers (list) - List of k-mer sizes - estimated_length (int) - Estimated length of genome, if not calculated from data - queryDB (str) - Query database location - use_mash (bool) - Use the mash backend - threads (int) - Number of threads to use if new db created - (default = 1) - Returns: - qlist1 (list) - Ordered list of queries - distMat (numpy.array) - Query-query distances - """ - - constructDatabase = dbFuncs['constructDatabase'] - queryDatabase = dbFuncs['queryDatabase'] - readDBParams = dbFuncs['readDBParams'] - - # These are returned - qlist1 = None - distMat = None - - # Set up query names - qList, qSeqs = readRfile(qfile, oneSeq = use_mash) - queryFiles = dict(zip(qList, qSeqs)) - if use_mash == True: - rNames = None - qNames = qSeqs - else: - rNames = qList - qNames = rNames - - # Calculate all query-query distances too, if updating database - qlist1, qlist2, distMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads=threads) - - return qlist1, distMat - - def readLineages(clustCSV): """Read a previous reference clustering from CSV diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 40b7c494..6932bc38 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -18,6 +18,8 @@ from tempfile import mkstemp, mkdtemp from collections import defaultdict, Counter +from .sketchlib import calculateQueryQueryDistances + from .utils import iterDistRows from .utils import readClusters from .utils import readExternalClusters @@ -320,26 +322,10 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, distMat (numpy.array) Query-query distances """ - constructDatabase = dbFuncs['constructDatabase'] - queryDatabase = dbFuncs['queryDatabase'] - readDBParams = dbFuncs['readDBParams'] # initialise links data structure new_edges = [] assigned = set() - # These are returned - qlist1 = None - distMat = None - - # Set up query names - qList, qSeqs = readRfile(qfile, oneSeq = use_mash) - queryFiles = dict(zip(qList, qSeqs)) - if use_mash == True: - rNames = None - qNames = qSeqs - else: - rNames = qList - qNames = rNames # store links for each query in a list of edge tuples for assignment, (ref, query) in zip(assignments, iterDistRows(rlist, qList, self=False)): @@ -350,17 +336,17 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, # Calculate all query-query distances too, if updating database if queryQuery: sys.stderr.write("Calculating all query-query distances\n") + qlist, distMat = calculateQueryQueryDistances(dbFuncs, + refList, + q_files, + kmers, + estimated_length, + output, + use_mash, + threads) - qlist1, qlist2, distMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads=threads) queryAssignation = model.assign(distMat) - for assignment, (ref, query) in zip(queryAssignation, iterDistRows(qlist1, qlist2, self=True)): + for assignment, (ref, query) in zip(queryAssignation, iterDistRows(qlist, qlist, self=True)): if assignment == model.within_label: new_edges.append((ref, query)) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 27abe3a8..ee53dc4f 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -446,3 +446,59 @@ def 
queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num False, threads, use_gpu, deviceid) return(rNames, qNames, distMat) + +def calculateQueryQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, + queryDB, use_mash = False, threads = 1): + """Calculates distances between queries. + + Args: + dbFuncs (list) + List of backend functions from :func:`~PopPUNK.utils.setupDBFuncs` + rlist (list) + List of reference names + qfile (str) + File containing queries + kmers (list) + List of k-mer sizes + estimated_length (int) + Estimated length of genome, if not calculated from data + queryDB (str) + Query database location + use_mash (bool) + Use the mash backend + threads (int) + Number of threads to use if new db created + (default = 1) + + Returns: + qlist1 (list) + Ordered list of queries + distMat (numpy.array) + Query-query distances + """ + + constructDatabase = dbFuncs['constructDatabase'] + queryDatabase = dbFuncs['queryDatabase'] + readDBParams = dbFuncs['readDBParams'] + + # Set up query names + qList, qSeqs = readRfile(qfile, oneSeq = use_mash) + queryFiles = dict(zip(qList, qSeqs)) + if use_mash == True: + rNames = None + qNames = qSeqs + else: + rNames = qList + qNames = rNames + + # Calculate all query-query distances too, if updating database + qlist1, qlist2, distMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = queryDB, + queryPrefix = queryDB, + klist = kmers, + self = True, + number_plot_fits = 0, + threads=threads) + + return qlist1, distMat From a0c19512dd61cbe086808d8504880df7d0292a53 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 21:39:06 +0100 Subject: [PATCH 27/46] Get rid of null cluster value --- PopPUNK/lineage_clustering.py | 56 +++++++++++++---------------------- 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 0b8672f3..9aa156cd 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -57,7 +57,7 @@ def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, line rank = isolate_ranks[isolate] for isolate_neighbour in nn[isolate].keys(): # if lineage of neighbour is higher, or it is null value (highest) - if lineage_clustering[isolate_neighbour] > lineage_index: + if lineage_clustering[isolate_neighbour] is None or lineage_clustering[isolate_neighbour] > lineage_index: # check if rank of neighbour is lower than that of the current subject isolate for neighbour_rank in range(1,rank): if isolate_neighbour in lineage_clustering_information[neighbour_rank]: @@ -68,7 +68,7 @@ def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, line return lineage_clustering -def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): +def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering): """ Identifies isolates corresponding to a particular lineage given a cluster seed. @@ -87,9 +87,6 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n Isolate to used to initiate next lineage. previous_lineage_clustering (dict) Clustering of existing dataset in previous iteration. - null_cluster_value (int) - Null cluster value used for unsetting lineage assignments - where this may change due to altered neighbour relationships. 
Returns: lineage_clustering (dict) @@ -98,7 +95,7 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n """ # first make all R neighbours of the seed part of the lineage if unclustered for seed_neighbour in neighbours[seed_isolate]: - if lineage_clustering[seed_neighbour] > lineage_index: + if lineage_clustering[seed_neighbour] is None or lineage_clustering[seed_neighbour] > lineage_index: lineage_clustering[seed_neighbour] = lineage_index # iterate through ranks; for each isolate, test if neighbour belongs to this cluster # overwrite higher cluster values - default value is higher than number of isolates @@ -108,7 +105,7 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n # iterate through isolates of same rank for isolate in lineage_clustering_information[rank]: # test if clustered/belonging to a more peripheral cluster - if lineage_clustering[isolate] > lineage_index: + if lineage_clustering[isolate] is None or lineage_clustering[isolate] > lineage_index: # get clusters of nearest neighbours isolate_neighbour_clusters = [lineage_clustering[isolate_neighbour] for isolate_neighbour in neighbours[isolate].keys()] # if a nearest neighbour is in this cluster @@ -130,12 +127,12 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n if higher_rank > rank: for higher_ranked_isolate in lineage_clustering_information[rank]: if lineage_clustering[isolate] == lineage_index: - lineage_clustering[isolate] = null_cluster_value + lineage_clustering[isolate] = None return lineage_clustering -def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value, lineage_index, lineage_seed): +def get_seed_isolate(lineage_clustering, row_labels, distances, lineage_index, lineage_seed): """ Identifies the isolate used to initiate a cluster. Args: @@ -145,9 +142,6 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val Pairs of isolates labelling each distance. distances (numpy array) Pairwise distances used for defining relationships. - null_cluster_value (int) - Null cluster value used for unsetting lineage assignments - where this may change due to altered neighbour relationships. lineage_index (int) Label of current lineage.
lineage_seed (dict) @@ -166,15 +160,15 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val # if seed now belongs to a different lineage, then lineage has been overwritten # seed may be unassigned if neighbours have changed - but this does not overwrite the # lineage in the same way - if lineage_clustering[original_seed_isolate] == null_cluster_value or lineage_clustering[original_seed_isolate] == lineage_index: + if lineage_clustering[original_seed_isolate] is None or lineage_clustering[original_seed_isolate] == lineage_index: seed_isolate = original_seed_isolate # else identify a new seed from the closest pair else: for index,(distance,(isolate1,isolate2)) in enumerate(zip(distances,row_labels)): - if lineage_clustering[isolate1] == null_cluster_value: + if lineage_clustering[isolate1] is None: seed_isolate = isolate1 break - elif lineage_clustering[isolate2] == null_cluster_value: + elif lineage_clustering[isolate2] is None: seed_isolate = isolate2 break # return identified seed isolate @@ -246,7 +240,7 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, rank): # return completed dict return nn -def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_clustering, null_cluster_value): +def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_clustering): """ Updates the information on nearest neighbours, given a new set of ref-query and query-query distances. @@ -263,9 +257,6 @@ def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_cl Pre-calculated neighbour relationships. lineage_clustering (dict) Clustering of existing dataset. - null_cluster_value (int) - Null cluster value used for unsetting lineage assignments - where this may change due to altered neighbour relationships. 
Returns: nn (nested dict) @@ -312,9 +303,9 @@ def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_cl if ref is not None: if distance <= max_distance[ref]: # unset isolate and references - lineage_clustering[ref] = null_cluster_value + lineage_clustering[ref] = None for neighbour in nn[ref]: - lineage_clustering[neighbour] = null_cluster_value + lineage_clustering[neighbour] = None # add neighbours nn[ref][query] = distance # delete links if no longer high ranked match @@ -383,9 +374,8 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None lineage_clustering = {} previous_lineage_clustering = {} - null_cluster_value = len(isolate_list) + 1 for rank in rank_list: - lineage_clustering[rank] = {i:null_cluster_value for i in isolate_list} + lineage_clustering[rank] = {i:None for i in isolate_list} lineage_seed[rank] = {} neighbours[rank] = {} previous_lineage_clustering[rank] = {} @@ -410,7 +400,6 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None # run clustering for an individual R with Pool(processes = num_processes) as pool: results = pool.map(partial(run_clustering_for_rank, - null_cluster_value = null_cluster_value, qlist = qlist, existing_scheme = existing_scheme, distances = distances_shared_array, @@ -451,7 +440,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None lineage_string = str(lineage_clustering[rank][isolate]) # include information on lineage clustering combined[titles_list[n]][isolate] = lineage_string - if lineage_clustering[rank][isolate] != previous_lineage_clustering[rank][isolate] and previous_lineage_clustering[rank][isolate] != null_cluster_value: + if lineage_clustering[rank][isolate] != previous_lineage_clustering[rank][isolate] and previous_lineage_clustering[rank][isolate] is not None: lineage_string = str(previous_lineage_clustering[rank][isolate]) + ':' + lineage_string if isolate in combined['Overall_lineage'].keys(): combined['Overall_lineage'][isolate] = combined['Overall_lineage'][isolate] + '-' + lineage_string @@ -467,7 +456,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None return combined -def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): +def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. @@ -477,8 +466,6 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist Integer specifying the maximum rank of neighbour used for clustering. Should be changed to int list for hierarchical clustering. - null_cluster_value (int) - Used to denote as-yet-unclustered isolates. qlist (list) List of query sequences being added to an existing clustering. Should be included within rlist. 
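The distances and distance_ranks arguments documented above arrive as shared-memory handles rather than arrays: cluster_into_lineages publishes the NumPy buffers once, and each run_clustering_for_rank worker re-attaches to them by name. A minimal sketch of that pattern, assuming Python 3.8+; NumpyShared and _min_at_rank are hypothetical stand-ins, not the PopPUNK implementation:

    from collections import namedtuple
    from functools import partial
    from multiprocessing import Pool, shared_memory
    from multiprocessing.managers import SharedMemoryManager

    import numpy as np

    # Hypothetical handle: just enough metadata for a worker to re-attach
    NumpyShared = namedtuple('NumpyShared', 'name shape dtype')

    def _min_at_rank(rank, distances = None):
        # re-attach to the shared block by name; the buffer is not copied
        shm = shared_memory.SharedMemory(name = distances.name)
        dists = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm.buf)
        result = (rank, float(np.sort(dists)[rank - 1]))  # rank-th smallest distance
        shm.close()
        return result

    if __name__ == '__main__':
        X = np.random.rand(100)
        with SharedMemoryManager() as smm:
            shm = smm.SharedMemory(size = X.nbytes)
            shared = np.ndarray(X.shape, dtype = X.dtype, buffer = shm.buf)
            shared[:] = X
            handle = NumpyShared(name = shm.name, shape = X.shape, dtype = X.dtype)
            with Pool(processes = 2) as pool:
                # one job per rank, as in pool.map(partial(run_clustering_for_rank, ...), rank_list)
                print(pool.map(partial(_min_at_rank, distances = handle), [1, 2, 3]))

Binding the shared handles through functools.partial leaves rank as the only positional argument, which is why pool.map can iterate directly over rank_list.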
@@ -508,7 +495,7 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist # reorder by sorted distances row_labels = [row_labels[i] for i in distance_ranks] - lineage_clustering = {i:null_cluster_value for i in isolate_list} + lineage_clustering = {i:None for i in isolate_list} previous_lineage_clustering = lineage_clustering lineage_seed = {} neighbours = {} @@ -522,7 +509,7 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist neighbours = neighbours_overall[rank] # add new queries to lineage clustering for q in qlist: - lineage_clustering[q] = null_cluster_value + lineage_clustering[q] = None previous_lineage_clustering = lineage_clustering neighbours, lineage_clustering = update_nearest_neighbours(distances, @@ -530,8 +517,7 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist rank, qlist, neighbours, - lineage_clustering, - null_cluster_value) + lineage_clustering) else: neighbours = generate_nearest_neighbours(distances, row_labels, @@ -540,13 +526,12 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist # run clustering lineage_index = 1 - while null_cluster_value in lineage_clustering.values(): + while None in lineage_clustering.values(): # get seed isolate based on minimum pairwise distances seed_isolate = get_seed_isolate(lineage_clustering, row_labels, distances, - null_cluster_value, lineage_index, lineage_seed) lineage_seed[lineage_index] = seed_isolate @@ -570,8 +555,7 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist rank, lineage_index, seed_isolate, - previous_lineage_clustering, - null_cluster_value) + previous_lineage_clustering) # increment index for next lineage lineage_index = lineage_index + 1 From 14ed2f4dde474fff9d45d92a7952e2a642ac1d87 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 3 May 2020 06:20:52 +0100 Subject: [PATCH 28/46] Change dict initialisation --- PopPUNK/lineage_clustering.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 9aa156cd..c8a6da01 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -208,12 +208,10 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): def generate_nearest_neighbours(distances, row_labels, isolate_list, rank): # data structures - nn = {} + nn = defaultdict(dict) last_dist = {} num_ranks = {} - for i in isolate_list: - nn[i] = {} - num_ranks[i] = 0 + num_ranks = {i:0 for i in isolate_list} total_isolates = len(isolate_list) num_distances = len(distances) completed_isolates = 0 From 2d712d28d7200918959ac857b310647eef4637ef Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 3 May 2020 13:54:56 +0100 Subject: [PATCH 29/46] Merge CSV reading functions --- PopPUNK/__main__.py | 8 ++--- PopPUNK/lineage_clustering.py | 36 ++----------------- PopPUNK/network.py | 20 +++++------ PopPUNK/utils.py | 65 ++++++++++++++++------------------- 4 files changed, 44 insertions(+), 85 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index caabfa46..f996a6a5 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -18,7 +18,6 @@ from .sketchlib import no_sketchlib, checkSketchlibLibrary from .lineage_clustering import cluster_into_lineages -from .lineage_clustering import readLineages from .network import fetchNetwork from .network import constructNetwork @@ -41,10 +40,10 @@ from .utils import readPickle from .utils 
import writeTmpFile from .utils import qcDistMat -from .utils import readClusters from .utils import translate_distMat from .utils import update_distance_matrices from .utils import readRfile +from .utils import readIsolateTypeFromCsv # Minimum sketchlib version SKETCHLIB_MAJOR = 1 @@ -634,11 +633,10 @@ def main(): # Use external clustering if specified if args.external_clustering: cluster_file = args.external_clustering - isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} + isolateClustering = readIsolateTypeFromCsv(cluster_file, mode = 'external', return_dict = True) if args.viz_lineages: cluster_file = args.viz_lineages - #isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} - isolateClustering = readLineages(cluster_file) + isolateClustering = readIsolateTypeFromCsv(cluster_file, mode = 'lineages', return_dict = True) else: # identify existing analysis files model_prefix = args.ref_db diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index c8a6da01..25c886fb 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -211,7 +211,7 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, rank): nn = defaultdict(dict) last_dist = {} num_ranks = {} - num_ranks = {i:0 for i in isolate_list} + num_ranks = {i:0 for i in isolate_list} total_isolates = len(isolate_list) num_distances = len(distances) completed_isolates = 0 @@ -416,7 +416,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None # print output combined = {} - titles_list = ['Lineage_R' + str(rank) for rank in rank_list] + titles_list = ['Lineage_Rank_' + str(rank) for rank in rank_list] lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: # print header @@ -560,35 +560,3 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc # return clustering return lineage_clustering, lineage_seed, neighbours, previous_lineage_clustering - -def readLineages(clustCSV): - """Read a previous reference clustering from CSV - - Args: - clustCSV (str) - File name of CSV with previous cluster assignments - - Returns: - clusters (dict) - Or if return_dict is set keys are sample names, - values are cluster assignments. 
- """ - clusters = {} - relevant_headers = [] - header_elements = [] - - with open(clustCSV, 'r') as csv_file: - header = csv_file.readline() - # identify columns to include - header_elements = header.rstrip().split(",") - relevant_headers.append(header_elements.index('Overall_lineage')) - relevant_headers.extend([n for n,i in enumerate(header_elements) if re.search('Lineage_R',i)]) - for h in relevant_headers: - clusters[header_elements[h]] = {} - for line in csv_file: - elements = line.rstrip().split(",") - if elements[0] != header_elements[0]: - for h in relevant_headers: - clusters[header_elements[h]][elements[0]] = elements[h] - - return clusters diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6932bc38..9ad9bf73 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -21,8 +21,7 @@ from .sketchlib import calculateQueryQueryDistances from .utils import iterDistRows -from .utils import readClusters -from .utils import readExternalClusters +from .utils import readIsolateTypeFromCsv from .utils import readRfile def fetchNetwork(network_dir, model, refList, @@ -402,7 +401,7 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, return qlist1, distMat def printClusters(G, outPrefix = "_clusters.csv", oldClusterFile = None, - externalClusterCSV = None, printRef = True, printCSV = True): + externalClusterCSV = None, printRef = True, printCSV = True, clustering_type = 'combined'): """Get cluster assignments Also writes assignments to a CSV file @@ -413,26 +412,24 @@ def printClusters(G, outPrefix = "_clusters.csv", oldClusterFile = None, :func:`~addQueryToNetwork`) outPrefix (str) Prefix for output CSV - Default = "_clusters.csv" oldClusterFile (str) CSV with previous cluster assignments. Pass to ensure consistency in cluster assignment name. - Default = None externalClusterCSV (str) CSV with cluster assignments from any source. 
Will print a file relating these to new cluster assignments - Default = None printRef (bool) If false, print only query sequences in the output - Default = True printCSV (bool) Print results to file - Default = True + clustering_type (str) + Type of clustering network, used for comparison with old clusters + Default = 'combined' Returns: clustering (dict) @@ -446,7 +443,9 @@ oldNames = set() if oldClusterFile != None: - oldClusters = readClusters(oldClusterFile) +# oldClusters = readClusters(oldClusterFile) + oldAllClusters = readIsolateTypeFromCsv(oldClusterFile, mode = 'external', return_dict = False) + oldClusters = oldAllClusters[clustering_type] new_id = len(oldClusters) + 1 # 1-indexed while new_id in oldClusters: new_id += 1 # in case clusters have been merged @@ -557,7 +556,8 @@ d = defaultdict(list) # Read in external clusters - extClusters = readExternalClusters(extClusterFile) +# extClusters = readExternalClusters(extClusterFile) + extClusters = readIsolateTypeFromCsv(extClusterFile, mode = 'external', return_dict = False) # Go through each cluster (as defined by poppunk) and find the external # clusters that had previously been assigned to any sample in the cluster diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 695fa7fe..58d1f0fe 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -214,12 +214,12 @@ def qcDistMat(distMat, refList, queryList, a_max): return passed -def readClusters(clustCSV, return_dict=False): - """Read a previous reference clustering from CSV +def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): + """Read isolate types from CSV file. Args: clustCSV (str) - File name of CSV with previous cluster assignments + File name of CSV with isolate assignments return_dict (bool) If True, return a dict with sample->cluster instead of sets Returns: clusters (dict) sets containing samples in the cluster). Or if return_dict is set keys are sample names, values are cluster assignments.
""" + # data structures if return_dict: - clusters = {} + clusters = defaultdict(lambda: defaultdict(str)) else: - clusters = defaultdict(set) - - with open(clustCSV, 'r') as csv_file: - header = csv_file.readline() - for line in csv_file: - (sample, clust_id) = line.rstrip().split(",")[:2] + clusters = defaultdict(lambda: defaultdict(set)) + + # read CSV + clustersCsv = pd.read_csv(clustCSV, index_col = 0, quotechar='"') + + # select relevant columns according to mode + if mode == 'clusters': + type_columns = clustersCsv.columns[clustersCsv.columns.str.contains('_Cluster')] + elif mode == 'lineages': + type_columns = clustersCsv.columns[clustersCsv.columns.str.contains('Overall_lineage') | clustersCsv.columns.str.contains('Lineage_Rank_')] + elif mode == 'external': + type_columns = range(1,len(clustersCsv.columns)) + else: + sys.stderr.write('Unknown CSV reading mode: ' + mode + '\n') + exit(1) + + # read file + for row in clustersCsv.itertuples(): + for cls_idx in type_columns: + cluster_name = str(clustersCsv.columns[type_columns[cls_idx]]) + cluster_name = cluster_name.replace('__autocolour','') if return_dict: - clusters[sample] = clust_id + clusters[cluster_name][row.Index] = str(row[cls_idx]) else: - clusters[clust_id].add(sample) + clusters[cluster_name][str(row[cls_idx])].append([row.Index]) + # return data structure return clusters - -def readExternalClusters(clustCSV): - """Read a cluster definition from CSV (does not have to be PopPUNK - generated clusters). Rows samples, columns clusters. - - Args: - clustCSV (str) - File name of CSV with previous cluster assingments - - Returns: - extClusters (dict) - Dictionary of dictionaries of cluster assignments - (first key cluster assignment name, second key sample, value cluster assignment) - """ - extClusters = defaultdict(lambda: defaultdict(str)) - - extClustersFile = pd.read_csv(clustCSV, index_col = 0, quotechar='"') - for row in extClustersFile.itertuples(): - for cls_idx, cluster in enumerate(extClustersFile.columns): - extClusters[str(cluster)][row.Index] = str(row[cls_idx + 1]) - - return(extClusters) - - def translate_distMat(combined_list, core_distMat, acc_distMat): """Convert distances from a square form (2 NxN matrices) to a long form (1 matrix with n_comparisons rows and 2 columns). 
From ed6e7296d0206fbd82057e14a73ec82f1e5c9d53 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 3 May 2020 15:51:28 +0100 Subject: [PATCH 30/46] Print lineages CSV using writing function in plot.py --- PopPUNK/__main__.py | 2 +- PopPUNK/lineage_clustering.py | 67 +++++++++++++++-------------------- PopPUNK/plot.py | 23 ++++++------ 3 files changed, 41 insertions(+), 51 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index f996a6a5..fb3daf4f 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -840,7 +840,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) if assign_lineage: - expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' + expected_lineage_name = ref_db + '/' + ref_db + '_lineages.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme isolateClustering = cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 25c886fb..7b30dc5a 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -23,6 +23,8 @@ from functools import partial # import poppunk package +from .plot import writeClusterCsv + from .utils import iterDistRows from .utils import readRfile @@ -411,48 +413,35 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None lineage_clustering[rank], lineage_seed[rank], neighbours[rank], previous_lineage_clustering[rank] = result # store output - with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: + with open(output + "/" + output + '_lineages.pkl', 'wb') as pickle_file: pickle.dump([lineage_clustering, lineage_seed, neighbours, rank_list], pickle_file) - # print output - combined = {} - titles_list = ['Lineage_Rank_' + str(rank) for rank in rank_list] - lineage_output_name = output + "/" + output + "_lineage_clusters.csv" - with open(lineage_output_name, 'w') as lFile: - # print header - lFile.write('Id') - for t in titles_list: - lFile.write(',' + t + '__autocolor') - combined[t] = {} - lFile.write(',Overall_lineage') - combined['Overall_lineage'] = {} - if qlist is not None: - lFile.write(',Status') - lFile.write('\n') - - # print lines for each isolate - for isolate in lineage_clustering[rank].keys(): - lFile.write(isolate) - for n,rank in enumerate(rank_list): - lFile.write(',' + str(lineage_clustering[rank][isolate])) - lineage_string = str(lineage_clustering[rank][isolate]) - # include information on lineage clustering - combined[titles_list[n]][isolate] = lineage_string - if lineage_clustering[rank][isolate] != previous_lineage_clustering[rank][isolate] and previous_lineage_clustering[rank][isolate] is not None: - lineage_string = str(previous_lineage_clustering[rank][isolate]) + ':' + lineage_string - if isolate in combined['Overall_lineage'].keys(): - combined['Overall_lineage'][isolate] = combined['Overall_lineage'][isolate] + '-' + lineage_string - else: - combined['Overall_lineage'][isolate] = lineage_string - lFile.write(',' + combined['Overall_lineage'][isolate]) - if qlist is not None: - if isolate in qlist: - lFile.write(',Added') - else: - lFile.write(',Existing') - lFile.write('\n') + # process multirank lineages + overall_lineages = {} + overall_lineages = {'Rank_' + str(rank):{} for rank in rank_list} + overall_lineages['overall'] = {} + for 
isolate in isolate_list: + overall_lineage = None + for rank in rank_list: + overall_lineages['Rank_' + str(rank)][isolate] = lineage_clustering[rank][isolate] + if overall_lineage is None: + overall_lineage = str(lineage_clustering[rank][isolate]) + else: + overall_lineage = overall_lineage + '-' + str(lineage_clustering[rank][isolate]) + overall_lineages['overall'][isolate] = overall_lineage + + # print output as CSV + writeClusterCsv(output + "/" + output + '_lineages.csv', + isolate_list, + isolate_list, + overall_lineages, + output_format = 'phandango', + epiCsv = None, + queryNames = qlist, + suffix = '_Lineage') - return combined + # return lineages + return overall_lineages def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): """ Clusters isolates into lineages based on their diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 2c8b0b4e..13441f0c 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -399,7 +399,7 @@ def outputsForCytoscape(G, clustering, outPrefix, epiCsv, queryList = None, suff epiCsv, queryList) -def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = 'microreact', epiCsv = None, queryNames = None): +def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = 'microreact', epiCsv = None, queryNames = None, suffix = '_Cluster'): """Print CSV file of clustering and optionally epi data Writes CSV output of clusters which can be used as input to microreact and cytoscape. @@ -433,7 +433,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = if output_format == 'microreact': colnames = ['id'] for cluster_type in clustering: - col_name = cluster_type + '_Cluster__autocolour' + col_name = cluster_type + suffix + '__autocolour' colnames.append(col_name) if queryNames is not None: colnames.append('Status') @@ -441,7 +441,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = elif output_format == 'phandango': colnames = ['id'] for cluster_type in clustering: - col_name = cluster_type + '_Cluster' + col_name = cluster_type + suffix colnames.append(col_name) if queryNames is not None: colnames.append('Status') @@ -449,14 +449,14 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = elif output_format == 'grapetree': colnames = ['ID'] for cluster_type in clustering: - col_name = cluster_type + '_Cluster' + col_name = cluster_type + suffix colnames.append(col_name) if queryNames is not None: colnames.append('Status') elif output_format == 'cytoscape': colnames = ['id'] for cluster_type in clustering: - col_name = cluster_type + '_Cluster' + col_name = cluster_type + suffix colnames.append(col_name) if queryNames is not None: colnames.append('Status') @@ -485,7 +485,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = if output_format == 'microreact': d['id'].append(label) for cluster_type in clustering: - col_name = cluster_type + "_Cluster__autocolour" + col_name = cluster_type + suffix + "__autocolour" d[col_name].append(clustering[cluster_type][name]) if queryNames is not None: if name in queryNames: @@ -497,7 +497,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = elif output_format == 'phandango': d['id'].append(label) for cluster_type in clustering: - col_name = cluster_type + "_Cluster" + col_name = cluster_type + suffix d[col_name].append(clustering[cluster_type][name]) if queryNames is not None: 
if name in queryNames: @@ -509,7 +509,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = elif output_format == 'grapetree': d['ID'].append(label) for cluster_type in clustering: - col_name = cluster_type + "_Cluster" + col_name = cluster_type + suffix d[col_name].append(clustering[cluster_type][name]) if queryNames is not None: if name in queryNames: @@ -519,7 +519,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = elif output_format == 'cytoscape': d['id'].append(name) for cluster_type in clustering: - col_name = cluster_type + "_Cluster" + col_name = cluster_type + suffix d[col_name].append(clustering[cluster_type][name]) if queryNames is not None: if name in queryNames: @@ -530,7 +530,8 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = # avoid adding if len(columns_to_be_omitted) == 0: columns_to_be_omitted = ['id', 'Id', 'ID', 'combined_Cluster__autocolour', - 'core_Cluster__autocolour', 'accessory_Cluster__autocolour'] + 'core_Cluster__autocolour', 'accessory_Cluster__autocolour', + 'overall_Lineage'] for c in d: if c not in columns_to_be_omitted: columns_to_be_omitted.append(c) @@ -546,7 +547,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = else: sys.stderr.write("Cannot find " + name + " in clustering\n") sys.exit(1) - + # print CSV sys.stderr.write("Parsed data, now writing to CSV\n") try: From 464e1458b20e281de6ea73edfa7b6263b50eb408 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 3 May 2020 22:07:22 +0100 Subject: [PATCH 31/46] Use indices rather than names for saving memory --- PopPUNK/lineage_clustering.py | 30 +++++++++++++++------------ PopPUNK/utils.py | 39 +++++++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 7b30dc5a..e5a73a18 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -26,6 +26,7 @@ from .plot import writeClusterCsv from .utils import iterDistRows +from .utils import listDistInts from .utils import readRfile def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, lineage_clustering, lineage_clustering_information, nn): @@ -208,13 +209,13 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): # return information return lineage_info -def generate_nearest_neighbours(distances, row_labels, isolate_list, rank): +def generate_nearest_neighbours(distances, row_labels, isolate_indices, rank): # data structures nn = defaultdict(dict) last_dist = {} num_ranks = {} - num_ranks = {i:0 for i in isolate_list} - total_isolates = len(isolate_list) + num_ranks = {i:0 for i in isolate_indices} + total_isolates = len(isolate_indices) num_distances = len(distances) completed_isolates = 0 index = 0 @@ -280,6 +281,7 @@ def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_cl # add query-query comparisons for query in query_nn.keys(): nn[query] = query_nn[query] + print('Query: ' + str(query) + ' nn ' + str(query_nn[query])) # calculate max distances for each isolate max_distance = {} @@ -375,7 +377,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None previous_lineage_clustering = {} for rank in rank_list: - lineage_clustering[rank] = {i:None for i in isolate_list} + lineage_clustering[rank] = {i:None for i in range(0,len(isolate_list))} lineage_seed[rank] = {} neighbours[rank] = {} 
previous_lineage_clustering[rank] = {} @@ -420,14 +422,14 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None overall_lineages = {} overall_lineages = {'Rank_' + str(rank):{} for rank in rank_list} overall_lineages['overall'] = {} - for isolate in isolate_list: + for index,isolate in enumerate(isolate_list): overall_lineage = None for rank in rank_list: - overall_lineages['Rank_' + str(rank)][isolate] = lineage_clustering[rank][isolate] + overall_lineages['Rank_' + str(rank)][isolate] = lineage_clustering[rank][index] if overall_lineage is None: - overall_lineage = str(lineage_clustering[rank][isolate]) + overall_lineage = str(lineage_clustering[rank][index]) else: - overall_lineage = overall_lineage + '-' + str(lineage_clustering[rank][isolate]) + overall_lineage = overall_lineage + '-' + str(lineage_clustering[rank][index]) overall_lineages['overall'][isolate] = overall_lineage # print output as CSV @@ -474,15 +476,16 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc distance_ranks_shm = shared_memory.SharedMemory(name = distance_ranks.name) distance_ranks = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_shm.buf) isolate_list = isolates + isolate_indices = range(0,len(isolate_list)) # calculate row labels # this is inefficient but there appears to be no way of sharing # strings between processes efficiently - row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) + row_labels = listDistInts(isolate_list, isolate_list, self = True) # reorder by sorted distances row_labels = [row_labels[i] for i in distance_ranks] - lineage_clustering = {i:None for i in isolate_list} + lineage_clustering = {i:None for i in range(0,len(isolate_list))} previous_lineage_clustering = lineage_clustering lineage_seed = {} neighbours = {} @@ -495,20 +498,21 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc lineage_seed = lineage_seed_overall[rank] neighbours = neighbours_overall[rank] # add new queries to lineage clustering - for q in qlist: + q_indices = [isolate_list.index(q) for q in qlist] + for q in q_indices: lineage_clustering[q] = None previous_lineage_clustering = lineage_clustering neighbours, lineage_clustering = update_nearest_neighbours(distances, row_labels, rank, - qlist, + q_indices, neighbours, lineage_clustering) else: neighbours = generate_nearest_neighbours(distances, row_labels, - isolate_list, + isolate_indices, rank) # run clustering diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 58d1f0fe..ac7353ad 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -143,9 +143,7 @@ def iterDistRows(refSeqs, querySeqs, self=True): List of query sequence names. self (bool) Whether a self-comparison, used when constructing a database. - Requires refSeqs == querySeqs - Default is True Returns: ref, query (str, str) @@ -162,6 +160,43 @@ def iterDistRows(refSeqs, querySeqs, self=True): for ref in refSeqs: yield(ref, query) +def listDistInts(refSeqs, querySeqs, self=True): + """Gets the ref and query ID for each row of the distance matrix + + Returns an iterable with ref and query ID pairs by row. + + Args: + refSeqs (list) + List of reference sequence names. + querySeqs (list) + List of query sequence names. + self (bool) + Whether a self-comparison, used when constructing a database. + Requires refSeqs == querySeqs + Default is True + Returns: + ref, query (str, str) + Iterable of tuples with ref and query names for each distMat row. 
+ """ + dist_orders = [] + n = 0 + if self: + dist_orders = [(0,0)] * int(0.5 * len(refSeqs) * (len(refSeqs) - 1)) + if refSeqs != querySeqs: + raise RuntimeError('refSeqs must equal querySeqs for db building (self = true)') + for i in range(0, len(refSeqs)): + for j in range(i + 1, len(refSeqs)): + dist_orders[n] = (j, i) + n = n + 1 + else: + dist_orders = [(0,0)] * int(len(refSeqs) * (len(querySeqs))) + for i,query in enumerate(querySeqs): + for j,ref in enumerate(refSeqs): + dist_orders[n] = (j, i) + n = n + 1 + + # return final list of tuples + return dist_orders def writeTmpFile(fileList): """Writes a list to a temporary file. Used for turning variable into mash From 09172f569a2b7cbe060e2d1de3c72e66a4f688a1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 10:21:46 +0100 Subject: [PATCH 32/46] Fix pandas syntax --- PopPUNK/plot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 13441f0c..eb2ab87f 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -478,6 +478,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = nodeLabels = [r.split('/')[-1].split('.')[0] for r in nodeNames] # get example clustering name for validation + print('Clustering is ' + str(clustering)) example_cluster_title = list(clustering.keys())[0] for name, label in zip(nodeNames, nodeLabels): From ce3a7e97a170862a966d1fb2a2e6eb16c1a74db2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 13:19:05 +0100 Subject: [PATCH 33/46] Correct CSV reading/writing --- PopPUNK/__main__.py | 1 + PopPUNK/lineage_clustering.py | 3 ++- PopPUNK/plot.py | 1 - PopPUNK/utils.py | 12 ++++++------ 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index fb3daf4f..6ca5be8b 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -637,6 +637,7 @@ def main(): if args.viz_lineages: cluster_file = args.viz_lineages isolateClustering = readIsolateTypeFromCsv(cluster_file, mode = 'lineages', return_dict = True) + print('CLUSTERS: ' + str(isolateClustering)) else: # identify existing analysis files model_prefix = args.ref_db diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index e5a73a18..da900a33 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -281,7 +281,6 @@ def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_cl # add query-query comparisons for query in query_nn.keys(): nn[query] = query_nn[query] - print('Query: ' + str(query) + ' nn ' + str(query_nn[query])) # calculate max distances for each isolate max_distance = {} @@ -413,6 +412,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None for n,result in enumerate(results): rank = rank_list[n] lineage_clustering[rank], lineage_seed[rank], neighbours[rank], previous_lineage_clustering[rank] = result + print('Rank: ' + str(rank) + ' seeds: ' + str(lineage_seed[rank])) # store output with open(output + "/" + output + '_lineages.pkl', 'wb') as pickle_file: @@ -496,6 +496,7 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc # focus on relevant data lineage_clustering = lineage_clustering_overall[rank] lineage_seed = lineage_seed_overall[rank] + print('Rank: ' + str(rank) + '\nSEEDS!: ' + str(lineage_seed)) neighbours = neighbours_overall[rank] # add new queries to lineage clustering q_indices = [isolate_list.index(q) for q in qlist] diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 
eb2ab87f..13441f0c 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -478,7 +478,6 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = nodeLabels = [r.split('/')[-1].split('.')[0] for r in nodeNames] # get example clustering name for validation - print('Clustering is ' + str(clustering)) example_cluster_title = list(clustering.keys())[0] for name, label in zip(nodeNames, nodeLabels): diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index ac7353ad..80a0b62a 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -267,7 +267,7 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): """ # data structures if return_dict: - clusters = defaultdict(lambda: defaultdict(str)) + clusters = defaultdict(dict) else: clusters = defaultdict(lambda: defaultdict(set)) @@ -276,9 +276,9 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): # select relevant columns according to mode if mode == 'clusters': - type_columns = clustersCsv.columns[clustersCsv.columns.str.contains('_Cluster')] + type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('_Cluster' in col)] elif mode == 'lineages': - type_columns = clustersCsv.columns[clustersCsv.columns.str.contains('Overall_lineage') | clustersCsv.columns.str.contains('Lineage_Rank_')] + type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('Rank_' in col or 'overall' in col)] elif mode == 'external': type_columns = range(1,len(clustersCsv.columns)) else: @@ -288,12 +288,12 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): # read file for row in clustersCsv.itertuples(): for cls_idx in type_columns: - cluster_name = str(clustersCsv.columns[type_columns[cls_idx]]) + cluster_name = clustersCsv.columns[cls_idx] cluster_name = cluster_name.replace('__autocolour','') if return_dict: - clusters[cluster_name][row.Index] = str(row[cls_idx]) + clusters[cluster_name][row.Index] = str(row[cls_idx + 1]) else: - clusters[cluster_name][str(row[cls_idx])].append([row.Index]) + clusters[cluster_name][str(row[cls_idx + 1])].append([row.Index]) # return data structure return clusters From c82e24f4edc858de85550852ce38de91f945b6e1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 15:55:20 +0100 Subject: [PATCH 34/46] Simplify lineage clustering --- PopPUNK/__main__.py | 1 - PopPUNK/lineage_clustering.py | 143 ++++++---------------------------- 2 files changed, 22 insertions(+), 122 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 6ca5be8b..fb3daf4f 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -637,7 +637,6 @@ def main(): if args.viz_lineages: cluster_file = args.viz_lineages isolateClustering = readIsolateTypeFromCsv(cluster_file, mode = 'lineages', return_dict = True) - print('CLUSTERS: ' + str(isolateClustering)) else: # identify existing analysis files model_prefix = args.ref_db diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index da900a33..fb400ddf 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -29,57 +29,13 @@ from .utils import listDistInts from .utils import readRfile -def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, lineage_clustering, lineage_clustering_information, nn): - """ Iteratively adds neighbours of isolates of lower or equal rank - to a lineage if an isolate of a higher rank is added. - - Args: - isolate (string) - Isolate of higher rank added to lineage. 
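
The cls_idx + 1 offset introduced in the CSV reader above reflects how pandas itertuples() works: position 0 of each yielded tuple is the row label, so DataFrame column n sits at tuple position n + 1. A minimal illustration on hypothetical data:

    import pandas as pd

    df = pd.DataFrame({'Rank_1': [4, 7]}, index = ['s1', 's2'])
    for row in df.itertuples():
        print(row.Index, row[0 + 1])
    # s1 4
    # s2 7
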
- rank (int) - Rank of isolate added to lineage. - lineage_index (int) - Label of current lineage. - lineage_clustering (dict) - Clustering of existing dataset. - lineage_clustering_information (dict) - Dict listing isolates by ranked distance from seed. - nn (nested dict) - Pre-calculated neighbour relationships. - - Returns: - lineage_clustering (dict) - Assignment of isolates to lineages. - """ - isolates_to_check = [isolate] - isolate_ranks = {} - isolate_ranks[isolate] = rank - - while len(isolates_to_check) > 0: - for isolate in isolates_to_check: - rank = isolate_ranks[isolate] - for isolate_neighbour in nn[isolate].keys(): - # if lineage of neighbour is higher, or it is null value (highest) - if lineage_clustering[isolate_neighbour] is None or lineage_clustering[isolate_neighbour] > lineage_index: - # check if rank of neighbour is lower than that of the current subject isolate - for neighbour_rank in range(1,rank): - if isolate_neighbour in lineage_clustering_information[neighbour_rank]: - lineage_clustering[isolate_neighbour] = lineage_index - isolates_to_check.append(isolate_neighbour) - isolate_ranks[isolate_neighbour] = neighbour_rank - isolates_to_check.remove(isolate) - - return lineage_clustering - -def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering): +def run_lineage_clustering(lineage_clustering, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering): """ Identifies isolates corresponding to a particular lineage given a cluster seed. Args: lineage_clustering (dict) Clustering of existing dataset. - lineage_clustering_information (dict) - Dict listing isolates by ranked distance from seed. neighbours (nested dict) Pre-calculated neighbour relationships. rank (int) @@ -96,43 +52,28 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n Assignment of isolates to lineages. 
""" - # first make all R neighbours of the seed part of the lineage if unclustered - for seed_neighbour in neighbours[seed_isolate]: - if lineage_clustering[seed_neighbour] is None or lineage_clustering[seed_neighbour] > lineage_index: - lineage_clustering[seed_neighbour] = lineage_index - # iterate through ranks; for each isolate, test if neighbour belongs to this cluster - # overwrite higher cluster values - default value is higer than number of isolates - # when querying, allows smaller clusters to be merged into more basal ones as connections - # made - for rank in lineage_clustering_information.keys(): - # iterate through isolates of same rank - for isolate in lineage_clustering_information[rank]: - # test if clustered/belonging to a more peripheral cluster + # generate sets of neighbours based on rank + neighbour_set = defaultdict(frozenset) + for isolate in neighbours.keys(): + neighbour_set[isolate] = frozenset(neighbours[isolate].keys()) + # initiate lineage as the seed isolate and immediate unclustered neighbours + in_lineage = {seed_isolate} + for seed_neighbours in neighbour_set[seed_isolate]: + if lineage_clustering[seed_neighbours] is None or lineage_clustering[seed_neighbours] > lineage_index: + in_lineage.add(seed_neighbours) + lineage_clustering[seed_neighbours] = lineage_index + # iterate through other isolates until converged on a stable clustering + alterations = len(neighbours.keys()) + while alterations > 0: + alterations = 0 + for isolate in neighbour_set.keys(): if lineage_clustering[isolate] is None or lineage_clustering[isolate] > lineage_index: - # get clusters of nearest neighbours - isolate_neighbour_clusters = [lineage_clustering[isolate_neighbour] for isolate_neighbour in neighbours[isolate].keys()] - # if a nearest neighbour is in this cluster - if lineage_index in isolate_neighbour_clusters: - # add isolate to lineage + intersection_size = in_lineage.intersection(neighbour_set[isolate]) + if intersection_size is not None and len(intersection_size) > 0: + in_lineage.add(isolate) lineage_clustering[isolate] = lineage_index - # add neighbours of same or lower rank to lineage if unclustered - lineage_clustering = cluster_neighbours_of_equal_or_lower_rank(isolate, - rank, - lineage_index, - lineage_clustering, - lineage_clustering_information, - neighbours) - # if this represents a change from the previous iteration of lineage definitions - # then the change needs to propagate through higher-ranked members - if isolate in previous_lineage_clustering: - if previous_lineage_clustering[isolate] == lineage_index and lineage_clustering[isolate] != lineage_index: - for higher_rank in lineage_clustering_information.keys(): - if higher_rank > rank: - for higher_ranked_isolate in lineage_clustering_information[rank]: - if lineage_clustering[isolate] == lineage_index: - lineage_clustering[isolate] = None - - + alterations = alterations + 1 + # return final clustering return lineage_clustering def get_seed_isolate(lineage_clustering, row_labels, distances, lineage_index, lineage_seed): @@ -177,38 +118,6 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, lineage_index, l # return identified seed isolate return seed_isolate -def get_lineage_clustering_information(seed_isolate, row_labels, distances): - """ Generates the ranked distances needed for cluster - definition. - - Args: - seed_isolate (str) - Isolate used to initiate lineage. - row_labels (list of tuples) - Pairs of isolates labelling each distance. 
- distances (numpy array) - Pairwise distances used for defining relationships. - - Returns: - lineage_info (dict) - Dict listing isolates by ranked distance from seed. - - """ - # data structure - lineage_info = defaultdict(list) - rank = 0 - last_dist = -1 - # iterate through distances recording rank - for index,distance in enumerate(distances): - if seed_isolate in row_labels[index]: - if distance > last_dist: - rank = rank + 1 - last_dist = distance - pair = row_labels[index][0] if row_labels[index][1] == seed_isolate else row_labels[index][1] - lineage_info[rank].append(pair) - # return information - return lineage_info - def generate_nearest_neighbours(distances, row_labels, isolate_indices, rank): # data structures nn = defaultdict(dict) @@ -412,7 +321,6 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None for n,result in enumerate(results): rank = rank_list[n] lineage_clustering[rank], lineage_seed[rank], neighbours[rank], previous_lineage_clustering[rank] = result - print('Rank: ' + str(rank) + ' seeds: ' + str(lineage_seed[rank])) # store output with open(output + "/" + output + '_lineages.pkl', 'wb') as pickle_file: @@ -496,7 +404,6 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc # focus on relevant data lineage_clustering = lineage_clustering_overall[rank] lineage_seed = lineage_seed_overall[rank] - print('Rank: ' + str(rank) + '\nSEEDS!: ' + str(lineage_seed)) neighbours = neighbours_overall[rank] # add new queries to lineage clustering q_indices = [isolate_list.index(q) for q in qlist] @@ -534,15 +441,9 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc # record status of seed isolate lineage_clustering[seed_isolate] = lineage_index - - # get information for lineage clustering - lineage_clustering_information = get_lineage_clustering_information(seed_isolate, - row_labels, - distances) - + # cluster the lineages lineage_clustering = run_lineage_clustering(lineage_clustering, - lineage_clustering_information, neighbours, rank, lineage_index, From f75160a72eb49fdf10ef3c5387b9e5568284fae1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 21:33:13 +0100 Subject: [PATCH 35/46] Edit to pass tests --- PopPUNK/__main__.py | 2 ++ PopPUNK/network.py | 32 +++++++++++++++++++++++--------- PopPUNK/utils.py | 15 ++++++++++----- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index fb3daf4f..159c8364 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -279,6 +279,7 @@ def main(): sketch_sizes = int(round(max(sketch_sizes.values())/64)) # check if working with lineages + rank_list = [] if args.lineage_clustering or args.assign_lineages: rank_list = sorted([int(x) for x in args.ranks.split(',')]) if min(rank_list) == 0 or max(rank_list) > 100: @@ -653,6 +654,7 @@ def main(): # Read in network and cluster assignment genomeNetwork, cluster_file = fetchNetwork(prev_clustering, model, rlist, args.core_only, args.accessory_only) + isolateClustering = readIsolateTypeFromCsv(cluster_file, mode = 'clusters', return_dict = True) # prune the network and dictionary of assignments genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(viz_subset)) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9ad9bf73..4930bef4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -326,6 +326,20 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, new_edges = [] assigned = set() + # 
These are returned + qlist1 = None + distMat = None + + # Set up query names + qList, qSeqs = readRfile(qfile, oneSeq = use_mash) + queryFiles = dict(zip(qList, qSeqs)) + if use_mash == True: + rNames = None + qNames = qSeqs + else: + rNames = qList + qNames = rNames + # store links for each query in a list of edge tuples for assignment, (ref, query) in zip(assignments, iterDistRows(rlist, qList, self=False)): if assignment == model.within_label: @@ -335,17 +349,17 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, # Calculate all query-query distances too, if updating database if queryQuery: sys.stderr.write("Calculating all query-query distances\n") - qlist, distMat = calculateQueryQueryDistances(dbFuncs, - refList, - q_files, + qlist1, distMat = calculateQueryQueryDistances(dbFuncs, + rNames, + qfile, kmers, estimated_length, - output, + queryDB, use_mash, threads) queryAssignation = model.assign(distMat) - for assignment, (ref, query) in zip(queryAssignation, iterDistRows(qlist, qlist, self=True)): + for assignment, (ref, query) in zip(queryAssignation, iterDistRows(qlist1, qlist1, self=True)): if assignment == model.within_label: new_edges.append((ref, query)) @@ -384,7 +398,7 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, number_plot_fits = 0, threads = threads) queryAssignation = model.assign(distMat) - + # identify any links between queries and store in the same links dict # links dict now contains lists of links both to original database and new queries for assignment, (query1, query2) in zip(queryAssignation, iterDistRows(qlist1, qlist2, self=True)): @@ -443,10 +457,10 @@ def printClusters(G, outPrefix = "_clusters.csv", oldClusterFile = None, oldNames = set() if oldClusterFile != None: -# oldClusters = readClusters(oldClusterFile) oldAllClusters = readIsolateTypeFromCsv(oldClusterFile, mode = 'external', return_dict = False) - oldCluster = oldAllClusters[clustering_type] - new_id = len(oldClusters) + 1 # 1-indexed + oldClusters = oldAllClusters[list(oldAllClusters.keys())[0]] + print('oldCluster is ' + str(oldClusters)) + new_id = len(oldClusters.keys()) + 1 # 1-indexed while new_id in oldClusters: new_id += 1 # in case clusters have been merged diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 0e9c2efb..0a8f0133 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -268,22 +268,25 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): if return_dict: clusters = defaultdict(dict) else: - clusters = defaultdict(lambda: defaultdict(set)) + clusters = {} # read CSV clustersCsv = pd.read_csv(clustCSV, index_col = 0, quotechar='"') # select relevant columns according to mode if mode == 'clusters': - type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('_Cluster' in col)] + type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('Cluster' in col)] elif mode == 'lineages': type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('Rank_' in col or 'overall' in col)] elif mode == 'external': - type_columns = range(1,len(clustersCsv.columns)) + if len(clustersCsv.columns) == 1: + type_columns = [0] + elif len(clustersCsv.columns) > 1: + type_columns = range((len(clustersCsv.columns)-1)) else: sys.stderr.write('Unknown CSV reading mode: ' + mode + '\n') exit(1) - + # read file for row in clustersCsv.itertuples(): for cls_idx in type_columns: @@ -292,7 +295,9 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): if return_dict: 
clusters[cluster_name][row.Index] = str(row[cls_idx + 1]) else: - clusters[cluster_name][str(row[cls_idx + 1])].append([row.Index]) + if cluster_name not in clusters.keys(): + clusters[cluster_name] = defaultdict(set) + clusters[cluster_name][str(row[cls_idx + 1])].add(row.Index) # return data structure return clusters From 46b1e1adab7379903cdb92c71a093005ffa91ace Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 21:59:45 +0100 Subject: [PATCH 36/46] Output directory generation for lineage clustering --- PopPUNK/__main__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 159c8364..a4fad21d 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -550,6 +550,14 @@ def main(): refList, queryList, self, distMat = readPickle(distances) + # make directory for new output files + if not os.path.isdir(args.output): + try: + os.makedirs(args.output) + except OSError: + sys.stderr.write("Cannot create output directory\n") + sys.exit(1) + # run lineage clustering if self: isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) @@ -812,7 +820,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances # Update the network + ref list # only update network if assigning to strains - if full_db is False or assign_lineage is False: + if full_db is False and assign_lineage is False: mashOrder = refList + ordered_queryList newRepresentativesNames, newRepresentativesFile = extractReferences(genomeNetwork, mashOrder, output, refList) genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(newRepresentativesNames)) From e72dcbc22bf3a840ce86af6f2b74be6f8b33a605 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 22:00:34 +0100 Subject: [PATCH 37/46] Add lineages clustering tests to run_test.py --- test/run_test.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index fd2bed8b..0dc981ab 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -72,6 +72,13 @@ sys.stderr.write("Running microreact visualisations (--generate-viz)\n") subprocess.run("python ../poppunk-runner.py --generate-viz --distances example_db_mash/example_db_mash.dists --ref-db example_db_mash --output example_viz --microreact --subset subset.txt", shell=True, check=True) +# lineage clustering +sys.stderr.write("Running lineage clustering test (--lineage-clustering)\n") +subprocess.run("python ../poppunk-runner.py --lineage-clustering --distances example_db/example_db.dists --output example_lineages --ranks 1,2,3,5", shell=True, check=True) + +# assign query to lineages +sys.stderr.write("Running query assignment (--assign-lineages)\n") +subprocess.run("python ../poppunk-runner.py --assign-lineages --q-files queries.txt --distances example_db/example_db.dists --ref-db example_db --existing-scheme example_lineages/example_lineages_lineages.pkl --output example_lineage_query --update-db", shell=True, check=True) # tests of other command line programs (TODO) From abf8d6ea9949ab94cc36e180cbeb111e03cf18f3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 22:07:49 +0100 Subject: [PATCH 38/46] Add mash tests for lineage clustering --- test/run_test.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index 0dc981ab..8f736720 100755 --- a/test/run_test.py +++ 
b/test/run_test.py @@ -80,6 +80,15 @@ sys.stderr.write("Running query assignment (--assign-lineages)\n") subprocess.run("python ../poppunk-runner.py --assign-lineages --q-files queries.txt --distances example_db/example_db.dists --ref-db example_db --existing-scheme example_lineages/example_lineages_lineages.pkl --output example_lineage_query --update-db", shell=True, check=True) +# lineage clustering with mash +sys.stderr.write("Running lineage clustering test (--lineage-clustering)\n") +subprocess.run("python ../poppunk-runner.py --lineage-clustering --distances example_db_mash/example_db_mash.dists --output example_lineages_mash --ranks 1,2,3,5 --use-mash", shell=True, check=True) + +# assign query to lineages with mash +sys.stderr.write("Running query assignment (--assign-lineages)\n") +subprocess.run("python ../poppunk-runner.py --assign-lineages --q-files queries.txt --distances example_db_mash/example_db_mash.dists --ref-db example_db_mash --existing-scheme example_lineages_mash/example_lineages_mash_lineages.pkl --output example_lineage_mash_query --update-db --use-mash", shell=True, check=True) + + # tests of other command line programs (TODO) sys.stderr.write("Tests completed\n") From 4717b0f2e698a4526950bbd3af8e8be340a4606c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 05:33:38 +0100 Subject: [PATCH 39/46] Simplify variables --- PopPUNK/__main__.py | 4 ++-- PopPUNK/lineage_clustering.py | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index a4fad21d..40909770 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -560,9 +560,9 @@ def main(): # run lineage clustering if self: - isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, isolate_list = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) else: - isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, isolate_list = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) #*******************************# #* *# diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index fb400ddf..4c353b98 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -234,7 +234,7 @@ def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_cl # return updated dict return nn, lineage_clustering -def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False, num_processes = 1): +def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list = None, qlist = None, existing_scheme = None, use_accessory = False, num_processes = 1): """ Clusters isolates into lineages based on their relative distances. @@ -245,7 +245,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None rank_list (list of int) Integers specifying the maximum rank of neighbours used for clustering. 
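
For orientation, a sketch of how the renamed entry point is driven from __main__.py (refList and distMat as returned by readPickle; the rank list mirrors --ranks 1,2,3,5 in the tests above):

    overall_lineages = cluster_into_lineages(distMat,
                                             rank_list = [1, 2, 3, 5],
                                             output = 'example_lineages',
                                             isolate_list = refList,
                                             use_accessory = False,
                                             existing_scheme = None,
                                             num_processes = 1)
    print(overall_lineages['Rank_1'])   # per-isolate lineage labels at rank 1
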
- rlist (list) + isolate_list (list) List of reference sequences. qlist (list) List of query sequences being added to an existing clustering. @@ -274,9 +274,6 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None distance_ranks = np.argsort(distances) distances = distances[distance_ranks] - # determine whether ref-ref or ref-query analysis - isolate_list = rlist - # determine whether novel analysis or modifying existing analysis use_existing = False neighbours = {} @@ -292,6 +289,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None # shared memory data structures with SharedMemoryManager() as smm: + # share sorted distances distances_raw = smm.SharedMemory(size = distances.nbytes) distances_shared_array = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_raw.buf) From cb1f0d1336e3f38068f1ab6b87e55320f054541d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 17:03:47 +0100 Subject: [PATCH 40/46] Reimplement lineage clustering using networks --- PopPUNK/lineage_clustering.py | 427 ++++++++++++---------------------- 1 file changed, 145 insertions(+), 282 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 4c353b98..9ec63062 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -12,6 +12,7 @@ from collections import defaultdict import pickle import collections +import networkx as nx from multiprocessing import Pool, Lock, Manager, RawArray, shared_memory, managers try: from multiprocessing import Pool, shared_memory @@ -28,11 +29,60 @@ from .utils import iterDistRows from .utils import listDistInts from .utils import readRfile +from .utils import update_distance_matrices -def run_lineage_clustering(lineage_clustering, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering): +def get_chunk_ranges(N, nb): + step = N / nb + range_sizes = [(round(step*i), round(step*(i+1))) for i in range(nb)] + # extend to end of distMat + range_sizes[len(range_sizes) - 1] = (range_sizes[len(range_sizes) - 1][0],N) + # return ranges + return range_sizes + +def rank_distance_matrix(bounds, distances = None): + # load distance matrix from shared memory + distances_shm = shared_memory.SharedMemory(name = distances.name) + distances = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_shm.buf) + # rank relevant slide of distance matrix + ranks = np.apply_along_axis(rankdata, 1, distances[slice(*bounds),:], method = 'ordinal') + return ranks + +def get_nearest_neighbours(rank, isolates = None, ranks = None): + # data structure + nn = {} + # load shared ranks + ranks_shm = shared_memory.SharedMemory(name = ranks.name) + ranks = np.ndarray(ranks.shape, dtype = ranks.dtype, buffer = ranks_shm.buf) + # apply along axis + for i in isolates: + nn[i] = defaultdict(frozenset) + isolate_ranks = ranks[i,:] + closest_ranked = np.ravel(np.where(isolate_ranks <= rank)) + neighbours = frozenset(closest_ranked.tolist()) + nn[i] = neighbours + # return dict + return nn + + +def pick_seed_isolate(G, distances = None): + # identify unclustered isolates + unclustered_isolates = list(nx.isolates(G)) + # select minimum distance between unclustered isolates + minimum_distance_between_unclustered_isolates = np.amin(distances[unclustered_isolates,unclustered_isolates],axis = 0) + # select occurrences of this distance + minimum_distance_coordinates = np.where(distances == minimum_distance_between_unclustered_isolates) + # identify 
case where both isolates are unclustered + for i in range(len(minimum_distance_coordinates[0])): + if minimum_distance_coordinates[0][i] in unclustered_isolates and minimum_distance_coordinates[1][i] in unclustered_isolates: + seed_isolate = minimum_distance_coordinates[0][i] + break + # return unclustered isolate with minimum distance to another isolate + return seed_isolate + +def get_lineage(G, neighbours, seed_isolate, lineage_index): """ Identifies isolates corresponding to a particular lineage given a cluster seed. - + Args: lineage_clustering (dict) Clustering of existing dataset. @@ -50,189 +100,31 @@ def run_lineage_clustering(lineage_clustering, neighbours, rank, lineage_index, Returns: lineage_clustering (dict) Assignment of isolates to lineages. - + """ - # generate sets of neighbours based on rank - neighbour_set = defaultdict(frozenset) - for isolate in neighbours.keys(): - neighbour_set[isolate] = frozenset(neighbours[isolate].keys()) # initiate lineage as the seed isolate and immediate unclustered neighbours in_lineage = {seed_isolate} - for seed_neighbours in neighbour_set[seed_isolate]: - if lineage_clustering[seed_neighbours] is None or lineage_clustering[seed_neighbours] > lineage_index: - in_lineage.add(seed_neighbours) - lineage_clustering[seed_neighbours] = lineage_index + G.nodes[seed_isolate]['lineage'] = lineage_index + for seed_neighbour in neighbours[seed_isolate]: + if nx.is_isolate(G, seed_neighbour): + G.add_edge(seed_isolate, seed_neighbour) + G.nodes[seed_neighbour]['lineage'] = lineage_index + in_lineage.add(seed_neighbour) # iterate through other isolates until converged on a stable clustering alterations = len(neighbours.keys()) while alterations > 0: alterations = 0 - for isolate in neighbour_set.keys(): - if lineage_clustering[isolate] is None or lineage_clustering[isolate] > lineage_index: - intersection_size = in_lineage.intersection(neighbour_set[isolate]) + for isolate in neighbours.keys(): + if nx.is_isolate(G, isolate): + intersection_size = in_lineage.intersection(neighbours[isolate]) if intersection_size is not None and len(intersection_size) > 0: + for i in intersection_size: + G.add_edge(isolate, i) + G.nodes[isolate]['lineage'] = lineage_index in_lineage.add(isolate) - lineage_clustering[isolate] = lineage_index alterations = alterations + 1 # return final clustering - return lineage_clustering - -def get_seed_isolate(lineage_clustering, row_labels, distances, lineage_index, lineage_seed): - """ Identifies the isolate used to initiate a cluster. - - Args: - lineage_clustering (dict) - Clustering of existing dataset. - row_labels (list of tuples) - Pairs of isolates labelling each distance. - distances (numpy array) - Pairwise distances used for defining relationships. - lineage_index (int) - Label of current lineage. - lineage_seed (dict) - Dict of seeds used to initiate pre-existing lineage definitions. - - Returns: - seed_isolate (str) - Isolate to used to initiate next lineage. 
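
One numpy subtlety worth noting in pick_seed_isolate above: indexing with two equal-length lists, as in distances[unclustered_isolates, unclustered_isolates], selects only the diagonal entries (i, i); the full cross-product block between unclustered isolates needs np.ix_. A small demonstration:

    import numpy as np

    d = np.arange(16, dtype = float).reshape(4, 4)
    rows = [0, 2]
    print(d[rows, rows])             # diagonal pairs only: [ 0. 10.]
    print(d[np.ix_(rows, rows)])     # full 2x2 block between those isolates
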
- - """ - # variable to return - seed_isolate = None - # first test if there is an existing seed - if lineage_index in lineage_seed.keys(): - original_seed_isolate = lineage_seed[lineage_index] - # if seed now belongs to a different lineage, then lineage has been overwritten - # seed may be unassigned if neighbours have changed - but this does not overwrite the - # lineage in the same way - if lineage_clustering[original_seed_isolate] is None or lineage_clustering[original_seed_isolate] == lineage_index: - seed_isolate = original_seed_isolate - # else identify a new seed from the closest pair - else: - for index,(distance,(isolate1,isolate2)) in enumerate(zip(distances,row_labels)): - if lineage_clustering[isolate1] is None: - seed_isolate = isolate1 - break - elif lineage_clustering[isolate2] is None: - seed_isolate = isolate2 - break - # return identified seed isolate - return seed_isolate - -def generate_nearest_neighbours(distances, row_labels, isolate_indices, rank): - # data structures - nn = defaultdict(dict) - last_dist = {} - num_ranks = {} - num_ranks = {i:0 for i in isolate_indices} - total_isolates = len(isolate_indices) - num_distances = len(distances) - completed_isolates = 0 - index = 0 - # iterate through distances until all nearest neighbours identified - while completed_isolates < total_isolates and index < num_distances: - distance = distances[index] - # iterate through all listed isolates - for isolate in row_labels[index]: - if isolate in num_ranks.keys() and num_ranks[isolate] < rank: - # R is the number of unique distances so only increment if - # different from the last distance - if isolate in last_dist.keys() and last_dist[isolate] != distance: - num_ranks[isolate] = num_ranks[isolate] + 1 - # if maximum number of ranks reached, mark as complete - if num_ranks[isolate] == rank: # stop at R as counting from 0 - completed_isolates = completed_isolates + 1 - # if not, add as a nearest neighbour - else: - pair = row_labels[index][0] if row_labels[index][1] == isolate else row_labels[index][1] - nn[isolate][pair] = distance - last_dist[isolate] = distance - index = index + 1 - # return completed dict - return nn - -def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_clustering): - """ Updates the information on nearest neighbours, given - a new set of ref-query and query-query distances. - - Args: - distances (numpy array) - Distances to be used for defining lineages. - row_labels (list of tuples) - Pairs of isolates labelling each distance. - rank (int) - Maximum rank of distance used to define nearest neighbours. - qlist (list) - List of queries to be added to existing dataset. - nn (nested dict) - Pre-calculated neighbour relationships. - lineage_clustering (dict) - Clustering of existing dataset. - - Returns: - nn (nested dict) - Updated neighbour relationships. - lineage_clustering (dict) - Updated assignment of isolates to lineage. 
- - """ - # iterate through isolates and test whether any comparisons with - # newly-added queries replace or supplement existing neighbours - - # data structures for altered entries - nn_new = {} - # pre-process to extract ref-query distances first - query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] - query_row_names = [row_labels[i] for i in query_match_indices] - query_distances = np.take(distances, query_match_indices) - - # get nn for query sequences - query_nn = generate_nearest_neighbours(distances, row_labels, qlist, rank) - # add query-query comparisons - for query in query_nn.keys(): - nn[query] = query_nn[query] - - # calculate max distances for each isolate - max_distance = {} - num_distances = {} - for isolate in nn.keys(): - neighbour_distances = set(nn[isolate].values()) - max_distance[isolate] = max(neighbour_distances) - num_distances[isolate] = len(neighbour_distances) # query-query comparisons may be < R - - # iterate through the ref-query distances - for index,(distance,(isolate1,isolate2)) in enumerate(zip(query_distances,query_row_names)): - # identify ref-query matches - ref = None - query = None - if isolate1 in max_distance.keys() and isolate2 not in max_distance.keys(): - ref = isolate1 - query = isolate2 - elif isolate2 in max_distance.keys() and isolate1 not in max_distance.keys(): - ref = isolate2 - query = isolate1 - if ref is not None: - if distance <= max_distance[ref]: - # unset isolate and references - lineage_clustering[ref] = None - for neighbour in nn[ref]: - lineage_clustering[neighbour] = None - # add neighbours - nn[ref][query] = distance - # delete links if no longer high ranked match - if distance < max_distance[ref]: - if num_distances[ref] == rank: - to_delete = [] - for other in nn[ref].keys(): - if nn[ref][other] == max_distance[ref]: - to_delete.append(other) - for other in to_delete: - del nn[ref][other] - else: - # if set from query-query distances < R - num_distances[ref] = num_distances[ref] + 1 - max_distance[ref] = max(nn[ref].values()) - # return updated dict - return nn, lineage_clustering + return G def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list = None, qlist = None, existing_scheme = None, use_accessory = False, num_processes = 1): """ Clusters isolates into lineages based on their @@ -262,67 +154,74 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list combined (dict) Assignment of each isolate to clusters by all ranks used. 
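
The body that follows leans on the Python 3.8 shared_memory machinery; a minimal round trip of the pattern (a sketch, not the NumpyShared wrapper itself): create a block under a SharedMemoryManager, copy an array in, then re-attach by name as a worker process would.

    import numpy as np
    from multiprocessing import shared_memory
    from multiprocessing.managers import SharedMemoryManager

    with SharedMemoryManager() as smm:
        a = np.arange(6, dtype = np.float64)
        block = smm.SharedMemory(size = a.nbytes)
        shared = np.ndarray(a.shape, dtype = a.dtype, buffer = block.buf)
        shared[:] = a[:]
        # a worker re-attaches by name, as run_clustering_for_rank does
        shm = shared_memory.SharedMemory(name = block.name)
        view = np.ndarray(a.shape, dtype = np.float64, buffer = shm.buf)
        print(view.sum())   # 15.0
        shm.close()
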
""" + # data structures + lineage_clustering = defaultdict(dict) + overall_lineage_seeds = defaultdict(dict) + overall_lineages = defaultdict(dict) + if existing_scheme is not None: + with open(existing_scheme, 'rb') as pickle_file: + lineage_clustering, overall_lineage_seeds, rank_list = pickle.load(pickle_file) + + # generate square distance matrix + seqLabels, coreMat, accMat = update_distance_matrices(isolate_list, distMat) + if use_accessory: + distances = accMat + else: + distances = coreMat + try: + assert seqLabels == isolate_list + except: + sys.stderr.write('Isolates in wrong order?') + exit(1) + + # list indices and set self-self to Inf + isolate_indices = [n for n,i in enumerate(isolate_list)] + for i in isolate_indices: + distances[i,i] = np.Inf - # process distance matrix - # - this should be sorted (PyTorch allows this to be done on GPUs) - # - then the functions can be reimplemented to run faster on a - # sorted list - distance_index = 1 if use_accessory else 0 - distances = distMat[:,distance_index] - - # sort distances - distance_ranks = np.argsort(distances) - distances = distances[distance_ranks] - - # determine whether novel analysis or modifying existing analysis - use_existing = False - neighbours = {} - lineage_seed = {} - lineage_clustering = {} - previous_lineage_clustering = {} - - for rank in rank_list: - lineage_clustering[rank] = {i:None for i in range(0,len(isolate_list))} - lineage_seed[rank] = {} - neighbours[rank] = {} - previous_lineage_clustering[rank] = {} - - # shared memory data structures + # get ranks of distances per row + chunk_boundaries = get_chunk_ranges(distances.shape[0], num_processes) with SharedMemoryManager() as smm: - # share sorted distances + # share isolate list + isolate_list_shared = smm.ShareableList(isolate_indices) + + # create shared memory object for distances distances_raw = smm.SharedMemory(size = distances.nbytes) distances_shared_array = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_raw.buf) distances_shared_array[:] = distances[:] distances_shared_array = NumpyShared(name = distances_raw.name, shape = distances.shape, dtype = distances.dtype) - # share distance ranks + # parallelise ranking of distances across CPUs + with Pool(processes = num_processes) as pool: + ranked_array = pool.map(partial(rank_distance_matrix, + distances = distances_shared_array), + chunk_boundaries) + + # concatenate ranks into shared memory + distance_ranks = np.concatenate(ranked_array) distance_ranks_raw = smm.SharedMemory(size = distance_ranks.nbytes) distance_ranks_shared_array = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_raw.buf) distance_ranks_shared_array[:] = distance_ranks[:] distance_ranks_shared_array = NumpyShared(name = distance_ranks_raw.name, shape = distance_ranks.shape, dtype = distance_ranks.dtype) - - # share isolate list - isolate_list_shared = smm.ShareableList(isolate_list) - - # run clustering for an individual R + + # parallelise neighbour identification for each rank with Pool(processes = num_processes) as pool: results = pool.map(partial(run_clustering_for_rank, - qlist = qlist, - existing_scheme = existing_scheme, - distances = distances_shared_array, - distance_ranks = distance_ranks_shared_array, - isolates = isolate_list_shared), + distances_input = distances_shared_array, + distance_ranks_input = distance_ranks_shared_array, + isolates = isolate_list_shared, + previous_seeds = overall_lineage_seeds), rank_list) # extract results from 
multiprocessing pool for n,result in enumerate(results): rank = rank_list[n] - lineage_clustering[rank], lineage_seed[rank], neighbours[rank], previous_lineage_clustering[rank] = result + lineage_clustering[rank], overall_lineage_seeds[rank] = result # store output with open(output + "/" + output + '_lineages.pkl', 'wb') as pickle_file: - pickle.dump([lineage_clustering, lineage_seed, neighbours, rank_list], pickle_file) + pickle.dump([lineage_clustering, overall_lineage_seeds, rank_list], pickle_file) # process multirank lineages overall_lineages = {} @@ -351,7 +250,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list # return lineages return overall_lineages -def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): +def run_clustering_for_rank(rank, distances_input = None, distance_ranks_input = None, isolates = None, previous_seeds = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. @@ -377,79 +276,43 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc """ # load shared memory objects - distances_shm = shared_memory.SharedMemory(name = distances.name) - distances = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_shm.buf) - distance_ranks_shm = shared_memory.SharedMemory(name = distance_ranks.name) - distance_ranks = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_shm.buf) + distances_shm = shared_memory.SharedMemory(name = distances_input.name) + distances = np.ndarray(distances_input.shape, dtype = distances_input.dtype, buffer = distances_shm.buf) + distance_ranks_shm = shared_memory.SharedMemory(name = distance_ranks_input.name) + distance_ranks = np.ndarray(distance_ranks_input.shape, dtype = distance_ranks_input.dtype, buffer = distance_ranks_shm.buf) isolate_list = isolates isolate_indices = range(0,len(isolate_list)) + + # load previous scheme + seeds = {} + if previous_seeds is not None: + seeds = previous_seeds[rank] + + # create graph structure + G = nx.Graph() + G.add_nodes_from(isolate_indices) + G.nodes.data('lineage', default = 0) - # calculate row labels - # this is inefficient but there appears to be no way of sharing - # strings between processes efficiently - row_labels = listDistInts(isolate_list, isolate_list, self = True) - # reorder by sorted distances - row_labels = [row_labels[i] for i in distance_ranks] - - lineage_clustering = {i:None for i in range(0,len(isolate_list))} - previous_lineage_clustering = lineage_clustering - lineage_seed = {} - neighbours = {} + # identify nearest neighbours + nn = get_nearest_neighbours(rank, + ranks = distance_ranks_input, + isolates = isolate_list) - if existing_scheme is not None: - with open(existing_scheme, 'rb') as pickle_file: - lineage_clustering_overall, lineage_seed_overall, neighbours_overall, rank_list = pickle.load(pickle_file) - # focus on relevant data - lineage_clustering = lineage_clustering_overall[rank] - lineage_seed = lineage_seed_overall[rank] - neighbours = neighbours_overall[rank] - # add new queries to lineage clustering - q_indices = [isolate_list.index(q) for q in qlist] - for q in q_indices: - lineage_clustering[q] = None - previous_lineage_clustering = lineage_clustering - - neighbours, lineage_clustering = update_nearest_neighbours(distances, - row_labels, - rank, - q_indices, - neighbours, - lineage_clustering) - else: - 
neighbours = generate_nearest_neighbours(distances, - row_labels, - isolate_indices, - rank) - - # run clustering + # iteratively identify lineages lineage_index = 1 - while None in lineage_clustering.values(): - - # get seed isolate based on minimum pairwise distances - seed_isolate = get_seed_isolate(lineage_clustering, - row_labels, - distances, - lineage_index, - lineage_seed) - lineage_seed[lineage_index] = seed_isolate - - # seed isolate is None if a previously-existing cluster has been overwritten - # in which case pass over the lineage to keep nomenclature consistent - if seed_isolate is not None: - - # record status of seed isolate - lineage_clustering[seed_isolate] = lineage_index - - # cluster the lineages - lineage_clustering = run_lineage_clustering(lineage_clustering, - neighbours, - rank, - lineage_index, - seed_isolate, - previous_lineage_clustering) - - # increment index for next lineage + while nx.number_of_isolates(G) > 0: + if lineage_index in seeds.keys(): + seed_isolate = seeds[lineage_index] + else: + seed_isolate = pick_seed_isolate(G, distances = distances) + # skip over previously-defined seeds if amalgamated into different lineage now + if nx.is_isolate(G, seed_isolate): + seeds[lineage_index] = seed_isolate + G = get_lineage(G, nn, seed_isolate, lineage_index) lineage_index = lineage_index + 1 + # identify components and name lineages + lineage_clustering = {node:nodedata for (node, nodedata) in G.nodes(data='lineage')} + # return clustering - return lineage_clustering, lineage_seed, neighbours, previous_lineage_clustering + return lineage_clustering, seeds From 9b143e3b0327bfa0f286fc22eb34dc39ef54b976 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 20:11:45 +0100 Subject: [PATCH 41/46] Tidy up redundant code --- PopPUNK/lineage_clustering.py | 2 -- PopPUNK/utils.py | 38 ----------------------------------- 2 files changed, 40 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 9ec63062..4c00a170 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -27,8 +27,6 @@ from .plot import writeClusterCsv from .utils import iterDistRows -from .utils import listDistInts -from .utils import readRfile from .utils import update_distance_matrices def get_chunk_ranges(N, nb): diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 0a8f0133..b019f922 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -159,44 +159,6 @@ def iterDistRows(refSeqs, querySeqs, self=True): for ref in refSeqs: yield(ref, query) -def listDistInts(refSeqs, querySeqs, self=True): - """Gets the ref and query ID for each row of the distance matrix - - Returns an iterable with ref and query ID pairs by row. - - Args: - refSeqs (list) - List of reference sequence names. - querySeqs (list) - List of query sequence names. - self (bool) - Whether a self-comparison, used when constructing a database. - Requires refSeqs == querySeqs - Default is True - Returns: - ref, query (str, str) - Iterable of tuples with ref and query names for each distMat row. 
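
A toy run of the graph machinery assembled above (assuming get_lineage from this module is in scope): a seed plus rank-1 neighbour sets grow one connected component per lineage, with membership recorded on the 'lineage' node attribute.

    import networkx as nx

    nn = {0: frozenset({1}), 1: frozenset({0, 2}), 2: frozenset({1})}
    G = nx.Graph()
    G.add_nodes_from(nn.keys())
    G = get_lineage(G, nn, 0, 1)
    print(nx.number_of_isolates(G))   # 0 - all three isolates joined
    print(G.nodes[2]['lineage'])      # 1
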
- """ - dist_orders = [] - n = 0 - if self: - dist_orders = [(0,0)] * int(0.5 * len(refSeqs) * (len(refSeqs) - 1)) - if refSeqs != querySeqs: - raise RuntimeError('refSeqs must equal querySeqs for db building (self = true)') - for i in range(0, len(refSeqs)): - for j in range(i + 1, len(refSeqs)): - dist_orders[n] = (j, i) - n = n + 1 - else: - dist_orders = [(0,0)] * int(len(refSeqs) * (len(querySeqs))) - for i,query in enumerate(querySeqs): - for j,ref in enumerate(refSeqs): - dist_orders[n] = (j, i) - n = n + 1 - - # return final list of tuples - return dist_orders - def writeTmpFile(fileList): """Writes a list to a temporary file. Used for turning variable into mash input. From 905e25b2be2180a9f4835dbddf235b01660828f6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 20:43:24 +0100 Subject: [PATCH 42/46] Tidy up documentation --- PopPUNK/lineage_clustering.py | 85 ++++++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 16 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 4c00a170..a11f5fd7 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -13,7 +13,7 @@ import pickle import collections import networkx as nx -from multiprocessing import Pool, Lock, Manager, RawArray, shared_memory, managers +from multiprocessing import Pool, RawArray, shared_memory, managers try: from multiprocessing import Pool, shared_memory from multiprocessing.managers import SharedMemoryManager @@ -30,6 +30,19 @@ from .utils import update_distance_matrices def get_chunk_ranges(N, nb): + """ Calculates boundaries for dividing distances array + into chunks for parallelisation. + + Args: + N (int) + Number of rows in array + nb (int) + Number of blocks into which to divide array. + + Returns: + range_sizes (list of tuples) + Limits of blocks for dividing array. + """ step = N / nb range_sizes = [(round(step*i), round(step*(i+1))) for i in range(nb)] # extend to end of distMat @@ -38,6 +51,19 @@ def get_chunk_ranges(N, nb): return range_sizes def rank_distance_matrix(bounds, distances = None): + """ Ranks distances between isolates for each index (row) + isolate. + + Args: + bounds (2-tuple) + Range of rows to process in this thread. + distances (int) + Shared memory object storing pairwise distances. + + Returns: + ranks (numpy ndarray) + Ranks of distances for each row. + """ # load distance matrix from shared memory distances_shm = shared_memory.SharedMemory(name = distances.name) distances = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_shm.buf) @@ -46,6 +72,22 @@ def rank_distance_matrix(bounds, distances = None): return ranks def get_nearest_neighbours(rank, isolates = None, ranks = None): + """ Identifies sets of nearest neighbours for each isolate. + + Args: + rank (int) + Rank used in analysis. + isolates (int list) + List of isolate indices. + ranks (ndarray) + Shared memory object pointing to ndarray of + ranked pairwise distances. + + Returns: + nn (default dict of frozensets) + Dict indexed by isolates, values are a + frozen set of nearest neighbours. + """ # data structure nn = {} # load shared ranks @@ -63,6 +105,19 @@ def get_nearest_neighbours(rank, isolates = None, ranks = None): def pick_seed_isolate(G, distances = None): + """ Identifies seed isolate from the closest pair of + unclustered isolates. + + Args: + G (network) + Network with one node per isolate. + distances (ndarray of pairwise distances) + Pairwise distances between isolates. 
+ + Returns: + seed_isolate (int) + Index of isolate selected as seed. + """ # identify unclustered isolates unclustered_isolates = list(nx.isolates(G)) # select minimum distance between unclustered isolates @@ -82,23 +137,18 @@ def get_lineage(G, neighbours, seed_isolate, lineage_index): lineage given a cluster seed. Args: - lineage_clustering (dict) - Clustering of existing dataset. - neighbours (nested dict) + G (network) + Network with one node per isolate. + neighbours (dict of frozen sets) Pre-calculated neighbour relationships. - rank (int) - Maximum rank of neighbours used for clustering. + seed_isolate (int) + Index of isolate selected as seed. lineage_index (int) Label of current lineage. - seed_isolate (str) - Isolate to used to initiate next lineage. - previous_lineage_clustering (dict) - Clustering of existing dataset in previous iteration. Returns: - lineage_clustering (dict) - Assignment of isolates to lineages. - + G (network) + Network modified with new edges. """ # initiate lineage as the seed isolate and immediate unclustered neighbours in_lineage = {seed_isolate} @@ -135,11 +185,13 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list rank_list (list of int) Integers specifying the maximum rank of neighbours used for clustering. + output (string) + Prefix used for printing output files. isolate_list (list) List of reference sequences. qlist (list) List of query sequences being added to an existing clustering. - Should be included within rlist. + Should be included within isolate_list. existing_scheme (str) Path to pickle file containing lineage scheme to which isolates should be added. @@ -149,9 +201,10 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list Number of CPUs to use for calculations. Returns: - combined (dict) - Assignment of each isolate to clusters by all ranks used. + overall_lineages (nested dict) + Dict for each rank listing the lineage of each isolate. """ + # data structures lineage_clustering = defaultdict(dict) overall_lineage_seeds = defaultdict(dict) From df81bed4b27d7c21bfc6c53ef6c599cbd96b2587 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 20:50:23 +0100 Subject: [PATCH 43/46] Changing memory use in seed_isolate function --- PopPUNK/lineage_clustering.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index a11f5fd7..d553801a 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -57,7 +57,7 @@ def rank_distance_matrix(bounds, distances = None): Args: bounds (2-tuple) Range of rows to process in this thread. - distances (int) + distances (ndarray in shared memory) Shared memory object storing pairwise distances. Returns: @@ -79,7 +79,7 @@ def get_nearest_neighbours(rank, isolates = None, ranks = None): Rank used in analysis. isolates (int list) List of isolate indices. - ranks (ndarray) + ranks (ndarray in shared memory) Shared memory object pointing to ndarray of ranked pairwise distances. @@ -111,13 +111,16 @@ def pick_seed_isolate(G, distances = None): Args: G (network) Network with one node per isolate. - distances (ndarray of pairwise distances) + distances (ndarray in shared memory) Pairwise distances between isolates. Returns: seed_isolate (int) Index of isolate selected as seed. 
""" + # load distances from shared memory + distances_shm = shared_memory.SharedMemory(name = distances.name) + distances = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_shm.buf) # identify unclustered isolates unclustered_isolates = list(nx.isolates(G)) # select minimum distance between unclustered isolates @@ -355,7 +358,7 @@ def run_clustering_for_rank(rank, distances_input = None, distance_ranks_input = if lineage_index in seeds.keys(): seed_isolate = seeds[lineage_index] else: - seed_isolate = pick_seed_isolate(G, distances = distances) + seed_isolate = pick_seed_isolate(G, distances = distances_input) # skip over previously-defined seeds if amalgamated into different lineage now if nx.is_isolate(G, seed_isolate): seeds[lineage_index] = seed_isolate From 3ebd9bc54bf64f2bfc5496b832a259a5ef76e1ef Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 21:37:22 +0100 Subject: [PATCH 44/46] Change to minimum k-mer warning message --- PopPUNK/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index b019f922..80f0e586 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -463,9 +463,8 @@ def assembly_qc(assemblyList, klist, ignoreLengthOutliers, estimated_length): sys.stderr.write("WARNING: Average length over 10Mb - are these assemblies?\n") k_min = min(klist) - max_prob = 1/(pow(4, k_min)/float(genome_length) + 1) - if 1/(pow(4, k_min)/float(genome_length) + 1) > 0.05: - sys.stderr.write("Minimum k-mer length " + str(k_min) + " is too small for genome length " + str(genome_length) +"; please increase to avoid nonsense results\n") + if k_min < 6: + sys.stderr.write("Minimum k-mer length is 6 - any shorter is unlikely to generate sensible results.\n") exit(1) return (int(genome_length), max_prob) From f93700c045e8dc65a2c1b34f4ccd190af3f0e1cf Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 21:43:11 +0100 Subject: [PATCH 45/46] Change random match probability message --- PopPUNK/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 80f0e586..265ac7ea 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -463,8 +463,10 @@ def assembly_qc(assemblyList, klist, ignoreLengthOutliers, estimated_length): sys.stderr.write("WARNING: Average length over 10Mb - are these assemblies?\n") k_min = min(klist) + if 1/(pow(4, k_min)/float(genome_length) + 1) > 0.05: + sys.stderr.write("Minimum k-mer length " + str(k_min) + " is too small for genome length " + str(genome_length) +"; results will be adjusted for random match probabilities\n") if k_min < 6: - sys.stderr.write("Minimum k-mer length is 6 - any shorter is unlikely to generate sensible results.\n") + sys.stderr.write("Minimum k-mer length is too low; please increase to at least 6\n") exit(1) return (int(genome_length), max_prob) From b801b26df4ffde5f281ea6f6e3662c2d4114f9e7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 21:45:17 +0100 Subject: [PATCH 46/46] Define max_prob variable --- PopPUNK/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 265ac7ea..9618c945 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -463,7 +463,8 @@ def assembly_qc(assemblyList, klist, ignoreLengthOutliers, estimated_length): sys.stderr.write("WARNING: Average length over 10Mb - are these assemblies?\n") k_min = min(klist) - if 1/(pow(4, k_min)/float(genome_length) + 1) > 0.05: + max_prob = 
1/(pow(4, k_min)/float(genome_length) + 1) + if max_prob > 0.05: sys.stderr.write("Minimum k-mer length " + str(k_min) + " is too small for genome length " + str(genome_length) +"; results will be adjusted for random match probabilities\n") if k_min < 6: sys.stderr.write("Minimum k-mer length is too low; please increase to at least 6\n")
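
For concreteness, the random-match probability now stored in max_prob works out as follows for a 2 Mb genome: k = 9 fails the 5% threshold used above by a wide margin, while k = 13 passes comfortably.

    genome_length = 2000000
    for k_min in (9, 13):
        max_prob = 1/(pow(4, k_min)/float(genome_length) + 1)
        print(k_min, round(max_prob, 4))
    # 9 0.8841
    # 13 0.0289
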