Graph tool #83

Merged
merged 62 commits, Jul 16, 2020
Commits (62)
7260e50
First addition of graph-tools code
nickjcroucher May 7, 2020
09ebf6d
Include listDistInts routine in utils.py
nickjcroucher May 7, 2020
9eb7d84
Functioning model refinement with graph-tools
nickjcroucher May 7, 2020
09aacaf
Update extraction of references from network
nickjcroucher May 8, 2020
e2476af
More efficient extraction of references
nickjcroucher May 8, 2020
027b36d
Remove redundant imports and fix output
nickjcroucher May 8, 2020
5623a67
Switch from multiprocessing to OpenMP parallelisation using graph-tools
nickjcroucher May 8, 2020
88714dd
Fix network loading message
nickjcroucher May 9, 2020
145eee6
Graph loading function updated
nickjcroucher May 9, 2020
e75541f
Update visualisation code
nickjcroucher May 9, 2020
0109e0b
Refactor lineage_clustering code to use graph-tool
nickjcroucher May 12, 2020
188ed40
Update docstrings
nickjcroucher May 12, 2020
a75cb4f
Enable visualisation of lineage networks using Cytoscape
nickjcroucher May 12, 2020
2fc779b
Add extra network printing features
nickjcroucher May 12, 2020
fc3a69a
Change to network-based definitions of lineages
nickjcroucher May 12, 2020
dc042b9
Enable visualisation of networks post-processing
nickjcroucher May 13, 2020
4a18c1e
Enable querying and pruning of networks
nickjcroucher May 13, 2020
a263367
Fix output of names and labels
nickjcroucher May 14, 2020
9eecdaa
Remove debugging message
nickjcroucher May 14, 2020
09e6564
Add new dependency of lineage clustering on ref-db to tests
nickjcroucher May 14, 2020
09db4dc
Add graph-tool to dependencies
nickjcroucher May 14, 2020
9520b91
Overwrite for local running of tests
nickjcroucher May 14, 2020
2235736
Change references to query sequences in network extension
nickjcroucher May 14, 2020
1df2387
Use hash for query sequence name retrieval
nickjcroucher May 14, 2020
a38e58d
Use list for query sequence retrieval
nickjcroucher May 14, 2020
92956f0
Correct maths of listDistInts
nickjcroucher May 14, 2020
14a465f
Merge branch 'sketchlib140' into graph-tool
nickjcroucher Jul 3, 2020
a6037c3
Remove legacy mash test
nickjcroucher Jul 3, 2020
97dffbd
Merge branch 'sketchlib140' into graph-tool
johnlees Jul 3, 2020
a661bd3
Merge remote-tracking branch 'origin/master' into graph-tool
johnlees Jul 3, 2020
957434c
Adjusting test file
nickjcroucher Jul 4, 2020
1a36336
Fix test file
nickjcroucher Jul 4, 2020
22f20d9
Change minimum k step
nickjcroucher Jul 13, 2020
e00707a
Restore generate-viz mode test
nickjcroucher Jul 15, 2020
f6b86f6
Specified graph-tool package as a dependency in documentation
nickjcroucher Jul 15, 2020
04df0df
Removed outdated parts from troubleshooting document
nickjcroucher Jul 15, 2020
651586f
Update docstrings for graph-tool
nickjcroucher Jul 15, 2020
1d5766b
Update PopPUNK/__main__.py
nickjcroucher Jul 15, 2020
ad04c72
Update PopPUNK/__main__.py
nickjcroucher Jul 15, 2020
82e245a
Update PopPUNK/__main__.py
nickjcroucher Jul 15, 2020
46cc362
Remove debug file printing
nickjcroucher Jul 15, 2020
c6167e8
Update PopPUNK/mash.py
nickjcroucher Jul 15, 2020
3fb0006
Whitespace removed
nickjcroucher Jul 15, 2020
8e29c1e
Update PopPUNK/lineage_clustering.py
nickjcroucher Jul 15, 2020
c040518
Update PopPUNK/lineage_clustering.py
nickjcroucher Jul 15, 2020
0f8d1f9
Change default lineage cluster
nickjcroucher Jul 15, 2020
3c495c0
Merge branch 'graph-tool' of https://github.com/johnlees/PopPUNK into…
nickjcroucher Jul 15, 2020
be09381
Update PopPUNK/mash.py
nickjcroucher Jul 15, 2020
169fe66
Tidying up network construction
nickjcroucher Jul 15, 2020
cdacc00
Merge branch 'graph-tool' of https://github.com/johnlees/PopPUNK into…
nickjcroucher Jul 15, 2020
0ed9bcc
Assign local variable more clearly
nickjcroucher Jul 15, 2020
ed2820f
Improve error message for isolates missing from network
nickjcroucher Jul 15, 2020
abf0bc3
Tidying of excess code
nickjcroucher Jul 15, 2020
06a8648
Update PopPUNK/__main__.py
nickjcroucher Jul 15, 2020
39dda3d
Update PopPUNK/__main__.py
nickjcroucher Jul 15, 2020
a5a58a8
Replace mashOrder with dbOrder
nickjcroucher Jul 15, 2020
0a299b7
Reinstate model.save()
nickjcroucher Jul 16, 2020
4ff2a49
Update network component extraction code
nickjcroucher Jul 16, 2020
0ac1fd4
Expand comment to explain network reuse
nickjcroucher Jul 16, 2020
8e8fefa
Expanded explanation in comments
nickjcroucher Jul 16, 2020
ecbdbe9
Convert listDistInts to a generator function
nickjcroucher Jul 16, 2020
17a3f0e
Remove redundant function
nickjcroucher Jul 16, 2020
230 changes: 130 additions & 100 deletions PopPUNK/__main__.py

Large diffs are not rendered by default.

126 changes: 88 additions & 38 deletions PopPUNK/lineage_clustering.py
@@ -12,7 +12,7 @@
 from collections import defaultdict
 import pickle
 import collections
-import networkx as nx
+import graph_tool.all as gt
 from multiprocessing import Pool, RawArray, shared_memory, managers
 try:
     from multiprocessing import Pool, shared_memory
@@ -91,17 +91,16 @@ def get_nearest_neighbours(rank, isolates = None, ranks = None):
         frozen set of nearest neighbours.
     """
     # data structure
-    nn = {}
+    nn = set()
     # load shared ranks
     ranks_shm = shared_memory.SharedMemory(name = ranks.name)
     ranks = np.ndarray(ranks.shape, dtype = ranks.dtype, buffer = ranks_shm.buf)
     # apply along axis
     for i in isolates:
-        nn[i] = defaultdict(frozenset)
         isolate_ranks = ranks[i,:]
         closest_ranked = np.ravel(np.where(isolate_ranks <= rank))
-        neighbours = frozenset(closest_ranked.tolist())
-        nn[i] = neighbours
+        for j in closest_ranked.tolist():
+            nn.add((i,j))
     # return dict
     return nn
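
As a quick illustration of the new return format (this sketch is not part of the diff): each rank's neighbours are now a set of (i, j) index tuples that can be passed straight to graph-tool. A minimal, self-contained example with an invented rank matrix and threshold:

import numpy as np
import graph_tool.all as gt

# toy data: ranks[i, j] is the rank of isolate j among isolate i's distances
ranks = np.array([[0, 1, 2],
                  [1, 0, 2],
                  [2, 1, 0]])
rank_threshold = 1

edges = set()
for i in range(ranks.shape[0]):
    # columns whose rank falls within the threshold, as in get_nearest_neighbours
    closest_ranked = np.ravel(np.where(ranks[i, :] <= rank_threshold))
    for j in closest_ranked.tolist():
        edges.add((i, j))

G = gt.Graph(directed = False)
G.add_vertex(ranks.shape[0])
G.add_edge_list(edges)    # accepts any iterable of (source, target) pairs
print(G.num_edges())      # 6, including self-loops since ranks[i, i] == 0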

@@ -213,12 +212,17 @@ def cluster_into_lineages(distMat, rank_list = None, output = None,
     """

     # data structures
-    lineage_clustering = defaultdict(dict)
+    lineage_assignation = defaultdict(dict)
     overall_lineage_seeds = defaultdict(dict)
     overall_lineages = defaultdict(dict)
+    max_existing_cluster = {rank:1 for rank in rank_list}

     # load existing scheme if supplied
     if existing_scheme is not None:
         with open(existing_scheme, 'rb') as pickle_file:
-            lineage_clustering, overall_lineage_seeds, rank_list = pickle.load(pickle_file)
+            lineage_assignation, overall_lineage_seeds, rank_list = pickle.load(pickle_file)
+        for rank in rank_list:
+            max_existing_cluster[rank] = max(lineage_assignation[rank].values()) + 1

     # generate square distance matrix
     seqLabels, coreMat, accMat = \
@@ -263,37 +267,87 @@ def cluster_into_lineages(distMat, rank_list = None, output = None,
     distance_ranks_shared_array = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_raw.buf)
     distance_ranks_shared_array[:] = distance_ranks[:]
     distance_ranks_shared_array = NumpyShared(name = distance_ranks_raw.name, shape = distance_ranks.shape, dtype = distance_ranks.dtype)

+    # build a graph framework for network outputs
+    # create graph structure with an internal vertex property map storing
+    # lineage assignation; boost.python cannot be loaded within spawned
+    # processes, so the network analysis has to run separately
+    G = gt.Graph(directed = False)
+    G.add_vertex(len(isolate_list))
+    # add sequence labels for visualisation
+    vid = G.new_vertex_property('string',
+                                vals = isolate_list)
+    G.vp.id = vid

     # parallelise neighbour identification for each rank
     with Pool(processes = num_processes) as pool:
-        results = pool.map(partial(run_clustering_for_rank,
-                                   distances_input = distances_shared_array,
-                                   distance_ranks_input = distance_ranks_shared_array,
-                                   isolates = isolate_list_shared,
-                                   previous_seeds = overall_lineage_seeds),
+        results = pool.map(partial(get_nearest_neighbours,
+                                   ranks = distance_ranks_shared_array,
+                                   isolates = isolate_list_shared),
                            rank_list)

-    # extract results from multiprocessing pool
+    # extract results from multiprocessing pool and save output network
+    nn = defaultdict(dict)

     for n,result in enumerate(results):
         # get results per rank
         rank = rank_list[n]
-        lineage_clustering[rank], overall_lineage_seeds[rank] = result
+        # get neighbours
+        edges_to_add = result
+        # store results in network
+        G.add_edge_list(edges_to_add)
+        # calculate connectivity of each vertex
+        vertex_out_degrees = G.get_out_degrees(G.get_vertices())
+        # identify components and rank by frequency
+        components, component_frequencies = gt.label_components(G)
+        component_frequency_ranks = (len(component_frequencies) - rankdata(component_frequencies, method = 'ordinal').astype(int)).tolist()
+        # construct a name translation table,
+        # beginning with previously defined clusters
+        component_name = [None] * len(component_frequencies)
+        for seed in overall_lineage_seeds[rank]:
+            isolate_index = isolate_list.index(seed)
+            component_number = components[isolate_index]
+            if component_name[component_number] is None or component_name[component_number] > overall_lineage_seeds[rank][seed]:
+                component_name[component_number] = overall_lineage_seeds[rank][seed]
+        # name remaining components in rank order
+        for component_rank in range(len(component_frequency_ranks)):
+            component_number = component_frequency_ranks.index(component_rank)
+            if component_name[component_number] is None:
+                component_name[component_number] = max_existing_cluster[rank]
+                # find seed isolate as the highest-degree vertex in the component
+                component_max_degree = np.amax(vertex_out_degrees[np.where(components.a == component_number)])
+                seed_isolate_index = int(np.where((components.a == component_number) & (vertex_out_degrees == component_max_degree))[0][0])
+                seed_isolate = isolate_list[seed_isolate_index]
+                overall_lineage_seeds[rank][seed_isolate] = max_existing_cluster[rank]
+                # increment the next available cluster name
+                max_existing_cluster[rank] = max_existing_cluster[rank] + 1
+        # store assignments
+        for isolate_index,isolate_name in enumerate(isolate_list):
+            original_component = components.a[isolate_index]
+            renamed_component = component_name[original_component]
+            lineage_assignation[rank][isolate_name] = renamed_component
+        # save network
+        G.save(file_name = output + "/" + os.path.basename(output) + '_rank_' + str(rank) + '_lineages.gt', fmt = 'gt')
+        # clear edges - nodes in graph can be reused but edges differ between ranks
+        G.clear_edges()

     # store output
     with open(output + "/" + output + '_lineages.pkl', 'wb') as pickle_file:
-        pickle.dump([lineage_clustering, overall_lineage_seeds, rank_list], pickle_file)
+        pickle.dump([lineage_assignation, overall_lineage_seeds, rank_list], pickle_file)

     # process multirank lineages
-    overall_lineages = {}
+    overall_lineages = {'Rank_' + str(rank):{} for rank in rank_list}
+    overall_lineages['overall'] = {}
     for index,isolate in enumerate(isolate_list):
         overall_lineage = None
         for rank in rank_list:
-            overall_lineages['Rank_' + str(rank)][isolate] = lineage_clustering[rank][index]
+            overall_lineages['Rank_' + str(rank)][isolate] = lineage_assignation[rank][isolate]
             if overall_lineage is None:
-                overall_lineage = str(lineage_clustering[rank][index])
+                overall_lineage = str(lineage_assignation[rank][isolate])
             else:
-                overall_lineage = overall_lineage + '-' + str(lineage_clustering[rank][index])
+                overall_lineage = overall_lineage + '-' + str(lineage_assignation[rank][isolate])
         overall_lineages['overall'][isolate] = overall_lineage

     # print output as CSV
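
To make the component-naming step above easier to follow (this sketch is not part of the diff): gt.label_components returns a vertex property map of component labels plus a frequency histogram, and scipy's rankdata with method = 'ordinal' converts those frequencies into ranks, so rank 0 names the largest component first. A toy example:

import graph_tool.all as gt
from scipy.stats import rankdata

G = gt.Graph(directed = False)
G.add_vertex(6)
G.add_edge_list([(0, 1), (1, 2), (3, 4)])   # components {0,1,2}, {3,4}, {5}

components, component_frequencies = gt.label_components(G)
# ordinal ranks run from 1 (smallest) to n (largest); subtracting from n
# gives 0 for the most frequent component, exactly as in the loop above
component_frequency_ranks = (len(component_frequencies)
    - rankdata(component_frequencies, method = 'ordinal').astype(int)).tolist()
for component_rank in range(len(component_frequency_ranks)):
    component_number = component_frequency_ranks.index(component_rank)
    print('rank', component_rank, '-> component', component_number,
          'of size', int(component_frequencies[component_number]))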
@@ -326,13 +380,13 @@ def run_clustering_for_rank(rank, distances_input = None, distance_ranks_input =
         Whether to extend a previously generated analysis or not.

     Returns:
-        lineage_clustering (dict)
+        lineage_assignation (dict)
             Assignment of each isolate to a cluster.
         lineage_seed (dict)
             Seed isolate used to initiate each cluster.
-        neighbours (nested dict)
-            Neighbour relationships between isolates for R.
-    """
+        connections (set of tuples)
+            Edges to add to network describing lineages.
+    """

     # load shared memory objects
     distances_shm = shared_memory.SharedMemory(name = distances_input.name)
@@ -347,31 +401,27 @@ def run_clustering_for_rank(rank, distances_input = None, distance_ranks_input =
     if previous_seeds is not None:
         seeds = previous_seeds[rank]

-    # create graph structure
-    G = nx.Graph()
-    G.add_nodes_from(isolate_indices)
-    G.nodes.data('lineage', default = 0)
-
     # identify nearest neighbours
     nn = get_nearest_neighbours(rank,
                                 ranks = distance_ranks_input,
                                 isolates = isolate_list)

     # iteratively identify lineages
     lineage_index = 1
-    while nx.number_of_isolates(G) > 0:
+    connections = set()
+    lineage_assignation = {isolate:None for isolate in isolate_list}
+
+    while None in lineage_assignation.values():
         if lineage_index in seeds.keys():
             seed_isolate = seeds[lineage_index]
         else:
-            seed_isolate = pick_seed_isolate(G, distances = distances_input)
+            seed_isolate = pick_seed_isolate(lineage_assignation, distances = distances_input)
         # skip over previously-defined seeds if amalgamated into different lineage now
-        if nx.is_isolate(G, seed_isolate):
+        if lineage_assignation[seed_isolate] is None:
             seeds[lineage_index] = seed_isolate
-            G = get_lineage(G, nn, seed_isolate, lineage_index)
+            lineage_assignation, added_connections = get_lineage(lineage_assignation, nn, seed_isolate, lineage_index)
+            connections.update(added_connections)
         lineage_index = lineage_index + 1

-    # identify components and name lineages
-    lineage_clustering = {node:nodedata for (node, nodedata) in G.nodes(data='lineage')}
-
     # return clustering
-    return lineage_clustering, seeds
+    return lineage_assignation, seeds, nn, connections
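
In outline, the change above replaces the networkx graph that tracked unassigned isolates with a plain dict mapping each isolate to a lineage (or None until assigned). A toy version of that loop, not from the PR, with made-up neighbour sets and a deliberately simplified seed choice (the real pick_seed_isolate consults the distance matrix):

# made-up nearest-neighbour sets keyed by isolate index
nn = {0: {1}, 1: {0}, 2: {3}, 3: {2}, 4: set()}
lineage_assignation = {isolate: None for isolate in nn}

lineage_index = 1
while None in lineage_assignation.values():
    # simplified seed choice: the first unassigned isolate
    seed = next(i for i, lineage in lineage_assignation.items() if lineage is None)
    # spread the lineage over chains of nearest neighbours
    stack = [seed]
    while stack:
        isolate = stack.pop()
        if lineage_assignation[isolate] is None:
            lineage_assignation[isolate] = lineage_index
            stack.extend(j for j in nn[isolate] if lineage_assignation[j] is None)
    lineage_index += 1

print(lineage_assignation)   # {0: 1, 1: 1, 2: 2, 3: 2, 4: 3}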
15 changes: 9 additions & 6 deletions PopPUNK/mash.py
@@ -18,7 +18,6 @@
 from glob import glob
 from random import sample
 import numpy as np
-import networkx as nx
 from scipy import optimize
 try:
     from multiprocessing import Pool, shared_memory
@@ -542,10 +541,10 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num
     # Check mash output is consistent with expected order
     # This is ok in all tests, but best to check and exit in case something changes between mash versions
     expected_names = iterDistRows(refList, qNames, self)
-
     prev_ref = ""
     skip = 0
     skipped = 0
+
     for line in mashOut:
         # Skip the first row with self and symmetric elements
         if skipped < skip:
@@ -602,17 +601,20 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num

     # run pairwise analyses across kmer lengths, mutating distMat
     # Create range of rows that each thread will work with
+    # if there is only one pair, apply_along_axis will not work
+    if threads > number_pairs:
+        threads = number_pairs
     rows_per_thread = int(number_pairs / threads)
     big_threads = number_pairs % threads
     start = 0
     mat_chunks = []

     for thread in range(threads):
         end = start + rows_per_thread
         if thread < big_threads:
             end += 1
         mat_chunks.append((start, end))
         start = end

     # create empty distMat that can be shared with multiple processes
     distMat = np.zeros((number_pairs, 2), dtype=raw.dtype)
     with SharedMemoryManager() as smm:
Expand All @@ -624,7 +626,6 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num

shm_distMat = smm.SharedMemory(size = distMat.nbytes)
distMat_shared = NumpyShared(name = shm_distMat.name, shape = (number_pairs, 2), dtype = raw.dtype)

# Run regressions
with Pool(processes = threads) as pool:
pool.map(partial(fitKmerBlock,
@@ -668,7 +669,10 @@ def fitKmerBlock(idxRanges, distMat, raw, klist, jacobian):

     # analyse
     (start, end) = idxRanges
-    distMat[start:end, :] = np.apply_along_axis(fitKmerCurve, 1, raw[start:end, :], klist, jacobian)
+    if raw.shape[0] == 1:
+        distMat[start:end, :] = fitKmerCurve(raw[0,:], klist, jacobian)
+    else:
+        distMat[start:end, :] = np.apply_along_axis(fitKmerCurve, 1, raw[start:end, :], klist, jacobian)


 def fitKmerCurve(pairwise, klist, jacobian):
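
The single-pair branch above dispatches the fit directly rather than mapping over a one-row matrix. A runnable sketch of the same dispatch, not from the PR; fit_stub is an invented, simplified stand-in for fitKmerCurve using a log-linear fit:

import numpy as np

def fit_stub(row, klist):
    # invented stand-in for fitKmerCurve: log-linear fit of matching
    # k-mer proportion against k, returning rough (core, accessory) estimates
    gradient, intercept = np.polyfit(klist, np.log(row), 1)
    return np.array([1 - np.exp(gradient), 1 - np.exp(intercept)])

klist = np.array([13, 17, 21, 25, 29])
raw = np.array([[0.5, 0.45, 0.4, 0.36, 0.32]])   # a single pair of sketches
distMat = np.zeros((raw.shape[0], 2))

start, end = 0, raw.shape[0]
if raw.shape[0] == 1:
    # one pair: call the fit directly instead of mapping over rows
    distMat[start:end, :] = fit_stub(raw[0, :], klist)
else:
    distMat[start:end, :] = np.apply_along_axis(fit_stub, 1, raw[start:end, :], klist)
print(distMat)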
@@ -707,4 +711,3 @@ def fitKmerCurve(pairwise, klist, jacobian):

     # Return core, accessory
     return(np.flipud(transformed_params))
-
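A usage note on the per-rank networks saved by cluster_into_lineages above: they can be reloaded with graph-tool for inspection. The path below is hypothetical, following the output + '_rank_' + rank + '_lineages.gt' naming pattern used in the diff:

import graph_tool.all as gt

# hypothetical file name following the pattern used in cluster_into_lineages
G = gt.load_graph('lineage_db/lineage_db_rank_1_lineages.gt', fmt = 'gt')
print(G.num_vertices(), 'isolates and', G.num_edges(), 'edges')
print(G.vp.id[G.vertex(0)])   # sequence label stored in the internal 'id' property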