From 4b123bbbd1d9d168fe195b4bac4eacf392347fc0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Thu, 23 Apr 2020 15:39:26 +0100 Subject: [PATCH 01/46] Identify lineages from reference database --- PopPUNK/__main__.py | 37 +++++- PopPUNK/lineage_clustering.py | 240 ++++++++++++++++++++++++++++++++++ 2 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 PopPUNK/lineage_clustering.py diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index c9895f17..3bbcdfc9 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -17,6 +17,8 @@ from .sketchlib import no_sketchlib, checkSketchlibLibrary +from .lineage_clustering import cluster_into_lineages + from .network import fetchNetwork from .network import constructNetwork from .network import extractReferences @@ -87,6 +89,10 @@ def get_options(): help='Create model at this core distance threshold', default=None, type=float) + mode.add_argument('--lineage-clustering', + help='Identify lineages within a strain', + default=False, + action='store_true') mode.add_argument('--use-model', help='Apply a fitted model to a reference database to restore database files', default=False, @@ -167,6 +173,10 @@ def get_options(): queryingGroup.add_argument('--accessory-only', help='Use an accessory-distance only model for assigning queries ' '[default = False]', default=False, action='store_true') + # lineage clustering within strains + lineagesGroup = parser.add_argument_group('Lineage analysis options') + lineagesGroup.add_argument('--R',help='Maximum rank used in lineage clustering', type = int, default=1) + # plot output faGroup = parser.add_argument_group('Further analysis options') faGroup.add_argument('--subset', help='File with list of sequences to include in visualisation (with --generate-viz only)', default=None) @@ -276,7 +286,7 @@ def main(): sys.exit(1) else: sys.stderr.write('\t sketchlib: ' + checkSketchlibLibrary() + ')\n') - + #******************************# #* *# #* Create database *# @@ -501,6 +511,31 @@ def main(): nx.write_gpickle(genomeNetwork, args.output + "/" + os.path.basename(args.output) + '_graph.gpickle') + #******************************# + #* *# + #* within-strain analysis *# + #* *# + #******************************# + sys.stderr.write("Checking options here\n\n") + if args.lineage_clustering: + sys.stderr.write("Mode: Identifying lineages within a clade\n\n") + + distances = None + ref_db = None + if args.distances is not None and args.ref_db is not None: + distances = args.distances + ref_db = args.ref_db + else: + sys.stderr.write("Need to provide an input set of distances with --distances " + "and reference database directory with --ref-db\n\n") + sys.exit(1) + + # load distance matrix + refList, queryList, self, distMat = readPickle(distances) + + # run lineage clustering + cluster_into_lineages(distMat, args.R, args.output, rlist = refList) + #*******************************# #* *# #* query assignment (function *# diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py new file mode 100644 index 00000000..493e6fbe --- /dev/null +++ b/PopPUNK/lineage_clustering.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python +# vim: set fileencoding= : +# Copyright 2018-2020 John Lees and Nick Croucher + +# universal +import os +import sys +# additional +import numpy as np +from scipy.stats import rankdata +from collections import defaultdict + +# import poppunk package +from .utils import iterDistRows + +def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, 
lineage_index, seed_isolate):
    """ Identifies isolates corresponding to a particular
    lineage given a cluster seed.

    Args:
        ranked_information
        nearest_neighbours (dict)

        R (int)
            Maximum rank of neighbours used for clustering.

    Returns:
        ranked_information

    """
    # first make all R neighbours of the seed part of the lineage if unclustered
    for seed_neighbour in neighbours[seed_isolate]:
        if lineage_clustering[seed_neighbour] > lineage_index:
            lineage_clustering[seed_neighbour] = lineage_index
    # iterate through ranks; for each isolate, test if neighbour belongs to this cluster
    # overwrite higher cluster values - default value is higher than number of isolates
    # when querying, allows smaller clusters to be merged into more basal ones as connections
    # are made
    for rank in lineage_clustering_information.keys():
        # iterate through isolates of same rank
        for isolate in lineage_clustering_information[rank]:
            # test if clustered/belonging to a more peripheral cluster
#            print("Rank: " + str(rank) + " isolate: " + isolate)
            if lineage_clustering[isolate] > lineage_index:
                # get clusters of nearest neighbours
                isolate_neighbour_clusters = [lineage_clustering[isolate_neighbour] for isolate_neighbour in neighbours[isolate].keys()]
#                print('Neighbours: ' + str(neighbours[isolate].keys()) + '\nNeighbour clusters: ' + str(isolate_neighbour_clusters))
                # if a nearest neighbour is in this cluster
                if lineage_index in isolate_neighbour_clusters:
                    # add isolate to lineage
                    lineage_clustering[isolate] = lineage_index
                    # add neighbours of same or lower rank to lineage if unclustered
                    for isolate_neighbour in neighbours[isolate].keys():
                        if lineage_clustering[isolate_neighbour] > lineage_index:
                            for neighbour_rank in lineage_clustering_information.keys():
                                if neighbour_rank <= rank:
                                    if isolate_neighbour in lineage_clustering_information[neighbour_rank]:
                                        lineage_clustering[isolate_neighbour] = lineage_index
                                else:
                                    break
    return lineage_clustering

def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value):
    """ Identifies the isolate used to initiate a cluster
    """
    # extract all pairwise distances between isolates that are not yet clustered
    clustered_isolates = frozenset([isolate for isolate in lineage_clustering.keys() if lineage_clustering[isolate] != null_cluster_value])
    unclustered_pair_indices = [n for n,pair in enumerate(row_labels) if not set(pair).issubset(clustered_isolates)]
    unclustered_distances = distances[unclustered_pair_indices]
#    print("Unclustered distances are: " + str(unclustered_distances))
    # start at zero as the minimum if present in the unclustered set
    minimum_unclustered_distance = 0
    # otherwise go through the bother of picking a minimum
    if minimum_unclustered_distance not in unclustered_distances:
        if len(unclustered_distances) == 1:
            minimum_unclustered_distance = unclustered_distances[0]
        elif len(unclustered_distances) > 1:
            minimum_unclustered_distance = np.amin(unclustered_distances)
        else:
            sys.stderr.write('Cannot find any more isolates not assigned to lineages; unclustered pair indices are ' + str(unclustered_pair_indices))
            exit(1)
    # identify which entries in the full distance set have this value
    # unfortunately these distances are not necessarily neighbours
    distance_indices = np.where(distances == minimum_unclustered_distance)[0].tolist()
    # identify an unclustered isolate from the pair that have this value
    closest_unclustered_pair_index = int(set(unclustered_pair_indices).intersection(set(distance_indices)).pop())
    new_seed_isolate = row_labels[closest_unclustered_pair_index][0] if lineage_clustering[row_labels[closest_unclustered_pair_index][0]] == null_cluster_value else row_labels[closest_unclustered_pair_index][1]
    return new_seed_isolate


def get_lineage_clustering_information(seed_isolate, row_labels, distances):
    """ Generates the ranked distances needed for cluster
    definition.
    """
    # data structure
    lineage_info = defaultdict(list)
    # get subset of relevant distances
    comparisons_involving_seed = [n for n,(pair) in enumerate(row_labels) if seed_isolate in pair]
    distances_to_seed = distances[comparisons_involving_seed]
    # get ranks of data
    distance_ranks = rankdata(distances_to_seed)
    # get partners of seed isolate in each comparison
    pairs_involving_seed = [row_labels[x] for x in comparisons_involving_seed]
    seed_partners = [r if q == seed_isolate else q for (r,q) in pairs_involving_seed]
    # create a dict of lists of isolates for a given rank
    # enables later easy iterating through ranked distances
    for rank in np.unique(distance_ranks):
        lineage_info[rank] = [seed_partners[n] for n,r in enumerate(distance_ranks) if r == rank]
    return lineage_info

def generate_nearest_neighbours(X, row_labels, isolate_list, R):
    """ Identifies the nearest neighbours from the core
    genome distances.

    Args
        X (np.array)
            n x 2 array of core and accessory distances for n samples.
        row_labels (list of tuples)
            List of pairs of isolates
        isolate_list (list)
            List of isolates
        R (int)
            Maximum rank of neighbours used for clustering.

    Returns:
        nn
            Dict of dict of neighbours per isolate, and pairwise distances
    """
    # data structures
    nn = {}
    # iterate through isolates
    for isolate in isolate_list:
        # initiate dict
        nn[isolate] = {}
        # get indices of rows involving the considered isolate
        indices = [n for n, (r, q) in enumerate(row_labels) if r == isolate or q == isolate]
        # get the corresponding distances
        #ref_distances = X[np.array(indices):1]
        ref_distances = np.take(X, np.array(indices))
        # get unique distances and pick the R smallest values
        unique_isolate_distances = np.unique(ref_distances)
        R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)]
        # get threshold distance for neighbour definition - the largest of the R smallest unique values
        threshold_distance = R_lowest_distances[0]
        if R > 1:
            threshold_distance = np.max(R_lowest_distances[:R])
        # identify all distances below this threshold and store relative to index
        for i,d in enumerate(ref_distances):
            if d <= threshold_distance:
                # get index of row in full list
                index = indices[i]
                # get name of other isolate in pair
                name = row_labels[index][1] if row_labels[index][0] == isolate else row_labels[index][0]
                nn[isolate][name] = ref_distances[i]
    # return completed dictionary
    return nn


def update_nearest_neighbours():
    """ Updates the information on nearest neighbours, given
    a new set of ref-query and query-query distances.
    """
    return 0


def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_model = None, use_accessory = False):
    """ Clusters isolates into lineages based on their
    relative distances.

    Args
        X (np.array)
            n x 2 array of core and accessory distances for n samples.
            This should not be subsampled.
        R (int)
            Integer specifying the maximum rank of neighbour used
            for clustering.
+ rlist (list) + List of reference sequences. + qlist (list) + List of query sequences. + existing_model (?) + Existing lineage clustering model. + Returns: + ret_vec + An n-vector with the most likely cluster memberships + """ + # process data lineage identification + sorted_distances = np.empty(shape = X.shape[0]) + row_labels = [] + isolate_list = [] + if qlist is None: + isolate_list = rlist + distance_index = 1 if use_accessory else 0 + distances = X[:,distance_index] + row_labels = list(iter(iterDistRows(rlist, rlist, self=True))) + else: + sys.stderr.write("Adding to lineage not yet implemented\n") + # identify nearest neighbours + neighbours = {} + neighbours = generate_nearest_neighbours(distances, + row_labels, + rlist, + R) + # run clustering + null_cluster_value = len(isolate_list) + 1 + lineage_clustering = {i:null_cluster_value for i in isolate_list} + lineage_index = 1 + lineage_seed = {} + while null_cluster_value in lineage_clustering.values(): + # get seed isolate based on minimum pairwise distances + seed_isolate = get_seed_isolate(lineage_clustering, + row_labels, + distances, + null_cluster_value) + # record status of seed isolate + lineage_seed[lineage_index] = seed_isolate + lineage_clustering[seed_isolate] = lineage_index + # get information for lineage clustering + lineage_clustering_information = get_lineage_clustering_information(seed_isolate, + row_labels, + distances) + # cluster the lineages + lineage_clustering = run_lineage_clustering(lineage_clustering, + lineage_clustering_information, + neighbours, + R, + lineage_index, + seed_isolate) + # increment index for next lineage + lineage_index = lineage_index + 1 + + # print output + lineage_output_name = output + "/" + output + "_lineage_clusters.csv" + with open(lineage_output_name, 'w') as lFile: + print('Id,Lineage__autocolor', file = lFile) + for isolate in lineage_clustering.keys(): + print(isolate + ',' + str(lineage_clustering[isolate]), file = lFile) + + return 0 + From 82ed97393e7f8f064828f7929f2365f758804f2c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 09:28:43 +0100 Subject: [PATCH 02/46] Allow addition to lineage clustering --- PopPUNK/__main__.py | 17 ++- PopPUNK/lineage_clustering.py | 233 +++++++++++++++++++++++++--------- 2 files changed, 183 insertions(+), 67 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 3bbcdfc9..cf90b5f4 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -175,7 +175,9 @@ def get_options(): # lineage clustering within strains lineagesGroup = parser.add_argument_group('Lineage analysis options') - lineagesGroup.add_argument('--R',help='Maximum rank used in lineage clustering', type = int, default=1) + lineagesGroup.add_argument('--R',help='Maximum rank used in lineage clustering [default = 1]', type = int, default = 1) + lineagesGroup.add_argument('--use-accessory',help='Use accessory distances for lineage definitions [default = use core distances]', action = 'store_true', default = False) + lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions', default = None) # plot output faGroup = parser.add_argument_group('Further analysis options') @@ -521,20 +523,21 @@ def main(): sys.stderr.write("Mode: Identifying lineages within a clade\n\n") distances = None - ref_db = None - if args.distances is not None and args.ref_db is not None: + if args.distances is not None: distances = args.distances - ref_db = args.ref_db else: - sys.stderr.write("Need to provide an input 
set of distances with --distances " - "and reference database directory with --ref-db\n\n") + sys.stderr.write("Need to provide an input set of distances with --distances\n\n") sys.exit(1) # load distance matrix refList, queryList, self, distMat = readPickle(distances) + print("Queries are: " + str(queryList)) # run lineage clustering - cluster_into_lineages(distMat, args.R, args.output, rlist = refList) + if self: + cluster_into_lineages(distMat, args.R, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + else: + cluster_into_lineages(distMat, args.R, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) #*******************************# #* *# diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 493e6fbe..5be9bc14 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -9,11 +9,12 @@ import numpy as np from scipy.stats import rankdata from collections import defaultdict +import pickle # import poppunk package from .utils import iterDistRows -def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, lineage_index, seed_isolate): +def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): """ Identifies isolates corresponding to a particular lineage given a cluster seed. @@ -58,34 +59,56 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n lineage_clustering[isolate_neighbour] = lineage_index else: break + # if this represents a change from the previous iteration of lineage definitions + # then the change needs to propagate through higher-ranked members + if previous_lineage_clustering[isolate] == lineage_index and lineage_clustering[isolate] != lineage_index: + for higher_rank in lineage_clustering_information.keys(): + if higher_rank > rank: + for higher_ranked_isolate in lineage_clustering_information[rank]: + if lineage_clustering[isolate] == lineage_index: + lineage_clustering[isolate] = null_cluster_value + + return lineage_clustering -def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value): +def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value, lineage_index, lineage_seed): """ Identifies the isolate used to initiate a cluster """ - # extract all pairwise distances between isolates that are not yet clustered - clustered_isolates = frozenset([isolate for isolate in lineage_clustering.keys() if lineage_clustering[isolate] != null_cluster_value]) - unclustered_pair_indices = [n for n,pair in enumerate(row_labels) if not set(pair).issubset(clustered_isolates)] - unclustered_distances = distances[unclustered_pair_indices] -# print("Unclustered distances are: " + str(unclustered_distances)) - # start at zero as the minimum if present in the unclustered set - minimum_unclustered_distance = 0 - # otherwise go through the bother of picking a minimum - if minimum_unclustered_distance not in unclustered_distances: - if len(unclustered_distances) == 1: - minimum_unclustered_distance = unclustered_distances[0] - elif len(unclustered_distances) > 1: - minimum_unclustered_distance = np.amin(unclustered_distances) - else: - sys.stderr.write('Cannot find any more isolates not assigned to lineages; unclustered pair indices are ' + str(unclustered_pair_indices)) - exit(1) - # identify which 
entries in the full distance set have this value - # unfortunately these distances are not necessarily neighbours - distance_indices = np.where(distances == minimum_unclustered_distance)[0].tolist() - # identify an unclustered isolate from the pair that have this value - closest_unclustered_pair_index = int(set(unclustered_pair_indices).intersection(set(distance_indices)).pop()) - new_seed_isolate = row_labels[closest_unclustered_pair_index][0] if lineage_clustering[row_labels[closest_unclustered_pair_index][0]] == null_cluster_value else row_labels[closest_unclustered_pair_index][1] - return new_seed_isolate + # variable to return + seed_isolate = None + # first test if there is an existing seed + if lineage_index in lineage_seed.keys(): + original_seed_isolate = lineage_seed[lineage_index] + # if seed now belongs to a different lineage, then lineage has been overwritten + # seed may be unassigned if neighbours have changed - but this does not overwrite the + # lineage in the same way + if lineage_clustering[original_seed_isolate] == null_cluster_value or lineage_clustering[original_seed_isolate] == lineage_index: + seed_isolate = original_seed_isolate + # else identify a new seed from the closest pair + else: + # extract all pairwise distances between isolates that are not yet clustered + clustered_isolates = frozenset([isolate for isolate in lineage_clustering.keys() if lineage_clustering[isolate] != null_cluster_value]) + unclustered_pair_indices = [n for n,pair in enumerate(row_labels) if not set(pair).issubset(clustered_isolates)] + unclustered_distances = distances[unclustered_pair_indices] + # print("Unclustered distances are: " + str(unclustered_distances)) + # start at zero as the minimum if present in the unclustered set + minimum_unclustered_distance = 0 + # otherwise go through the bother of picking a minimum + if minimum_unclustered_distance not in unclustered_distances: + if len(unclustered_distances) == 1: + minimum_unclustered_distance = unclustered_distances[0] + elif len(unclustered_distances) > 1: + minimum_unclustered_distance = np.amin(unclustered_distances) + else: + sys.stderr.write('Cannot find any more isolates not assigned to lineages; unclustered pair indices are ' + str(unclustered_pair_indices)) + exit(1) + # identify which entries in the full distance set have this value + # unfortunately these distances are not necessarily neighbours + distance_indices = np.where(distances == minimum_unclustered_distance)[0].tolist() + # identify an unclustered isolate from the pair that have this value + closest_unclustered_pair_index = int(set(unclustered_pair_indices).intersection(set(distance_indices)).pop()) + seed_isolate = row_labels[closest_unclustered_pair_index][0] if lineage_clustering[row_labels[closest_unclustered_pair_index][0]] == null_cluster_value else row_labels[closest_unclustered_pair_index][1] + return seed_isolate def get_lineage_clustering_information(seed_isolate, row_labels, distances): @@ -135,8 +158,8 @@ def generate_nearest_neighbours(X, row_labels, isolate_list, R): # get indices of rows involving the considered isolate indices = [n for n, (r, q) in enumerate(row_labels) if r == isolate or q == isolate] # get the corresponding distances - #ref_distances = X[np.array(indices):1] - ref_distances = np.take(X, np.array(indices)) + #ref_distances = np.take(X, np.array(indices, dtype = 'int64')) + ref_distances = np.take(X, indices) # get unique distances and pick the R smallest values unique_isolate_distances = np.unique(ref_distances) 
R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)] @@ -156,18 +179,70 @@ def generate_nearest_neighbours(X, row_labels, isolate_list, R): return nn -def update_nearest_neighbours(): +def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clustering, null_cluster_value): """ Updates the information on nearest neighbours, given a new set of ref-query and query-query distances. + --- really need isolate list? """ - return 0 + # iterate through isolates and test whether any comparisons with + # newly-added queries replace or supplement existing neighbours + + # pre-process to extract ref-query distances first + query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] + query_row_names = [row_labels[i] for i in query_match_indices] + query_distances = np.take(distances, query_match_indices) + # iterate through references to update existing entries + for isolate in nn.keys(): + # get max distance to existing neighbours + max_distance_to_ref = max(nn[isolate].values()) + # get min distance to queries + ref_query_match_indices = [n for n, (r, q) in enumerate(query_row_names) if r == isolate or q == isolate] + ref_query_distances = np.take(query_distances, ref_query_match_indices) + min_distance_to_query = np.amin(ref_query_distances) + # if closer queries, find distances and replace + if min_distance_to_query < max_distance_to_ref: + # unset clustering of isolate and neighbours + lineage_clustering[isolate] = null_cluster_value + for neighbour in nn[isolate].keys(): + lineage_clustering[neighbour] = null_cluster_value + # update neighbour information + nn_old = nn.pop(isolate) + nn[isolate] = {} + # old neighbours from existing dict + old_distances = list(set(nn_old.values())) + # new neighbours from new distances in X + new_distances = np.unique(ref_query_distances) + # combine old and new distances + combined_ref_distances = (nn_old.values(),ref_query_distances.tolist()) + # take top ranked distances from both + ranked_unique_combined_ref_distances = sorted(set(combined_ref_distances))[:R] + for distance in ranked_unique_combined_ref_distances: + # add old neighbours back in + if distance in old_distances: + for neighbour,d in nn_old.items(): + if d == distance: + nn[isolate][neighbour] = distance + # add new neighbours + if distance in new_distances: + for n,d in enumerate(ref_query_distances): + if d == distance: + row_label_index = ref_query_match_indices[n] + neighbour = row_label_index[row_label_index][0] if row_label_index[row_label_index][1] == isolate else row_label_index[row_label_index][1] + nn[isolate][neighbour] = d +# print('Changed sequences for isolate ' + isolate + '\nOld distances: ' + str(old_nn) + '\nNew distances: ' + str(nn[isolate])) + # get nn for query sequences + query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) + # merge dicts + for query in query_nn.keys(): + nn[query] = query_nn[query] + return nn -def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_model = None, use_accessory = False): +def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_scheme = None, use_accessory = False): """ Clusters isolates into lineages based on their relative distances. - Args + Args: X (np.array) n x 2 array of core and accessory distances for n samples. This should not be subsampled. @@ -178,57 +253,95 @@ def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_mod List of reference sequences. 
qlist (list) List of query sequences. - existing_model (?) + existing_scheme (?) Existing lineage clustering model. + Returns: ret_vec An n-vector with the most likely cluster memberships """ - # process data lineage identification - sorted_distances = np.empty(shape = X.shape[0]) + # process distance matrix + distance_index = 1 if use_accessory else 0 + distances = X[:,distance_index] + + # determine whether ref-ref or ref-query analysis row_labels = [] isolate_list = [] if qlist is None: + sys.stderr.write("Identifying lineages using reference sequences\n") isolate_list = rlist - distance_index = 1 if use_accessory else 0 - distances = X[:,distance_index] - row_labels = list(iter(iterDistRows(rlist, rlist, self=True))) + row_labels = list(iter(iterDistRows(rlist, rlist, self = True))) else: - sys.stderr.write("Adding to lineage not yet implemented\n") - # identify nearest neighbours + sys.stderr.write("Identifying lineages from using reference and query sequences\n") + row_labels = list(iter(iterDistRows(rlist, qlist, self = False))) + isolate_list = rlist + qlist + + # determine whether novel analysis or modifying existing analysis neighbours = {} - neighbours = generate_nearest_neighbours(distances, - row_labels, - rlist, - R) - # run clustering + lineage_seed = {} null_cluster_value = len(isolate_list) + 1 lineage_clustering = {i:null_cluster_value for i in isolate_list} + previous_lineage_clustering = lineage_clustering + + if existing_scheme is None: + neighbours = generate_nearest_neighbours(distances, + row_labels, + isolate_list, + R) + else: + with open(existing_scheme, 'rb') as pickle_file: + lineage_clustering, lineage_seed, neighbours = pickle.load(pickle_file) + previous_lineage_clustering = lineage_clustering # use for detecting changes + neighbours = update_nearest_neighbours(distances, + row_labels, + R, + qlist, + neighbours, + lineage_clustering, + null_cluster_value) + + # run clustering lineage_index = 1 - lineage_seed = {} while null_cluster_value in lineage_clustering.values(): + # get seed isolate based on minimum pairwise distances - seed_isolate = get_seed_isolate(lineage_clustering, + seed_isolate = get_seed_isolate(lineage_clustering, # need to add in lineage seed hash row_labels, distances, - null_cluster_value) - # record status of seed isolate + null_cluster_value, + lineage_index, + lineage_seed) lineage_seed[lineage_index] = seed_isolate - lineage_clustering[seed_isolate] = lineage_index - # get information for lineage clustering - lineage_clustering_information = get_lineage_clustering_information(seed_isolate, - row_labels, - distances) - # cluster the lineages - lineage_clustering = run_lineage_clustering(lineage_clustering, - lineage_clustering_information, - neighbours, - R, - lineage_index, - seed_isolate) + + # seed isolate is None if a previously-existing cluster has been overwritten + # in which case pass over the lineage to keep nomenclature consistent + if seed_isolate is not None: + + # record status of seed isolate + lineage_clustering[seed_isolate] = lineage_index + + # get information for lineage clustering + lineage_clustering_information = get_lineage_clustering_information(seed_isolate, + row_labels, + distances) + + # cluster the lineages + lineage_clustering = run_lineage_clustering(lineage_clustering, + lineage_clustering_information, + neighbours, + R, + lineage_index, + seed_isolate, + previous_lineage_clustering, + null_cluster_value) + # increment index for next lineage lineage_index = lineage_index + 1 + # store output + with 
open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: + pickle.dump([lineage_clustering, lineage_seed, neighbours], pickle_file) + # print output lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: From bbf3538c169550047b6514f22f4aa3c4fe958cf5 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 10:16:03 +0100 Subject: [PATCH 03/46] Fix integration of query isolates --- PopPUNK/__main__.py | 1 - PopPUNK/lineage_clustering.py | 18 +++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index cf90b5f4..058b3c9c 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -531,7 +531,6 @@ def main(): # load distance matrix refList, queryList, self, distMat = readPickle(distances) - print("Queries are: " + str(queryList)) # run lineage clustering if self: diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 5be9bc14..72554e52 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -61,12 +61,13 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n break # if this represents a change from the previous iteration of lineage definitions # then the change needs to propagate through higher-ranked members - if previous_lineage_clustering[isolate] == lineage_index and lineage_clustering[isolate] != lineage_index: - for higher_rank in lineage_clustering_information.keys(): - if higher_rank > rank: - for higher_ranked_isolate in lineage_clustering_information[rank]: - if lineage_clustering[isolate] == lineage_index: - lineage_clustering[isolate] = null_cluster_value + if isolate in previous_lineage_clustering: + if previous_lineage_clustering[isolate] == lineage_index and lineage_clustering[isolate] != lineage_index: + for higher_rank in lineage_clustering_information.keys(): + if higher_rank > rank: + for higher_ranked_isolate in lineage_clustering_information[rank]: + if lineage_clustering[isolate] == lineage_index: + lineage_clustering[isolate] = null_cluster_value return lineage_clustering @@ -110,7 +111,6 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val seed_isolate = row_labels[closest_unclustered_pair_index][0] if lineage_clustering[row_labels[closest_unclustered_pair_index][0]] == null_cluster_value else row_labels[closest_unclustered_pair_index][1] return seed_isolate - def get_lineage_clustering_information(seed_isolate, row_labels, distances): """ Generates the ranked distances needed for cluster definition. 
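As an aside on the ranking used throughout this series: get_lineage_clustering_information() buckets the seed's comparison partners by their scipy.stats.rankdata rank, and rankdata assigns tied distances a shared, averaged rank, so one rank key can hold several isolates and ranks need not be consecutive integers. A minimal standalone sketch of that behaviour (isolate names invented, not part of the patch):

    import numpy as np
    from scipy.stats import rankdata
    from collections import defaultdict

    # four invented partners of a seed isolate, with one tied pair of distances
    seed_partners = ['iso_A', 'iso_B', 'iso_C', 'iso_D']
    distances_to_seed = np.array([0.02, 0.01, 0.01, 0.05])

    distance_ranks = rankdata(distances_to_seed)   # array([3. , 1.5, 1.5, 4. ])
    lineage_info = defaultdict(list)
    for rank in np.unique(distance_ranks):
        lineage_info[rank] = [seed_partners[n] for n, r in enumerate(distance_ranks) if r == rank]
    # dict(lineage_info) == {1.5: ['iso_B', 'iso_C'], 3.0: ['iso_A'], 4.0: ['iso_D']}

This is why the keys of lineage_clustering_information are floats rather than sequential integers.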
@@ -235,6 +235,7 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust
     # merge dicts
     for query in query_nn.keys():
         nn[query] = query_nn[query]
+    # return updated dict
     return nn
@@ -292,6 +293,9 @@ def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_sch
         with open(existing_scheme, 'rb') as pickle_file:
             lineage_clustering, lineage_seed, neighbours = pickle.load(pickle_file)
         previous_lineage_clustering = lineage_clustering # use for detecting changes
+        # add new queries to lineage clustering
+        for q in qlist:
+            lineage_clustering[q] = null_cluster_value
         neighbours = update_nearest_neighbours(distances,
                                                row_labels,
                                                R,

From 95f9c84a2f61c605294195a63a7e3e544d69fe0b Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 24 Apr 2020 10:19:13 +0100
Subject: [PATCH 04/46] Add query status to output CSV

---
 PopPUNK/lineage_clustering.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py
index 72554e52..e195565f 100644
--- a/PopPUNK/lineage_clustering.py
+++ b/PopPUNK/lineage_clustering.py
@@ -349,9 +349,16 @@ def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_sch
     # print output
     lineage_output_name = output + "/" + output + "_lineage_clusters.csv"
     with open(lineage_output_name, 'w') as lFile:
-        print('Id,Lineage__autocolor', file = lFile)
-        for isolate in lineage_clustering.keys():
-            print(isolate + ',' + str(lineage_clustering[isolate]), file = lFile)
-
+        if qlist is None:
+            print('Id,Lineage__autocolor', file = lFile)
+            for isolate in lineage_clustering.keys():
+                print(isolate + ',' + str(lineage_clustering[isolate]), file = lFile)
+        else:
+            print('Id,Lineage__autocolor,QueryStatus', file = lFile)
+            for isolate in rlist:
+                print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + 'Reference', file = lFile)
+            for isolate in qlist:
+                print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + 'Query', file = lFile)
+
     return 0

From af909d2eb55c31a3f1d54ff2433b0e409c3d5ab8 Mon Sep 17 00:00:00 2001
From: nickjcroucher
Date: Fri, 24 Apr 2020 10:45:07 +0100
Subject: [PATCH 05/46] Documenting functions and updating lineage assignment

---
 PopPUNK/lineage_clustering.py | 133 ++++++++++++++++++++++--------
 1 file changed, 101 insertions(+), 32 deletions(-)

diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py
index e195565f..f4f7c805 100644
--- a/PopPUNK/lineage_clustering.py
+++ b/PopPUNK/lineage_clustering.py
@@ -19,14 +19,27 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n
         lineage given a cluster seed.

     Args:
-        ranked_information
-        nearest_neighbours (dict)
-
+        lineage_clustering (dict)
+            Clustering of existing dataset.
+        lineage_clustering_information (dict)
+            Dict listing isolates by ranked distance from seed.
+        neighbours (nested dict)
+            Pre-calculated neighbour relationships.
         R (int)
-            Maximum rank of neighbours used for clustering.
-
+            Maximum rank of neighbours used for clustering.
+        lineage_index (int)
+            Label of current lineage.
+        seed_isolate (str)
+            Isolate used to initiate next lineage.
+        previous_lineage_clustering (dict)
+            Clustering of existing dataset in previous iteration.
+        null_cluster_value (int)
+            Null cluster value used for unsetting lineage assignments
+            where this may change due to altered neighbour relationships.
+
     Returns:
-        ranked_information
+        lineage_clustering (dict)
+            Assignment of isolates to lineages.
""" # first make all R neighbours of the seed part of the lineage if unclustered @@ -41,11 +54,9 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n # iterate through isolates of same rank for isolate in lineage_clustering_information[rank]: # test if clustered/belonging to a more peripheral cluster -# print("Rank: " + str(rank) + " isolate: " + isolate) if lineage_clustering[isolate] > lineage_index: # get clusters of nearest neighbours isolate_neighbour_clusters = [lineage_clustering[isolate_neighbour] for isolate_neighbour in neighbours[isolate].keys()] -# print('Neighbours: ' + str(neighbours[isolate].keys()) + '\nNeighbour clusters: ' + str(isolate_neighbour_clusters)) # if a nearest neighbour is in this cluster if lineage_index in isolate_neighbour_clusters: # add isolate to lineage @@ -73,7 +84,27 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n return lineage_clustering def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value, lineage_index, lineage_seed): - """ Identifies the isolate used to initiate a cluster + """ Identifies the isolate used to initiate a cluster. + + Args: + lineage_clustering (dict) + Clustering of existing dataset. + row_labels (list of tuples) + Pairs of isolates labelling each distance. + distances (numpy array) + Pairwise distances used for defining relationships. + null_cluster_value (int) + Null cluster value used for unsetting lineage assignments + where this may change due to altered neighbour relationships. + lineage_index (int) + Label of current lineage. + lineage_seed (dict) + Dict of seeds used to initiate pre-existing lineage definitions. + + Returns: + seed_isolate (str) + Isolate to used to initiate next lineage. + """ # variable to return seed_isolate = None @@ -91,7 +122,6 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val clustered_isolates = frozenset([isolate for isolate in lineage_clustering.keys() if lineage_clustering[isolate] != null_cluster_value]) unclustered_pair_indices = [n for n,pair in enumerate(row_labels) if not set(pair).issubset(clustered_isolates)] unclustered_distances = distances[unclustered_pair_indices] - # print("Unclustered distances are: " + str(unclustered_distances)) # start at zero as the minimum if present in the unclustered set minimum_unclustered_distance = 0 # otherwise go through the bother of picking a minimum @@ -114,6 +144,19 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val def get_lineage_clustering_information(seed_isolate, row_labels, distances): """ Generates the ranked distances needed for cluster definition. + + Args: + seed_isolate (str) + Isolate used to initiate lineage. + row_labels (list of tuples) + Pairs of isolates labelling each distance. + distances (numpy array) + Pairwise distances used for defining relationships. + + Returns: + lineage_info (dict) + Dict listing isolates by ranked distance from seed. + """ # data structure lineage_info = defaultdict(list) @@ -131,17 +174,17 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): lineage_info[rank] = [seed_partners[n] for n,r in enumerate(distance_ranks) if r == rank] return lineage_info -def generate_nearest_neighbours(X, row_labels, isolate_list, R): +def generate_nearest_neighbours(distances, row_labels, isolate_list, R): """ Identifies the nearest neighbours from the core genome distances. 
- Args - X (np.array) - n x 2 array of core and accessory distances for n samples. + Args: + distances (numpy array) + Pairwise distances used for defining relationships. row_labels (list of tuples) - List of pairs of isolates + Pairs of isolates labelling each distance. isolate_list (list) - List of isolates + List of isolates for lineage assignment. R (int) Maximum rank of neighbours used for clustering. @@ -159,7 +202,7 @@ def generate_nearest_neighbours(X, row_labels, isolate_list, R): indices = [n for n, (r, q) in enumerate(row_labels) if r == isolate or q == isolate] # get the corresponding distances #ref_distances = np.take(X, np.array(indices, dtype = 'int64')) - ref_distances = np.take(X, indices) + ref_distances = np.take(distances, indices) # get unique distances and pick the R smallest values unique_isolate_distances = np.unique(ref_distances) R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)] @@ -182,7 +225,30 @@ def generate_nearest_neighbours(X, row_labels, isolate_list, R): def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clustering, null_cluster_value): """ Updates the information on nearest neighbours, given a new set of ref-query and query-query distances. - --- really need isolate list? + + Args: + distances (numpy array) + Distances to be used for defining lineages. + row_labels (list of tuples) + Pairs of isolates labelling each distance. + R (int) + Maximum rank of distance used to define nearest neighbours. + qlist (list) + List of queries to be added to existing dataset. + nn (nested dict) + Pre-calculated neighbour relationships. + lineage_clustering (dict) + Clustering of existing dataset. + null_cluster_value (int) + Null cluster value used for unsetting lineage assignments + where this may change due to altered neighbour relationships. + + Returns: + nn (nested dict) + Updated neighbour relationships. + lineage_clustering (dict) + Updated assignment of isolates to lineage. + """ # iterate through isolates and test whether any comparisons with # newly-added queries replace or supplement existing neighbours @@ -229,14 +295,13 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust row_label_index = ref_query_match_indices[n] neighbour = row_label_index[row_label_index][0] if row_label_index[row_label_index][1] == isolate else row_label_index[row_label_index][1] nn[isolate][neighbour] = d -# print('Changed sequences for isolate ' + isolate + '\nOld distances: ' + str(old_nn) + '\nNew distances: ' + str(nn[isolate])) # get nn for query sequences query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) # merge dicts for query in query_nn.keys(): nn[query] = query_nn[query] # return updated dict - return nn + return nn, lineage_clustering def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_scheme = None, use_accessory = False): @@ -249,17 +314,21 @@ def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_sch This should not be subsampled. R (int) Integer specifying the maximum rank of neighbour used - for clustering. + for clustering. Should be changed to int list for hierarchical + clustering. rlist (list) List of reference sequences. qlist (list) List of query sequences. - existing_scheme (?) - Existing lineage clustering model. + existing_scheme (str) + Path to pickle file containing lineage scheme to which isolates + should be added. 
+ use_accessory (bool) + Option to use accessory distances rather than core distances. Returns: - ret_vec - An n-vector with the most likely cluster memberships + completion_status (int) + 0 if successful """ # process distance matrix distance_index = 1 if use_accessory else 0 @@ -296,13 +365,13 @@ def cluster_into_lineages(X, R, output, rlist = None, qlist = None, existing_sch # add new queries to lineage clustering for q in qlist: lineage_clustering[q] = null_cluster_value - neighbours = update_nearest_neighbours(distances, - row_labels, - R, - qlist, - neighbours, - lineage_clustering, - null_cluster_value) + neighbours, lineage_clustering = update_nearest_neighbours(distances, + row_labels, + R, + qlist, + neighbours, + lineage_clustering, + null_cluster_value) # run clustering lineage_index = 1 From 58eb9bbe063da6c1cc1ff0a25d14bbb08e9980da Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 11:25:25 +0100 Subject: [PATCH 06/46] Reconfigure visualisation section to allow for external/lineage visualisation --- PopPUNK/__main__.py | 78 +++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 058b3c9c..9724f58c 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -107,6 +107,8 @@ def get_options(): iGroup.add_argument('--distances', help='Prefix of input pickle of pre-calculated distances') iGroup.add_argument('--external-clustering', help='File with cluster definitions or other labels ' 'generated with any other method.', default=None) + iGroup.add_argument('--viz-lineages', help='CSV with lineage definitions to use for visualisation' + 'rather than strain definitions.', default=None) # output options oGroup = parser.add_argument_group('Output options') @@ -566,6 +568,8 @@ def main(): sys.exit(1) if args.distances is not None and args.ref_db is not None: + + # Initial processing # Load original distances with open(args.distances + ".pkl", 'rb') as pickle_file: rlist, qlist, self = pickle.load(pickle_file) @@ -579,28 +583,8 @@ def main(): except OSError: sys.stderr.write("Cannot create output directory\n") sys.exit(1) - - # identify existing analysis files - model_prefix = args.ref_db - if args.model_dir is not None: - model_prefix = args.model_dir - model = loadClusterFit(model_prefix + "/" + os.path.basename(model_prefix) + '_fit.pkl', - model_prefix + "/" + os.path.basename(model_prefix) + '_fit.npz') - - # Set directories of previous fit - if args.previous_clustering is not None: - prev_clustering = args.previous_clustering - else: - prev_clustering = os.path.dirname(args.distances + ".pkl") - - # Read in network and cluster assignment - genomeNetwork, cluster_file = fetchNetwork(prev_clustering, model, rlist, args.core_only, args.accessory_only) - - # Use external clustering if specified - if args.external_clustering: - cluster_file = args.external_clustering - isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} - + + # Define set/subset to be visualised # extract subset of distances if requested viz_subset = rlist if args.subset is not None: @@ -611,8 +595,9 @@ def main(): # Use the same code as no full_db in assign_query to take a subset dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" - nodes_to_remove = set(genomeNetwork.nodes).difference(viz_subset) - postpruning_combined_seq, newDistMat = prune_distance_matrix(rlist, nodes_to_remove, + #nodes_to_remove = set(genomeNetwork.nodes).difference(viz_subset) + 
isolates_to_remove = set(combined_seq).difference(viz_subset) + postpruning_combined_seq, newDistMat = prune_distance_matrix(rlist, isolates_to_remove, complete_distMat, dists_out) rlist = viz_subset @@ -624,12 +609,40 @@ def main(): except: sys.stderr.write("Isolates in subset not found in existing database\n") assert postpruning_combined_seq == viz_subset + + + + # Either use strain definitions, lineage assignments or external clustering + isolateClustering = {} + # Use external clustering if specified + if args.external_clustering: + cluster_file = args.external_clustering + isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} + elif args.viz_lineages: + cluster_file = args.viz_lineages + isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} + else: + # identify existing analysis files + model_prefix = args.ref_db + if args.model_dir is not None: + model_prefix = args.model_dir + model = loadClusterFit(model_prefix + "/" + os.path.basename(model_prefix) + '_fit.pkl', + model_prefix + "/" + os.path.basename(model_prefix) + '_fit.npz') + + # Set directories of previous fit + if args.previous_clustering is not None: + prev_clustering = args.previous_clustering + else: + prev_clustering = os.path.dirname(args.distances + ".pkl") - # prune the network and dictionary of assignments - genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(viz_subset)) - for clustering_type in isolateClustering: - isolateClustering[clustering_type] = {viz_key: isolateClustering[clustering_type][viz_key] - for viz_key in viz_subset} + # Read in network and cluster assignment + genomeNetwork, cluster_file = fetchNetwork(prev_clustering, model, rlist, args.core_only, args.accessory_only) + + # prune the network and dictionary of assignments + genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(viz_subset)) + for clustering_type in isolateClustering: + isolateClustering[clustering_type] = {viz_key: isolateClustering[clustering_type][viz_key] + for viz_key in viz_subset} # generate selected visualisations if args.microreact: @@ -645,8 +658,11 @@ def main(): outputsForGrapetree(viz_subset, core_distMat, isolateClustering, args.output, args.info_csv, args.rapidnj, overwrite = args.overwrite, microreact = args.microreact) if args.cytoscape: - sys.stderr.write("Writing cytoscape output\n") - outputsForCytoscape(genomeNetwork, isolateClustering, args.output, args.info_csv) + if args.viz_lineages or args.external_clustering: + sys.stderr.write("Can only generate a network output for fitted models\n") + else: + sys.stderr.write("Writing cytoscape output\n") + outputsForCytoscape(genomeNetwork, isolateClustering, args.output, args.info_csv) else: # Cannot read input files From fd490bb87c4f95eb766da1dc81d5ee552f0ca7a6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 20:37:18 +0100 Subject: [PATCH 07/46] Fix parsing errors with queries --- PopPUNK/__main__.py | 140 +++++++++++++++++++++------------- PopPUNK/lineage_clustering.py | 115 ++++++++++++++++++++++------ 2 files changed, 180 insertions(+), 75 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 9724f58c..266ba26e 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -18,6 +18,7 @@ from .sketchlib import no_sketchlib, checkSketchlibLibrary from .lineage_clustering import cluster_into_lineages +from .lineage_clustering import calculateQueryDistances from .network import fetchNetwork from .network import constructNetwork @@ -174,12 +175,15 @@ 
def get_options(): '[default = False]', default=False, action='store_true') queryingGroup.add_argument('--accessory-only', help='Use an accessory-distance only model for assigning queries ' '[default = False]', default=False, action='store_true') + queryingGroup.add_argument('--assign-lineage', help='Assign to lineages rather than strains - can specify ' + 'lineages other than those in the reference database using "--existing-scheme"' + '[default = False]', default=False, action='store_true') # lineage clustering within strains lineagesGroup = parser.add_argument_group('Lineage analysis options') lineagesGroup.add_argument('--R',help='Maximum rank used in lineage clustering [default = 1]', type = int, default = 1) lineagesGroup.add_argument('--use-accessory',help='Use accessory distances for lineage definitions [default = use core distances]', action = 'store_true', default = False) - lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions', default = None) + lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions', type = str, default = None) # plot output faGroup = parser.add_argument_group('Further analysis options') @@ -536,9 +540,9 @@ def main(): # run lineage clustering if self: - cluster_into_lineages(distMat, args.R, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, args.R, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) else: - cluster_into_lineages(distMat, args.R, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, args.R, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) #*******************************# #* *# @@ -552,7 +556,7 @@ def main(): args.threads, args.use_mash, args.mash, args.overwrite, args.plot_fit, args.no_stream, args.max_a_dist, args.model_dir, args.previous_clustering, args.external_clustering, args.core_only, args.accessory_only, args.phandango, args.grapetree, args.info_csv, - args.rapidnj, args.perplexity) + args.rapidnj, args.perplexity, args.assign_lineage, args.existing_scheme, args.R, args.use_accessory) #******************************# #* *# @@ -597,7 +601,10 @@ def main(): dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" #nodes_to_remove = set(genomeNetwork.nodes).difference(viz_subset) isolates_to_remove = set(combined_seq).difference(viz_subset) - postpruning_combined_seq, newDistMat = prune_distance_matrix(rlist, isolates_to_remove, + postpruning_combined_seq = sorted(viz_subset) + newDistMat = complete_distMat + if len(isolates_to_remove) > 0: + postpruning_combined_seq, newDistMat = prune_distance_matrix(rlist, isolates_to_remove, complete_distMat, dists_out) rlist = viz_subset @@ -610,8 +617,6 @@ def main(): sys.stderr.write("Isolates in subset not found in existing database\n") assert postpruning_combined_seq == viz_subset - - # Either use strain definitions, lineage assignments or external clustering isolateClustering = {} # Use external clustering if specified @@ -681,7 +686,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances kmers, sketch_sizes, ignore_length, estimated_length, threads, use_mash, mash, overwrite, 
plot_fit, no_stream, max_a_dist, model_dir, previous_clustering, external_clustering, core_only, accessory_only, phandango, grapetree, - info_csv, rapidnj, perplexity): + info_csv, rapidnj, perplexity, assign_lineage, existing_scheme, R, use_accessory): """Code for assign query mode. Written as a separate function so it can be called by pathogen.watch API """ @@ -740,58 +745,80 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances threads = threads) qcPass = qcDistMat(distMat, refList, queryList, max_a_dist) - # Assign these distances as within or between - model_prefix = ref_db - if model_dir is not None: - model_prefix = model_dir - model = loadClusterFit(model_prefix + "/" + os.path.basename(model_prefix) + '_fit.pkl', - model_prefix + "/" + os.path.basename(model_prefix) + '_fit.npz') - queryAssignments = model.assign(distMat) - - # set model prefix - model_prefix = ref_db - if model_dir is not None: - model_prefix = model_dir - - # Set directories of previous fit - if previous_clustering is not None: - prev_clustering = previous_clustering - else: - prev_clustering = model_prefix - - # Load the network based on supplied options - genomeNetwork, old_cluster_file = fetchNetwork(prev_clustering, model, refList, - core_only, accessory_only) - - # Assign clustering by adding to network - ordered_queryList, query_distMat = addQueryToNetwork(dbFuncs, refList, q_files, - genomeNetwork, kmers, estimated_length, queryAssignments, model, output, update_db, - use_mash, threads) - - # if running simple query - print_full_clustering = False - if update_db: - print_full_clustering = True - isolateClustering = {'combined': printClusters(genomeNetwork, output + "/" + os.path.basename(output), - old_cluster_file, external_clustering, print_full_clustering)} - - # update_db like no full_db - if update_db: + # Calculate query-query distances +################################################################################################################### + ordered_queryList = [] + + # Assign to strains or lineages, as requested + if assign_lineage: + + # Assign lineages by calculating query-query information + ordered_queryList, query_distMat = calculateQueryDistances(dbFuncs, refList, q_files, + kmers, estimated_length, output, use_mash, threads) +# +# # ASSIGN LINEAGE +# expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' +# if existing_scheme: +# expected_lineage_name = existing_scheme +# combined_distances = np.concatenate((distMat,query_distMat),axis = 0) +# isolateClustering = cluster_into_lineages(combined_distances, R, output, rlist = refList, qlist = ordered_queryList, use_accessory = use_accessory, existing_scheme = expected_lineage_name) +################################################################################################################### + if not assign_lineage: + # Assign these distances as within or between strain + model_prefix = ref_db + if model_dir is not None: + model_prefix = model_dir + model = loadClusterFit(model_prefix + "/" + os.path.basename(model_prefix) + '_fit.pkl', + model_prefix + "/" + os.path.basename(model_prefix) + '_fit.npz') + queryAssignments = model.assign(distMat) + + # set model prefix + model_prefix = ref_db + if model_dir is not None: + model_prefix = model_dir + + # Set directories of previous fit + if previous_clustering is not None: + prev_clustering = previous_clustering + else: + prev_clustering = model_prefix + + # Load the network based on supplied options + genomeNetwork, old_cluster_file = 
fetchNetwork(prev_clustering, model, refList, + core_only, accessory_only) + + # Assign clustering by adding to network + ordered_queryList, query_distMat = addQueryToNetwork(dbFuncs, refList, q_files, + genomeNetwork, kmers, estimated_length, queryAssignments, model, output, update_db, + use_mash, threads) + + # if running simple query + print_full_clustering = False + if update_db: + print_full_clustering = True + isolateClustering = {'combined': printClusters(genomeNetwork, output + "/" + os.path.basename(output), + old_cluster_file, external_clustering, print_full_clustering)} + + # Update DB as requested + if update_db or assign_lineage: + + # Check new sequences pass QC before adding them if not qcPass: sys.stderr.write("Queries contained outlier distances, not updating database\n") else: sys.stderr.write("Updating reference database to " + output + "\n") # Update the network + ref list - if full_db is False: + # only update network if assigning to strains + if full_db is False or assign_lineage is False: mashOrder = refList + ordered_queryList newRepresentativesNames, newRepresentativesFile = extractReferences(genomeNetwork, mashOrder, output, refList) genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(newRepresentativesNames)) newQueries = [x for x in ordered_queryList if x in frozenset(newRepresentativesNames)] # intersection that maintains order + nx.write_gpickle(genomeNetwork, output + "/" + os.path.basename(output) + '_graph.gpickle') else: newQueries = ordered_queryList - nx.write_gpickle(genomeNetwork, output + "/" + os.path.basename(output) + '_graph.gpickle') - + # Update the sketch database if newQueries != queryList and use_mash: tmpRefFile = writeTmpFile(newQueries) @@ -800,7 +827,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances os.remove(tmpRefFile) # With mash, this is the reduced DB constructed, # with sketchlib, all sketches - joinDBs(ref_db, output, output) + joinDBs(ref_db, output, output) # Update distance matrices with all calculated distances if distances == None: @@ -812,9 +839,15 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances ordered_queryList, distMat, query_distMat) complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) + if assign_lineage: + expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' + if existing_scheme is not None: + expected_lineage_name = existing_scheme + isolateClustering = cluster_into_lineages(complete_distMat, R, output, refList, ordered_queryList, expected_lineage_name, use_accessory) + # Prune distances to references only, if not full db dists_out = output + "/" + os.path.basename(output) + ".dists" - if full_db is False: + if full_db is False and assign_lineage is False: # could also have newRepresentativesNames in this diff (should be the same) - but want # to ensure consistency with the network in case of bad input/bugs nodes_to_remove = set(combined_seq).difference(genomeNetwork.nodes) @@ -845,8 +878,11 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances outputsForGrapetree(combined_seq, core_distMat, isolateClustering, output, info_csv, rapidnj, queryList = ordered_queryList, overwrite = overwrite, microreact = microreact) if cytoscape: - sys.stderr.write("Writing cytoscape output\n") - outputsForCytoscape(genomeNetwork, isolateClustering, output, info_csv, ordered_queryList) + if assign_lineage: + sys.stderr.write("Cannot generate a cytoscape network from a lineage 
assignment") + else: + sys.stderr.write("Writing cytoscape output\n") + outputsForCytoscape(genomeNetwork, isolateClustering, output, info_csv, ordered_queryList) else: sys.stderr.write("Need to provide both a reference database with --ref-db and " diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index f4f7c805..face1946 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -13,6 +13,7 @@ # import poppunk package from .utils import iterDistRows +from .utils import readRfile def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): """ Identifies isolates corresponding to a particular @@ -253,12 +254,16 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust # iterate through isolates and test whether any comparisons with # newly-added queries replace or supplement existing neighbours + # data structures for altered entries + nn_new = {} # pre-process to extract ref-query distances first query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] query_row_names = [row_labels[i] for i in query_match_indices] query_distances = np.take(distances, query_match_indices) # iterate through references to update existing entries - for isolate in nn.keys(): + existing_isolates = nn.keys() + + for isolate in existing_isolates: # get max distance to existing neighbours max_distance_to_ref = max(nn[isolate].values()) # get min distance to queries @@ -272,39 +277,41 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust for neighbour in nn[isolate].keys(): lineage_clustering[neighbour] = null_cluster_value # update neighbour information - nn_old = nn.pop(isolate) - nn[isolate] = {} + nn_old = nn[isolate] + nn_new[isolate] = {} # old neighbours from existing dict - old_distances = list(set(nn_old.values())) + old_distances = list(set(nn[isolate].values())) # new neighbours from new distances in X new_distances = np.unique(ref_query_distances) # combine old and new distances - combined_ref_distances = (nn_old.values(),ref_query_distances.tolist()) + combined_ref_distances = list(nn[isolate].values()) + ref_query_distances.tolist() # take top ranked distances from both ranked_unique_combined_ref_distances = sorted(set(combined_ref_distances))[:R] for distance in ranked_unique_combined_ref_distances: # add old neighbours back in if distance in old_distances: - for neighbour,d in nn_old.items(): + for neighbour,d in nn[isolate].items(): if d == distance: - nn[isolate][neighbour] = distance + nn_new[isolate][neighbour] = distance # add new neighbours if distance in new_distances: for n,d in enumerate(ref_query_distances): if d == distance: row_label_index = ref_query_match_indices[n] - neighbour = row_label_index[row_label_index][0] if row_label_index[row_label_index][1] == isolate else row_label_index[row_label_index][1] - nn[isolate][neighbour] = d + neighbour = row_labels[row_label_index][0] if row_labels[row_label_index][1] == isolate else row_labels[row_label_index][1] + nn_new[isolate][neighbour] = d # get nn for query sequences query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) # merge dicts for query in query_nn.keys(): nn[query] = query_nn[query] + for updated in nn_new.keys(): + nn[updated] = nn_new[updated] # return updated dict return nn, lineage_clustering -def cluster_into_lineages(X, R, output, rlist = None, qlist = 
None, existing_scheme = None, use_accessory = False):
+def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False):
     """ Clusters isolates into lineages based on their
     relative distances.
 
@@ -335,16 +342,16 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e
     distances = X[:,distance_index]
 
     # determine whether ref-ref or ref-query analysis
-    row_labels = []
-    isolate_list = []
-    if qlist is None:
-        sys.stderr.write("Identifying lineages using reference sequences\n")
-        isolate_list = rlist
-        row_labels = list(iter(iterDistRows(rlist, rlist, self = True)))
-    else:
-        sys.stderr.write("Identifying lineages from using reference and query sequences\n")
-        row_labels = list(iter(iterDistRows(rlist, qlist, self = False)))
-        isolate_list = rlist + qlist
+    isolate_list = rlist + qlist
+    row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True)))
+#    if qlist is None:
+#        sys.stderr.write("Identifying lineages using reference sequences\n")
+#        isolate_list = rlist
+#        row_labels = list(iter(iterDistRows(rlist, rlist, self = True)))
+#    else:
+#        sys.stderr.write("Identifying lineages from using reference and query sequences\n")
+#        row_labels = list(iter(iterDistRows(rlist, qlist, self = False)))
+#        isolate_list = rlist + qlist
 
     # determine whether novel analysis or modifying existing analysis
     neighbours = {}
@@ -360,7 +367,8 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e
                                                  R)
     else:
         with open(existing_scheme, 'rb') as pickle_file:
-            lineage_clustering, lineage_seed, neighbours = pickle.load(pickle_file)
+            sys.stderr.write('Loading from stored file ' + str(existing_scheme) + '\n')
+            lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file)
         previous_lineage_clustering = lineage_clustering # use for detecting changes
         # add new queries to lineage clustering
         for q in qlist:
@@ -413,7 +421,7 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e
 
     # store output
     with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file:
-        pickle.dump([lineage_clustering, lineage_seed, neighbours], pickle_file)
+        pickle.dump([lineage_clustering, lineage_seed, neighbours, R], pickle_file)
 
     # print output
     lineage_output_name = output + "/" + output + "_lineage_clusters.csv"
@@ -429,5 +437,66 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e
         for isolate in qlist:
             print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + 'Query', file = lFile)
 
-    return 0
+    return lineage_clustering
+
+def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length,
+                            queryDB, use_mash = False, threads = 1):
+    """Calculates distances between query sequences, so that they can be
+    assigned to lineages alongside existing database sequences.
+ + Args: + dbFuncs (list) + List of backend functions from :func:`~PopPUNK.utils.setupDBFuncs` + rlist (list) + List of reference names + qfile (str) + File containing queries + kmers (list) + List of k-mer sizes + estimated_length (int) + Estimated length of genome, if not calculated from data + queryDB (str) + Query database location + use_mash (bool) + Use the mash backend + threads (int) + Number of threads to use if new db created + (default = 1) + Returns: + qlist1 (list) + Ordered list of queries + distMat (numpy.array) + Query-query distances + """ + + constructDatabase = dbFuncs['constructDatabase'] + queryDatabase = dbFuncs['queryDatabase'] + readDBParams = dbFuncs['readDBParams'] + + # These are returned + qlist1 = None + distMat = None + + # Set up query names + qList, qSeqs = readRfile(qfile, oneSeq = use_mash) + queryFiles = dict(zip(qList, qSeqs)) + if use_mash == True: + rNames = None + qNames = qSeqs + else: + rNames = qList + qNames = rNames + + # Calculate all query-query distances too, if updating database + qlist1, qlist2, distMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = queryDB, + queryPrefix = queryDB, + klist = kmers, + self = True, + number_plot_fits = 0, + threads=threads) + + return qlist1, distMat + From 95ec785104385258eea1055073e5fb7a2d1ad977 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 21:15:18 +0100 Subject: [PATCH 08/46] Fix visualisation from query output --- PopPUNK/__main__.py | 2 +- PopPUNK/lineage_clustering.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 266ba26e..5956fdd3 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -843,7 +843,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme - isolateClustering = cluster_into_lineages(complete_distMat, R, output, refList, ordered_queryList, expected_lineage_name, use_accessory) + isolateClustering = {'combined': cluster_into_lineages(complete_distMat, R, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory)} # Prune distances to references only, if not full db dists_out = output + "/" + os.path.basename(output) + ".dists" diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index face1946..87e8f954 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -326,7 +326,8 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e rlist (list) List of reference sequences. qlist (list) - List of query sequences. + List of query sequences being added to an existing clustering. + Should be included within rlist. existing_scheme (str) Path to pickle file containing lineage scheme to which isolates should be added. 
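[Illustrative aside, not part of the patch] The existing_scheme argument above names a pickle whose layout, as written by cluster_into_lineages() in this series, is the list [lineage_clustering, lineage_seed, neighbours, R], saved to <output>/<output>_lineageClusters.pkl. A minimal sketch of reading such a scheme back, assuming only that layout; the path in the usage comment is hypothetical:

    import pickle

    def load_lineage_scheme(pickle_path):
        """Return the stored clustering, seed isolates, neighbours and rank R."""
        with open(pickle_path, 'rb') as pickle_file:
            lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file)
        return lineage_clustering, lineage_seed, neighbours, R

    # e.g. scheme = load_lineage_scheme('lineages/lineages_lineageClusters.pkl')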
@@ -342,7 +343,7 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e distances = X[:,distance_index] # determine whether ref-ref or ref-query analysis - isolate_list = rlist + qlist + isolate_list = rlist row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) # if qlist is None: # sys.stderr.write("Identifying lineages using reference sequences\n") @@ -367,7 +368,6 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e R) else: with open(existing_scheme, 'rb') as pickle_file: - sys.stderr.write('Loading from stored file ' + str(existing_scheme) + '\n') lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file) previous_lineage_clustering = lineage_clustering # use for detecting changes # add new queries to lineage clustering @@ -433,9 +433,10 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e else: print('Id,Lineage__autocolor,QueryStatus', file = lFile) for isolate in rlist: - print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + 'Reference', file = lFile) - for isolate in qlist: - print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + 'Query', file = lFile) + status = 'Reference' + if isolate in qlist: + status = 'Query' + print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + status, file = lFile) return lineage_clustering From 2ea6ccd07ea92c829442d2d7e322120544925456 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Fri, 24 Apr 2020 21:57:43 +0100 Subject: [PATCH 09/46] Fix generate-viz mode --- PopPUNK/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 5956fdd3..ca46d34b 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -601,7 +601,7 @@ def main(): dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" #nodes_to_remove = set(genomeNetwork.nodes).difference(viz_subset) isolates_to_remove = set(combined_seq).difference(viz_subset) - postpruning_combined_seq = sorted(viz_subset) + postpruning_combined_seq = viz_subset newDistMat = complete_distMat if len(isolates_to_remove) > 0: postpruning_combined_seq, newDistMat = prune_distance_matrix(rlist, isolates_to_remove, From 2d179f4969ae00784c85af366cf0ba2b752c7fd0 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 26 Apr 2020 19:25:02 +0100 Subject: [PATCH 10/46] Reconfigure to allow multiprocessing --- PopPUNK/lineage_clustering.py | 141 +++++++++++++++++++++++----------- 1 file changed, 98 insertions(+), 43 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 87e8f954..7ae337c4 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -335,9 +335,16 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e Option to use accessory distances rather than core distances. Returns: - completion_status (int) - 0 if successful + lineage_clustering (dict) + Assignment of each isolate to a cluster. + lineage_seed (dict) + Seed isolate used to initiate each cluster. + neighbours (nested dict) + Neighbour relationships between isolates for R. + R (int) + R used to generate neigbours and clustering. 
""" + # process distance matrix distance_index = 1 if use_accessory else 0 distances = X[:,distance_index] @@ -345,46 +352,112 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e # determine whether ref-ref or ref-query analysis isolate_list = rlist row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) -# if qlist is None: -# sys.stderr.write("Identifying lineages using reference sequences\n") -# isolate_list = rlist -# row_labels = list(iter(iterDistRows(rlist, rlist, self = True))) -# else: -# sys.stderr.write("Identifying lineages from using reference and query sequences\n") -# row_labels = list(iter(iterDistRows(rlist, qlist, self = False))) -# isolate_list = rlist + qlist # determine whether novel analysis or modifying existing analysis + use_existing = False neighbours = {} lineage_seed = {} null_cluster_value = len(isolate_list) + 1 lineage_clustering = {i:null_cluster_value for i in isolate_list} previous_lineage_clustering = lineage_clustering - if existing_scheme is None: - neighbours = generate_nearest_neighbours(distances, - row_labels, - isolate_list, - R) - else: + if existing_scheme is not None: with open(existing_scheme, 'rb') as pickle_file: lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file) previous_lineage_clustering = lineage_clustering # use for detecting changes + use_existing = True # add new queries to lineage clustering for q in qlist: lineage_clustering[q] = null_cluster_value - neighbours, lineage_clustering = update_nearest_neighbours(distances, - row_labels, - R, - qlist, - neighbours, - lineage_clustering, - null_cluster_value) + + # run clustering for an individual R + lineage_clustering, lineage_seed, neighbours, R = run_clustering_for_R(R, + distances = distances, + neighbours = neighbours, + lineage_clustering = lineage_clustering, + lineage_seed = lineage_seed, + null_cluster_value = null_cluster_value, + qlist = qlist, + use_existing = use_existing) + + # store output + with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: + pickle.dump([lineage_clustering, lineage_seed, neighbours, R], pickle_file) + + # print output + lineage_output_name = output + "/" + output + "_lineage_clusters.csv" + with open(lineage_output_name, 'w') as lFile: + if qlist is None: + print('Id,Lineage__autocolor', file = lFile) + for isolate in lineage_clustering.keys(): + print(isolate + ',' + str(lineage_clustering[isolate]), file = lFile) + else: + print('Id,Lineage__autocolor,QueryStatus', file = lFile) + for isolate in rlist: + status = 'Reference' + if isolate in qlist: + status = 'Query' + print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + status, file = lFile) + + return lineage_clustering, lineage_seed, neighbours, R +def run_clustering_for_R(R, distances = None, neighbours = None, lineage_clustering = None, + lineage_seed = None, null_cluster_value = None, qlist = None, use_existing = False): + """ Clusters isolates into lineages based on their + relative distances using a single R to enable + parallelisation. + + Args: + R (int) + Integer specifying the maximum rank of neighbour used + for clustering. Should be changed to int list for hierarchical + clustering. + distances (numpy array) + Sorted distances used for analysis. + neighbours (nested dict) + Neighbour relationships between isolates for R. + lineage_clustering (dict) + Assignment of each isolate to a cluster. + lineage_seed (dict) + Seed isolate used to initiate each cluster. 
+ null_cluster_value (int) + Used to denote as-yet-unclustered isolates. + qlist (list) + List of query sequences being added to an existing clustering. + Should be included within rlist. + use_existing (bool) + Whether to extend a previously generated analysis or not. + + Returns: + lineage_clustering (dict) + Assignment of each isolate to a cluster. + lineage_seed (dict) + Seed isolate used to initiate each cluster. + neighbours (nested dict) + Neighbour relationships between isolates for R. + R (int) + R used to generate neigbours and clustering. + """ + + # identify neighbours + if existing_scheme: + neighbours, lineage_clustering = update_nearest_neighbours(distances, + row_labels, + R, + qlist, + neighbours, + lineage_clustering, + null_cluster_value) + else: + neighbours = generate_nearest_neighbours(distances, + row_labels, + isolate_list, + R) + # run clustering lineage_index = 1 while null_cluster_value in lineage_clustering.values(): - + # get seed isolate based on minimum pairwise distances seed_isolate = get_seed_isolate(lineage_clustering, # need to add in lineage seed hash row_labels, @@ -419,25 +492,7 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e # increment index for next lineage lineage_index = lineage_index + 1 - # store output - with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: - pickle.dump([lineage_clustering, lineage_seed, neighbours, R], pickle_file) - - # print output - lineage_output_name = output + "/" + output + "_lineage_clusters.csv" - with open(lineage_output_name, 'w') as lFile: - if qlist is None: - print('Id,Lineage__autocolor', file = lFile) - for isolate in lineage_clustering.keys(): - print(isolate + ',' + str(lineage_clustering[isolate]), file = lFile) - else: - print('Id,Lineage__autocolor,QueryStatus', file = lFile) - for isolate in rlist: - status = 'Reference' - if isolate in qlist: - status = 'Query' - print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + status, file = lFile) - + # return clustering return lineage_clustering def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, From 13006ec8ab0be68d0ca117d4361dedc3701633ce Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 26 Apr 2020 19:57:30 +0100 Subject: [PATCH 11/46] Enable multiple ranks to be used --- PopPUNK/__main__.py | 24 ++++------ PopPUNK/lineage_clustering.py | 87 +++++++++++++++++++---------------- 2 files changed, 58 insertions(+), 53 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index ca46d34b..1200ef10 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -527,7 +527,11 @@ def main(): sys.stderr.write("Checking options here\n\n") if args.lineage_clustering: sys.stderr.write("Mode: Identifying lineages within a clade\n\n") - + + # process rankings to use + rank_list = sorted(args.R.split(',')) + + # load distances distances = None if args.distances is not None: distances = args.distances @@ -535,14 +539,13 @@ def main(): sys.stderr.write("Need to provide an input set of distances with --distances\n\n") sys.exit(1) - # load distance matrix refList, queryList, self, distMat = readPickle(distances) # run lineage clustering if self: - isolateClustering = cluster_into_lineages(distMat, args.R, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = 
args.existing_scheme) else: - isolateClustering = cluster_into_lineages(distMat, args.R, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) #*******************************# #* *# @@ -746,7 +749,6 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances qcPass = qcDistMat(distMat, refList, queryList, max_a_dist) # Calculate query-query distances -################################################################################################################### ordered_queryList = [] # Assign to strains or lineages, as requested @@ -755,14 +757,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances # Assign lineages by calculating query-query information ordered_queryList, query_distMat = calculateQueryDistances(dbFuncs, refList, q_files, kmers, estimated_length, output, use_mash, threads) -# -# # ASSIGN LINEAGE -# expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' -# if existing_scheme: -# expected_lineage_name = existing_scheme -# combined_distances = np.concatenate((distMat,query_distMat),axis = 0) -# isolateClustering = cluster_into_lineages(combined_distances, R, output, rlist = refList, qlist = ordered_queryList, use_accessory = use_accessory, existing_scheme = expected_lineage_name) -################################################################################################################### + if not assign_lineage: # Assign these distances as within or between strain model_prefix = ref_db @@ -840,10 +835,11 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) if assign_lineage: + rank_list = sorted(R.split(',')) expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme - isolateClustering = {'combined': cluster_into_lineages(complete_distMat, R, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory)} + isolateClustering = {'combined': cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory)} # Prune distances to references only, if not full db dists_out = output + "/" + os.path.basename(output) + ".dists" diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 7ae337c4..f41dcf42 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -10,6 +10,7 @@ from scipy.stats import rankdata from collections import defaultdict import pickle +from multiprocessing import Pool, Lock # import poppunk package from .utils import iterDistRows @@ -311,7 +312,7 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust return nn, lineage_clustering -def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False): +def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False): """ Clusters isolates into lineages based on their relative distances. 
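[Illustrative aside, not part of the patch] In this patch the ranks are parsed with rank_list = sorted(args.R.split(',')), which orders the comma-separated values as strings; a later patch in the series converts them to int before sorting. A minimal sketch of why that matters, using a hypothetical --R value:

    args_R = "2,10,1"                                        # hypothetical input
    assert sorted(args_R.split(',')) == ['1', '10', '2']     # lexicographic order
    rank_list = sorted(int(x) for x in args_R.split(','))    # numeric order
    assert rank_list == [1, 2, 10]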
@@ -319,10 +320,9 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e X (np.array) n x 2 array of core and accessory distances for n samples. This should not be subsampled. - R (int) - Integer specifying the maximum rank of neighbour used - for clustering. Should be changed to int list for hierarchical - clustering. + rank_list (list of int) + Integers specifying the maximum rank of neighbours used + for clustering. rlist (list) List of reference sequences. qlist (list) @@ -335,17 +335,14 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e Option to use accessory distances rather than core distances. Returns: - lineage_clustering (dict) - Assignment of each isolate to a cluster. - lineage_seed (dict) - Seed isolate used to initiate each cluster. - neighbours (nested dict) - Neighbour relationships between isolates for R. - R (int) - R used to generate neigbours and clustering. + combined (dict) + Assignment of each isolate to clusters by all ranks used. """ # process distance matrix + # - this should be sorted (PyTorch allows this to be done on GPUs) + # - then the functions can be reimplemented to run faster on a + # sorted list distance_index = 1 if use_accessory else 0 distances = X[:,distance_index] @@ -357,49 +354,63 @@ def cluster_into_lineages(X, R = 1, output = None, rlist = None, qlist = None, e use_existing = False neighbours = {} lineage_seed = {} + null_cluster_value = len(isolate_list) + 1 - lineage_clustering = {i:null_cluster_value for i in isolate_list} - previous_lineage_clustering = lineage_clustering + for R in rank_list: + lineage_clustering[R] = {i:null_cluster_value for i in isolate_list} + previous_lineage_clustering[R] = lineage_clustering[R] + lineage_seed[R] = {} if existing_scheme is not None: with open(existing_scheme, 'rb') as pickle_file: lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file) - previous_lineage_clustering = lineage_clustering # use for detecting changes + previous_lineage_clustering[R] = lineage_clustering[R] # use for detecting changes use_existing = True # add new queries to lineage clustering for q in qlist: lineage_clustering[q] = null_cluster_value # run clustering for an individual R - lineage_clustering, lineage_seed, neighbours, R = run_clustering_for_R(R, - distances = distances, - neighbours = neighbours, - lineage_clustering = lineage_clustering, - lineage_seed = lineage_seed, - null_cluster_value = null_cluster_value, - qlist = qlist, - use_existing = use_existing) + # - this can be parallelised across R using multiprocessing.Pool + for R in rank_list: + lineage_clustering[R], lineage_seed[R], neighbours[R] = run_clustering_for_R(R, + distances = distances, + neighbours = neighbours[R], + lineage_clustering = lineage_clustering[R], + lineage_seed = lineage_seed[R], + null_cluster_value = null_cluster_value, + qlist = qlist, + use_existing = use_existing) # store output + # - this needs to be per R with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: - pickle.dump([lineage_clustering, lineage_seed, neighbours, R], pickle_file) + pickle.dump([lineage_clustering, lineage_seed, neighbours, rank_list], pickle_file) # print output + combined = {} lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: - if qlist is None: - print('Id,Lineage__autocolor', file = lFile) - for isolate in lineage_clustering.keys(): - print(isolate + ',' + str(lineage_clustering[isolate]), 
file = lFile) - else: - print('Id,Lineage__autocolor,QueryStatus', file = lFile) - for isolate in rlist: - status = 'Reference' + print('Id', file = lFile) + for R in rank_list: + print(',Lineage_R' + str(R) + '__autocolor', file = lFile) + if qlist is not None: + print(',Status', file = lFile) + for isolate in lineage_clustering[R].keys(): + print(isolate, file = lFile) + for R in rank_list: + print(',' + str(lineage_clustering[R][isolate]), file = lFile) + if len(combined[isolate]) == 0: + combined[isolate] = lineage_clustering[R][isolate] + else: + combined[isolate] = combined[isolate] + '-' + lineage_clustering[R][isolate] + if qlist is not None: if isolate in qlist: - status = 'Query' - print(isolate + ',' + str(lineage_clustering[isolate]) + ',' + status, file = lFile) + print(',Added',file = lFile) + else: + print(',Existing',file = lFile) - return lineage_clustering, lineage_seed, neighbours, R + return combined[R] def run_clustering_for_R(R, distances = None, neighbours = None, lineage_clustering = None, lineage_seed = None, null_cluster_value = None, qlist = None, use_existing = False): @@ -435,8 +446,6 @@ def run_clustering_for_R(R, distances = None, neighbours = None, lineage_cluster Seed isolate used to initiate each cluster. neighbours (nested dict) Neighbour relationships between isolates for R. - R (int) - R used to generate neigbours and clustering. """ # identify neighbours @@ -493,7 +502,7 @@ def run_clustering_for_R(R, distances = None, neighbours = None, lineage_cluster lineage_index = lineage_index + 1 # return clustering - return lineage_clustering + return lineage_clustering, lineage_seed, neighbours def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, queryDB, use_mash = False, threads = 1): From dce1ab5737392f089744d9c1f9254f2f0093a62a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 27 Apr 2020 08:19:19 +0100 Subject: [PATCH 12/46] Improve printing from lineage querying --- PopPUNK/__main__.py | 6 ++-- PopPUNK/lineage_clustering.py | 55 +++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 1200ef10..520ddec6 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -181,7 +181,7 @@ def get_options(): # lineage clustering within strains lineagesGroup = parser.add_argument_group('Lineage analysis options') - lineagesGroup.add_argument('--R',help='Maximum rank used in lineage clustering [default = 1]', type = int, default = 1) + lineagesGroup.add_argument('--R',help='Comma separated list of ranks used in lineage clustering [default = 1]', type = str, default = "1") lineagesGroup.add_argument('--use-accessory',help='Use accessory distances for lineage definitions [default = use core distances]', action = 'store_true', default = False) lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions', type = str, default = None) @@ -529,7 +529,7 @@ def main(): sys.stderr.write("Mode: Identifying lineages within a clade\n\n") # process rankings to use - rank_list = sorted(args.R.split(',')) + rank_list = sorted([int(x) for x in args.R.split(',')]) # load distances distances = None @@ -835,7 +835,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) if assign_lineage: - rank_list = sorted(R.split(',')) + rank_list = sorted([int(x) for x in R.split(',')]) expected_lineage_name = ref_db + '/' + 
ref_db + '_lineageClusters.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index f41dcf42..051573e2 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -207,11 +207,14 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): ref_distances = np.take(distances, indices) # get unique distances and pick the R smallest values unique_isolate_distances = np.unique(ref_distances) - R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)] + R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)][:R] # get threshold distance for neighbour definition - np.maximum fails for R=1 threshold_distance = R_lowest_distances[0] if R > 1: - threshold_distance = np.maximum(R_lowest_distances) + try: + threshold_distance = np.amax(R_lowest_distances) + except: + sys.stderr.write('Problem with: ' + str(R_lowest_distances) + '\n') # identify all distances below this threshold and store relative to index for i,d in enumerate(ref_distances): if d <= threshold_distance: @@ -354,31 +357,38 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis use_existing = False neighbours = {} lineage_seed = {} + lineage_clustering = {} + previous_lineage_clustering = {} null_cluster_value = len(isolate_list) + 1 for R in rank_list: lineage_clustering[R] = {i:null_cluster_value for i in isolate_list} previous_lineage_clustering[R] = lineage_clustering[R] lineage_seed[R] = {} + neighbours[R] = {} if existing_scheme is not None: with open(existing_scheme, 'rb') as pickle_file: - lineage_clustering, lineage_seed, neighbours, R = pickle.load(pickle_file) + lineage_clustering, lineage_seed, neighbours, rank_list = pickle.load(pickle_file) previous_lineage_clustering[R] = lineage_clustering[R] # use for detecting changes use_existing = True # add new queries to lineage clustering for q in qlist: - lineage_clustering[q] = null_cluster_value + for R in rank_list: + lineage_clustering[R][q] = null_cluster_value # run clustering for an individual R # - this can be parallelised across R using multiprocessing.Pool for R in rank_list: lineage_clustering[R], lineage_seed[R], neighbours[R] = run_clustering_for_R(R, distances = distances, + row_labels = row_labels, neighbours = neighbours[R], lineage_clustering = lineage_clustering[R], + previous_lineage_clustering = previous_lineage_clustering[R], lineage_seed = lineage_seed[R], null_cluster_value = null_cluster_value, + isolate_list = isolate_list, qlist = qlist, use_existing = use_existing) @@ -391,29 +401,30 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis combined = {} lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: - print('Id', file = lFile) + lFile.write('Id') for R in rank_list: - print(',Lineage_R' + str(R) + '__autocolor', file = lFile) + lFile.write(',Lineage_R' + str(R) + '__autocolor') if qlist is not None: - print(',Status', file = lFile) + lFile.write(',Status\n') for isolate in lineage_clustering[R].keys(): - print(isolate, file = lFile) + lFile.write(isolate) for R in rank_list: - print(',' + str(lineage_clustering[R][isolate]), file = lFile) - if len(combined[isolate]) == 0: - combined[isolate] = lineage_clustering[R][isolate] - else: - combined[isolate] = combined[isolate] + '-' + lineage_clustering[R][isolate] + lFile.write(',' 
+ str(lineage_clustering[R][isolate])) + if isolate in combined.keys(): + combined[isolate] = combined[isolate] + '-' + str(lineage_clustering[R][isolate]) + else: + combined[isolate] = str(lineage_clustering[R][isolate]) if qlist is not None: if isolate in qlist: - print(',Added',file = lFile) + lFile.write(',Added\n') else: - print(',Existing',file = lFile) + lFile.write(',Existing\n') - return combined[R] + return combined -def run_clustering_for_R(R, distances = None, neighbours = None, lineage_clustering = None, - lineage_seed = None, null_cluster_value = None, qlist = None, use_existing = False): +def run_clustering_for_R(R, distances = None, row_labels = None, neighbours = None, lineage_clustering = None, + previous_lineage_clustering = None, lineage_seed = None, null_cluster_value = None, + isolate_list = None, qlist = None, use_existing = False): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. @@ -425,14 +436,20 @@ def run_clustering_for_R(R, distances = None, neighbours = None, lineage_cluster clustering. distances (numpy array) Sorted distances used for analysis. + row_labels (list of tuples) + Pairs of isolates labelling each distance. neighbours (nested dict) Neighbour relationships between isolates for R. lineage_clustering (dict) Assignment of each isolate to a cluster. + previous_lineage_clustering (dict) + Assignment of each isolate to a cluster before updating. lineage_seed (dict) Seed isolate used to initiate each cluster. null_cluster_value (int) Used to denote as-yet-unclustered isolates. + isolate_list (list) + List of isolates for lineage assignment. qlist (list) List of query sequences being added to an existing clustering. Should be included within rlist. @@ -449,7 +466,7 @@ def run_clustering_for_R(R, distances = None, neighbours = None, lineage_cluster """ # identify neighbours - if existing_scheme: + if use_existing: neighbours, lineage_clustering = update_nearest_neighbours(distances, row_labels, R, From 035177284dec046f57c898ae808c1344d1cb8428 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 27 Apr 2020 08:53:27 +0100 Subject: [PATCH 13/46] More detailed printing of clustering information --- PopPUNK/lineage_clustering.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 051573e2..54630829 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -363,20 +363,22 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis null_cluster_value = len(isolate_list) + 1 for R in rank_list: lineage_clustering[R] = {i:null_cluster_value for i in isolate_list} - previous_lineage_clustering[R] = lineage_clustering[R] lineage_seed[R] = {} neighbours[R] = {} if existing_scheme is not None: with open(existing_scheme, 'rb') as pickle_file: lineage_clustering, lineage_seed, neighbours, rank_list = pickle.load(pickle_file) - previous_lineage_clustering[R] = lineage_clustering[R] # use for detecting changes use_existing = True # add new queries to lineage clustering for q in qlist: for R in rank_list: lineage_clustering[R][q] = null_cluster_value + # use for detecting changes in lineage assignation + for R in rank_list: + previous_lineage_clustering[R] = lineage_clustering[R] + # run clustering for an individual R # - this can be parallelised across R using multiprocessing.Pool for R in rank_list: @@ -405,15 +407,20 @@ def cluster_into_lineages(X, rank_list 
= None, output = None, rlist = None, qlis for R in rank_list: lFile.write(',Lineage_R' + str(R) + '__autocolor') if qlist is not None: - lFile.write(',Status\n') + lFile.write(',Overall_lineage,Status\n') for isolate in lineage_clustering[R].keys(): lFile.write(isolate) for R in rank_list: lFile.write(',' + str(lineage_clustering[R][isolate])) + lineage_string = str(lineage_clustering[R][isolate]) + # include information on lineage clustering + if lineage_clustering[R][isolate] != previous_lineage_clustering[R][isolate]: + lineage_string = str(previous_lineage_clustering[R][isolate]) + ':' + lineage_string if isolate in combined.keys(): - combined[isolate] = combined[isolate] + '-' + str(lineage_clustering[R][isolate]) + combined[isolate] = combined[isolate] + '-' + lineage_string else: - combined[isolate] = str(lineage_clustering[R][isolate]) + combined[isolate] = lineage_string + lFile.write(',' + combined[isolate]) if qlist is not None: if isolate in qlist: lFile.write(',Added\n') From 8cdad429e240c0a28013d2f886ba49190f7cd1dc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 27 Apr 2020 12:47:16 +0100 Subject: [PATCH 14/46] Improve visualisation of clustering --- PopPUNK/__main__.py | 12 +++++--- PopPUNK/lineage_clustering.py | 54 ++++++++++++++++++++++++++++++----- PopPUNK/plot.py | 5 +++- 3 files changed, 59 insertions(+), 12 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 520ddec6..cee0216a 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -19,6 +19,7 @@ from .lineage_clustering import cluster_into_lineages from .lineage_clustering import calculateQueryDistances +from .lineage_clustering import readLineages from .network import fetchNetwork from .network import constructNetwork @@ -628,7 +629,8 @@ def main(): isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} elif args.viz_lineages: cluster_file = args.viz_lineages - isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} + #isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} + isolateClustering = readLineages(cluster_file) else: # identify existing analysis files model_prefix = args.ref_db @@ -648,9 +650,11 @@ def main(): # prune the network and dictionary of assignments genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(viz_subset)) - for clustering_type in isolateClustering: - isolateClustering[clustering_type] = {viz_key: isolateClustering[clustering_type][viz_key] - for viz_key in viz_subset} + + # process the returned clustering +# for clustering_type in isolateClustering: +# isolateClustering[clustering_type] = {viz_key: isolateClustering[clustering_type][viz_key] +# for viz_key in viz_subset} # generate selected visualisations if args.microreact: diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 54630829..c493aea1 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -5,6 +5,7 @@ # universal import os import sys +import re # additional import numpy as np from scipy.stats import rankdata @@ -401,26 +402,34 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # print output combined = {} + titles_list = ['Lineage_R' + str(R) for R in rank_list] lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: + # print header lFile.write('Id') - for R in rank_list: - lFile.write(',Lineage_R' + str(R) + '__autocolor') + for t in titles_list: + 
lFile.write(',' + t + '__autocolor') + combined[t] = {} + lFile.write(',Overall_lineage') + combined['Overall_lineage'] = {} if qlist is not None: - lFile.write(',Overall_lineage,Status\n') + lFile.write(',Status') + lFile.write('\n') + # print lines for each isolate for isolate in lineage_clustering[R].keys(): lFile.write(isolate) - for R in rank_list: + for n,R in enumerate(rank_list): lFile.write(',' + str(lineage_clustering[R][isolate])) lineage_string = str(lineage_clustering[R][isolate]) # include information on lineage clustering + combined[titles_list[n]][isolate] = lineage_string if lineage_clustering[R][isolate] != previous_lineage_clustering[R][isolate]: lineage_string = str(previous_lineage_clustering[R][isolate]) + ':' + lineage_string if isolate in combined.keys(): - combined[isolate] = combined[isolate] + '-' + lineage_string + combined['Overall_lineage'][isolate] = combined[isolate] + '-' + lineage_string else: - combined[isolate] = lineage_string - lFile.write(',' + combined[isolate]) + combined['Overall_lineage'][isolate] = lineage_string + lFile.write(',' + combined['Overall_lineage'][isolate]) if qlist is not None: if isolate in qlist: lFile.write(',Added\n') @@ -589,3 +598,34 @@ def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, return qlist1, distMat +def readLineages(clustCSV): + """Read a previous reference clustering from CSV + + Args: + clustCSV (str) + File name of CSV with previous cluster assignments + + Returns: + clusters (dict) + Or if return_dict is set keys are sample names, + values are cluster assignments. + """ + clusters = {} + relevant_headers = [] + header_elements = [] + + with open(clustCSV, 'r') as csv_file: + header = csv_file.readline() + # identify columns to include + header_elements = header.rstrip().split(",") + relevant_headers.append(header_elements.index('Overall_lineage')) + relevant_headers.extend([n for n,i in enumerate(header_elements) if re.search('Lineage_R',i)]) + for h in relevant_headers: + clusters[header_elements[h]] = {} + for line in csv_file: + elements = line.rstrip().split(",") + if elements[0] != header_elements[0]: + for h in relevant_headers: + clusters[header_elements[h]][elements[0]] = elements[h] + + return clusters diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index cf2df0e2..3b30eae6 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -477,8 +477,11 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = # process clustering data nodeLabels = [r.split('/')[-1].split('.')[0] for r in nodeNames] + # get example clustering name for validation + example_cluster_title = list(clustering.keys())[0] + for name, label in zip(nodeNames, nodeLabels): - if name in clustering['combined']: + if name in clustering[example_cluster_title]: if output_format == 'microreact': d['id'].append(label) for cluster_type in clustering: From 055d7c825daa35359ea88cafb44056f88651488f Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 28 Apr 2020 10:03:49 +0100 Subject: [PATCH 15/46] Parallelise lineage clustering --- PopPUNK/__main__.py | 11 +-- PopPUNK/lineage_clustering.py | 144 ++++++++++++++++++++++------------ 2 files changed, 102 insertions(+), 53 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index cee0216a..709415d9 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -544,9 +544,9 @@ def main(): # run lineage clustering if self: - isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, use_accessory = 
args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) else: - isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) #*******************************# #* *# @@ -817,7 +817,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances nx.write_gpickle(genomeNetwork, output + "/" + os.path.basename(output) + '_graph.gpickle') else: newQueries = ordered_queryList - + # Update the sketch database if newQueries != queryList and use_mash: tmpRefFile = writeTmpFile(newQueries) @@ -837,13 +837,14 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances combined_seq, core_distMat, acc_distMat = update_distance_matrices(refList, ref_distMat, ordered_queryList, distMat, query_distMat) complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) - + if assign_lineage: rank_list = sorted([int(x) for x in R.split(',')]) expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme - isolateClustering = {'combined': cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory)} + #isolateClustering = {'combined': cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads)} + isolateClustering = cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads) # Prune distances to references only, if not full db dists_out = output + "/" + os.path.basename(output) + ".dists" diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index c493aea1..8a93f264 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -11,7 +11,8 @@ from scipy.stats import rankdata from collections import defaultdict import pickle -from multiprocessing import Pool, Lock +from multiprocessing import Pool, Lock, Manager, RawArray, shared_memory, managers +from functools import partial # import poppunk package from .utils import iterDistRows @@ -208,7 +209,13 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): ref_distances = np.take(distances, indices) # get unique distances and pick the R smallest values unique_isolate_distances = np.unique(ref_distances) - R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)][:R] +# R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)[:R]] + lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)] + try: + R_lowest_distances = lowest_distances[:R] + except: + print('Cannot extract lowest from ' + str(lowest_distances)) + exit(1) # get threshold distance for neighbour definition - np.maximum fails for R=1 threshold_distance = R_lowest_distances[0] if R > 1: @@ -316,7 +323,7 @@ def update_nearest_neighbours(distances, 
row_labels, R, qlist, nn, lineage_clust return nn, lineage_clustering -def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False): +def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False, num_processes = 1): """ Clusters isolates into lineages based on their relative distances. @@ -337,6 +344,8 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis should be added. use_accessory (bool) Option to use accessory distances rather than core distances. + num_processes (int) + Number of CPUs to use for calculations. Returns: combined (dict) @@ -352,7 +361,6 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # determine whether ref-ref or ref-query analysis isolate_list = rlist - row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) # determine whether novel analysis or modifying existing analysis use_existing = False @@ -367,36 +375,44 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis lineage_seed[R] = {} neighbours[R] = {} - if existing_scheme is not None: - with open(existing_scheme, 'rb') as pickle_file: - lineage_clustering, lineage_seed, neighbours, rank_list = pickle.load(pickle_file) - use_existing = True - # add new queries to lineage clustering - for q in qlist: - for R in rank_list: - lineage_clustering[R][q] = null_cluster_value - - # use for detecting changes in lineage assignation - for R in rank_list: - previous_lineage_clustering[R] = lineage_clustering[R] - # run clustering for an individual R - # - this can be parallelised across R using multiprocessing.Pool - for R in rank_list: - lineage_clustering[R], lineage_seed[R], neighbours[R] = run_clustering_for_R(R, - distances = distances, - row_labels = row_labels, - neighbours = neighbours[R], - lineage_clustering = lineage_clustering[R], - previous_lineage_clustering = previous_lineage_clustering[R], - lineage_seed = lineage_seed[R], - null_cluster_value = null_cluster_value, - isolate_list = isolate_list, - qlist = qlist, - use_existing = use_existing) - + if num_processes == 1: + for R in rank_list: + lineage_clustering[R], lineage_seed[R], neighbours[R] = run_clustering_for_R(R, + null_cluster_value = null_cluster_value, + qlist = qlist, + existing_scheme = existing_scheme, + distances_length = distances.size) + + + else: + # use multiprocessing + shm_distances = shared_memory.SharedMemory(create = True, size = distances.nbytes, name = 'shm_distances') + distances_shared = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm_distances.buf) + distances_shared[:] = distances[:] + isolate_list_shared = shared_memory.ShareableList(isolate_list, name = 'shm_isolate_list') + + with Pool(processes = num_processes) as pool: + results = pool.map(partial(run_clustering_for_R, + null_cluster_value = null_cluster_value, + qlist = qlist, + existing_scheme = existing_scheme, + distances_length = distances.size), + rank_list) + + for n,result in enumerate(results): + R = rank_list[n] + lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = result + + # manage memory + shm_distances.close() + shm_distances.unlink() + del shm_distances + isolate_list_shared.shm.close() + isolate_list_shared.shm.unlink() + del isolate_list_shared + # store output - # - this needs to be per R with open(output + "/" + output + 
'_lineageClusters.pkl', 'wb') as pickle_file: pickle.dump([lineage_clustering, lineage_seed, neighbours, rank_list], pickle_file) @@ -423,28 +439,35 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis lineage_string = str(lineage_clustering[R][isolate]) # include information on lineage clustering combined[titles_list[n]][isolate] = lineage_string - if lineage_clustering[R][isolate] != previous_lineage_clustering[R][isolate]: + if lineage_clustering[R][isolate] != previous_lineage_clustering[R][isolate] and previous_lineage_clustering[R][isolate] != null_cluster_value: lineage_string = str(previous_lineage_clustering[R][isolate]) + ':' + lineage_string - if isolate in combined.keys(): - combined['Overall_lineage'][isolate] = combined[isolate] + '-' + lineage_string + if isolate in combined['Overall_lineage'].keys(): + combined['Overall_lineage'][isolate] = combined['Overall_lineage'][isolate] + '-' + lineage_string else: combined['Overall_lineage'][isolate] = lineage_string lFile.write(',' + combined['Overall_lineage'][isolate]) if qlist is not None: if isolate in qlist: - lFile.write(',Added\n') + lFile.write(',Added') else: - lFile.write(',Existing\n') + lFile.write(',Existing') + lFile.write('\n') return combined -def run_clustering_for_R(R, distances = None, row_labels = None, neighbours = None, lineage_clustering = None, - previous_lineage_clustering = None, lineage_seed = None, null_cluster_value = None, - isolate_list = None, qlist = None, use_existing = False): +def make_vars_global(distances = None, row_labels = None, isolate_list = None): + global global_distances + global_distances = distances + global global_row_labels + global_row_labels = row_labels + global global_isolate_list + global_isolate_list = isolate_list + +def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances_length = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. - + Args: R (int) Integer specifying the maximum rank of neighbour used @@ -481,8 +504,33 @@ def run_clustering_for_R(R, distances = None, row_labels = None, neighbours = No Neighbour relationships between isolates for R. 
""" - # identify neighbours - if use_existing: + # load shared memory objects + existing_shm = shared_memory.SharedMemory(name = 'shm_distances') + distances = np.ndarray(distances_length, dtype = np.float32, buffer = existing_shm.buf) + isolate_list = shared_memory.ShareableList(name = 'shm_isolate_list') + + # calculate row labels + # this is inefficient but there appears to be no way of sharing + # strings between processes efficiently + row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) + + neighbours = {} + lineage_seed = {} + lineage_clustering = {} + previous_lineage_clustering = {} + + if existing_scheme is not None: + with open(existing_scheme, 'rb') as pickle_file: + lineage_clustering_overall, lineage_seed_overall, neighbours_overall, rank_list = pickle.load(pickle_file) + # focus on relevant data + lineage_clustering = lineage_clustering_overall[R] + lineage_seed = lineage_seed_overall[R] + neighbours = neighbours_overall[R] + # add new queries to lineage clustering + for q in qlist: + lineage_clustering[q] = null_cluster_value + previous_lineage_clustering = lineage_clustering + neighbours, lineage_clustering = update_nearest_neighbours(distances, row_labels, R, @@ -495,13 +543,13 @@ def run_clustering_for_R(R, distances = None, row_labels = None, neighbours = No row_labels, isolate_list, R) - + # run clustering lineage_index = 1 while null_cluster_value in lineage_clustering.values(): - + # get seed isolate based on minimum pairwise distances - seed_isolate = get_seed_isolate(lineage_clustering, # need to add in lineage seed hash + seed_isolate = get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value, @@ -533,9 +581,9 @@ def run_clustering_for_R(R, distances = None, row_labels = None, neighbours = No # increment index for next lineage lineage_index = lineage_index + 1 - + # return clustering - return lineage_clustering, lineage_seed, neighbours + return lineage_clustering, lineage_seed, neighbours, previous_lineage_clustering def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, queryDB, use_mash = False, threads = 1): From af2089ee06ac7b6592910ddfaa5414426ae20be1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 28 Apr 2020 12:56:28 +0100 Subject: [PATCH 16/46] Specify float or double in shared memory --- PopPUNK/__main__.py | 5 --- PopPUNK/lineage_clustering.py | 70 ++++++++++++++--------------------- 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 709415d9..0befe65e 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -651,11 +651,6 @@ def main(): # prune the network and dictionary of assignments genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(viz_subset)) - # process the returned clustering -# for clustering_type in isolateClustering: -# isolateClustering[clustering_type] = {viz_key: isolateClustering[clustering_type][viz_key] -# for viz_key in viz_subset} - # generate selected visualisations if args.microreact: sys.stderr.write("Writing microreact output\n") diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 8a93f264..71fa3851 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -272,6 +272,7 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] query_row_names = [row_labels[i] for i in query_match_indices] query_distances 
= np.take(distances, query_match_indices) + print('Query distances: ' + str(query_distances)) # iterate through references to update existing entries existing_isolates = nn.keys() @@ -284,6 +285,8 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust min_distance_to_query = np.amin(ref_query_distances) # if closer queries, find distances and replace if min_distance_to_query < max_distance_to_ref: + if R == 1: + print('Found new partner for ' + isolate + ' as min query is ' + str(min_distance_to_query) + ' and max ref is ' + str(max_distance_to_ref)) # unset clustering of isolate and neighbours lineage_clustering[isolate] = null_cluster_value for neighbour in nn[isolate].keys(): @@ -374,43 +377,47 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis lineage_clustering[R] = {i:null_cluster_value for i in isolate_list} lineage_seed[R] = {} neighbours[R] = {} + previous_lineage_clustering[R] = {} + + # shared memory data structures + shm_distances = shared_memory.SharedMemory(create = True, size = distances.nbytes, name = 'shm_distances') + distances_shared = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm_distances.buf) + distances_shared[:] = distances[:] + isolate_list_shared = shared_memory.ShareableList(isolate_list, name = 'shm_isolate_list') # run clustering for an individual R if num_processes == 1: for R in rank_list: - lineage_clustering[R], lineage_seed[R], neighbours[R] = run_clustering_for_R(R, + lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = run_clustering_for_R(R, null_cluster_value = null_cluster_value, qlist = qlist, existing_scheme = existing_scheme, - distances_length = distances.size) + distances_length = distances.shape, + distances_type = distances.dtype) else: - # use multiprocessing - shm_distances = shared_memory.SharedMemory(create = True, size = distances.nbytes, name = 'shm_distances') - distances_shared = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm_distances.buf) - distances_shared[:] = distances[:] - isolate_list_shared = shared_memory.ShareableList(isolate_list, name = 'shm_isolate_list') with Pool(processes = num_processes) as pool: results = pool.map(partial(run_clustering_for_R, null_cluster_value = null_cluster_value, qlist = qlist, existing_scheme = existing_scheme, - distances_length = distances.size), + distances_length = distances.shape, + distances_type = distances.dtype), rank_list) for n,result in enumerate(results): R = rank_list[n] lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = result - # manage memory - shm_distances.close() - shm_distances.unlink() - del shm_distances - isolate_list_shared.shm.close() - isolate_list_shared.shm.unlink() - del isolate_list_shared + # manage memory + shm_distances.close() + shm_distances.unlink() + del shm_distances + isolate_list_shared.shm.close() + isolate_list_shared.shm.unlink() + del isolate_list_shared # store output with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: @@ -431,6 +438,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis if qlist is not None: lFile.write(',Status') lFile.write('\n') + # print lines for each isolate for isolate in lineage_clustering[R].keys(): lFile.write(isolate) @@ -455,15 +463,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis return combined -def make_vars_global(distances = None, row_labels = None, 
isolate_list = None): - global global_distances - global_distances = distances - global global_row_labels - global_row_labels = row_labels - global global_isolate_list - global_isolate_list = isolate_list - -def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances_length = None): +def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances_length = None, distances_type = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. @@ -473,22 +473,8 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc Integer specifying the maximum rank of neighbour used for clustering. Should be changed to int list for hierarchical clustering. - distances (numpy array) - Sorted distances used for analysis. - row_labels (list of tuples) - Pairs of isolates labelling each distance. - neighbours (nested dict) - Neighbour relationships between isolates for R. - lineage_clustering (dict) - Assignment of each isolate to a cluster. - previous_lineage_clustering (dict) - Assignment of each isolate to a cluster before updating. - lineage_seed (dict) - Seed isolate used to initiate each cluster. null_cluster_value (int) Used to denote as-yet-unclustered isolates. - isolate_list (list) - List of isolates for lineage assignment. qlist (list) List of query sequences being added to an existing clustering. Should be included within rlist. @@ -506,7 +492,7 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc # load shared memory objects existing_shm = shared_memory.SharedMemory(name = 'shm_distances') - distances = np.ndarray(distances_length, dtype = np.float32, buffer = existing_shm.buf) + distances = np.ndarray(distances_length, dtype = distances_type, buffer = existing_shm.buf) isolate_list = shared_memory.ShareableList(name = 'shm_isolate_list') # calculate row labels @@ -514,10 +500,10 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc # strings between processes efficiently row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) - neighbours = {} + lineage_clustering = {i:null_cluster_value for i in isolate_list} + previous_lineage_clustering = lineage_clustering lineage_seed = {} - lineage_clustering = {} - previous_lineage_clustering = {} + neighbours = {} if existing_scheme is not None: with open(existing_scheme, 'rb') as pickle_file: From ab2ee00e9a18d379f5ccc8289adf6b7caf392b9a Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 28 Apr 2020 13:47:45 +0100 Subject: [PATCH 17/46] Sort distances and names in lineage clustering --- PopPUNK/lineage_clustering.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 71fa3851..ca13b3fd 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -272,7 +272,7 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] query_row_names = [row_labels[i] for i in query_match_indices] query_distances = np.take(distances, query_match_indices) - print('Query distances: ' + str(query_distances)) + # iterate through references to update existing entries existing_isolates = nn.keys() @@ -285,8 +285,6 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, 
lineage_clust min_distance_to_query = np.amin(ref_query_distances) # if closer queries, find distances and replace if min_distance_to_query < max_distance_to_ref: - if R == 1: - print('Found new partner for ' + isolate + ' as min query is ' + str(min_distance_to_query) + ' and max ref is ' + str(max_distance_to_ref)) # unset clustering of isolate and neighbours lineage_clustering[isolate] = null_cluster_value for neighbour in nn[isolate].keys(): @@ -362,6 +360,10 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis distance_index = 1 if use_accessory else 0 distances = X[:,distance_index] + # sort distances + distance_ranks = rankdata(distances, method = 'ordinal') + distances = distances[distance_ranks-1] + # determine whether ref-ref or ref-query analysis isolate_list = rlist @@ -383,6 +385,11 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis shm_distances = shared_memory.SharedMemory(create = True, size = distances.nbytes, name = 'shm_distances') distances_shared = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm_distances.buf) distances_shared[:] = distances[:] + + shm_distance_ranks = shared_memory.SharedMemory(create = True, size = distance_ranks.nbytes, name = 'shm_distance_ranks') + distance_ranks_shared = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = shm_distance_ranks.buf) + distance_ranks_shared[:] = distance_ranks[:] + isolate_list_shared = shared_memory.ShareableList(isolate_list, name = 'shm_isolate_list') # run clustering for an individual R @@ -415,6 +422,9 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis shm_distances.close() shm_distances.unlink() del shm_distances + shm_distance_ranks.close() + shm_distance_ranks.unlink() + del shm_distance_ranks isolate_list_shared.shm.close() isolate_list_shared.shm.unlink() del isolate_list_shared @@ -491,14 +501,18 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc """ # load shared memory objects - existing_shm = shared_memory.SharedMemory(name = 'shm_distances') - distances = np.ndarray(distances_length, dtype = distances_type, buffer = existing_shm.buf) + existing_distance_shm = shared_memory.SharedMemory(name = 'shm_distances') + distances = np.ndarray(distances_length, dtype = distances_type, buffer = existing_distance_shm.buf) + existing_rank_shm = shared_memory.SharedMemory(name = 'shm_distance_ranks') + distance_ranks = np.ndarray(distances_length, dtype = 'int64', buffer = existing_rank_shm.buf) isolate_list = shared_memory.ShareableList(name = 'shm_isolate_list') # calculate row labels # this is inefficient but there appears to be no way of sharing # strings between processes efficiently row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) + # reorder by sorted distances + row_labels = [row_labels[i-1] for i in distance_ranks] lineage_clustering = {i:null_cluster_value for i in isolate_list} previous_lineage_clustering = lineage_clustering From 3e59ecb0ff18732390db0c45e586b1d6d3d074cc Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 28 Apr 2020 21:43:59 +0100 Subject: [PATCH 18/46] Improve neighbour assignment --- PopPUNK/lineage_clustering.py | 39 ++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index ca13b3fd..636ea28c 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -179,6 
+179,39 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): return lineage_info def generate_nearest_neighbours(distances, row_labels, isolate_list, R): + # data structures + nn = {} + last_dist = {} + num_ranks = {} + for i in isolate_list: + nn[i] = {} + num_ranks[i] = 0 + total_isolates = len(isolate_list) + completed_isolates = 0 + index = 0 + # iterate through distances until all nearest neighbours identified + while completed_isolates < total_isolates: + try: + distance = distances[index] + except: + sys.stderr.write('Not enough distances! Try reducing R\n') + exit(1) + for isolate in row_labels[index]: + if isolate in num_ranks.keys() and num_ranks[isolate] < R: + if isolate in last_dist.keys() and last_dist[isolate] != distance: + num_ranks[isolate] = num_ranks[isolate] + 1 + if num_ranks[isolate] >= R: + completed_isolates = completed_isolates + 1 + else: + pair = row_labels[index][0] if row_labels[index][1] == isolate else row_labels[index][1] + nn[isolate][pair] = distance + last_dist[isolate] = distance + index = index + 1 + # return completed dict + return nn + + +def old_generate_nearest_neighbours(distances, row_labels, isolate_list, R): """ Identifies the nearest neighbours from the core genome distances. @@ -361,8 +394,8 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis distances = X[:,distance_index] # sort distances - distance_ranks = rankdata(distances, method = 'ordinal') - distances = distances[distance_ranks-1] + distance_ranks = np.argsort(distances) + distances = distances[distance_ranks] # determine whether ref-ref or ref-query analysis isolate_list = rlist @@ -512,7 +545,7 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc # strings between processes efficiently row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) # reorder by sorted distances - row_labels = [row_labels[i-1] for i in distance_ranks] + row_labels = [row_labels[i] for i in distance_ranks] lineage_clustering = {i:null_cluster_value for i in isolate_list} previous_lineage_clustering = lineage_clustering From 486544ba5e21b93a3d41f9d1e4ab3a2503072ad4 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 29 Apr 2020 09:54:08 +0100 Subject: [PATCH 19/46] Further optimisation - includes some debugging --- PopPUNK/lineage_clustering.py | 242 ++++++++++++++-------------------- 1 file changed, 100 insertions(+), 142 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 636ea28c..def7befa 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -89,7 +89,7 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value, lineage_index, lineage_seed): """ Identifies the isolate used to initiate a cluster. - + Args: lineage_clustering (dict) Clustering of existing dataset. @@ -104,11 +104,11 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val Label of current lineage. lineage_seed (dict) Dict of seeds used to initiate pre-existing lineage definitions. - + Returns: seed_isolate (str) Isolate used to initiate the next lineage.
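[Editor's note: the generate_nearest_neighbours loop added in the hunk above scans distances in ascending order and, per isolate, counts distinct distance values as ranks until each isolate has R of them; ties at the final rank are kept, so an isolate can hold more than R neighbours. A minimal standalone sketch of this rank-based selection (isolate names and distances are invented toy data, not PopPUNK inputs):

from collections import defaultdict

distances = [0.0, 0.1, 0.1, 0.2, 0.3, 0.4]
row_labels = [('A', 'B'), ('A', 'C'), ('B', 'C'), ('A', 'D'), ('B', 'D'), ('C', 'D')]
R = 2  # keep neighbours at the R smallest *unique* distances

nn = defaultdict(dict)
ranks_seen = defaultdict(set)  # unique distance values seen per isolate
for distance, pair in zip(distances, row_labels):
    for isolate in pair:
        # a distance only opens a new rank if it differs from those
        # already recorded for this isolate
        if len(ranks_seen[isolate] | {distance}) <= R:
            ranks_seen[isolate].add(distance)
            other = pair[0] if pair[1] == isolate else pair[1]
            nn[isolate][other] = distance

print(dict(nn))  # e.g. 'A' keeps B (0.0, rank 1) and C (0.1, rank 2)]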
- + """ # variable to return seed_isolate = None @@ -122,29 +122,14 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val seed_isolate = original_seed_isolate # else identify a new seed from the closest pair else: - # extract all pairwise distances between isolates that are not yet clustered - clustered_isolates = frozenset([isolate for isolate in lineage_clustering.keys() if lineage_clustering[isolate] != null_cluster_value]) - unclustered_pair_indices = [n for n,pair in enumerate(row_labels) if not set(pair).issubset(clustered_isolates)] - unclustered_distances = distances[unclustered_pair_indices] - # start at zero as the minimum if present in the unclustered set - minimum_unclustered_distance = 0 - # otherwise go through the bother of picking a minimum - if minimum_unclustered_distance not in unclustered_distances: - if len(unclustered_distances) == 1: - minimum_unclustered_distance = unclustered_distances[0] - elif len(unclustered_distances) > 1: - minimum_unclustered_distance = np.amin(unclustered_distances) - else: - sys.stderr.write('Cannot find any more isolates not assigned to lineages; unclustered pair indices are ' + str(unclustered_pair_indices)) - exit(1) - # identify which entries in the full distance set have this value - # unfortunately these distances are not necessarily neighbours - distance_indices = np.where(distances == minimum_unclustered_distance)[0].tolist() - # identify an unclustered isolate from the pair that have this value - closest_unclustered_pair_index = int(set(unclustered_pair_indices).intersection(set(distance_indices)).pop()) - seed_isolate = row_labels[closest_unclustered_pair_index][0] if lineage_clustering[row_labels[closest_unclustered_pair_index][0]] == null_cluster_value else row_labels[closest_unclustered_pair_index][1] + for index,distance in enumerate(distances): + for isolate in row_labels[index]: + if lineage_clustering[isolate] == null_cluster_value: + seed_isolate = isolate + break + # return identified seed isolate return seed_isolate - + def get_lineage_clustering_information(seed_isolate, row_labels, distances): """ Generates the ranked distances needed for cluster definition. 
@@ -164,21 +149,58 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): """ # data structure lineage_info = defaultdict(list) - # get subset of relevant distances - comparisons_involving_seed = [n for n,(pair) in enumerate(row_labels) if seed_isolate in pair] - distances_to_seed = distances[comparisons_involving_seed] - # get ranks of data - distance_ranks = rankdata(distances_to_seed) - # get partners of seed isolate in each comparison - pairs_involving_seed = [row_labels[x] for x in comparisons_involving_seed] - seed_partners = [r if q == seed_isolate else q for (r,q) in pairs_involving_seed] - # create a dict of lists of isolates for a given rank - # enables later easy iterating through ranked distances - for rank in np.unique(distance_ranks): - lineage_info[rank] = [seed_partners[n] for n,r in enumerate(distance_ranks) if r == rank] + rank = 0 + last_dist = -1 + # iterate through distances recording rank + for index,distance in enumerate(distances): + if seed_isolate in row_labels[index]: + if distance > last_dist: + rank = rank + 1 + last_dist = distance + pair = row_labels[index][0] if row_labels[index][1] == seed_isolate else row_labels[index][1] + lineage_info[rank].append(pair) + # return information return lineage_info +#def old_get_lineage_clustering_information(seed_isolate, row_labels, distances): +# """ Generates the ranked distances needed for cluster +# definition. +# +# Args: +# seed_isolate (str) +# Isolate used to initiate lineage. +# row_labels (list of tuples) +# Pairs of isolates labelling each distance. +# distances (numpy array) +# Pairwise distances used for defining relationships. +# +# Returns: +# lineage_info (dict) +# Dict listing isolates by ranked distance from seed. +# +# """ +# # data structure +# lineage_info = defaultdict(list) +# # get subset of relevant distances +# comparisons_involving_seed = [n for n,(pair) in enumerate(row_labels) if seed_isolate in pair] +# distances_to_seed = distances[comparisons_involving_seed] +# # get ranks of data +# distance_ranks = rankdata(distances_to_seed) +# # get partners of seed isolate in each comparison +# pairs_involving_seed = [row_labels[x] for x in comparisons_involving_seed] +# seed_partners = [r if q == seed_isolate else q for (r,q) in pairs_involving_seed] +# # create a dict of lists of isolates for a given rank +# # enables later easy iterating through ranked distances +# for rank in np.unique(distance_ranks): +# lineage_info[rank] = [seed_partners[n] for n,r in enumerate(distance_ranks) if r == rank] +# # debug +# if seed_isolate in {'EPI_ISL_416602','EPI_ISL_416650','EPI_ISL_417950','EPI_ISL_417931','EPI_ISL_416649','EPI_ISL_417975','EPI_ISL_413593','EPI_ISL_416633','EPI_ISL_416643','EPI_ISL_413592','EPI_ISL_416653'} and rank < 10: +# print('Rank: ' + str(rank) + '\tpair: ' + str(lineage_info[rank]) + '\tseed: ' + seed_isolate) +# return lineage_info + def generate_nearest_neighbours(distances, row_labels, isolate_list, R): + # clade + in_lineage = 
{'EPI_ISL_419768','EPI_ISL_420716','EPI_ISL_417108','EPI_ISL_418430','EPI_ISL_417105','EPI_ISL_417102','EPI_ISL_419747','EPI_ISL_419749','EPI_ISL_419762','EPI_ISL_418406','EPI_ISL_421207','EPI_ISL_419733','EPI_ISL_417100','EPI_ISL_419744','EPI_ISL_418408','EPI_ISL_418416','EPI_ISL_417104','EPI_ISL_420721','EPI_ISL_418418','EPI_ISL_420717','EPI_ISL_419739','EPI_ISL_414045','EPI_ISL_419731','EPI_ISL_419729','EPI_ISL_419741','EPI_ISL_417107','EPI_ISL_419748','EPI_ISL_420727','EPI_ISL_420750','EPI_ISL_418405','EPI_ISL_418412','EPI_ISL_419765','EPI_ISL_418413','EPI_ISL_420723','EPI_ISL_420742','EPI_ISL_419725','EPI_ISL_419763','EPI_ISL_418432','EPI_ISL_418431','EPI_ISL_420745','EPI_ISL_418403','EPI_ISL_420751','EPI_ISL_419760','EPI_ISL_418409','EPI_ISL_418419','EPI_ISL_418414','EPI_ISL_419743','EPI_ISL_418400','EPI_ISL_419745','EPI_ISL_420722','EPI_ISL_418402','EPI_ISL_418417','EPI_ISL_419746','EPI_ISL_420724','EPI_ISL_420746','EPI_ISL_419761','EPI_ISL_419764','EPI_ISL_417101','EPI_ISL_420714','EPI_ISL_420719','EPI_ISL_419740','EPI_ISL_417148','EPI_ISL_420748','EPI_ISL_421212','EPI_ISL_419767','EPI_ISL_420747','EPI_ISL_419783','EPI_ISL_419726','EPI_ISL_419720','EPI_ISL_416691','EPI_ISL_419713','EPI_ISL_416693','EPI_ISL_416698','EPI_ISL_417197','EPI_ISL_421213','EPI_ISL_418404','EPI_ISL_419730','EPI_ISL_417122','EPI_ISL_420720','EPI_ISL_420785','EPI_ISL_421194','EPI_ISL_418426','EPI_ISL_417988','EPI_ISL_416656','EPI_ISL_416636','EPI_ISL_413574','EPI_ISL_421191','EPI_ISL_416631','EPI_ISL_416603','EPI_ISL_419766','EPI_ISL_420729','EPI_ISL_417103','EPI_ISL_420718','EPI_ISL_419728','EPI_ISL_417192','EPI_ISL_416685','EPI_ISL_419719','EPI_ISL_416658','EPI_ISL_420749','EPI_ISL_421210','EPI_ISL_419717','EPI_ISL_416686','EPI_ISL_419723','EPI_ISL_413572','EPI_ISL_414027','EPI_ISL_414023','EPI_ISL_419727','EPI_ISL_414020','EPI_ISL_416680','EPI_ISL_417196','EPI_ISL_421203','EPI_ISL_419708','EPI_ISL_419712','EPI_ISL_419710','EPI_ISL_419716','EPI_ISL_416626','EPI_ISL_416625','EPI_ISL_417121','EPI_ISL_416688','EPI_ISL_420715','EPI_ISL_419736','EPI_ISL_418401','EPI_ISL_417987','EPI_ISL_419737','EPI_ISL_420786','EPI_ISL_417109','EPI_ISL_419718','EPI_ISL_416682','EPI_ISL_417194','EPI_ISL_419769','EPI_ISL_417106','EPI_ISL_416690','EPI_ISL_421204','EPI_ISL_419711','EPI_ISL_419714','EPI_ISL_420728'} # data structures nn = {} last_dist = {} @@ -205,73 +227,20 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): else: pair = row_labels[index][0] if row_labels[index][1] == isolate else row_labels[index][1] nn[isolate][pair] = distance + # debug + if R == 5 and (isolate in in_lineage and pair not in in_lineage): + print('In: ' + isolate + '\tOut: ' + pair) + elif R == 5 and (pair in in_lineage and isolate not in in_lineage): + print('In: ' + pair + '\tOut: ' + isolate) last_dist[isolate] = distance index = index + 1 # return completed dict return nn - - -def old_generate_nearest_neighbours(distances, row_labels, isolate_list, R): - """ Identifies the nearest neighbours from the core - genome distances. - - Args: - distances (numpy array) - Pairwise distances used for defining relationships. - row_labels (list of tuples) - Pairs of isolates labelling each distance. - isolate_list (list) - List of isolates for lineage assignment. - R (int) - Maximum rank of neighbours used for clustering. 
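[Editor's note: the rank assignment introduced above increments the rank only when the distance from the seed increases, so tied distances share a rank. A toy sketch of that behaviour (seed, names and distances invented):

from collections import defaultdict

seed_isolate = 'S'
distances = [0.0, 0.1, 0.1, 0.2, 0.2]  # sorted ascending
row_labels = [('S', 'A'), ('S', 'B'), ('S', 'C'), ('A', 'B'), ('S', 'D')]

lineage_info = defaultdict(list)
rank = 0
last_dist = -1
for distance, pair in zip(distances, row_labels):
    if seed_isolate in pair:
        # only a strictly larger distance opens a new rank
        if distance > last_dist:
            rank += 1
            last_dist = distance
        partner = pair[0] if pair[1] == seed_isolate else pair[1]
        lineage_info[rank].append(partner)

print(dict(lineage_info))  # {1: ['A'], 2: ['B', 'C'], 3: ['D']}]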
- - Returns: - nn - Dict of dict of neighbours per isolate, and pairwise distances - """ - # data structures - nn = {} - # iterate through isolates - for isolate in isolate_list: - # initiate dict - nn[isolate] = {} - # get indices of rows involving the considered isolate - indices = [n for n, (r, q) in enumerate(row_labels) if r == isolate or q == isolate] - # get the corresponding distances - #ref_distances = np.take(X, np.array(indices, dtype = 'int64')) - ref_distances = np.take(distances, indices) - # get unique distances and pick the R smallest values - unique_isolate_distances = np.unique(ref_distances) -# R_lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)[:R]] - lowest_distances = unique_isolate_distances[np.argpartition(unique_isolate_distances, R)] - try: - R_lowest_distances = lowest_distances[:R] - except: - print('Cannot extract lowest from ' + str(lowest_distances)) - exit(1) - # get threshold distance for neighbour definition - np.maximum fails for R=1 - threshold_distance = R_lowest_distances[0] - if R > 1: - try: - threshold_distance = np.amax(R_lowest_distances) - except: - sys.stderr.write('Problem with: ' + str(R_lowest_distances) + '\n') - # identify all distances below this threshold and store relative to index - for i,d in enumerate(ref_distances): - if d <= threshold_distance: - # get index of row in full list - index = indices[i] - # get name of other isolate in pair - name = row_labels[index][1] if row_labels[index][0] == isolate else row_labels[index][0] - nn[isolate][name] = ref_distances[i] - # return completed dictionary - return nn - def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clustering, null_cluster_value): """ Updates the information on nearest neighbours, given a new set of ref-query and query-query distances. - + Args: distances (numpy array) Distances to be used for defining lineages. @@ -288,17 +257,17 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust null_cluster_value (int) Null cluster value used for unsetting lineage assignments where this may change due to altered neighbour relationships. - + Returns: nn (nested dict) Updated neighbour relationships. lineage_clustering (dict) Updated assignment of isolates to lineage. 
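[Editor's note: the hunk that follows replaces the per-reference bookkeeping when queries are added. A sketch of the core update rule for a single reference, simplified to one incoming query and ignoring the later num_distances refinement (toy values, not the committed implementation):

null_cluster_value = 99
lineage_clustering = {'ref': 3, 'old_nb': 3, 'q1': null_cluster_value}
nn = {'ref': {'old_nb': 0.4}}

query, distance = 'q1', 0.2          # new ref-query distance
max_distance = max(nn['ref'].values())

if distance <= max_distance:
    # a closer query invalidates the cached clustering of the
    # reference and its neighbours, so they are reset for re-clustering
    lineage_clustering['ref'] = null_cluster_value
    for neighbour in nn['ref']:
        lineage_clustering[neighbour] = null_cluster_value
    nn['ref'][query] = distance
    if distance < max_distance:
        # drop the displaced, highest-ranked neighbours
        displaced = [o for o, d in nn['ref'].items() if d == max_distance]
        for other in displaced:
            del nn['ref'][other]

print(nn['ref'], lineage_clustering)  # {'q1': 0.2}, all three unset]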
- + """ # iterate through isolates and test whether any comparisons with # newly-added queries replace or supplement existing neighbours - + # data structures for altered entries nn_new = {} # pre-process to extract ref-query distances first @@ -306,57 +275,46 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust query_row_names = [row_labels[i] for i in query_match_indices] query_distances = np.take(distances, query_match_indices) - # iterate through references to update existing entries - existing_isolates = nn.keys() - - for isolate in existing_isolates: - # get max distance to existing neighbours - max_distance_to_ref = max(nn[isolate].values()) - # get min distance to queries - ref_query_match_indices = [n for n, (r, q) in enumerate(query_row_names) if r == isolate or q == isolate] - ref_query_distances = np.take(query_distances, ref_query_match_indices) - min_distance_to_query = np.amin(ref_query_distances) - # if closer queries, find distances and replace - if min_distance_to_query < max_distance_to_ref: - # unset clustering of isolate and neighbours - lineage_clustering[isolate] = null_cluster_value - for neighbour in nn[isolate].keys(): - lineage_clustering[neighbour] = null_cluster_value - # update neighbour information - nn_old = nn[isolate] - nn_new[isolate] = {} - # old neighbours from existing dict - old_distances = list(set(nn[isolate].values())) - # new neighbours from new distances in X - new_distances = np.unique(ref_query_distances) - # combine old and new distances - combined_ref_distances = list(nn[isolate].values()) + ref_query_distances.tolist() - # take top ranked distances from both - ranked_unique_combined_ref_distances = sorted(set(combined_ref_distances))[:R] - for distance in ranked_unique_combined_ref_distances: - # add old neighbours back in - if distance in old_distances: - for neighbour,d in nn[isolate].items(): - if d == distance: - nn_new[isolate][neighbour] = distance - # add new neighbours - if distance in new_distances: - for n,d in enumerate(ref_query_distances): - if d == distance: - row_label_index = ref_query_match_indices[n] - neighbour = row_labels[row_label_index][0] if row_labels[row_label_index][1] == isolate else row_labels[row_label_index][1] - nn_new[isolate][neighbour] = d + # calculate max distances for each isolate + max_distance = {} + for existing_isolate in nn.keys(): + max_distance[existing_isolate] = max(nn[existing_isolate].values()) + # iterate through the ref-query distances + for index,(distance,(isolate1,isolate2)) in enumerate(zip(query_distances,query_row_names)): + # identify ref-query matches + ref = None + query = None + if isolate1 in max_distance.keys() and isolate2 not in max_distance.keys(): + ref = isolate1 + query = isolate2 + elif isolate2 in max_distance.keys() and isolate1 not in max_distance.keys(): + ref = isolate2 + query = isolate1 + if ref is not None: + if distance <= max_distance[ref]: + # unset isolate and references + lineage_clustering[ref] = null_cluster_value + for neighbour in nn[ref]: + lineage_clustering[neighbour] = null_cluster_value + # add neighbours + nn[ref][query] = distance + # delete links if no longer high ranked match + if distance < max_distance[ref]: + to_delete = [] + for other in nn[ref].keys(): + if nn[ref][other] == max_distance[ref]: + to_delete.append(other) + for other in to_delete: + del nn[ref][other] + max_distance[ref] = max(nn[ref].values()) # get nn for query sequences query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) # 
merge dicts for query in query_nn.keys(): nn[query] = query_nn[query] - for updated in nn_new.keys(): - nn[updated] = nn_new[updated] # return updated dict return nn, lineage_clustering - def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False, num_processes = 1): """ Clusters isolates into lineages based on their relative distances. From 6a195cf8c88e3c85148ea08a20b7ff910080b411 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 29 Apr 2020 13:39:00 +0100 Subject: [PATCH 20/46] Substantial performance improvement, some error fixes --- PopPUNK/lineage_clustering.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index def7befa..af7fdfa1 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -67,13 +67,12 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n lineage_clustering[isolate] = lineage_index # add neighbours of same or lower rank to lineage if unclustered for isolate_neighbour in neighbours[isolate].keys(): + # if lineage of neighbour is higher, or it is null value (highest) if lineage_clustering[isolate_neighbour] > lineage_index: - for neighbour_rank in lineage_clustering_information.keys(): - if neighbour_rank <= rank: - if isolate_neighbour in lineage_clustering_information[neighbour_rank]: - lineage_clustering[isolate_neighbour] = lineage_index - else: - break + # check if rank of neighbour is lower than that of the current subject isolate + for neighbour_rank in range(1,rank): + if isolate_neighbour in lineage_clustering_information[neighbour_rank]: + lineage_clustering[isolate_neighbour] = lineage_index # if this represents a change from the previous iteration of lineage definitions # then the change needs to propagate through higher-ranked members if isolate in previous_lineage_clustering: @@ -122,12 +121,17 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val seed_isolate = original_seed_isolate # else identify a new seed from the closest pair else: - for index,distance in enumerate(distances): - for isolate in row_labels[index]: - if lineage_clustering[isolate] == null_cluster_value: - seed_isolate = isolate + for index,(distance,(isolate1,isolate2)) in enumerate(zip(distances,row_labels)): + if lineage_clustering[isolate1] == null_cluster_value: +# print('Found one!: ' + isolate1 + ' at distance ' + str(distance)) + seed_isolate = isolate1 + break + elif lineage_clustering[isolate2] == null_cluster_value: + seed_isolate = isolate2 +# print('Found one!: ' + isolate2 + ' at distance ' + str(distance)) break # return identified seed isolate +# print('Seed: ' + seed_isolate) return seed_isolate def get_lineage_clustering_information(seed_isolate, row_labels, distances): @@ -199,8 +203,6 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): # return lineage_info def generate_nearest_neighbours(distances, row_labels, isolate_list, R): - # clade - in_lineage = 
{'EPI_ISL_419768','EPI_ISL_420716','EPI_ISL_417108','EPI_ISL_418430','EPI_ISL_417105','EPI_ISL_417102','EPI_ISL_419747','EPI_ISL_419749','EPI_ISL_419762','EPI_ISL_418406','EPI_ISL_421207','EPI_ISL_419733','EPI_ISL_417100','EPI_ISL_419744','EPI_ISL_418408','EPI_ISL_418416','EPI_ISL_417104','EPI_ISL_420721','EPI_ISL_418418','EPI_ISL_420717','EPI_ISL_419739','EPI_ISL_414045','EPI_ISL_419731','EPI_ISL_419729','EPI_ISL_419741','EPI_ISL_417107','EPI_ISL_419748','EPI_ISL_420727','EPI_ISL_420750','EPI_ISL_418405','EPI_ISL_418412','EPI_ISL_419765','EPI_ISL_418413','EPI_ISL_420723','EPI_ISL_420742','EPI_ISL_419725','EPI_ISL_419763','EPI_ISL_418432','EPI_ISL_418431','EPI_ISL_420745','EPI_ISL_418403','EPI_ISL_420751','EPI_ISL_419760','EPI_ISL_418409','EPI_ISL_418419','EPI_ISL_418414','EPI_ISL_419743','EPI_ISL_418400','EPI_ISL_419745','EPI_ISL_420722','EPI_ISL_418402','EPI_ISL_418417','EPI_ISL_419746','EPI_ISL_420724','EPI_ISL_420746','EPI_ISL_419761','EPI_ISL_419764','EPI_ISL_417101','EPI_ISL_420714','EPI_ISL_420719','EPI_ISL_419740','EPI_ISL_417148','EPI_ISL_420748','EPI_ISL_421212','EPI_ISL_419767','EPI_ISL_420747','EPI_ISL_419783','EPI_ISL_419726','EPI_ISL_419720','EPI_ISL_416691','EPI_ISL_419713','EPI_ISL_416693','EPI_ISL_416698','EPI_ISL_417197','EPI_ISL_421213','EPI_ISL_418404','EPI_ISL_419730','EPI_ISL_417122','EPI_ISL_420720','EPI_ISL_420785','EPI_ISL_421194','EPI_ISL_418426','EPI_ISL_417988','EPI_ISL_416656','EPI_ISL_416636','EPI_ISL_413574','EPI_ISL_421191','EPI_ISL_416631','EPI_ISL_416603','EPI_ISL_419766','EPI_ISL_420729','EPI_ISL_417103','EPI_ISL_420718','EPI_ISL_419728','EPI_ISL_417192','EPI_ISL_416685','EPI_ISL_419719','EPI_ISL_416658','EPI_ISL_420749','EPI_ISL_421210','EPI_ISL_419717','EPI_ISL_416686','EPI_ISL_419723','EPI_ISL_413572','EPI_ISL_414027','EPI_ISL_414023','EPI_ISL_419727','EPI_ISL_414020','EPI_ISL_416680','EPI_ISL_417196','EPI_ISL_421203','EPI_ISL_419708','EPI_ISL_419712','EPI_ISL_419710','EPI_ISL_419716','EPI_ISL_416626','EPI_ISL_416625','EPI_ISL_417121','EPI_ISL_416688','EPI_ISL_420715','EPI_ISL_419736','EPI_ISL_418401','EPI_ISL_417987','EPI_ISL_419737','EPI_ISL_420786','EPI_ISL_417109','EPI_ISL_419718','EPI_ISL_416682','EPI_ISL_417194','EPI_ISL_419769','EPI_ISL_417106','EPI_ISL_416690','EPI_ISL_421204','EPI_ISL_419711','EPI_ISL_419714','EPI_ISL_420728'} # data structures nn = {} last_dist = {} @@ -228,10 +230,10 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): pair = row_labels[index][0] if row_labels[index][1] == isolate else row_labels[index][1] nn[isolate][pair] = distance # debug - if R == 5 and (isolate in in_lineage and pair not in in_lineage): - print('In: ' + isolate + '\tOut: ' + pair) - elif R == 5 and (pair in in_lineage and isolate not in in_lineage): - print('In: ' + pair + '\tOut: ' + isolate) +# if R == 5 and isolate == 'EPI_ISL_419767': +# print('In: ' + isolate + '\tOut: ' + pair + '\tDistance: ' + str(distance)) +# elif R == 5 and (pair in in_lineage and isolate not in in_lineage): +# print('In: ' + pair + '\tOut: ' + isolate) last_dist[isolate] = distance index = index + 1 # return completed dict From f9645039f303100a3987fa845ec47f2e605914fb Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Wed, 29 Apr 2020 15:14:07 +0100 Subject: [PATCH 21/46] Fixed recursive addition of isolates --- PopPUNK/lineage_clustering.py | 152 ++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 72 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index af7fdfa1..ac6bbf95 100644 --- 
a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -18,6 +18,48 @@ from .utils import iterDistRows from .utils import readRfile +def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, lineage_clustering, lineage_clustering_information, nn): + """ Iteratively adds neighbours of isolates of lower or equal rank + to a lineage if an isolate of a higher rank is added. + + Args: + isolate (string) + Isolate of higher rank added to lineage. + rank (int) + Rank of isolate added to lineage. + lineage_index (int) + Label of current lineage. + lineage_clustering (dict) + Clustering of existing dataset. + lineage_clustering_information (dict) + Dict listing isolates by ranked distance from seed. + nn (nested dict) + Pre-calculated neighbour relationships. + + Returns: + lineage_clustering (dict) + Assignment of isolates to lineages. + """ + isolates_to_check = [isolate] + isolate_ranks = {} + isolate_ranks[isolate] = rank + + while len(isolates_to_check) > 0: + for isolate in isolates_to_check: + rank = isolate_ranks[isolate] + for isolate_neighbour in nn[isolate].keys(): + # if lineage of neighbour is higher, or it is null value (highest) + if lineage_clustering[isolate_neighbour] > lineage_index: + # check if rank of neighbour is lower than that of the current subject isolate + for neighbour_rank in range(1,rank): + if isolate_neighbour in lineage_clustering_information[neighbour_rank]: + lineage_clustering[isolate_neighbour] = lineage_index + isolates_to_check.append(isolate_neighbour) + isolate_ranks[isolate_neighbour] = neighbour_rank + isolates_to_check.remove(isolate) + + return lineage_clustering + def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): """ Identifies isolates corresponding to a particular lineage given a cluster seed. 
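[Editor's note: the helper above removes entries from isolates_to_check while iterating over the same list, which Python lists tolerate poorly. A minimal alternative sketch of the same breadth-first propagation using collections.deque as the worklist (same data structures assumed; illustrative, not the committed code):

from collections import deque

def propagate_lineage(isolate, rank, lineage_index, lineage_clustering,
                      lineage_clustering_information, nn):
    to_check = deque([(isolate, rank)])
    while to_check:
        current, current_rank = to_check.popleft()
        for neighbour in nn[current]:
            if lineage_clustering[neighbour] > lineage_index:
                # only pull in neighbours whose rank from the seed is
                # lower than the isolate that reached them
                for neighbour_rank in range(1, current_rank):
                    if neighbour in lineage_clustering_information[neighbour_rank]:
                        lineage_clustering[neighbour] = lineage_index
                        to_check.append((neighbour, neighbour_rank))
    return lineage_clustering

nn = {'X': {'Y': 0.1}, 'Y': {'X': 0.1}}
info = {1: ['Y'], 2: ['X']}
print(propagate_lineage('X', 2, 1, {'X': 1, 'Y': 9}, info, nn))  # Y joins lineage 1]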
@@ -66,13 +108,12 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n # add isolate to lineage lineage_clustering[isolate] = lineage_index # add neighbours of same or lower rank to lineage if unclustered - for isolate_neighbour in neighbours[isolate].keys(): - # if lineage of neighbour is higher, or it is null value (highest) - if lineage_clustering[isolate_neighbour] > lineage_index: - # check if rank of neighbour is lower than that of the current subject isolate - for neighbour_rank in range(1,rank): - if isolate_neighbour in lineage_clustering_information[neighbour_rank]: - lineage_clustering[isolate_neighbour] = lineage_index + lineage_clustering = cluster_neighbours_of_equal_or_lower_rank(isolate, + rank, + lineage_index, + lineage_clustering, + lineage_clustering_information, + neighbours) # if this represents a change from the previous iteration of lineage definitions # then the change needs to propagate through higher-ranked members if isolate in previous_lineage_clustering: @@ -123,15 +164,12 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val else: for index,(distance,(isolate1,isolate2)) in enumerate(zip(distances,row_labels)): if lineage_clustering[isolate1] == null_cluster_value: -# print('Found one!: ' + isolate1 + ' at distance ' + str(distance)) seed_isolate = isolate1 break elif lineage_clustering[isolate2] == null_cluster_value: seed_isolate = isolate2 -# print('Found one!: ' + isolate2 + ' at distance ' + str(distance)) break # return identified seed isolate -# print('Seed: ' + seed_isolate) return seed_isolate def get_lineage_clustering_information(seed_isolate, row_labels, distances): @@ -166,42 +204,6 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): # return information return lineage_info -#def old_get_lineage_clustering_information(seed_isolate, row_labels, distances): -# """ Generates the ranked distances needed for cluster -# definition. -# -# Args: -# seed_isolate (str) -# Isolate used to initiate lineage. -# row_labels (list of tuples) -# Pairs of isolates labelling each distance. -# distances (numpy array) -# Pairwise distances used for defining relationships. -# -# Returns: -# lineage_info (dict) -# Dict listing isolates by ranked distance from seed. 
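[Editor's note: the hunk below also bounds the neighbour scan explicitly, terminating when either all isolates have R ranks or the distance list is exhausted, rather than catching an exception. A toy illustration of that loop condition:

distances = [0.0, 0.1]
total_isolates, completed_isolates = 3, 0
index, num_distances = 0, len(distances)
while completed_isolates < total_isolates and index < num_distances:
    index += 1  # neighbour bookkeeping elided from this sketch
print(index)  # 2: the scan stops safely at the end of the distance list]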
-# -# """ -# # data structure -# lineage_info = defaultdict(list) -# # get subset of relevant distances -# comparisons_involving_seed = [n for n,(pair) in enumerate(row_labels) if seed_isolate in pair] -# distances_to_seed = distances[comparisons_involving_seed] -# # get ranks of data -# distance_ranks = rankdata(distances_to_seed) -# # get partners of seed isolate in each comparison -# pairs_involving_seed = [row_labels[x] for x in comparisons_involving_seed] -# seed_partners = [r if q == seed_isolate else q for (r,q) in pairs_involving_seed] -# # create a dict of lists of isolates for a given rank -# # enables later easy iterating through ranked distances -# for rank in np.unique(distance_ranks): -# lineage_info[rank] = [seed_partners[n] for n,r in enumerate(distance_ranks) if r == rank] -# # debug -# if seed_isolate in {'EPI_ISL_416602','EPI_ISL_416650','EPI_ISL_417950','EPI_ISL_417931','EPI_ISL_416649','EPI_ISL_417975','EPI_ISL_413593','EPI_ISL_416633','EPI_ISL_416643','EPI_ISL_413592','EPI_ISL_416653'} and rank < 10: -# print('Rank: ' + str(rank) + '\tpair: ' + str(lineage_info[rank]) + '\tseed: ' + seed_isolate) -# return lineage_info - def generate_nearest_neighbours(distances, row_labels, isolate_list, R): # data structures nn = {} @@ -211,29 +213,26 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): nn[i] = {} num_ranks[i] = 0 total_isolates = len(isolate_list) + num_distances = len(distances) completed_isolates = 0 index = 0 # iterate through distances until all nearest neighbours identified - while completed_isolates < total_isolates: - try: - distance = distances[index] - except: - sys.stderr.write('Not enough distances! Try reducing R\n') - exit(1) + while completed_isolates < total_isolates and index < num_distances: + distance = distances[index] + # iterate through all listed isolates for isolate in row_labels[index]: if isolate in num_ranks.keys() and num_ranks[isolate] < R: + # R is the number of unique distances so only increment if + # different from the last distance if isolate in last_dist.keys() and last_dist[isolate] != distance: num_ranks[isolate] = num_ranks[isolate] + 1 - if num_ranks[isolate] >= R: + # if maximum number of ranks reached, mark as complete + if num_ranks[isolate] == R: # stop at R as counting from 0 completed_isolates = completed_isolates + 1 + # if not, add as a nearest neighbour else: pair = row_labels[index][0] if row_labels[index][1] == isolate else row_labels[index][1] nn[isolate][pair] = distance - # debug -# if R == 5 and isolate == 'EPI_ISL_419767': -# print('In: ' + isolate + '\tOut: ' + pair + '\tDistance: ' + str(distance)) -# elif R == 5 and (pair in in_lineage and isolate not in in_lineage): -# print('In: ' + pair + '\tOut: ' + isolate) last_dist[isolate] = distance index = index + 1 # return completed dict @@ -276,11 +275,21 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] query_row_names = [row_labels[i] for i in query_match_indices] query_distances = np.take(distances, query_match_indices) - + + # get nn for query sequences + query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) + # add query-query comparisons + for query in query_nn.keys(): + nn[query] = query_nn[query] + # calculate max distances for each isolate max_distance = {} - for existing_isolate in nn.keys(): - max_distance[existing_isolate] = max(nn[existing_isolate].values()) + num_distances = {} + for 
isolate in nn.keys(): + neighbour_distances = set(nn[isolate].values()) + max_distance[isolate] = max(neighbour_distances) + num_distances[isolate] = len(neighbour_distances) # query-query comparisons may be < R + # iterate through the ref-query distances for index,(distance,(isolate1,isolate2)) in enumerate(zip(query_distances,query_row_names)): # identify ref-query matches @@ -302,18 +311,17 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust nn[ref][query] = distance # delete links if no longer high ranked match if distance < max_distance[ref]: - to_delete = [] - for other in nn[ref].keys(): - if nn[ref][other] == max_distance[ref]: - to_delete.append(other) - for other in to_delete: - del nn[ref][other] + if num_distances[ref] == R: + to_delete = [] + for other in nn[ref].keys(): + if nn[ref][other] == max_distance[ref]: + to_delete.append(other) + for other in to_delete: + del nn[ref][other] + else: + # if set from query-query distances < R + num_distances[ref] = num_distances[ref] + 1 max_distance[ref] = max(nn[ref].values()) - # get nn for query sequences - query_nn = generate_nearest_neighbours(distances, row_labels, qlist, R) - # merge dicts - for query in query_nn.keys(): - nn[query] = query_nn[query] # return updated dict return nn, lineage_clustering From da5b6df826fe1cafe622094f82cb8d3677354661 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 11:21:43 +0100 Subject: [PATCH 22/46] Update sharing of data structures between parallelised processes --- PopPUNK/lineage_clustering.py | 75 ++++++++++++++++------------------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index ac6bbf95..76ecd165 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -11,7 +11,15 @@ from scipy.stats import rankdata from collections import defaultdict import pickle +import collections from multiprocessing import Pool, Lock, Manager, RawArray, shared_memory, managers +try: + from multiprocessing import Pool, shared_memory + from multiprocessing.managers import SharedMemoryManager + NumpyShared = collections.namedtuple('NumpyShared', ('name', 'shape', 'dtype')) +except ImportError as e: + sys.stderr.write("This version of PopPUNK requires python v3.8 or higher\n") + sys.exit(0) from functools import partial # import poppunk package @@ -383,52 +391,37 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis previous_lineage_clustering[R] = {} # shared memory data structures - shm_distances = shared_memory.SharedMemory(create = True, size = distances.nbytes, name = 'shm_distances') - distances_shared = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm_distances.buf) - distances_shared[:] = distances[:] - - shm_distance_ranks = shared_memory.SharedMemory(create = True, size = distance_ranks.nbytes, name = 'shm_distance_ranks') - distance_ranks_shared = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = shm_distance_ranks.buf) - distance_ranks_shared[:] = distance_ranks[:] - - isolate_list_shared = shared_memory.ShareableList(isolate_list, name = 'shm_isolate_list') + with SharedMemoryManager() as smm: + # share sorted distances + distances_raw = smm.SharedMemory(size = distances.nbytes) + distances_shared_array = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_raw.buf) + distances_shared_array[:] = distances[:] + distances_shared_array = NumpyShared(name = 
distances_raw.name, shape = distances.shape, dtype = distances.dtype) + + # share distance ranks + distance_ranks_raw = smm.SharedMemory(size = distance_ranks.nbytes) + distance_ranks_shared_array = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_raw.buf) + distance_ranks_shared_array[:] = distance_ranks[:] + distance_ranks_shared_array = NumpyShared(name = distance_ranks_raw.name, shape = distance_ranks.shape, dtype = distance_ranks.dtype) + + # share isolate list + isolate_list_shared = smm.ShareableList(isolate_list) - # run clustering for an individual R - if num_processes == 1: - for R in rank_list: - lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = run_clustering_for_R(R, - null_cluster_value = null_cluster_value, - qlist = qlist, - existing_scheme = existing_scheme, - distances_length = distances.shape, - distances_type = distances.dtype) - - - else: - + # run clustering for an individual R with Pool(processes = num_processes) as pool: results = pool.map(partial(run_clustering_for_R, null_cluster_value = null_cluster_value, qlist = qlist, existing_scheme = existing_scheme, - distances_length = distances.shape, - distances_type = distances.dtype), + distances = distances_shared_array, + distance_ranks = distance_ranks_shared_array, + isolates = isolate_list_shared), rank_list) + # extract results from multiprocessing pool for n,result in enumerate(results): R = rank_list[n] lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = result - - # manage memory - shm_distances.close() - shm_distances.unlink() - del shm_distances - shm_distance_ranks.close() - shm_distance_ranks.unlink() - del shm_distance_ranks - isolate_list_shared.shm.close() - isolate_list_shared.shm.unlink() - del isolate_list_shared # store output with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: @@ -474,7 +467,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis return combined -def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances_length = None, distances_type = None): +def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. 
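[Editor's note: the sharing pattern above copies each array into a SharedMemoryManager segment, passes a lightweight NumpyShared handle (name, shape, dtype) through the Pool, and lets each worker reattach by name. A minimal self-contained sketch of the same hand-off (toy data; assumes Python 3.8+; worker and variable names are invented):

import collections
import numpy as np
from functools import partial
from multiprocessing import Pool, shared_memory
from multiprocessing.managers import SharedMemoryManager

NumpyShared = collections.namedtuple('NumpyShared', ('name', 'shape', 'dtype'))

def worker(rank, distances = None):
    # reattach to the named segment instead of unpickling the data
    if isinstance(distances, NumpyShared):
        shm = shared_memory.SharedMemory(name = distances.name)
        arr = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm.buf)
    return rank, float(arr.sum())

if __name__ == '__main__':
    data = np.arange(6, dtype = np.float32)
    with SharedMemoryManager() as smm:
        raw = smm.SharedMemory(size = data.nbytes)
        shared = np.ndarray(data.shape, dtype = data.dtype, buffer = raw.buf)
        shared[:] = data[:]
        handle = NumpyShared(name = raw.name, shape = data.shape, dtype = data.dtype)
        with Pool(processes = 2) as pool:
            print(pool.map(partial(worker, distances = handle), [1, 2]))]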
@@ -502,11 +495,11 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc """ # load shared memory objects - existing_distance_shm = shared_memory.SharedMemory(name = 'shm_distances') - distances = np.ndarray(distances_length, dtype = distances_type, buffer = existing_distance_shm.buf) - existing_rank_shm = shared_memory.SharedMemory(name = 'shm_distance_ranks') - distance_ranks = np.ndarray(distances_length, dtype = 'int64', buffer = existing_rank_shm.buf) - isolate_list = shared_memory.ShareableList(name = 'shm_isolate_list') + distances_shm = shared_memory.SharedMemory(name = distances.name) + distances = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_shm.buf) + distance_ranks_shm = shared_memory.SharedMemory(name = distance_ranks.name) + distance_ranks = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_shm.buf) + isolate_list = isolates # calculate row labels # this is inefficient but there appears to be no way of sharing From effb8909e329c8ac57e1bc9aea09be1088f7021d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 13:25:21 +0100 Subject: [PATCH 23/46] Alterations to command line options --- PopPUNK/__main__.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 0befe65e..66e4f0f2 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -95,6 +95,10 @@ def get_options(): help='Identify lineages within a strain', default=False, action='store_true') + mode.add_argument('--assign-lineages', + help='Assign isolates to an existing lineages scheme', + default=False, + action='store_true') mode.add_argument('--use-model', help='Apply a fitted model to a reference database to restore database files', default=False, @@ -176,15 +180,13 @@ def get_options(): '[default = False]', default=False, action='store_true') queryingGroup.add_argument('--accessory-only', help='Use an accessory-distance only model for assigning queries ' '[default = False]', default=False, action='store_true') - queryingGroup.add_argument('--assign-lineage', help='Assign to lineages rather than strains - can specify ' - 'lineages other than those in the reference database using "--existing-scheme"' - '[default = False]', default=False, action='store_true') # lineage clustering within strains lineagesGroup = parser.add_argument_group('Lineage analysis options') - lineagesGroup.add_argument('--R',help='Comma separated list of ranks used in lineage clustering [default = 1]', type = str, default = "1") + lineagesGroup.add_argument('--ranks',help='Comma separated list of ranks used in lineage clustering [default = 1,2,3]', type = str, default = "1,2,3") lineagesGroup.add_argument('--use-accessory',help='Use accessory distances for lineage definitions [default = use core distances]', action = 'store_true', default = False) - lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions', type = str, default = None) + lineagesGroup.add_argument('--existing-scheme',help='Name of pickle file storing existing lineage definitions ' + ', required with "--assign-lineages"', type = str, default = None) # plot output faGroup = parser.add_argument_group('Further analysis options') @@ -276,6 +278,10 @@ def main(): if not args.use_mash: sketch_sizes = int(round(max(sketch_sizes.values())/64)) + # check if working with lineages + if args.lineage_clustering or args.assign_lineages: + 
rank_list = sorted([int(x) for x in args.ranks.split(',')]) + # check on file paths and whether files will be overwritten # confusing to overwrite command line parameter #if not args.full_db and not (args.create_db or args.easy_run or args.assign_query): @@ -525,15 +531,10 @@ def main(): #* within-strain analysis *# #* *# #******************************# - sys.stderr.write("Checking options here\n\n") if args.lineage_clustering: sys.stderr.write("Mode: Identifying lineages within a clade\n\n") - # process rankings to use - rank_list = sorted([int(x) for x in args.R.split(',')]) - # load distances - distances = None if args.distances is not None: distances = args.distances else: @@ -554,13 +555,13 @@ def main(): #* below) *# #* *# #*******************************# - elif args.assign_query: + elif args.assign_query or args.assign_lineages: assign_query(dbFuncs, args.ref_db, args.q_files, args.output, args.update_db, args.full_db, args.distances, args.microreact, args.cytoscape, kmers, sketch_sizes, args.ignore_length, args.estimated_length, args.threads, args.use_mash, args.mash, args.overwrite, args.plot_fit, args.no_stream, args.max_a_dist, args.model_dir, args.previous_clustering, args.external_clustering, args.core_only, args.accessory_only, args.phandango, args.grapetree, args.info_csv, - args.rapidnj, args.perplexity, args.assign_lineage, args.existing_scheme, args.R, args.use_accessory) + args.rapidnj, args.perplexity, args.assign_lineages, args.existing_scheme, rank_list, args.use_accessory) #******************************# #* *# @@ -688,7 +689,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances kmers, sketch_sizes, ignore_length, estimated_length, threads, use_mash, mash, overwrite, plot_fit, no_stream, max_a_dist, model_dir, previous_clustering, external_clustering, core_only, accessory_only, phandango, grapetree, - info_csv, rapidnj, perplexity, assign_lineage, existing_scheme, R, use_accessory): + info_csv, rapidnj, perplexity, assign_lineage, existing_scheme, rank_list, use_accessory): """Code for assign query mode. 
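[Editor's note: a compact sketch of how the reworked options drive rank parsing (argument names mirror the patch; the rest is illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--lineage-clustering', action = 'store_true',
                    help = 'Identify lineages within a strain')
parser.add_argument('--assign-lineages', action = 'store_true',
                    help = 'Assign isolates to an existing lineages scheme')
parser.add_argument('--ranks', type = str, default = '1,2,3',
                    help = 'Comma separated list of ranks used in lineage clustering')
args = parser.parse_args(['--lineage-clustering', '--ranks', '3,1'])
if args.lineage_clustering or args.assign_lineages:
    rank_list = sorted(int(x) for x in args.ranks.split(','))
    print(rank_list)  # [1, 3]]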
Written as a separate function so it can be called by pathogen.watch API """ @@ -834,12 +835,10 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) if assign_lineage: - rank_list = sorted([int(x) for x in R.split(',')]) expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme - #isolateClustering = {'combined': cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads)} - isolateClustering = cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads) + isolateClustering = cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads) # Prune distances to references only, if not full db dists_out = output + "/" + os.path.basename(output) + ".dists" From 01cd435a731ae8cbc268fe7eacc4ea0dc7f42eb9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 13:47:36 +0100 Subject: [PATCH 24/46] Renaming rank variables --- PopPUNK/lineage_clustering.py | 70 +++++++++++++++++------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 76ecd165..718021d3 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -68,7 +68,7 @@ def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, line return lineage_clustering -def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, R, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): +def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): """ Identifies isolates corresponding to a particular lineage given a cluster seed. @@ -79,7 +79,7 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n Dict listing isolates by ranked distance from seed. neighbours (nested dict) Pre-calculated neighbour relationships. - R (int) + rank (int) Maximum rank of neighbours used for clustering. lineage_index (int) Label of current lineage. 
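[Editor's note: ahead of the cluster_into_lineages renaming in the next hunks, picking the core or accessory column from the n x 2 distance matrix and sorting it once, keeping the permutation for relabelling pairs, can be sketched as (toy matrix):

import numpy as np

distMat = np.array([[0.01, 0.10],
                    [0.02, 0.20],
                    [0.00, 0.05]])
use_accessory = False
distance_index = 1 if use_accessory else 0   # column 0 core, column 1 accessory
distances = distMat[:, distance_index]

# sort once so downstream scans see ascending distances, and keep the
# permutation so the pair labels can be reordered the same way
distance_ranks = np.argsort(distances)
distances = distances[distance_ranks]
print(distances)  # [0.   0.01 0.02]]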
@@ -212,7 +212,7 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): # return information return lineage_info -def generate_nearest_neighbours(distances, row_labels, isolate_list, R): +def generate_nearest_neighbours(distances, row_labels, isolate_list, rank): # data structures nn = {} last_dist = {} num_ranks = {} @@ -229,13 +229,13 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): distance = distances[index] # iterate through all listed isolates for isolate in row_labels[index]: - if isolate in num_ranks.keys() and num_ranks[isolate] < R: + if isolate in num_ranks.keys() and num_ranks[isolate] < rank: # rank is the number of unique distances so only increment if # different from the last distance if isolate in last_dist.keys() and last_dist[isolate] != distance: num_ranks[isolate] = num_ranks[isolate] + 1 # if maximum number of ranks reached, mark as complete - if num_ranks[isolate] == R: # stop at R as counting from 0 + if num_ranks[isolate] == rank: # stop at rank as counting from 0 completed_isolates = completed_isolates + 1 # if not, add as a nearest neighbour else: @@ -246,7 +246,7 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, R): # return completed dict return nn -def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clustering, null_cluster_value): +def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_clustering, null_cluster_value): """ Updates the information on nearest neighbours, given a new set of ref-query and query-query distances. @@ -255,7 +255,7 @@ def update_nearest_neighbours(distances, row_labels, R, qlist, nn, lineage_clust Distances to be used for defining lineages. row_labels (list of tuples) Pairs of isolates labelling each distance. - R (int) + rank (int) Maximum rank of distance used to define nearest neighbours. qlist (list) List of queries to be added to existing dataset.
rank_list (list of int) @@ -367,7 +367,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # - then the functions can be reimplemented to run faster on a # sorted list distance_index = 1 if use_accessory else 0 - distances = X[:,distance_index] + distances = distMat[:,distance_index] # sort distances distance_ranks = np.argsort(distances) @@ -384,11 +384,11 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis previous_lineage_clustering = {} null_cluster_value = len(isolate_list) + 1 - for R in rank_list: - lineage_clustering[R] = {i:null_cluster_value for i in isolate_list} - lineage_seed[R] = {} - neighbours[R] = {} - previous_lineage_clustering[R] = {} + for rank in rank_list: + lineage_clustering[rank] = {i:null_cluster_value for i in isolate_list} + lineage_seed[rank] = {} + neighbours[rank] = {} + previous_lineage_clustering[rank] = {} # shared memory data structures with SharedMemoryManager() as smm: @@ -409,7 +409,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # run clustering for an individual R with Pool(processes = num_processes) as pool: - results = pool.map(partial(run_clustering_for_R, + results = pool.map(partial(run_clustering_for_rank, null_cluster_value = null_cluster_value, qlist = qlist, existing_scheme = existing_scheme, @@ -420,8 +420,8 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # extract results from multiprocessing pool for n,result in enumerate(results): - R = rank_list[n] - lineage_clustering[R], lineage_seed[R], neighbours[R], previous_lineage_clustering[R] = result + rank = rank_list[n] + lineage_clustering[rank], lineage_seed[rank], neighbours[rank], previous_lineage_clustering[rank] = result # store output with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: @@ -429,7 +429,7 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis # print output combined = {} - titles_list = ['Lineage_R' + str(R) for R in rank_list] + titles_list = ['Lineage_R' + str(rank) for rank in rank_list] lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: # print header @@ -444,15 +444,15 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis lFile.write('\n') # print lines for each isolate - for isolate in lineage_clustering[R].keys(): + for isolate in lineage_clustering[rank].keys(): lFile.write(isolate) - for n,R in enumerate(rank_list): - lFile.write(',' + str(lineage_clustering[R][isolate])) - lineage_string = str(lineage_clustering[R][isolate]) + for n,rank in enumerate(rank_list): + lFile.write(',' + str(lineage_clustering[rank][isolate])) + lineage_string = str(lineage_clustering[rank][isolate]) # include information on lineage clustering combined[titles_list[n]][isolate] = lineage_string - if lineage_clustering[R][isolate] != previous_lineage_clustering[R][isolate] and previous_lineage_clustering[R][isolate] != null_cluster_value: - lineage_string = str(previous_lineage_clustering[R][isolate]) + ':' + lineage_string + if lineage_clustering[rank][isolate] != previous_lineage_clustering[rank][isolate] and previous_lineage_clustering[rank][isolate] != null_cluster_value: + lineage_string = str(previous_lineage_clustering[rank][isolate]) + ':' + lineage_string if isolate in combined['Overall_lineage'].keys(): combined['Overall_lineage'][isolate] = combined['Overall_lineage'][isolate] + '-' + 
lineage_string else: @@ -467,13 +467,13 @@ def cluster_into_lineages(X, rank_list = None, output = None, rlist = None, qlis return combined -def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): +def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. Args: - R (int) + rank (int) Integer specifying the maximum rank of neighbour used for clustering. Should be changed to int list for hierarchical clustering. @@ -517,9 +517,9 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc with open(existing_scheme, 'rb') as pickle_file: lineage_clustering_overall, lineage_seed_overall, neighbours_overall, rank_list = pickle.load(pickle_file) # focus on relevant data - lineage_clustering = lineage_clustering_overall[R] - lineage_seed = lineage_seed_overall[R] - neighbours = neighbours_overall[R] + lineage_clustering = lineage_clustering_overall[rank] + lineage_seed = lineage_seed_overall[rank] + neighbours = neighbours_overall[rank] # add new queries to lineage clustering for q in qlist: lineage_clustering[q] = null_cluster_value @@ -527,7 +527,7 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc neighbours, lineage_clustering = update_nearest_neighbours(distances, row_labels, - R, + rank, qlist, neighbours, lineage_clustering, @@ -536,7 +536,7 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc neighbours = generate_nearest_neighbours(distances, row_labels, isolate_list, - R) + rank) # run clustering lineage_index = 1 @@ -567,7 +567,7 @@ def run_clustering_for_R(R, null_cluster_value = None, qlist = None, existing_sc lineage_clustering = run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, - R, + rank, lineage_index, seed_isolate, previous_lineage_clustering, From 8e0875ff87660230a3b41e195d93e3c2f49464a9 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 14:08:07 +0100 Subject: [PATCH 25/46] Input argument validation --- PopPUNK/__main__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 66e4f0f2..986c888c 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -281,6 +281,12 @@ def main(): # check if working with lineages if args.lineage_clustering or args.assign_lineages: rank_list = sorted([int(x) for x in args.ranks.split(',')]) + if min(rank_list) == 0 or max(rank_list) > 100: + sys.stderr.write('Ranks should be small non-zero integers for sensible results\n') + exit(1) + if args.assign_lineages and args.existing_scheme is None: + sys.stderr.write('Must provide an existing scheme (--existing-scheme) if assigning to lineages\n') + exit(1) # check on file paths and whether files will be overwritten # confusing to overwrite command line parameter @@ -628,7 +634,7 @@ def main(): if args.external_clustering: cluster_file = args.external_clustering isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} - elif args.viz_lineages: + if args.viz_lineages: cluster_file = args.viz_lineages #isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} isolateClustering = readLineages(cluster_file) @@ -758,7 +764,7 @@ def 
assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances ordered_queryList, query_distMat = calculateQueryDistances(dbFuncs, refList, q_files, kmers, estimated_length, output, use_mash, threads) - if not assign_lineage: + else: # Assign these distances as within or between strain model_prefix = ref_db if model_dir is not None: From 8cf51de4adb69e408835829f0652788e1ea208cd Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 14:35:44 +0100 Subject: [PATCH 26/46] Move Q-Q distance calculation into a reused function --- PopPUNK/__main__.py | 5 +-- PopPUNK/lineage_clustering.py | 61 ----------------------------------- PopPUNK/network.py | 36 +++++++-------------- PopPUNK/sketchlib.py | 56 ++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 88 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 986c888c..caabfa46 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -18,7 +18,6 @@ from .sketchlib import no_sketchlib, checkSketchlibLibrary from .lineage_clustering import cluster_into_lineages -from .lineage_clustering import calculateQueryDistances from .lineage_clustering import readLineages from .network import fetchNetwork @@ -35,6 +34,8 @@ from .prune_db import prune_distance_matrix +from .sketchlib import calculateQueryQueryDistances + from .utils import setupDBFuncs from .utils import storePickle from .utils import readPickle @@ -761,7 +762,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances if assign_lineage: # Assign lineages by calculating query-query information - ordered_queryList, query_distMat = calculateQueryDistances(dbFuncs, refList, q_files, + ordered_queryList, query_distMat = calculateQueryQueryDistances(dbFuncs, refList, q_files, kmers, estimated_length, output, use_mash, threads) else: diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 718021d3..0b8672f3 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -579,67 +579,6 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist # return clustering return lineage_clustering, lineage_seed, neighbours, previous_lineage_clustering -def calculateQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, - queryDB, use_mash = False, threads = 1): - """Finds edges between queries and items in the reference database, - and modifies the network to include them. 
- - Args: - dbFuncs (list) - List of backend functions from :func:`~PopPUNK.utils.setupDBFuncs` - rlist (list) - List of reference names - qfile (str) - File containing queries - kmers (list) - List of k-mer sizes - estimated_length (int) - Estimated length of genome, if not calculated from data - queryDB (str) - Query database location - use_mash (bool) - Use the mash backend - threads (int) - Number of threads to use if new db created - (default = 1) - Returns: - qlist1 (list) - Ordered list of queries - distMat (numpy.array) - Query-query distances - """ - - constructDatabase = dbFuncs['constructDatabase'] - queryDatabase = dbFuncs['queryDatabase'] - readDBParams = dbFuncs['readDBParams'] - - # These are returned - qlist1 = None - distMat = None - - # Set up query names - qList, qSeqs = readRfile(qfile, oneSeq = use_mash) - queryFiles = dict(zip(qList, qSeqs)) - if use_mash == True: - rNames = None - qNames = qSeqs - else: - rNames = qList - qNames = rNames - - # Calculate all query-query distances too, if updating database - qlist1, qlist2, distMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads=threads) - - return qlist1, distMat - - def readLineages(clustCSV): """Read a previous reference clustering from CSV diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 40b7c494..6932bc38 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -18,6 +18,8 @@ from tempfile import mkstemp, mkdtemp from collections import defaultdict, Counter +from .sketchlib import calculateQueryQueryDistances + from .utils import iterDistRows from .utils import readClusters from .utils import readExternalClusters @@ -320,26 +322,10 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, distMat (numpy.array) Query-query distances """ - constructDatabase = dbFuncs['constructDatabase'] - queryDatabase = dbFuncs['queryDatabase'] - readDBParams = dbFuncs['readDBParams'] # initialise links data structure new_edges = [] assigned = set() - # These are returned - qlist1 = None - distMat = None - - # Set up query names - qList, qSeqs = readRfile(qfile, oneSeq = use_mash) - queryFiles = dict(zip(qList, qSeqs)) - if use_mash == True: - rNames = None - qNames = qSeqs - else: - rNames = qList - qNames = rNames # store links for each query in a list of edge tuples for assignment, (ref, query) in zip(assignments, iterDistRows(rlist, qList, self=False)): @@ -350,17 +336,17 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, # Calculate all query-query distances too, if updating database if queryQuery: sys.stderr.write("Calculating all query-query distances\n") + qlist, distMat = calculateQueryQueryDistances(dbFuncs, + refList, + q_files, + kmers, + estimated_length, + output, + use_mash, + threads) - qlist1, qlist2, distMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = queryDB, - queryPrefix = queryDB, - klist = kmers, - self = True, - number_plot_fits = 0, - threads=threads) queryAssignation = model.assign(distMat) - for assignment, (ref, query) in zip(queryAssignation, iterDistRows(qlist1, qlist2, self=True)): + for assignment, (ref, query) in zip(queryAssignation, iterDistRows(qlist, qlist, self=True)): if assignment == model.within_label: new_edges.append((ref, query)) diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index 27abe3a8..ee53dc4f 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -446,3 +446,59 @@ def 
queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num False, threads, use_gpu, deviceid) return(rNames, qNames, distMat) + +def calculateQueryQueryDistances(dbFuncs, rlist, qfile, kmers, estimated_length, + queryDB, use_mash = False, threads = 1): + """Calculates distances between queries. + + Args: + dbFuncs (list) + List of backend functions from :func:`~PopPUNK.utils.setupDBFuncs` + rlist (list) + List of reference names + qfile (str) + File containing queries + kmers (list) + List of k-mer sizes + estimated_length (int) + Estimated length of genome, if not calculated from data + queryDB (str) + Query database location + use_mash (bool) + Use the mash backend + threads (int) + Number of threads to use if new db created + (default = 1) + + Returns: + qlist1 (list) + Ordered list of queries + distMat (numpy.array) + Query-query distances + """ + + constructDatabase = dbFuncs['constructDatabase'] + queryDatabase = dbFuncs['queryDatabase'] + readDBParams = dbFuncs['readDBParams'] + + # Set up query names + qList, qSeqs = readRfile(qfile, oneSeq = use_mash) + queryFiles = dict(zip(qList, qSeqs)) + if use_mash == True: + rNames = None + qNames = qSeqs + else: + rNames = qList + qNames = rNames + + # Calculate all query-query distances too, if updating database + qlist1, qlist2, distMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = queryDB, + queryPrefix = queryDB, + klist = kmers, + self = True, + number_plot_fits = 0, + threads=threads) + + return qlist1, distMat From a0c19512dd61cbe086808d8504880df7d0292a53 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sat, 2 May 2020 21:39:06 +0100 Subject: [PATCH 27/46] Get rid of null cluster value --- PopPUNK/lineage_clustering.py | 56 +++++++++++++---------------------- 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 0b8672f3..9aa156cd 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -57,7 +57,7 @@ def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, line rank = isolate_ranks[isolate] for isolate_neighbour in nn[isolate].keys(): # if lineage of neighbour is higher, or it is null value (highest) - if lineage_clustering[isolate_neighbour] > lineage_index: + if lineage_clustering[isolate_neighbour] is None or lineage_clustering[isolate_neighbour] > lineage_index: # check if rank of neighbour is lower than that of the current subject isolate for neighbour_rank in range(1,rank): if isolate_neighbour in lineage_clustering_information[neighbour_rank]: @@ -68,7 +68,7 @@ def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, line return lineage_clustering -def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering, null_cluster_value): +def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering): """ Identifies isolates corresponding to a particular lineage given a cluster seed. @@ -87,9 +87,6 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n Isolate to used to initiate next lineage. previous_lineage_clustering (dict) Clustering of existing dataset in previous iteration. - null_cluster_value (int) - Null cluster value used for unsetting lineage assignments - where this may change due to altered neighbour relationships. 
Returns: lineage_clustering (dict) @@ -98,7 +95,7 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n """ # first make all R neighbours of the seed part of the lineage if unclustered for seed_neighbour in neighbours[seed_isolate]: - if lineage_clustering[seed_neighbour] > lineage_index: + if lineage_clustering[seed_neighbour] is None or lineage_clustering[seed_neighbour] > lineage_index: lineage_clustering[seed_neighbour] = lineage_index # iterate through ranks; for each isolate, test if neighbour belongs to this cluster # overwrite higher cluster values - default value is higher than number of isolates @@ -108,7 +105,7 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n # iterate through isolates of same rank for isolate in lineage_clustering_information[rank]: # test if clustered/belonging to a more peripheral cluster - if lineage_clustering[isolate] > lineage_index: + if lineage_clustering[isolate] is None or lineage_clustering[isolate] > lineage_index: # get clusters of nearest neighbours isolate_neighbour_clusters = [lineage_clustering[isolate_neighbour] for isolate_neighbour in neighbours[isolate].keys()] # if a nearest neighbour is in this cluster @@ -130,12 +127,12 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n if higher_rank > rank: for higher_ranked_isolate in lineage_clustering_information[rank]: if lineage_clustering[isolate] == lineage_index: - lineage_clustering[isolate] = null_cluster_value + lineage_clustering[isolate] = None return lineage_clustering -def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_value, lineage_index, lineage_seed): +def get_seed_isolate(lineage_clustering, row_labels, distances, lineage_index, lineage_seed): """ Identifies the isolate used to initiate a cluster. Args: @@ -145,9 +142,6 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val Pairs of isolates labelling each distance. distances (numpy array) Pairwise distances used for defining relationships. - null_cluster_value (int) - Null cluster value used for unsetting lineage assignments - where this may change due to altered neighbour relationships. lineage_index (int) Label of current lineage.
lineage_seed (dict) @@ -166,15 +160,15 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, null_cluster_val # if seed now belongs to a different lineage, then lineage has been overwritten # seed may be unassigned if neighbours have changed - but this does not overwrite the # lineage in the same way - if lineage_clustering[original_seed_isolate] == null_cluster_value or lineage_clustering[original_seed_isolate] == lineage_index: + if lineage_clustering[original_seed_isolate] is None or lineage_clustering[original_seed_isolate] == lineage_index: seed_isolate = original_seed_isolate # else identify a new seed from the closest pair else: for index,(distance,(isolate1,isolate2)) in enumerate(zip(distances,row_labels)): - if lineage_clustering[isolate1] == null_cluster_value: + if lineage_clustering[isolate1] is None: seed_isolate = isolate1 break - elif lineage_clustering[isolate2] == null_cluster_value: + elif lineage_clustering[isolate2] is None: seed_isolate = isolate2 break # return identified seed isolate @@ -246,7 +240,7 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, rank): # return completed dict return nn -def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_clustering, null_cluster_value): +def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_clustering): """ Updates the information on nearest neighbours, given a new set of ref-query and query-query distances. @@ -263,9 +257,6 @@ def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_cl Pre-calculated neighbour relationships. lineage_clustering (dict) Clustering of existing dataset. - null_cluster_value (int) - Null cluster value used for unsetting lineage assignments - where this may change due to altered neighbour relationships. 
Returns: nn (nested dict) @@ -312,9 +303,9 @@ def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_cl if ref is not None: if distance <= max_distance[ref]: # unset isolate and references - lineage_clustering[ref] = null_cluster_value + lineage_clustering[ref] = None for neighbour in nn[ref]: - lineage_clustering[neighbour] = null_cluster_value + lineage_clustering[neighbour] = None # add neighbours nn[ref][query] = distance # delete links if no longer high ranked match @@ -383,9 +374,8 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None lineage_clustering = {} previous_lineage_clustering = {} - null_cluster_value = len(isolate_list) + 1 for rank in rank_list: - lineage_clustering[rank] = {i:null_cluster_value for i in isolate_list} + lineage_clustering[rank] = {i:None for i in isolate_list} lineage_seed[rank] = {} neighbours[rank] = {} previous_lineage_clustering[rank] = {} @@ -410,7 +400,6 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None # run clustering for an individual R with Pool(processes = num_processes) as pool: results = pool.map(partial(run_clustering_for_rank, - null_cluster_value = null_cluster_value, qlist = qlist, existing_scheme = existing_scheme, distances = distances_shared_array, @@ -451,7 +440,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None lineage_string = str(lineage_clustering[rank][isolate]) # include information on lineage clustering combined[titles_list[n]][isolate] = lineage_string - if lineage_clustering[rank][isolate] != previous_lineage_clustering[rank][isolate] and previous_lineage_clustering[rank][isolate] != null_cluster_value: + if lineage_clustering[rank][isolate] != previous_lineage_clustering[rank][isolate] and previous_lineage_clustering[rank][isolate] is not None: lineage_string = str(previous_lineage_clustering[rank][isolate]) + ':' + lineage_string if isolate in combined['Overall_lineage'].keys(): combined['Overall_lineage'][isolate] = combined['Overall_lineage'][isolate] + '-' + lineage_string @@ -467,7 +456,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None return combined -def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): +def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. @@ -477,8 +466,6 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist Integer specifying the maximum rank of neighbour used for clustering. Should be changed to int list for hierarchical clustering. - null_cluster_value (int) - Used to denote as-yet-unclustered isolates. qlist (list) List of query sequences being added to an existing clustering. Should be included within rlist. 
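The distances and distance_ranks arguments documented above arrive as shared-memory handles rather than arrays: cluster_into_lineages publishes the NumPy buffers once, and each run_clustering_for_rank worker re-attaches to them by name. A minimal sketch of that pattern, assuming Python 3.8+; NumpyShared and _min_at_rank are hypothetical stand-ins, not the PopPUNK implementation:

    from collections import namedtuple
    from functools import partial
    from multiprocessing import Pool, shared_memory
    from multiprocessing.managers import SharedMemoryManager

    import numpy as np

    # Hypothetical handle: just enough metadata for a worker to re-attach
    NumpyShared = namedtuple('NumpyShared', 'name shape dtype')

    def _min_at_rank(rank, distances = None):
        # re-attach to the shared block by name; the buffer is not copied
        shm = shared_memory.SharedMemory(name = distances.name)
        dists = np.ndarray(distances.shape, dtype = distances.dtype, buffer = shm.buf)
        result = (rank, float(np.sort(dists)[rank - 1]))  # rank-th smallest distance
        shm.close()
        return result

    if __name__ == '__main__':
        X = np.random.rand(100)
        with SharedMemoryManager() as smm:
            shm = smm.SharedMemory(size = X.nbytes)
            shared = np.ndarray(X.shape, dtype = X.dtype, buffer = shm.buf)
            shared[:] = X
            handle = NumpyShared(name = shm.name, shape = X.shape, dtype = X.dtype)
            with Pool(processes = 2) as pool:
                # one job per rank, as in pool.map(partial(run_clustering_for_rank, ...), rank_list)
                print(pool.map(partial(_min_at_rank, distances = handle), [1, 2, 3]))

Binding the shared handles through functools.partial leaves rank as the only positional argument, which is why pool.map can iterate directly over rank_list.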
@@ -508,7 +495,7 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist # reorder by sorted distances row_labels = [row_labels[i] for i in distance_ranks] - lineage_clustering = {i:null_cluster_value for i in isolate_list} + lineage_clustering = {i:None for i in isolate_list} previous_lineage_clustering = lineage_clustering lineage_seed = {} neighbours = {} @@ -522,7 +509,7 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist neighbours = neighbours_overall[rank] # add new queries to lineage clustering for q in qlist: - lineage_clustering[q] = null_cluster_value + lineage_clustering[q] = None previous_lineage_clustering = lineage_clustering neighbours, lineage_clustering = update_nearest_neighbours(distances, @@ -530,8 +517,7 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist rank, qlist, neighbours, - lineage_clustering, - null_cluster_value) + lineage_clustering) else: neighbours = generate_nearest_neighbours(distances, row_labels, @@ -540,13 +526,12 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist # run clustering lineage_index = 1 - while null_cluster_value in lineage_clustering.values(): + while None in lineage_clustering.values(): # get seed isolate based on minimum pairwise distances seed_isolate = get_seed_isolate(lineage_clustering, row_labels, distances, - null_cluster_value, lineage_index, lineage_seed) lineage_seed[lineage_index] = seed_isolate @@ -570,8 +555,7 @@ def run_clustering_for_rank(rank, null_cluster_value = None, qlist = None, exist rank, lineage_index, seed_isolate, - previous_lineage_clustering, - null_cluster_value) + previous_lineage_clustering) # increment index for next lineage lineage_index = lineage_index + 1 From 14ed2f4dde474fff9d45d92a7952e2a642ac1d87 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 3 May 2020 06:20:52 +0100 Subject: [PATCH 28/46] Change dict initialisation --- PopPUNK/lineage_clustering.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 9aa156cd..c8a6da01 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -208,12 +208,10 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): def generate_nearest_neighbours(distances, row_labels, isolate_list, rank): # data structures - nn = {} + nn = defaultdict(dict) last_dist = {} num_ranks = {} - for i in isolate_list: - nn[i] = {} - num_ranks[i] = 0 + num_ranks = {i:0 for i in isolate_list} total_isolates = len(isolate_list) num_distances = len(distances) completed_isolates = 0 From 2d712d28d7200918959ac857b310647eef4637ef Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 3 May 2020 13:54:56 +0100 Subject: [PATCH 29/46] Merge CSV reading functions --- PopPUNK/__main__.py | 8 ++--- PopPUNK/lineage_clustering.py | 36 ++----------------- PopPUNK/network.py | 20 +++++------ PopPUNK/utils.py | 65 ++++++++++++++++------------------- 4 files changed, 44 insertions(+), 85 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index caabfa46..f996a6a5 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -18,7 +18,6 @@ from .sketchlib import no_sketchlib, checkSketchlibLibrary from .lineage_clustering import cluster_into_lineages -from .lineage_clustering import readLineages from .network import fetchNetwork from .network import constructNetwork @@ -41,10 +40,10 @@ from .utils import readPickle from .utils 
import writeTmpFile from .utils import qcDistMat -from .utils import readClusters from .utils import translate_distMat from .utils import update_distance_matrices from .utils import readRfile +from .utils import readIsolateTypeFromCsv # Minimum sketchlib version SKETCHLIB_MAJOR = 1 @@ -634,11 +633,10 @@ def main(): # Use external clustering if specified if args.external_clustering: cluster_file = args.external_clustering - isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} + isolateClustering = readIsolateTypeFromCsv(cluster_file, mode = 'external', return_dict = True) if args.viz_lineages: cluster_file = args.viz_lineages - #isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} - isolateClustering = readLineages(cluster_file) + isolateClustering = readIsolateTypeFromCsv(cluster_file, mode = 'lineages', return_dict = True) else: # identify existing analysis files model_prefix = args.ref_db diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index c8a6da01..25c886fb 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -211,7 +211,7 @@ def generate_nearest_neighbours(distances, row_labels, isolate_list, rank): nn = defaultdict(dict) last_dist = {} num_ranks = {} - num_ranks = {i:0 for i in isolate_list} + num_ranks = {i:0 for i in isolate_list} total_isolates = len(isolate_list) num_distances = len(distances) completed_isolates = 0 @@ -416,7 +416,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None # print output combined = {} - titles_list = ['Lineage_R' + str(rank) for rank in rank_list] + titles_list = ['Lineage_Rank_' + str(rank) for rank in rank_list] lineage_output_name = output + "/" + output + "_lineage_clusters.csv" with open(lineage_output_name, 'w') as lFile: # print header @@ -560,35 +560,3 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc # return clustering return lineage_clustering, lineage_seed, neighbours, previous_lineage_clustering - -def readLineages(clustCSV): - """Read a previous reference clustering from CSV - - Args: - clustCSV (str) - File name of CSV with previous cluster assignments - - Returns: - clusters (dict) - Or if return_dict is set keys are sample names, - values are cluster assignments. 
- """ - clusters = {} - relevant_headers = [] - header_elements = [] - - with open(clustCSV, 'r') as csv_file: - header = csv_file.readline() - # identify columns to include - header_elements = header.rstrip().split(",") - relevant_headers.append(header_elements.index('Overall_lineage')) - relevant_headers.extend([n for n,i in enumerate(header_elements) if re.search('Lineage_R',i)]) - for h in relevant_headers: - clusters[header_elements[h]] = {} - for line in csv_file: - elements = line.rstrip().split(",") - if elements[0] != header_elements[0]: - for h in relevant_headers: - clusters[header_elements[h]][elements[0]] = elements[h] - - return clusters diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 6932bc38..9ad9bf73 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -21,8 +21,7 @@ from .sketchlib import calculateQueryQueryDistances from .utils import iterDistRows -from .utils import readClusters -from .utils import readExternalClusters +from .utils import readIsolateTypeFromCsv from .utils import readRfile def fetchNetwork(network_dir, model, refList, @@ -402,7 +401,7 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, return qlist1, distMat def printClusters(G, outPrefix = "_clusters.csv", oldClusterFile = None, - externalClusterCSV = None, printRef = True, printCSV = True): + externalClusterCSV = None, printRef = True, printCSV = True, clustering_type = 'combined'): """Get cluster assignments Also writes assignments to a CSV file @@ -413,26 +412,24 @@ def printClusters(G, outPrefix = "_clusters.csv", oldClusterFile = None, :func:`~addQueryToNetwork`) outPrefix (str) Prefix for output CSV - Default = "_clusters.csv" oldClusterFile (str) CSV with previous cluster assignments. Pass to ensure consistency in cluster assignment name. - Default = None externalClusterCSV (str) CSV with cluster assignments from any source. 
Will print a file relating these to new cluster assignments - Default = None printRef (bool) If false, print only query sequences in the output - Default = True printCSV (bool) Print results to file - Default = True + clustering_type (str) + Type of clustering network, used for comparison with old clusters + Default = 'combined' Returns: clustering (dict) @@ -446,7 +443,9 @@ oldNames = set() if oldClusterFile != None: - oldClusters = readClusters(oldClusterFile) +# oldClusters = readClusters(oldClusterFile) + oldAllClusters = readIsolateTypeFromCsv(oldClusterFile, mode = 'external', return_dict = False) + oldClusters = oldAllClusters[clustering_type] new_id = len(oldClusters) + 1 # 1-indexed while new_id in oldClusters: new_id += 1 # in case clusters have been merged @@ -557,7 +556,8 @@ d = defaultdict(list) # Read in external clusters - extClusters = readExternalClusters(extClusterFile) +# extClusters = readExternalClusters(extClusterFile) + extClusters = readIsolateTypeFromCsv(extClusterFile, mode = 'external', return_dict = False) # Go through each cluster (as defined by poppunk) and find the external # clusters that had previously been assigned to any sample in the cluster diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 695fa7fe..58d1f0fe 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -214,12 +214,12 @@ def qcDistMat(distMat, refList, queryList, a_max): return passed -def readClusters(clustCSV, return_dict=False): - """Read a previous reference clustering from CSV +def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): + """Read isolate types from CSV file. Args: clustCSV (str) - File name of CSV with previous cluster assignments + File name of CSV with isolate assignments return_dict (bool) If True, return a dict with sample->cluster instead of sets Returns: clusters (dict) sets containing samples in the cluster). Or if return_dict is set keys are sample names, values are cluster assignments.
""" + # data structures if return_dict: - clusters = {} + clusters = defaultdict(lambda: defaultdict(str)) else: - clusters = defaultdict(set) - - with open(clustCSV, 'r') as csv_file: - header = csv_file.readline() - for line in csv_file: - (sample, clust_id) = line.rstrip().split(",")[:2] + clusters = defaultdict(lambda: defaultdict(set)) + + # read CSV + clustersCsv = pd.read_csv(clustCSV, index_col = 0, quotechar='"') + + # select relevant columns according to mode + if mode == 'clusters': + type_columns = clustersCsv.columns[clustersCsv.columns.str.contains('_Cluster')] + elif mode == 'lineages': + type_columns = clustersCsv.columns[clustersCsv.columns.str.contains('Overall_lineage') | clustersCsv.columns.str.contains('Lineage_Rank_')] + elif mode == 'external': + type_columns = range(1,len(clustersCsv.columns)) + else: + sys.stderr.write('Unknown CSV reading mode: ' + mode + '\n') + exit(1) + + # read file + for row in clustersCsv.itertuples(): + for cls_idx in type_columns: + cluster_name = str(clustersCsv.columns[type_columns[cls_idx]]) + cluster_name = cluster_name.replace('__autocolour','') if return_dict: - clusters[sample] = clust_id + clusters[cluster_name][row.Index] = str(row[cls_idx]) else: - clusters[clust_id].add(sample) + clusters[cluster_name][str(row[cls_idx])].append([row.Index]) + # return data structure return clusters - -def readExternalClusters(clustCSV): - """Read a cluster definition from CSV (does not have to be PopPUNK - generated clusters). Rows samples, columns clusters. - - Args: - clustCSV (str) - File name of CSV with previous cluster assingments - - Returns: - extClusters (dict) - Dictionary of dictionaries of cluster assignments - (first key cluster assignment name, second key sample, value cluster assignment) - """ - extClusters = defaultdict(lambda: defaultdict(str)) - - extClustersFile = pd.read_csv(clustCSV, index_col = 0, quotechar='"') - for row in extClustersFile.itertuples(): - for cls_idx, cluster in enumerate(extClustersFile.columns): - extClusters[str(cluster)][row.Index] = str(row[cls_idx + 1]) - - return(extClusters) - - def translate_distMat(combined_list, core_distMat, acc_distMat): """Convert distances from a square form (2 NxN matrices) to a long form (1 matrix with n_comparisons rows and 2 columns). 
From ed6e7296d0206fbd82057e14a73ec82f1e5c9d53 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 3 May 2020 15:51:28 +0100 Subject: [PATCH 30/46] Print lineages CSV using writing function in plot.py --- PopPUNK/__main__.py | 2 +- PopPUNK/lineage_clustering.py | 67 +++++++++++++++-------------------- PopPUNK/plot.py | 23 ++++++------ 3 files changed, 41 insertions(+), 51 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index f996a6a5..fb3daf4f 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -840,7 +840,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances complete_distMat = translate_distMat(combined_seq, core_distMat, acc_distMat) if assign_lineage: - expected_lineage_name = ref_db + '/' + ref_db + '_lineageClusters.pkl' + expected_lineage_name = ref_db + '/' + ref_db + '_lineages.pkl' if existing_scheme is not None: expected_lineage_name = existing_scheme isolateClustering = cluster_into_lineages(complete_distMat, rank_list, output, combined_seq, ordered_queryList, expected_lineage_name, use_accessory, threads) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 25c886fb..7b30dc5a 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -23,6 +23,8 @@ from functools import partial # import poppunk package +from .plot import writeClusterCsv + from .utils import iterDistRows from .utils import readRfile @@ -411,48 +413,35 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None lineage_clustering[rank], lineage_seed[rank], neighbours[rank], previous_lineage_clustering[rank] = result # store output - with open(output + "/" + output + '_lineageClusters.pkl', 'wb') as pickle_file: + with open(output + "/" + output + '_lineages.pkl', 'wb') as pickle_file: pickle.dump([lineage_clustering, lineage_seed, neighbours, rank_list], pickle_file) - # print output - combined = {} - titles_list = ['Lineage_Rank_' + str(rank) for rank in rank_list] - lineage_output_name = output + "/" + output + "_lineage_clusters.csv" - with open(lineage_output_name, 'w') as lFile: - # print header - lFile.write('Id') - for t in titles_list: - lFile.write(',' + t + '__autocolor') - combined[t] = {} - lFile.write(',Overall_lineage') - combined['Overall_lineage'] = {} - if qlist is not None: - lFile.write(',Status') - lFile.write('\n') - - # print lines for each isolate - for isolate in lineage_clustering[rank].keys(): - lFile.write(isolate) - for n,rank in enumerate(rank_list): - lFile.write(',' + str(lineage_clustering[rank][isolate])) - lineage_string = str(lineage_clustering[rank][isolate]) - # include information on lineage clustering - combined[titles_list[n]][isolate] = lineage_string - if lineage_clustering[rank][isolate] != previous_lineage_clustering[rank][isolate] and previous_lineage_clustering[rank][isolate] is not None: - lineage_string = str(previous_lineage_clustering[rank][isolate]) + ':' + lineage_string - if isolate in combined['Overall_lineage'].keys(): - combined['Overall_lineage'][isolate] = combined['Overall_lineage'][isolate] + '-' + lineage_string - else: - combined['Overall_lineage'][isolate] = lineage_string - lFile.write(',' + combined['Overall_lineage'][isolate]) - if qlist is not None: - if isolate in qlist: - lFile.write(',Added') - else: - lFile.write(',Existing') - lFile.write('\n') + # process multirank lineages + overall_lineages = {} + overall_lineages = {'Rank_' + str(rank):{} for rank in rank_list} + overall_lineages['overall'] = {} + for 
isolate in isolate_list: + overall_lineage = None + for rank in rank_list: + overall_lineages['Rank_' + str(rank)][isolate] = lineage_clustering[rank][isolate] + if overall_lineage is None: + overall_lineage = str(lineage_clustering[rank][isolate]) + else: + overall_lineage = overall_lineage + '-' + str(lineage_clustering[rank][isolate]) + overall_lineages['overall'][isolate] = overall_lineage + + # print output as CSV + writeClusterCsv(output + "/" + output + '_lineages.csv', + isolate_list, + isolate_list, + overall_lineages, + output_format = 'phandango', + epiCsv = None, + queryNames = qlist, + suffix = '_Lineage') - return combined + # return lineages + return overall_lineages def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): """ Clusters isolates into lineages based on their diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 2c8b0b4e..13441f0c 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -399,7 +399,7 @@ def outputsForCytoscape(G, clustering, outPrefix, epiCsv, queryList = None, suff epiCsv, queryList) -def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = 'microreact', epiCsv = None, queryNames = None): +def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = 'microreact', epiCsv = None, queryNames = None, suffix = '_Cluster'): """Print CSV file of clustering and optionally epi data Writes CSV output of clusters which can be used as input to microreact and cytoscape. @@ -433,7 +433,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = if output_format == 'microreact': colnames = ['id'] for cluster_type in clustering: - col_name = cluster_type + '_Cluster__autocolour' + col_name = cluster_type + suffix + '__autocolour' colnames.append(col_name) if queryNames is not None: colnames.append('Status') @@ -441,7 +441,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = elif output_format == 'phandango': colnames = ['id'] for cluster_type in clustering: - col_name = cluster_type + '_Cluster' + col_name = cluster_type + suffix colnames.append(col_name) if queryNames is not None: colnames.append('Status') @@ -449,14 +449,14 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = elif output_format == 'grapetree': colnames = ['ID'] for cluster_type in clustering: - col_name = cluster_type + '_Cluster' + col_name = cluster_type + suffix colnames.append(col_name) if queryNames is not None: colnames.append('Status') elif output_format == 'cytoscape': colnames = ['id'] for cluster_type in clustering: - col_name = cluster_type + '_Cluster' + col_name = cluster_type + suffix colnames.append(col_name) if queryNames is not None: colnames.append('Status') @@ -485,7 +485,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = if output_format == 'microreact': d['id'].append(label) for cluster_type in clustering: - col_name = cluster_type + "_Cluster__autocolour" + col_name = cluster_type + suffix + "__autocolour" d[col_name].append(clustering[cluster_type][name]) if queryNames is not None: if name in queryNames: @@ -497,7 +497,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = elif output_format == 'phandango': d['id'].append(label) for cluster_type in clustering: - col_name = cluster_type + "_Cluster" + col_name = cluster_type + suffix d[col_name].append(clustering[cluster_type][name]) if queryNames is not None: 
if name in queryNames: @@ -509,7 +509,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = elif output_format == 'grapetree': d['ID'].append(label) for cluster_type in clustering: - col_name = cluster_type + "_Cluster" + col_name = cluster_type + suffix d[col_name].append(clustering[cluster_type][name]) if queryNames is not None: if name in queryNames: @@ -519,7 +519,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = elif output_format == 'cytoscape': d['id'].append(name) for cluster_type in clustering: - col_name = cluster_type + "_Cluster" + col_name = cluster_type + suffix d[col_name].append(clustering[cluster_type][name]) if queryNames is not None: if name in queryNames: @@ -530,7 +530,8 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = # avoid adding if len(columns_to_be_omitted) == 0: columns_to_be_omitted = ['id', 'Id', 'ID', 'combined_Cluster__autocolour', - 'core_Cluster__autocolour', 'accessory_Cluster__autocolour'] + 'core_Cluster__autocolour', 'accessory_Cluster__autocolour', + 'overall_Lineage'] for c in d: if c not in columns_to_be_omitted: columns_to_be_omitted.append(c) @@ -546,7 +547,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = else: sys.stderr.write("Cannot find " + name + " in clustering\n") sys.exit(1) - + # print CSV sys.stderr.write("Parsed data, now writing to CSV\n") try: From 464e1458b20e281de6ea73edfa7b6263b50eb408 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Sun, 3 May 2020 22:07:22 +0100 Subject: [PATCH 31/46] Use indices rather than names for saving memory --- PopPUNK/lineage_clustering.py | 30 +++++++++++++++------------ PopPUNK/utils.py | 39 +++++++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 7b30dc5a..e5a73a18 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -26,6 +26,7 @@ from .plot import writeClusterCsv from .utils import iterDistRows +from .utils import listDistInts from .utils import readRfile def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, lineage_clustering, lineage_clustering_information, nn): @@ -208,13 +209,13 @@ def get_lineage_clustering_information(seed_isolate, row_labels, distances): # return information return lineage_info -def generate_nearest_neighbours(distances, row_labels, isolate_list, rank): +def generate_nearest_neighbours(distances, row_labels, isolate_indices, rank): # data structures nn = defaultdict(dict) last_dist = {} num_ranks = {} - num_ranks = {i:0 for i in isolate_list} - total_isolates = len(isolate_list) + num_ranks = {i:0 for i in isolate_indices} + total_isolates = len(isolate_indices) num_distances = len(distances) completed_isolates = 0 index = 0 @@ -280,6 +281,7 @@ def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_cl # add query-query comparisons for query in query_nn.keys(): nn[query] = query_nn[query] + print('Query: ' + str(query) + ' nn ' + str(query_nn[query])) # calculate max distances for each isolate max_distance = {} @@ -375,7 +377,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None previous_lineage_clustering = {} for rank in rank_list: - lineage_clustering[rank] = {i:None for i in isolate_list} + lineage_clustering[rank] = {i:None for i in range(0,len(isolate_list))} lineage_seed[rank] = {} neighbours[rank] = {} 
previous_lineage_clustering[rank] = {} @@ -420,14 +422,14 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None overall_lineages = {} overall_lineages = {'Rank_' + str(rank):{} for rank in rank_list} overall_lineages['overall'] = {} - for isolate in isolate_list: + for index,isolate in enumerate(isolate_list): overall_lineage = None for rank in rank_list: - overall_lineages['Rank_' + str(rank)][isolate] = lineage_clustering[rank][isolate] + overall_lineages['Rank_' + str(rank)][isolate] = lineage_clustering[rank][index] if overall_lineage is None: - overall_lineage = str(lineage_clustering[rank][isolate]) + overall_lineage = str(lineage_clustering[rank][index]) else: - overall_lineage = overall_lineage + '-' + str(lineage_clustering[rank][isolate]) + overall_lineage = overall_lineage + '-' + str(lineage_clustering[rank][index]) overall_lineages['overall'][isolate] = overall_lineage # print output as CSV @@ -474,15 +476,16 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc distance_ranks_shm = shared_memory.SharedMemory(name = distance_ranks.name) distance_ranks = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_shm.buf) isolate_list = isolates + isolate_indices = range(0,len(isolate_list)) # calculate row labels # this is inefficient but there appears to be no way of sharing # strings between processes efficiently - row_labels = list(iter(iterDistRows(isolate_list, isolate_list, self = True))) + row_labels = listDistInts(isolate_list, isolate_list, self = True) # reorder by sorted distances row_labels = [row_labels[i] for i in distance_ranks] - lineage_clustering = {i:None for i in isolate_list} + lineage_clustering = {i:None for i in range(0,len(isolate_list))} previous_lineage_clustering = lineage_clustering lineage_seed = {} neighbours = {} @@ -495,20 +498,21 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc lineage_seed = lineage_seed_overall[rank] neighbours = neighbours_overall[rank] # add new queries to lineage clustering - for q in qlist: + q_indices = [isolate_list.index(q) for q in qlist] + for q in q_indices: lineage_clustering[q] = None previous_lineage_clustering = lineage_clustering neighbours, lineage_clustering = update_nearest_neighbours(distances, row_labels, rank, - qlist, + q_indices, neighbours, lineage_clustering) else: neighbours = generate_nearest_neighbours(distances, row_labels, - isolate_list, + isolate_indices, rank) # run clustering diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 58d1f0fe..ac7353ad 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -143,9 +143,7 @@ def iterDistRows(refSeqs, querySeqs, self=True): List of query sequence names. self (bool) Whether a self-comparison, used when constructing a database. - Requires refSeqs == querySeqs - Default is True Returns: ref, query (str, str) @@ -162,6 +160,43 @@ def iterDistRows(refSeqs, querySeqs, self=True): for ref in refSeqs: yield(ref, query) +def listDistInts(refSeqs, querySeqs, self=True): + """Gets the ref and query ID for each row of the distance matrix + + Returns an iterable with ref and query ID pairs by row. + + Args: + refSeqs (list) + List of reference sequence names. + querySeqs (list) + List of query sequence names. + self (bool) + Whether a self-comparison, used when constructing a database. + Requires refSeqs == querySeqs + Default is True + Returns: + ref, query (str, str) + Iterable of tuples with ref and query names for each distMat row. 
+ """ + dist_orders = [] + n = 0 + if self: + dist_orders = [(0,0)] * int(0.5 * len(refSeqs) * (len(refSeqs) - 1)) + if refSeqs != querySeqs: + raise RuntimeError('refSeqs must equal querySeqs for db building (self = true)') + for i in range(0, len(refSeqs)): + for j in range(i + 1, len(refSeqs)): + dist_orders[n] = (j, i) + n = n + 1 + else: + dist_orders = [(0,0)] * int(len(refSeqs) * (len(querySeqs))) + for i,query in enumerate(querySeqs): + for j,ref in enumerate(refSeqs): + dist_orders[n] = (j, i) + n = n + 1 + + # return final list of tuples + return dist_orders def writeTmpFile(fileList): """Writes a list to a temporary file. Used for turning variable into mash From 09172f569a2b7cbe060e2d1de3c72e66a4f688a1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 10:21:46 +0100 Subject: [PATCH 32/46] Fix pandas syntax --- PopPUNK/plot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 13441f0c..eb2ab87f 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -478,6 +478,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = nodeLabels = [r.split('/')[-1].split('.')[0] for r in nodeNames] # get example clustering name for validation + print('Clustering is ' + str(clustering)) example_cluster_title = list(clustering.keys())[0] for name, label in zip(nodeNames, nodeLabels): From ce3a7e97a170862a966d1fb2a2e6eb16c1a74db2 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 13:19:05 +0100 Subject: [PATCH 33/46] Correct CSV reading/writing --- PopPUNK/__main__.py | 1 + PopPUNK/lineage_clustering.py | 3 ++- PopPUNK/plot.py | 1 - PopPUNK/utils.py | 12 ++++++------ 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index fb3daf4f..6ca5be8b 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -637,6 +637,7 @@ def main(): if args.viz_lineages: cluster_file = args.viz_lineages isolateClustering = readIsolateTypeFromCsv(cluster_file, mode = 'lineages', return_dict = True) + print('CLUSTERS: ' + str(isolateClustering)) else: # identify existing analysis files model_prefix = args.ref_db diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index e5a73a18..da900a33 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -281,7 +281,6 @@ def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_cl # add query-query comparisons for query in query_nn.keys(): nn[query] = query_nn[query] - print('Query: ' + str(query) + ' nn ' + str(query_nn[query])) # calculate max distances for each isolate max_distance = {} @@ -413,6 +412,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None for n,result in enumerate(results): rank = rank_list[n] lineage_clustering[rank], lineage_seed[rank], neighbours[rank], previous_lineage_clustering[rank] = result + print('Rank: ' + str(rank) + ' seeds: ' + str(lineage_seed[rank])) # store output with open(output + "/" + output + '_lineages.pkl', 'wb') as pickle_file: @@ -496,6 +496,7 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc # focus on relevant data lineage_clustering = lineage_clustering_overall[rank] lineage_seed = lineage_seed_overall[rank] + print('Rank: ' + str(rank) + '\nSEEDS!: ' + str(lineage_seed)) neighbours = neighbours_overall[rank] # add new queries to lineage clustering q_indices = [isolate_list.index(q) for q in qlist] diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py index 
eb2ab87f..13441f0c 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -478,7 +478,6 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, output_format = nodeLabels = [r.split('/')[-1].split('.')[0] for r in nodeNames] # get example clustering name for validation - print('Clustering is ' + str(clustering)) example_cluster_title = list(clustering.keys())[0] for name, label in zip(nodeNames, nodeLabels): diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index ac7353ad..80a0b62a 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -267,7 +267,7 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): """ # data structures if return_dict: - clusters = defaultdict(lambda: defaultdict(str)) + clusters = defaultdict(dict) else: clusters = defaultdict(lambda: defaultdict(set)) @@ -276,9 +276,9 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): # select relevant columns according to mode if mode == 'clusters': - type_columns = clustersCsv.columns[clustersCsv.columns.str.contains('_Cluster')] + type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('_Cluster' in col)] elif mode == 'lineages': - type_columns = clustersCsv.columns[clustersCsv.columns.str.contains('Overall_lineage') | clustersCsv.columns.str.contains('Lineage_Rank_')] + type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('Rank_' in col or 'overall' in col)] elif mode == 'external': type_columns = range(1,len(clustersCsv.columns)) else: @@ -288,12 +288,12 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): # read file for row in clustersCsv.itertuples(): for cls_idx in type_columns: - cluster_name = str(clustersCsv.columns[type_columns[cls_idx]]) + cluster_name = clustersCsv.columns[cls_idx] cluster_name = cluster_name.replace('__autocolour','') if return_dict: - clusters[cluster_name][row.Index] = str(row[cls_idx]) + clusters[cluster_name][row.Index] = str(row[cls_idx + 1]) else: - clusters[cluster_name][str(row[cls_idx])].append([row.Index]) + clusters[cluster_name][str(row[cls_idx + 1])].append([row.Index]) # return data structure return clusters From c82e24f4edc858de85550852ce38de91f945b6e1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 15:55:20 +0100 Subject: [PATCH 34/46] Simplify lineage clustering --- PopPUNK/__main__.py | 1 - PopPUNK/lineage_clustering.py | 143 ++++++---------------------------- 2 files changed, 22 insertions(+), 122 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 6ca5be8b..fb3daf4f 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -637,7 +637,6 @@ def main(): if args.viz_lineages: cluster_file = args.viz_lineages isolateClustering = readIsolateTypeFromCsv(cluster_file, mode = 'lineages', return_dict = True) - print('CLUSTERS: ' + str(isolateClustering)) else: # identify existing analysis files model_prefix = args.ref_db diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index da900a33..fb400ddf 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -29,57 +29,13 @@ from .utils import listDistInts from .utils import readRfile -def cluster_neighbours_of_equal_or_lower_rank(isolate, rank, lineage_index, lineage_clustering, lineage_clustering_information, nn): - """ Iteratively adds neighbours of isolates of lower or equal rank - to a lineage if an isolate of a higher rank is added. - - Args: - isolate (string) - Isolate of higher rank added to lineage. 
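
The cls_idx + 1 offset introduced in the CSV reader above reflects how pandas itertuples() works: position 0 of each yielded tuple is the row label, so DataFrame column n sits at tuple position n + 1. A minimal illustration on hypothetical data:

    import pandas as pd

    df = pd.DataFrame({'Rank_1': [4, 7]}, index = ['s1', 's2'])
    for row in df.itertuples():
        print(row.Index, row[0 + 1])
    # s1 4
    # s2 7
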
- rank (int) - Rank of isolate added to lineage. - lineage_index (int) - Label of current lineage. - lineage_clustering (dict) - Clustering of existing dataset. - lineage_clustering_information (dict) - Dict listing isolates by ranked distance from seed. - nn (nested dict) - Pre-calculated neighbour relationships. - - Returns: - lineage_clustering (dict) - Assignment of isolates to lineages. - """ - isolates_to_check = [isolate] - isolate_ranks = {} - isolate_ranks[isolate] = rank - - while len(isolates_to_check) > 0: - for isolate in isolates_to_check: - rank = isolate_ranks[isolate] - for isolate_neighbour in nn[isolate].keys(): - # if lineage of neighbour is higher, or it is null value (highest) - if lineage_clustering[isolate_neighbour] is None or lineage_clustering[isolate_neighbour] > lineage_index: - # check if rank of neighbour is lower than that of the current subject isolate - for neighbour_rank in range(1,rank): - if isolate_neighbour in lineage_clustering_information[neighbour_rank]: - lineage_clustering[isolate_neighbour] = lineage_index - isolates_to_check.append(isolate_neighbour) - isolate_ranks[isolate_neighbour] = neighbour_rank - isolates_to_check.remove(isolate) - - return lineage_clustering - -def run_lineage_clustering(lineage_clustering, lineage_clustering_information, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering): +def run_lineage_clustering(lineage_clustering, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering): """ Identifies isolates corresponding to a particular lineage given a cluster seed. Args: lineage_clustering (dict) Clustering of existing dataset. - lineage_clustering_information (dict) - Dict listing isolates by ranked distance from seed. neighbours (nested dict) Pre-calculated neighbour relationships. rank (int) @@ -96,43 +52,28 @@ def run_lineage_clustering(lineage_clustering, lineage_clustering_information, n Assignment of isolates to lineages. 
""" - # first make all R neighbours of the seed part of the lineage if unclustered - for seed_neighbour in neighbours[seed_isolate]: - if lineage_clustering[seed_neighbour] is None or lineage_clustering[seed_neighbour] > lineage_index: - lineage_clustering[seed_neighbour] = lineage_index - # iterate through ranks; for each isolate, test if neighbour belongs to this cluster - # overwrite higher cluster values - default value is higer than number of isolates - # when querying, allows smaller clusters to be merged into more basal ones as connections - # made - for rank in lineage_clustering_information.keys(): - # iterate through isolates of same rank - for isolate in lineage_clustering_information[rank]: - # test if clustered/belonging to a more peripheral cluster + # generate sets of neighbours based on rank + neighbour_set = defaultdict(frozenset) + for isolate in neighbours.keys(): + neighbour_set[isolate] = frozenset(neighbours[isolate].keys()) + # initiate lineage as the seed isolate and immediate unclustered neighbours + in_lineage = {seed_isolate} + for seed_neighbours in neighbour_set[seed_isolate]: + if lineage_clustering[seed_neighbours] is None or lineage_clustering[seed_neighbours] > lineage_index: + in_lineage.add(seed_neighbours) + lineage_clustering[seed_neighbours] = lineage_index + # iterate through other isolates until converged on a stable clustering + alterations = len(neighbours.keys()) + while alterations > 0: + alterations = 0 + for isolate in neighbour_set.keys(): if lineage_clustering[isolate] is None or lineage_clustering[isolate] > lineage_index: - # get clusters of nearest neighbours - isolate_neighbour_clusters = [lineage_clustering[isolate_neighbour] for isolate_neighbour in neighbours[isolate].keys()] - # if a nearest neighbour is in this cluster - if lineage_index in isolate_neighbour_clusters: - # add isolate to lineage + intersection_size = in_lineage.intersection(neighbour_set[isolate]) + if intersection_size is not None and len(intersection_size) > 0: + in_lineage.add(isolate) lineage_clustering[isolate] = lineage_index - # add neighbours of same or lower rank to lineage if unclustered - lineage_clustering = cluster_neighbours_of_equal_or_lower_rank(isolate, - rank, - lineage_index, - lineage_clustering, - lineage_clustering_information, - neighbours) - # if this represents a change from the previous iteration of lineage definitions - # then the change needs to propagate through higher-ranked members - if isolate in previous_lineage_clustering: - if previous_lineage_clustering[isolate] == lineage_index and lineage_clustering[isolate] != lineage_index: - for higher_rank in lineage_clustering_information.keys(): - if higher_rank > rank: - for higher_ranked_isolate in lineage_clustering_information[rank]: - if lineage_clustering[isolate] == lineage_index: - lineage_clustering[isolate] = None - - + alterations = alterations + 1 + # return final clustering return lineage_clustering def get_seed_isolate(lineage_clustering, row_labels, distances, lineage_index, lineage_seed): @@ -177,38 +118,6 @@ def get_seed_isolate(lineage_clustering, row_labels, distances, lineage_index, l # return identified seed isolate return seed_isolate -def get_lineage_clustering_information(seed_isolate, row_labels, distances): - """ Generates the ranked distances needed for cluster - definition. - - Args: - seed_isolate (str) - Isolate used to initiate lineage. - row_labels (list of tuples) - Pairs of isolates labelling each distance. 
- distances (numpy array) - Pairwise distances used for defining relationships. - - Returns: - lineage_info (dict) - Dict listing isolates by ranked distance from seed. - - """ - # data structure - lineage_info = defaultdict(list) - rank = 0 - last_dist = -1 - # iterate through distances recording rank - for index,distance in enumerate(distances): - if seed_isolate in row_labels[index]: - if distance > last_dist: - rank = rank + 1 - last_dist = distance - pair = row_labels[index][0] if row_labels[index][1] == seed_isolate else row_labels[index][1] - lineage_info[rank].append(pair) - # return information - return lineage_info - def generate_nearest_neighbours(distances, row_labels, isolate_indices, rank): # data structures nn = defaultdict(dict) @@ -412,7 +321,6 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None for n,result in enumerate(results): rank = rank_list[n] lineage_clustering[rank], lineage_seed[rank], neighbours[rank], previous_lineage_clustering[rank] = result - print('Rank: ' + str(rank) + ' seeds: ' + str(lineage_seed[rank])) # store output with open(output + "/" + output + '_lineages.pkl', 'wb') as pickle_file: @@ -496,7 +404,6 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc # focus on relevant data lineage_clustering = lineage_clustering_overall[rank] lineage_seed = lineage_seed_overall[rank] - print('Rank: ' + str(rank) + '\nSEEDS!: ' + str(lineage_seed)) neighbours = neighbours_overall[rank] # add new queries to lineage clustering q_indices = [isolate_list.index(q) for q in qlist] @@ -534,15 +441,9 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc # record status of seed isolate lineage_clustering[seed_isolate] = lineage_index - - # get information for lineage clustering - lineage_clustering_information = get_lineage_clustering_information(seed_isolate, - row_labels, - distances) - + # cluster the lineages lineage_clustering = run_lineage_clustering(lineage_clustering, - lineage_clustering_information, neighbours, rank, lineage_index, From f75160a72eb49fdf10ef3c5387b9e5568284fae1 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 21:33:13 +0100 Subject: [PATCH 35/46] Edit to pass tests --- PopPUNK/__main__.py | 2 ++ PopPUNK/network.py | 32 +++++++++++++++++++++++--------- PopPUNK/utils.py | 15 ++++++++++----- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index fb3daf4f..159c8364 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -279,6 +279,7 @@ def main(): sketch_sizes = int(round(max(sketch_sizes.values())/64)) # check if working with lineages + rank_list = [] if args.lineage_clustering or args.assign_lineages: rank_list = sorted([int(x) for x in args.ranks.split(',')]) if min(rank_list) == 0 or max(rank_list) > 100: @@ -653,6 +654,7 @@ def main(): # Read in network and cluster assignment genomeNetwork, cluster_file = fetchNetwork(prev_clustering, model, rlist, args.core_only, args.accessory_only) + isolateClustering = readIsolateTypeFromCsv(cluster_file, mode = 'clusters', return_dict = True) # prune the network and dictionary of assignments genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(viz_subset)) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 9ad9bf73..4930bef4 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -326,6 +326,20 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, new_edges = [] assigned = set() + # 
These are returned + qlist1 = None + distMat = None + + # Set up query names + qList, qSeqs = readRfile(qfile, oneSeq = use_mash) + queryFiles = dict(zip(qList, qSeqs)) + if use_mash == True: + rNames = None + qNames = qSeqs + else: + rNames = qList + qNames = rNames + # store links for each query in a list of edge tuples for assignment, (ref, query) in zip(assignments, iterDistRows(rlist, qList, self=False)): if assignment == model.within_label: @@ -335,17 +349,17 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, # Calculate all query-query distances too, if updating database if queryQuery: sys.stderr.write("Calculating all query-query distances\n") - qlist, distMat = calculateQueryQueryDistances(dbFuncs, - refList, - q_files, + qlist1, distMat = calculateQueryQueryDistances(dbFuncs, + rNames, + qfile, kmers, estimated_length, - output, + queryDB, use_mash, threads) queryAssignation = model.assign(distMat) - for assignment, (ref, query) in zip(queryAssignation, iterDistRows(qlist, qlist, self=True)): + for assignment, (ref, query) in zip(queryAssignation, iterDistRows(qlist1, qlist1, self=True)): if assignment == model.within_label: new_edges.append((ref, query)) @@ -384,7 +398,7 @@ def addQueryToNetwork(dbFuncs, rlist, qfile, G, kmers, estimated_length, number_plot_fits = 0, threads = threads) queryAssignation = model.assign(distMat) - + # identify any links between queries and store in the same links dict # links dict now contains lists of links both to original database and new queries for assignment, (query1, query2) in zip(queryAssignation, iterDistRows(qlist1, qlist2, self=True)): @@ -443,10 +457,10 @@ def printClusters(G, outPrefix = "_clusters.csv", oldClusterFile = None, oldNames = set() if oldClusterFile != None: -# oldClusters = readClusters(oldClusterFile) oldAllClusters = readIsolateTypeFromCsv(oldClusterFile, mode = 'external', return_dict = False) - oldCluster = oldAllClusters[clustering_type] - new_id = len(oldClusters) + 1 # 1-indexed + oldClusters = oldAllClusters[list(oldAllClusters.keys())[0]] + print('oldCluster is ' + str(oldClusters)) + new_id = len(oldClusters.keys()) + 1 # 1-indexed while new_id in oldClusters: new_id += 1 # in case clusters have been merged diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 0e9c2efb..0a8f0133 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -268,22 +268,25 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): if return_dict: clusters = defaultdict(dict) else: - clusters = defaultdict(lambda: defaultdict(set)) + clusters = {} # read CSV clustersCsv = pd.read_csv(clustCSV, index_col = 0, quotechar='"') # select relevant columns according to mode if mode == 'clusters': - type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('_Cluster' in col)] + type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('Cluster' in col)] elif mode == 'lineages': type_columns = [n for n,col in enumerate(clustersCsv.columns) if ('Rank_' in col or 'overall' in col)] elif mode == 'external': - type_columns = range(1,len(clustersCsv.columns)) + if len(clustersCsv.columns) == 1: + type_columns = [0] + elif len(clustersCsv.columns) > 1: + type_columns = range((len(clustersCsv.columns)-1)) else: sys.stderr.write('Unknown CSV reading mode: ' + mode + '\n') exit(1) - + # read file for row in clustersCsv.itertuples(): for cls_idx in type_columns: @@ -292,7 +295,9 @@ def readIsolateTypeFromCsv(clustCSV, mode = 'clusters', return_dict = False): if return_dict: 
clusters[cluster_name][row.Index] = str(row[cls_idx + 1]) else: - clusters[cluster_name][str(row[cls_idx + 1])].append([row.Index]) + if cluster_name not in clusters.keys(): + clusters[cluster_name] = defaultdict(set) + clusters[cluster_name][str(row[cls_idx + 1])].add(row.Index) # return data structure return clusters From 46b1e1adab7379903cdb92c71a093005ffa91ace Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 21:59:45 +0100 Subject: [PATCH 36/46] Output directory generation for lineage clustering --- PopPUNK/__main__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 159c8364..a4fad21d 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -550,6 +550,14 @@ def main(): refList, queryList, self, distMat = readPickle(distances) + # make directory for new output files + if not os.path.isdir(args.output): + try: + os.makedirs(args.output) + except OSError: + sys.stderr.write("Cannot create output directory\n") + sys.exit(1) + # run lineage clustering if self: isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) @@ -812,7 +820,7 @@ def assign_query(dbFuncs, ref_db, q_files, output, update_db, full_db, distances # Update the network + ref list # only update network if assigning to strains - if full_db is False or assign_lineage is False: + if full_db is False and assign_lineage is False: mashOrder = refList + ordered_queryList newRepresentativesNames, newRepresentativesFile = extractReferences(genomeNetwork, mashOrder, output, refList) genomeNetwork.remove_nodes_from(set(genomeNetwork.nodes).difference(newRepresentativesNames)) From e72dcbc22bf3a840ce86af6f2b74be6f8b33a605 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 22:00:34 +0100 Subject: [PATCH 37/46] Add lineages clustering tests to run_test.py --- test/run_test.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index fd2bed8b..0dc981ab 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -72,6 +72,13 @@ sys.stderr.write("Running microreact visualisations (--generate-viz)\n") subprocess.run("python ../poppunk-runner.py --generate-viz --distances example_db_mash/example_db_mash.dists --ref-db example_db_mash --output example_viz --microreact --subset subset.txt", shell=True, check=True) +# lineage clustering +sys.stderr.write("Running lineage clustering test (--lineage-clustering)\n") +subprocess.run("python ../poppunk-runner.py --lineage-clustering --distances example_db/example_db.dists --output example_lineages --ranks 1,2,3,5", shell=True, check=True) + +# assign query to lineages +sys.stderr.write("Running query assignment (--assign-lineages)\n") +subprocess.run("python ../poppunk-runner.py --assign-lineages --q-files queries.txt --distances example_db/example_db.dists --ref-db example_db --existing-scheme example_lineages/example_lineages_lineages.pkl --output example_lineage_query --update-db", shell=True, check=True) # tests of other command line programs (TODO) From abf8d6ea9949ab94cc36e180cbeb111e03cf18f3 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Mon, 4 May 2020 22:07:49 +0100 Subject: [PATCH 38/46] Add mash tests for lineage clustering --- test/run_test.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index 0dc981ab..8f736720 100755 --- a/test/run_test.py +++ 
b/test/run_test.py @@ -80,6 +80,15 @@ sys.stderr.write("Running query assignment (--assign-lineages)\n") subprocess.run("python ../poppunk-runner.py --assign-lineages --q-files queries.txt --distances example_db/example_db.dists --ref-db example_db --existing-scheme example_lineages/example_lineages_lineages.pkl --output example_lineage_query --update-db", shell=True, check=True) +# lineage clustering with mash +sys.stderr.write("Running lineage clustering test (--lineage-clustering)\n") +subprocess.run("python ../poppunk-runner.py --lineage-clustering --distances example_db_mash/example_db_mash.dists --output example_lineages_mash --ranks 1,2,3,5 --use-mash", shell=True, check=True) + +# assign query to lineages with mash +sys.stderr.write("Running query assignment (--assign-lineages)\n") +subprocess.run("python ../poppunk-runner.py --assign-lineages --q-files queries.txt --distances example_db_mash/example_db_mash.dists --ref-db example_db_mash --existing-scheme example_lineages_mash/example_lineages_mash_lineages.pkl --output example_lineage_mash_query --update-db --use-mash", shell=True, check=True) + + # tests of other command line programs (TODO) sys.stderr.write("Tests completed\n") From 4717b0f2e698a4526950bbd3af8e8be340a4606c Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 05:33:38 +0100 Subject: [PATCH 39/46] Simplify variables --- PopPUNK/__main__.py | 4 ++-- PopPUNK/lineage_clustering.py | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index a4fad21d..40909770 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -560,9 +560,9 @@ def main(): # run lineage clustering if self: - isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, isolate_list = refList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) else: - isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, rlist = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) + isolateClustering = cluster_into_lineages(distMat, rank_list, args.output, isolate_list = refList, qlist = queryList, use_accessory = args.use_accessory, existing_scheme = args.existing_scheme, num_processes = args.threads) #*******************************# #* *# diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index fb400ddf..4c353b98 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -234,7 +234,7 @@ def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_cl # return updated dict return nn, lineage_clustering -def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None, qlist = None, existing_scheme = None, use_accessory = False, num_processes = 1): +def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list = None, qlist = None, existing_scheme = None, use_accessory = False, num_processes = 1): """ Clusters isolates into lineages based on their relative distances. @@ -245,7 +245,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None rank_list (list of int) Integers specifying the maximum rank of neighbours used for clustering. 
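
For orientation, a sketch of how the renamed entry point is driven from __main__.py (refList and distMat as returned by readPickle; the rank list mirrors --ranks 1,2,3,5 in the tests above):

    overall_lineages = cluster_into_lineages(distMat,
                                             rank_list = [1, 2, 3, 5],
                                             output = 'example_lineages',
                                             isolate_list = refList,
                                             use_accessory = False,
                                             existing_scheme = None,
                                             num_processes = 1)
    print(overall_lineages['Rank_1'])   # per-isolate lineage labels at rank 1
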
- rlist (list) + isolate_list (list) List of reference sequences. qlist (list) List of query sequences being added to an existing clustering. @@ -274,9 +274,6 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None distance_ranks = np.argsort(distances) distances = distances[distance_ranks] - # determine whether ref-ref or ref-query analysis - isolate_list = rlist - # determine whether novel analysis or modifying existing analysis use_existing = False neighbours = {} @@ -292,6 +289,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, rlist = None # shared memory data structures with SharedMemoryManager() as smm: + # share sorted distances distances_raw = smm.SharedMemory(size = distances.nbytes) distances_shared_array = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_raw.buf) From cb1f0d1336e3f38068f1ab6b87e55320f054541d Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 17:03:47 +0100 Subject: [PATCH 40/46] Reimplement lineage clustering using networks --- PopPUNK/lineage_clustering.py | 427 ++++++++++++---------------------- 1 file changed, 145 insertions(+), 282 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 4c353b98..9ec63062 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -12,6 +12,7 @@ from collections import defaultdict import pickle import collections +import networkx as nx from multiprocessing import Pool, Lock, Manager, RawArray, shared_memory, managers try: from multiprocessing import Pool, shared_memory @@ -28,11 +29,60 @@ from .utils import iterDistRows from .utils import listDistInts from .utils import readRfile +from .utils import update_distance_matrices -def run_lineage_clustering(lineage_clustering, neighbours, rank, lineage_index, seed_isolate, previous_lineage_clustering): +def get_chunk_ranges(N, nb): + step = N / nb + range_sizes = [(round(step*i), round(step*(i+1))) for i in range(nb)] + # extend to end of distMat + range_sizes[len(range_sizes) - 1] = (range_sizes[len(range_sizes) - 1][0],N) + # return ranges + return range_sizes + +def rank_distance_matrix(bounds, distances = None): + # load distance matrix from shared memory + distances_shm = shared_memory.SharedMemory(name = distances.name) + distances = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_shm.buf) + # rank relevant slide of distance matrix + ranks = np.apply_along_axis(rankdata, 1, distances[slice(*bounds),:], method = 'ordinal') + return ranks + +def get_nearest_neighbours(rank, isolates = None, ranks = None): + # data structure + nn = {} + # load shared ranks + ranks_shm = shared_memory.SharedMemory(name = ranks.name) + ranks = np.ndarray(ranks.shape, dtype = ranks.dtype, buffer = ranks_shm.buf) + # apply along axis + for i in isolates: + nn[i] = defaultdict(frozenset) + isolate_ranks = ranks[i,:] + closest_ranked = np.ravel(np.where(isolate_ranks <= rank)) + neighbours = frozenset(closest_ranked.tolist()) + nn[i] = neighbours + # return dict + return nn + + +def pick_seed_isolate(G, distances = None): + # identify unclustered isolates + unclustered_isolates = list(nx.isolates(G)) + # select minimum distance between unclustered isolates + minimum_distance_between_unclustered_isolates = np.amin(distances[unclustered_isolates,unclustered_isolates],axis = 0) + # select occurrences of this distance + minimum_distance_coordinates = np.where(distances == minimum_distance_between_unclustered_isolates) + # identify 
case where both isolates are unclustered + for i in range(len(minimum_distance_coordinates[0])): + if minimum_distance_coordinates[0][i] in unclustered_isolates and minimum_distance_coordinates[1][i] in unclustered_isolates: + seed_isolate = minimum_distance_coordinates[0][i] + break + # return unclustered isolate with minimum distance to another isolate + return seed_isolate + +def get_lineage(G, neighbours, seed_isolate, lineage_index): """ Identifies isolates corresponding to a particular lineage given a cluster seed. - + Args: lineage_clustering (dict) Clustering of existing dataset. @@ -50,189 +100,31 @@ def run_lineage_clustering(lineage_clustering, neighbours, rank, lineage_index, Returns: lineage_clustering (dict) Assignment of isolates to lineages. - + """ - # generate sets of neighbours based on rank - neighbour_set = defaultdict(frozenset) - for isolate in neighbours.keys(): - neighbour_set[isolate] = frozenset(neighbours[isolate].keys()) # initiate lineage as the seed isolate and immediate unclustered neighbours in_lineage = {seed_isolate} - for seed_neighbours in neighbour_set[seed_isolate]: - if lineage_clustering[seed_neighbours] is None or lineage_clustering[seed_neighbours] > lineage_index: - in_lineage.add(seed_neighbours) - lineage_clustering[seed_neighbours] = lineage_index + G.nodes[seed_isolate]['lineage'] = lineage_index + for seed_neighbour in neighbours[seed_isolate]: + if nx.is_isolate(G, seed_neighbour): + G.add_edge(seed_isolate, seed_neighbour) + G.nodes[seed_neighbour]['lineage'] = lineage_index + in_lineage.add(seed_neighbour) # iterate through other isolates until converged on a stable clustering alterations = len(neighbours.keys()) while alterations > 0: alterations = 0 - for isolate in neighbour_set.keys(): - if lineage_clustering[isolate] is None or lineage_clustering[isolate] > lineage_index: - intersection_size = in_lineage.intersection(neighbour_set[isolate]) + for isolate in neighbours.keys(): + if nx.is_isolate(G, isolate): + intersection_size = in_lineage.intersection(neighbours[isolate]) if intersection_size is not None and len(intersection_size) > 0: + for i in intersection_size: + G.add_edge(isolate, i) + G.nodes[isolate]['lineage'] = lineage_index in_lineage.add(isolate) - lineage_clustering[isolate] = lineage_index alterations = alterations + 1 # return final clustering - return lineage_clustering - -def get_seed_isolate(lineage_clustering, row_labels, distances, lineage_index, lineage_seed): - """ Identifies the isolate used to initiate a cluster. - - Args: - lineage_clustering (dict) - Clustering of existing dataset. - row_labels (list of tuples) - Pairs of isolates labelling each distance. - distances (numpy array) - Pairwise distances used for defining relationships. - lineage_index (int) - Label of current lineage. - lineage_seed (dict) - Dict of seeds used to initiate pre-existing lineage definitions. - - Returns: - seed_isolate (str) - Isolate to used to initiate next lineage. 
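
One numpy subtlety worth noting in pick_seed_isolate above: indexing with two equal-length lists, as in distances[unclustered_isolates, unclustered_isolates], selects only the diagonal entries (i, i); the full cross-product block between unclustered isolates needs np.ix_. A small demonstration:

    import numpy as np

    d = np.arange(16, dtype = float).reshape(4, 4)
    rows = [0, 2]
    print(d[rows, rows])             # diagonal pairs only: [ 0. 10.]
    print(d[np.ix_(rows, rows)])     # full 2x2 block between those isolates
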
- - """ - # variable to return - seed_isolate = None - # first test if there is an existing seed - if lineage_index in lineage_seed.keys(): - original_seed_isolate = lineage_seed[lineage_index] - # if seed now belongs to a different lineage, then lineage has been overwritten - # seed may be unassigned if neighbours have changed - but this does not overwrite the - # lineage in the same way - if lineage_clustering[original_seed_isolate] is None or lineage_clustering[original_seed_isolate] == lineage_index: - seed_isolate = original_seed_isolate - # else identify a new seed from the closest pair - else: - for index,(distance,(isolate1,isolate2)) in enumerate(zip(distances,row_labels)): - if lineage_clustering[isolate1] is None: - seed_isolate = isolate1 - break - elif lineage_clustering[isolate2] is None: - seed_isolate = isolate2 - break - # return identified seed isolate - return seed_isolate - -def generate_nearest_neighbours(distances, row_labels, isolate_indices, rank): - # data structures - nn = defaultdict(dict) - last_dist = {} - num_ranks = {} - num_ranks = {i:0 for i in isolate_indices} - total_isolates = len(isolate_indices) - num_distances = len(distances) - completed_isolates = 0 - index = 0 - # iterate through distances until all nearest neighbours identified - while completed_isolates < total_isolates and index < num_distances: - distance = distances[index] - # iterate through all listed isolates - for isolate in row_labels[index]: - if isolate in num_ranks.keys() and num_ranks[isolate] < rank: - # R is the number of unique distances so only increment if - # different from the last distance - if isolate in last_dist.keys() and last_dist[isolate] != distance: - num_ranks[isolate] = num_ranks[isolate] + 1 - # if maximum number of ranks reached, mark as complete - if num_ranks[isolate] == rank: # stop at R as counting from 0 - completed_isolates = completed_isolates + 1 - # if not, add as a nearest neighbour - else: - pair = row_labels[index][0] if row_labels[index][1] == isolate else row_labels[index][1] - nn[isolate][pair] = distance - last_dist[isolate] = distance - index = index + 1 - # return completed dict - return nn - -def update_nearest_neighbours(distances, row_labels, rank, qlist, nn, lineage_clustering): - """ Updates the information on nearest neighbours, given - a new set of ref-query and query-query distances. - - Args: - distances (numpy array) - Distances to be used for defining lineages. - row_labels (list of tuples) - Pairs of isolates labelling each distance. - rank (int) - Maximum rank of distance used to define nearest neighbours. - qlist (list) - List of queries to be added to existing dataset. - nn (nested dict) - Pre-calculated neighbour relationships. - lineage_clustering (dict) - Clustering of existing dataset. - - Returns: - nn (nested dict) - Updated neighbour relationships. - lineage_clustering (dict) - Updated assignment of isolates to lineage. 
- - """ - # iterate through isolates and test whether any comparisons with - # newly-added queries replace or supplement existing neighbours - - # data structures for altered entries - nn_new = {} - # pre-process to extract ref-query distances first - query_match_indices = [n for n, (r, q) in enumerate(row_labels) if q in qlist or r in qlist] - query_row_names = [row_labels[i] for i in query_match_indices] - query_distances = np.take(distances, query_match_indices) - - # get nn for query sequences - query_nn = generate_nearest_neighbours(distances, row_labels, qlist, rank) - # add query-query comparisons - for query in query_nn.keys(): - nn[query] = query_nn[query] - - # calculate max distances for each isolate - max_distance = {} - num_distances = {} - for isolate in nn.keys(): - neighbour_distances = set(nn[isolate].values()) - max_distance[isolate] = max(neighbour_distances) - num_distances[isolate] = len(neighbour_distances) # query-query comparisons may be < R - - # iterate through the ref-query distances - for index,(distance,(isolate1,isolate2)) in enumerate(zip(query_distances,query_row_names)): - # identify ref-query matches - ref = None - query = None - if isolate1 in max_distance.keys() and isolate2 not in max_distance.keys(): - ref = isolate1 - query = isolate2 - elif isolate2 in max_distance.keys() and isolate1 not in max_distance.keys(): - ref = isolate2 - query = isolate1 - if ref is not None: - if distance <= max_distance[ref]: - # unset isolate and references - lineage_clustering[ref] = None - for neighbour in nn[ref]: - lineage_clustering[neighbour] = None - # add neighbours - nn[ref][query] = distance - # delete links if no longer high ranked match - if distance < max_distance[ref]: - if num_distances[ref] == rank: - to_delete = [] - for other in nn[ref].keys(): - if nn[ref][other] == max_distance[ref]: - to_delete.append(other) - for other in to_delete: - del nn[ref][other] - else: - # if set from query-query distances < R - num_distances[ref] = num_distances[ref] + 1 - max_distance[ref] = max(nn[ref].values()) - # return updated dict - return nn, lineage_clustering + return G def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list = None, qlist = None, existing_scheme = None, use_accessory = False, num_processes = 1): """ Clusters isolates into lineages based on their @@ -262,67 +154,74 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list combined (dict) Assignment of each isolate to clusters by all ranks used. 
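
The body that follows leans on the Python 3.8 shared_memory machinery; a minimal round trip of the pattern (a sketch, not the NumpyShared wrapper itself): create a block under a SharedMemoryManager, copy an array in, then re-attach by name as a worker process would.

    import numpy as np
    from multiprocessing import shared_memory
    from multiprocessing.managers import SharedMemoryManager

    with SharedMemoryManager() as smm:
        a = np.arange(6, dtype = np.float64)
        block = smm.SharedMemory(size = a.nbytes)
        shared = np.ndarray(a.shape, dtype = a.dtype, buffer = block.buf)
        shared[:] = a[:]
        # a worker re-attaches by name, as run_clustering_for_rank does
        shm = shared_memory.SharedMemory(name = block.name)
        view = np.ndarray(a.shape, dtype = np.float64, buffer = shm.buf)
        print(view.sum())   # 15.0
        shm.close()
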
""" + # data structures + lineage_clustering = defaultdict(dict) + overall_lineage_seeds = defaultdict(dict) + overall_lineages = defaultdict(dict) + if existing_scheme is not None: + with open(existing_scheme, 'rb') as pickle_file: + lineage_clustering, overall_lineage_seeds, rank_list = pickle.load(pickle_file) + + # generate square distance matrix + seqLabels, coreMat, accMat = update_distance_matrices(isolate_list, distMat) + if use_accessory: + distances = accMat + else: + distances = coreMat + try: + assert seqLabels == isolate_list + except: + sys.stderr.write('Isolates in wrong order?') + exit(1) + + # list indices and set self-self to Inf + isolate_indices = [n for n,i in enumerate(isolate_list)] + for i in isolate_indices: + distances[i,i] = np.Inf - # process distance matrix - # - this should be sorted (PyTorch allows this to be done on GPUs) - # - then the functions can be reimplemented to run faster on a - # sorted list - distance_index = 1 if use_accessory else 0 - distances = distMat[:,distance_index] - - # sort distances - distance_ranks = np.argsort(distances) - distances = distances[distance_ranks] - - # determine whether novel analysis or modifying existing analysis - use_existing = False - neighbours = {} - lineage_seed = {} - lineage_clustering = {} - previous_lineage_clustering = {} - - for rank in rank_list: - lineage_clustering[rank] = {i:None for i in range(0,len(isolate_list))} - lineage_seed[rank] = {} - neighbours[rank] = {} - previous_lineage_clustering[rank] = {} - - # shared memory data structures + # get ranks of distances per row + chunk_boundaries = get_chunk_ranges(distances.shape[0], num_processes) with SharedMemoryManager() as smm: - # share sorted distances + # share isolate list + isolate_list_shared = smm.ShareableList(isolate_indices) + + # create shared memory object for distances distances_raw = smm.SharedMemory(size = distances.nbytes) distances_shared_array = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_raw.buf) distances_shared_array[:] = distances[:] distances_shared_array = NumpyShared(name = distances_raw.name, shape = distances.shape, dtype = distances.dtype) - # share distance ranks + # parallelise ranking of distances across CPUs + with Pool(processes = num_processes) as pool: + ranked_array = pool.map(partial(rank_distance_matrix, + distances = distances_shared_array), + chunk_boundaries) + + # concatenate ranks into shared memory + distance_ranks = np.concatenate(ranked_array) distance_ranks_raw = smm.SharedMemory(size = distance_ranks.nbytes) distance_ranks_shared_array = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_raw.buf) distance_ranks_shared_array[:] = distance_ranks[:] distance_ranks_shared_array = NumpyShared(name = distance_ranks_raw.name, shape = distance_ranks.shape, dtype = distance_ranks.dtype) - - # share isolate list - isolate_list_shared = smm.ShareableList(isolate_list) - - # run clustering for an individual R + + # parallelise neighbour identification for each rank with Pool(processes = num_processes) as pool: results = pool.map(partial(run_clustering_for_rank, - qlist = qlist, - existing_scheme = existing_scheme, - distances = distances_shared_array, - distance_ranks = distance_ranks_shared_array, - isolates = isolate_list_shared), + distances_input = distances_shared_array, + distance_ranks_input = distance_ranks_shared_array, + isolates = isolate_list_shared, + previous_seeds = overall_lineage_seeds), rank_list) # extract results from 
multiprocessing pool for n,result in enumerate(results): rank = rank_list[n] - lineage_clustering[rank], lineage_seed[rank], neighbours[rank], previous_lineage_clustering[rank] = result + lineage_clustering[rank], overall_lineage_seeds[rank] = result # store output with open(output + "/" + output + '_lineages.pkl', 'wb') as pickle_file: - pickle.dump([lineage_clustering, lineage_seed, neighbours, rank_list], pickle_file) + pickle.dump([lineage_clustering, overall_lineage_seeds, rank_list], pickle_file) # process multirank lineages overall_lineages = {} @@ -351,7 +250,7 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list # return lineages return overall_lineages -def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distances = None, distance_ranks = None, isolates = None): +def run_clustering_for_rank(rank, distances_input = None, distance_ranks_input = None, isolates = None, previous_seeds = None): """ Clusters isolates into lineages based on their relative distances using a single R to enable parallelisation. @@ -377,79 +276,43 @@ def run_clustering_for_rank(rank, qlist = None, existing_scheme = False, distanc """ # load shared memory objects - distances_shm = shared_memory.SharedMemory(name = distances.name) - distances = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_shm.buf) - distance_ranks_shm = shared_memory.SharedMemory(name = distance_ranks.name) - distance_ranks = np.ndarray(distance_ranks.shape, dtype = distance_ranks.dtype, buffer = distance_ranks_shm.buf) + distances_shm = shared_memory.SharedMemory(name = distances_input.name) + distances = np.ndarray(distances_input.shape, dtype = distances_input.dtype, buffer = distances_shm.buf) + distance_ranks_shm = shared_memory.SharedMemory(name = distance_ranks_input.name) + distance_ranks = np.ndarray(distance_ranks_input.shape, dtype = distance_ranks_input.dtype, buffer = distance_ranks_shm.buf) isolate_list = isolates isolate_indices = range(0,len(isolate_list)) + + # load previous scheme + seeds = {} + if previous_seeds is not None: + seeds = previous_seeds[rank] + + # create graph structure + G = nx.Graph() + G.add_nodes_from(isolate_indices) + G.nodes.data('lineage', default = 0) - # calculate row labels - # this is inefficient but there appears to be no way of sharing - # strings between processes efficiently - row_labels = listDistInts(isolate_list, isolate_list, self = True) - # reorder by sorted distances - row_labels = [row_labels[i] for i in distance_ranks] - - lineage_clustering = {i:None for i in range(0,len(isolate_list))} - previous_lineage_clustering = lineage_clustering - lineage_seed = {} - neighbours = {} + # identify nearest neighbours + nn = get_nearest_neighbours(rank, + ranks = distance_ranks_input, + isolates = isolate_list) - if existing_scheme is not None: - with open(existing_scheme, 'rb') as pickle_file: - lineage_clustering_overall, lineage_seed_overall, neighbours_overall, rank_list = pickle.load(pickle_file) - # focus on relevant data - lineage_clustering = lineage_clustering_overall[rank] - lineage_seed = lineage_seed_overall[rank] - neighbours = neighbours_overall[rank] - # add new queries to lineage clustering - q_indices = [isolate_list.index(q) for q in qlist] - for q in q_indices: - lineage_clustering[q] = None - previous_lineage_clustering = lineage_clustering - - neighbours, lineage_clustering = update_nearest_neighbours(distances, - row_labels, - rank, - q_indices, - neighbours, - lineage_clustering) - else: - 
neighbours = generate_nearest_neighbours(distances, - row_labels, - isolate_indices, - rank) - - # run clustering + # iteratively identify lineages lineage_index = 1 - while None in lineage_clustering.values(): - - # get seed isolate based on minimum pairwise distances - seed_isolate = get_seed_isolate(lineage_clustering, - row_labels, - distances, - lineage_index, - lineage_seed) - lineage_seed[lineage_index] = seed_isolate - - # seed isolate is None if a previously-existing cluster has been overwritten - # in which case pass over the lineage to keep nomenclature consistent - if seed_isolate is not None: - - # record status of seed isolate - lineage_clustering[seed_isolate] = lineage_index - - # cluster the lineages - lineage_clustering = run_lineage_clustering(lineage_clustering, - neighbours, - rank, - lineage_index, - seed_isolate, - previous_lineage_clustering) - - # increment index for next lineage + while nx.number_of_isolates(G) > 0: + if lineage_index in seeds.keys(): + seed_isolate = seeds[lineage_index] + else: + seed_isolate = pick_seed_isolate(G, distances = distances) + # skip over previously-defined seeds if amalgamated into different lineage now + if nx.is_isolate(G, seed_isolate): + seeds[lineage_index] = seed_isolate + G = get_lineage(G, nn, seed_isolate, lineage_index) lineage_index = lineage_index + 1 + # identify components and name lineages + lineage_clustering = {node:nodedata for (node, nodedata) in G.nodes(data='lineage')} + # return clustering - return lineage_clustering, lineage_seed, neighbours, previous_lineage_clustering + return lineage_clustering, seeds From 9b143e3b0327bfa0f286fc22eb34dc39ef54b976 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 20:11:45 +0100 Subject: [PATCH 41/46] Tidy up redundant code --- PopPUNK/lineage_clustering.py | 2 -- PopPUNK/utils.py | 38 ----------------------------------- 2 files changed, 40 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 9ec63062..4c00a170 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -27,8 +27,6 @@ from .plot import writeClusterCsv from .utils import iterDistRows -from .utils import listDistInts -from .utils import readRfile from .utils import update_distance_matrices def get_chunk_ranges(N, nb): diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 0a8f0133..b019f922 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -159,44 +159,6 @@ def iterDistRows(refSeqs, querySeqs, self=True): for ref in refSeqs: yield(ref, query) -def listDistInts(refSeqs, querySeqs, self=True): - """Gets the ref and query ID for each row of the distance matrix - - Returns an iterable with ref and query ID pairs by row. - - Args: - refSeqs (list) - List of reference sequence names. - querySeqs (list) - List of query sequence names. - self (bool) - Whether a self-comparison, used when constructing a database. - Requires refSeqs == querySeqs - Default is True - Returns: - ref, query (str, str) - Iterable of tuples with ref and query names for each distMat row. 
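
A toy run of the graph machinery assembled above (assuming get_lineage from this module is in scope): a seed plus rank-1 neighbour sets grow one connected component per lineage, with membership recorded on the 'lineage' node attribute.

    import networkx as nx

    nn = {0: frozenset({1}), 1: frozenset({0, 2}), 2: frozenset({1})}
    G = nx.Graph()
    G.add_nodes_from(nn.keys())
    G = get_lineage(G, nn, 0, 1)
    print(nx.number_of_isolates(G))   # 0 - all three isolates joined
    print(G.nodes[2]['lineage'])      # 1
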
- """ - dist_orders = [] - n = 0 - if self: - dist_orders = [(0,0)] * int(0.5 * len(refSeqs) * (len(refSeqs) - 1)) - if refSeqs != querySeqs: - raise RuntimeError('refSeqs must equal querySeqs for db building (self = true)') - for i in range(0, len(refSeqs)): - for j in range(i + 1, len(refSeqs)): - dist_orders[n] = (j, i) - n = n + 1 - else: - dist_orders = [(0,0)] * int(len(refSeqs) * (len(querySeqs))) - for i,query in enumerate(querySeqs): - for j,ref in enumerate(refSeqs): - dist_orders[n] = (j, i) - n = n + 1 - - # return final list of tuples - return dist_orders - def writeTmpFile(fileList): """Writes a list to a temporary file. Used for turning variable into mash input. From 905e25b2be2180a9f4835dbddf235b01660828f6 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 20:43:24 +0100 Subject: [PATCH 42/46] Tidy up documentation --- PopPUNK/lineage_clustering.py | 85 ++++++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 16 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index 4c00a170..a11f5fd7 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -13,7 +13,7 @@ import pickle import collections import networkx as nx -from multiprocessing import Pool, Lock, Manager, RawArray, shared_memory, managers +from multiprocessing import Pool, RawArray, shared_memory, managers try: from multiprocessing import Pool, shared_memory from multiprocessing.managers import SharedMemoryManager @@ -30,6 +30,19 @@ from .utils import update_distance_matrices def get_chunk_ranges(N, nb): + """ Calculates boundaries for dividing distances array + into chunks for parallelisation. + + Args: + N (int) + Number of rows in array + nb (int) + Number of blocks into which to divide array. + + Returns: + range_sizes (list of tuples) + Limits of blocks for dividing array. + """ step = N / nb range_sizes = [(round(step*i), round(step*(i+1))) for i in range(nb)] # extend to end of distMat @@ -38,6 +51,19 @@ def get_chunk_ranges(N, nb): return range_sizes def rank_distance_matrix(bounds, distances = None): + """ Ranks distances between isolates for each index (row) + isolate. + + Args: + bounds (2-tuple) + Range of rows to process in this thread. + distances (int) + Shared memory object storing pairwise distances. + + Returns: + ranks (numpy ndarray) + Ranks of distances for each row. + """ # load distance matrix from shared memory distances_shm = shared_memory.SharedMemory(name = distances.name) distances = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_shm.buf) @@ -46,6 +72,22 @@ def rank_distance_matrix(bounds, distances = None): return ranks def get_nearest_neighbours(rank, isolates = None, ranks = None): + """ Identifies sets of nearest neighbours for each isolate. + + Args: + rank (int) + Rank used in analysis. + isolates (int list) + List of isolate indices. + ranks (ndarray) + Shared memory object pointing to ndarray of + ranked pairwise distances. + + Returns: + nn (default dict of frozensets) + Dict indexed by isolates, values are a + frozen set of nearest neighbours. + """ # data structure nn = {} # load shared ranks @@ -63,6 +105,19 @@ def get_nearest_neighbours(rank, isolates = None, ranks = None): def pick_seed_isolate(G, distances = None): + """ Identifies seed isolate from the closest pair of + unclustered isolates. + + Args: + G (network) + Network with one node per isolate. + distances (ndarray of pairwise distances) + Pairwise distances between isolates. 
+ + Returns: + seed_isolate (int) + Index of isolate selected as seed. + """ # identify unclustered isolates unclustered_isolates = list(nx.isolates(G)) # select minimum distance between unclustered isolates @@ -82,23 +137,18 @@ def get_lineage(G, neighbours, seed_isolate, lineage_index): lineage given a cluster seed. Args: - lineage_clustering (dict) - Clustering of existing dataset. - neighbours (nested dict) + G (network) + Network with one node per isolate. + neighbours (dict of frozen sets) Pre-calculated neighbour relationships. - rank (int) - Maximum rank of neighbours used for clustering. + seed_isolate (int) + Index of isolate selected as seed. lineage_index (int) Label of current lineage. - seed_isolate (str) - Isolate to used to initiate next lineage. - previous_lineage_clustering (dict) - Clustering of existing dataset in previous iteration. Returns: - lineage_clustering (dict) - Assignment of isolates to lineages. - + G (network) + Network modified with new edges. """ # initiate lineage as the seed isolate and immediate unclustered neighbours in_lineage = {seed_isolate} @@ -135,11 +185,13 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list rank_list (list of int) Integers specifying the maximum rank of neighbours used for clustering. + output (string) + Prefix used for printing output files. isolate_list (list) List of reference sequences. qlist (list) List of query sequences being added to an existing clustering. - Should be included within rlist. + Should be included within isolate_list. existing_scheme (str) Path to pickle file containing lineage scheme to which isolates should be added. @@ -149,9 +201,10 @@ def cluster_into_lineages(distMat, rank_list = None, output = None, isolate_list Number of CPUs to use for calculations. Returns: - combined (dict) - Assignment of each isolate to clusters by all ranks used. + overall_lineages (nested dict) + Dict for each rank listing the lineage of each isolate. """ + # data structures lineage_clustering = defaultdict(dict) overall_lineage_seeds = defaultdict(dict) From df81bed4b27d7c21bfc6c53ef6c599cbd96b2587 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 20:50:23 +0100 Subject: [PATCH 43/46] Changing memory use in seed_isolate function --- PopPUNK/lineage_clustering.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/PopPUNK/lineage_clustering.py b/PopPUNK/lineage_clustering.py index a11f5fd7..d553801a 100644 --- a/PopPUNK/lineage_clustering.py +++ b/PopPUNK/lineage_clustering.py @@ -57,7 +57,7 @@ def rank_distance_matrix(bounds, distances = None): Args: bounds (2-tuple) Range of rows to process in this thread. - distances (int) + distances (ndarray in shared memory) Shared memory object storing pairwise distances. Returns: @@ -79,7 +79,7 @@ def get_nearest_neighbours(rank, isolates = None, ranks = None): Rank used in analysis. isolates (int list) List of isolate indices. - ranks (ndarray) + ranks (ndarray in shared memory) Shared memory object pointing to ndarray of ranked pairwise distances. @@ -111,13 +111,16 @@ def pick_seed_isolate(G, distances = None): Args: G (network) Network with one node per isolate. - distances (ndarray of pairwise distances) + distances (ndarray in shared memory) Pairwise distances between isolates. Returns: seed_isolate (int) Index of isolate selected as seed. 
""" + # load distances from shared memory + distances_shm = shared_memory.SharedMemory(name = distances.name) + distances = np.ndarray(distances.shape, dtype = distances.dtype, buffer = distances_shm.buf) # identify unclustered isolates unclustered_isolates = list(nx.isolates(G)) # select minimum distance between unclustered isolates @@ -355,7 +358,7 @@ def run_clustering_for_rank(rank, distances_input = None, distance_ranks_input = if lineage_index in seeds.keys(): seed_isolate = seeds[lineage_index] else: - seed_isolate = pick_seed_isolate(G, distances = distances) + seed_isolate = pick_seed_isolate(G, distances = distances_input) # skip over previously-defined seeds if amalgamated into different lineage now if nx.is_isolate(G, seed_isolate): seeds[lineage_index] = seed_isolate From 3ebd9bc54bf64f2bfc5496b832a259a5ef76e1ef Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 21:37:22 +0100 Subject: [PATCH 44/46] Change to minimum k-mer warning message --- PopPUNK/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index b019f922..80f0e586 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -463,9 +463,8 @@ def assembly_qc(assemblyList, klist, ignoreLengthOutliers, estimated_length): sys.stderr.write("WARNING: Average length over 10Mb - are these assemblies?\n") k_min = min(klist) - max_prob = 1/(pow(4, k_min)/float(genome_length) + 1) - if 1/(pow(4, k_min)/float(genome_length) + 1) > 0.05: - sys.stderr.write("Minimum k-mer length " + str(k_min) + " is too small for genome length " + str(genome_length) +"; please increase to avoid nonsense results\n") + if k_min < 6: + sys.stderr.write("Minimum k-mer length is 6 - any shorter is unlikely to generate sensible results.\n") exit(1) return (int(genome_length), max_prob) From f93700c045e8dc65a2c1b34f4ccd190af3f0e1cf Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 21:43:11 +0100 Subject: [PATCH 45/46] Change random match probability message --- PopPUNK/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 80f0e586..265ac7ea 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -463,8 +463,10 @@ def assembly_qc(assemblyList, klist, ignoreLengthOutliers, estimated_length): sys.stderr.write("WARNING: Average length over 10Mb - are these assemblies?\n") k_min = min(klist) + if 1/(pow(4, k_min)/float(genome_length) + 1) > 0.05: + sys.stderr.write("Minimum k-mer length " + str(k_min) + " is too small for genome length " + str(genome_length) +"; results will be adjusted for random match probabilities\n") if k_min < 6: - sys.stderr.write("Minimum k-mer length is 6 - any shorter is unlikely to generate sensible results.\n") + sys.stderr.write("Minimum k-mer length is too low; please increase to at least 6\n") exit(1) return (int(genome_length), max_prob) From b801b26df4ffde5f281ea6f6e3662c2d4114f9e7 Mon Sep 17 00:00:00 2001 From: nickjcroucher Date: Tue, 5 May 2020 21:45:17 +0100 Subject: [PATCH 46/46] Define max_prob variable --- PopPUNK/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 265ac7ea..9618c945 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -463,7 +463,8 @@ def assembly_qc(assemblyList, klist, ignoreLengthOutliers, estimated_length): sys.stderr.write("WARNING: Average length over 10Mb - are these assemblies?\n") k_min = min(klist) - if 1/(pow(4, k_min)/float(genome_length) + 1) > 0.05: + max_prob = 
1/(pow(4, k_min)/float(genome_length) + 1) + if max_prob > 0.05: sys.stderr.write("Minimum k-mer length " + str(k_min) + " is too small for genome length " + str(genome_length) +"; results will be adjusted for random match probabilities\n") if k_min < 6: sys.stderr.write("Minimum k-mer length is too low; please increase to at least 6\n")
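
For concreteness, the random-match probability now stored in max_prob works out as follows for a 2 Mb genome: k = 9 fails the 5% threshold used above by a wide margin, while k = 13 passes comfortably.

    genome_length = 2000000
    for k_min in (9, 13):
        max_prob = 1/(pow(4, k_min)/float(genome_length) + 1)
        print(k_min, round(max_prob, 4))
    # 9 0.8841
    # 13 0.0289
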