diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py index 012742f2..82bed522 100644 --- a/PopPUNK/__init__.py +++ b/PopPUNK/__init__.py @@ -3,7 +3,7 @@ '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)''' -__version__ = '2.4.2' +__version__ = '2.4.3' # Minimum sketchlib version SKETCHLIB_MAJOR = 1 diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 1091cf8c..f04b25d0 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -526,7 +526,7 @@ def main(): indivNetworks = {} for dist_type, slope in zip(['core', 'accessory'], [0, 1]): if args.indiv_refine == 'both' or args.indiv_refine == dist_type: - indivAssignments = model.assign(distMat, slope) + indivAssignments = model.assign(distMat, slope = slope) indivNetworks[dist_type] = \ construct_network_from_assignments(refList, queryList, @@ -542,7 +542,7 @@ def main(): use_gpu = args.gpu_graph) save_network(indivNetworks[dist_type], prefix = output, - suffix = '_graph', + suffix = '_' + dist_type + '_graph', use_gpu = args.gpu_graph) #******************************# @@ -553,26 +553,48 @@ def main(): # extract limited references from clique by default # (this no longer loses information and should generally be kept on) if model.type != "lineage": - newReferencesIndices, newReferencesNames, newReferencesFile, genomeNetwork = \ - extractReferences(genomeNetwork, - refList, - output, - type_isolate = qc_dict['type_isolate'], - threads = args.threads, + dist_type_list = ['original'] + dist_string_list = [''] + if args.indiv_refine == 'both' or args.indiv_refine == 'core': + dist_type_list.append('core') + dist_string_list.append('_core') + if args.indiv_refine == 'both' or args.indiv_refine == 'accessory': + dist_type_list.append('accessory') + dist_string_list.append('_accessory') + # Iterate through different network types + for dist_type, dist_string in zip(dist_type_list, dist_string_list): + if dist_type == 'original': + network_for_refs = genomeNetwork + elif dist_type == 'core': + network_for_refs = indivNetworks[dist_type] + elif dist_type == 'accessory': + network_for_refs = indivNetworks[dist_type] + newReferencesIndices, newReferencesNames, newReferencesFile, genomeNetwork = \ + extractReferences(network_for_refs, + refList, + output, + outSuffix = dist_string, + type_isolate = qc_dict['type_isolate'], + threads = args.threads, + use_gpu = args.gpu_graph) + nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices) + names_to_remove = [refList[n] for n in nodes_to_remove] + + if (len(names_to_remove) > 0): + # Save reference distances + dists_suffix = dist_string + '.refs.dists' + prune_distance_matrix(refList, names_to_remove, distMat, + output + "/" + os.path.basename(output) + dists_suffix) + # Save reference network + graphs_suffix = dist_string + '.refs_graph' + save_network(genomeNetwork, + prefix = output, + suffix = graphs_suffix, use_gpu = args.gpu_graph) - nodes_to_remove = set(range(len(refList))).difference(newReferencesIndices) - names_to_remove = [refList[n] for n in nodes_to_remove] - - if (len(names_to_remove) > 0): - # Save reference distances - prune_distance_matrix(refList, names_to_remove, distMat, - output + "/" + os.path.basename(output) + ".refs.dists") - # Save reference network - save_network(genomeNetwork, prefix = output, suffix = ".refs_graph", - use_gpu = args.gpu_graph) - removeFromDB(args.ref_db, output, names_to_remove) - os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", - output + "/" + os.path.basename(output) + ".refs.h5") + db_suffix = dist_string 
+ '.refs.h5' + removeFromDB(args.ref_db, output, names_to_remove) + os.rename(output + "/" + os.path.basename(output) + '.tmp.h5', + output + "/" + os.path.basename(output) + db_suffix) sys.stderr.write("\nDone\n") diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index fb933f9d..7517ac19 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -43,8 +43,8 @@ def assign_query(dbFuncs, strand_preserved, previous_clustering, external_clustering, - core_only, - accessory_only, + core, + accessory, gpu_sketch, gpu_dist, gpu_graph, @@ -66,6 +66,7 @@ def assign_query(dbFuncs, from .network import addQueryToNetwork from .network import printClusters from .network import save_network + from .network import get_vertex_list from .plot import writeClusterCsv @@ -116,220 +117,290 @@ def assign_query(dbFuncs, # Find distances to reference db kmers, sketch_sizes, codon_phased = readDBParams(ref_db) - # Find distances vs ref seqs - rNames = [] - use_ref_graph = \ - os.path.isfile(ref_db + "/" + os.path.basename(ref_db) + ".refs") \ - and not update_db and model.type != 'lineage' - if use_ref_graph: - with open(ref_db + "/" + os.path.basename(ref_db) + ".refs") as refFile: - for reference in refFile: - rNames.append(reference.rstrip()) - else: - if os.path.isfile(distances + ".pkl"): - rNames = readPickle(distances, enforce_self = True, distances=False)[0] - elif update_db: - sys.stderr.write("Reference distances missing, cannot use --update-db\n") - sys.exit(1) + # Iterate through different types of model fit with a refined model when specified + # Core and accessory assignments use the same model and same overall set of distances + # but have different networks, references, reference distances and assignments + fit_type_list = ['default'] + if model.type == 'refine' and model.indiv_fitted: + if core: + fit_type_list.append('core_refined') + if accessory: + fit_type_list.append('accessory_refined') + + for fit_type in fit_type_list: + # Define file name extension + file_extension_string = '' + if fit_type != 'default': + file_extension_string = '_' + fit_type + # Find distances vs ref seqs + rNames = [] + ref_file_name = os.path.join(model_prefix, + os.path.basename(model_prefix) + file_extension_string + ".refs") + use_ref_graph = \ + os.path.isfile(ref_file_name) and not update_db and model.type != 'lineage' + if use_ref_graph: + with open(ref_file_name) as refFile: + for reference in refFile: + rNames.append(reference.rstrip()) else: - rNames = getSeqsInDb(ref_db + "/" + os.path.basename(ref_db) + ".h5") - # construct database - if (web and json_sketch): - qNames = sketch_to_hdf5(json_sketch, output) - else: - # construct database - createDatabaseDir(output, kmers) - qNames = constructDatabase(q_files, - kmers, - sketch_sizes, - output, - threads, - overwrite, - codon_phased = codon_phased, - calc_random = False, - use_gpu = gpu_sketch, - deviceid = deviceid) - # run query - qrDistMat = queryDatabase(rNames = rNames, - qNames = qNames, - dbPrefix = ref_db, - queryPrefix = output, - klist = kmers, - self = False, - number_plot_fits = plot_fit, - threads = threads, - use_gpu = gpu_dist) - # QC distance matrix - if qc_dict['run_qc']: - seq_names_passing = qcDistMat(qrDistMat, rNames, qNames, ref_db, output, qc_dict)[0] - else: - seq_names_passing = rNames + qNames - - # Load the network based on supplied options - genomeNetwork, old_cluster_file = \ - fetchNetwork(prev_clustering, - model, - rNames, - ref_graph = use_ref_graph, - core_only = core_only, - accessory_only = accessory_only, - use_gpu = 
gpu_graph) - - if model.type == 'lineage': - # Assign lineages by calculating query-query information - addRandom(output, qNames, kmers, strand_preserved, overwrite, threads) - qqDistMat = queryDatabase(rNames = qNames, - qNames = qNames, - dbPrefix = output, - queryPrefix = output, - klist = kmers, - self = True, - number_plot_fits = 0, - threads = threads, - use_gpu = gpu_dist) - model.extend(qqDistMat, qrDistMat) - - genomeNetwork = {} - isolateClustering = defaultdict(dict) - for rank in model.ranks: - assignment = model.assign(rank) - # Overwrite the network loaded above - if graph_weights: - weights = model.edge_weights(rank) + if os.path.isfile(distances + ".pkl"): + rNames = readPickle(distances, enforce_self = True, distances=False)[0] + elif update_db: + sys.stderr.write("Reference distances missing, cannot use --update-db\n") + sys.exit(1) else: - weights = None - genomeNetwork[rank] = construct_network_from_edge_list(rNames + qNames, - rNames + qNames, - edge_list = assignment, - weights = weights, - use_gpu = gpu_graph) - - isolateClustering[rank] = \ - printClusters(genomeNetwork[rank], - rNames + qNames, - printCSV = False, - use_gpu = gpu_graph) - - overall_lineage = createOverallLineage(model.ranks, isolateClustering) - writeClusterCsv( - output + "/" + os.path.basename(output) + '_lineages.csv', - rNames + qNames, - rNames + qNames, - overall_lineage, - output_format = 'phandango', - epiCsv = None, - queryNames = qNames, - suffix = '_Lineage') - - else: - # Assign these distances as within or between strain - queryAssignments = model.assign(qrDistMat) - - # Assign clustering by adding to network - if graph_weights: - weights = qrDistMat - else: - weights = None - - genomeNetwork, qqDistMat = \ - addQueryToNetwork(dbFuncs, rNames, qNames, - genomeNetwork, kmers, - queryAssignments, model, output, update_db, - strand_preserved, - weights = weights, threads = threads, use_gpu = gpu_graph) - - isolateClustering = \ - {'combined': printClusters(genomeNetwork, rNames + qNames, - output + "/" + os.path.basename(output), - old_cluster_file, - external_clustering, - write_references or update_db, - use_gpu = gpu_graph)} - - # Update DB as requested - dists_out = output + "/" + os.path.basename(output) + ".dists" - if update_db: - # Check new sequences pass QC before adding them - if len(set(seq_names_passing).difference(rNames + qNames)) > 0: - sys.stderr.write("Queries contained outlier distances, " - "not updating database\n") + rNames = getSeqsInDb(os.path.join(ref_db, os.path.basename(ref_db) + ".h5")) + # construct database - use a single database directory for all query outputs + if (web and json_sketch is not None): + qNames = sketch_to_hdf5(json_sketch, output) + elif (fit_type == 'default'): + # construct database + createDatabaseDir(output, kmers) + qNames = constructDatabase(q_files, + kmers, + sketch_sizes, + output, + threads, + overwrite, + codon_phased = codon_phased, + calc_random = False, + use_gpu = gpu_sketch, + deviceid = deviceid) + if (fit_type == 'default' or (fit_type != 'default' and use_ref_graph)): + # run query + qrDistMat = queryDatabase(rNames = rNames, + qNames = qNames, + dbPrefix = ref_db, + queryPrefix = output, + klist = kmers, + self = False, + number_plot_fits = plot_fit, + threads = threads, + use_gpu = gpu_dist) + + # QC distance matrix + if qc_dict['run_qc']: + seq_names_passing = qcDistMat(qrDistMat, rNames, qNames, ref_db, output, qc_dict)[0] else: - sys.stderr.write("Updating reference database to " + output + "\n") + seq_names_passing = 
rNames + qNames + + # Load the network based on supplied options + genomeNetwork, old_cluster_file = \ + fetchNetwork(prev_clustering, + model, + rNames, + ref_graph = use_ref_graph, + core_only = (fit_type == 'core_refined'), + accessory_only = (fit_type == 'accessory_refined'), + use_gpu = gpu_graph) + + if max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) != (len(rNames) - 1): + sys.stderr.write("There are " + str(max(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) + 1) + \ + " vertices in the network but " + str(len(rNames)) + " reference names supplied; " + \ + "please check the '--model-dir' variable is pointing to the correct directory\n") - # Update the network + ref list (everything) - joinDBs(ref_db, output, output, - {"threads": threads, "strand_preserved": strand_preserved}) if model.type == 'lineage': - save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) - # Save sparse distance matrices and updated model - model.outPrefix = os.path.basename(output) - model.save() + # Assign lineages by calculating query-query information + addRandom(output, qNames, kmers, strand_preserved, overwrite, threads) + qqDistMat = queryDatabase(rNames = qNames, + qNames = qNames, + dbPrefix = output, + queryPrefix = output, + klist = kmers, + self = True, + number_plot_fits = 0, + threads = threads, + use_gpu = gpu_dist) + model.extend(qqDistMat, qrDistMat) + + genomeNetwork = {} + isolateClustering = defaultdict(dict) + for rank in model.ranks: + assignment = model.assign(rank) + # Overwrite the network loaded above + if graph_weights: + weights = model.edge_weights(rank) + else: + weights = None + genomeNetwork[rank] = construct_network_from_edge_list(rNames + qNames, + rNames + qNames, + edge_list = assignment, + weights = weights, + use_gpu = gpu_graph, + summarise = False) + + isolateClustering[rank] = \ + printClusters(genomeNetwork[rank], + rNames + qNames, + printCSV = False, + use_gpu = gpu_graph) + + overall_lineage = createOverallLineage(model.ranks, isolateClustering) + writeClusterCsv( + output + "/" + os.path.basename(output) + '_lineages.csv', + rNames + qNames, + rNames + qNames, + overall_lineage, + output_format = 'phandango', + epiCsv = None, + queryNames = qNames, + suffix = '_Lineage') + else: - save_network(genomeNetwork, prefix = output, suffix = '_graph', use_gpu = gpu_graph) - - # Load the previous distances - refList_loaded, refList_copy, self, rrDistMat = \ - readPickle(distances, - enforce_self = True) - # This should now always be true, otherwise both qrDistMat and sparse matrix - # may need reordering - assert(refList_loaded == rNames) - - combined_seq, core_distMat, acc_distMat = \ - update_distance_matrices(rNames, rrDistMat, - qNames, qrDistMat, - qqDistMat, threads = threads) - assert combined_seq == rNames + qNames - - # Get full distance matrix and save - complete_distMat = \ - np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), - pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) - storePickle(combined_seq, combined_seq, True, complete_distMat, dists_out) - - # Copy model if needed - if output != model.outPrefix: - model.copy(output) - - # Clique pruning - if model.type != 'lineage': - dbOrder = rNames + qNames - newRepresentativesIndices, newRepresentativesNames, \ - newRepresentativesFile, genomeNetwork = \ - extractReferences(genomeNetwork, - dbOrder, - output, - existingRefs = rNames, - type_isolate = qc_dict['type_isolate'], - threads = threads, - use_gpu = 
gpu_graph) - # intersection that maintains order - newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] - - # could also have newRepresentativesNames in this diff (should be the same) - but want - # to ensure consistency with the network in case of bad input/bugs - nodes_to_remove = set(range(len(dbOrder))).difference(newRepresentativesIndices) - names_to_remove = [dbOrder[n] for n in nodes_to_remove] - - if (len(names_to_remove) > 0): - # This function also writes out the new ref distance matrix - postpruning_combined_seq, newDistMat = \ - prune_distance_matrix(combined_seq, names_to_remove, complete_distMat, - output + "/" + os.path.basename(output) + ".refs.dists") - save_network(genomeNetwork, prefix = output, suffix = 'refs_graph', use_gpu = gpu_graph) - removeFromDB(output, output, names_to_remove) - os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", - output + "/" + os.path.basename(output) + ".refs.h5") - - # ensure sketch and distMat order match - assert postpruning_combined_seq == rNames + newQueries - else: - storePickle(rNames, qNames, False, qrDistMat, dists_out) - if save_partial_query_graph: + # Assign these distances as within or between strain + if fit_type == 'default': + queryAssignments = model.assign(qrDistMat) + dist_type = 'euclidean' + elif fit_type == 'core_refined': + queryAssignments = model.assign(qrDistMat, slope = 0) + dist_type = 'core' + elif fit_type == 'accessory_refined': + queryAssignments = model.assign(qrDistMat, slope = 1) + dist_type = 'accessory' + + # Assign clustering by adding to network + if graph_weights: + weights = qrDistMat + else: + weights = None + + genomeNetwork, qqDistMat = \ + addQueryToNetwork(dbFuncs, + rNames, + qNames, + genomeNetwork, + kmers, + queryAssignments, + model, + output, + distances = distances, + distance_type = dist_type, + queryQuery = (update_db and + (fit_type == 'default' or + (fit_type != 'default' and use_ref_graph) + ) + ), + strand_preserved = strand_preserved, + weights = weights, + threads = threads, + use_gpu = gpu_graph) + + output_fn = os.path.join(output, os.path.basename(output) + file_extension_string) + isolateClustering = \ + {'combined': printClusters(genomeNetwork, + rNames + qNames, + output_fn, + old_cluster_file, + external_clustering, + write_references or update_db, + use_gpu = gpu_graph)} + # Update DB as requested + dists_out = output + "/" + os.path.basename(output) + ".dists" + if update_db: + # Check new sequences pass QC before adding them + if len(set(seq_names_passing).difference(rNames + qNames)) > 0: + sys.stderr.write("Queries contained outlier distances, " + "not updating database\n") + else: + sys.stderr.write("Updating reference database to " + output + "\n") + # Update the network + ref list (everything) - no need to duplicate for core/accessory + if fit_type == 'default': + joinDBs(ref_db, output, output, + {"threads": threads, "strand_preserved": strand_preserved}) if model.type == 'lineage': - save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) + save_network(genomeNetwork[min(model.ranks)], + prefix = output, + suffix = '_graph', + use_gpu = gpu_graph) + # Save sparse distance matrices and updated model + model.outPrefix = os.path.basename(output) + model.save() else: - save_network(genomeNetwork, prefix = output, suffix = '_graph', use_gpu = gpu_graph) + graph_suffix = file_extension_string + '_graph' + save_network(genomeNetwork, + prefix = output, + suffix = graph_suffix, + use_gpu = 
gpu_graph) + # Load the previous distances + refList_loaded, refList_copy, self, rrDistMat = \ + readPickle(distances, + enforce_self = True) + # This should now always be true, otherwise both qrDistMat and sparse matrix + # may need reordering + assert(refList_loaded == rNames) + combined_seq, core_distMat, acc_distMat = \ + update_distance_matrices(rNames, rrDistMat, + qNames, qrDistMat, + qqDistMat, threads = threads) + assert combined_seq == rNames + qNames + + # Get full distance matrix and save + complete_distMat = \ + np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), + pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) + storePickle(combined_seq, combined_seq, True, complete_distMat, dists_out) + + # Copy model if needed + if output != model.outPrefix and fit_type == 'default': + model.copy(output) + + # Clique pruning + if model.type != 'lineage': + + existing_ref_list = [] + with open(ref_file_name) as refFile: + for reference in refFile: + existing_ref_list.append(reference.rstrip()) + + # Extract references from graph + newRepresentativesIndices, newRepresentativesNames, \ + newRepresentativesFile, genomeNetwork = \ + extractReferences(genomeNetwork, + combined_seq, + output, + outSuffix = file_extension_string, + existingRefs = existing_ref_list, + type_isolate = qc_dict['type_isolate'], + threads = threads, + use_gpu = gpu_graph) + + # intersection that maintains order + newQueries = [x for x in qNames if x in frozenset(newRepresentativesNames)] + + # could also have newRepresentativesNames in this diff (should be the same) - but want + # to ensure consistency with the network in case of bad input/bugs + nodes_to_remove = set(range(len(combined_seq))).difference(newRepresentativesIndices) + names_to_remove = [combined_seq[n] for n in nodes_to_remove] + + if (len(names_to_remove) > 0): + # This function also writes out the new ref distance matrix + dists_suffix = file_extension_string + '.refs.dists' + postpruning_combined_seq, newDistMat = \ + prune_distance_matrix(combined_seq, names_to_remove, complete_distMat, + output + "/" + os.path.basename(output) + dists_suffix) + graph_suffix = file_extension_string + '_refs_graph' + save_network(genomeNetwork, + prefix = output, + suffix = graph_suffix, + use_gpu = gpu_graph) + removeFromDB(output, output, names_to_remove) + db_suffix = file_extension_string + '.refs.h5' + os.rename(output + "/" + os.path.basename(output) + ".tmp.h5", + output + "/" + os.path.basename(output) + db_suffix) + + # Check that the updated set of references includes all old references, and references added from + # queries; there may be further new references, even from the original database, where paths are + # added between reference isolates in the same component, or new cliques formed + added_references = set(existing_ref_list).union(set(newQueries)) + assert set(postpruning_combined_seq).issuperset(added_references), "Error identifying references" + else: + storePickle(rNames, qNames, False, qrDistMat, dists_out) + if save_partial_query_graph: + if model.type == 'lineage': + save_network(genomeNetwork[min(model.ranks)], prefix = output, suffix = '_graph', use_gpu = gpu_graph) + else: + graph_suffix = file_extension_string + '_graph' + save_network(genomeNetwork, prefix = output, suffix = graph_suffix, use_gpu = gpu_graph) return(isolateClustering) @@ -405,10 +476,10 @@ def get_options(): queryingGroup.add_argument('--previous-clustering', help='Directory containing previous cluster definitions ' 'and network 
[default = use that in the directory ' 'containing the model]', type = str) - queryingGroup.add_argument('--core-only', help='(with a \'refine\' model) ' + queryingGroup.add_argument('--core', help='(with a \'refine\' model) ' 'Use a core-distance only model for assigning queries ' '[default = False]', default=False, action='store_true') - queryingGroup.add_argument('--accessory-only', help='(with a \'refine\' or \'lineage\' model) ' + queryingGroup.add_argument('--accessory', help='(with a \'refine\' or \'lineage\' model) ' 'Use an accessory-distance only model for assigning queries ' '[default = False]', default=False, action='store_true') @@ -533,8 +604,8 @@ def main(): args.strand_preserved, args.previous_clustering, args.external_clustering, - args.core_only, - args.accessory_only, + args.core, + args.accessory, args.gpu_sketch, args.gpu_dist, args.gpu_graph, diff --git a/PopPUNK/info.py b/PopPUNK/info.py new file mode 100644 index 00000000..8a084d87 --- /dev/null +++ b/PopPUNK/info.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +# vim: set fileencoding= : +# Copyright 2018-2020 John Lees and Nick Croucher + +# universal +import os +import sys +import pickle +# additional +import h5py +import argparse +import numpy as np +import pandas as pd +from scipy import sparse +import graph_tool.all as gt + +# Load GPU libraries +try: + import cupyx + import cugraph + import cudf + import cupy + from numba import cuda + import rmm + gpu_lib = True +except ImportError as e: + gpu_lib = False + +# command line parsing +def get_options(): + + parser = argparse.ArgumentParser(description='Get information about a PopPUNK database', + prog='poppunk_db_info') + + # input options + parser.add_argument('--db', + required = True, + help='PopPUNK database directory') + parser.add_argument('--network', + required = False, + default = None, + help='Network or lineage fit file for analysis') + parser.add_argument('--threads', + default = 1, + help='Number of cores to use in analysis') + parser.add_argument('--use-gpu', + default = False, + action = 'store_true', + help='Whether GPU libraries should be used in analysis') + parser.add_argument('--output', + required = True, + help='Prefix for output files') + parser.add_argument('--simple', + default = False, + action = 'store_true', + help='Do not print per sample information') + + return parser.parse_args() + +# main code +def main(): + + # Import functions + from .network import add_self_loop + from .network import load_network_file + from .network import sparse_mat_to_network + from .utils import check_and_set_gpu + from .utils import setGtThreads + + # Check input ok + args = get_options() + + # Check whether GPU libraries can be loaded + use_gpu = check_and_set_gpu(args.use_gpu, gpu_lib, quit_on_fail = False) + + # Set threads for graph-tool + setGtThreads(args.threads) + + # Open and process sequence database + h5_fn = os.path.join(args.db, os.path.basename(args.db) + '.h5') + ref_db = h5py.File(h5_fn, 'r') + + # Print overall database information + print("PopPUNK database:\t\t" + args.db) + + sketch_version = ref_db['sketches'].attrs['sketch_version'] + print("Sketch version:\t\t\t" + sketch_version) + + num_samples = len(ref_db['sketches'].keys()) + print("Number of samples:\t\t" + str(num_samples)) + + first_sample = list(ref_db['sketches'].keys())[0] + kmer_size = ref_db['sketches/' + first_sample].attrs['kmers'] + print("K-mer sizes:\t\t\t" + ",".join([str(x) for x in kmer_size])) + + sketch_size = int(ref_db['sketches/' + 
first_sample].attrs['sketchsize64']) * 64 + print("Sketch size:\t\t\t" + str(sketch_size)) + + if 'random' in ref_db.keys(): + has_random = True + else: + has_random = False + print("Contains random matches:\t" + str(has_random)) + + try: + codon_phased = ref_db['sketches'].attrs['codon_phased'] == 1 + except KeyError: + codon_phased = False + print("Codon phased seeds:\t\t" + str(codon_phased)) + + if 'use_rc' in ref_db.keys(): + use_rc = ref_db['sketches'].attrs['use_rc'] == 1 + print("Uses canonical k-mers:\t" + str(use_rc)) + + # Stop if requested + if args.simple: + sys.exit(0) + + # Print sample information + sample_names = list(ref_db['sketches'].keys()) + sample_sequence_length = {} + sample_missing_bases = {} + sample_base_frequencies = {name: [] for name in sample_names} + + for sample_name in sample_names: + sample_base_frequencies[sample_name] = ref_db['sketches/' + sample_name].attrs['base_freq'] + sample_sequence_length[sample_name] = ref_db['sketches/' + sample_name].attrs['length'] + sample_missing_bases[sample_name] = ref_db['sketches/' + sample_name].attrs['missing_bases'] + + # Select network file name + network_fn = args.network + if network_fn is None: + if use_gpu: + network_fn = os.path.join(args.db, os.path.basename(args.db) + '_graph.csv.gz') + else: + network_fn = os.path.join(args.db, os.path.basename(args.db) + '_graph.gt') + + # Open network file + if network_fn.endswith('.gt'): + G = load_network_file(network_fn, use_gpu = False) + elif network_fn.endswith('.csv.gz'): + if use_gpu: + G = load_network_file(network_fn, use_gpu = True) + else: + sys.stderr.write('Unable to load necessary GPU libraries\n') + exit(1) + elif network_fn.endswith('.npz'): + sparse_mat = sparse.load_npz(network_fn) + G = sparse_mat_to_network(sparse_mat, sample_names, use_gpu = use_gpu) + else: + sys.stderr.write('Unrecognised suffix: expected ".gt", ".csv.gz" or ".npz"\n') + exit(1) + + # Analyse network + if use_gpu: + component_assignments_df = cugraph.components.connectivity.connected_components(G) + component_counts_df = component_assignments_df.groupby('labels')['vertex'].count() + component_counts_df.name = 'component_count' + component_information_df = component_assignments_df.merge(component_counts_df, on = ['labels'], how = 'left') + outdegree_df = G.out_degree() + graph_properties_df = component_information_df.merge(outdegree_df, on = ['vertex']) + else: + graph_properties_df = pd.DataFrame() + graph_properties_df['vertex'] = np.arange(len(sample_names)) + graph_properties_df['labels'] = gt.label_components(G)[0].a + graph_properties_df['degree'] = G.get_out_degrees(G.get_vertices()) + graph_properties_df['component_count'] = graph_properties_df.groupby('labels')['vertex'].transform('count') + graph_properties_df = graph_properties_df.sort_values('vertex', axis = 0) # inplace not implemented for cudf + graph_properties_df['vertex'] = sample_names + + # Merge data and print output + with open(args.output,'w') as out_file: + out_file.write( + 'Sample,Length,Missing_bases,Frequency_A,Frequency_C,Frequency_G,Frequency_T,Component_label,Component_size,Node_degree\n' + ) + for i,sample_name in enumerate(sample_names): + out_file.write(sample_name + ',' + str(sample_sequence_length[sample_name]) + ',' + str(sample_missing_bases[sample_name]) + ',') + for frequency in sample_base_frequencies[sample_name]: + out_file.write(str(frequency) + ',') + graph_properties_row = graph_properties_df.loc[graph_properties_df['vertex']==sample_name,:] + 
out_file.write(str(graph_properties_row['labels'].values[0]) + ',') + out_file.write(str(graph_properties_row['component_count'].values[0]) + ',') + out_file.write(str(graph_properties_row['degree'].values[0])) + out_file.write("\n") + + sys.exit(0) + +if __name__ == '__main__': + main() + + sys.exit(0) diff --git a/PopPUNK/models.py b/PopPUNK/models.py index 97eae8bc..40e32260 100644 --- a/PopPUNK/models.py +++ b/PopPUNK/models.py @@ -35,11 +35,12 @@ # Load GPU libraries try: - import cupyx.scipy.sparse + import cupyx import cugraph import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False @@ -116,7 +117,7 @@ def loadClusterFit(pkl_file, npz_file, outPrefix = "", max_samples = 100000, rank_file = os.path.dirname(pkl_file) + "/" + \ prefix.group(1) + rankFile(rank) if use_gpu: - fit_data[rank] = cupyx.scipy.sparse.load_npz(rank_file) + fit_data[rank] = scipy.sparse.load_npz(rank_file) else: fit_data[rank] = scipy.sparse.load_npz(rank_file) else: @@ -829,7 +830,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi print(e) sys.stderr.write("Could not separately refine core and accessory boundaries. " "Using joint 2D refinement only.\n") - + self.indiv_fitted = True y = self.assign(X) return y @@ -950,11 +951,8 @@ def assign(self, X, slope=None): Core and accessory distances slope (int) Override self.slope. Default - use self.slope - Set to 0 for a vertical line, 1 for a horizontal line, or 2 to use a slope - cpus (int) - Number of threads to use Returns: y (numpy.array) Cluster assignments by samples @@ -962,11 +960,13 @@ def assign(self, X, slope=None): if not self.fitted: raise RuntimeError("Trying to assign using an unfitted model") else: - if slope == 2 or (slope == None and self.slope == 2): + if slope == None: + slope = self.slope + if slope == 2: y = poppunk_refine.assignThreshold(X/self.scale, 2, self.optimal_x, self.optimal_y, self.threads) - elif slope == 0 or (slope == None and self.slope == 0): + elif slope == 0: y = poppunk_refine.assignThreshold(X/self.scale, 0, self.core_boundary, 0, self.threads) - elif slope == 1 or (slope == None and self.slope == 1): + elif slope == 1: y = poppunk_refine.assignThreshold(X/self.scale, 1, 0, self.accessory_boundary, self.threads) return y @@ -1014,6 +1014,9 @@ def fit(self, X, accessory): y (numpy.array) Cluster assignments of samples in X ''' + # Check if model requires GPU + check_and_set_gpu(self.use_gpu, gpu_lib, quit_on_fail = True) + ClusterFit.fit(self, X) sample_size = int(round(0.5 * (1 + np.sqrt(1 + 8 * X.shape[0])))) if (max(self.ranks) >= sample_size): @@ -1033,12 +1036,15 @@ def fit(self, X, accessory): 0, rank ) - data = [epsilon if d < epsilon else d for d in data] if self.use_gpu: - self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix((cp.array(data),(cp.array(row),cp.array(col))), + data = cp.array(data) + data[data < epsilon] = epsilon + self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix((data,(cp.array(row),cp.array(col))), shape=(sample_size, sample_size), dtype = X.dtype) else: + data = np.array(data) + data[data < epsilon] = epsilon self.nn_dists[rank] = scipy.sparse.coo_matrix((data, (row, col)), shape=(sample_size, sample_size), dtype = X.dtype) @@ -1054,16 +1060,10 @@ def save(self): raise RuntimeError("Trying to save unfitted model") else: for rank in self.ranks: - if self.use_gpu: - cupyx.scipy.sparse.save_npz( - self.outPrefix + "/" + os.path.basename(self.outPrefix) + \ - rankFile(rank), - self.nn_dists[rank]) - else: 
- scipy.sparse.save_npz( - self.outPrefix + "/" + os.path.basename(self.outPrefix) + \ - rankFile(rank), - self.nn_dists[rank]) + scipy.sparse.save_npz( + self.outPrefix + "/" + os.path.basename(self.outPrefix) + \ + rankFile(rank), + self.nn_dists[rank]) with open(self.outPrefix + "/" + os.path.basename(self.outPrefix) + \ '_fit.pkl', 'wb') as pickle_file: pickle.dump([[self.ranks, self.dist_col], self.type], pickle_file) @@ -1141,6 +1141,26 @@ def edge_weights(self, rank): return (self.nn_dists[rank].data) def extend(self, qqDists, qrDists): + '''Update the sparse distance matrix of nearest neighbours after querying + + Args: + qqDists (numpy or cupy ndarray) + Two column array of query-query distances + qqDists (numpy or cupy ndarray) + Two column array of reference-query distances + Returns: + y (list of tuples) + Edges to include in network + ''' + + # Check if model requires GPU + check_and_set_gpu(self.use_gpu, gpu_lib, quit_on_fail = True) + + # Convert data structures if using GPU + if self.use_gpu: + qqDists = cp.array(qqDists) + qrDists = cp.array(qrDists) + # Reshape qq and qr dist matrices qqSquare = pp_sketchlib.longToSquare(qqDists[:, [self.dist_col]], self.threads) qqSquare[qqSquare < epsilon] = epsilon @@ -1175,7 +1195,7 @@ def extend(self, qqDists, qrDists): dist_row, dist_col, dist = cupyx.scipy.sparse.find(sample_row) else: dist_row, dist_col, dist = scipy.sparse.find(sample_row) - dist = [epsilon if d < epsilon else d for d in dist] + dist[dist < epsilon] = epsilon dist_idx_sort = np.argsort(dist) # Identical to C++ code in matrix_ops.cpp:sparsify_dists diff --git a/PopPUNK/network.py b/PopPUNK/network.py index ec2b5d53..c5a1e7e7 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -31,6 +31,7 @@ import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False @@ -95,12 +96,16 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, graph_suffix = '.gt' if core_only and model.type == 'refine': - model.slope = 0 - network_file = dir_prefix + '_core_graph' + graph_suffix + if ref_graph: + network_file = dir_prefix + '_core.refs_graph' + graph_suffix + else: + network_file = dir_prefix + '_core_graph' + graph_suffix cluster_file = dir_prefix + '_core_clusters.csv' elif accessory_only and model.type == 'refine': - model.slope = 1 - network_file = dir_prefix + '_accessory_graph' + graph_suffix + if ref_graph: + network_file = dir_prefix + '_accessory.refs_graph' + graph_suffix + else: + network_file = dir_prefix + '_accessory_graph' + graph_suffix cluster_file = dir_prefix + '_accessory_clusters.csv' else: if ref_graph and os.path.isfile(dir_prefix + '.refs_graph' + graph_suffix): @@ -109,10 +114,11 @@ def fetchNetwork(network_dir, model, refList, ref_graph = False, network_file = dir_prefix + '_graph' + graph_suffix cluster_file = dir_prefix + '_clusters.csv' if core_only or accessory_only: - sys.stderr.write("Can only do --core-only or --accessory-only fits from " + sys.stderr.write("Can only do --core or --accessory fits from " "a refined fit. 
Using the combined distances.\n") # Load network file + sys.stderr.write("Loading network from " + network_file + "\n") genomeNetwork = load_network_file(network_file, use_gpu = use_gpu) # Ensure all in dists are in final network @@ -139,12 +145,13 @@ def load_network_file(fn, use_gpu = False): # Load the network from the specified file if use_gpu: G_df = cudf.read_csv(fn, compression = 'gzip') + if 'src' in G_df.columns: + G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) genomeNetwork = cugraph.Graph() if 'weights' in G_df.columns: - G_df.columns = ['source','destination','weights'] + G_df = G_df[['source','destination','weights']] genomeNetwork.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: - G_df.columns = ['source','destination'] genomeNetwork.from_cudf_edgelist(G_df,renumber=False) sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") else: @@ -214,7 +221,27 @@ def cliquePrune(component, graph, reference_indices, components_list): ref_list = getCliqueRefs(subgraph, refs) return(list(ref_list)) -def extractReferences(G, dbOrder, outPrefix, type_isolate = None, +def translate_network_indices(G_ref_df, reference_indices): + """Function for ensuring an updated reference network retains + numbering consistent with sample names + + Args: + G_ref_df (cudf data frame) + List of edges in reference network + reference_indices (list) + The ordered list of reference indices in the original network + + Returns: + G_ref (cugraph network) + Network of reference sequences + """ + # Translate network indices to match name order + G_ref_df['source'] = [reference_indices.index(x) for x in G_ref_df['old_source'].to_arrow().to_pylist()] + G_ref_df['destination'] = [reference_indices.index(x) for x in G_ref_df['old_destination'].to_arrow().to_pylist()] + G_ref = add_self_loop(G_ref_df, len(reference_indices) - 1, renumber = True) + return(G_ref) + +def extractReferences(G, dbOrder, outPrefix, outSuffix = '', type_isolate = None, existingRefs = None, threads = 1, use_gpu = False): """Extract references for each cluster based on cliques @@ -226,7 +253,9 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, dbOrder (list) The order of files in the sketches, so returned references are in the same order outPrefix (str) - Prefix for output file (.refs will be appended) + Prefix for output file + outSuffix (str) + Suffix for output file (.refs will be appended) type_isolate (str) Isolate to be included in set of references existingRefs (list) @@ -247,7 +276,6 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, references = set(existingRefs) index_lookup = {v:k for k,v in enumerate(dbOrder)} reference_indices = set([index_lookup[r] for r in references]) - # Add type isolate, if necessary type_isolate_index = None if type_isolate is not None: @@ -275,16 +303,16 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # Order found references as in sketchlib database reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] - refFileName = writeReferences(reference_names, outPrefix) # Extract reference edges G_df = G.view_edge_list() if 'src' in G_df.columns: - G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) - G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - # Add self-loop if needed - max_in_vertex_labels = max(reference_indices) - G_ref = add_self_loop(G_ref_df,max_in_vertex_labels, renumber = 
False) + G_df.rename(columns={'src': 'old_source','dst': 'old_destination'}, inplace=True) + else: + G_df.rename(columns={'source': 'old_source','destination': 'old_destination'}, inplace=True) + G_ref_df = G_df[G_df['old_source'].isin(reference_indices) & G_df['old_destination'].isin(reference_indices)] + # Translate network indices to match name order + G_ref = translate_network_indices(G_ref_df, reference_indices) # Check references in same component in overall graph are connected in the reference graph # First get components of original reference graph @@ -325,8 +353,8 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # Add expanded reference set to the overall list reference_indices = list(reference_index_set) # Create new reference graph - G_ref_df = G_df[G_df['source'].isin(reference_indices) & G_df['destination'].isin(reference_indices)] - G_ref = add_self_loop(G_ref_df, max_in_vertex_labels, renumber = False) + G_ref_df = G_df[G_df['old_source'].isin(reference_indices) & G_df['old_destination'].isin(reference_indices)] + G_ref = translate_network_indices(G_ref_df, reference_indices) else: # Each component is independent, so can be multithreaded @@ -337,12 +365,14 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, gt.openmp_set_num_threads(1) # Cliques are pruned, taking one reference from each, until none remain + sys.setrecursionlimit = 5000 with Pool(processes=threads) as pool: ref_lists = pool.map(partial(cliquePrune, graph=G, reference_indices=reference_indices, components_list=components), set(components)) + sys.setrecursionlimit = 1000 # Returns nested lists, which need to be flattened reference_indices = set([entry for sublist in ref_lists for entry in sublist]) @@ -408,40 +438,49 @@ def extractReferences(G, dbOrder, outPrefix, type_isolate = None, # Order found references as in sketch files reference_names = [dbOrder[int(x)] for x in sorted(reference_indices)] - refFileName = writeReferences(reference_names, outPrefix) + refFileName = writeReferences(reference_names, outPrefix, outSuffix = outSuffix) return reference_indices, reference_names, refFileName, G_ref -def writeReferences(refList, outPrefix): +def writeReferences(refList, outPrefix, outSuffix = ""): """Writes chosen references to file Args: refList (list) Reference names to write outPrefix (str) - Prefix for output file (.refs will be appended) + Prefix for output file + outSuffix (str) + Suffix for output file (.refs will be appended) Returns: refFileName (str) The name of the file references were written to """ # write references to file - refFileName = outPrefix + "/" + os.path.basename(outPrefix) + ".refs" + refFileName = outPrefix + "/" + os.path.basename(outPrefix) + outSuffix + ".refs" with open(refFileName, 'w') as rFile: for ref in refList: rFile.write(ref + '\n') - return refFileName -def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, - use_gpu = False): +def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, + old_ids = None, previous_pkl = None, weights = False, + use_gpu = False): """Load previous network, extract the edges to match the vertex order specified in rlist, and also return weights if specified. Args: - prev_G_fn (str) - Path of file containing existing network. 
+ prev_G_fn (str or graph object) + Path of file containing existing network, or already-loaded + graph object + adding_qq_dists (bool) + Boolean specifying whether query-query edges are being added + to an existing network, such that not all the sequence IDs will + be found in the old IDs, which should already be correctly ordered rlist (list) List of reference sequence labels in new network + old_ids (list) + List of IDs of vertices in existing network previous_pkl (str) Path of pkl file containing names of sequences in previous network @@ -459,10 +498,14 @@ def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, edge_weights (list) Weights for each new edge """ - # get list for translating node IDs to rlist - prev_G = load_network_file(prev_G_fn, use_gpu = use_gpu) + # Load graph from file if passed string; else use graph object passed in + # as argument + if isinstance(prev_G_fn, str): + prev_G = load_network_file(prev_G_fn, use_gpu = use_gpu) + else: + prev_G = prev_G_fn - # load list of names in previous network + # load list of names in previous network if pkl name supplied if previous_pkl is not None: with open(previous_pkl, 'rb') as pickle_file: old_rlist, old_qlist, self = pickle.load(pickle_file) @@ -470,7 +513,7 @@ def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, old_ids = old_rlist else: old_ids = old_rlist + old_qlist - else: + elif old_ids is None: sys.stderr.write('Missing .pkl file containing names of sequences in ' 'previous network\n') sys.exit(1) @@ -479,25 +522,40 @@ def network_to_edges(prev_G_fn, rlist, previous_pkl = None, weights = False, if use_gpu: G_df = prev_G.view_edge_list() if weights: - G_df.columns = ['source','destination','weight'] - edge_weights = G_df['weight'].to_arrow().to_pylist() - else: - G_df.columns = ['source','destination'] - old_source_ids = G_df['source'].to_arrow().to_pylist() - old_target_ids = G_df['destination'].to_arrow().to_pylist() + if len(G_df.columns) < 3: + sys.stderr.write('Loaded network does not have edge weights; try a different ' + 'network or turn off graph weights\n') + exit(1) + if 'src' in G_df.columns: + G_df.rename(columns={'source': 'src','destination': 'dst'}, inplace=True) + edge_weights = G_df['weights'].to_arrow().to_pylist() + G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) + old_source_ids = G_df['source'].astype('int32').to_arrow().to_pylist() + old_target_ids = G_df['destination'].astype('int32').to_arrow().to_pylist() else: # get the source and target nodes old_source_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source") old_target_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target") # get the weights if weights: + if prev_G.edge_properties.keys() is None or 'weight' not in prev_G.edge_properties.keys(): + sys.stderr.write('Loaded network does not have edge weights; try a different ' + 'network or turn off graph weights\n') + exit(1) edge_weights = list(prev_G.ep['weight']) - # Update IDs to new versions - old_id_indices = [rlist.index(x) for x in old_ids] - # translate to indices - source_ids = [old_id_indices[x] for x in old_source_ids] - target_ids = [old_id_indices[x] for x in old_target_ids] + # If appending queries to an existing network, then the recovered links can be left + # unchanged, as the new IDs are the queries, and the existing sequences will not be found + # in the list of IDs + if adding_qq_dists: + source_ids = old_source_ids + target_ids = old_target_ids + else: + # Update IDs to 
new versions + old_id_indices = [rlist.index(x) for x in old_ids] + # translate to indices + source_ids = [old_id_indices[x] for x in old_source_ids] + target_ids = [old_id_indices[x] for x in old_target_ids] # return values if weights: @@ -546,11 +604,12 @@ def initial_graph_properties(rlist, qlist): Whether the network is being constructed from all-v-all distances or reference-v-query information """ - self_comparison = True - vertex_labels = rlist - if rlist != qlist: + if rlist == qlist: + self_comparison = True + vertex_labels = rlist + else: self_comparison = False - vertex_labels.append(qlist) + vertex_labels = rlist + qlist return vertex_labels, self_comparison def process_weights(distMat, weights_type): @@ -573,38 +632,29 @@ def process_weights(distMat, weights_type): sys.stderr.write("Unable to calculate distance type " + str(weights_type) + "; " "accepted types are " + str(accepted_weights_types) + "\n") if weights_type == 'euclidean': - processed_weights = np.linalg.norm(distMat, axis = 1) + processed_weights = np.linalg.norm(distMat, axis = 1).tolist() elif weights_type == 'core': - processed_weights = distMat[:, 0] + processed_weights = distMat[:, 0].tolist() elif weights_type == 'accessory': - processed_weights = distMat[:, 1] + processed_weights = distMat[:, 1].tolist() else: sys.stderr.write('Require distance matrix to calculate distances\n') return processed_weights -def process_previous_network(previous_network = None, previous_pkl = None, vertex_labels = None, - weights = False, use_gpu = False): +def process_previous_network(previous_network = None, adding_qq_dists = False, old_ids = None, + previous_pkl = None, vertex_labels = None, weights = False, use_gpu = False): """Extract edge types from an existing network - Will print summary statistics about the network to ``STDERR`` - Args: - rlist (list) - List of reference sequence labels - qlist (list) - List of query sequence labels - G_df (cudf or pandas data frame) - Data frame in which the first two columns are the nodes linked by edges - weights (bool) - Whether weights in the G_df data frame should be included in the network - distMat (2 column ndarray) - Numpy array of pairwise distances - weights_type (str) - Measure to calculate from the distMat to use as edge weights in network - - options are core, accessory or euclidean distance - previous_network (str) + previous_network (str or graph object) Name of file containing a previous network to be integrated into this new - network + network, or already-loaded graph object + adding_qq_dists (bool) + Boolean specifying whether query-query edges are being added + to an existing network, such that not all the sequence IDs will + be found in the old IDs, which should already be correctly ordered + old_ids (list) + Ordered list of vertex names in previous network previous_pkl (str) Name of file containing the names of the sequences in the previous_network ordered based on the original network construction @@ -623,11 +673,13 @@ def process_previous_network(previous_network = None, previous_pkl = None, verte extra_weights (list or None) List of edge weights """ - if previous_pkl is not None: - if weights is not None: + if previous_pkl is not None or old_ids is not None: + if weights: # Extract from network extra_sources, extra_targets, extra_weights = network_to_edges(previous_network, vertex_labels, + adding_qq_dists = adding_qq_dists, + old_ids = old_ids, previous_pkl = previous_pkl, weights = True, use_gpu = use_gpu) @@ -635,6 +687,8 @@ def 
process_previous_network(previous_network = None, previous_pkl = None, verte # Extract from network extra_sources, extra_targets = network_to_edges(previous_network, vertex_labels, + adding_qq_dists = adding_qq_dists, + old_ids = old_ids, previous_pkl = previous_pkl, weights = False, use_gpu = use_gpu) @@ -645,9 +699,18 @@ def process_previous_network(previous_network = None, previous_pkl = None, verte return extra_sources, extra_targets, extra_weights -def construct_network_from_edge_list(rlist, qlist, edge_list, - weights = None, distMat = None, weights_type = None, previous_network = None, previous_pkl = None, - betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): +def construct_network_from_edge_list(rlist, + qlist, + edge_list, + weights = None, + distMat = None, + previous_network = None, + adding_qq_dists = False, + old_ids = None, + previous_pkl = None, + betweenness_sample = betweenness_sample_default, + summarise = True, + use_gpu = False): """Construct an undirected network using a data frame of edges. Nodes are samples and edges where samples are within the same cluster @@ -660,16 +723,19 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, List of query sequence labels G_df (cudf or pandas data frame) Data frame in which the first two columns are the nodes linked by edges - weights (bool) - Whether weights in the G_df data frame should be included in the network + weights (list) + List of edge weights distMat (2 column ndarray) Numpy array of pairwise distances - weights_type (str) - Measure to calculate from the distMat to use as edge weights in network - - options are core, accessory or euclidean distance - previous_network (str) + previous_network (str or graph object) Name of file containing a previous network to be integrated into this new - network + network, or the already-loaded graph object + adding_qq_dists (bool) + Boolean specifying whether query-query edges are being added + to an existing network, such that not all the sequence IDs will + be found in the old IDs, which should already be correctly ordered + old_ids (list) + Ordered list of vertex names in previous network previous_pkl (str) Name of file containing the names of the sequences in the previous_network betweenness_sample (int) @@ -691,48 +757,51 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, # data structures vertex_labels, self_comparison = initial_graph_properties(rlist, qlist) - if weights_type is not None: - weights = process_weights(distMat, weights_type) - - # Load previous network - if previous_network is not None: - extra_sources, extra_targets, extra_weights = process_previous_network(previous_network = previous_network, - previous_pkl = previous_pkl, - vertex_labels = vertex_labels, - weights = (weights is not None), - use_gpu = use_gpu) # Create new network if use_gpu: - # Add extra information from previous network - if previous_network is not None: - for (src, dest) in zip(extra_sources, extra_targets): - edge_list.append((src, dest)) - if weights is not None: - weights.extend(extra_weights) # benchmarking concurs with https://stackoverflow.com/questions/55922162/recommended-cudf-dataframe-construction - edge_array = cp.array(edge_list, dtype = np.int32) - edge_gpu_matrix = cuda.to_device(edge_array) - G_df = cudf.DataFrame(edge_gpu_matrix, columns = ['source','destination']) + if len(edge_list) > 1: + edge_array = cp.array(edge_list, dtype = np.int32) + edge_gpu_matrix = cuda.to_device(edge_array) + G_df = 
cudf.DataFrame(edge_gpu_matrix, columns = ['source','destination']) + else: + # Cannot generate an array when one edge + G_df = cudf.DataFrame(columns = ['source','destination']) + G_df['source'] = [edge_list[0][0]] + G_df['destination'] = [edge_list[0][1]] if weights is not None: G_df['weights'] = weights G = construct_network_from_df(rlist, qlist, G_df, weights = (weights is not None), distMat = distMat, - weights_type = weights_type, + adding_qq_dists = adding_qq_dists, + old_ids = old_ids, previous_network = previous_network, previous_pkl = previous_pkl, summarise = False, use_gpu = use_gpu) else: + # Load previous network + if previous_network is not None: + extra_sources, extra_targets, extra_weights = \ + process_previous_network(previous_network = previous_network, + adding_qq_dists = adding_qq_dists, + old_ids = old_ids, + previous_pkl = previous_pkl, + vertex_labels = vertex_labels, + weights = (weights is not None), + use_gpu = use_gpu) # Construct list of tuples for graph-tool # Include information from previous graph if supplied if weights is not None: + weighted_edges = [] for ((src, dest), weight) in zip(edge_list, weights): - edge_list.append((src, dest, weight)) + weighted_edges.append((src, dest, weight)) if previous_network is not None: - for ((src, dest), weight) in zip(extra_sources, extra_targets, extra_weights): - edge_list.append((src, dest, weight)) + for (src, dest, weight) in zip(extra_sources, extra_targets, extra_weights): + weighted_edges.append((src, dest, weight)) + edge_list = weighted_edges else: if previous_network is not None: for (src, dest) in zip(extra_sources, extra_targets): @@ -748,11 +817,21 @@ def construct_network_from_edge_list(rlist, qlist, edge_list, G.add_edge_list(edge_list) if summarise: print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) + return G -def construct_network_from_df(rlist, qlist, G_df, - weights = False, distMat = None, weights_type = None, previous_network = None, previous_pkl = None, - betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): +def construct_network_from_df(rlist, + qlist, + G_df, + weights = False, + distMat = None, + previous_network = None, + adding_qq_dists = False, + old_ids = None, + previous_pkl = None, + betweenness_sample = betweenness_sample_default, + summarise = True, + use_gpu = False): """Construct an undirected network using a data frame of edges. 
Nodes are samples and edges where samples are within the same cluster @@ -769,12 +848,15 @@ def construct_network_from_df(rlist, qlist, G_df, Whether weights in the G_df data frame should be included in the network distMat (2 column ndarray) Numpy array of pairwise distances - weights_type (str) - Measure to calculate from the distMat to use as edge weights in network - - options are core, accessory or euclidean distance - previous_network (str) + previous_network (str or graph object) Name of file containing a previous network to be integrated into this new - network + network, or the already-loaded graph object + adding_qq_dists (bool) + Boolean specifying whether query-query edges are being added + to an existing network, such that not all the sequence IDs will + be found in the old IDs, which should already be correctly ordered + old_ids (list) + Ordered list of vertex names in previous network previous_pkl (str) Name of file containing the names of the sequences in the previous_network betweenness_sample (int) @@ -796,8 +878,6 @@ def construct_network_from_df(rlist, qlist, G_df, # data structures vertex_labels, self_comparison = initial_graph_properties(rlist, qlist) - if weights_type is not None: - G_df['weights'] = process_weights(distMat, weights_type) # Check df format is correct if weights: @@ -808,6 +888,8 @@ def construct_network_from_df(rlist, qlist, G_df, # Load previous network if previous_network is not None: extra_sources, extra_targets, extra_weights = process_previous_network(previous_network = previous_network, + adding_qq_dists = adding_qq_dists, + old_ids = old_ids, previous_pkl = previous_pkl, vertex_labels = vertex_labels, weights = weights, @@ -826,7 +908,6 @@ def construct_network_from_df(rlist, qlist, G_df, # direct conversion # ensure the highest-integer node is included in the edge list # by adding a self-loop if necessary; see https://github.com/rapidsai/cugraph/issues/1206 - max_in_df = np.amax([G_df['source'].max(),G_df['destination'].max()]) max_in_vertex_labels = len(vertex_labels)-1 use_weights = False if weights: @@ -843,8 +924,8 @@ def construct_network_from_df(rlist, qlist, G_df, G = construct_network_from_edge_list(rlist, qlist, connections, weights = weights, distMat = distMat, - weights_type = weights_type, previous_network = previous_network, + old_ids = old_ids, previous_pkl = previous_pkl, summarise = False, use_gpu = use_gpu) @@ -852,9 +933,15 @@ def construct_network_from_df(rlist, qlist, G_df, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G -def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, - weights = None, weights_type = None, previous_network = None, previous_pkl = None, - betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): +def construct_network_from_sparse_matrix(rlist, + qlist, + sparse_input, + weights = None, + previous_network = None, + previous_pkl = None, + betweenness_sample = betweenness_sample_default, + summarise = True, + use_gpu = False): """Construct an undirected network using a sparse matrix. 
Nodes are samples and edges where samples are within the same cluster @@ -871,9 +958,6 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, List of weights for each edge in the network distMat (2 column ndarray) Numpy array of pairwise distances - weights_type (str) - Measure to calculate from the distMat to use as edge weights in network - - options are core, accessory or euclidean distance previous_network (str) Name of file containing a previous network to be integrated into this new network @@ -905,7 +989,6 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, G_df['weights'] = sparse_input.data G = construct_network_from_df(rlist, qlist, G_df, weights = True, - weights_type = weights_type, previous_network = previous_network, previous_pkl = previous_pkl, betweenness_sample = betweenness_sample, @@ -915,9 +998,74 @@ def construct_network_from_sparse_matrix(rlist, qlist, sparse_input, print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) return G -def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, - weights = None, distMat = None, weights_type = None, previous_network = None, previous_pkl = None, - betweenness_sample = betweenness_sample_default, summarise = True, use_gpu = False): +def construct_dense_weighted_network(rlist, distMat, weights_type = None, use_gpu = False): + """Construct a dense, weighted, undirected network from a list of sequences and + their pairwise distances. Nodes are samples and every pair of samples is joined + by an edge weighted by the chosen distance type + + Args: + rlist (list) + List of reference sequence labels + distMat (2 column ndarray) + Numpy array of pairwise distances + weights_type (str) + Type of weight to use for network + use_gpu (bool) + Whether to use GPUs for network construction + + Returns: + G (graph) + The resulting network + """ + # Check GPU library use + use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) + + # data structures + vertex_labels, self_comparison = initial_graph_properties(rlist, rlist) + + # Distances are needed to calculate the edge weights + if distMat is None: + sys.stderr.write("Need distances to construct weighted network\n") + sys.exit(1) + + # Process weights + weights = process_weights(distMat, weights_type) + + # Convert edge indices to tuples + edge_list = poppunk_refine.generateAllTuples(num_ref = len(rlist), + self = True, + int_offset = 0) + + if use_gpu: + # Construct network with GPU via data frame + G_df = cudf.DataFrame(columns = ['source','destination']) + G_df['source'] = [edge[0] for edge in edge_list] + G_df['destination'] = [edge[1] for edge in edge_list] + G_df['weights'] = weights + max_in_vertex_labels = len(vertex_labels)-1 + G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) + else: + # Construct network with CPU via edge list + weighted_edges = [] + for ((src, dest), weight) in zip(edge_list, weights): + weighted_edges.append((src, dest, weight)) + # build the graph + G = gt.Graph(directed = False) + G.add_vertex(len(vertex_labels)) + eweight = G.new_ep("float") + # Could alternatively assign weights through eweight.a = weights + G.add_edge_list(weighted_edges, eprops = [eweight]) + G.edge_properties["weight"] = eweight + + return G + + +def construct_network_from_assignments(rlist, qlist, assignments, within_label = 1, int_offset = 0, + weights = None, distMat = None, weights_type = 
None, previous_network = None, old_ids = None, + adding_qq_dists = False, previous_pkl = None, betweenness_sample = betweenness_sample_default, + summarise = True, use_gpu = False): """Construct an undirected network using sequence lists, assignments of pairwise distances to clusters, and the identifier of the cluster assigned to within-strain distances. Nodes are samples and edges where samples are within the same cluster @@ -933,6 +1081,8 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = Labels of most likely cluster assignment within_label (int) The label for the cluster representing within-strain distances + int_offset (int) + Constant integer to add to each node index weights (list) List of weights for each edge in the network distMat (2 column ndarray) @@ -943,6 +1093,12 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = previous_network (str) Name of file containing a previous network to be integrated into this new network + old_ids (list) + Ordered list of vertex names in previous network + adding_qq_dists (bool) + Boolean specifying whether query-query edges are being added + to an existing network, such that not all the sequence IDs will + be found in the old IDs, which should already be correctly ordered previous_pkl (str) Name of file containing the names of the sequences in the previous_network betweenness_sample (int) @@ -961,24 +1117,36 @@ def construct_network_from_assignments(rlist, qlist, assignments, within_label = # Check GPU library use use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) - - # Convert edge indices to tuples - connections = poppunk_refine.generateTuples(assignments, within_label) + # Filter weights to only the relevant edges if weights is not None: weights = weights[assignments == within_label] - elif distMat is not None: + elif distMat is not None and weights_type is not None: + if isinstance(assignments, list): + assignments = np.array(assignments) distMat = distMat[assignments == within_label,:] + weights = process_weights(distMat, weights_type) + + # Convert edge indices to tuples + connections = poppunk_refine.generateTuples(assignments, + within_label, + self = (rlist == qlist), + num_ref = len(rlist), + int_offset = int_offset) + + # Construct network using edge list G = construct_network_from_edge_list(rlist, qlist, connections, weights = weights, distMat = distMat, - weights_type = weights_type, previous_network = previous_network, + adding_qq_dists = adding_qq_dists, + old_ids = old_ids, previous_pkl = previous_pkl, summarise = False, use_gpu = use_gpu) if summarise: print_network_summary(G, betweenness_sample = betweenness_sample, use_gpu = use_gpu) + return G def get_cugraph_triangles(G): @@ -1091,8 +1259,8 @@ def networkSummary(G, calc_betweenness=True, betweenness_sample = betweenness_sa return(metrics, scores) def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, - assignments, model, queryDB, queryQuery = False, - strand_preserved = False, weights = None, threads = 1, + assignments, model, queryDB, distances = None, distance_type = 'euclidean', + queryQuery = False, strand_preserved = False, weights = None, threads = 1, use_gpu = False): """Finds edges between queries and items in the reference database, and modifies the network to include them. 
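As an aside to the patch itself, a minimal sketch of how the new construct_dense_weighted_network helper added above is intended to be called; the sample labels and distance values are invented for illustration, the import path assumes the function is exported from PopPUNK.network alongside the other constructors, and weights_type follows the core/accessory/euclidean options noted in the docstrings:

    import numpy as np
    from PopPUNK.network import construct_dense_weighted_network

    # Three samples give three unordered pairs; one distMat row per pair,
    # columns are core and accessory distances (values invented for the sketch)
    rlist = ["sample1", "sample2", "sample3"]
    distMat = np.array([[0.010, 0.10],
                        [0.020, 0.12],
                        [0.015, 0.11]])

    # Every pair becomes a weighted edge; here edges are weighted by core distance
    G = construct_dense_weighted_network(rlist, distMat, weights_type = "core", use_gpu = False)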
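Similarly, a hedged sketch of how construct_network_from_assignments can now extend an existing network with query-query edges, mirroring the refactored addQueryToNetwork below; existing_refs_network is a placeholder for a graph built earlier from the reference set, and the assignment labels and distances are invented for the example:

    import numpy as np
    from PopPUNK.network import construct_network_from_assignments

    rList = ["ref1", "ref2"]                 # vertices already in existing_refs_network
    qList = ["query1", "query2", "query3"]   # new samples being appended
    queryAssignation = [1, 0, 1]             # one cluster label per query-query distance row
    qqDistMat = np.array([[0.012, 0.11],
                          [0.150, 0.30],
                          [0.014, 0.12]])

    # existing_refs_network: placeholder graph object containing only the references
    G = construct_network_from_assignments(qList,
                                           qList,
                                           queryAssignation,
                                           within_label = 1,            # label marking within-strain pairs
                                           int_offset = len(rList),     # shift query indices past the references
                                           previous_network = existing_refs_network,
                                           old_ids = rList,             # vertex order of the previous network
                                           adding_qq_dists = True,
                                           distMat = qqDistMat,
                                           weights_type = "euclidean",
                                           summarise = False)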
@@ -1114,6 +1282,10 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, Model fitted to reference database queryDB (str) Query database location + distances (str) + Prefix of distance files for extending network + distance_type (str) + Distance type to use as weights in network queryQuery (bool) Add in all query-query distances (default = False) @@ -1136,6 +1308,10 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, """ # initalise functions queryDatabase = dbFuncs['queryDatabase'] + + # do not calculate weights unless specified + if weights is None: + distance_type = None # initialise links data structure new_edges = [] @@ -1146,16 +1322,18 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, # store links for each query in a list of edge tuples ref_count = len(rList) - for row_idx, (assignment, (ref, query)) in enumerate(zip(assignments, listDistInts(rList, qList, self = False))): - if assignment == model.within_label: - # query index needs to be adjusted for existing vertices in network - if weights is not None: - dist = np.linalg.norm(weights[row_idx, :]) - edge_tuple = (ref, query + ref_count, dist) - else: - edge_tuple = (ref, query + ref_count) - new_edges.append(edge_tuple) - assigned.add(qList[query]) + + # Add queries to network + G = construct_network_from_assignments(rList, + qList, + assignments, + within_label = model.within_label, + previous_network = G, + old_ids = rList, + distMat = weights, + weights_type = distance_type, + summarise = False, + use_gpu = use_gpu) # Calculate all query-query distances too, if updating database if queryQuery: @@ -1173,15 +1351,26 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, number_plot_fits = 0, threads = threads) - queryAssignation = model.assign(qqDistMat) - for row_idx, (assignment, (ref, query)) in enumerate(zip(queryAssignation, listDistInts(qList, qList, self = True))): - if assignment == model.within_label: - if weights is not None: - dist = np.linalg.norm(qqDistMat[row_idx, :]) - edge_tuple = (ref + ref_count, query + ref_count, dist) - else: - edge_tuple = (ref + ref_count, query + ref_count) - new_edges.append(edge_tuple) + if distance_type == 'core': + queryAssignation = model.assign(qqDistMat, slope = 0) + elif distance_type == 'accessory': + queryAssignation = model.assign(qqDistMat, slope = 1) + else: + queryAssignation = model.assign(qqDistMat) + + # Add queries to network + G = construct_network_from_assignments(qList, + qList, + queryAssignation, + int_offset = ref_count, + within_label = model.within_label, + previous_network = G, + old_ids = rList, + adding_qq_dists = True, + distMat = qqDistMat, + weights_type = distance_type, + summarise = False, + use_gpu = use_gpu) # Otherwise only calculate query-query distances for new clusters else: @@ -1202,8 +1391,13 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, self = True, number_plot_fits = 0, threads = threads) - - queryAssignation = model.assign(qqDistMat) + + if distance_type == 'core': + queryAssignation = model.assign(qqDistMat, slope = 0) + elif distance_type == 'accessory': + queryAssignation = model.assign(qqDistMat, slope = 1) + else: + queryAssignation = model.assign(qqDistMat) # identify any links between queries and store in the same links dict # links dict now contains lists of links both to original database and new queries @@ -1211,44 +1405,29 @@ def addQueryToNetwork(dbFuncs, rList, qList, G, kmers, for row_idx, (assignment, (query1, query2)) in enumerate(zip(queryAssignation, iterDistRows(qList, qList, self = True))): 
if assignment == model.within_label: if weights is not None: - dist = np.linalg.norm(qqDistMat[row_idx, :]) + if distance_type == 'core': + dist = weights[row_idx, 0] + elif distance_type == 'accessory': + dist = weights[row_idx, 1] + else: + dist = np.linalg.norm(weights[row_idx, :]) edge_tuple = (query_indices[query1], query_indices[query2], dist) else: edge_tuple = (query_indices[query1], query_indices[query2]) new_edges.append(edge_tuple) - - # finish by updating the network - if use_gpu: - - use_gpu = check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = True) - - # construct updated graph - G_current_df = G.view_edge_list() - if weights is not None: - G_current_df.columns = ['source','destination','weights'] - G_extra_df = cudf.DataFrame(new_edges, columns = ['source','destination','weights']) - G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) - else: - G_current_df.columns = ['source','destination'] - G_extra_df = cudf.DataFrame(new_edges, columns = ['source','destination']) - G_df = cudf.concat([G_current_df,G_extra_df], ignore_index = True) - - # use self-loop to ensure all nodes are present - max_in_vertex_labels = ref_count + len(qList) - 1 - include_weights = False - if weights is not None: - include_weights = True - G = add_self_loop(G_df, max_in_vertex_labels, weights = include_weights) - - else: - G.add_vertex(len(qList)) - - if weights is not None: - eweight = G.new_ep("float") - G.add_edge_list(new_edges, eprops = [eweight]) - G.edge_properties["weight"] = eweight - else: - G.add_edge_list(new_edges) + + G = construct_network_from_assignments(qList, + qList, + queryAssignation, + int_offset = ref_count, + within_label = model.within_label, + previous_network = G, + old_ids = rList + qList, + adding_qq_dists = True, + distMat = qqDistMat, + weights_type = distance_type, + summarise = False, + use_gpu = use_gpu) return G, qqDistMat @@ -1573,7 +1752,6 @@ def generate_minimum_spanning_tree(G, from_cugraph = False): # MST - check cuDF implementation is the same max_indices = mst_df.groupby(['labels'])['degree'].idxmax() seed_vertices = mst_df.iloc[max_indices]['vertex'] - num_components = seed_vertices.size() else: component_assignments, component_frequencies = gt.label_components(mst_network) num_components = len(component_frequencies) @@ -1597,9 +1775,9 @@ def generate_minimum_spanning_tree(G, from_cugraph = False): # so no extra edges can be retrieved from the graph G_df = G.view_edge_list() max_weight = G_df['weights'].max() - first_seed = seed_vertices[0] + first_seed = seed_vertices.iloc[0] G_seed_link_df = cudf.DataFrame() - G_seed_link_df['dst'] = seed_vertices.iloc[1:seed_vertices.size()] + G_seed_link_df['dst'] = seed_vertices.iloc[1:seed_vertices.size] G_seed_link_df['src'] = seed_vertices.iloc[0] G_seed_link_df['weights'] = seed_vertices.iloc[0] G_df = G_df.append(G_seed_link_df) @@ -1624,7 +1802,9 @@ def generate_minimum_spanning_tree(G, from_cugraph = False): # Construct graph if from_cugraph: - mst_network = G_df.from_cudf_edgelist(edge_attr='weights', renumber=False) + mst_network = cugraph.Graph() + G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) + mst_network.from_cudf_edgelist(G_df, edge_attr='weights', renumber=False) else: seed_G = gt.Graph(directed = False) seed_G.add_vertex(len(seed_vertex)) @@ -1707,7 +1887,7 @@ def cugraph_to_graph_tool(G, rlist): Graph tool network """ edge_df = G.view_edge_list() - edge_tuple = edge_df[['src', 'dst']].values + edge_tuple = edge_df[['src', 'dst']].values.tolist() edge_weights = 
None if 'weights' in edge_df.columns: edge_weights = edge_df['weights'].values_host @@ -1719,3 +1899,37 @@ def cugraph_to_graph_tool(G, rlist): vals = rlist) G.vp.id = vid return G + +def sparse_mat_to_network(sparse_mat, rlist, use_gpu = False): + """Generate a network from a lineage rank fit + + Args: + sparse_mat (scipy or cupyx sparse matrix) + Sparse matrix of kNN from lineage fit + rlist (list) + List of sequence names + use_gpu (bool) + Whether GPU libraries should be used + + Returns: + G (network) + Graph tool or cugraph network + """ + if use_gpu: + G_df = cudf.DataFrame(columns = ['source','destination','weights']) + G_df['source'] = sparse_mat.row + G_df['destination'] = sparse_mat.col + G_df['weights'] = sparse_mat.data + max_in_vertex_labels = len(rlist)-1 + G = add_self_loop(G_df, max_in_vertex_labels, weights = True, renumber = False) + else: + connections = [] + for (src,dst) in zip(sparse_mat.row,sparse_mat.col): + connections.append((src, dst)) + G = construct_network_from_edge_list(rlist, + rlist, + connections, + weights=sparse_mat.data, + summarise=False) + + return G diff --git a/PopPUNK/plot.py index 425acf6d..99c733a1 100644 --- a/PopPUNK/plot.py +++ b/PopPUNK/plot.py @@ -307,7 +307,7 @@ def plot_refined_results(X, Y, x_boundary, y_boundary, core_boundary, accessory_ plt.plot([core_boundary*scale[0], core_boundary*scale[0]], [0, np.amax(X[:,1])], color='red', linewidth=2, linestyle='--', label='Threshold boundary') - plt.legend() + plt.legend(loc='lower right') plt.title(title) plt.xlabel('Core distance (' + r'$\pi$' + ')') plt.ylabel('Accessory distance (' + r'$a$' + ')') @@ -502,20 +502,17 @@ def outputsForCytoscape(G, G_mst, isolate_names, clustering, outPrefix, epiCsv, # write graph file if suffix is None: - graph_file_name = os.path.basename(outPrefix) + "_cytoscape.graphml" + suffix = '_cytoscape' else: - graph_file_name = os.path.basename(outPrefix) + "_" + suffix + "_cytoscape.graphml" - G.save(outPrefix + "/" + graph_file_name, fmt = 'graphml') + suffix = suffix + '_cytoscape' + save_network(G, prefix = outPrefix, suffix = suffix, use_graphml = True) if G_mst != None: isolate_labels = isolateNameToLabel(G_mst.vp.id) for n,v in enumerate(G_mst.vertices()): G_mst.vp.id[v] = isolate_labels[n] - if suffix is not None: - graph_suffix = '_' + suffix + '_cytoscape_mst' - else: - graph_suffix = '_cytoscape_mst' - save_network(G_mst, prefix = outPrefix, suffix = graph_suffix, use_graphml = True) + suffix = suffix + '_mst' + save_network(G_mst, prefix = outPrefix, suffix = suffix, use_graphml = True) # Write CSV of metadata if writeCsv: @@ -591,7 +588,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering, if queryNames is not None: colnames.append('Status') else: - sys.stderr.write("Do not recognise format for CSV writing") + sys.stderr.write("Do not recognise format for CSV writing\n") exit(1) # process epidemiological data diff --git a/PopPUNK/reference_pick.py index 6e4bcd68..d8b2be5f 100755 --- a/PopPUNK/reference_pick.py +++ b/PopPUNK/reference_pick.py @@ -14,6 +14,8 @@ from .sketchlib import removeFromDB from .network import extractReferences +from .network import load_network_file +from .network import save_network from .prune_db import prune_distance_matrix @@ -44,6 +46,7 @@ def get_options(): # processing other = parser.add_argument_group('Other options') other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]') + other.add_argument('--use-gpu', default=False, 
action='store_true', help='Whether to use GPUs') other.add_argument('--version', action='version', version='%(prog)s '+__version__) @@ -70,14 +73,24 @@ def main(): refList, queryList, self, distMat = readPickle(args.distances, enforce_self=True) # Read in full network - genomeNetwork = gt.load_graph(args.network) - sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") + genomeNetwork = load_network_file(args.network, use_gpu = args.use_gpu) + if args.use_gpu: + sys.stderr.write("Network loaded: " + str(genomeNetwork.number_of_vertices()) + " samples\n") + else: + sys.stderr.write("Network loaded: " + str(len(list(genomeNetwork.vertices()))) + " samples\n") # This is the same set of function calls for --fit-model when no --full-db in __main__.py # Find refs and prune network reference_indices, reference_names, refFileName, G_ref = \ - extractReferences(genomeNetwork, refList, args.output, threads = args.threads) - G_ref.save(args.output + "/" + os.path.basename(args.output) + '_graph.gt', fmt = 'gt') + extractReferences(genomeNetwork, + refList, + args.output, + threads = args.threads, + use_gpu = args.use_gpu) + save_network(G_ref, + prefix = args.output, + suffix = ".refs_graph", + use_gpu = args.use_gpu) # Prune distances nodes_to_remove = set(range(len(refList))).difference(reference_indices) diff --git a/PopPUNK/refine.py b/PopPUNK/refine.py index dc4cb51f..327afef1 100644 --- a/PopPUNK/refine.py +++ b/PopPUNK/refine.py @@ -32,6 +32,7 @@ import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False diff --git a/PopPUNK/sketchlib.py b/PopPUNK/sketchlib.py index de584180..7e1adab8 100644 --- a/PopPUNK/sketchlib.py +++ b/PopPUNK/sketchlib.py @@ -544,15 +544,30 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num example = sample(rNames, k=2) raw = np.zeros(len(klist)) corrected = np.zeros(len(klist)) - for kidx, kmer in enumerate(klist): - raw[kidx] = pp_sketchlib.jaccardDist(ref_db, example[0], example[1], kmer, False) - corrected[kidx] = pp_sketchlib.jaccardDist(ref_db, example[0], example[1], kmer, True) - raw_fit = fitKmerCurve(raw, klist, jacobian) - corrected_fit = fitKmerCurve(corrected, klist, jacobian) + raw = pp_sketchlib.queryDatabase(ref_db, + ref_db, + [example[0]], + [example[1]], + klist, + random_correct = False, + jaccard = True, + num_threads = threads, + use_gpu = False) + corrected = pp_sketchlib.queryDatabase(ref_db, + ref_db, + [example[0]], + [example[1]], + klist, + random_correct = True, + jaccard = True, + num_threads = threads, + use_gpu = False) + raw_fit = fitKmerCurve(raw[0], klist, jacobian) + corrected_fit = fitKmerCurve(corrected[0], klist, jacobian) plot_fit(klist, - raw, + raw[0], raw_fit, - corrected, + corrected[0], corrected_fit, dbPrefix + "/" + dbPrefix + "_fit_example_" + str(plot_idx + 1), "Example fit " + str(plot_idx + 1) + " - " + example[0] + " vs. " + example[1]) @@ -568,6 +583,41 @@ def queryDatabase(rNames, qNames, dbPrefix, queryPrefix, klist, self = True, num query_db = queryPrefix + "/" + os.path.basename(queryPrefix) distMat = pp_sketchlib.queryDatabase(ref_db, query_db, rNames, qNames, klist, True, False, threads, use_gpu, deviceid) + + # option to plot core/accessory fits. 
Choose a random number from cmd line option + if number_plot_fits > 0: + jacobian = -np.hstack((np.ones((klist.shape[0], 1)), klist.reshape(-1, 1))) + ref_examples = sample(rNames, k = number_plot_fits) + query_examples = sample(qNames, k = number_plot_fits) + raw = pp_sketchlib.queryDatabase(ref_db, + query_db, + ref_examples, + query_examples, + klist, + random_correct = False, + jaccard = True, + num_threads = threads, + use_gpu = False) + corrected = pp_sketchlib.queryDatabase(ref_db, + query_db, + ref_examples, + query_examples, + klist, + random_correct = True, + jaccard = True, + num_threads = threads, + use_gpu = False) + for plot_idx in range(number_plot_fits): + raw_fit = fitKmerCurve(raw[plot_idx], klist, jacobian) + corrected_fit = fitKmerCurve(corrected[plot_idx], klist, jacobian) + plot_fit(klist, + raw[plot_idx], + raw_fit, + corrected[plot_idx], + corrected_fit, + queryPrefix + "/" + queryPrefix + "_fit_example_" + str(plot_idx + 1), + "Example fit " + str(plot_idx + 1) + " - " + ref_examples[plot_idx] + \ + " vs. " + query_examples[plot_idx]) return distMat @@ -713,7 +763,7 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads failed, full_names = True) os.rename(filtered_db_name, db_name) - + hdf_in.close() # if failure still close files to avoid corruption except: hdf_in.close() @@ -738,6 +788,8 @@ def sketchlibAssemblyQC(prefix, names, klist, qc_dict, strand_preserved, threads # remove random matches if already present if 'random' in hdf_in: + hdf_in.close() + hdf_in = h5py.File(db_name, 'r+') del hdf_in['random'] hdf_in.close() diff --git a/PopPUNK/sparse_mst.py b/PopPUNK/sparse_mst.py index a8233465..3ecfa86f 100755 --- a/PopPUNK/sparse_mst.py +++ b/PopPUNK/sparse_mst.py @@ -19,6 +19,7 @@ import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False @@ -33,9 +34,12 @@ from .network import construct_network_from_sparse_matrix from .plot import drawMST + from .trees import mst_to_phylogeny, write_tree + from .utils import setGtThreads, readIsolateTypeFromCsv from .utils import check_and_set_gpu +from .utils import read_rlist_from_distance_pickle # command line parsing def get_options(): @@ -52,6 +56,7 @@ def get_options(): iGroup.add_argument('--previous-mst', help='Graph tool file from which previous MST can be loaded', default=None) iGroup.add_argument('--distance-pkl', help='Input pickle from distances, which contains sample names') + iGroup.add_argument('--previous-distance-pkl', help='Input pickle from distances, which contains sample names') iGroup.add_argument('--display-cluster', default=None, help='Column of clustering CSV to use for plotting') # output options @@ -72,51 +77,15 @@ def get_options(): return parser.parse_args() -def main(): - - # Check input args ok - args = get_options() - - import graph_tool.all as gt - # load CUDA libraries - args.gpu_graph = check_and_set_gpu(args.gpu_graph, gpu_lib) - - # Read in sample names - if (args.distance_pkl is not None) ^ (args.previous_clustering is not None): - sys.stderr.write("To label strains, both --distance-pkl and --previous-clustering" - " must be provided\n") - sys.exit(1) - elif os.path.exists(args.distance_pkl): - with open(args.distance_pkl, 'rb') as pickle_file: - rlist, qlist, self = pickle.load(pickle_file) - if not self: - sys.stderr.write("This script must be run on a full all-v-all model\n") - sys.exit(1) - else: - sys.stderr.write("Cannot find file " + args.distance_pkl + "\n") - sys.exit(1) - - # Check 
output path ok - if not os.path.isdir(args.output): - try: - os.makedirs(args.output) - except OSError: - sys.stderr.write("Cannot create output directory\n") - sys.exit(1) - setGtThreads(args.threads) - - # Create network with sparse dists - sys.stderr.write("Loading distances into graph\n") - sparse_mat = sparse.load_npz(args.rank_fit) - if args.gpu_graph: +def generate_mst_from_sparse_input(sparse_mat, rlist, old_rlist = None, previous_mst = None, gpu_graph = False): + if gpu_graph: # Load previous MST if specified - if args.previous_mst is not None: - print("Previous: " + str(args.previous_mst)) - extra_sources, extra_targets, extra_weights = network_to_edges(args.previous_mst, + if previous_mst is not None: + extra_sources, extra_targets, extra_weights = network_to_edges(previous_mst, rlist, - previous_pkl = args.distance_pkl, + old_ids = old_rlist, weights = True, - use_gpu = use_gpu) + use_gpu = gpu_graph) sources = np.append(sparse_mat.row, np.asarray(extra_sources)) targets = np.append(sparse_mat.col, np.asarray(extra_targets)) weights = np.append(sparse_mat.data, np.asarray(extra_weights)) @@ -134,12 +103,12 @@ def main(): G = cugraph.minimum_spanning_tree(G_cu, weight='weights') else: # Load previous MST if specified - if args.previous_mst is not None: + if previous_mst is not None: G = construct_network_from_sparse_matrix(rlist, rlist, sparse_mat, summarise=False, - previous_network = args.previous_mst) + previous_network = previous_mst) else: G = construct_network_from_sparse_matrix(rlist, rlist, @@ -147,7 +116,55 @@ def main(): summarise=False) sys.stderr.write("Calculating MST (CPU)\n") - G = generate_minimum_spanning_tree(G, args.gpu_graph) + G = generate_minimum_spanning_tree(G, gpu_graph) + + return(G) + + +def main(): + + # Check input args ok + args = get_options() + + import graph_tool.all as gt + # load CUDA libraries + args.gpu_graph = check_and_set_gpu(args.gpu_graph, gpu_lib) + + # Read in sample names + if (args.distance_pkl is not None) ^ (args.previous_clustering is not None): + sys.stderr.write("To label strains, both --distance-pkl and --previous-clustering" + " must be provided\n") + sys.exit(1) + elif os.path.exists(args.distance_pkl): + rlist = read_rlist_from_distance_pickle(args.distance_pkl, + allow_non_self = False) + else: + sys.stderr.write("Cannot find file " + args.distance_pkl + "\n") + sys.exit(1) + + # Read in old sequence names + old_rlist = None + if args.previous_distance_pkl is not None and os.path.exists(args.previous_distance_pkl): + old_rlist = read_rlist_from_distance_pickle(args.previous_distance_pkl, + allow_non_self = False) + + # Check output path ok + if not os.path.isdir(args.output): + try: + os.makedirs(args.output) + except OSError: + sys.stderr.write("Cannot create output directory\n") + sys.exit(1) + setGtThreads(args.threads) + + # Create network with sparse dists + sys.stderr.write("Loading distances into graph\n") + sparse_mat = sparse.load_npz(args.rank_fit) + G = generate_mst_from_sparse_input(sparse_mat, + rlist, + old_rlist = old_rlist, + previous_mst = args.previous_mst, + gpu_graph = args.gpu_graph) # Save output sys.stderr.write("Generating output\n") diff --git a/PopPUNK/trees.py b/PopPUNK/trees.py index f6683b5e..e240037d 100644 --- a/PopPUNK/trees.py +++ b/PopPUNK/trees.py @@ -19,6 +19,7 @@ import cudf import cupy as cp from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False @@ -173,6 +174,8 @@ def generate_nj_tree(coreMat, seqLabels, outPrefix, rapidnj, threads): tree_string = 
tree.as_string(schema="newick", suppress_rooting=True, unquoted_underscores=True) + tree_string = tree_string.replace("'","") + return tree_string def mst_to_phylogeny(mst_network, names, use_gpu = False): diff --git a/PopPUNK/tsne.py b/PopPUNK/tsne.py index 7068449f..75234037 100644 --- a/PopPUNK/tsne.py +++ b/PopPUNK/tsne.py @@ -16,7 +16,9 @@ import cugraph import cudf import cupy as cp + from cuml import manifold as manifold_gpu from numba import cuda + import rmm gpu_lib = True except ImportError as e: gpu_lib = False @@ -80,6 +82,7 @@ def get_options(): parser.add_argument('--output', required=True, help='Name of output file') parser.add_argument('--perplexity', help='Perplexity used to generate t-SNE projection [default = 30]', type=int, default=30) parser.add_argument('--verbosity', help='Verbosity level for t-SNE (0-3) [default = 0]', type=int, default=0) + parser.add_argument('--use-gpu', help='Whether to use GPU libraries for t-SNE calculation', default = False, action='store_true') return parser.parse_args() @@ -125,7 +128,7 @@ def main(): j += 1 # generate accessory genome distance representation - generate_tsne(seqLabels, accMat, args.perplexity, args.output, overwrite = True, verbosity = verbosity) + generate_tsne(seqLabels, accMat, args.perplexity, args.output, overwrite = True, use_gpu = args.use_gpu, verbosity = verbosity) if __name__ == "__main__": diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index aa3e9113..f823defe 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -22,6 +22,9 @@ try: import cudf + import rmm + import cupy + from numba import cuda gpu_lib = True except ImportError as e: gpu_lib = False @@ -264,7 +267,7 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): from .sketchlib import pickTypeIsolate # Create overall list of sequences - if refList == refList: + if refList == queryList: seq_names_passing = refList else: seq_names_passing = refList + queryList @@ -288,7 +291,11 @@ def qcDistMat(distMat, refList, queryList, ref_db, prefix, qc_dict): # First check with numpy, which is quicker than iterating over everything #long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])])[1].tolist() long_distance_rows = np.where([(distMat[:, 0] > qc_dict['max_pi_dist']) | (distMat[:, 1] > qc_dict['max_a_dist'])],0,1)[0].tolist() - long_edges = poppunk_refine.generateTuples(long_distance_rows, 0) + long_edges = poppunk_refine.generateTuples(long_distance_rows, + 0, + self = (refList == queryList), + num_ref = len(refList), + int_offset = 0) if len(long_edges) > 0: # Prune sequences based on reference sequence for (s,t) in long_edges: @@ -495,7 +502,7 @@ def readRfile(rFile, oneSeq=False): for sequence in rFields[1:]: sample_files.append(sequence) - # Take first of sequence list if using mash + # Take first of sequence list if oneSeq: if len(sample_files) > 1: sys.stderr.write("Multiple sequence found for " + rFields[0] + @@ -504,6 +511,9 @@ def readRfile(rFile, oneSeq=False): else: sequences.append(sample_files) + # Process names to ensure compatibility with downstream software + names = isolateNameToLabel(names) + if len(set(names)) != len(names): seen = set() dupes = set(x for x in names if x in seen or seen.add(x)) @@ -534,7 +544,8 @@ def isolateNameToLabel(names): """ # useful to have as a function in case we # want to remove certain characters - labels = [name.split('/')[-1].split('.')[0].replace(':','') for name in names] + labels = 
[name.split('/')[-1].replace('.','_').replace(':','').replace('(','_').replace(')','_') \ + for name in names] return labels @@ -626,6 +637,32 @@ def check_and_set_gpu(use_gpu, gpu_lib, quit_on_fail = False): # Set memory management for large networks if use_gpu: + rmm.reinitialize(managed_memory=True) cudf.set_allocator("managed") - + if "cupy" in sys.modules: + cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) + if "cuda" in sys.modules: + cuda.set_memory_manager(rmm.RMMNumbaManager) + assert(rmm.is_initialized()) + return use_gpu + +def read_rlist_from_distance_pickle(fn, allow_non_self = True): + """Return the list of reference sequences from a distance pickle. + + Args: + fn (str) + Name of distance pickle + allow_non_self (bool) + Whether non-self distance datasets are permissible + Returns: + rlist (list) + List of reference sequence names + """ + with open(fn, 'rb') as pickle_file: + rlist, qlist, self = pickle.load(pickle_file) + if not allow_non_self and not self: + sys.stderr.write("This analysis requires an all-v-all" + " distance dataset\n") + sys.exit(1) + return rlist diff --git a/PopPUNK/visualise.py index 800af930..f60333a2 100644 --- a/PopPUNK/visualise.py +++ b/PopPUNK/visualise.py @@ -5,9 +5,20 @@ # universal import os import sys +import pickle # additional import numpy as np -import scipy.sparse +from scipy import sparse + +try: + import cudf + import rmm + import cupy + import cugraph + from numba import cuda + gpu_lib = True +except ImportError as e: + gpu_lib = False # required from v2.1.1 onwards (no mash support) import pp_sketchlib @@ -39,7 +50,10 @@ def get_options(): help='Location of query database, if distances ' 'are from ref-query') iGroup.add_argument('--distances', - help='Prefix of input pickle of pre-calculated distances', default=None) + help='Prefix of input pickle of pre-calculated distances', + default=None) + iGroup.add_argument('--rank-fit', + help='Location of rank fit, a sparse matrix (*_rank*_fit.npz)') + iGroup.add_argument('--include-files', help='File with list of sequences to include in visualisation. 
' 'Default is to use all sequences in database.', @@ -62,6 +76,15 @@ def get_options(): 'from poppunk_assign [default = use that in the directory ' 'of the query database]', type = str) + iGroup.add_argument('--previous-mst', + help='File containing previous minimum spanning tree', + default=None, + type = str) + iGroup.add_argument('--previous-distances', + help='Prefix of distance files used to generate the previous ' + 'minimum spanning tree', + default=None, + type = str) iGroup.add_argument('--network-file', help='Specify a file to use for any graph visualisations', type = str) @@ -135,6 +158,7 @@ def get_options(): def generate_visualisations(query_db, ref_db, distances, + rank_fit, threads, output, gpu_dist, @@ -150,6 +174,8 @@ def generate_visualisations(query_db, model_dir, previous_clustering, previous_query_clustering, + previous_mst, + previous_distances, network_file, gpu_graph, info_csv, @@ -169,6 +195,8 @@ def generate_visualisations(query_db, from .network import generate_minimum_spanning_tree from .network import load_network_file from .network import cugraph_to_graph_tool + from .network import save_network + from .network import sparse_mat_to_network from .plot import drawMST from .plot import outputsForMicroreact @@ -182,6 +210,8 @@ def generate_visualisations(query_db, from .sketchlib import readDBParams from .sketchlib import getKmersFromReferenceDatabase from .sketchlib import addRandom + + from .sparse_mst import generate_mst_from_sparse_input from .trees import load_tree, generate_nj_tree, mst_to_phylogeny @@ -192,6 +222,13 @@ def generate_visualisations(query_db, from .utils import readIsolateTypeFromCsv from .utils import joinClusterDicts from .utils import listDistInts + from .utils import read_rlist_from_distance_pickle + + #******************************# + #* *# + #* Initial checks and set up *# + #* *# + #******************************# # Check on parallelisation of graph-tools setGtThreads(threads) @@ -209,6 +246,13 @@ def generate_visualisations(query_db, sys.stderr.write("Cannot create output directory\n") sys.exit(1) + #******************************# + #* *# + #* Process dense or sparse *# + #* distances *# + #* *# + #******************************# + if distances is None: if query_db is None: distances = ref_db + "/" + os.path.basename(ref_db) + ".dists" @@ -217,53 +261,79 @@ def generate_visualisations(query_db, else: distances = distances - rlist, qlist, self, complete_distMat = readPickle(distances) - if not self: - qr_distMat = complete_distMat - else: - rr_distMat = complete_distMat - - # Fill in qq-distances if required - if self == False: - sys.stderr.write("Note: Distances in " + distances + " are from assign mode\n" - "Note: Distance will be extended to full all-vs-all distances\n" - "Note: Re-run poppunk_assign with --update-db to avoid this\n") - ref_db_loc = ref_db + "/" + os.path.basename(ref_db) - rlist_original, qlist_original, self_ref, rr_distMat = readPickle(ref_db_loc + ".dists") - if not self_ref: - sys.stderr.write("Distances in " + ref_db + " not self all-vs-all either\n") - sys.exit(1) - kmers, sketch_sizes, codon_phased = readDBParams(query_db) - addRandom(query_db, qlist, kmers, - strand_preserved = strand_preserved, threads = threads) - query_db_loc = query_db + "/" + os.path.basename(query_db) - qq_distMat = pp_sketchlib.queryDatabase(query_db_loc, query_db_loc, - qlist, qlist, kmers, - True, False, - threads, - gpu_dist, - deviceid) - - # If the assignment was run with references, qrDistMat will be incomplete - if rlist != 
rlist_original: - rlist = rlist_original - qr_distMat = pp_sketchlib.queryDatabase(ref_db_loc, query_db_loc, - rlist, qlist, kmers, + # Determine whether to use sparse distances + use_sparse = False + use_dense = False + if (tree == 'mst' or tree == 'both' or cytoscape) and rank_fit is not None: + # Set flag + use_sparse = True + # Read list of sequence names and sparse distance matrix + rlist = read_rlist_from_distance_pickle(distances + '.pkl') + sparse_mat = sparse.load_npz(rank_fit) + combined_seq = rlist + # Check previous distances have been supplied if building on a previous MST + old_rlist = None + if previous_distances is not None: + old_rlist = read_rlist_from_distance_pickle(previous_distances + '.pkl') + elif previous_mst is not None: + sys.stderr.write('The prefix of the distance files used to create the previous MST' + ' is needed to use the network') + if tree == 'nj' or tree == 'both' or microreact: + use_dense = True + # Process dense distance matrix + rlist, qlist, self, complete_distMat = readPickle(distances) + if not self: + qr_distMat = complete_distMat + else: + rr_distMat = complete_distMat + + # Fill in qq-distances if required + if self == False: + sys.stderr.write("Note: Distances in " + distances + " are from assign mode\n" + "Note: Distance will be extended to full all-vs-all distances\n" + "Note: Re-run poppunk_assign with --update-db to avoid this\n") + ref_db_loc = ref_db + "/" + os.path.basename(ref_db) + rlist_original, qlist_original, self_ref, rr_distMat = readPickle(ref_db_loc + ".dists") + if not self_ref: + sys.stderr.write("Distances in " + ref_db + " not self all-vs-all either\n") + sys.exit(1) + kmers, sketch_sizes, codon_phased = readDBParams(query_db) + addRandom(query_db, qlist, kmers, + strand_preserved = strand_preserved, threads = threads) + query_db_loc = query_db + "/" + os.path.basename(query_db) + qq_distMat = pp_sketchlib.queryDatabase(query_db_loc, query_db_loc, + qlist, qlist, kmers, True, False, threads, gpu_dist, deviceid) - else: - qlist = None - qr_distMat = None - qq_distMat = None + # If the assignment was run with references, qrDistMat will be incomplete + if rlist != rlist_original: + rlist = rlist_original + qr_distMat = pp_sketchlib.queryDatabase(ref_db_loc, query_db_loc, + rlist, qlist, kmers, + True, False, + threads, + gpu_dist, + deviceid) - # Turn long form matrices into square form - combined_seq, core_distMat, acc_distMat = \ - update_distance_matrices(rlist, rr_distMat, - qlist, qr_distMat, qq_distMat, - threads = threads) + else: + qlist = None + qr_distMat = None + qq_distMat = None + + # Turn long form matrices into square form + combined_seq, core_distMat, acc_distMat = \ + update_distance_matrices(rlist, rr_distMat, + qlist, qr_distMat, qq_distMat, + threads = threads) + + #*******************************# + #* *# + #* Extract subset of sequences *# + #* *# + #*******************************# # extract subset of distances if requested if include_files is not None: @@ -277,13 +347,22 @@ def generate_visualisations(query_db, # Only keep found rows row_slice = [True if name in viz_subset else False for name in combined_seq] combined_seq = [name for name in combined_seq if name in viz_subset] - if qlist != None: - qlist = list(viz_subset.intersection(qlist)) - core_distMat = core_distMat[np.ix_(row_slice, row_slice)] - acc_distMat = acc_distMat[np.ix_(row_slice, row_slice)] + if use_sparse: + sparse_mat = sparse_mat[np.ix_(row_slice, row_slice)] + if use_dense: + if qlist != None: + qlist = 
list(viz_subset.intersection(qlist)) + core_distMat = core_distMat[np.ix_(row_slice, row_slice)] + acc_distMat = acc_distMat[np.ix_(row_slice, row_slice)] else: viz_subset = None + #**********************************# + #* *# + #* Process clustering information *# + #* *# + #**********************************# + # Either use strain definitions, lineage assignments or external clustering isolateClustering = {} # Use external clustering if specified @@ -322,28 +401,42 @@ def generate_visualisations(query_db, if model.type == "lineage": mode = "lineages" suffix = "_lineages.csv" - if model.indiv_fitted: - sys.stderr.write("Note: Individual (core/accessory) fits found, but " - "visualisation only supports combined boundary fit\n") prev_clustering = os.path.basename(model_file) + '/' + os.path.basename(model_file) + suffix isolateClustering = readIsolateTypeFromCsv(prev_clustering, mode = mode, return_dict = True) + # Add individual refinement clusters if they exist + if model.indiv_fitted: + for type, suffix in zip(['Core','Accessory'],['_core_clusters.csv','_accessory_clusters.csv']): + indiv_clustering = os.path.basename(model_file) + '/' + os.path.basename(model_file) + suffix + if os.path.isfile(indiv_clustering): + indiv_isolateClustering = readIsolateTypeFromCsv(indiv_clustering, + mode = mode, + return_dict = True) + isolateClustering[type] = indiv_isolateClustering['Cluster'] + # Join clusters with query clusters if required - if not self: - if previous_query_clustering is not None: - prev_query_clustering = previous_query_clustering - else: - prev_query_clustering = os.path.basename(query_db) + '/' + os.path.basename(query_db) + suffix + if use_dense: + if not self: + if previous_query_clustering is not None: + prev_query_clustering = previous_query_clustering + else: + prev_query_clustering = os.path.basename(query_db) + '/' + os.path.basename(query_db) + suffix + + queryIsolateClustering = readIsolateTypeFromCsv( + prev_query_clustering, + mode = mode, + return_dict = True) + isolateClustering = joinClusterDicts(isolateClustering, queryIsolateClustering) - queryIsolateClustering = readIsolateTypeFromCsv( - prev_query_clustering, - mode = mode, - return_dict = True) - isolateClustering = joinClusterDicts(isolateClustering, queryIsolateClustering) + #*******************# + #* *# + #* Generate trees *# + #* *# + #*******************# - # Generate MST + # Generate trees mst_tree = None mst_graph = None nj_tree = None @@ -365,33 +458,52 @@ def generate_visualisations(query_db, clustering_name = display_cluster else: clustering_name = list(isolateClustering.keys())[0] - # Get distance matrix - complete_distMat = \ - np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), - pp_sketchlib.squareToLong(acc_distMat, threads).reshape(-1, 1))) - # Dense network may be slow - sys.stderr.write("Generating MST from dense distances (may be slow)\n") - G = construct_network_from_assignments(combined_seq, - combined_seq, - [0]*complete_distMat.shape[0], - within_label = 0, - distMat = complete_distMat, - weights_type = mst_distances, - use_gpu = gpu_graph, - summarise = False) - if gpu_graph: - G = cugraph.minimum_spanning_tree(G, weight='weights') + if use_sparse: + G = generate_mst_from_sparse_input(sparse_mat, + rlist, + old_rlist = old_rlist, + previous_mst = previous_mst, + gpu_graph = gpu_graph) + elif use_dense: + # Get distance matrix + complete_distMat = \ + np.hstack((pp_sketchlib.squareToLong(core_distMat, threads).reshape(-1, 1), + pp_sketchlib.squareToLong(acc_distMat, 
threads).reshape(-1, 1))) + # Dense network may be slow + sys.stderr.write("Generating MST from dense distances (may be slow)\n") + G = construct_network_from_assignments(combined_seq, + combined_seq, + [0]*complete_distMat.shape[0], + within_label = 0, + distMat = complete_distMat, + weights_type = mst_distances, + use_gpu = gpu_graph, + summarise = False) + if gpu_graph: + G = cugraph.minimum_spanning_tree(G, weight='weights') + else: + sys.stderr.write("Need either sparse or dense distances matrix to construct MST\n") + exit(1) mst_graph = generate_minimum_spanning_tree(G, gpu_graph) del G - mst_as_tree = mst_to_phylogeny(mst_graph, - isolateNameToLabel(combined_seq), - use_gpu = gpu_graph) + # save outputs + save_network(mst_graph, + prefix = output, + suffix = '_mst', + use_graphml = False, + use_gpu = gpu_graph) if gpu_graph: mst_graph = cugraph_to_graph_tool(mst_graph, isolateNameToLabel(combined_seq)) else: vid = mst_graph.new_vertex_property('string', vals = isolateNameToLabel(combined_seq)) mst_graph.vp.id = vid + mst_as_tree = mst_to_phylogeny(mst_graph, + isolateNameToLabel(combined_seq), + use_gpu = False) + mst_as_tree = mst_as_tree.replace("'","") + with open(os.path.join(output,os.path.basename(output) + '_mst.nwk'),'w') as tree_out: + tree_out.write(mst_as_tree) drawMST(mst_graph, output, isolateClustering, clustering_name, overwrite) else: mst_tree = existing_tree @@ -412,6 +524,12 @@ def generate_visualisations(query_db, else: sys.stderr.write("Fewer than three sequences, not drawing trees\n") + #****************# + #* *# + #* Write output *# + #* *# + #****************# + # Now have all the objects needed to generate selected visualisations if microreact: sys.stderr.write("Writing microreact output\n") @@ -451,10 +569,13 @@ def generate_visualisations(query_db, if cytoscape: sys.stderr.write("Writing cytoscape output\n") - if network_file is None: - sys.stderr.write('Cytoscape output requires a network file is provided\n') + if network_file is not None: + genomeNetwork = load_network_file(network_file, use_gpu = gpu_graph) + elif rank_fit is not None: + genomeNetwork = sparse_mat_to_network(sparse_mat, combined_seq, use_gpu = gpu_graph) + else: + sys.stderr.write('Cytoscape output requires a network file or lineage rank fit is provided\n') sys.exit(1) - genomeNetwork = load_network_file(network_file, use_gpu = gpu_graph) if gpu_graph: genomeNetwork = cugraph_to_graph_tool(genomeNetwork, isolateNameToLabel(combined_seq)) outputsForCytoscape(genomeNetwork, @@ -477,6 +598,7 @@ def main(): generate_visualisations(args.query_db, args.ref_db, args.distances, + args.rank_fit, args.threads, args.output, args.gpu_dist, @@ -492,6 +614,8 @@ def main(): args.model_dir, args.previous_clustering, args.previous_query_clustering, + args.previous_mst, + args.previous_distances, args.network_file, args.gpu_graph, args.info_csv, diff --git a/docs/qc.rst b/docs/qc.rst index 74fed79f..97541c24 100644 --- a/docs/qc.rst +++ b/docs/qc.rst @@ -59,8 +59,11 @@ and run:: poppunk_prune --remove remove.txt --distances strain_db/strain_db.dists --output pruned_db -This will remove the samples from the ``strain_db.dists`` files, from which -``--model-fit`` can be run again. +This will remove the samples from the ``strain_db.dists`` files. 
This can instead be done +simultaneously as the model is fitted - problematic sequences can be pruned, and the model fitted +to the remaining high-quality samples by modifying the model fitting command to include QC options: + + poppunk --fit-model dbscan --ref-db example_db --output example_dbscan --max-a-dist 0.4 --max-pi-dist 0.2 Dealing with poor quality data ------------------------------ @@ -123,4 +126,3 @@ cytoscape directly, though removal from the PopPUNK database is best. The second largest cluster is also suspicious, where there are few triangles (low transitivity) and the nodes involved have high Stress. This is indicative of a bad fit overall, rather than a single problem sample. - diff --git a/poppunk_info-runner.py b/poppunk_info-runner.py new file mode 100755 index 00000000..764311f3 --- /dev/null +++ b/poppunk_info-runner.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# vim: set fileencoding= : +# Copyright 2018-2020 John Lees, Daniel Anderson and Nick Croucher + +"""Convenience wrapper for running poppunk_info directly from source tree.""" + +from PopPUNK.info import main + +if __name__ == '__main__': + main() + diff --git a/scripts/poppunk_db_info.py b/scripts/poppunk_db_info.py deleted file mode 100755 index 77ae20f9..00000000 --- a/scripts/poppunk_db_info.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python -# vim: set fileencoding= : -# Copyright 2018-2020 John Lees and Nick Croucher - -import sys -import argparse -import h5py - -# command line parsing -def get_options(): - - parser = argparse.ArgumentParser(description='Get information about a PopPUNK database', - prog='poppunk_db_info') - - # input options - parser.add_argument('db', help='Database file (.h5)') - parser.add_argument('--list-samples', action='store_true', default=False, - help='List every sample in the database') - - return parser.parse_args() - -# main code -if __name__ == "__main__": - - # Check input ok - args = get_options() - - ref_db = h5py.File(args.db, 'r') - print("PopPUNK database:\t\t" + args.db) - - sketch_version = ref_db['sketches'].attrs['sketch_version'] - print("Sketch version:\t\t\t" + sketch_version) - - num_samples = len(ref_db['sketches'].keys()) - print("Number of samples:\t\t" + str(num_samples)) - - first_sample = list(ref_db['sketches'].keys())[0] - kmer_size = ref_db['sketches/' + first_sample].attrs['kmers'] - print("K-mer sizes:\t\t\t" + ",".join([str(x) for x in kmer_size])) - - sketch_size = int(ref_db['sketches/' + first_sample].attrs['sketchsize64']) * 64 - print("Sketch size:\t\t\t" + str(sketch_size)) - - if 'random' in ref_db.keys(): - has_random = True - else: - has_random = False - print("Contains random matches:\t" + str(has_random)) - - try: - codon_phased = ref_db['sketches'].attrs['codon_phased'] == 1 - except KeyError: - codon_phased = False - print("Codon phased seeds:\t\t" + str(codon_phased)) - - if args.list_samples: - print("\n") - print("\t".join(["name", "base_frequencies", "length", "missing_bases"])) - for sample_name in list(ref_db['sketches'].keys()): - sample_string = [sample_name] - base_freq = ref_db['sketches/' + sample_name].attrs['base_freq'] - sample_string.append(",".join([base + ':' + "{:.3f}".format(x) for base, x in zip(['A', 'C', 'G', 'T'], base_freq)])) - sample_string.append(str(ref_db['sketches/' + sample_name].attrs['length'])) - sample_string.append(str(ref_db['sketches/' + sample_name].attrs['missing_bases'])) - print("\t".join(sample_string)) - - - sys.exit(0) diff --git a/setup.py b/setup.py index fb3cd788..ca76070a 100644 --- 
a/setup.py +++ b/setup.py @@ -113,7 +113,8 @@ def build_extension(self, ext): 'poppunk_mst = PopPUNK.sparse_mst:main', 'poppunk_prune = PopPUNK.prune_db:main', 'poppunk_references = PopPUNK.reference_pick:main', - 'poppunk_tsne = PopPUNK.tsne:main' + 'poppunk_tsne = PopPUNK.tsne:main', + 'poppunk_info = PopPUNK.info:main' ] }, scripts=['scripts/poppunk_calculate_rand_indices.py', @@ -122,7 +123,6 @@ def build_extension(self, ext): 'scripts/poppunk_batch_mst.py', 'scripts/poppunk_extract_distances.py', 'scripts/poppunk_add_weights.py', - 'scripts/poppunk_db_info.py', 'scripts/poppunk_easy_run.py', 'scripts/poppunk_pickle_fix.py'], ext_modules=[CMakeExtension('poppunk_refine')], diff --git a/src/boundary.cpp b/src/boundary.cpp index 66071a43..d1991fa7 100644 --- a/src/boundary.cpp +++ b/src/boundary.cpp @@ -113,20 +113,61 @@ edge_tuple edge_iterate(const NumpyMatrix &distMat, const int slope, return edge_vec; } -edge_tuple generate_tuples(const std::vector &assignments, const int within_label) { +edge_tuple generate_tuples(const std::vector &assignments, + const int within_label, + bool self, + const int num_ref, + const int int_offset) { const size_t n_rows = assignments.size(); const size_t n_samples = 0.5 * (1 + sqrt(1 + 8 * (n_rows))); edge_tuple edge_vec; for (long row_idx = 0; row_idx < n_rows; row_idx++) { + unsigned long i, j; if (assignments[row_idx] == within_label) { - long i = calc_row_idx(row_idx, n_samples); - long j = calc_col_idx(row_idx, i, n_samples); + if (self) { + i = calc_row_idx(row_idx, n_samples); + j = calc_col_idx(row_idx, i, n_samples) + int_offset; + i = i + int_offset; + } else { + i = row_idx % num_ref + int_offset; + j = row_idx / num_ref + num_ref + int_offset; + } + if (i > j) { + std::swap(i, j); + } edge_vec.push_back(std::make_tuple(i, j)); } } return edge_vec; } +edge_tuple generate_all_tuples(const int num_ref, + const int num_queries, + bool self, + const int int_offset) { + edge_tuple edge_vec; + if (self) { + const size_t n_rows = (pow(2 * num_ref - 1, 2) - 1) / 8; + for (long row_idx = 0; row_idx < n_rows; row_idx++) { + unsigned long i, j; + i = calc_row_idx(row_idx, num_ref); + j = calc_col_idx(row_idx, i, num_ref) + int_offset; + i = i + int_offset; + if (i > j) { + std::swap(i, j); + } + edge_vec.push_back(std::make_tuple(i, j)); + } + } else { + for (unsigned long j = 0; j < num_ref; j++) { + for (unsigned long i = 0; i < num_queries; i++) { + edge_vec.push_back(std::make_tuple(i, j + num_ref)); + } + } + } + return edge_vec; +} + // Line defined between (x0, y0) and (x1, y1) // Offset is distance along this line, starting at (x0, y0) network_coo threshold_iterate_1D(const NumpyMatrix &distMat, diff --git a/src/boundary.hpp b/src/boundary.hpp index 3b361146..d29b2448 100644 --- a/src/boundary.hpp +++ b/src/boundary.hpp @@ -27,13 +27,21 @@ edge_tuple edge_iterate(const NumpyMatrix &distMat, const int slope, const float x_max, const float y_max); edge_tuple generate_tuples(const std::vector &assignments, - const int within_label); + const int within_label, + bool self, + const int num_ref, + const int int_offset); + +edge_tuple generate_all_tuples(const int num_ref, + const int num_queries, + bool self, + const int int_offset); network_coo threshold_iterate_1D(const NumpyMatrix &distMat, const std::vector &offsets, const int slope, const float x0, const float y0, const float x1, const float y1, - const int num_threads = 1); + const int num_threads); network_coo threshold_iterate_2D(const NumpyMatrix &distMat, const std::vector &x_max, diff --git 
a/src/python_bindings.cpp b/src/python_bindings.cpp index 31ec3f9e..1fdf6634 100644 --- a/src/python_bindings.cpp +++ b/src/python_bindings.cpp @@ -31,9 +31,27 @@ edge_tuple edgeThreshold(const Eigen::Ref &distMat, } edge_tuple generateTuples(const std::vector &assignments, - const int within_label) { - edge_tuple edges = generate_tuples(assignments, within_label); - return (edges); + const int within_label, + bool self, + const int num_ref, + const int int_offset) { + edge_tuple edges = generate_tuples(assignments, + within_label, + self, + num_ref, + int_offset); + return (edges); +} + +edge_tuple generateAllTuples(const int num_ref, + const int num_queries, + bool self = true, + const int int_offset = 0) { + edge_tuple edges = generate_all_tuples(num_ref, + num_queries, + self, + int_offset); + return (edges); } network_coo thresholdIterate1D(const Eigen::Ref &distMat, @@ -80,10 +98,22 @@ PYBIND11_MODULE(poppunk_refine, m) { py::arg("y_max")); m.def("generateTuples", &generateTuples, - py::return_value_policy::reference_internal, - "Return edge tuples based on assigned groups", - py::arg("assignments"), py::arg("within_label")); + py::return_value_policy::reference_internal, + "Return edge tuples based on assigned groups", + py::arg("assignments"), + py::arg("within_label"), + py::arg("self") = true, + py::arg("num_ref") = 0, + py::arg("int_offset") = 0); + m.def("generateAllTuples", &generateAllTuples, + py::return_value_policy::reference_internal, + "Return all edge tuples", + py::arg("num_ref"), + py::arg("num_queries") = 0, + py::arg("self") = true, + py::arg("int_offset") = 0); + m.def("thresholdIterate1D", &thresholdIterate1D, py::return_value_policy::reference_internal, "Move a 2D boundary to grow a network by adding edges at each offset", diff --git a/test/manual.txt b/test/manual.txt index 65a718b0..85c73b72 100644 --- a/test/manual.txt +++ b/test/manual.txt @@ -1,2 +1,2 @@ start 0,0 -end 0.3,0.3 \ No newline at end of file +end 0.3,0.3 diff --git a/test/run_test.py b/test/run_test.py index 4e0bc2dd..9d7fab26 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -62,10 +62,10 @@ #assign query sys.stderr.write("Running query assignment\n") -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --output example_single_query --update-db --overwrite", shell=True, check=True) -subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --core --accessory", shell=True, check=True) +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_dbscan --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) # uses graph weights +subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --model-dir example_refine --output example_single_query --update-db 
diff --git a/test/manual.txt b/test/manual.txt
index 65a718b0..85c73b72 100644
--- a/test/manual.txt
+++ b/test/manual.txt
@@ -1,2 +1,2 @@
 start 0,0
-end 0.3,0.3
\ No newline at end of file
+end 0.3,0.3
diff --git a/test/run_test.py b/test/run_test.py
index 4e0bc2dd..9d7fab26 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -62,10 +62,10 @@
 #assign query
 sys.stderr.write("Running query assignment\n")
-subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query --overwrite", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --output example_single_query --update-db --overwrite", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --core --accessory", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_dbscan --output example_query_update --update-db --graph-weights --overwrite", shell=True, check=True) # uses graph weights
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --model-dir example_refine --output example_single_query --update-db --overwrite", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --model-dir example_lineages --output example_lineage_query --overwrite", shell=True, check=True)

 # viz
 sys.stderr.write("Running visualisations (poppunk_visualise)\n")
@@ -80,7 +80,7 @@
 # MST
 sys.stderr.write("Running MST\n")
-subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree mst", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree both", shell=True, check=True)
 subprocess.run(python_cmd + " ../poppunk_mst-runner.py --distance-pkl example_db/example_db.dists.pkl --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot", shell=True, check=True)

 # t-sne
@@ -95,6 +95,10 @@
 sys.stderr.write("Running poppunk_references\n")
 subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.gt --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db", shell=True, check=True)

+# info
+sys.stderr.write("Running poppunk_info\n")
+subprocess.run(python_cmd + " ../poppunk_info-runner.py --db example_db --output example_db.info.csv", shell=True, check=True)
+
 # citations
 sys.stderr.write("Printing citations\n")
 subprocess.run(python_cmd + " ../poppunk-runner.py --citation --fit-model bgmm --ref-db example_db --K 4", shell=True, check=True)
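run_test.py now exercises the new poppunk_info entry point in place of the removed scripts/poppunk_db_info.py. A minimal sketch of using it outside the test harness, assuming only the --db and --output options shown above (the columns of the output CSV are not specified in this diff):

import subprocess
import pandas as pd

subprocess.run("poppunk_info --db example_db --output example_db.info.csv",
               shell=True, check=True)
info = pd.read_csv("example_db.info.csv")
print(info.shape)
print(list(info.columns))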
diff --git a/test/test-gpu.py b/test/test-gpu.py
index 60111ffe..9b932cbb 100755
--- a/test/test-gpu.py
+++ b/test/test-gpu.py
@@ -27,7 +27,7 @@
 # test updating order is correct
 sys.stderr.write("Running distance matrix order check (--update-db)\n")
-subprocess.run(python_cmd + " test-update.py", shell=True, check=True)
+subprocess.run(python_cmd + " test-update-gpu.py", shell=True, check=True)

 #fit GMM
 sys.stderr.write("Running GMM model fit (--fit-model gmm)\n")
@@ -39,12 +39,13 @@
 #refine model with GMM
 sys.stderr.write("Running model refinement (--fit-model refine)\n")
-subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --gpu-graph", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both --gpu-graph", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --indiv-refine both --no-local --gpu-graph", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --unconstrained --gpu-graph", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 1 --gpu-graph", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.8 --overwrite --score-idx 2 --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --manual-start manual.txt --overwrite --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --indiv-refine both --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --indiv-refine both --no-local --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --unconstrained --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --score-idx 1 --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model refine --ref-db example_db --output example_refine --neg-shift 0.2 --overwrite --score-idx 2 --gpu-graph", shell=True, check=True)
 subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model threshold --threshold 0.003 --ref-db example_db --output example_threshold --gpu-graph", shell=True, check=True)

 # lineage clustering
@@ -61,15 +62,15 @@
 #assign query
 sys.stderr.write("Running query assignment\n")
-subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query --overwrite --gpu-dist --gpu-graph", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --output example_query_update --update-db --graph-weights --overwrite --gpu-dist --gpu-graph", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --output example_single_query --update-db --overwrite --gpu-dist --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_refine --output example_query --overwrite --gpu-dist --gpu-graph --core --accessory", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_dbscan --output example_query_update --update-db --graph-weights --overwrite --gpu-dist --gpu-graph", shell=True, check=True) # uses graph weights
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query single_query.txt --db example_db --model-dir example_refine --output example_single_query --update-db --overwrite --gpu-dist --gpu-graph", shell=True, check=True)
 subprocess.run(python_cmd + " ../poppunk_assign-runner.py --query some_queries.txt --db example_db --model-dir example_lineages --output example_lineage_query --overwrite --gpu-graph --gpu-dist", shell=True, check=True)

 # viz
 sys.stderr.write("Running visualisations (poppunk_visualise)\n")
 subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --microreact --gpu-graph", shell=True, check=True)
-subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape --network-file example_db/example_db_graph.gt --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --cytoscape --network-file example_db/example_db_graph.csv.gz --gpu-graph", shell=True, check=True)
 subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --phandango --gpu-graph", shell=True, check=True)
 subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz --grapetree --gpu-graph", shell=True, check=True)
 subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_viz_subset --microreact --include-files subset.txt --gpu-graph", shell=True, check=True)
@@ -79,20 +80,24 @@
 # MST
 sys.stderr.write("Running MST\n")
-subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree mst --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_visualise-runner.py --ref-db example_db --output example_mst --microreact --tree both --gpu-graph", shell=True, check=True)
 subprocess.run(python_cmd + " ../poppunk_mst-runner.py --distance-pkl example_db/example_db.dists.pkl --rank-fit example_lineages/example_lineages_rank5_fit.npz --previous-clustering example_dbscan/example_dbscan_clusters.csv --output example_sparse_mst --no-plot --gpu-graph", shell=True, check=True)

 # t-sne
 sys.stderr.write("Running tsne viz\n")
-subprocess.run(python_cmd + " ../poppunk_tsne-runner.py --distances example_db/example_db.dists --output example_tsne --perplexity 5 --verbosity 1 --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_tsne-runner.py --distances example_db/example_db.dists --output example_tsne --perplexity 5 --verbosity 1 --use-gpu", shell=True, check=True)

 # prune
 sys.stderr.write("Running poppunk_prune\n")
-subprocess.run(python_cmd + " ../poppunk_prune-runner.py --distances example_db/example_db.dists --ref-db example_db --remove subset.txt --output example_prune --gpu-dist", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_prune-runner.py --distances example_db/example_db.dists --ref-db example_db --remove subset.txt --output example_prune", shell=True, check=True)

 # references
 sys.stderr.write("Running poppunk_references\n")
-subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.gt --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_references-runner.py --network example_db/example_db_graph.csv.gz --distances example_db/example_db.dists --ref-db example_db --output example_refs --model example_db --use-gpu", shell=True, check=True)
+
+# info
+sys.stderr.write("Running poppunk_info\n")
+subprocess.run(python_cmd + " ../poppunk_info-runner.py --db example_db --output example_db.info.csv --use-gpu", shell=True, check=True)

 # citations
 sys.stderr.write("Printing citations\n")
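The GPU test now points --network-file at example_db_graph.csv.gz rather than a graph-tool .gt file, consistent with the GPU code path writing its networks as gzipped edge lists. A small sketch of inspecting such a file; the exact column layout is an assumption here, beyond there being one edge per row:

import pandas as pd

# pandas infers gzip compression from the .gz extension
edges = pd.read_csv("example_db/example_db_graph.csv.gz")
print(edges.shape)
print(edges.head())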
diff --git a/test/test-update-gpu.py b/test/test-update-gpu.py
new file mode 100755
index 00000000..20887750
--- /dev/null
+++ b/test/test-update-gpu.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+# Copyright 2018-2021 John Lees and Nick Croucher
+
+"""Tests for PopPUNK --update-db order"""
+
+import subprocess
+import os, sys
+import sys
+import shutil
+import pickle
+
+import numpy as np
+from scipy import stats
+import h5py
+import scipy.sparse
+
+import pp_sketchlib
+
+if os.environ.get("POPPUNK_PYTHON"):
+    python_cmd = os.environ.get("POPPUNK_PYTHON")
+else:
+    python_cmd = "python"
+
+def run_regression(x, y, threshold = 0.99):
+    res = stats.linregress(x, y)
+    print("R^2: " + str(res.rvalue**2))
+    if res.rvalue**2 < threshold:
+        sys.stderr.write("Distance matrix order failed!\n")
+        sys.exit(1)
+
+def compare_sparse_matrices(d1,d2,r1,r2):
+    d1_pairs = get_seq_tuples(d1.row,d1.col,r1)
+    d2_pairs = get_seq_tuples(d2.row,d2.col,r2)
+    d1_dists = []
+    d2_dists = []
+
+    for (pair1,dist1) in zip(d1_pairs,d1.data):
+        for (pair2,dist2) in zip(d2_pairs,d2.data):
+            if pair1 == pair2:
+                d1_dists.append(dist1)
+                d2_dists.append(dist2)
+                break
+
+    run_regression(np.asarray(d1_dists),np.asarray(d2_dists))
+
+def get_seq_tuples(rows,cols,names):
+    tuple_list = []
+    for (i,j) in zip(rows,cols):
+        sorted_pair = tuple(sorted((names[i],names[j])))
+        tuple_list.append(sorted_pair)
+    return tuple_list
+
+def old_get_seq_tuples(rows,cols):
+    max_seqs = np.maximum(rows,cols)
+    min_seqs = np.minimum(rows,cols)
+    concat_seqs = np.vstack((max_seqs,min_seqs))
+    seq_pairs = concat_seqs.T
+    seq_tuples = [tuple(row) for row in seq_pairs]
+    return seq_tuples
+
+# Check distances after one query
+
+# Check that order is the same after doing 1 + 2 with --update-db, as doing all of 1 + 2 together
+subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile12.txt --output batch12 --overwrite --gpu-dist", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch12 --ranks 1,2 --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile1.txt --output batch1 --overwrite --gpu-dist", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch1 --ranks 1,2 --gpu-graph", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch1 --query rfile2.txt --output batch2 --update-db --overwrite --gpu-graph --gpu-dist", shell=True, check=True)
+
+# Load updated distances
+X2 = np.load("batch2/batch2.dists.npy")
+with open("batch2/batch2.dists.pkl", 'rb') as pickle_file:
+    rlist2, qlist, self = pickle.load(pickle_file)
+
+# Get same distances from the full database
+ref_db = "batch12/batch12"
+ref_h5 = h5py.File(ref_db + ".h5", 'r')
+db_kmers = sorted(ref_h5['sketches/' + rlist2[0]].attrs['kmers'])
+ref_h5.close()
+X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist2, rlist2, db_kmers,
+                                True, False, 1, False, 0)
+
+# Check distances match
+run_regression(X1[:, 0], X2[:, 0])
+run_regression(X1[:, 1], X2[:, 1])
+
+# Check sparse distances after one query
+with open("batch12/batch12.dists.pkl", 'rb') as pickle_file:
+    rlist1, qlist1, self = pickle.load(pickle_file)
+S1 = scipy.sparse.load_npz("batch12/batch12_rank2_fit.npz")
+S2 = scipy.sparse.load_npz("batch2/batch2_rank2_fit.npz")
+compare_sparse_matrices(S1,S2,rlist1,rlist2)
+
+# Check distances after second query
+
+# Check that order is the same after doing 1 + 2 + 3 with --update-db, as doing all of 1 + 2 + 3 together
+subprocess.run(python_cmd + " ../poppunk-runner.py --create-db --r-files rfile123.txt --output batch123 --overwrite --gpu-dist", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk-runner.py --fit-model lineage --ref-db batch123 --ranks 1,2 --gpu-graph --gpu-dist", shell=True, check=True)
+subprocess.run(python_cmd + " ../poppunk_assign-runner.py --db batch2 --query rfile3.txt --output batch3 --update-db --overwrite --gpu-graph --gpu-dist", shell=True, check=True)
+
+# Load updated distances
+X2 = np.load("batch3/batch3.dists.npy")
+with open("batch3/batch3.dists.pkl", 'rb') as pickle_file:
+    rlist4, qlist, self = pickle.load(pickle_file)
+
+# Get same distances from the full database
+ref_db = "batch123/batch123"
+ref_h5 = h5py.File(ref_db + ".h5", 'r')
+db_kmers = sorted(ref_h5['sketches/' + rlist4[0]].attrs['kmers'])
+ref_h5.close()
+X1 = pp_sketchlib.queryDatabase(ref_db, ref_db, rlist4, rlist4, db_kmers,
+                                True, False, 1, False, 0)
+
+# Check distances match
+run_regression(X1[:, 0], X2[:, 0])
+run_regression(X1[:, 1], X2[:, 1])
+
+# Check sparse distances after second query
+with open("batch123/batch123.dists.pkl", 'rb') as pickle_file:
+    rlist3, qlist, self = pickle.load(pickle_file)
+S3 = scipy.sparse.load_npz("batch123/batch123_rank2_fit.npz")
+S4 = scipy.sparse.load_npz("batch3/batch3_rank2_fit.npz")
+
+compare_sparse_matrices(S3,S4,rlist3,rlist4)
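The helpers in test-update-gpu.py match sparse entries by isolate name rather than by matrix position, so databases built in a different order can still be compared. A small self-contained check of the same idea, calling the compare_sparse_matrices defined above on toy distances with deliberately permuted name orders:

import numpy as np
import scipy.sparse

names1 = ["s1", "s2", "s3"]
names2 = ["s3", "s1", "s2"]  # same isolates, different order

# Distances for pairs (s1,s2), (s1,s3), (s2,s3), encoded under each ordering
D1 = scipy.sparse.coo_matrix(([0.1, 0.2, 0.3], ([0, 0, 1], [1, 2, 2])), shape=(3, 3))
D2 = scipy.sparse.coo_matrix(([0.2, 0.3, 0.1], ([0, 0, 1], [1, 2, 2])), shape=(3, 3))

compare_sparse_matrices(D1, D2, names1, names2)  # prints R^2: 1.0 and does not exit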
diff --git a/test/test-web.py b/test/test-web.py
index a69505c1..a12ba280 100644
--- a/test/test-web.py
+++ b/test/test-web.py
@@ -26,6 +26,7 @@ def main():
         os.mkdir(outdir)
     args = default_options(species_db)
     qc_dict = {'run_qc': False }
+    print("Weights: " + str(args.assign.graph_weights))
     dbFuncs = setupDBFuncs(args.assign, args.assign.min_kmer_count, qc_dict)
     ClusterResult = assign_query(dbFuncs,
                                  args.assign.ref_db,
@@ -38,7 +39,7 @@ def main():
                                  args.assign.threads,
                                  args.assign.overwrite,
                                  args.assign.plot_fit,
-                                 args.assign.graph_weights,
+                                 False, #args.assign.graph_weights,
                                  args.assign.max_a_dist,
                                  args.assign.max_pi_dist,
                                  args.assign.type_isolate,
@@ -60,13 +61,14 @@ def main():
     colours = get_colours(query, clusters)
     url = api(query, "example_viz")
     sys.stderr.write('PopPUNK-web assign test successful\n')
-
+    print("Done clustering")
     # Test generate_visualisations() for PopPUNK-web
     sys.stderr.write('\nTesting visualisations for PopPUNK-web\n')
     if len(to_include) < 3:
         args.visualise.microreact = False
     generate_visualisations(outdir,
                             species_db,
+                            os.path.join(outdir, outdir + '.dists'), # distances,
                             None,
                             args.visualise.threads,
                             outdir,
@@ -83,6 +85,8 @@ def main():
                             species_db,
                             species_db + "/" + os.path.basename(species_db) + "_clusters.csv",
                             args.visualise.previous_query_clustering,
+                            None, # previous MST
+                            None, # previous distances,
                             outdir + "/" + os.path.basename(outdir) + "_graph.gt",
                             args.visualise.gpu_graph,
                             args.visualise.info_csv,
@@ -128,4 +132,4 @@ def main():
     sys.stderr.write('\nAPI tests complete\n')

 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
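For reference, the .dists files loaded in these tests come as a pair: a pickle holding the reference and query name lists plus a self-comparison flag, and a NumPy array with one row per pair of isolates. A short sketch of that layout, reusing the batch2 output from test-update-gpu.py; the core/accessory column order is the usual PopPUNK convention rather than something stated in this diff:

import pickle
import numpy as np

with open("batch2/batch2.dists.pkl", 'rb') as pickle_file:
    rlist, qlist, self_comparison = pickle.load(pickle_file)
dists = np.load("batch2/batch2.dists.npy")

# For a self-comparison the rows follow condensed upper-triangle order,
# so there are n * (n - 1) / 2 of them; column 0 is core, column 1 accessory
n = len(rlist)
assert not self_comparison or dists.shape[0] == n * (n - 1) // 2
print(dists.shape)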