From 7d36c71b2415a827119b347a3652cc9e6224917b Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 28 May 2019 16:53:32 -0400 Subject: [PATCH] Combine readClusters functions Small changes to fix in b07ebc2: need a dict not dict of sets returned for generate viz (#44). --- PopPUNK/__init__.py | 2 +- PopPUNK/__main__.py | 12 ++---------- PopPUNK/utils.py | 39 ++++++++++++++------------------------- 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py index ca9143e5..7b607e67 100644 --- a/PopPUNK/__init__.py +++ b/PopPUNK/__init__.py @@ -3,4 +3,4 @@ '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)''' -__version__ = '1.1.6' +__version__ = '1.1.7' diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py index 04d86b70..04de9948 100644 --- a/PopPUNK/__main__.py +++ b/PopPUNK/__main__.py @@ -9,7 +9,6 @@ import numpy as np import networkx as nx import subprocess -from collections import defaultdict # import poppunk package from .__init__ import __version__ @@ -41,7 +40,7 @@ from .utils import readPickle from .utils import writeTmpFile from .utils import qcDistMat -from .utils import readClustersToDict +from .utils import readClusters from .utils import translate_distMat from .utils import update_distance_matrices @@ -426,13 +425,7 @@ def main(): # Read in network and cluster assignment genomeNetwork, cluster_file = fetchNetwork(prev_clustering, model, rlist, args.core_only, args.accessory_only) - isolateClustering = defaultdict(set) - if args.core_only: - isolateClustering['core'] = readClustersToDict(cluster_file) - elif args.core_only: - isolateClustering['accessory'] = readClustersToDict(cluster_file) - else: - isolateClustering['combined'] = readClustersToDict(cluster_file) + isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)} # extract subset of distances if requested if args.subset is not None: @@ -440,7 +433,6 @@ def main(): with open(args.subset, 'r') as assemblyFiles: for 
assembly in assemblyFiles: viz_subset.append(assembly.rstrip()) - viz_subset_set = set(viz_subset) # Use the same code as no full_db in assign_query to take a subset dists_out = args.output + "/" + os.path.basename(args.output) + ".dists" diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py index 65fbd809..f2b28700 100644 --- a/PopPUNK/utils.py +++ b/PopPUNK/utils.py @@ -142,49 +142,38 @@ def qcDistMat(distMat, refList, queryList, a_max): return passed -def readClusters(clustCSV): +def readClusters(clustCSV, return_dict=False): """Read a previous reference clustering from CSV Args: clustCSV (str) File name of CSV with previous cluster assignments + return_dict (bool) + If True, return a dict with sample->cluster instead + of sets Returns: clusters (dict) Dictionary of cluster assignments (keys are cluster names, values are - sets containing samples in the cluster) + sets containing samples in the cluster). Or if return_dict is set keys + are sample names, values are cluster assignments. """ - clusters = defaultdict(set) + if return_dict: + clusters = {} + else: + clusters = defaultdict(set) with open(clustCSV, 'r') as csv_file: header = csv_file.readline() for line in csv_file: (sample, clust_id) = line.rstrip().split(",") - clusters[clust_id].add(sample) + if return_dict: + clusters[sample] = clust_id + else: + clusters[clust_id].add(sample) return clusters -def readClustersToDict(clustCSV): - """Read a previous reference clustering from CSV - - Args: - clustCSV (str) - File name of CSV with previous cluster assignments - - Returns: - clusters (dict) - Dictionary of cluster assignments (keys are sample names, values are - cluster assignments) - """ - clusters = {} - - with open(clustCSV, 'r') as csv_file: - header = csv_file.readline() - for line in csv_file: - (sample, clust_id) = line.rstrip().split(",") - clusters[sample] = clust_id - - return clusters def readExternalClusters(clustCSV): """Read a cluster definition from CSV (does not have to be PopPUNK