Combine readClusters functions

Small changes to fix issues in b07ebc2: a dict, not a dict of sets, needs to be returned for generate viz (#44).
bacpop · May 28, 2019 · 7d36c71 · 7d36c71
1 parent c1b4d42
commit 7d36c71
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 36 deletions.
diff --git a/PopPUNK/__init__.py b/PopPUNK/__init__.py
@@ -3,4 +3,4 @@
 
 '''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''
 
-__version__ = '1.1.6'
+__version__ = '1.1.7'
diff --git a/PopPUNK/__main__.py b/PopPUNK/__main__.py
@@ -9,7 +9,6 @@
 import numpy as np
 import networkx as nx
 import subprocess
-from collections import defaultdict
 
 # import poppunk package
 from .__init__ import __version__
@@ -41,7 +40,7 @@
 from .utils import readPickle
 from .utils import writeTmpFile
 from .utils import qcDistMat
-from .utils import readClustersToDict
+from .utils import readClusters
 from .utils import translate_distMat
 from .utils import update_distance_matrices
 
@@ -426,21 +425,14 @@ def main():
 
             # Read in network and cluster assignment
             genomeNetwork, cluster_file = fetchNetwork(prev_clustering, model, rlist, args.core_only, args.accessory_only)
-            isolateClustering = defaultdict(set)
-            if args.core_only:
-                isolateClustering['core'] = readClustersToDict(cluster_file)
-            elif args.core_only:
-                isolateClustering['accessory'] = readClustersToDict(cluster_file)
-            else:
-                isolateClustering['combined'] = readClustersToDict(cluster_file)
+            isolateClustering = {'combined': readClusters(cluster_file, return_dict=True)}
 
             # extract subset of distances if requested
             if args.subset is not None:
                 viz_subset = []
                 with open(args.subset, 'r') as assemblyFiles:
                     for assembly in assemblyFiles:
                         viz_subset.append(assembly.rstrip())
-                viz_subset_set = set(viz_subset)
 
                 # Use the same code as no full_db in assign_query to take a subset
                 dists_out = args.output + "/" + os.path.basename(args.output) + ".dists"

diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py
@@ -142,49 +142,38 @@ def qcDistMat(distMat, refList, queryList, a_max):
     return passed
 
 
-def readClusters(clustCSV):
+def readClusters(clustCSV, return_dict=False):
     """Read a previous reference clustering from CSV
 
     Args:
         clustCSV (str)
             File name of CSV with previous cluster assignments
+        return_dict (bool)
+            If True, return a dict with sample->cluster instead
+            of sets
 
     Returns:
         clusters (dict)
             Dictionary of cluster assignments (keys are cluster names, values are
-            sets containing samples in the cluster)
+            sets containing samples in the cluster). Or if return_dict is set keys
+            are sample names, values are cluster assignments.
     """
-    clusters = defaultdict(set)
+    if return_dict:
+        clusters = {}
+    else:
+        clusters = defaultdict(set)
 
     with open(clustCSV, 'r') as csv_file:
         header = csv_file.readline()
         for line in csv_file:
             (sample, clust_id) = line.rstrip().split(",")
-            clusters[clust_id].add(sample)
+            if return_dict:
+                clusters[sample] = clust_id
+            else:
+                clusters[clust_id].add(sample)
 
     return clusters
 
-def readClustersToDict(clustCSV):
-    """Read a previous reference clustering from CSV
-        
-    Args:
-        clustCSV (str)
-            File name of CSV with previous cluster assignments
-        
-    Returns:
-        clusters (dict)
-            Dictionary of cluster assignments (keys are sample names, values are
-            cluster assignments)
-        """
-    clusters = {}
-
-    with open(clustCSV, 'r') as csv_file:
-        header = csv_file.readline()
-        for line in csv_file:
-            (sample, clust_id) = line.rstrip().split(",")
-            clusters[sample] = clust_id
-
-    return clusters
 
 def readExternalClusters(clustCSV):
     """Read a cluster definition from CSV (does not have to be PopPUNK
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,4 +3,4 @@

		'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

		-__version__ = '1.1.6'
		+__version__ = '1.1.7'