add tqdm progress bars to main loops

luponzo86 · Nov 14, 2019 · 050ea7b · 050ea7b
1 parent a8a34af
commit 050ea7b
Show file tree

Hide file tree

Showing 6 changed files with 388 additions and 28 deletions.
diff --git a/rhapsody/features/PDB.py b/rhapsody/features/PDB.py
@@ -3,16 +3,18 @@
 PDB-based structural and dynamical features in a single place, and a
 function for using the latter on a list of PDB SAV coordinates."""
 
+import numpy as np
+import pickle
+import datetime
+import os
+from tqdm import tqdm
 from prody import Atomic, parsePDB, writePDB, LOGGER, SETTINGS
 from prody import GNM, ANM, calcSqFlucts
 from prody import calcPerturbResponse, calcMechStiff
 # from prody import calcMBS
 from prody import reduceModel, sliceModel
 from prody import execDSSP, parseDSSP
-import numpy as np
-import pickle
-import datetime
-import os
+
 
 __all__ = ['STR_FEATS', 'DYN_FEATS', 'PDB_FEATS',
            'PDBfeatures', 'calcPDBfeatures']
@@ -646,7 +648,7 @@ def calcSelFeatures(self, chain='all', resid=None, sel_feats=None):
 
 
 def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None,
-                    refresh=False):
+                    refresh=False, status_file=None, status_prefix=None):
     LOGGER.info('Computing structural and dynamical features '
                 'from PDB structures...')
     LOGGER.timeit('_calcPDBFeats')
@@ -665,24 +667,40 @@ def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None,
     else:
         # no need to sort when using a custom PDB or PDBID
         sorting_map = range(num_SAVs)
+    # define how to report progress
+    if status_prefix is None:
+        status_prefix = ''
+    bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
+    if status_file is not None:
+        status_file = open(status_file, 'w')
+        progress_bar = tqdm(
+            [(i, mapped_SAVs[i]) for i in sorting_map], file=status_file,
+            bar_format=bar_format+'\n')
+    else:
+        progress_bar = tqdm(
+            [(i, mapped_SAVs[i]) for i in sorting_map], bar_format=bar_format)
     cache = {'PDBID': None, 'chain': None, 'obj': None}
     count = 0
-    for indx, SAV in [(i, mapped_SAVs[i]) for i in sorting_map]:
+    for indx, SAV in progress_bar:
         count += 1
         if SAV['PDB size'] == 0:
             # SAV could not be mapped to PDB
             _features = np.nan
             SAV_coords = SAV['SAV coords']
-            LOGGER.info(f"[{count}/{num_SAVs}] SAV '{SAV_coords}' "
-                        "couldn't be mapped to PDB")
+            progress_msg = f"{status_prefix}No PDB for SAV '{SAV_coords}'"
         else:
             parsed_PDB_coords = SAV['PDB SAV coords'].split()
             PDBID, chID = parsed_PDB_coords[:2]
             resid = int(parsed_PDB_coords[2])
-            LOGGER.info("[{}/{}] Analizing mutation site {}:{} {}..."
-                        .format(count, num_SAVs, PDBID, chID, resid))
+            progress_msg = status_prefix + \
+                f'Analizing mutation site {PDBID}:{chID} {resid}'
             # chID == "?" stands for "empty space"
             chID = " " if chID == "?" else chID
+        # report progress
+        # LOGGER.info(f"[{count}/{num_SAVs}] {progress_msg}...")
+        progress_bar.set_description(progress_msg)
+        # compute PDB features, if possible
+        if SAV['PDB size'] != 0:
             if PDBID == cache['PDBID']:
                 # use PDBfeatures instance from previous iteration
                 obj = cache['obj']
@@ -728,4 +746,6 @@ def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None,
            and custom_PDB is None:
             cache['obj'].savePickle()
     LOGGER.report('PDB features have been computed in %.1fs.', '_calcPDBFeats')
+    if status_file:
+        os.remove(status_file.name)
     return features
diff --git a/rhapsody/features/Pfam.py b/rhapsody/features/Pfam.py
@@ -3,7 +3,9 @@
 coevolution properties of an amino acid substitution from a Pfam
 multiple sequence alignment."""
 
+import os
 import numpy as np
+from tqdm import tqdm
 from prody import LOGGER
 from .Uniprot import UniprotMapping
 
@@ -38,7 +40,7 @@ def calcNormRank(array, i):
         return feats
 
 
-def calcPfamFeatures(SAVs):
+def calcPfamFeatures(SAVs, status_file=None, status_prefix=None):
     LOGGER.info('Computing sequence properties from Pfam domains...')
     LOGGER.timeit('_calcPfamFeats')
     # sort SAVs, so to group together those
@@ -49,14 +51,29 @@ def calcPfamFeatures(SAVs):
     num_SAVs = len(SAVs)
     feat_dtype = np.dtype([('entropy', 'f'), ('ranked_MI', 'f')])
     features = np.zeros(num_SAVs, dtype=feat_dtype)
+    # define how to report progress
+    if status_prefix is None:
+        status_prefix = ''
+    bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
+    if status_file is not None:
+        status_file = open(status_file, 'w')
+        progress_bar = tqdm(
+            [(i, SAVs[i]) for i in sorting_map], file=status_file,
+            bar_format=bar_format+'\n')
+    else:
+        progress_bar = tqdm(
+            [(i, SAVs[i]) for i in sorting_map], bar_format=bar_format)
     # map to Pfam domains using UniprotMapping class
     cache = {'acc': None, 'obj': None, 'warn': ''}
     count = 0
-    for indx, SAV in [(i, SAVs[i]) for i in sorting_map]:
+    for indx, SAV in progress_bar:
         count += 1
         acc, pos, aa1, aa2 = SAV.split()
         pos = int(pos)
-        LOGGER.info(f"[{count}/{num_SAVs}] Mapping SAV '{SAV}' to Pfam...")
+        # report progress
+        progress_msg = f"{status_prefix}Mapping SAV '{SAV}' to Pfam"
+        # LOGGER.info(f"[{count}/{num_SAVs}] {progress_msg}...")
+        progress_bar.set_description(progress_msg)
         # map to Pfam domains using 'UniprotMapping' class
         if acc == cache['acc']:
             # use object from previous iteration
@@ -102,4 +119,6 @@ def calcPfamFeatures(SAVs):
             cache['obj'].savePickle()
     LOGGER.report('SAVs have been mapped on Pfam domains and sequence '
                   'properties have been computed in %.1fs.', '_calcPfamFeats')
+    if status_file:
+        os.remove(status_file.name)
     return features
diff --git a/rhapsody/features/Uniprot.py b/rhapsody/features/Uniprot.py
@@ -9,6 +9,7 @@
 import numpy as np
 import prody as pd
 from prody import LOGGER, SETTINGS
+from tqdm import tqdm
 from Bio.pairwise2 import align as bioalign
 from Bio.pairwise2 import format_alignment
 from Bio.SubsMat import MatrixInfo as matlist
@@ -758,28 +759,45 @@ def calcEvolProperties(self, resid='all', refresh=False, folder=None,
         return {k: self.Pfam[k] for k in PF_list}
 
 
-def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False):
+def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False,
+                status_file=None, status_prefix=None):
     LOGGER.info('Mapping SAVs to PDB structures...')
     LOGGER.timeit('_map2PDB')
     # sort SAVs, so to group together those
     # with identical accession number
     accs = [s.split()[0] for s in SAV_coords]
     sorting_map = np.argsort(accs)
     # define a structured array
-    PDBmap_dtype = np.dtype([('orig. SAV coords', 'U25'),
-                             ('unique SAV coords', 'U25'),
-                             ('PDB SAV coords', 'U100'),
-                             ('PDB size', 'i')])
+    PDBmap_dtype = np.dtype([
+        ('orig. SAV coords', 'U25'),
+        ('unique SAV coords', 'U25'),
+        ('PDB SAV coords', 'U100'),
+        ('PDB size', 'i')])
     nSAVs = len(SAV_coords)
     mapped_SAVs = np.zeros(nSAVs, dtype=PDBmap_dtype)
+    # define how to report progress
+    if status_prefix is None:
+        status_prefix = ''
+    bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
+    if status_file is not None:
+        status_file = open(status_file, 'w')
+        progress_bar = tqdm(
+            [(i, SAV_coords[i]) for i in sorting_map], file=status_file,
+            bar_format=bar_format+'\n')
+    else:
+        progress_bar = tqdm(
+            [(i, SAV_coords[i]) for i in sorting_map], bar_format=bar_format)
     # map to PDB using Uniprot class
     cache = {'acc': None, 'obj': None}
     count = 0
-    for indx, SAV in [(i, SAV_coords[i]) for i in sorting_map]:
+    for indx, SAV in progress_bar:
         count += 1
         acc, pos, aa1, aa2 = SAV.split()
         pos = int(pos)
-        LOGGER.info(f"[{count}/{nSAVs}] Mapping SAV '{SAV}' to PDB...")
+        # report progress
+        progress_msg = f"{status_prefix}Mapping SAV '{SAV}' to PDB"
+        # LOGGER.info(f"[{count}/{nSAVs}] {progress_msg}...")
+        progress_bar.set_description(progress_msg)
         # map Uniprot to PDB chains
         if acc == cache['acc']:
             # use mapping from previous iteration
@@ -836,6 +854,8 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False):
     n = sum(mapped_SAVs['PDB size'] != 0)
     LOGGER.report(f'{n} out of {nSAVs} SAVs have been mapped to PDB in %.1fs.',
                   '_map2PDB')
+    if status_file:
+        os.remove(status_file.name)
     return mapped_SAVs
 
 

diff --git a/rhapsody/predict/core.py b/rhapsody/predict/core.py
@@ -22,7 +22,8 @@ class Rhapsody:
     EVmutation.
     """
 
-    def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True):
+    def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True,
+                 **kwargs):
         """ Initialize a Rhapsody object with a list of SAVs (optional).
 
         :arg query: Single Amino Acid Variants (SAVs) in Uniprot coordinates.
@@ -53,6 +54,14 @@ def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True):
 
         assert query_type in ('SAVs', 'PolyPhen2')
         assert isinstance(queryPolyPhen2, bool)
+        valid_kwargs = [
+            'status_file_Uniprot',
+            'status_file_PDB',
+            'status_file_Pfam',
+            'status_prefix_Uniprot',
+            'status_prefix_PDB',
+            'status_prefix_Pfam']
+        assert all([k in valid_kwargs for k in kwargs])
 
         # masked NumPy array that will contain all info abut SAVs
         self.data = None
@@ -103,6 +112,8 @@ def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True):
         self.classifier = None
         self.aux_classifier = None
         self.featSet = None
+        # options
+        self.options = kwargs
 
         if query is None:
             # a SAV list can be uploaded later with setSAVs()
@@ -283,6 +294,8 @@ def getUniprot2PDBmap(self, filename='rhapsody-Uniprot2PDB.txt',
             # compute mapping
             m = Uniprot.mapSAVs2PDB(
                 self.data['SAV coords'], custom_PDB=self.customPDB,
+                status_file=self.options.get('status_file_Uniprot'),
+                status_prefix=self.options.get('status_prefix_Uniprot'),
                 refresh=refresh)
             self.data['unique SAV coords'] = m['unique SAV coords']
             self.data['PDB SAV coords'] = m['PDB SAV coords']
@@ -403,15 +416,20 @@ def _calcFeatMatrix(self, refresh=False):
             # compute structural and dynamical features from a PDB structure
             f = PDB.calcPDBfeatures(
                 Uniprot2PDBmap, sel_feats=sel_PDBfeats,
-                custom_PDB=self.customPDB, refresh=refresh)
+                custom_PDB=self.customPDB, refresh=refresh,
+                status_file=self.options.get('status_file_PDB'),
+                status_prefix=self.options.get('status_prefix_PDB'))
             all_feats.append(f)
         if RHAPSODY_FEATS['BLOSUM'].intersection(self.featSet):
             # retrieve BLOSUM values
             f = BLOSUM.calcBLOSUMfeatures(self.data['SAV coords'])
             all_feats.append(f)
         if RHAPSODY_FEATS['Pfam'].intersection(self.featSet):
             # compute sequence properties from Pfam domains
-            f = Pfam.calcPfamFeatures(self.data['SAV coords'])
+            f = Pfam.calcPfamFeatures(
+                self.data['SAV coords'],
+                status_file=self.options.get('status_file_Pfam'),
+                status_prefix=self.options.get('status_prefix_Pfam'))
             all_feats.append(f)
         if RHAPSODY_FEATS['EVmut'].intersection(self.featSet):
             # recover EVmutation data

diff --git a/rhapsody/predict/main.py b/rhapsody/predict/main.py
@@ -12,7 +12,7 @@
 def rhapsody(query, query_type='SAVs',
              main_classifier=None, aux_classifier=None,
              custom_PDB=None, force_env=None,
-             refresh=False, log=True):
+             refresh=False, log=True, **kwargs):
     """Obtain Rhapsody pathogenicity predictions on a list of human missense
     variants ([ref]_)
 
@@ -77,7 +77,7 @@ def rhapsody(query, query_type='SAVs',
             aux_classifier = getDefaultClassifiers()['reduced']
 
     # initialize object that will contain all results and predictions
-    r = Rhapsody()
+    r = Rhapsody(**kwargs)
 
     # import classifiers and feature set from pickle
     r.importClassifiers(main_classifier, aux_classifier, force_env=force_env)
@@ -99,9 +99,9 @@ def rhapsody(query, query_type='SAVs',
     r.printPredictions()
     if aux_classifier is not None:
         # print both 'full' and 'reduced' predictions in a more detailed format
-        r.printPredictions(classifier="both",
-                           PolyPhen2=False, EVmutation=False,
-                           filename='rhapsody-predictions-full_vs_reduced.txt')
+        r.printPredictions(
+            classifier="both", PolyPhen2=False, EVmutation=False,
+            filename='rhapsody-predictions-full_vs_reduced.txt')
 
     # save pickle
     r.savePickle()