Skip to content

Commit

Permalink
add tqdm progress bars to main loops
Browse files Browse the repository at this point in the history
  • Loading branch information
luponzo86 committed Nov 14, 2019
1 parent a8a34af commit 050ea7b
Show file tree
Hide file tree
Showing 6 changed files with 388 additions and 28 deletions.
40 changes: 30 additions & 10 deletions rhapsody/features/PDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@
PDB-based structural and dynamical features in a single place, and a
function for using the latter on a list of PDB SAV coordinates."""

import numpy as np
import pickle
import datetime
import os
from tqdm import tqdm
from prody import Atomic, parsePDB, writePDB, LOGGER, SETTINGS
from prody import GNM, ANM, calcSqFlucts
from prody import calcPerturbResponse, calcMechStiff
# from prody import calcMBS
from prody import reduceModel, sliceModel
from prody import execDSSP, parseDSSP
import numpy as np
import pickle
import datetime
import os


__all__ = ['STR_FEATS', 'DYN_FEATS', 'PDB_FEATS',
'PDBfeatures', 'calcPDBfeatures']
Expand Down Expand Up @@ -646,7 +648,7 @@ def calcSelFeatures(self, chain='all', resid=None, sel_feats=None):


def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None,
refresh=False):
refresh=False, status_file=None, status_prefix=None):
LOGGER.info('Computing structural and dynamical features '
'from PDB structures...')
LOGGER.timeit('_calcPDBFeats')
Expand All @@ -665,24 +667,40 @@ def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None,
else:
# no need to sort when using a custom PDB or PDBID
sorting_map = range(num_SAVs)
# define how to report progress
if status_prefix is None:
status_prefix = ''
bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
if status_file is not None:
status_file = open(status_file, 'w')
progress_bar = tqdm(
[(i, mapped_SAVs[i]) for i in sorting_map], file=status_file,
bar_format=bar_format+'\n')
else:
progress_bar = tqdm(
[(i, mapped_SAVs[i]) for i in sorting_map], bar_format=bar_format)
cache = {'PDBID': None, 'chain': None, 'obj': None}
count = 0
for indx, SAV in [(i, mapped_SAVs[i]) for i in sorting_map]:
for indx, SAV in progress_bar:
count += 1
if SAV['PDB size'] == 0:
# SAV could not be mapped to PDB
_features = np.nan
SAV_coords = SAV['SAV coords']
LOGGER.info(f"[{count}/{num_SAVs}] SAV '{SAV_coords}' "
"couldn't be mapped to PDB")
progress_msg = f"{status_prefix}No PDB for SAV '{SAV_coords}'"
else:
parsed_PDB_coords = SAV['PDB SAV coords'].split()
PDBID, chID = parsed_PDB_coords[:2]
resid = int(parsed_PDB_coords[2])
LOGGER.info("[{}/{}] Analizing mutation site {}:{} {}..."
.format(count, num_SAVs, PDBID, chID, resid))
progress_msg = status_prefix + \
f'Analizing mutation site {PDBID}:{chID} {resid}'
# chID == "?" stands for "empty space"
chID = " " if chID == "?" else chID
# report progress
# LOGGER.info(f"[{count}/{num_SAVs}] {progress_msg}...")
progress_bar.set_description(progress_msg)
# compute PDB features, if possible
if SAV['PDB size'] != 0:
if PDBID == cache['PDBID']:
# use PDBfeatures instance from previous iteration
obj = cache['obj']
Expand Down Expand Up @@ -728,4 +746,6 @@ def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None,
and custom_PDB is None:
cache['obj'].savePickle()
LOGGER.report('PDB features have been computed in %.1fs.', '_calcPDBFeats')
if status_file:
os.remove(status_file.name)
return features
25 changes: 22 additions & 3 deletions rhapsody/features/Pfam.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
coevolution properties of an amino acid substitution from a Pfam
multiple sequence alignment."""

import os
import numpy as np
from tqdm import tqdm
from prody import LOGGER
from .Uniprot import UniprotMapping

Expand Down Expand Up @@ -38,7 +40,7 @@ def calcNormRank(array, i):
return feats


def calcPfamFeatures(SAVs):
def calcPfamFeatures(SAVs, status_file=None, status_prefix=None):
LOGGER.info('Computing sequence properties from Pfam domains...')
LOGGER.timeit('_calcPfamFeats')
# sort SAVs, so to group together those
Expand All @@ -49,14 +51,29 @@ def calcPfamFeatures(SAVs):
num_SAVs = len(SAVs)
feat_dtype = np.dtype([('entropy', 'f'), ('ranked_MI', 'f')])
features = np.zeros(num_SAVs, dtype=feat_dtype)
# define how to report progress
if status_prefix is None:
status_prefix = ''
bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
if status_file is not None:
status_file = open(status_file, 'w')
progress_bar = tqdm(
[(i, SAVs[i]) for i in sorting_map], file=status_file,
bar_format=bar_format+'\n')
else:
progress_bar = tqdm(
[(i, SAVs[i]) for i in sorting_map], bar_format=bar_format)
# map to Pfam domains using UniprotMapping class
cache = {'acc': None, 'obj': None, 'warn': ''}
count = 0
for indx, SAV in [(i, SAVs[i]) for i in sorting_map]:
for indx, SAV in progress_bar:
count += 1
acc, pos, aa1, aa2 = SAV.split()
pos = int(pos)
LOGGER.info(f"[{count}/{num_SAVs}] Mapping SAV '{SAV}' to Pfam...")
# report progress
progress_msg = f"{status_prefix}Mapping SAV '{SAV}' to Pfam"
# LOGGER.info(f"[{count}/{num_SAVs}] {progress_msg}...")
progress_bar.set_description(progress_msg)
# map to Pfam domains using 'UniprotMapping' class
if acc == cache['acc']:
# use object from previous iteration
Expand Down Expand Up @@ -102,4 +119,6 @@ def calcPfamFeatures(SAVs):
cache['obj'].savePickle()
LOGGER.report('SAVs have been mapped on Pfam domains and sequence '
'properties have been computed in %.1fs.', '_calcPfamFeats')
if status_file:
os.remove(status_file.name)
return features
34 changes: 27 additions & 7 deletions rhapsody/features/Uniprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np
import prody as pd
from prody import LOGGER, SETTINGS
from tqdm import tqdm
from Bio.pairwise2 import align as bioalign
from Bio.pairwise2 import format_alignment
from Bio.SubsMat import MatrixInfo as matlist
Expand Down Expand Up @@ -758,28 +759,45 @@ def calcEvolProperties(self, resid='all', refresh=False, folder=None,
return {k: self.Pfam[k] for k in PF_list}


def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False):
def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False,
status_file=None, status_prefix=None):
LOGGER.info('Mapping SAVs to PDB structures...')
LOGGER.timeit('_map2PDB')
# sort SAVs, so to group together those
# with identical accession number
accs = [s.split()[0] for s in SAV_coords]
sorting_map = np.argsort(accs)
# define a structured array
PDBmap_dtype = np.dtype([('orig. SAV coords', 'U25'),
('unique SAV coords', 'U25'),
('PDB SAV coords', 'U100'),
('PDB size', 'i')])
PDBmap_dtype = np.dtype([
('orig. SAV coords', 'U25'),
('unique SAV coords', 'U25'),
('PDB SAV coords', 'U100'),
('PDB size', 'i')])
nSAVs = len(SAV_coords)
mapped_SAVs = np.zeros(nSAVs, dtype=PDBmap_dtype)
# define how to report progress
if status_prefix is None:
status_prefix = ''
bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
if status_file is not None:
status_file = open(status_file, 'w')
progress_bar = tqdm(
[(i, SAV_coords[i]) for i in sorting_map], file=status_file,
bar_format=bar_format+'\n')
else:
progress_bar = tqdm(
[(i, SAV_coords[i]) for i in sorting_map], bar_format=bar_format)
# map to PDB using Uniprot class
cache = {'acc': None, 'obj': None}
count = 0
for indx, SAV in [(i, SAV_coords[i]) for i in sorting_map]:
for indx, SAV in progress_bar:
count += 1
acc, pos, aa1, aa2 = SAV.split()
pos = int(pos)
LOGGER.info(f"[{count}/{nSAVs}] Mapping SAV '{SAV}' to PDB...")
# report progress
progress_msg = f"{status_prefix}Mapping SAV '{SAV}' to PDB"
# LOGGER.info(f"[{count}/{nSAVs}] {progress_msg}...")
progress_bar.set_description(progress_msg)
# map Uniprot to PDB chains
if acc == cache['acc']:
# use mapping from previous iteration
Expand Down Expand Up @@ -836,6 +854,8 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False):
n = sum(mapped_SAVs['PDB size'] != 0)
LOGGER.report(f'{n} out of {nSAVs} SAVs have been mapped to PDB in %.1fs.',
'_map2PDB')
if status_file:
os.remove(status_file.name)
return mapped_SAVs


Expand Down
24 changes: 21 additions & 3 deletions rhapsody/predict/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ class Rhapsody:
EVmutation.
"""

def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True):
def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True,
**kwargs):
""" Initialize a Rhapsody object with a list of SAVs (optional).
:arg query: Single Amino Acid Variants (SAVs) in Uniprot coordinates.
Expand Down Expand Up @@ -53,6 +54,14 @@ def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True):

assert query_type in ('SAVs', 'PolyPhen2')
assert isinstance(queryPolyPhen2, bool)
valid_kwargs = [
'status_file_Uniprot',
'status_file_PDB',
'status_file_Pfam',
'status_prefix_Uniprot',
'status_prefix_PDB',
'status_prefix_Pfam']
assert all([k in valid_kwargs for k in kwargs])

# masked NumPy array that will contain all info abut SAVs
self.data = None
Expand Down Expand Up @@ -103,6 +112,8 @@ def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True):
self.classifier = None
self.aux_classifier = None
self.featSet = None
# options
self.options = kwargs

if query is None:
# a SAV list can be uploaded later with setSAVs()
Expand Down Expand Up @@ -283,6 +294,8 @@ def getUniprot2PDBmap(self, filename='rhapsody-Uniprot2PDB.txt',
# compute mapping
m = Uniprot.mapSAVs2PDB(
self.data['SAV coords'], custom_PDB=self.customPDB,
status_file=self.options.get('status_file_Uniprot'),
status_prefix=self.options.get('status_prefix_Uniprot'),
refresh=refresh)
self.data['unique SAV coords'] = m['unique SAV coords']
self.data['PDB SAV coords'] = m['PDB SAV coords']
Expand Down Expand Up @@ -403,15 +416,20 @@ def _calcFeatMatrix(self, refresh=False):
# compute structural and dynamical features from a PDB structure
f = PDB.calcPDBfeatures(
Uniprot2PDBmap, sel_feats=sel_PDBfeats,
custom_PDB=self.customPDB, refresh=refresh)
custom_PDB=self.customPDB, refresh=refresh,
status_file=self.options.get('status_file_PDB'),
status_prefix=self.options.get('status_prefix_PDB'))
all_feats.append(f)
if RHAPSODY_FEATS['BLOSUM'].intersection(self.featSet):
# retrieve BLOSUM values
f = BLOSUM.calcBLOSUMfeatures(self.data['SAV coords'])
all_feats.append(f)
if RHAPSODY_FEATS['Pfam'].intersection(self.featSet):
# compute sequence properties from Pfam domains
f = Pfam.calcPfamFeatures(self.data['SAV coords'])
f = Pfam.calcPfamFeatures(
self.data['SAV coords'],
status_file=self.options.get('status_file_Pfam'),
status_prefix=self.options.get('status_prefix_Pfam'))
all_feats.append(f)
if RHAPSODY_FEATS['EVmut'].intersection(self.featSet):
# recover EVmutation data
Expand Down
10 changes: 5 additions & 5 deletions rhapsody/predict/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
def rhapsody(query, query_type='SAVs',
main_classifier=None, aux_classifier=None,
custom_PDB=None, force_env=None,
refresh=False, log=True):
refresh=False, log=True, **kwargs):
"""Obtain Rhapsody pathogenicity predictions on a list of human missense
variants ([ref]_)
Expand Down Expand Up @@ -77,7 +77,7 @@ def rhapsody(query, query_type='SAVs',
aux_classifier = getDefaultClassifiers()['reduced']

# initialize object that will contain all results and predictions
r = Rhapsody()
r = Rhapsody(**kwargs)

# import classifiers and feature set from pickle
r.importClassifiers(main_classifier, aux_classifier, force_env=force_env)
Expand All @@ -99,9 +99,9 @@ def rhapsody(query, query_type='SAVs',
r.printPredictions()
if aux_classifier is not None:
# print both 'full' and 'reduced' predictions in a more detailed format
r.printPredictions(classifier="both",
PolyPhen2=False, EVmutation=False,
filename='rhapsody-predictions-full_vs_reduced.txt')
r.printPredictions(
classifier="both", PolyPhen2=False, EVmutation=False,
filename='rhapsody-predictions-full_vs_reduced.txt')

# save pickle
r.savePickle()
Expand Down
Loading

0 comments on commit 050ea7b

Please sign in to comment.