Skip to content

Commit

Permalink
calcPredictions always returns a structured array, even when no predictions are computed (before: None)
Browse files Browse the repository at this point in the history
  • Loading branch information
luponzo86 committed Jan 14, 2019
1 parent 0a3d1a7 commit b61d07b
Showing 1 changed file with 28 additions and 29 deletions.
57 changes: 28 additions & 29 deletions rhapsody/rhapsody.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@


class Rhapsody:
"""A class for running calculations and handling results
"""A class for running calculations and handling results
from RHAPSODY.
"""
def __init__(self):
# filename of the classifier pickle
self.classifier = None
# tuple of feature names
# tuple of feature names
self.featSet = None
# dictionary of properties of the trained classifier
self.CVsummary = None
Expand All @@ -36,7 +36,7 @@ def __init__(self):
# If an error occurs, unique Uniprot coords and/or PDB coords may
# be replaced by a string describing the error.
self.Uniprot2PDBmap = None
# numpy array (num_SAVS)x(num_features)
# numpy array (num_SAVS)x(num_features)
self.featMatrix = None
# structured array containing predictions
self.predictions = None
Expand Down Expand Up @@ -100,8 +100,8 @@ def importPolyPhen2output(self, PP2output):
return self.PP2output

def getUniprot2PDBmap(self, output='rhapsody-Uniprot2PDB.txt'):
"""Maps each SAV to the corresponding resid in a PDB chain.
The format is: (PDBID, chainID, resid, wild-type aa, length).
"""Maps each SAV to the corresponding resid in a PDB chain.
The format is: (PDBID, chainID, resid, wild-type aa, length).
"""
assert self.SAVcoords is not None, "Uniprot coordinates not set."
if self.Uniprot2PDBmap is None:
Expand Down Expand Up @@ -148,7 +148,7 @@ def _calcFeatMatrix(self):
Uniprot2PDBmap = self.getUniprot2PDBmap()
mapped_SAVs = tuple(t[2] for t in Uniprot2PDBmap)
# compute structural and dynamical features from a PDB structure
f = calcPDBfeatures(mapped_SAVs, sel_feats=sel_PDBfeats,
f = calcPDBfeatures(mapped_SAVs, sel_feats=sel_PDBfeats,
custom_PDB=self.customPDB)
all_feats.append(f)
if RHAPSODY_FEATS['BLOSUM'].intersection(self.featSet):
Expand All @@ -170,10 +170,10 @@ def _calcFeatMatrix(self):
return buildFeatMatrix(self.featSet, all_feats)

def calcPredictions(self, output='rhapsody-predictions.txt'):
assert self.predictions is None, 'Predictions already computed.'
assert self.predictions is None, 'Predictions already computed.'
assert self.classifier is not None, 'Classifier not set.'
assert self.featMatrix is not None, 'Features not computed.'
p = calcPredictions(self.featMatrix, self.classifier,
p = calcPredictions(self.featMatrix, self.classifier,
SAV_coords=self.SAVcoords['text'])
if output is not None and p is not None:
np.savetxt(output, p, fmt='%-5.3f %-5.3f %-12s %-12s')
Expand Down Expand Up @@ -209,7 +209,7 @@ def calcAuxPredictions(self, aux_clsf, output='rhapsody-aux_predictions.txt'):
self.auxPreds = p_a
self.mixedPreds = p_m
return self.auxPreds, self.mixedPreds

def savePickle(self, output='rhapsody-pickle.pkl'):
f = pickle.dump(self, open(output, "wb"))
return f
Expand All @@ -219,8 +219,8 @@ def savePickle(self, output='rhapsody-pickle.pkl'):


def pathRhapsodyFolder(folder=None):
"""Returns or specify local folder for storing files and pickles necessary
to run Rhapsody. To release the current folder, pass an invalid path, e.g.
"""Returns or specify local folder for storing files and pickles necessary
to run Rhapsody. To release the current folder, pass an invalid path, e.g.
``folder=''``.
"""
if folder is None:
Expand Down Expand Up @@ -250,9 +250,9 @@ def pathRhapsodyFolder(folder=None):


def seqScanning(Uniprot_coord):
'''Returns a list of SAVs. If the string 'Uniprot_coord' is just a Uniprot ID,
the list will contain all possible amino acid substitutions at all positions
in the sequence. If 'Uniprot_coord' also includes a specific position, the list
'''Returns a list of SAVs. If the string 'Uniprot_coord' is just a Uniprot ID,
the list will contain all possible amino acid substitutions at all positions
in the sequence. If 'Uniprot_coord' also includes a specific position, the list
will only contain all possible amino acid variants at that position.
'''
assert isinstance(Uniprot_coord, str), "Must be a string."
Expand Down Expand Up @@ -291,7 +291,7 @@ def printSAVlist(input_SAVs, filename):
def mapSAVs2PDB(SAV_coords, custom_PDB=None):
LOGGER.info('Mapping SAVs to PDB structures...')
LOGGER.timeit('_map2PDB')
# sort SAVs, so to group together those
# sort SAVs, so to group together those
# with identical accession number
sorting_map = np.argsort(SAV_coords['acc'])
# map to PDB using Uniprot class
Expand Down Expand Up @@ -320,13 +320,13 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None):
LOGGER.info('Aligning Uniprot sequence to custom PDB...')
U2P_map.alignCustomPDB(custom_PDB, 'all')
except Exception as e:
U2P_map = str(e)
U2P_map = str(e)
cache['obj'] = U2P_map
# map specific SAV
try:
if isinstance(U2P_map, str):
raise RuntimeError(U2P_map)
# check wt aa
# check wt aa
if not 0 < pos <= len(U2P_map.sequence):
raise ValueError('Index out of range')
wt_aa = U2P_map.sequence[pos-1]
Expand All @@ -340,7 +340,7 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None):
r = U2P_map.mapSingleRes2CustomPDBs(pos, check_aa=True)
if len(r) == 0:
raise RuntimeError('Unable to map SAV to PDB')
else:
else:
res_map = r[0]
except Exception as e:
res_map = str(e)
Expand All @@ -366,7 +366,7 @@ def calcPredictions(feat_matrix, clsf, SAV_coords=None):
else:
LOGGER.timeit('_import_clsf')
clsf_dict = pickle.load(open(clsf, 'rb'))
LOGGER.report('Random Forest classifier imported in %.1fs.',
LOGGER.report('Random Forest classifier imported in %.1fs.',
'_import_clsf')
classifier = clsf_dict['trained RF']
opt_cutoff = clsf_dict['CV summary']['optimal cutoff']
Expand All @@ -377,7 +377,7 @@ def calcPredictions(feat_matrix, clsf, SAV_coords=None):

# define a structured array for storing predictions
pred_dtype = np.dtype([('score', 'f'),
('path. probability' , 'f'),
('path. probability' , 'f'),
('path. class', 'U12'),
('training info', 'U12')])
predictions = np.zeros(len(feat_matrix), dtype=pred_dtype)
Expand All @@ -387,18 +387,19 @@ def calcPredictions(feat_matrix, clsf, SAV_coords=None):
n_pred = len(sel_rows)
if n_pred == 0:
LOGGER.warning('No predictions could be computed.')
return None
sliced_feat_matrix = feat_matrix[sel_rows]
proba = None
else:
# compute predictions
sliced_feat_matrix = feat_matrix[sel_rows]
proba = classifier.predict_proba(sliced_feat_matrix)

# check if SAVs are found in the training dataset
known_SAVs = {}
if SAV_coords is not None:
f = lambda x: 'known_del' if x==1 else 'known_neu'
known_SAVs = {l['SAV_coords']: f(l['true_label']) for l in train_data}

# compute predictions
proba = classifier.predict_proba(sliced_feat_matrix)
# output
# output
J, err_bar = opt_cutoff
Jminus = J - err_bar
Jplus = J + err_bar
Expand All @@ -417,7 +418,7 @@ def calcPredictions(feat_matrix, clsf, SAV_coords=None):
score = proba[k, 1]
# assign pathogenicity probability by interpolating
# the pathogenicity profile computed during CV
path_prob = np.interp(score, path_curve[0], path_curve[1])
path_prob = np.interp(score, path_curve[0], path_curve[1])
# assign class of pathogenicity based on Youden's cutoff
if score > Jplus:
path_class = "deleterious"
Expand All @@ -430,9 +431,7 @@ def calcPredictions(feat_matrix, clsf, SAV_coords=None):
# store values
predictions[i] = (score, path_prob, path_class, SAV_status)
k = k + 1
LOGGER.report('{} predictions computed in %.1fs.'.format(n_pred),
LOGGER.report('{} predictions computed in %.1fs.'.format(n_pred),
'_preds')

return predictions


0 comments on commit b61d07b

Please sign in to comment.