update right before publication #12

Merged: 10 commits, Mar 7, 2020
3 changes: 2 additions & 1 deletion rhapsody/features/PolyPhen2.py
@@ -8,10 +8,11 @@
import requests
import datetime
import numpy as np
from prody import LOGGER, queryUniprot
from prody import LOGGER
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from math import log
from .Uniprot import queryUniprot

__author__ = "Luca Ponzoni"
__date__ = "December 2019"
56 changes: 42 additions & 14 deletions rhapsody/features/Uniprot.py
@@ -9,6 +9,7 @@
import numpy as np
import prody as pd
from prody import LOGGER, SETTINGS
from prody.utilities import openURL
from tqdm import tqdm
from Bio.pairwise2 import align as bioalign
from Bio.pairwise2 import format_alignment
@@ -20,7 +21,16 @@
__email__ = "[email protected]"
__status__ = "Production"

__all__ = ['UniprotMapping', 'mapSAVs2PDB', 'seqScanning', 'printSAVlist']
__all__ = ['queryUniprot', 'UniprotMapping', 'mapSAVs2PDB',
'seqScanning', 'printSAVlist']


def queryUniprot(*args, **kwargs):
"""
Redefine prody function to check for no internet connection
"""
_ = openURL('http://www.uniprot.org/')
return pd.queryUniprot(*args, **kwargs)


class UniprotMapping:
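
A minimal usage sketch for the new queryUniprot wrapper above (an assumption for illustration: the rhapsody package is importable and 'P17516' is just an example accession; the 'accession 0' key is the one used elsewhere in this module):

from rhapsody.features.Uniprot import queryUniprot

try:
    record = queryUniprot('P17516')
    print(record['accession 0'])
except Exception as err:
    # the openURL('http://www.uniprot.org/') probe fails fast when there is
    # no internet connection, so the error surfaces before the actual query
    print(f'Uniprot query failed: {err}')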
@@ -52,7 +62,7 @@ def refresh(self):
delete precomputed alignments.
"""
# import Uniprot record and official accession number
self.fullRecord = pd.queryUniprot(self.acc)
self.fullRecord = queryUniprot(self.acc)
self.uniq_acc = self.fullRecord['accession 0']
# import main sequence and PDB records
rec = self.fullRecord
@@ -393,7 +403,7 @@ def recoverPickle(self, filename=None, folder=None, days=30, **kwargs):
pickle_path = os.path.join(folder, filename)
if not os.path.isfile(pickle_path):
# import unique accession number
acc = pd.queryUniprot(self.acc)['accession 0']
acc = queryUniprot(self.acc)['accession 0']
filename = 'UniprotMap-' + acc + '.pkl'
pickle_path = os.path.join(folder, filename)
else:
@@ -865,26 +875,44 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False,
return mapped_SAVs


def seqScanning(Uniprot_coord):
'''Returns a list of SAVs. If the string 'Uniprot_coord' is just a Uniprot ID,
the list will contain all possible amino acid substitutions at all positions
in the sequence. If 'Uniprot_coord' also includes a specific position, the list
will only contain all possible amino acid variants at that position.
def seqScanning(Uniprot_coord, sequence=None):
'''Returns a list of SAVs. If the string 'Uniprot_coord' is just a
Uniprot ID, the list will contain all possible amino acid substitutions
at all positions in the sequence. If 'Uniprot_coord' also includes a
specific position, the list will only contain all possible amino acid
variants at that position. If 'sequence' is 'None' (default), the
sequence will be downloaded from Uniprot.
'''
assert isinstance(Uniprot_coord, str), "Must be a string."
coord = Uniprot_coord.strip().split()
coord = Uniprot_coord.upper().strip().split()
assert len(coord) < 3, "Invalid format. Examples: 'Q9BW27' or 'Q9BW27 10'."
Uniprot_record = pd.queryUniprot(coord[0])
sequence = Uniprot_record['sequence 0'].replace("\n", "")
aa_list = 'ACDEFGHIKLMNPQRSTVWY'
if sequence is None:
Uniprot_record = queryUniprot(coord[0])
sequence = Uniprot_record['sequence 0'].replace("\n", "")
else:
assert isinstance(sequence, str), "Must be a string."
sequence = sequence.upper()
assert set(sequence).issubset(aa_list), "Invalid list of amino acids."
if len(coord) == 1:
# user asks for full-sequence scanning
positions = range(len(sequence))
else:
positions = [int(coord[1]) - 1]
# user asks for single-site scanning
site = int(coord[1])
positions = [site - 1]
# if user provides only one amino acid as 'sequence', interpret it
# as the amino acid at the specified position
if len(sequence) == 1:
sequence = sequence*site
else:
assert len(sequence) >= site, ("Requested position is not found "
"in input sequence.")
SAV_list = []
acc = coord[0]
for i in positions:
wt_aa = sequence[i]
for aa in 'ACDEFGHIKLMNPQRSTVWY':
for aa in aa_list:
if aa == wt_aa:
continue
s = ' '.join([acc, str(i+1), wt_aa, aa])
@@ -900,5 +928,5 @@ def printSAVlist(input_SAVs, filename):
m = f'error in SAV {i}: '
assert isinstance(line, str), f'{m} not a string'
assert len(line) < 25, f'{m} too many characters'
print(line, file=f)
print(line.upper(), file=f)
return filename
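
A short usage sketch of the updated seqScanning/printSAVlist pair (hedged: assumes the package is installed; 'P17516 135' and the wild-type residue 'G' are taken from the example line format shown in core.py below):

from rhapsody.features.Uniprot import seqScanning, printSAVlist

# full saturation-mutagenesis list; the sequence is fetched from Uniprot
all_SAVs = seqScanning('P17516')

# single-site scanning with no network access: pass the wild-type residue
# (or the full sequence) through the new 'sequence' argument
site_SAVs = seqScanning('P17516 135', sequence='G')
printSAVlist(site_SAVs, 'P17516_135_SAVs.txt')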
24 changes: 17 additions & 7 deletions rhapsody/predict/core.py
@@ -145,21 +145,30 @@ def _isColSet(self, column):
assert self.data is not None, 'Data array not initialized.'
return self.data[column].count() != 0

def _isSaturationMutagenesis(self):
def _isSaturationMutagenesis(self, queryUniprot=False):
assert self._isColSet('SAV coords'), 'SAV list not set.'
if self.saturation_mutagenesis is None:
self.saturation_mutagenesis = False
try:
SAVs = self.getUniqueSAVcoords()
SAV_list = list(SAVs['unique SAV coords'])
acc = SAVs[0]['Uniprot ID']
acc = list(set(SAVs['Uniprot ID']))
if len(acc) != 1:
raise RuntimeError('Multiple accession numbers found')
else:
acc = acc[0]
pos = list(set(SAVs['position']))
if len(pos) == 1:
query = f'{acc} {pos[0]}'
else:
query = acc
generated_SAV_list = Uniprot.seqScanning(query)
if SAV_list == generated_SAV_list:
# generate target scanning list
if queryUniprot:
target_SAV_list = Uniprot.seqScanning(query)
else:
seq = ''.join(SAVs['wt. aa'][range(0, len(SAVs), 19)])
target_SAV_list = Uniprot.seqScanning(query, sequence=seq)
if SAV_list == target_SAV_list:
self.saturation_mutagenesis = True
else:
raise RuntimeError('Missing SAVs detected.')
@@ -184,18 +193,19 @@ def setSAVs(self, query):
if isfile(query):
# 'query' is a filename, with line format 'P17516 135 G E'
SAVs = np.loadtxt(query, dtype=SAV_dtype)
SAV_list = ['{} {} {} {}'.format(*s) for s in SAVs]
SAV_list = ['{} {} {} {}'.format(*s).upper() for s in SAVs]
elif len(query.split()) < 3:
# single Uniprot acc (+ pos), e.g. 'P17516' or 'P17516 135'
SAV_list = Uniprot.seqScanning(query)
self.saturation_mutagenesis = True
else:
# single SAV
SAV = np.array(query.split(), dtype=SAV_dtype)
SAV = np.array(query.upper().split(), dtype=SAV_dtype)
SAV_list = ['{} {} {} {}'.format(*SAV)]
else:
# 'query' is a list or tuple of SAV coordinates
SAVs = np.array([tuple(s.split()) for s in query], dtype=SAV_dtype)
SAVs = np.array([tuple(s.upper().split()) for s in query],
dtype=SAV_dtype)
SAV_list = ['{} {} {} {}'.format(*s) for s in SAVs]
# store SAV coordinates
numSAVs = len(SAV_list)
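
The non-query branch of _isSaturationMutagenesis above reconstructs the wild-type sequence from the SAV table itself: a saturation-mutagenesis list contains exactly 19 SAVs per residue, so sampling the 'wt. aa' column with a stride of 19 yields one wild-type residue per position. A standalone toy illustration of that indexing (not rhapsody code):

import numpy as np

# three residues ('G', 'A', 'V'), each repeated 19 times, as in a
# saturation-mutagenesis SAV table
wt_col = np.array([aa for aa in 'GAV' for _ in range(19)])
seq = ''.join(wt_col[range(0, len(wt_col), 19)])
assert seq == 'GAV'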
9 changes: 7 additions & 2 deletions rhapsody/predict/figures.py
@@ -203,7 +203,7 @@ def print_sat_mutagen_figure(filename, rhapsody_obj, res_interval=None,
ax0.grid(which='minor', color='w', linestyle='-', linewidth=.5)

# mutagenesis table (heatmap)
matplotlib.cm.coolwarm.set_bad(color='antiquewhite')
matplotlib.cm.coolwarm.set_bad(color='#F9E79F')
im = ax1.imshow(table_best[:, res_i-1:res_f], aspect='auto',
cmap='coolwarm', vmin=0, vmax=1)
axcb.figure.colorbar(im, cax=axcb)
@@ -221,7 +221,12 @@
# average pathogenicity profile
x_resids = np.arange(1, upper_lim+1)
# shading showing range of values
ax2.fill_between(x_resids, min_p, max_p, alpha=0.5,
# NB: a bug in pyplot.fill_between() arises when a region of a large plot
# (e.g. > 1000 points) is selected with set_xlim(): the shaded area is
# drawn even where it falls outside the selected region. As a workaround,
# the data are sliced to fit the selected region before plotting.
sl = slice(max(1, res_i-2), min(res_f+2, upper_lim+1))
ax2.fill_between(x_resids[sl], min_p[sl], max_p[sl], alpha=0.5,
edgecolor='salmon', facecolor='salmon')
# plot average profile for other predictions, if available
if extra_plot is not None:
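
A self-contained sketch of the same slicing workaround on toy data (matplotlib only; names and values are illustrative, not rhapsody output):

import numpy as np
import matplotlib.pyplot as plt

# toy profile over 2000 "residues"
x_resids = np.arange(1, 2001)
avg = np.sin(x_resids / 50)
min_p, max_p = avg - 0.2, avg + 0.2

# displayed interval; slice the data instead of relying on set_xlim() alone
res_i, res_f = 900, 1000
sl = slice(max(1, res_i - 2), min(res_f + 2, len(x_resids) + 1))

fig, ax = plt.subplots()
ax.fill_between(x_resids[sl], min_p[sl], max_p[sl], alpha=0.5,
                edgecolor='salmon', facecolor='salmon')
ax.set_xlim(res_i, res_f)
fig.savefig('fill_between_workaround.png')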
122 changes: 85 additions & 37 deletions rhapsody/train/RFtraining.py
@@ -12,6 +12,7 @@
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support
from sklearn.utils import resample
from ..utils.settings import DEFAULT_FEATSETS, getDefaultTrainingDataset
from .figures import print_pred_distrib_figure, print_path_prob_figure
from .figures import print_ROC_figure, print_feat_imp_figure
@@ -27,46 +28,93 @@
'extendDefaultTrainingDataset']


def calcScoreMetrics(y_test, y_pred):
# compute ROC and AUROC
fpr, tpr, roc_thr = roc_curve(y_test, y_pred)
roc = {'FPR': fpr, 'TPR': tpr, 'thresholds': roc_thr}
auroc = roc_auc_score(y_test, y_pred)
# compute optimal cutoff J (argmax of Youden's index)
diff = np.array([y-x for x, y in zip(fpr, tpr)])
Jopt = roc_thr[(-diff).argsort()][0]
# compute Precision-Recall curve and AUPRC
pre, rec, prc_thr = precision_recall_curve(y_test, y_pred)
prc = {'precision': pre, 'recall': rec, 'thresholds': prc_thr}
auprc = average_precision_score(y_test, y_pred)
output = {
'ROC': roc,
'AUROC': auroc,
'optimal cutoff': Jopt,
'PRC': prc,
'AUPRC': auprc
}
def calcScoreMetrics(y_test, y_pred, bootstrap=0, **resample_kwargs):
'''Compute accuracy metrics of continuous values (optionally bootstrapped)
'''
def _calcScoreMetrics(y_test, y_pred):
# compute ROC and AUROC
fpr, tpr, roc_thr = roc_curve(y_test, y_pred)
roc = {'FPR': fpr, 'TPR': tpr, 'thresholds': roc_thr}
auroc = roc_auc_score(y_test, y_pred)
# compute optimal cutoff J (argmax of Youden's index)
diff = np.array([y-x for x, y in zip(fpr, tpr)])
Jopt = roc_thr[(-diff).argsort()][0]
# compute Precision-Recall curve and AUPRC
pre, rec, prc_thr = precision_recall_curve(y_test, y_pred)
prc = {'precision': pre, 'recall': rec, 'thresholds': prc_thr}
auprc = average_precision_score(y_test, y_pred)
return {
'ROC': roc,
'AUROC': auroc,
'optimal cutoff': Jopt,
'PRC': prc,
'AUPRC': auprc
}

if bootstrap < 2:
output = _calcScoreMetrics(y_test, y_pred)
else:
# apply bootstrap
outputs = []
for i in range(bootstrap):
yy_test, yy_pred = resample(y_test, y_pred, **resample_kwargs)
outputs.append(_calcScoreMetrics(yy_test, yy_pred))
# compute mean and standard deviation of metrics
output = {}
for metric in ['AUROC', 'optimal cutoff', 'AUPRC']:
v = [d[metric] for d in outputs]
output[f'mean {metric}'] = np.mean(v)
output[f'{metric} std'] = np.std(v)
# compute average ROC
mean_fpr = np.linspace(0, 1, len(y_pred))
mean_tpr = 0.0
for d in outputs:
mean_tpr += np.interp(mean_fpr, d['ROC']['FPR'], d['ROC']['TPR'])
mean_tpr /= bootstrap
mean_tpr[0] = 0.0
mean_tpr[-1] = 1.0
output['mean ROC'] = {'FPR': mean_fpr, 'TPR': mean_tpr}

return output


def calcClassMetrics(y_test, y_pred):
mcc = matthews_corrcoef(y_test, y_pred)
pre, rec, f1s, sup = precision_recall_fscore_support(
y_test, y_pred, labels=[0, 1])
avg_pre, avg_rec, avg_f1s, _ = precision_recall_fscore_support(
y_test, y_pred, average='weighted')
output = {
'MCC': mcc,
'precision (0)': pre[0],
'recall (0)': rec[0],
'F1 score (0)': f1s[0],
'precision (1)': pre[1],
'recall (1)': rec[1],
'F1 score (1)': f1s[1],
'precision': avg_pre,
'recall': avg_rec,
'F1 score': avg_f1s
}
def calcClassMetrics(y_test, y_pred, bootstrap=0, **resample_kwargs):
'''Compute accuracy metrics of binary labels (optionally bootstrapped)
'''
def _calcClassMetrics(y_test, y_pred):
mcc = matthews_corrcoef(y_test, y_pred)
pre, rec, f1s, sup = precision_recall_fscore_support(
y_test, y_pred, labels=[0, 1])
avg_pre, avg_rec, avg_f1s, _ = precision_recall_fscore_support(
y_test, y_pred, average='weighted')
return {
'MCC': mcc,
'precision (0)': pre[0],
'recall (0)': rec[0],
'F1 score (0)': f1s[0],
'precision (1)': pre[1],
'recall (1)': rec[1],
'F1 score (1)': f1s[1],
'precision': avg_pre,
'recall': avg_rec,
'F1 score': avg_f1s
}

if bootstrap < 2:
output = _calcClassMetrics(y_test, y_pred)
else:
# apply bootstrap
outputs = []
for i in range(bootstrap):
yy_test, yy_pred = resample(y_test, y_pred, **resample_kwargs)
outputs.append(_calcClassMetrics(yy_test, yy_pred))
# compute mean and standard deviation of metrics
output = {}
for metric in outputs[0].keys():
v = [d[metric] for d in outputs]
output[f'mean {metric}'] = np.mean(v)
output[f'{metric} std'] = np.std(v)

return output


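
A hedged usage sketch for the bootstrapped metrics above (synthetic labels and scores; any extra keyword arguments, e.g. stratify, are forwarded to sklearn.utils.resample):

import numpy as np
from rhapsody.train.RFtraining import calcScoreMetrics, calcClassMetrics

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=200)
y_score = np.clip(0.6 * y_true + rng.normal(0.2, 0.3, size=200), 0, 1)

single = calcScoreMetrics(y_true, y_score)               # point estimates
boot = calcScoreMetrics(y_true, y_score, bootstrap=100)  # mean +/- std
print(single['AUROC'], boot['mean AUROC'], boot['AUROC std'])
print(boot['mean ROC']['FPR'][:5], boot['mean ROC']['TPR'][:5])

cls_boot = calcClassMetrics(y_true, (y_score > 0.5).astype(int), bootstrap=100)
print(cls_boot['mean MCC'], cls_boot['MCC std'])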
7 changes: 5 additions & 2 deletions rhapsody/utils/settings.py
@@ -79,8 +79,11 @@ def initialSetup(working_dir=None, refresh=False, download_EVmutation=True):
working_dir = DEFAULT_WORKING_DIR
if os.path.isdir(working_dir):
raise EnvironmentError(
f"A folder named '{working_dir}' already exists. "
"Please specify another name.")
f"A folder at default path '{working_dir}' already "
"exists. If it contains configuration files from a "
"previous installation that you wish to recover, please "
f"run: initialSetup('{working_dir}') \n"
"Otherwise, please specify another location.")
else:
os.mkdir(working_dir)
pd.LOGGER.info(f'Default working directory set: {working_dir}')
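
A short usage sketch matching the new message (the path is illustrative; recovery of an existing folder follows the guidance in the error text above, not code shown in this diff):

from rhapsody.utils.settings import initialSetup

# first-time setup in a custom location
initialSetup('/home/user/rhapsody_workdir')

# if that folder already holds configuration files from a previous
# installation, the same explicit call recovers them instead of raising,
# as the new error message above instructs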