diff --git a/rhapsody/features/PolyPhen2.py b/rhapsody/features/PolyPhen2.py
index b9ce7b2..4991395 100644
--- a/rhapsody/features/PolyPhen2.py
+++ b/rhapsody/features/PolyPhen2.py
@@ -8,10 +8,11 @@
 import requests
 import datetime
 import numpy as np
-from prody import LOGGER, queryUniprot
+from prody import LOGGER
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 from math import log
+from .Uniprot import queryUniprot
 
 __author__ = "Luca Ponzoni"
 __date__ = "December 2019"
diff --git a/rhapsody/features/Uniprot.py b/rhapsody/features/Uniprot.py
index 4c016cb..9dd85e7 100644
--- a/rhapsody/features/Uniprot.py
+++ b/rhapsody/features/Uniprot.py
@@ -9,6 +9,7 @@
 import numpy as np
 import prody as pd
 from prody import LOGGER, SETTINGS
+from prody.utilities import openURL
 from tqdm import tqdm
 from Bio.pairwise2 import align as bioalign
 from Bio.pairwise2 import format_alignment
@@ -20,7 +21,16 @@
 __email__ = "lponzoni@pitt.edu"
 __status__ = "Production"
 
-__all__ = ['UniprotMapping', 'mapSAVs2PDB', 'seqScanning', 'printSAVlist']
+__all__ = ['queryUniprot', 'UniprotMapping', 'mapSAVs2PDB',
+           'seqScanning', 'printSAVlist']
+
+
+def queryUniprot(*args, **kwargs):
+    """
+    Redefine prody function to check for no internet connection
+    """
+    _ = openURL('http://www.uniprot.org/')
+    return pd.queryUniprot(*args, **kwargs)
 
 
 class UniprotMapping:
@@ -52,7 +62,7 @@ def refresh(self):
         delete precomputed alignments.
         """
         # import Uniprot record and official accession number
-        self.fullRecord = pd.queryUniprot(self.acc)
+        self.fullRecord = queryUniprot(self.acc)
         self.uniq_acc = self.fullRecord['accession 0']
         # import main sequence and PDB records
         rec = self.fullRecord
@@ -393,7 +403,7 @@ def recoverPickle(self, filename=None, folder=None, days=30, **kwargs):
         pickle_path = os.path.join(folder, filename)
         if not os.path.isfile(pickle_path):
             # import unique accession number
-            acc = pd.queryUniprot(self.acc)['accession 0']
+            acc = queryUniprot(self.acc)['accession 0']
             filename = 'UniprotMap-' + acc + '.pkl'
             pickle_path = os.path.join(folder, filename)
         else:
@@ -865,26 +875,44 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False,
     return mapped_SAVs
 
 
-def seqScanning(Uniprot_coord):
-    '''Returns a list of SAVs. If the string 'Uniprot_coord' is just a Uniprot ID,
-    the list will contain all possible amino acid substitutions at all positions
-    in the sequence. If 'Uniprot_coord' also includes a specific position, the list
-    will only contain all possible amino acid variants at that position.
+def seqScanning(Uniprot_coord, sequence=None):
+    '''Returns a list of SAVs. If the string 'Uniprot_coord' is just a
+    Uniprot ID, the list will contain all possible amino acid substitutions
+    at all positions in the sequence. If 'Uniprot_coord' also includes a
+    specific position, the list will only contain all possible amino acid
+    variants at that position. If 'sequence' is 'None' (default), the
+    sequence will be downloaded from Uniprot.
     '''
     assert isinstance(Uniprot_coord, str), "Must be a string."
-    coord = Uniprot_coord.strip().split()
+    coord = Uniprot_coord.upper().strip().split()
     assert len(coord) < 3, "Invalid format. Examples: 'Q9BW27' or 'Q9BW27 10'."
-    Uniprot_record = pd.queryUniprot(coord[0])
-    sequence = Uniprot_record['sequence 0'].replace("\n", "")
+    aa_list = 'ACDEFGHIKLMNPQRSTVWY'
+    if sequence is None:
+        Uniprot_record = queryUniprot(coord[0])
+        sequence = Uniprot_record['sequence 0'].replace("\n", "")
+    else:
+        assert isinstance(sequence, str), "Must be a string."
+        sequence = sequence.upper()
+        assert set(sequence).issubset(aa_list), "Invalid list of amino acids."
    if len(coord) == 1:
+        # user asks for full-sequence scanning
         positions = range(len(sequence))
     else:
-        positions = [int(coord[1]) - 1]
+        # user asks for single-site scanning
+        site = int(coord[1])
+        positions = [site - 1]
+        # if user provides only one amino acid as 'sequence', interpret it
+        # as the amino acid at the specified position
+        if len(sequence) == 1:
+            sequence = sequence*site
+        else:
+            assert len(sequence) >= site, ("Requested position is not found "
+                                           "in input sequence.")
     SAV_list = []
     acc = coord[0]
     for i in positions:
         wt_aa = sequence[i]
-        for aa in 'ACDEFGHIKLMNPQRSTVWY':
+        for aa in aa_list:
             if aa == wt_aa:
                 continue
             s = ' '.join([acc, str(i+1), wt_aa, aa])
@@ -900,5 +928,5 @@ def printSAVlist(input_SAVs, filename):
             m = f'error in SAV {i}: '
             assert isinstance(line, str), f'{m} not a string'
             assert len(line) < 25, f'{m} too many characters'
-            print(line, file=f)
+            print(line.upper(), file=f)
     return filename
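For reference, here is a minimal usage sketch of the extended `seqScanning()` signature introduced above (offline scanning via the new `sequence` argument). The accession and position come from the docstring examples; the input sequence, output filename, and the import path `rhapsody.features.Uniprot` are assumptions for illustration only:

```python
from rhapsody.features.Uniprot import seqScanning, printSAVlist

# full-sequence scanning with a user-supplied sequence (no Uniprot query);
# 'MKVLAT' is a hypothetical 6-residue sequence
SAV_list = seqScanning('Q9BW27', sequence='MKVLAT')

# single-site scanning: a single amino acid is interpreted as the wild-type
# residue at the requested position (here, position 10)
SAV_site10 = seqScanning('Q9BW27 10', sequence='A')
printSAVlist(SAV_site10, 'Q9BW27_pos10_SAVs.txt')
```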
diff --git a/rhapsody/predict/core.py b/rhapsody/predict/core.py
index 87a2005..1bf4897 100644
--- a/rhapsody/predict/core.py
+++ b/rhapsody/predict/core.py
@@ -145,21 +145,30 @@ def _isColSet(self, column):
         assert self.data is not None, 'Data array not initialized.'
         return self.data[column].count() != 0
 
-    def _isSaturationMutagenesis(self):
+    def _isSaturationMutagenesis(self, queryUniprot=False):
         assert self._isColSet('SAV coords'), 'SAV list not set.'
         if self.saturation_mutagenesis is None:
             self.saturation_mutagenesis = False
             try:
                 SAVs = self.getUniqueSAVcoords()
                 SAV_list = list(SAVs['unique SAV coords'])
-                acc = SAVs[0]['Uniprot ID']
+                acc = list(set(SAVs['Uniprot ID']))
+                if len(acc) != 1:
+                    raise RuntimeError('Multiple accession numbers found')
+                else:
+                    acc = acc[0]
                 pos = list(set(SAVs['position']))
                 if len(pos) == 1:
                     query = f'{acc} {pos[0]}'
                 else:
                     query = acc
-                generated_SAV_list = Uniprot.seqScanning(query)
-                if SAV_list == generated_SAV_list:
+                # generate target scanning list
+                if queryUniprot:
+                    target_SAV_list = Uniprot.seqScanning(query)
+                else:
+                    seq = ''.join(SAVs['wt. aa'][range(0, len(SAVs), 19)])
+                    target_SAV_list = Uniprot.seqScanning(query, sequence=seq)
+                if SAV_list == target_SAV_list:
                     self.saturation_mutagenesis = True
                 else:
                     raise RuntimeError('Missing SAVs detected.')
@@ -184,18 +193,19 @@ def setSAVs(self, query):
             if isfile(query):
                 # 'query' is a filename, with line format 'P17516 135 G E'
                 SAVs = np.loadtxt(query, dtype=SAV_dtype)
-                SAV_list = ['{} {} {} {}'.format(*s) for s in SAVs]
+                SAV_list = ['{} {} {} {}'.format(*s).upper() for s in SAVs]
             elif len(query.split()) < 3:
                 # single Uniprot acc (+ pos), e.g. 'P17516' or 'P17516 135'
                 SAV_list = Uniprot.seqScanning(query)
                 self.saturation_mutagenesis = True
             else:
                 # single SAV
-                SAV = np.array(query.split(), dtype=SAV_dtype)
+                SAV = np.array(query.upper().split(), dtype=SAV_dtype)
                 SAV_list = ['{} {} {} {}'.format(*SAV)]
         else:
             # 'query' is a list or tuple of SAV coordinates
-            SAVs = np.array([tuple(s.split()) for s in query], dtype=SAV_dtype)
+            SAVs = np.array([tuple(s.upper().split()) for s in query],
+                            dtype=SAV_dtype)
             SAV_list = ['{} {} {} {}'.format(*s) for s in SAVs]
         # store SAV coordinates
         numSAVs = len(SAV_list)
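A short, self-contained sketch of why the offline branch of `_isSaturationMutagenesis()` can rebuild the wild-type sequence with `SAVs['wt. aa'][range(0, len(SAVs), 19)]`: in a saturation-mutagenesis list each position contributes exactly 19 SAVs (one per non-wild-type amino acid), so the ordered SAVs group into blocks of 19 per residue and the first entry of each block carries the wild-type residue. The accession, toy sequence, and dtype below are hypothetical, not taken from this patch:

```python
import numpy as np

SAV_dtype = np.dtype([('acc', 'U10'), ('position', 'i4'),
                      ('wt. aa', 'U1'), ('mut. aa', 'U1')])
aa_list = 'ACDEFGHIKLMNPQRSTVWY'
wt_seq = 'MKV'  # hypothetical 3-residue protein

# build the full saturation-mutagenesis list: 19 SAVs per position
SAVs = np.array([('P00000', i + 1, wt, aa)
                 for i, wt in enumerate(wt_seq)
                 for aa in aa_list if aa != wt], dtype=SAV_dtype)

# sampling every 19th 'wt. aa' entry recovers one wild-type residue per position
seq = ''.join(SAVs['wt. aa'][range(0, len(SAVs), 19)])
assert seq == wt_seq
```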
diff --git a/rhapsody/predict/figures.py b/rhapsody/predict/figures.py
index f742d93..e515f48 100644
--- a/rhapsody/predict/figures.py
+++ b/rhapsody/predict/figures.py
@@ -203,7 +203,7 @@ def print_sat_mutagen_figure(filename, rhapsody_obj, res_interval=None,
     ax0.grid(which='minor', color='w', linestyle='-', linewidth=.5)
 
     # mutagenesis table (heatmap)
-    matplotlib.cm.coolwarm.set_bad(color='antiquewhite')
+    matplotlib.cm.coolwarm.set_bad(color='#F9E79F')
     im = ax1.imshow(table_best[:, res_i-1:res_f], aspect='auto',
                     cmap='coolwarm', vmin=0, vmax=1)
     axcb.figure.colorbar(im, cax=axcb)
@@ -221,7 +221,12 @@
     # average pathogenicity profile
     x_resids = np.arange(1, upper_lim+1)
     # shading showing range of values
-    ax2.fill_between(x_resids, min_p, max_p, alpha=0.5,
+    # NB: a bug in pyplot.fill_between() arises when selecting a region with
+    # set_xlim() in a large plot (e.g. > 1000), causing the shaded area to
+    # be plotted even though it's outside the selected region. As a workaround,
+    # here I slice the plot to fit the selected region.
+    sl = slice(max(1, res_i-2), min(res_f+2, upper_lim+1))
+    ax2.fill_between(x_resids[sl], min_p[sl], max_p[sl], alpha=0.5,
                      edgecolor='salmon', facecolor='salmon')
     # plot average profile for other predictions, if available
     if extra_plot is not None:
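For clarity, a minimal standalone sketch of the `fill_between()` workaround adopted above: only the slice of the profile inside the requested residue window is shaded, instead of shading the full profile and cropping it with `set_xlim()`. The array sizes, residue window, and output filename are all illustrative:

```python
import numpy as np
import matplotlib.pyplot as plt

upper_lim = 2000                         # hypothetical protein length
x_resids = np.arange(1, upper_lim + 1)
min_p = np.random.rand(upper_lim) * 0.4  # toy lower/upper profile bounds
max_p = min_p + 0.3

res_i, res_f = 950, 1050                 # selected residue window
sl = slice(max(1, res_i - 2), min(res_f + 2, upper_lim + 1))

fig, ax2 = plt.subplots()
ax2.fill_between(x_resids[sl], min_p[sl], max_p[sl], alpha=0.5,
                 edgecolor='salmon', facecolor='salmon')
ax2.set_xlim(res_i, res_f)
fig.savefig('pathogenicity_profile_window.png')
```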
diff --git a/rhapsody/train/RFtraining.py b/rhapsody/train/RFtraining.py
index 42149dc..9b28869 100644
--- a/rhapsody/train/RFtraining.py
+++ b/rhapsody/train/RFtraining.py
@@ -12,6 +12,7 @@
 from sklearn.metrics import roc_curve, roc_auc_score
 from sklearn.metrics import precision_recall_curve, average_precision_score
 from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support
+from sklearn.utils import resample
 from ..utils.settings import DEFAULT_FEATSETS, getDefaultTrainingDataset
 from .figures import print_pred_distrib_figure, print_path_prob_figure
 from .figures import print_ROC_figure, print_feat_imp_figure
@@ -27,46 +28,93 @@
            'extendDefaultTrainingDataset']
 
 
-def calcScoreMetrics(y_test, y_pred):
-    # compute ROC and AUROC
-    fpr, tpr, roc_thr = roc_curve(y_test, y_pred)
-    roc = {'FPR': fpr, 'TPR': tpr, 'thresholds': roc_thr}
-    auroc = roc_auc_score(y_test, y_pred)
-    # compute optimal cutoff J (argmax of Youden's index)
-    diff = np.array([y-x for x, y in zip(fpr, tpr)])
-    Jopt = roc_thr[(-diff).argsort()][0]
-    # compute Precision-Recall curve and AUPRC
-    pre, rec, prc_thr = precision_recall_curve(y_test, y_pred)
-    prc = {'precision': pre, 'recall': rec, 'thresholds': prc_thr}
-    auprc = average_precision_score(y_test, y_pred)
-    output = {
-        'ROC': roc,
-        'AUROC': auroc,
-        'optimal cutoff': Jopt,
-        'PRC': prc,
-        'AUPRC': auprc
-    }
+def calcScoreMetrics(y_test, y_pred, bootstrap=0, **resample_kwargs):
+    '''Compute accuracy metrics of continuous values (optionally bootstrapped)
+    '''
+    def _calcScoreMetrics(y_test, y_pred):
+        # compute ROC and AUROC
+        fpr, tpr, roc_thr = roc_curve(y_test, y_pred)
+        roc = {'FPR': fpr, 'TPR': tpr, 'thresholds': roc_thr}
+        auroc = roc_auc_score(y_test, y_pred)
+        # compute optimal cutoff J (argmax of Youden's index)
+        diff = np.array([y-x for x, y in zip(fpr, tpr)])
+        Jopt = roc_thr[(-diff).argsort()][0]
+        # compute Precision-Recall curve and AUPRC
+        pre, rec, prc_thr = precision_recall_curve(y_test, y_pred)
+        prc = {'precision': pre, 'recall': rec, 'thresholds': prc_thr}
+        auprc = average_precision_score(y_test, y_pred)
+        return {
+            'ROC': roc,
+            'AUROC': auroc,
+            'optimal cutoff': Jopt,
+            'PRC': prc,
+            'AUPRC': auprc
+        }
+
+    if bootstrap < 2:
+        output = _calcScoreMetrics(y_test, y_pred)
+    else:
+        # apply bootstrap
+        outputs = []
+        for i in range(bootstrap):
+            yy_test, yy_pred = resample(y_test, y_pred, **resample_kwargs)
+            outputs.append(_calcScoreMetrics(yy_test, yy_pred))
+        # compute mean and standard deviation of metrics
+        output = {}
+        for metric in ['AUROC', 'optimal cutoff', 'AUPRC']:
+            v = [d[metric] for d in outputs]
+            output[f'mean {metric}'] = np.mean(v)
+            output[f'{metric} std'] = np.std(v)
+        # compute average ROC
+        mean_fpr = np.linspace(0, 1, len(y_pred))
+        mean_tpr = 0.0
+        for d in outputs:
+            mean_tpr += np.interp(mean_fpr, d['ROC']['FPR'], d['ROC']['TPR'])
+        mean_tpr /= bootstrap
+        mean_tpr[0] = 0.0
+        mean_tpr[-1] = 1.0
+        output['mean ROC'] = {'FPR': mean_fpr, 'TPR': mean_tpr}
+
     return output
 
 
-def calcClassMetrics(y_test, y_pred):
-    mcc = matthews_corrcoef(y_test, y_pred)
-    pre, rec, f1s, sup = precision_recall_fscore_support(
-        y_test, y_pred, labels=[0, 1])
-    avg_pre, avg_rec, avg_f1s, _ = precision_recall_fscore_support(
-        y_test, y_pred, average='weighted')
-    output = {
-        'MCC': mcc,
-        'precision (0)': pre[0],
-        'recall (0)': rec[0],
-        'F1 score (0)': f1s[0],
-        'precision (1)': pre[1],
-        'recall (1)': rec[1],
-        'F1 score (1)': f1s[1],
-        'precision': avg_pre,
-        'recall': avg_rec,
-        'F1 score': avg_f1s
-    }
+def calcClassMetrics(y_test, y_pred, bootstrap=0, **resample_kwargs):
+    '''Compute accuracy metrics of binary labels (optionally bootstrapped)
+    '''
+    def _calcClassMetrics(y_test, y_pred):
+        mcc = matthews_corrcoef(y_test, y_pred)
+        pre, rec, f1s, sup = precision_recall_fscore_support(
+            y_test, y_pred, labels=[0, 1])
+        avg_pre, avg_rec, avg_f1s, _ = precision_recall_fscore_support(
+            y_test, y_pred, average='weighted')
+        return {
+            'MCC': mcc,
+            'precision (0)': pre[0],
+            'recall (0)': rec[0],
+            'F1 score (0)': f1s[0],
+            'precision (1)': pre[1],
+            'recall (1)': rec[1],
+            'F1 score (1)': f1s[1],
+            'precision': avg_pre,
+            'recall': avg_rec,
+            'F1 score': avg_f1s
+        }
+
+    if bootstrap < 2:
+        output = _calcClassMetrics(y_test, y_pred)
+    else:
+        # apply bootstrap
+        outputs = []
+        for i in range(bootstrap):
+            yy_test, yy_pred = resample(y_test, y_pred, **resample_kwargs)
+            outputs.append(_calcClassMetrics(yy_test, yy_pred))
+        # compute mean and standard deviation of metrics
+        output = {}
+        for metric in outputs[0].keys():
+            v = [d[metric] for d in outputs]
+            output[f'mean {metric}'] = np.mean(v)
+            output[f'{metric} std'] = np.std(v)
+
     return output
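A usage sketch for the bootstrapped metrics above, with synthetic labels and scores. The import path is assumed from the file location, and any extra keyword arguments (e.g. sklearn's `n_samples` or `stratify`) are simply forwarded to `sklearn.utils.resample`:

```python
import numpy as np
from rhapsody.train.RFtraining import calcScoreMetrics, calcClassMetrics

rng = np.random.RandomState(0)
y_true = rng.randint(0, 2, size=500)
y_score = np.clip(0.6*y_true + 0.5*rng.rand(500), 0, 1)  # toy predictions

# bootstrap < 2 (default): single-shot metrics, same keys as before
plain = calcScoreMetrics(y_true, y_score)

# bootstrap >= 2: means/stds over resampled replicas ('mean AUROC',
# 'AUROC std', ...) plus an averaged ROC curve under 'mean ROC'
boot = calcScoreMetrics(y_true, y_score, bootstrap=100)

# binary-label metrics, bootstrapped the same way
y_label = (y_score > boot['mean optimal cutoff']).astype(int)
cls = calcClassMetrics(y_true, y_label, bootstrap=100)
```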
" - "Please specify another name.") + f"A folder at default path '{working_dir}' already " + "exists. If it contains configuration files from a " + "previous installation that you wish to recover, please " + f"run: initialSetup('{working_dir}') \n" + "Otherwise, please specify another location.") else: os.mkdir(working_dir) pd.LOGGER.info(f'Default working directory set: {working_dir}')