update right before publication #12

Merged: 10 commits, Mar 7, 2020
3 changes: 2 additions & 1 deletion rhapsody/features/PolyPhen2.py
@@ -8,10 +8,11 @@
import requests
import datetime
import numpy as np
from prody import LOGGER, queryUniprot
from prody import LOGGER
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from math import log
from .Uniprot import queryUniprot

__author__ = "Luca Ponzoni"
__date__ = "December 2019"
56 changes: 42 additions & 14 deletions rhapsody/features/Uniprot.py
@@ -9,6 +9,7 @@
import numpy as np
import prody as pd
from prody import LOGGER, SETTINGS
from prody.utilities import openURL
from tqdm import tqdm
from Bio.pairwise2 import align as bioalign
from Bio.pairwise2 import format_alignment
@@ -20,7 +21,16 @@
__email__ = "[email protected]"
__status__ = "Production"

__all__ = ['UniprotMapping', 'mapSAVs2PDB', 'seqScanning', 'printSAVlist']
__all__ = ['queryUniprot', 'UniprotMapping', 'mapSAVs2PDB',
'seqScanning', 'printSAVlist']


def queryUniprot(*args, **kwargs):
"""
Redefine prody function to check for no internet connection
"""
_ = openURL('http://www.uniprot.org/')
return pd.queryUniprot(*args, **kwargs)


class UniprotMapping:
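
A minimal usage sketch for the new queryUniprot wrapper above (an assumption for illustration: the rhapsody package is importable and 'P17516' is just an example accession; the 'accession 0' key is the one used elsewhere in this module):

from rhapsody.features.Uniprot import queryUniprot

try:
    record = queryUniprot('P17516')
    print(record['accession 0'])
except Exception as err:
    # the openURL('http://www.uniprot.org/') probe fails fast when there is
    # no internet connection, so the error surfaces before the actual query
    print(f'Uniprot query failed: {err}')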
@@ -52,7 +62,7 @@ def refresh(self):
delete precomputed alignments.
"""
# import Uniprot record and official accession number
self.fullRecord = pd.queryUniprot(self.acc)
self.fullRecord = queryUniprot(self.acc)
self.uniq_acc = self.fullRecord['accession 0']
# import main sequence and PDB records
rec = self.fullRecord
@@ -393,7 +403,7 @@ def recoverPickle(self, filename=None, folder=None, days=30, **kwargs):
pickle_path = os.path.join(folder, filename)
if not os.path.isfile(pickle_path):
# import unique accession number
acc = pd.queryUniprot(self.acc)['accession 0']
acc = queryUniprot(self.acc)['accession 0']
filename = 'UniprotMap-' + acc + '.pkl'
pickle_path = os.path.join(folder, filename)
else:
@@ -865,26 +875,44 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False,
return mapped_SAVs


def seqScanning(Uniprot_coord):
'''Returns a list of SAVs. If the string 'Uniprot_coord' is just a Uniprot ID,
the list will contain all possible amino acid substitutions at all positions
in the sequence. If 'Uniprot_coord' also includes a specific position, the list
will only contain all possible amino acid variants at that position.
def seqScanning(Uniprot_coord, sequence=None):
'''Returns a list of SAVs. If the string 'Uniprot_coord' is just a
Uniprot ID, the list will contain all possible amino acid substitutions
at all positions in the sequence. If 'Uniprot_coord' also includes a
specific position, the list will only contain all possible amino acid
variants at that position. If 'sequence' is 'None' (default), the
sequence will be downloaded from Uniprot.
'''
assert isinstance(Uniprot_coord, str), "Must be a string."
coord = Uniprot_coord.strip().split()
coord = Uniprot_coord.upper().strip().split()
assert len(coord) < 3, "Invalid format. Examples: 'Q9BW27' or 'Q9BW27 10'."
Uniprot_record = pd.queryUniprot(coord[0])
sequence = Uniprot_record['sequence 0'].replace("\n", "")
aa_list = 'ACDEFGHIKLMNPQRSTVWY'
if sequence is None:
Uniprot_record = queryUniprot(coord[0])
sequence = Uniprot_record['sequence 0'].replace("\n", "")
else:
assert isinstance(sequence, str), "Must be a string."
sequence = sequence.upper()
assert set(sequence).issubset(aa_list), "Invalid list of amino acids."
if len(coord) == 1:
# user asks for full-sequence scanning
positions = range(len(sequence))
else:
positions = [int(coord[1]) - 1]
# user asks for single-site scanning
site = int(coord[1])
positions = [site - 1]
# if user provides only one amino acid as 'sequence', interpret it
# as the amino acid at the specified position
if len(sequence) == 1:
sequence = sequence*site
else:
assert len(sequence) >= site, ("Requested position is not found "
"in input sequence.")
SAV_list = []
acc = coord[0]
for i in positions:
wt_aa = sequence[i]
for aa in 'ACDEFGHIKLMNPQRSTVWY':
for aa in aa_list:
if aa == wt_aa:
continue
s = ' '.join([acc, str(i+1), wt_aa, aa])
@@ -900,5 +928,5 @@ def printSAVlist(input_SAVs, filename):
m = f'error in SAV {i}: '
assert isinstance(line, str), f'{m} not a string'
assert len(line) < 25, f'{m} too many characters'
print(line, file=f)
print(line.upper(), file=f)
return filename
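
A short usage sketch of the updated seqScanning/printSAVlist pair (hedged: assumes the package is installed; 'P17516 135' and the wild-type residue 'G' are taken from the example line format shown in core.py below):

from rhapsody.features.Uniprot import seqScanning, printSAVlist

# full saturation-mutagenesis list; the sequence is fetched from Uniprot
all_SAVs = seqScanning('P17516')

# single-site scanning with no network access: pass the wild-type residue
# (or the full sequence) through the new 'sequence' argument
site_SAVs = seqScanning('P17516 135', sequence='G')
printSAVlist(site_SAVs, 'P17516_135_SAVs.txt')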
24 changes: 17 additions & 7 deletions rhapsody/predict/core.py
@@ -145,21 +145,30 @@ def _isColSet(self, column):
assert self.data is not None, 'Data array not initialized.'
return self.data[column].count() != 0

def _isSaturationMutagenesis(self):
def _isSaturationMutagenesis(self, queryUniprot=False):
assert self._isColSet('SAV coords'), 'SAV list not set.'
if self.saturation_mutagenesis is None:
self.saturation_mutagenesis = False
try:
SAVs = self.getUniqueSAVcoords()
SAV_list = list(SAVs['unique SAV coords'])
acc = SAVs[0]['Uniprot ID']
acc = list(set(SAVs['Uniprot ID']))
if len(acc) != 1:
raise RuntimeError('Multiple accession numbers found')
else:
acc = acc[0]
pos = list(set(SAVs['position']))
if len(pos) == 1:
query = f'{acc} {pos[0]}'
else:
query = acc
generated_SAV_list = Uniprot.seqScanning(query)
if SAV_list == generated_SAV_list:
# generate target scanning list
if queryUniprot:
target_SAV_list = Uniprot.seqScanning(query)
else:
seq = ''.join(SAVs['wt. aa'][range(0, len(SAVs), 19)])
target_SAV_list = Uniprot.seqScanning(query, sequence=seq)
if SAV_list == target_SAV_list:
self.saturation_mutagenesis = True
else:
raise RuntimeError('Missing SAVs detected.')
@@ -184,18 +193,19 @@ def setSAVs(self, query):
if isfile(query):
# 'query' is a filename, with line format 'P17516 135 G E'
SAVs = np.loadtxt(query, dtype=SAV_dtype)
SAV_list = ['{} {} {} {}'.format(*s) for s in SAVs]
SAV_list = ['{} {} {} {}'.format(*s).upper() for s in SAVs]
elif len(query.split()) < 3:
# single Uniprot acc (+ pos), e.g. 'P17516' or 'P17516 135'
SAV_list = Uniprot.seqScanning(query)
self.saturation_mutagenesis = True
else:
# single SAV
SAV = np.array(query.split(), dtype=SAV_dtype)
SAV = np.array(query.upper().split(), dtype=SAV_dtype)
SAV_list = ['{} {} {} {}'.format(*SAV)]
else:
# 'query' is a list or tuple of SAV coordinates
SAVs = np.array([tuple(s.split()) for s in query], dtype=SAV_dtype)
SAVs = np.array([tuple(s.upper().split()) for s in query],
dtype=SAV_dtype)
SAV_list = ['{} {} {} {}'.format(*s) for s in SAVs]
# store SAV coordinates
numSAVs = len(SAV_list)
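
The non-query branch of _isSaturationMutagenesis above reconstructs the wild-type sequence from the SAV table itself: a saturation-mutagenesis list contains exactly 19 SAVs per residue, so sampling the 'wt. aa' column with a stride of 19 yields one wild-type residue per position. A standalone toy illustration of that indexing (not rhapsody code):

import numpy as np

# three residues ('G', 'A', 'V'), each repeated 19 times, as in a
# saturation-mutagenesis SAV table
wt_col = np.array([aa for aa in 'GAV' for _ in range(19)])
seq = ''.join(wt_col[range(0, len(wt_col), 19)])
assert seq == 'GAV'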
9 changes: 7 additions & 2 deletions rhapsody/predict/figures.py
@@ -203,7 +203,7 @@ def print_sat_mutagen_figure(filename, rhapsody_obj, res_interval=None,
ax0.grid(which='minor', color='w', linestyle='-', linewidth=.5)

# mutagenesis table (heatmap)
matplotlib.cm.coolwarm.set_bad(color='antiquewhite')
matplotlib.cm.coolwarm.set_bad(color='#F9E79F')
im = ax1.imshow(table_best[:, res_i-1:res_f], aspect='auto',
cmap='coolwarm', vmin=0, vmax=1)
axcb.figure.colorbar(im, cax=axcb)
@@ -221,7 +221,12 @@
# average pathogenicity profile
x_resids = np.arange(1, upper_lim+1)
# shading showing range of values
ax2.fill_between(x_resids, min_p, max_p, alpha=0.5,
# NB: a bug in pyplot.fill_between() arises when a region of a large plot
# (e.g. > 1000 points) is selected with set_xlim(): the shaded area is
# drawn even where it falls outside the selected region. As a workaround,
# the data are sliced to fit the selected region before plotting.
sl = slice(max(1, res_i-2), min(res_f+2, upper_lim+1))
ax2.fill_between(x_resids[sl], min_p[sl], max_p[sl], alpha=0.5,
edgecolor='salmon', facecolor='salmon')
# plot average profile for other predictions, if available
if extra_plot is not None:
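
A self-contained sketch of the same slicing workaround on toy data (matplotlib only; names and values are illustrative, not rhapsody output):

import numpy as np
import matplotlib.pyplot as plt

# toy profile over 2000 "residues"
x_resids = np.arange(1, 2001)
avg = np.sin(x_resids / 50)
min_p, max_p = avg - 0.2, avg + 0.2

# displayed interval; slice the data instead of relying on set_xlim() alone
res_i, res_f = 900, 1000
sl = slice(max(1, res_i - 2), min(res_f + 2, len(x_resids) + 1))

fig, ax = plt.subplots()
ax.fill_between(x_resids[sl], min_p[sl], max_p[sl], alpha=0.5,
                edgecolor='salmon', facecolor='salmon')
ax.set_xlim(res_i, res_f)
fig.savefig('fill_between_workaround.png')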
122 changes: 85 additions & 37 deletions rhapsody/train/RFtraining.py
@@ -12,6 +12,7 @@
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support
from sklearn.utils import resample
from ..utils.settings import DEFAULT_FEATSETS, getDefaultTrainingDataset
from .figures import print_pred_distrib_figure, print_path_prob_figure
from .figures import print_ROC_figure, print_feat_imp_figure
@@ -27,46 +28,93 @@
'extendDefaultTrainingDataset']


def calcScoreMetrics(y_test, y_pred):
# compute ROC and AUROC
fpr, tpr, roc_thr = roc_curve(y_test, y_pred)
roc = {'FPR': fpr, 'TPR': tpr, 'thresholds': roc_thr}
auroc = roc_auc_score(y_test, y_pred)
# compute optimal cutoff J (argmax of Youden's index)
diff = np.array([y-x for x, y in zip(fpr, tpr)])
Jopt = roc_thr[(-diff).argsort()][0]
# compute Precision-Recall curve and AUPRC
pre, rec, prc_thr = precision_recall_curve(y_test, y_pred)
prc = {'precision': pre, 'recall': rec, 'thresholds': prc_thr}
auprc = average_precision_score(y_test, y_pred)
output = {
'ROC': roc,
'AUROC': auroc,
'optimal cutoff': Jopt,
'PRC': prc,
'AUPRC': auprc
}
def calcScoreMetrics(y_test, y_pred, bootstrap=0, **resample_kwargs):
'''Compute accuracy metrics of continuous values (optionally bootstrapped)
'''
def _calcScoreMetrics(y_test, y_pred):
# compute ROC and AUROC
fpr, tpr, roc_thr = roc_curve(y_test, y_pred)
roc = {'FPR': fpr, 'TPR': tpr, 'thresholds': roc_thr}
auroc = roc_auc_score(y_test, y_pred)
# compute optimal cutoff J (argmax of Youden's index)
diff = np.array([y-x for x, y in zip(fpr, tpr)])
Jopt = roc_thr[(-diff).argsort()][0]
# compute Precision-Recall curve and AUPRC
pre, rec, prc_thr = precision_recall_curve(y_test, y_pred)
prc = {'precision': pre, 'recall': rec, 'thresholds': prc_thr}
auprc = average_precision_score(y_test, y_pred)
return {
'ROC': roc,
'AUROC': auroc,
'optimal cutoff': Jopt,
'PRC': prc,
'AUPRC': auprc
}

if bootstrap < 2:
output = _calcScoreMetrics(y_test, y_pred)
else:
# apply bootstrap
outputs = []
for i in range(bootstrap):
yy_test, yy_pred = resample(y_test, y_pred, **resample_kwargs)
outputs.append(_calcScoreMetrics(yy_test, yy_pred))
# compute mean and standard deviation of metrics
output = {}
for metric in ['AUROC', 'optimal cutoff', 'AUPRC']:
v = [d[metric] for d in outputs]
output[f'mean {metric}'] = np.mean(v)
output[f'{metric} std'] = np.std(v)
# compute average ROC
mean_fpr = np.linspace(0, 1, len(y_pred))
mean_tpr = 0.0
for d in outputs:
mean_tpr += np.interp(mean_fpr, d['ROC']['FPR'], d['ROC']['TPR'])
mean_tpr /= bootstrap
mean_tpr[0] = 0.0
mean_tpr[-1] = 1.0
output['mean ROC'] = {'FPR': mean_fpr, 'TPR': mean_tpr}

return output


def calcClassMetrics(y_test, y_pred):
mcc = matthews_corrcoef(y_test, y_pred)
pre, rec, f1s, sup = precision_recall_fscore_support(
y_test, y_pred, labels=[0, 1])
avg_pre, avg_rec, avg_f1s, _ = precision_recall_fscore_support(
y_test, y_pred, average='weighted')
output = {
'MCC': mcc,
'precision (0)': pre[0],
'recall (0)': rec[0],
'F1 score (0)': f1s[0],
'precision (1)': pre[1],
'recall (1)': rec[1],
'F1 score (1)': f1s[1],
'precision': avg_pre,
'recall': avg_rec,
'F1 score': avg_f1s
}
def calcClassMetrics(y_test, y_pred, bootstrap=0, **resample_kwargs):
'''Compute accuracy metrics of binary labels (optionally bootstrapped)
'''
def _calcClassMetrics(y_test, y_pred):
mcc = matthews_corrcoef(y_test, y_pred)
pre, rec, f1s, sup = precision_recall_fscore_support(
y_test, y_pred, labels=[0, 1])
avg_pre, avg_rec, avg_f1s, _ = precision_recall_fscore_support(
y_test, y_pred, average='weighted')
return {
'MCC': mcc,
'precision (0)': pre[0],
'recall (0)': rec[0],
'F1 score (0)': f1s[0],
'precision (1)': pre[1],
'recall (1)': rec[1],
'F1 score (1)': f1s[1],
'precision': avg_pre,
'recall': avg_rec,
'F1 score': avg_f1s
}

if bootstrap < 2:
output = _calcClassMetrics(y_test, y_pred)
else:
# apply bootstrap
outputs = []
for i in range(bootstrap):
yy_test, yy_pred = resample(y_test, y_pred, **resample_kwargs)
outputs.append(_calcClassMetrics(yy_test, yy_pred))
# compute mean and standard deviation of metrics
output = {}
for metric in outputs[0].keys():
v = [d[metric] for d in outputs]
output[f'mean {metric}'] = np.mean(v)
output[f'{metric} std'] = np.std(v)

return output


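
A hedged usage sketch for the bootstrapped metrics above (synthetic labels and scores; any extra keyword arguments, e.g. stratify, are forwarded to sklearn.utils.resample):

import numpy as np
from rhapsody.train.RFtraining import calcScoreMetrics, calcClassMetrics

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=200)
y_score = np.clip(0.6 * y_true + rng.normal(0.2, 0.3, size=200), 0, 1)

single = calcScoreMetrics(y_true, y_score)               # point estimates
boot = calcScoreMetrics(y_true, y_score, bootstrap=100)  # mean +/- std
print(single['AUROC'], boot['mean AUROC'], boot['AUROC std'])
print(boot['mean ROC']['FPR'][:5], boot['mean ROC']['TPR'][:5])

cls_boot = calcClassMetrics(y_true, (y_score > 0.5).astype(int), bootstrap=100)
print(cls_boot['mean MCC'], cls_boot['MCC std'])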
7 changes: 5 additions & 2 deletions rhapsody/utils/settings.py
@@ -79,8 +79,11 @@ def initialSetup(working_dir=None, refresh=False, download_EVmutation=True):
working_dir = DEFAULT_WORKING_DIR
if os.path.isdir(working_dir):
raise EnvironmentError(
f"A folder named '{working_dir}' already exists. "
"Please specify another name.")
f"A folder at default path '{working_dir}' already "
"exists. If it contains configuration files from a "
"previous installation that you wish to recover, please "
f"run: initialSetup('{working_dir}') \n"
"Otherwise, please specify another location.")
else:
os.mkdir(working_dir)
pd.LOGGER.info(f'Default working directory set: {working_dir}')
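
A short usage sketch matching the new message (the path is illustrative; recovery of an existing folder follows the guidance in the error text above, not code shown in this diff):

from rhapsody.utils.settings import initialSetup

# first-time setup in a custom location
initialSetup('/home/user/rhapsody_workdir')

# if that folder already holds configuration files from a previous
# installation, the same explicit call recovers them instead of raising,
# as the new error message above instructs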