From 050ea7b1a4f8d69f293ac6d6a790eabc69f0300f Mon Sep 17 00:00:00 2001
From: luponzo86 <ponzoniluca@gmail.com>
Date: Thu, 14 Nov 2019 12:46:35 -0800
Subject: [PATCH] add tqdm progress bars to main loops

---
 rhapsody/features/PDB.py     |  40 +++--
 rhapsody/features/Pfam.py    |  25 +++-
 rhapsody/features/Uniprot.py |  34 ++++-
 rhapsody/predict/core.py     |  24 ++-
 rhapsody/predict/main.py     |  10 +-
 rhapsody/utils/misc.py       | 283 +++++++++++++++++++++++++++++++++++
 6 files changed, 388 insertions(+), 28 deletions(-)
 create mode 100644 rhapsody/utils/misc.py

diff --git a/rhapsody/features/PDB.py b/rhapsody/features/PDB.py
index 9000783..4d8b27e 100644
--- a/rhapsody/features/PDB.py
+++ b/rhapsody/features/PDB.py
@@ -3,16 +3,18 @@
 PDB-based structural and dynamical features in a single place, and a
 function for using the latter on a list of PDB SAV coordinates."""
 
+import numpy as np
+import pickle
+import datetime
+import os
+from tqdm import tqdm
 from prody import Atomic, parsePDB, writePDB, LOGGER, SETTINGS
 from prody import GNM, ANM, calcSqFlucts
 from prody import calcPerturbResponse, calcMechStiff
 # from prody import calcMBS
 from prody import reduceModel, sliceModel
 from prody import execDSSP, parseDSSP
-import numpy as np
-import pickle
-import datetime
-import os
+
 
 __all__ = ['STR_FEATS', 'DYN_FEATS', 'PDB_FEATS',
            'PDBfeatures', 'calcPDBfeatures']
@@ -646,7 +648,7 @@ def calcSelFeatures(self, chain='all', resid=None, sel_feats=None):
 
 
 def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None,
-                    refresh=False):
+                    refresh=False, status_file=None, status_prefix=None):
     LOGGER.info('Computing structural and dynamical features '
                 'from PDB structures...')
     LOGGER.timeit('_calcPDBFeats')
@@ -665,24 +667,40 @@ def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None,
     else:
         # no need to sort when using a custom PDB or PDBID
         sorting_map = range(num_SAVs)
+    # define how to report progress
+    if status_prefix is None:
+        status_prefix = ''
+    bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
+    if status_file is not None:
+        status_file = open(status_file, 'w')
+        progress_bar = tqdm(
+            [(i, mapped_SAVs[i]) for i in sorting_map], file=status_file,
+            bar_format=bar_format+'\n')
+    else:
+        progress_bar = tqdm(
+            [(i, mapped_SAVs[i]) for i in sorting_map], bar_format=bar_format)
     cache = {'PDBID': None, 'chain': None, 'obj': None}
     count = 0
-    for indx, SAV in [(i, mapped_SAVs[i]) for i in sorting_map]:
+    for indx, SAV in progress_bar:
         count += 1
         if SAV['PDB size'] == 0:
             # SAV could not be mapped to PDB
             _features = np.nan
             SAV_coords = SAV['SAV coords']
-            LOGGER.info(f"[{count}/{num_SAVs}] SAV '{SAV_coords}' "
-                        "couldn't be mapped to PDB")
+            progress_msg = f"{status_prefix}No PDB for SAV '{SAV_coords}'"
         else:
             parsed_PDB_coords = SAV['PDB SAV coords'].split()
             PDBID, chID = parsed_PDB_coords[:2]
             resid = int(parsed_PDB_coords[2])
-            LOGGER.info("[{}/{}] Analizing mutation site {}:{} {}..."
-                        .format(count, num_SAVs, PDBID, chID, resid))
+            progress_msg = status_prefix + \
+                f'Analizing mutation site {PDBID}:{chID} {resid}'
             # chID == "?" stands for "empty space"
             chID = " " if chID == "?" else chID
+        # report progress
+        # LOGGER.info(f"[{count}/{num_SAVs}] {progress_msg}...")
+        progress_bar.set_description(progress_msg)
+        # compute PDB features, if possible
+        if SAV['PDB size'] != 0:
             if PDBID == cache['PDBID']:
                 # use PDBfeatures instance from previous iteration
                 obj = cache['obj']
@@ -728,4 +746,6 @@ def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None,
            and custom_PDB is None:
             cache['obj'].savePickle()
     LOGGER.report('PDB features have been computed in %.1fs.', '_calcPDBFeats')
+    if status_file:
+        os.remove(status_file.name)
     return features
diff --git a/rhapsody/features/Pfam.py b/rhapsody/features/Pfam.py
index 091b9ac..57117e5 100644
--- a/rhapsody/features/Pfam.py
+++ b/rhapsody/features/Pfam.py
@@ -3,7 +3,9 @@
 coevolution properties of an amino acid substitution from a Pfam
 multiple sequence alignment."""
 
+import os
 import numpy as np
+from tqdm import tqdm
 from prody import LOGGER
 from .Uniprot import UniprotMapping
 
@@ -38,7 +40,7 @@ def calcNormRank(array, i):
         return feats
 
 
-def calcPfamFeatures(SAVs):
+def calcPfamFeatures(SAVs, status_file=None, status_prefix=None):
     LOGGER.info('Computing sequence properties from Pfam domains...')
     LOGGER.timeit('_calcPfamFeats')
     # sort SAVs, so to group together those
@@ -49,14 +51,29 @@ def calcPfamFeatures(SAVs):
     num_SAVs = len(SAVs)
     feat_dtype = np.dtype([('entropy', 'f'), ('ranked_MI', 'f')])
     features = np.zeros(num_SAVs, dtype=feat_dtype)
+    # define how to report progress
+    if status_prefix is None:
+        status_prefix = ''
+    bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
+    if status_file is not None:
+        status_file = open(status_file, 'w')
+        progress_bar = tqdm(
+            [(i, SAVs[i]) for i in sorting_map], file=status_file,
+            bar_format=bar_format+'\n')
+    else:
+        progress_bar = tqdm(
+            [(i, SAVs[i]) for i in sorting_map], bar_format=bar_format)
     # map to Pfam domains using UniprotMapping class
     cache = {'acc': None, 'obj': None, 'warn': ''}
     count = 0
-    for indx, SAV in [(i, SAVs[i]) for i in sorting_map]:
+    for indx, SAV in progress_bar:
         count += 1
         acc, pos, aa1, aa2 = SAV.split()
         pos = int(pos)
-        LOGGER.info(f"[{count}/{num_SAVs}] Mapping SAV '{SAV}' to Pfam...")
+        # report progress
+        progress_msg = f"{status_prefix}Mapping SAV '{SAV}' to Pfam"
+        # LOGGER.info(f"[{count}/{num_SAVs}] {progress_msg}...")
+        progress_bar.set_description(progress_msg)
         # map to Pfam domains using 'UniprotMapping' class
         if acc == cache['acc']:
             # use object from previous iteration
@@ -102,4 +119,6 @@ def calcPfamFeatures(SAVs):
             cache['obj'].savePickle()
     LOGGER.report('SAVs have been mapped on Pfam domains and sequence '
                   'properties have been computed in %.1fs.', '_calcPfamFeats')
+    if status_file:
+        os.remove(status_file.name)
     return features
diff --git a/rhapsody/features/Uniprot.py b/rhapsody/features/Uniprot.py
index 79d8046..83b8dc0 100644
--- a/rhapsody/features/Uniprot.py
+++ b/rhapsody/features/Uniprot.py
@@ -9,6 +9,7 @@
 import numpy as np
 import prody as pd
 from prody import LOGGER, SETTINGS
+from tqdm import tqdm
 from Bio.pairwise2 import align as bioalign
 from Bio.pairwise2 import format_alignment
 from Bio.SubsMat import MatrixInfo as matlist
@@ -758,7 +759,8 @@ def calcEvolProperties(self, resid='all', refresh=False, folder=None,
         return {k: self.Pfam[k] for k in PF_list}
 
 
-def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False):
+def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False,
+                status_file=None, status_prefix=None):
     LOGGER.info('Mapping SAVs to PDB structures...')
     LOGGER.timeit('_map2PDB')
     # sort SAVs, so to group together those
@@ -766,20 +768,36 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False):
     accs = [s.split()[0] for s in SAV_coords]
     sorting_map = np.argsort(accs)
     # define a structured array
-    PDBmap_dtype = np.dtype([('orig. SAV coords', 'U25'),
-                             ('unique SAV coords', 'U25'),
-                             ('PDB SAV coords', 'U100'),
-                             ('PDB size', 'i')])
+    PDBmap_dtype = np.dtype([
+        ('orig. SAV coords', 'U25'),
+        ('unique SAV coords', 'U25'),
+        ('PDB SAV coords', 'U100'),
+        ('PDB size', 'i')])
     nSAVs = len(SAV_coords)
     mapped_SAVs = np.zeros(nSAVs, dtype=PDBmap_dtype)
+    # define how to report progress
+    if status_prefix is None:
+        status_prefix = ''
+    bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
+    if status_file is not None:
+        status_file = open(status_file, 'w')
+        progress_bar = tqdm(
+            [(i, SAV_coords[i]) for i in sorting_map], file=status_file,
+            bar_format=bar_format+'\n')
+    else:
+        progress_bar = tqdm(
+            [(i, SAV_coords[i]) for i in sorting_map], bar_format=bar_format)
     # map to PDB using Uniprot class
     cache = {'acc': None, 'obj': None}
     count = 0
-    for indx, SAV in [(i, SAV_coords[i]) for i in sorting_map]:
+    for indx, SAV in progress_bar:
         count += 1
         acc, pos, aa1, aa2 = SAV.split()
         pos = int(pos)
-        LOGGER.info(f"[{count}/{nSAVs}] Mapping SAV '{SAV}' to PDB...")
+        # report progress
+        progress_msg = f"{status_prefix}Mapping SAV '{SAV}' to PDB"
+        # LOGGER.info(f"[{count}/{nSAVs}] {progress_msg}...")
+        progress_bar.set_description(progress_msg)
         # map Uniprot to PDB chains
         if acc == cache['acc']:
             # use mapping from previous iteration
@@ -836,6 +854,8 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False):
     n = sum(mapped_SAVs['PDB size'] != 0)
     LOGGER.report(f'{n} out of {nSAVs} SAVs have been mapped to PDB in %.1fs.',
                   '_map2PDB')
+    if status_file:
+        os.remove(status_file.name)
     return mapped_SAVs
 
 
diff --git a/rhapsody/predict/core.py b/rhapsody/predict/core.py
index 008c17c..b3f03bf 100644
--- a/rhapsody/predict/core.py
+++ b/rhapsody/predict/core.py
@@ -22,7 +22,8 @@ class Rhapsody:
     EVmutation.
     """
 
-    def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True):
+    def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True,
+                 **kwargs):
         """ Initialize a Rhapsody object with a list of SAVs (optional).
 
         :arg query: Single Amino Acid Variants (SAVs) in Uniprot coordinates.
@@ -53,6 +54,14 @@ def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True):
 
         assert query_type in ('SAVs', 'PolyPhen2')
         assert isinstance(queryPolyPhen2, bool)
+        valid_kwargs = [
+            'status_file_Uniprot',
+            'status_file_PDB',
+            'status_file_Pfam',
+            'status_prefix_Uniprot',
+            'status_prefix_PDB',
+            'status_prefix_Pfam']
+        assert all([k in valid_kwargs for k in kwargs])
 
         # masked NumPy array that will contain all info abut SAVs
         self.data = None
@@ -103,6 +112,8 @@ def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True):
         self.classifier = None
         self.aux_classifier = None
         self.featSet = None
+        # options
+        self.options = kwargs
 
         if query is None:
             # a SAV list can be uploaded later with setSAVs()
@@ -283,6 +294,8 @@ def getUniprot2PDBmap(self, filename='rhapsody-Uniprot2PDB.txt',
             # compute mapping
             m = Uniprot.mapSAVs2PDB(
                 self.data['SAV coords'], custom_PDB=self.customPDB,
+                status_file=self.options.get('status_file_Uniprot'),
+                status_prefix=self.options.get('status_prefix_Uniprot'),
                 refresh=refresh)
             self.data['unique SAV coords'] = m['unique SAV coords']
             self.data['PDB SAV coords'] = m['PDB SAV coords']
@@ -403,7 +416,9 @@ def _calcFeatMatrix(self, refresh=False):
             # compute structural and dynamical features from a PDB structure
             f = PDB.calcPDBfeatures(
                 Uniprot2PDBmap, sel_feats=sel_PDBfeats,
-                custom_PDB=self.customPDB, refresh=refresh)
+                custom_PDB=self.customPDB, refresh=refresh,
+                status_file=self.options.get('status_file_PDB'),
+                status_prefix=self.options.get('status_prefix_PDB'))
             all_feats.append(f)
         if RHAPSODY_FEATS['BLOSUM'].intersection(self.featSet):
             # retrieve BLOSUM values
@@ -411,7 +426,10 @@ def _calcFeatMatrix(self, refresh=False):
             all_feats.append(f)
         if RHAPSODY_FEATS['Pfam'].intersection(self.featSet):
             # compute sequence properties from Pfam domains
-            f = Pfam.calcPfamFeatures(self.data['SAV coords'])
+            f = Pfam.calcPfamFeatures(
+                self.data['SAV coords'],
+                status_file=self.options.get('status_file_Pfam'),
+                status_prefix=self.options.get('status_prefix_Pfam'))
             all_feats.append(f)
         if RHAPSODY_FEATS['EVmut'].intersection(self.featSet):
             # recover EVmutation data
diff --git a/rhapsody/predict/main.py b/rhapsody/predict/main.py
index 3843f3e..22f93cd 100644
--- a/rhapsody/predict/main.py
+++ b/rhapsody/predict/main.py
@@ -12,7 +12,7 @@
 def rhapsody(query, query_type='SAVs',
              main_classifier=None, aux_classifier=None,
              custom_PDB=None, force_env=None,
-             refresh=False, log=True):
+             refresh=False, log=True, **kwargs):
     """Obtain Rhapsody pathogenicity predictions on a list of human missense
     variants ([ref]_)
 
@@ -77,7 +77,7 @@ def rhapsody(query, query_type='SAVs',
             aux_classifier = getDefaultClassifiers()['reduced']
 
     # initialize object that will contain all results and predictions
-    r = Rhapsody()
+    r = Rhapsody(**kwargs)
 
     # import classifiers and feature set from pickle
     r.importClassifiers(main_classifier, aux_classifier, force_env=force_env)
@@ -99,9 +99,9 @@ def rhapsody(query, query_type='SAVs',
     r.printPredictions()
     if aux_classifier is not None:
         # print both 'full' and 'reduced' predictions in a more detailed format
-        r.printPredictions(classifier="both",
-                           PolyPhen2=False, EVmutation=False,
-                           filename='rhapsody-predictions-full_vs_reduced.txt')
+        r.printPredictions(
+            classifier="both", PolyPhen2=False, EVmutation=False,
+            filename='rhapsody-predictions-full_vs_reduced.txt')
 
     # save pickle
     r.savePickle()
diff --git a/rhapsody/utils/misc.py b/rhapsody/utils/misc.py
new file mode 100644
index 0000000..3df3632
--- /dev/null
+++ b/rhapsody/utils/misc.py
@@ -0,0 +1,283 @@
+# -*- coding: utf-8 -*-
+"""This module defines default configuration parameters and
+a function for the initial setup and training of Rhapsody."""
+
+import os
+import tarfile
+import pickle
+import urllib.request
+import shutil
+import sklearn
+import numpy as np
+import prody as pd
+import rhapsody as rd
+
+__all__ = ['DEFAULT_FEATSETS', 'initialSetup',
+           'getDefaultTrainingDataset', 'getDefaultClassifiers',
+           'importDefaultClassifier', 'delSettings', 'getSettings']
+
+USERHOME = os.getenv('USERPROFILE') or os.getenv('HOME') or './'
+DEFAULT_WORKING_DIR = os.path.join(USERHOME, 'rhapsody')  # setup fallback
+DEFAULT_EVMUT_DIR = os.path.join(DEFAULT_WORKING_DIR,
+                                 'EVmutation_mutation_effects')
+EVMUT_URL = 'https://marks.hms.harvard.edu/evmutation/data/effects.tar.gz'
+PACKAGE_DATA = os.path.join(rd.__path__[0], 'data.tar.gz')  # bundled archive
+TRAINING_DATASET = 'precomputed_features-ID_opt.npy'  # expected in PACKAGE_DATA
+DEFAULT_CLSF_DIR = f'default_classifiers-sklearn_v{sklearn.__version__}'
+DEFAULT_FEATSETS = {
+  'full':    ['wt_PSIC', 'Delta_PSIC', 'SASA', 'ANM_MSF-chain',
+              'ANM_effectiveness-chain', 'ANM_sensitivity-chain',
+              'stiffness-chain', 'entropy', 'ranked_MI', 'BLOSUM'],
+  'reduced': ['wt_PSIC', 'Delta_PSIC', 'SASA', 'ANM_MSF-chain',
+              'ANM_effectiveness-chain', 'ANM_sensitivity-chain',
+              'stiffness-chain', 'BLOSUM'],
+  'EVmut':   ['wt_PSIC', 'Delta_PSIC', 'SASA', 'ANM_MSF-chain',
+              'ANM_effectiveness-chain', 'ANM_sensitivity-chain',
+              'stiffness-chain', 'entropy', 'ranked_MI', 'BLOSUM',
+              'EVmut-DeltaE_epist'],
+}
+
+
+def initialSetup(working_dir=None, refresh=False, download_EVmutation=True):
+    """Function to be run right after installation for setting up the
+    environment and main parameters and for training the default classifiers.
+    By default, a working directory will be created in the user home directory
+    (:file:`~/rhapsody/`). Previous configuration data will be recovered.
+    Additional data from EVmutation website will be automatically downloaded
+    (~1.4GB).
+
+    :arg working_dir: path to a local folder
+    :type working_dir: str
+
+    :arg refresh: if **True**, previous trained classifiers will be deleted,
+        if found
+    :type refresh: bool
+
+    :arg download_EVmutation: if **True**, precomputed EVmutation scores will
+        be downloaded (recommended)
+    :type download_EVmutation: bool
+
+    :raises EnvironmentError: if *working_dir* is not an existing folder, or
+        if no folder is configured and the default one already exists
+    :raises IOError: if a pre-existing classifiers folder lacks a classifier
+    """
+
+    pd.LOGGER.info(f'You are running Rhapsody v{rd.__version__}')
+
+    # set working directory
+    if working_dir is None:
+        # check pre-existing configuration
+        old_dir = pd.SETTINGS.get('rhapsody_local_folder')
+        if isinstance(old_dir, str) and os.path.isdir(old_dir):
+            working_dir = old_dir
+            pd.LOGGER.info('Pre-existing working directory detected: '
+                           f'{working_dir}')
+        else:
+            # use default location and create folder if needed
+            working_dir = DEFAULT_WORKING_DIR
+            if os.path.isdir(working_dir):
+                raise EnvironmentError(
+                    f"A folder named '{working_dir}' already exists. "
+                    "Please specify another name.")
+            else:
+                os.mkdir(working_dir)
+                pd.LOGGER.info(f'Default working directory set: {working_dir}')
+    else:
+        working_dir = os.path.abspath(working_dir)
+        if os.path.isdir(working_dir):
+            pd.LOGGER.info(f'Working directory set: {working_dir}')
+        else:
+            raise EnvironmentError(f'Invalid working directory: {working_dir}')
+    pd.SETTINGS['rhapsody_local_folder'] = working_dir
+
+    # create pickles folder
+    folder = os.path.join(working_dir, 'pickles')
+    if not os.path.isdir(folder):
+        os.mkdir(folder)
+
+    # check for pre-existing folder containing trained classifiers
+    folder = os.path.join(working_dir, DEFAULT_CLSF_DIR)
+    training_dataset = None
+    if os.path.isdir(folder) and not refresh:
+        pd.LOGGER.info(f'Pre-existing classifiers found: {folder}')
+        # check for missing classifiers
+        for featset in DEFAULT_FEATSETS:
+            fname = os.path.join(folder, featset, 'trained_classifier.pkl')
+            if not os.path.isfile(fname):
+                raise IOError(f"Missing classifier: '{featset}'. Please "
+                              f'delete folder {folder} and rerun setup.')
+    else:
+        # delete old classifiers and train new ones
+        if os.path.isdir(folder):
+            shutil.rmtree(folder)
+        os.mkdir(folder)
+        pd.LOGGER.info(f'Classifiers folder created: {folder}')
+        # delete EVmutation metrics as well, that must be updated
+        pd.SETTINGS.pop('EVmutation_metrics')
+        # import training dataset included with package
+        training_dataset = getDefaultTrainingDataset()
+        info = {
+            'size': len(training_dataset),
+            'fields': training_dataset.dtype.names
+        }
+        pd.SETTINGS['rhapsody_training_dataset'] = info
+        # train new default classifiers
+        pd.LOGGER.info('')
+        for name, featset in DEFAULT_FEATSETS.items():
+            clsf_folder = os.path.join(folder, name)
+            os.mkdir(clsf_folder)
+            logfile = os.path.join(clsf_folder, 'RF_training.log')
+            # run training procedure
+            pd.LOGGER.info(f'Training {name} classifier...')
+            pd.LOGGER.start(logfile)
+            fields = ['SAV_coords', 'true_label'] + featset
+            rd.trainRFclassifier(training_dataset[fields])
+            # move trained classifier and figures into folder
+            output_files = ['predictions_distribution.png',
+                            'pathogenicity_prob.png', 'ROC.png',
+                            'feat_importances.png', 'trained_classifier.pkl']
+            for file in output_files:
+                shutil.move(file, clsf_folder)
+            pd.LOGGER.close(logfile)
+        pd.LOGGER.info('')
+
+    # check EVmutation metrics
+    metrics = pd.SETTINGS.get('EVmutation_metrics', default={})
+    if 'AUROC' in metrics:
+        pd.LOGGER.info('Pre-existing EVmutation metrics found.')
+    else:
+        # compute EVmutation metrics from included training dataset
+        if training_dataset is None:
+            training_dataset = getDefaultTrainingDataset()
+        if 'EVmut-DeltaE_epist' not in training_dataset.dtype.names:
+            pd.SETTINGS['EVmutation_metrics'] = {}
+            pd.LOGGER.warn('Unable to compute EVmutation metrics: '
+                           'precomputed scores not found.')
+        else:
+            sel = ~np.isnan(training_dataset['EVmut-DeltaE_epist'])
+            # NB: EVmutation score and pathogenicity are anti-correlated
+            true_labels = training_dataset['true_label'][sel]
+            EVmut_predictor = -training_dataset['EVmut-DeltaE_epist'][sel]
+            metrics = rd.calcScoreMetrics(true_labels, EVmut_predictor)
+            pd.SETTINGS['EVmutation_metrics'] = metrics
+            pd.LOGGER.info('EVmutation metrics computed.')
+
+    # fetch EVmutation precomputed data, if needed
+    folder = pd.SETTINGS.get('EVmutation_local_folder')
+    if isinstance(folder, str) and os.path.isdir(folder):
+        pd.LOGGER.info(f'EVmutation folder found: {folder}')
+    else:
+        folder = DEFAULT_EVMUT_DIR
+        if os.path.isdir(DEFAULT_EVMUT_DIR):
+            pd.LOGGER.info(f'EVmutation folder found: {folder}')
+        elif download_EVmutation:
+            pd.LOGGER.info('Downloading EVmutation data...')
+            # download tar.gz file and save it locally
+            tgz = os.path.join(working_dir, 'effects.tar.gz')
+            with urllib.request.urlopen(EVMUT_URL) as r, open(tgz, 'wb') as f:
+                shutil.copyfileobj(r, f)
+            # extract archive; the context manager closes it even on failure
+            # NOTE(review): members of the downloaded archive are not vetted
+            with tarfile.open(tgz, "r:gz") as tar:
+                tar.extractall(path=folder)
+            os.remove(tgz)
+            pd.LOGGER.info(f'EVmutation folder set: {folder}')
+        else:
+            folder = None
+            msg = ('For full functionality, please consider downloading '
+                   f'EVmutation data from {EVMUT_URL} and then set up the '
+                   'relative path in the configuration file.')
+            pd.LOGGER.warn(msg)
+    pd.SETTINGS['EVmutation_local_folder'] = folder
+
+    # check if DSSP is installed
+    which = pd.utilities.which
+    if which('dssp') is None and which('mkdssp') is None:
+        msg = ('For full functionality, please consider installing DSSP, '
+               'for instance by typing in a Linux terminal: '
+               "'sudo apt install dssp'")
+        pd.LOGGER.warn(msg)
+    else:
+        pd.LOGGER.info('DSSP is installed on the system.')
+
+    pd.SETTINGS.save()
+    pd.LOGGER.info('Setup complete.')
+
+
+def getDefaultTrainingDataset():
+    """Load the training dataset shipped with the package (data.tar.gz)."""
+    working_dir = pd.SETTINGS.get('rhapsody_local_folder')
+    # context manager closes the archive even if extraction fails
+    with tarfile.open(PACKAGE_DATA, "r:gz") as tar:
+        tar.extractall(path=working_dir)
+    fname = os.path.join(working_dir, TRAINING_DATASET)
+    training_dataset = np.load(fname)
+    os.remove(fname)
+    return training_dataset
+
+
+def getDefaultClassifiers():
+    """Map each default feature set name (``'full'``, ``'reduced'`` and
+    ``'EVmut'``) to the path of its trained classifier pickle.
+
+    Raises :class:`IOError` if any of those pickle files is missing."""
+    root = os.path.join(
+        pd.SETTINGS.get('rhapsody_local_folder'), DEFAULT_CLSF_DIR)
+    paths = {}
+    for name in DEFAULT_FEATSETS:
+        paths[name] = os.path.join(root, name, 'trained_classifier.pkl')
+    # every default classifier must be present on disk
+    if all(os.path.isfile(p) for p in paths.values()):
+        return paths
+    raise IOError('One or more default classifiers are missing. '
+                  'Please rerun setup with initialSetup(refresh=True)')
+
+
+def importDefaultClassifier(version):
+    """Load the pickled object saved for one of the default classifiers.
+
+    :arg version: which classifier to load: 'full', 'reduced' or 'EVmut'
+    :type version: str
+    """
+    assert version in ('full', 'reduced', 'EVmut')
+    pickle_path = getDefaultClassifiers()[version]
+    with open(pickle_path, 'rb') as handle:
+        return pickle.load(handle)
+
+
+def delSettings():
+    for entry in ['rhapsody_local_folder', 'rhapsody_training_dataset',
+                  'EVmutation_local_folder', 'EVmutation_metrics']:
+        pd.SETTINGS.pop(entry)  # forget entry; SETTINGS.save() not called here
+
+
+def getSettings(print=True):
+    """Gather the current Rhapsody configuration in a dictionary: the four
+    Rhapsody/EVmutation settings entries plus the path of each default
+    classifier (under the key ``'<name> classifier'``).
+
+    :arg print: if **True**, also log a summary through the ProDy logger
+    :type print: bool
+    """
+    setting_keys = ('rhapsody_local_folder', 'rhapsody_training_dataset',
+                    'EVmutation_local_folder', 'EVmutation_metrics')
+    config_dict = {key: pd.SETTINGS.get(key) for key in setting_keys}
+
+    def_clsfs = getDefaultClassifiers()
+    labels = [f'{name} classifier' for name in def_clsfs]
+    config_dict.update(zip(labels, def_clsfs.values()))
+
+    if not print:
+        return config_dict
+
+    # folders and classifier paths, with labels padded to a common width
+    shown = ['rhapsody_local_folder', 'EVmutation_local_folder'] + labels
+    for label in shown:
+        pd.LOGGER.info(f'{label:24}: {config_dict[label]}')
+    dataset = pd.SETTINGS['rhapsody_training_dataset']
+    pd.LOGGER.info('training dataset size   : {}'.format(dataset['size']))
+    has_metrics = 'AUROC' in pd.SETTINGS.get('EVmutation_metrics', {})
+    pd.LOGGER.info('EVmutation_metrics      : <computed>' if has_metrics
+                   else 'EVmutation_metrics      : <missing>')
+
+    return config_dict