From 050ea7b1a4f8d69f293ac6d6a790eabc69f0300f Mon Sep 17 00:00:00 2001 From: luponzo86 Date: Thu, 14 Nov 2019 12:46:35 -0800 Subject: [PATCH] add tqdm progress bars to main loops --- rhapsody/features/PDB.py | 40 +++-- rhapsody/features/Pfam.py | 25 +++- rhapsody/features/Uniprot.py | 34 ++++- rhapsody/predict/core.py | 24 ++- rhapsody/predict/main.py | 10 +- rhapsody/utils/misc.py | 283 +++++++++++++++++++++++++++++++++++ 6 files changed, 388 insertions(+), 28 deletions(-) create mode 100644 rhapsody/utils/misc.py diff --git a/rhapsody/features/PDB.py b/rhapsody/features/PDB.py index 9000783..4d8b27e 100644 --- a/rhapsody/features/PDB.py +++ b/rhapsody/features/PDB.py @@ -3,16 +3,18 @@ PDB-based structural and dynamical features in a single place, and a function for using the latter on a list of PDB SAV coordinates.""" +import numpy as np +import pickle +import datetime +import os +from tqdm import tqdm from prody import Atomic, parsePDB, writePDB, LOGGER, SETTINGS from prody import GNM, ANM, calcSqFlucts from prody import calcPerturbResponse, calcMechStiff # from prody import calcMBS from prody import reduceModel, sliceModel from prody import execDSSP, parseDSSP -import numpy as np -import pickle -import datetime -import os + __all__ = ['STR_FEATS', 'DYN_FEATS', 'PDB_FEATS', 'PDBfeatures', 'calcPDBfeatures'] @@ -646,7 +648,7 @@ def calcSelFeatures(self, chain='all', resid=None, sel_feats=None): def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None, - refresh=False): + refresh=False, status_file=None, status_prefix=None): LOGGER.info('Computing structural and dynamical features ' 'from PDB structures...') LOGGER.timeit('_calcPDBFeats') @@ -665,24 +667,40 @@ def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None, else: # no need to sort when using a custom PDB or PDBID sorting_map = range(num_SAVs) + # define how to report progress + if status_prefix is None: + status_prefix = '' + bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' + if status_file is not None: + status_file = open(status_file, 'w') + progress_bar = tqdm( + [(i, mapped_SAVs[i]) for i in sorting_map], file=status_file, + bar_format=bar_format+'\n') + else: + progress_bar = tqdm( + [(i, mapped_SAVs[i]) for i in sorting_map], bar_format=bar_format) cache = {'PDBID': None, 'chain': None, 'obj': None} count = 0 - for indx, SAV in [(i, mapped_SAVs[i]) for i in sorting_map]: + for indx, SAV in progress_bar: count += 1 if SAV['PDB size'] == 0: # SAV could not be mapped to PDB _features = np.nan SAV_coords = SAV['SAV coords'] - LOGGER.info(f"[{count}/{num_SAVs}] SAV '{SAV_coords}' " - "couldn't be mapped to PDB") + progress_msg = f"{status_prefix}No PDB for SAV '{SAV_coords}'" else: parsed_PDB_coords = SAV['PDB SAV coords'].split() PDBID, chID = parsed_PDB_coords[:2] resid = int(parsed_PDB_coords[2]) - LOGGER.info("[{}/{}] Analizing mutation site {}:{} {}..." - .format(count, num_SAVs, PDBID, chID, resid)) + progress_msg = status_prefix + \ + f'Analizing mutation site {PDBID}:{chID} {resid}' # chID == "?" stands for "empty space" chID = " " if chID == "?" else chID + # report progress + # LOGGER.info(f"[{count}/{num_SAVs}] {progress_msg}...") + progress_bar.set_description(progress_msg) + # compute PDB features, if possible + if SAV['PDB size'] != 0: if PDBID == cache['PDBID']: # use PDBfeatures instance from previous iteration obj = cache['obj'] @@ -728,4 +746,6 @@ def calcPDBfeatures(mapped_SAVs, sel_feats=None, custom_PDB=None, and custom_PDB is None: cache['obj'].savePickle() LOGGER.report('PDB features have been computed in %.1fs.', '_calcPDBFeats') + if status_file: + os.remove(status_file.name) return features diff --git a/rhapsody/features/Pfam.py b/rhapsody/features/Pfam.py index 091b9ac..57117e5 100644 --- a/rhapsody/features/Pfam.py +++ b/rhapsody/features/Pfam.py @@ -3,7 +3,9 @@ coevolution properties of an amino acid substitution from a Pfam multiple sequence alignment.""" +import os import numpy as np +from tqdm import tqdm from prody import LOGGER from .Uniprot import UniprotMapping @@ -38,7 +40,7 @@ def calcNormRank(array, i): return feats -def calcPfamFeatures(SAVs): +def calcPfamFeatures(SAVs, status_file=None, status_prefix=None): LOGGER.info('Computing sequence properties from Pfam domains...') LOGGER.timeit('_calcPfamFeats') # sort SAVs, so to group together those @@ -49,14 +51,29 @@ def calcPfamFeatures(SAVs): num_SAVs = len(SAVs) feat_dtype = np.dtype([('entropy', 'f'), ('ranked_MI', 'f')]) features = np.zeros(num_SAVs, dtype=feat_dtype) + # define how to report progress + if status_prefix is None: + status_prefix = '' + bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' + if status_file is not None: + status_file = open(status_file, 'w') + progress_bar = tqdm( + [(i, SAVs[i]) for i in sorting_map], file=status_file, + bar_format=bar_format+'\n') + else: + progress_bar = tqdm( + [(i, SAVs[i]) for i in sorting_map], bar_format=bar_format) # map to Pfam domains using UniprotMapping class cache = {'acc': None, 'obj': None, 'warn': ''} count = 0 - for indx, SAV in [(i, SAVs[i]) for i in sorting_map]: + for indx, SAV in progress_bar: count += 1 acc, pos, aa1, aa2 = SAV.split() pos = int(pos) - LOGGER.info(f"[{count}/{num_SAVs}] Mapping SAV '{SAV}' to Pfam...") + # report progress + progress_msg = f"{status_prefix}Mapping SAV '{SAV}' to Pfam" + # LOGGER.info(f"[{count}/{num_SAVs}] {progress_msg}...") + progress_bar.set_description(progress_msg) # map to Pfam domains using 'UniprotMapping' class if acc == cache['acc']: # use object from previous iteration @@ -102,4 +119,6 @@ def calcPfamFeatures(SAVs): cache['obj'].savePickle() LOGGER.report('SAVs have been mapped on Pfam domains and sequence ' 'properties have been computed in %.1fs.', '_calcPfamFeats') + if status_file: + os.remove(status_file.name) return features diff --git a/rhapsody/features/Uniprot.py b/rhapsody/features/Uniprot.py index 79d8046..83b8dc0 100644 --- a/rhapsody/features/Uniprot.py +++ b/rhapsody/features/Uniprot.py @@ -9,6 +9,7 @@ import numpy as np import prody as pd from prody import LOGGER, SETTINGS +from tqdm import tqdm from Bio.pairwise2 import align as bioalign from Bio.pairwise2 import format_alignment from Bio.SubsMat import MatrixInfo as matlist @@ -758,7 +759,8 @@ def calcEvolProperties(self, resid='all', refresh=False, folder=None, return {k: self.Pfam[k] for k in PF_list} -def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False): +def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False, + status_file=None, status_prefix=None): LOGGER.info('Mapping SAVs to PDB structures...') LOGGER.timeit('_map2PDB') # sort SAVs, so to group together those @@ -766,20 +768,36 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False): accs = [s.split()[0] for s in SAV_coords] sorting_map = np.argsort(accs) # define a structured array - PDBmap_dtype = np.dtype([('orig. SAV coords', 'U25'), - ('unique SAV coords', 'U25'), - ('PDB SAV coords', 'U100'), - ('PDB size', 'i')]) + PDBmap_dtype = np.dtype([ + ('orig. SAV coords', 'U25'), + ('unique SAV coords', 'U25'), + ('PDB SAV coords', 'U100'), + ('PDB size', 'i')]) nSAVs = len(SAV_coords) mapped_SAVs = np.zeros(nSAVs, dtype=PDBmap_dtype) + # define how to report progress + if status_prefix is None: + status_prefix = '' + bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' + if status_file is not None: + status_file = open(status_file, 'w') + progress_bar = tqdm( + [(i, SAV_coords[i]) for i in sorting_map], file=status_file, + bar_format=bar_format+'\n') + else: + progress_bar = tqdm( + [(i, SAV_coords[i]) for i in sorting_map], bar_format=bar_format) # map to PDB using Uniprot class cache = {'acc': None, 'obj': None} count = 0 - for indx, SAV in [(i, SAV_coords[i]) for i in sorting_map]: + for indx, SAV in progress_bar: count += 1 acc, pos, aa1, aa2 = SAV.split() pos = int(pos) - LOGGER.info(f"[{count}/{nSAVs}] Mapping SAV '{SAV}' to PDB...") + # report progress + progress_msg = f"{status_prefix}Mapping SAV '{SAV}' to PDB" + # LOGGER.info(f"[{count}/{nSAVs}] {progress_msg}...") + progress_bar.set_description(progress_msg) # map Uniprot to PDB chains if acc == cache['acc']: # use mapping from previous iteration @@ -836,6 +854,8 @@ def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False): n = sum(mapped_SAVs['PDB size'] != 0) LOGGER.report(f'{n} out of {nSAVs} SAVs have been mapped to PDB in %.1fs.', '_map2PDB') + if status_file: + os.remove(status_file.name) return mapped_SAVs diff --git a/rhapsody/predict/core.py b/rhapsody/predict/core.py index 008c17c..b3f03bf 100644 --- a/rhapsody/predict/core.py +++ b/rhapsody/predict/core.py @@ -22,7 +22,8 @@ class Rhapsody: EVmutation. """ - def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True): + def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True, + **kwargs): """ Initialize a Rhapsody object with a list of SAVs (optional). :arg query: Single Amino Acid Variants (SAVs) in Uniprot coordinates. @@ -53,6 +54,14 @@ def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True): assert query_type in ('SAVs', 'PolyPhen2') assert isinstance(queryPolyPhen2, bool) + valid_kwargs = [ + 'status_file_Uniprot', + 'status_file_PDB', + 'status_file_Pfam', + 'status_prefix_Uniprot', + 'status_prefix_PDB', + 'status_prefix_Pfam'] + assert all([k in valid_kwargs for k in kwargs]) # masked NumPy array that will contain all info abut SAVs self.data = None @@ -103,6 +112,8 @@ def __init__(self, query=None, query_type='SAVs', queryPolyPhen2=True): self.classifier = None self.aux_classifier = None self.featSet = None + # options + self.options = kwargs if query is None: # a SAV list can be uploaded later with setSAVs() @@ -283,6 +294,8 @@ def getUniprot2PDBmap(self, filename='rhapsody-Uniprot2PDB.txt', # compute mapping m = Uniprot.mapSAVs2PDB( self.data['SAV coords'], custom_PDB=self.customPDB, + status_file=self.options.get('status_file_Uniprot'), + status_prefix=self.options.get('status_prefix_Uniprot'), refresh=refresh) self.data['unique SAV coords'] = m['unique SAV coords'] self.data['PDB SAV coords'] = m['PDB SAV coords'] @@ -403,7 +416,9 @@ def _calcFeatMatrix(self, refresh=False): # compute structural and dynamical features from a PDB structure f = PDB.calcPDBfeatures( Uniprot2PDBmap, sel_feats=sel_PDBfeats, - custom_PDB=self.customPDB, refresh=refresh) + custom_PDB=self.customPDB, refresh=refresh, + status_file=self.options.get('status_file_PDB'), + status_prefix=self.options.get('status_prefix_PDB')) all_feats.append(f) if RHAPSODY_FEATS['BLOSUM'].intersection(self.featSet): # retrieve BLOSUM values @@ -411,7 +426,10 @@ def _calcFeatMatrix(self, refresh=False): all_feats.append(f) if RHAPSODY_FEATS['Pfam'].intersection(self.featSet): # compute sequence properties from Pfam domains - f = Pfam.calcPfamFeatures(self.data['SAV coords']) + f = Pfam.calcPfamFeatures( + self.data['SAV coords'], + status_file=self.options.get('status_file_Pfam'), + status_prefix=self.options.get('status_prefix_Pfam')) all_feats.append(f) if RHAPSODY_FEATS['EVmut'].intersection(self.featSet): # recover EVmutation data diff --git a/rhapsody/predict/main.py b/rhapsody/predict/main.py index 3843f3e..22f93cd 100644 --- a/rhapsody/predict/main.py +++ b/rhapsody/predict/main.py @@ -12,7 +12,7 @@ def rhapsody(query, query_type='SAVs', main_classifier=None, aux_classifier=None, custom_PDB=None, force_env=None, - refresh=False, log=True): + refresh=False, log=True, **kwargs): """Obtain Rhapsody pathogenicity predictions on a list of human missense variants ([ref]_) @@ -77,7 +77,7 @@ def rhapsody(query, query_type='SAVs', aux_classifier = getDefaultClassifiers()['reduced'] # initialize object that will contain all results and predictions - r = Rhapsody() + r = Rhapsody(**kwargs) # import classifiers and feature set from pickle r.importClassifiers(main_classifier, aux_classifier, force_env=force_env) @@ -99,9 +99,9 @@ def rhapsody(query, query_type='SAVs', r.printPredictions() if aux_classifier is not None: # print both 'full' and 'reduced' predictions in a more detailed format - r.printPredictions(classifier="both", - PolyPhen2=False, EVmutation=False, - filename='rhapsody-predictions-full_vs_reduced.txt') + r.printPredictions( + classifier="both", PolyPhen2=False, EVmutation=False, + filename='rhapsody-predictions-full_vs_reduced.txt') # save pickle r.savePickle() diff --git a/rhapsody/utils/misc.py b/rhapsody/utils/misc.py new file mode 100644 index 0000000..3df3632 --- /dev/null +++ b/rhapsody/utils/misc.py @@ -0,0 +1,283 @@ +# -*- coding: utf-8 -*- +"""This module defines default configuration parameters and +a function for the initial setup and training of Rhapsody.""" + +import os +import tarfile +import pickle +import urllib.request +import shutil +import sklearn +import numpy as np +import prody as pd +import rhapsody as rd + +__all__ = ['DEFAULT_FEATSETS', 'initialSetup', + 'getDefaultTrainingDataset', 'getDefaultClassifiers', + 'importDefaultClassifier', 'delSettings', 'getSettings'] + +USERHOME = os.getenv('USERPROFILE') or os.getenv('HOME') or './' +DEFAULT_WORKING_DIR = os.path.join(USERHOME, 'rhapsody') +DEFAULT_EVMUT_DIR = os.path.join(DEFAULT_WORKING_DIR, + 'EVmutation_mutation_effects') +EVMUT_URL = 'https://marks.hms.harvard.edu/evmutation/data/effects.tar.gz' +PACKAGE_DATA = os.path.join(rd.__path__[0], 'data.tar.gz') +TRAINING_DATASET = 'precomputed_features-ID_opt.npy' +DEFAULT_CLSF_DIR = f'default_classifiers-sklearn_v{sklearn.__version__}' +DEFAULT_FEATSETS = { + 'full': ['wt_PSIC', 'Delta_PSIC', 'SASA', 'ANM_MSF-chain', + 'ANM_effectiveness-chain', 'ANM_sensitivity-chain', + 'stiffness-chain', 'entropy', 'ranked_MI', 'BLOSUM'], + 'reduced': ['wt_PSIC', 'Delta_PSIC', 'SASA', 'ANM_MSF-chain', + 'ANM_effectiveness-chain', 'ANM_sensitivity-chain', + 'stiffness-chain', 'BLOSUM'], + 'EVmut': ['wt_PSIC', 'Delta_PSIC', 'SASA', 'ANM_MSF-chain', + 'ANM_effectiveness-chain', 'ANM_sensitivity-chain', + 'stiffness-chain', 'entropy', 'ranked_MI', 'BLOSUM', + 'EVmut-DeltaE_epist'], +} + + +def initialSetup(working_dir=None, refresh=False, download_EVmutation=True): + """Function to be run right after installation for setting up the + environment and main parameters and for training the default classifiers. + By default, a working directory will be created in the user home directory + (:file:`~/rhapsody/`). Previous configuration data will be recovered. + Additional data from EVmutation website will be automatically downloaded + (~1.4GB). + + :arg working_dir: path to a local folder + :type working_dir: str + + :arg refresh: if **True**, previous trained classifiers will be deleted, + if found + :type refresh: bool + + :arg download_EVmutation: if **True**, precomputed EVmutation scores will + be downloaded (recommended) + :type download_EVmutation: bool + """ + + pd.LOGGER.info(f'You are running Rhapsody v{rd.__version__}') + + # set working directory + if working_dir is None: + # check pre-existing configuration + old_dir = pd.SETTINGS.get('rhapsody_local_folder') + if type(old_dir) is str and os.path.isdir(old_dir): + working_dir = old_dir + pd.LOGGER.info('Pre-existing working directory detected: ' + f'{working_dir}') + else: + # use default location and create folder if needed + working_dir = DEFAULT_WORKING_DIR + if os.path.isdir(working_dir): + raise EnvironmentError( + f"A folder named '{working_dir}' already exists. " + "Please specify another name.") + else: + os.mkdir(working_dir) + pd.LOGGER.info(f'Default working directory set: {working_dir}') + else: + working_dir = os.path.abspath(working_dir) + if os.path.isdir(working_dir): + pd.LOGGER.info(f'Working directory set: {working_dir}') + else: + raise EnvironmentError(f'Invalid working directory: {working_dir}') + pd.SETTINGS['rhapsody_local_folder'] = working_dir + + # create pickles folder + folder = os.path.join(working_dir, 'pickles') + if not os.path.isdir(folder): + os.mkdir(folder) + + # check for pre-existing folder containing trained classifiers + folder = os.path.join(working_dir, DEFAULT_CLSF_DIR) + training_dataset = None + if os.path.isdir(folder) and not refresh: + pd.LOGGER.info(f'Pre-existing classifiers found: {folder}') + # check for missing classifiers + for featset in DEFAULT_FEATSETS: + fname = os.path.join(folder, featset, 'trained_classifier.pkl') + if not os.path.isfile(fname): + raise IOError(f"Missing classifier: '{featset}'. Please " + f'delete folder {folder} and rerun setup.') + else: + # delete old classifiers and train new ones + if os.path.isdir(folder): + shutil.rmtree(folder) + os.mkdir(folder) + pd.LOGGER.info(f'Classifiers folder created: {folder}') + # delete EVmutation metrics as well, that must be updated + pd.SETTINGS.pop('EVmutation_metrics') + # import training dataset included with package + training_dataset = getDefaultTrainingDataset() + info = { + 'size': len(training_dataset), + 'fields': training_dataset.dtype.names + } + pd.SETTINGS['rhapsody_training_dataset'] = info + # train new default classifiers + pd.LOGGER.info('') + for name, featset in DEFAULT_FEATSETS.items(): + clsf_folder = os.path.join(folder, name) + os.mkdir(clsf_folder) + logfile = os.path.join(clsf_folder, 'RF_training.log') + # run training procedure + pd.LOGGER.info(f'Training {name} classifier...') + pd.LOGGER.start(logfile) + fields = ['SAV_coords', 'true_label'] + featset + rd.trainRFclassifier(training_dataset[fields]) + # move trained classifier and figures into folder + output_files = ['predictions_distribution.png', + 'pathogenicity_prob.png', + 'ROC.png', + 'feat_importances.png', + 'trained_classifier.pkl', ] + for file in output_files: + shutil.move(file, clsf_folder) + pd.LOGGER.close(logfile) + pd.LOGGER.info('') + + # check EVmutation metrics + metrics = pd.SETTINGS.get('EVmutation_metrics', default={}) + if 'AUROC' in metrics: + pd.LOGGER.info(f'Pre-existing EVmutation metrics found.') + else: + # compute EVmutation metrics from included training dataset + if training_dataset is None: + training_dataset = getDefaultTrainingDataset() + if 'EVmut-DeltaE_epist' not in training_dataset.dtype.names: + pd.SETTINGS['EVmutation_metrics'] = {} + pd.LOGGER.warn('Unable to compute EVmutation metrics: ' + 'precomputed scores not found.') + else: + sel = ~np.isnan(training_dataset['EVmut-DeltaE_epist']) + # NB: EVmutation score and pathogenicity are anti-correlated + true_labels = training_dataset['true_label'][sel] + EVmut_predictor = -training_dataset['EVmut-DeltaE_epist'][sel] + metrics = rd.calcScoreMetrics(true_labels, EVmut_predictor) + pd.SETTINGS['EVmutation_metrics'] = metrics + pd.LOGGER.info(f'EVmutation metrics computed.') + + # fetch EVmutation precomputed data, if needed + folder = pd.SETTINGS.get('EVmutation_local_folder') + if type(folder) is str and os.path.isdir(folder): + pd.LOGGER.info(f'EVmutation folder found: {folder}') + else: + folder = DEFAULT_EVMUT_DIR + if os.path.isdir(DEFAULT_EVMUT_DIR): + pd.LOGGER.info(f'EVmutation folder found: {folder}') + elif download_EVmutation: + pd.LOGGER.info(f'Downloading EVmutation data...') + # download tar.gz file and save it locally + tgz = os.path.join(working_dir, 'effects.tar.gz') + with urllib.request.urlopen(EVMUT_URL) as r, open(tgz, 'wb') as f: + shutil.copyfileobj(r, f) + # extract archive + tar = tarfile.open(tgz, "r:gz") + tar.extractall(path=folder) + tar.close() + os.remove(tgz) + pd.LOGGER.info(f'EVmutation folder set: {folder}') + else: + folder = None + msg = ('For full functionality, please consider downloading ' + f'EVmutation data from {EVMUT_URL} and then set up the ' + 'relative path in the configuration file.') + pd.LOGGER.warn(msg) + pd.SETTINGS['EVmutation_local_folder'] = folder + + # check if DSSP is installed + which = pd.utilities.which + if which('dssp') is None and which('mkdssp') is None: + msg = ('For full functionality, please consider installing DSSP, ' + 'for instance by typing in a Linux terminal: ' + "'sudo apt install dssp'") + pd.LOGGER.warn(msg) + else: + pd.LOGGER.info('DSSP is installed on the system.') + + pd.SETTINGS.save() + pd.LOGGER.info('Setup complete.') + + return + + +def getDefaultTrainingDataset(): + # import training dataset included with package + working_dir = pd.SETTINGS.get('rhapsody_local_folder') + tar = tarfile.open(PACKAGE_DATA, "r:gz") + tar.extractall(path=working_dir) + tar.close() + fname = os.path.join(working_dir, TRAINING_DATASET) + training_dataset = np.load(fname) + os.remove(fname) + return training_dataset + + +def getDefaultClassifiers(): + """Returns a dictionary with the paths to the three default classifiers + (``'full'``, ``'reduced'`` and ``'EVmut'``) + """ + working_dir = pd.SETTINGS.get('rhapsody_local_folder') + clsf_folder = os.path.join(working_dir, DEFAULT_CLSF_DIR) + + def_clsfs = {fs: os.path.join(clsf_folder, fs, 'trained_classifier.pkl') + for fs in DEFAULT_FEATSETS} + + if any([not os.path.isfile(c) for c in def_clsfs.values()]): + raise IOError('One or more default classifiers are missing. ' + 'Please rerun setup with initialSetup(refresh=True)') + else: + return def_clsfs + + +def importDefaultClassifier(version): + """Imports the specified classifier and its summary + + :arg version: either 'full', 'reduced' or 'EVmut' + :type version: str + """ + assert version in ['full', 'reduced', 'EVmut'] + with open(getDefaultClassifiers()[version], 'rb') as p: + clsf = pickle.load(p) + return clsf + + +def delSettings(): + for entry in ['rhapsody_local_folder', 'rhapsody_training_dataset', + 'EVmutation_local_folder', 'EVmutation_metrics']: + pd.SETTINGS.pop(entry) + + +def getSettings(print=True): + """Returns and prints essential information about the current Rhapsody + configuration, such as the location of working directory and default + classifiers + """ + + config_dict = {} + + for entry in ['rhapsody_local_folder', 'rhapsody_training_dataset', + 'EVmutation_local_folder', 'EVmutation_metrics']: + config_dict[entry] = pd.SETTINGS.get(entry) + + def_clsfs = getDefaultClassifiers() + for fs, path in def_clsfs.items(): + fs += ' classifier' + config_dict[fs] = path + + if print: + entries = ['rhapsody_local_folder', 'EVmutation_local_folder'] \ + + [f'{c} classifier' for c in def_clsfs] + for entry in entries: + pd.LOGGER.info(f'{entry:24}: {config_dict[entry]}') + d = pd.SETTINGS['rhapsody_training_dataset'] + pd.LOGGER.info('training dataset size : {}'.format(d['size'])) + if 'AUROC' in pd.SETTINGS.get('EVmutation_metrics', {}): + pd.LOGGER.info('EVmutation_metrics : ') + else: + pd.LOGGER.info('EVmutation_metrics : ') + + return config_dict