v0.3.7

Protein-Engineering-Framework · Nov 21, 2024 · 2eb5682 · 2eb5682
1 parent 6a6fc50
commit 2eb5682
Show file tree

Hide file tree

Showing 10 changed files with 111 additions and 27 deletions.
diff --git a/.gitignore b/.gitignore
@@ -113,6 +113,8 @@ datasets/ANEH/ML_Model_Performance_WEBA780101_ELASTICNET.png
 datasets/ANEH/ML_Model_Performance_WEBA780101_PLS_LOOCV.png
 datasets/ANEH/ML_Model_Performance_WEBA780101_PLS.png
 datasets/ANEH/ML_Model_Performance_WEBA780101_RIDGE.png
+datasets/ANEH/ML_Model_Performance_CHAM830104_PLS_LOOCV.png
+datasets/ANEH/ML_Model_Performance_OOBM850104_PLS_LOOCV.png
 datasets/ANEH/ml_pls_37_ANEH_variants_aaidx_encoded_train_concat_lvls_extrapolation.png
 datasets/ANEH/ml_pls_37_ANEH_variants_aaidx_encoded_train_lvl_1_extrapolation.png
 datasets/ANEH/ml_pls_37_ANEH_variants_gremlin_dca_encoded_train_concat_lvls_extrapolation.png
@@ -250,6 +252,11 @@ datasets/ANEH/CV_performance/WEBA780101_PLS_LOOCV_5-fold-CV.png
 datasets/ANEH/CV_performance/WEBA780101_PLS_LOOCV_CV_Results.txt
 datasets/ANEH/CV_performance/WEBA780101_RIDGE_5-fold-CV.png
 datasets/ANEH/CV_performance/WEBA780101_RIDGE_CV_Results.txt
+datasets/ANEH/CV_performance/CHAM830104_PLS_LOOCV_5-fold-CV.png
+datasets/ANEH/CV_performance/CHAM830104_PLS_LOOCV_CV_Results.txt
+datasets/ANEH/CV_performance/OOBM850104_PLS_LOOCV_5-fold-CV.png
+datasets/ANEH/CV_performance/OOBM850104_PLS_LOOCV_CV_Results.txt
+datasets/ANEH/Pickles/OOBM850104
 datasets/ANEH/Diverse_Double_Split/Diverse_Double_Split1.fasta
 datasets/ANEH/Diverse_Double_Split/Predictions_FAUJ880104_TopDiverse_Double_Split.txt
 datasets/ANEH/Diverse_Double_Split/Predictions_MLgremlin_TopDiverse_Double_Split.txt

diff --git a/gui/qt_window.py b/gui/qt_window.py
@@ -124,12 +124,21 @@ def __init__(
         self.button_dca_inference_gremlin = QtWidgets.QPushButton("MSA optimization (GREMLIN)")
         self.button_dca_inference_gremlin.setMinimumWidth(80)
         self.button_dca_inference_gremlin.setToolTip(
-            "Generating DCA parameters using GREMLIN (\"MSA optimization\"), "
-            "you have to provide an MSA in FASTA or A2M format"
+            "Generating DCA parameters using GREMLIN (\"MSA optimization\"); "
+            "requires an MSA in FASTA or A2M format"
         )
         self.button_dca_inference_gremlin.clicked.connect(self.pypef_gremlin)
         self.button_dca_inference_gremlin.setStyleSheet(button_style)
 
+        self.button_dca_inference_gremlin_msa_info = QtWidgets.QPushButton("GREMLIN SSM prediction")
+        self.button_dca_inference_gremlin_msa_info.setMinimumWidth(80)
+        self.button_dca_inference_gremlin_msa_info.setToolTip(
+            "Generating DCA parameters using GREMLIN (\"MSA optimization\") and save plots of "
+            "visualized results; requires an MSA in FASTA or A2M format"
+        )
+        self.button_dca_inference_gremlin_msa_info.clicked.connect(self.pypef_gremlin_msa_info)
+        self.button_dca_inference_gremlin_msa_info.setStyleSheet(button_style)
+
         self.button_dca_test_dca = QtWidgets.QPushButton("Test (DCA)")
         self.button_dca_test_dca.setMinimumWidth(80)
         self.button_dca_test_dca.setToolTip(
@@ -208,8 +217,9 @@ def __init__(
 
         layout.addWidget(self.dca_text, 3, 1, 1, 1)
         layout.addWidget(self.button_dca_inference_gremlin, 4, 1, 1, 1)
-        layout.addWidget(self.button_dca_test_dca, 5, 1, 1, 1)
-        layout.addWidget(self.button_dca_predict_dca, 6, 1, 1, 1)
+        layout.addWidget(self.button_dca_inference_gremlin_msa_info, 5, 1, 1, 1)
+        layout.addWidget(self.button_dca_test_dca, 6, 1, 1, 1)
+        layout.addWidget(self.button_dca_predict_dca, 7, 1, 1, 1)
 
         layout.addWidget(self.hybrid_text, 3, 2, 1, 1)
         layout.addWidget(self.button_hybrid_train_dca, 4, 2, 1, 1)
@@ -222,7 +232,7 @@ def __init__(
         layout.addWidget(self.button_supervised_train_test_dca, 4, 3, 1, 1)
         layout.addWidget(self.button_supervised_train_test_onehot, 5, 3, 1, 1)
 
-        layout.addWidget(self.textedit_out, 7, 0, 1, -1)
+        layout.addWidget(self.textedit_out, 8, 0, 1, -1)
 
         self.process = QtCore.QProcess(self)
         self.process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
@@ -233,6 +243,8 @@ def __init__(
         self.process.finished.connect(lambda: self.button_mklsts.setEnabled(True))
         self.process.started.connect(lambda: self.button_dca_inference_gremlin.setEnabled(False))
         self.process.finished.connect(lambda: self.button_dca_inference_gremlin.setEnabled(True))
+        self.process.started.connect(lambda: self.button_dca_inference_gremlin_msa_info.setEnabled(False))
+        self.process.finished.connect(lambda: self.button_dca_inference_gremlin_msa_info.setEnabled(True))
         self.process.started.connect(lambda: self.button_dca_test_dca.setEnabled(False))
         self.process.finished.connect(lambda: self.button_dca_test_dca.setEnabled(True))
         self.process.started.connect(lambda: self.button_dca_predict_dca.setEnabled(False))
@@ -290,6 +302,14 @@ def pypef_gremlin(self):
             self.version_text.setText("Running GREMLIN (DCA) optimization on MSA...")
             self.exec_pypef(f'param_inference --wt {wt_fasta_file} --msa {msa_file}')  # --opt_iter 100
 
+    @QtCore.Slot()
+    def pypef_gremlin_msa_info(self):
+        """Ask for a WT FASTA and an MSA file; if both are chosen, pass 'save_msa_info' to self.exec_pypef."""
+        titles = ("Select WT FASTA File", "Select Multiple Sequence Alignment (MSA) file (in FASTA or A2M format)")
+        wt_fasta_file, msa_file = [QtWidgets.QFileDialog.getOpenFileName(self, title)[0] for title in titles]
+        if wt_fasta_file and msa_file:
+            self.version_text.setText("Running GREMLIN (DCA) optimization on MSA...")
+            self.exec_pypef(f'save_msa_info --wt {wt_fasta_file} --msa {msa_file}')
 
     @QtCore.Slot()
     def pypef_dca_test(self):

diff --git a/pypef/__init__.py b/pypef/__init__.py
@@ -12,4 +12,4 @@
 # Journal of Chemical Information and Modeling, 2021, 61, 3463-3476
 # https://doi.org/10.1021/acs.jcim.1c00099
 
-__version__ = '0.3.7-dev'
+__version__ = '0.3.7'
diff --git a/pypef/dca/dca_run.py b/pypef/dca/dca_run.py
@@ -19,7 +19,7 @@
 from pypef.utils.variant_data import read_csv, get_wt_sequence
 from pypef.dca.plmc_encoding import save_plmc_dca_encoding_model
 from pypef.dca.hybrid_model import get_model_and_type, performance_ls_ts, predict_ps, generate_model_and_save_pkl
-from pypef.dca.gremlin_inference import save_gremlin_as_pickle, save_corr_csv, plot_all_corr_mtx
+from pypef.dca.gremlin_inference import save_gremlin_as_pickle, save_corr_csv, plot_all_corr_mtx, plot_predicted_ssm
 from pypef.utils.low_n_mutation_extrapolation import performance_mutation_extrapolation, low_n
 
 
@@ -128,6 +128,7 @@ def run_pypef_hybrid_modeling(arguments):
         )
         save_corr_csv(gremlin)
         plot_all_corr_mtx(gremlin)
+        plot_predicted_ssm(gremlin)
 
     else:
         performance_ls_ts(

diff --git a/pypef/dca/gremlin_inference.py b/pypef/dca/gremlin_inference.py
@@ -28,13 +28,13 @@
 
 References:
 [1] Kamisetty, H., Ovchinnikov, S., & Baker, D.
-    Assessing the utility of coevolution-based residue–residue contact predictions in a
+    Assessing the utility of coevolution-based residue-residue contact predictions in a
     sequence- and structure-rich era.
     Proceedings of the National Academy of Sciences, 2013, 110, 15674-15679
     https://www.pnas.org/doi/10.1073/pnas.1314045110
 [2] Balakrishnan, S., Kamisetty, H., Carbonell, J. G., Lee, S.-I., & Langmead, C. J.
     Learning generative models for protein fold families.
-    Proteins, 79(4), 2011, 1061–78.
+    Proteins, 79(4), 2011, 1061-78.
     https://doi.org/10.1002/prot.22934
 [3] Ekeberg, M., Lövkvist, C., Lan, Y., Weigt, M., & Aurell, E.
     Improved contact prediction in proteins: Using pseudolikelihoods to infer Potts models.
@@ -57,8 +57,9 @@
 from scipy.special import logsumexp
 from scipy.stats import boxcox
 import pandas as pd
+from tqdm import tqdm
 import tensorflow as tf
-tf.get_logger().setLevel('DEBUG')
+tf.get_logger().setLevel('WARNING')
 # Uncomment to hide GPU devices
 #environ['CUDA_VISIBLE_DEVICES'] = '-1'  
 
@@ -647,6 +648,7 @@ def plot_correlation_matrix(self, matrix_type: str = 'apc', set_diag_zero=True):
         ax.set_ylim(-1, matrix.shape[0])
         plt.title(matrix_type.upper())
         plt.savefig(f'{matrix_type}.png', dpi=500)
+        logger.info(f"Plotted correlation matrix {os.path.abspath(matrix_type)}.png")
         plt.close('all')
 
     def get_top_coevolving_residues(self, wt_seq=None, min_distance=0, sort_by="apc"):
@@ -718,7 +720,7 @@ def save_gremlin_as_pickle(alignment: str, wt_seq: str, opt_iter: int = 100):
         },
         open('Pickles/GREMLIN', 'wb')
     )
-    logger.info(f"Saved GREMLIN model as Pickle file ({os.path.abspath('Pickles/GREMLIN')})...")
+    logger.info(f"Saved GREMLIN model as Pickle file as {os.path.abspath('Pickles/GREMLIN')}...")
     return gremlin
 
 
@@ -732,4 +734,58 @@ def save_corr_csv(gremlin: GREMLIN, min_distance: int = 0, sort_by: str = 'apc')
     df_mtx_sorted_mindist = gremlin.get_top_coevolving_residues(
         min_distance=min_distance, sort_by=sort_by
     )
-    df_mtx_sorted_mindist.to_csv(f"coevolution_{sort_by}_sorted.csv")
+    df_mtx_sorted_mindist.to_csv(f"coevolution_{sort_by}_sorted.csv", sep=',')
+    logger.info(f"Saved coevolution CSV data as "
+                f"{os.path.abspath(f'coevolution_{sort_by}_sorted.csv')}")
+
+
+def plot_predicted_ssm(gremlin: GREMLIN):
+    """
+    Predict and plot the effects of all single amino acid substitutions at
+    all WT/input sequence positions (SSM landscape); e.g.:
+    M1A, M1C, M1E, ..., D2A, D2C, D2E, ..., ..., T300V, T300W, T300Y
+
+    Each non-gap character of `gremlin.char_alphabet` is placed at each position
+    of `gremlin.wt_seq`; the reported value is `gremlin.get_score` of the variant
+    minus the WT score. Writes 'SSM_landscape.png' and 'SSM_landscape.csv' to the
+    current working directory and returns nothing.
+    NOTE(review): the WT residue is not skipped, so identity entries (e.g., M1M)
+    are emitted whenever it is part of the alphabet - confirm this is intended.
+    """
+    wt_sequence = gremlin.wt_seq
+    wt_score = gremlin.get_wt_score()[0]
+    aas = "".join(sorted(gremlin.char_alphabet.replace("-", "")))  # gap character removed
+    variantss, variant_sequencess, variant_scoress = [], [], []  # one inner list per position
+    logger.info("Predicting all SSM effects using the unsupervised GREMLIN model...")
+    for i, aa_wt in enumerate(tqdm(wt_sequence)):
+        variants = [f'{aa_wt}{i + 1}{aa_sub}' for aa_sub in aas]
+        variant_sequences = [wt_sequence[:i] + aa_sub + wt_sequence[i + 1:] for aa_sub in aas]
+        variantss.append(variants)
+        variant_sequencess.append(variant_sequences)
+        variant_scoress.append([gremlin.get_score(seq)[0] - wt_score for seq in variant_sequences])
+
+    fig, ax = plt.subplots(figsize=(2 * len(wt_sequence) / len(aas), 3))
+    ax.imshow(np.array(variant_scoress).T)  # transposed: x axis = position, y axis = substituted character
+    # Annotate every heatmap cell with the variant name and its rounded score difference
+    for i_pos, (labels, deltas) in enumerate(zip(variantss, variant_scoress)):
+        for i_aa, (label, delta) in enumerate(zip(labels, deltas)):
+            ax.text(i_pos, i_aa, f'{label}\n{round(delta, 1)}', size=1.5, va='center', ha='center')
+    ax.set_xticks(
+        range(len(wt_sequence)), 
+        [f'{aa}{i + 1}' for i, aa in enumerate(wt_sequence)], 
+        size=6, rotation=90
+    )
+    ax.set_yticks(range(len(aas)), aas, size=6)
+    plt.tight_layout()
+    plt.savefig('SSM_landscape.png', dpi=500)
+    plt.close('all')  # release the figure after saving, as the other plotting functions do
+    # Flattened position by position: all entries of position 1 first, then position 2, ...
+    pd.DataFrame(
+        {
+            'Variant': np.array(variantss).flatten(),
+            'Sequence': np.array(variant_sequencess).flatten(),
+            'Variant_Score': np.array(variant_scoress).flatten()
+        }
+    ).to_csv('SSM_landscape.csv', sep=',')
+    logger.info(f"Saved SSM landscape as {os.path.abspath('SSM_landscape.png')} "
+                f"and CSV data as {os.path.abspath('SSM_landscape.csv')}...")
diff --git a/pypef/dca/hybrid_model.py b/pypef/dca/hybrid_model.py
@@ -686,7 +686,6 @@ def save_model_to_dict_pickle(
         model_type = 'MODEL'
 
     pkl_path = os.path.abspath(f'Pickles/{model_type}')
-    logger.info(f'Saving model as Pickle file ({pkl_path})...')
     pickle.dump(
         {
             'model': model,
@@ -698,6 +697,7 @@ def save_model_to_dict_pickle(
         },
         open(f'Pickles/{model_type}', 'wb')
     )
+    logger.info(f'Saved model as Pickle file ({pkl_path})...')
 
 
 global_model = None
@@ -742,9 +742,9 @@ def plmc_or_gremlin_encoding(
             logger.info(f"Following positions are frequent gap positions in the MSA "
                         f"and cannot be considered for effective modeling, i.e., "
                         f"substitutions at these positions are removed as these would be "
-                        f"predicted with wild-type fitness:\n{[gap + 1 for gap in model.gaps]}.\n"
+                        f"predicted with wild-type fitness:\n{[int(gap) + 1 for gap in model.gaps]}.\n"
                         f"Effective positions (N={len(model.v_idx)}) are:\n"
-                        f"{[v_pos + 1 for v_pos in model.v_idx]}")
+                        f"{[int(v_pos) + 1 for v_pos in model.v_idx]}")
         xs, x_wt, variants, sequences, ys_true = gremlin_encoding(
             model, variants, sequences, ys_true,
             shift_pos=1, substitution_sep=substitution_sep

diff --git a/pypef/main.py b/pypef/main.py
@@ -307,6 +307,7 @@
 from pypef.dca.dca_run import run_pypef_hybrid_modeling
 from pypef.utils.utils_run import run_pypef_utils
 
+
 logger = logging.getLogger("pypef")
 logger.setLevel(logging.INFO)
 

diff --git a/pypef/ml/regression.py b/pypef/ml/regression.py
@@ -649,11 +649,11 @@ def formatted_output(
     for (idx, val, val2, val3, val4, val5, r_m, pam) in performance_list:
         if val >= minimum_r2:
             index.append(get_basename(idx))
-            value.append('{:f}'.format(val))
-            value2.append('{:f}'.format(val2))
-            value3.append('{:f}'.format(val3))
-            value4.append('{:f}'.format(val4))
-            value5.append('{:f}'.format(val5))
+            value.append(f'{val:.6f}')
+            value2.append(f'{val2:.6f}')
+            value3.append(f'{val3:.6f}')
+            value4.append(f'{val4:.6f}')
+            value5.append(f'{val5:.6f}')
             regression_model.append(r_m.upper())
             params.append(pam)
 
@@ -853,10 +853,9 @@ def crossval_on_all(x_train, x_test, y_train, y_test, regressor: str, parameter,
         get_performances(y_test_total, y_predicted_total)
 
     with open(cv_filename, 'a') as f:
-        f.write('Regression type: {}; Parameter: {}; Encoding index: {}\n'.format(
-            regressor.upper(), parameter, name))
-        f.write('R2 = {:.5f}; RMSE = {:.5f}; NRMSE = {:.5f}; Pearson\'s r = {:.5f};'
-                ' Spearman\'s rho = {:.5f}\n\n'.format(r_squared, rmse, nrmse, pearson_r, spearman_rho))
+        f.write(f'Regression type: {regressor.upper()}; Parameter: {parameter}; Encoding index: {name}\n')
+        f.write(f'R2 = {r_squared:.6f}; RMSE = {rmse:.6f}; NRMSE = {nrmse:.6f}; Pearson\'s r = {pearson_r:.6f}; '
+                f'Spearman\'s rho = {spearman_rho:.6f}\n\n')
 
     figure, ax = plt.subplots()
     legend = r'$R^2$' + f' = {r_squared:.3f}' + f'\nRMSE = {rmse:.3f}' + f'\nNRMSE = {nrmse:.3f}' + \
@@ -959,10 +958,10 @@ def save_model(
             if model_type in ['PLMC', 'GREMLIN'] and encoding not in ['aaidx', 'onehot']:
                 name = 'ML' + model_type.lower()
             f_name = os.path.abspath(os.path.join(path, 'Pickles', name))
-            logger.info(f'Saving model ({f_name})...')
             file = open(f_name, 'wb')
             pickle.dump(regressor_, file)
             file.close()
+            logger.info(f'Saved model as {f_name}...')
 
         except IndexError:
             raise IndexError

diff --git a/pypef/utils/low_n_mutation_extrapolation.py b/pypef/utils/low_n_mutation_extrapolation.py
@@ -289,8 +289,8 @@ def performance_mutation_extrapolation(
             logger.info('Fitting regressor on lvl 1 substitution data...')
             regressor.fit(x_train, y_train)
             if save_model:
-                logger.info(f'Saving model as Pickle file: ML_LVL_1')
                 pickle.dump(regressor, open(os.path.join('Pickles', 'ML_LVL_1'), 'wb'))
+                logger.info(f'Saved model as Pickle file: ML_LVL_1')
         for i, _ in enumerate(tqdm(collected_levels)):
             if i < len(collected_levels) - 1:  # not last i else error, last entry is: lvl 1 --> all higher variants
                 test_idx = collected_levels[i + 1]

diff --git a/pypef/utils/plot.py b/pypef/utils/plot.py
@@ -87,6 +87,6 @@ def plot_y_true_vs_y_pred(
     #     i += 1  # iterate until finding an unused file name
     #     file_name = f'DCA_Hybrid_Model_LS_TS_Performance({i}).png'
     plt.colorbar()
-    logger.info(f'Saving plot ({os.path.abspath(file_name)})...')
     plt.savefig(file_name, dpi=500)
     plt.close('all')
+    logger.info(f'Saved plot as {os.path.abspath(file_name)}...')